[GPU] DC_LUT_RW_INDEX/WRITE_EN_MASK + gamma ramp and registers in traces

This commit is contained in:
Triang3l 2022-05-05 13:10:29 +03:00
parent 2d90d5940f
commit c794d0d538
19 changed files with 743 additions and 242 deletions

View File

@ -12,6 +12,7 @@
#include <algorithm>
#include <cinttypes>
#include <cmath>
#include <cstring>
#include "third_party/fmt/include/fmt/format.h"
#include "xenia/base/byte_stream.h"
@ -49,22 +50,23 @@ CommandProcessor::~CommandProcessor() = default;
bool CommandProcessor::Initialize() {
// Initialize the gamma ramps to their default (linear) values - taken from
// what games set when starting.
// what games set when starting with the sRGB (return value 1)
// VdGetCurrentDisplayGamma.
for (uint32_t i = 0; i < 256; ++i) {
uint32_t value = i * 1023 / 255;
gamma_ramp_.table[i].value = value | (value << 10) | (value << 20);
uint32_t value = i * 0x3FF / 0xFF;
reg::DC_LUT_30_COLOR& gamma_ramp_entry = gamma_ramp_256_entry_table_[i];
gamma_ramp_entry.color_10_blue = value;
gamma_ramp_entry.color_10_green = value;
gamma_ramp_entry.color_10_red = value;
}
for (uint32_t i = 0; i < 128; ++i) {
uint32_t value = (i * 65535 / 127) & ~63;
if (i < 127) {
value |= 0x200 << 16;
}
reg::DC_LUT_PWL_DATA gamma_ramp_entry = {};
gamma_ramp_entry.base = (i * 0xFFFF / 0x7F) & ~UINT32_C(0x3F);
gamma_ramp_entry.delta = i < 0x7F ? 0x200 : 0;
for (uint32_t j = 0; j < 3; ++j) {
gamma_ramp_.pwl[i].values[j].value = value;
gamma_ramp_pwl_rgb_[i][j] = gamma_ramp_entry;
}
}
dirty_gamma_ramp_table_ = true;
dirty_gamma_ramp_pwl_ = true;
worker_running_ = true;
worker_thread_ = kernel::object_ref<kernel::XHostThread>(
@ -128,6 +130,46 @@ void CommandProcessor::EndTracing() {
trace_writer_.Close();
}
void CommandProcessor::RestoreRegisters(uint32_t first_register,
const uint32_t* register_values,
uint32_t register_count,
bool execute_callbacks) {
if (first_register > RegisterFile::kRegisterCount ||
RegisterFile::kRegisterCount - first_register < register_count) {
XELOGW(
"CommandProcessor::RestoreRegisters out of bounds (0x{:X} registers "
"starting with 0x{:X}, while a total of 0x{:X} registers are stored)",
register_count, first_register, RegisterFile::kRegisterCount);
if (first_register > RegisterFile::kRegisterCount) {
return;
}
register_count =
std::min(uint32_t(RegisterFile::kRegisterCount) - first_register,
register_count);
}
if (execute_callbacks) {
for (uint32_t i = 0; i < register_count; ++i) {
WriteRegister(first_register + i, register_values[i]);
}
} else {
std::memcpy(register_file_->values + first_register, register_values,
sizeof(uint32_t) * register_count);
}
}
// Replaces the whole stored gamma ramp state with the provided data:
// - `new_gamma_ramp_256_entry_table` must point to 256 DC_LUT_30_COLOR entries.
// - `new_gamma_ramp_pwl_rgb` must point to 128 * 3 DC_LUT_PWL_DATA values
//   (three values per PWL entry).
// - `new_gamma_ramp_rw_component` becomes the current sequential read / write
//   component index.
// Both "value written" hooks are invoked afterwards, the 256-entry table one
// first.
void CommandProcessor::RestoreGammaRamp(
    const reg::DC_LUT_30_COLOR* new_gamma_ramp_256_entry_table,
    const reg::DC_LUT_PWL_DATA* new_gamma_ramp_pwl_rgb,
    uint32_t new_gamma_ramp_rw_component) {
  // The member arrays are [256] and [128][3], so their sizes are the sizes of
  // the full incoming tables.
  std::memcpy(gamma_ramp_256_entry_table_, new_gamma_ramp_256_entry_table,
              sizeof(gamma_ramp_256_entry_table_));
  std::memcpy(gamma_ramp_pwl_rgb_, new_gamma_ramp_pwl_rgb,
              sizeof(gamma_ramp_pwl_rgb_));

  gamma_ramp_rw_component_ = new_gamma_ramp_rw_component;

  OnGammaRamp256EntryTableValueWritten();
  OnGammaRampPWLValueWritten();
}
void CommandProcessor::CallInThread(std::function<void()> fn) {
if (pending_fns_.empty() &&
kernel::XThread::IsInThread(worker_thread_.get())) {
@ -286,68 +328,141 @@ void CommandProcessor::UpdateWritePointer(uint32_t value) {
}
void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
RegisterFile* regs = register_file_;
RegisterFile& regs = *register_file_;
if (index >= RegisterFile::kRegisterCount) {
XELOGW("CommandProcessor::WriteRegister index out of bounds: {}", index);
return;
}
regs->values[index].u32 = value;
if (!regs->GetRegisterInfo(index)) {
regs.values[index].u32 = value;
if (!regs.GetRegisterInfo(index)) {
XELOGW("GPU: Write to unknown register ({:04X} = {:08X})", index, value);
}
// If this is a COHER register, set the dirty flag.
// This will block the command processor the next time it WAIT_MEM_REGs and
// allow us to synchronize the memory.
if (index == XE_GPU_REG_COHER_STATUS_HOST) {
regs->values[index].u32 |= 0x80000000ul;
}
// Scratch register writeback.
if (index >= XE_GPU_REG_SCRATCH_REG0 && index <= XE_GPU_REG_SCRATCH_REG7) {
uint32_t scratch_reg = index - XE_GPU_REG_SCRATCH_REG0;
if ((1 << scratch_reg) & regs->values[XE_GPU_REG_SCRATCH_UMSK].u32) {
if ((1 << scratch_reg) & regs.values[XE_GPU_REG_SCRATCH_UMSK].u32) {
// Enabled - write to address.
uint32_t scratch_addr = regs->values[XE_GPU_REG_SCRATCH_ADDR].u32;
uint32_t scratch_addr = regs.values[XE_GPU_REG_SCRATCH_ADDR].u32;
uint32_t mem_addr = scratch_addr + (scratch_reg * 4);
xe::store_and_swap<uint32_t>(memory_->TranslatePhysical(mem_addr), value);
}
}
}
} else {
switch (index) {
// If this is a COHER register, set the dirty flag.
// This will block the command processor the next time it WAIT_MEM_REGs
// and allow us to synchronize the memory.
case XE_GPU_REG_COHER_STATUS_HOST: {
regs.values[index].u32 |= UINT32_C(0x80000000);
} break;
void CommandProcessor::UpdateGammaRampValue(GammaRampType type,
uint32_t value) {
RegisterFile* regs = register_file_;
case XE_GPU_REG_DC_LUT_RW_INDEX: {
// Reset the sequential read / write component index (see the M56
// DC_LUT_SEQ_COLOR documentation).
gamma_ramp_rw_component_ = 0;
} break;
auto index = regs->values[XE_GPU_REG_DC_LUT_RW_INDEX].u32;
case XE_GPU_REG_DC_LUT_SEQ_COLOR: {
// Should be in the 256-entry table writing mode.
assert_zero(regs[XE_GPU_REG_DC_LUT_RW_MODE].u32 & 0b1);
auto& gamma_ramp_rw_index = regs.Get<reg::DC_LUT_RW_INDEX>();
// DC_LUT_SEQ_COLOR is in the red, green, blue order, but the write
// enable mask is blue, green, red.
bool write_gamma_ramp_component =
(regs[XE_GPU_REG_DC_LUT_WRITE_EN_MASK].u32 &
(UINT32_C(1) << (2 - gamma_ramp_rw_component_))) != 0;
if (write_gamma_ramp_component) {
reg::DC_LUT_30_COLOR& gamma_ramp_entry =
gamma_ramp_256_entry_table_[gamma_ramp_rw_index.rw_index];
// Bits 0:5 are hardwired to zero.
uint32_t gamma_ramp_seq_color =
regs.Get<reg::DC_LUT_SEQ_COLOR>().seq_color >> 6;
switch (gamma_ramp_rw_component_) {
case 0:
gamma_ramp_entry.color_10_red = gamma_ramp_seq_color;
break;
case 1:
gamma_ramp_entry.color_10_green = gamma_ramp_seq_color;
break;
case 2:
gamma_ramp_entry.color_10_blue = gamma_ramp_seq_color;
break;
}
}
if (++gamma_ramp_rw_component_ >= 3) {
gamma_ramp_rw_component_ = 0;
++gamma_ramp_rw_index.rw_index;
}
if (write_gamma_ramp_component) {
OnGammaRamp256EntryTableValueWritten();
}
} break;
auto mask = regs->values[XE_GPU_REG_DC_LUT_WRITE_EN_MASK].u32;
auto mask_lo = (mask >> 0) & 0x7;
auto mask_hi = (mask >> 3) & 0x7;
case XE_GPU_REG_DC_LUT_PWL_DATA: {
// Should be in the PWL writing mode.
assert_not_zero(regs[XE_GPU_REG_DC_LUT_RW_MODE].u32 & 0b1);
auto& gamma_ramp_rw_index = regs.Get<reg::DC_LUT_RW_INDEX>();
// Bit 7 of the index is ignored for PWL.
uint32_t gamma_ramp_rw_index_pwl = gamma_ramp_rw_index.rw_index & 0x7F;
// DC_LUT_PWL_DATA is likely written in the red, green, blue order because
// DC_LUT_SEQ_COLOR is, but the write enable mask is blue, green, red.
bool write_gamma_ramp_component =
(regs[XE_GPU_REG_DC_LUT_WRITE_EN_MASK].u32 &
(UINT32_C(1) << (2 - gamma_ramp_rw_component_))) != 0;
if (write_gamma_ramp_component) {
reg::DC_LUT_PWL_DATA& gamma_ramp_entry =
gamma_ramp_pwl_rgb_[gamma_ramp_rw_index_pwl]
[gamma_ramp_rw_component_];
auto gamma_ramp_value = regs.Get<reg::DC_LUT_PWL_DATA>();
// Bits 0:5 are hardwired to zero.
gamma_ramp_entry.base = gamma_ramp_value.base & ~UINT32_C(0x3F);
gamma_ramp_entry.delta = gamma_ramp_value.delta & ~UINT32_C(0x3F);
}
if (++gamma_ramp_rw_component_ >= 3) {
gamma_ramp_rw_component_ = 0;
// TODO(Triang3l): Should this increase beyond 7 bits for PWL?
// Direct3D 9 explicitly sets rw_index to 0x80 after writing the last
// PWL entry. However, the DC_LUT_RW_INDEX documentation says that for
// PWL, the bit 7 is ignored.
gamma_ramp_rw_index.rw_index =
(gamma_ramp_rw_index.rw_index & ~UINT32_C(0x7F)) |
((gamma_ramp_rw_index_pwl + 1) & 0x7F);
}
if (write_gamma_ramp_component) {
OnGammaRampPWLValueWritten();
}
} break;
// If games update individual components we're going to have a problem.
assert_true(mask_lo == 0 || mask_lo == 7);
assert_true(mask_hi == 0);
if (mask_lo) {
switch (type) {
case GammaRampType::kTable:
assert_true(regs->values[XE_GPU_REG_DC_LUT_RW_MODE].u32 == 0);
gamma_ramp_.table[index].value = value;
dirty_gamma_ramp_table_ = true;
break;
case GammaRampType::kPWL:
assert_true(regs->values[XE_GPU_REG_DC_LUT_RW_MODE].u32 == 1);
// The lower 6 bits are hardwired to 0.
// https://developer.amd.com/wordpress/media/2012/10/RRG-216M56-03oOEM.pdf
gamma_ramp_.pwl[index].values[gamma_ramp_rw_subindex_].value =
value & ~(uint32_t(63) | (uint32_t(63) << 16));
gamma_ramp_rw_subindex_ = (gamma_ramp_rw_subindex_ + 1) % 3;
dirty_gamma_ramp_pwl_ = true;
break;
default:
assert_unhandled_case(type);
case XE_GPU_REG_DC_LUT_30_COLOR: {
// Should be in the 256-entry table writing mode.
assert_zero(regs[XE_GPU_REG_DC_LUT_RW_MODE].u32 & 0b1);
auto& gamma_ramp_rw_index = regs.Get<reg::DC_LUT_RW_INDEX>();
uint32_t gamma_ramp_write_enable_mask =
regs[XE_GPU_REG_DC_LUT_WRITE_EN_MASK].u32 & 0b111;
if (gamma_ramp_write_enable_mask) {
reg::DC_LUT_30_COLOR& gamma_ramp_entry =
gamma_ramp_256_entry_table_[gamma_ramp_rw_index.rw_index];
auto gamma_ramp_value = regs.Get<reg::DC_LUT_30_COLOR>();
if (gamma_ramp_write_enable_mask & 0b001) {
gamma_ramp_entry.color_10_blue = gamma_ramp_value.color_10_blue;
}
if (gamma_ramp_write_enable_mask & 0b010) {
gamma_ramp_entry.color_10_green = gamma_ramp_value.color_10_green;
}
if (gamma_ramp_write_enable_mask & 0b100) {
gamma_ramp_entry.color_10_red = gamma_ramp_value.color_10_red;
}
}
++gamma_ramp_rw_index.rw_index;
// TODO(Triang3l): Should this reset the component write index? If this
// increase is assumed to behave like a full DC_LUT_RW_INDEX write, it
// probably should.
gamma_ramp_rw_component_ = 0;
if (gamma_ramp_write_enable_mask) {
OnGammaRamp256EntryTableValueWritten();
}
} break;
}
}
}
@ -1493,5 +1608,17 @@ bool CommandProcessor::ExecutePacketType3_VIZ_QUERY(RingBuffer* reader,
return true;
}
// Writes the state shared by all backends to the beginning of a trace: the
// complete register file followed by the gamma ramp.
void CommandProcessor::InitializeTrace() {
  // All registers are saved, including those that may have side effects when
  // set, so the values are marked (the `false` argument) to be loaded directly
  // into the RegisterFile rather than through the write callbacks.
  const uint32_t* initial_register_values =
      reinterpret_cast<const uint32_t*>(register_file_->values);
  trace_writer_.WriteRegisters(0, initial_register_values,
                               RegisterFile::kRegisterCount, false);

  // Both gamma ramp representations plus the sequential read / write
  // component index.
  trace_writer_.WriteGammaRamp(gamma_ramp_256_entry_table(),
                               gamma_ramp_pwl_rgb(), gamma_ramp_rw_component_);
}
} // namespace gpu
} // namespace xe

View File

@ -22,6 +22,7 @@
#include "xenia/base/ring_buffer.h"
#include "xenia/base/threading.h"
#include "xenia/gpu/register_file.h"
#include "xenia/gpu/registers.h"
#include "xenia/gpu/trace_writer.h"
#include "xenia/gpu/xenos.h"
#include "xenia/kernel/xthread.h"
@ -64,61 +65,6 @@ enum class GammaRampType {
kPWL,
};
// Storage for both gamma ramp representations: a 256-entry table of packed
// 10-bit-per-channel values and a 128-entry piecewise linear (PWL) ramp with a
// 16-bit base and a 16-bit delta for each of the three channels.
struct GammaRamp {
// A lot of gamma ramp (DC_LUT) documentation:
// https://developer.amd.com/wordpress/media/2012/10/RRG-216M56-03oOEM.pdf
// The ramp entries are BGR, not RGB.
// For the 256-entry table (used by Direct3D 9 for a 8bpc front buffer),
// 535107D4 has in-game settings allowing separate configuration.
// The component order of the PWL table is untested, however, it's likely BGR
// too, since DC_LUTA/B registers have values for blue first, and for red
// last.
// One entry of the 256-entry table: three 10-bit channels packed into 32 bits
// (blue in the lowest bits, then green, then red, 2 bits unused).
struct TableEntry {
union {
uint32_t value;
struct {
uint32_t b : 10;
uint32_t g : 10;
uint32_t r : 10;
uint32_t : 2;
};
};
};
// One channel of a PWL entry: base and delta, accessible together as `value`.
struct PWLValue {
union {
uint32_t value;
struct {
// The lower 6 bits are always zero (these are 10-bit in the upper bits
// thus, not fully 16-bit).
// See DC_LUTA/B_CONTROL for information about the way they should be
// interpreted (`output = base + (multiplier * delta) / 2^increment`,
// where the increment is the value specified in DC_LUTA/B_CONTROL for
// the specific color channel, the base is 7 bits of the front buffer
// value above `increment` bits, the multiplier is the lower `increment`
// bits of it; the increment is nonzero, otherwise the 256-entry table
// should be used instead).
uint16_t base;
uint16_t delta;
};
};
};
// One PWL entry: three per-channel values, addressable by index through
// `values` or by name through `b`, `g`, `r`.
struct PWLEntry {
union {
PWLValue values[3];
struct {
PWLValue b;
PWLValue g;
PWLValue r;
};
};
};
TableEntry table[256];
PWLEntry pwl[128];
};
class CommandProcessor {
public:
enum class SwapPostEffect {
@ -170,6 +116,13 @@ class CommandProcessor {
virtual void TracePlaybackWroteMemory(uint32_t base_ptr, uint32_t length) = 0;
void RestoreRegisters(uint32_t first_register,
const uint32_t* register_values,
uint32_t register_count, bool execute_callbacks);
void RestoreGammaRamp(
const reg::DC_LUT_30_COLOR* new_gamma_ramp_256_entry_table,
const reg::DC_LUT_PWL_DATA* new_gamma_ramp_pwl_rgb,
uint32_t new_gamma_ramp_rw_component);
virtual void RestoreEdramSnapshot(const void* snapshot) = 0;
void InitializeRingBuffer(uint32_t ptr, uint32_t size_log2);
@ -201,7 +154,14 @@ class CommandProcessor {
virtual void WriteRegister(uint32_t index, uint32_t value);
void UpdateGammaRampValue(GammaRampType type, uint32_t value);
// Returns a read-only pointer to the first of the 256 entries of the gamma
// ramp table.
const reg::DC_LUT_30_COLOR* gamma_ramp_256_entry_table() const {
return gamma_ramp_256_entry_table_;
}
// Returns a read-only pointer to the PWL gamma ramp as a flat array: 128
// entries with 3 consecutive DC_LUT_PWL_DATA values (one per color component)
// in each.
const reg::DC_LUT_PWL_DATA* gamma_ramp_pwl_rgb() const {
return gamma_ramp_pwl_rgb_[0];
}
// Hook invoked after the 256-entry gamma ramp table has been modified (by a
// register write or by RestoreGammaRamp). Does nothing by default.
virtual void OnGammaRamp256EntryTableValueWritten() {}
// Hook invoked after the PWL gamma ramp has been modified (by a register write
// or by RestoreGammaRamp). Does nothing by default.
virtual void OnGammaRampPWLValueWritten() {}
virtual void MakeCoherent();
virtual void PrepareForWait();
@ -285,9 +245,7 @@ class CommandProcessor {
return swap_post_effect_actual_;
}
// TODO(Triang3l): Write the gamma ramp (including the display controller
// write pointers) in the common code.
virtual void InitializeTrace() = 0;
virtual void InitializeTrace();
Memory* memory_ = nullptr;
kernel::KernelState* kernel_state_ = nullptr;
@ -334,15 +292,15 @@ class CommandProcessor {
bool paused_ = false;
GammaRamp gamma_ramp_ = {};
int gamma_ramp_rw_subindex_ = 0;
bool dirty_gamma_ramp_table_ = true;
bool dirty_gamma_ramp_pwl_ = true;
// By default (such as for tools), post-processing is disabled.
// "Desired" is for the external thread managing the post-processing effect.
SwapPostEffect swap_post_effect_desired_ = SwapPostEffect::kNone;
SwapPostEffect swap_post_effect_actual_ = SwapPostEffect::kNone;
private:
reg::DC_LUT_30_COLOR gamma_ramp_256_entry_table_[256] = {};
reg::DC_LUT_PWL_DATA gamma_ramp_pwl_rgb_[128][3] = {};
uint32_t gamma_ramp_rw_component_ = 0;
};
} // namespace gpu

View File

@ -13,6 +13,7 @@
#include <utility>
#include "xenia/base/assert.h"
#include "xenia/base/byte_order.h"
#include "xenia/base/cvar.h"
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
@ -1161,8 +1162,8 @@ bool D3D12CommandProcessor::SetupContext() {
provider.GetHeapFlagCreateNotZeroed();
// Create gamma ramp resources.
dirty_gamma_ramp_table_ = true;
dirty_gamma_ramp_pwl_ = true;
gamma_ramp_256_entry_table_up_to_date_ = false;
gamma_ramp_pwl_up_to_date_ = false;
D3D12_RESOURCE_DESC gamma_ramp_buffer_desc;
ui::d3d12::util::FillBufferResourceDesc(
gamma_ramp_buffer_desc, (256 + 128 * 3) * 4, D3D12_RESOURCE_FLAG_NONE);
@ -1699,15 +1700,17 @@ void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
texture_cache_->TextureFetchConstantWritten(
(index - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / 6);
}
} else if (index == XE_GPU_REG_DC_LUT_PWL_DATA) {
UpdateGammaRampValue(GammaRampType::kPWL, value);
} else if (index == XE_GPU_REG_DC_LUT_30_COLOR) {
UpdateGammaRampValue(GammaRampType::kTable, value);
} else if (index == XE_GPU_REG_DC_LUT_RW_MODE) {
gamma_ramp_rw_subindex_ = 0;
}
}
// Marks the uploaded copy of the 256-entry gamma ramp table as out of date so
// it is uploaded again the next time it is needed for a swap.
void D3D12CommandProcessor::OnGammaRamp256EntryTableValueWritten() {
gamma_ramp_256_entry_table_up_to_date_ = false;
}
// Marks the uploaded copy of the PWL gamma ramp as out of date so it is
// uploaded again the next time it is needed for a swap.
void D3D12CommandProcessor::OnGammaRampPWLValueWritten() {
gamma_ramp_pwl_up_to_date_ = false;
}
void D3D12CommandProcessor::IssueSwap(uint32_t frontbuffer_ptr,
uint32_t frontbuffer_width,
uint32_t frontbuffer_height) {
@ -1801,6 +1804,9 @@ void D3D12CommandProcessor::IssueSwap(uint32_t frontbuffer_ptr,
// This is according to D3D::InitializePresentationParameters from a
// game executable, which initializes the 256-entry table gamma ramp for
// 8_8_8_8 output and the PWL gamma ramp for 2_10_10_10.
// TODO(Triang3l): Choose between the table and PWL based on
// DC_LUTA_CONTROL, support both for all formats (and also different
// increments for PWL).
bool use_pwl_gamma_ramp =
frontbuffer_format == xenos::TextureFormat::k_2_10_10_10 ||
frontbuffer_format ==
@ -1811,20 +1817,43 @@ void D3D12CommandProcessor::IssueSwap(uint32_t frontbuffer_ptr,
// Upload the new gamma ramp, using the upload buffer for the current
// frame (will close the frame after this anyway, so can't write
// multiple times per frame).
if (use_pwl_gamma_ramp ? dirty_gamma_ramp_pwl_
: dirty_gamma_ramp_table_) {
if (!(use_pwl_gamma_ramp ? gamma_ramp_pwl_up_to_date_
: gamma_ramp_256_entry_table_up_to_date_)) {
uint32_t gamma_ramp_offset_bytes = use_pwl_gamma_ramp ? 256 * 4 : 0;
uint32_t gamma_ramp_upload_offset_bytes =
uint32_t(frame_current_ % kQueueFrames) * ((256 + 128 * 3) * 4) +
gamma_ramp_offset_bytes;
uint32_t gamma_ramp_size_bytes =
(use_pwl_gamma_ramp ? 128 * 3 : 256) * 4;
std::memcpy(gamma_ramp_upload_buffer_mapping_ +
gamma_ramp_upload_offset_bytes,
use_pwl_gamma_ramp
? static_cast<const void*>(gamma_ramp_.pwl)
: static_cast<const void*>(gamma_ramp_.table),
gamma_ramp_size_bytes);
if (std::endian::native != std::endian::little &&
use_pwl_gamma_ramp) {
// R16G16 is first R16, where the shader expects the base, and
// second G16, where the delta should be, but gamma_ramp_pwl_rgb()
// is an array of 32-bit DC_LUT_PWL_DATA registers - swap 16 bits in
// each 32.
auto gamma_ramp_pwl_upload_buffer =
reinterpret_cast<reg::DC_LUT_PWL_DATA*>(
gamma_ramp_upload_buffer_mapping_ +
gamma_ramp_upload_offset_bytes);
const reg::DC_LUT_PWL_DATA* gamma_ramp_pwl = gamma_ramp_pwl_rgb();
for (size_t i = 0; i < 128 * 3; ++i) {
reg::DC_LUT_PWL_DATA& gamma_ramp_pwl_upload_buffer_entry =
gamma_ramp_pwl_upload_buffer[i];
reg::DC_LUT_PWL_DATA gamma_ramp_pwl_entry = gamma_ramp_pwl[i];
gamma_ramp_pwl_upload_buffer_entry.base =
gamma_ramp_pwl_entry.delta;
gamma_ramp_pwl_upload_buffer_entry.delta =
gamma_ramp_pwl_entry.base;
}
} else {
std::memcpy(
gamma_ramp_upload_buffer_mapping_ +
gamma_ramp_upload_offset_bytes,
use_pwl_gamma_ramp
? static_cast<const void*>(gamma_ramp_pwl_rgb())
: static_cast<const void*>(gamma_ramp_256_entry_table()),
gamma_ramp_size_bytes);
}
PushTransitionBarrier(gamma_ramp_buffer_.Get(),
gamma_ramp_buffer_state_,
D3D12_RESOURCE_STATE_COPY_DEST);
@ -1834,8 +1863,8 @@ void D3D12CommandProcessor::IssueSwap(uint32_t frontbuffer_ptr,
gamma_ramp_buffer_.Get(), gamma_ramp_offset_bytes,
gamma_ramp_upload_buffer_.Get(), gamma_ramp_upload_offset_bytes,
gamma_ramp_size_bytes);
(use_pwl_gamma_ramp ? dirty_gamma_ramp_pwl_
: dirty_gamma_ramp_table_) = false;
(use_pwl_gamma_ramp ? gamma_ramp_pwl_up_to_date_
: gamma_ramp_256_entry_table_up_to_date_) = true;
}
// Destination, source, and if bindful, gamma ramp.
@ -2589,6 +2618,8 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
}
void D3D12CommandProcessor::InitializeTrace() {
CommandProcessor::InitializeTrace();
if (!BeginSubmission(false)) {
return;
}

View File

@ -209,6 +209,9 @@ class D3D12CommandProcessor : public CommandProcessor {
void WriteRegister(uint32_t index, uint32_t value) override;
void OnGammaRamp256EntryTableValueWritten() override;
void OnGammaRampPWLValueWritten() override;
void IssueSwap(uint32_t frontbuffer_ptr, uint32_t frontbuffer_width,
uint32_t frontbuffer_height) override;
@ -496,17 +499,18 @@ class D3D12CommandProcessor : public CommandProcessor {
std::unique_ptr<TextureCache> texture_cache_;
// Bytes 0x0...0x3FF - 256-entry R10G10B10X2 gamma ramp (red and blue must be
// read as swapped - 535107D4 has settings allowing separate configuration).
// Bytes 0x0...0x3FF - 256-entry gamma ramp table with B10G10R10X2 data (read
// as R10G10B10X2 with swizzle).
// Bytes 0x400...0x9FF - 128-entry PWL R16G16 gamma ramp (R - base, G - delta,
// low 6 bits of each are zero, 3 elements per entry).
// https://www.x.org/docs/AMD/old/42590_m76_rrg_1.01o.pdf
Microsoft::WRL::ComPtr<ID3D12Resource> gamma_ramp_buffer_;
D3D12_RESOURCE_STATES gamma_ramp_buffer_state_;
// Upload buffer for an image that is the same as gamma_ramp_, but with
// kQueueFrames array layers.
Microsoft::WRL::ComPtr<ID3D12Resource> gamma_ramp_upload_buffer_;
uint8_t* gamma_ramp_upload_buffer_mapping_ = nullptr;
bool gamma_ramp_256_entry_table_up_to_date_ = false;
bool gamma_ramp_pwl_up_to_date_ = false;
struct ApplyGammaConstants {
uint32_t size[2];

View File

@ -275,14 +275,183 @@ XE_GPU_REGISTER(0x1844, kDword, D1GRPH_PRIMARY_SURFACE_ADDRESS)
XE_GPU_REGISTER(0x1852, kDword, D1GRPH_FLIP_CONTROL)
XE_GPU_REGISTER(0x1921, kDword, DC_LUT_RW_MODE)
XE_GPU_REGISTER(0x1922, kDword, DC_LUT_RW_INDEX)
// In 4B4F07FE, the 256-entry gamma ramp for the 8bpc framebuffer is set to
// different values in multiple places in the game. For VdGetCurrentDisplayGamma
// returning 1 (sRGB), it's set up in the beginning as:
// DC_LUTA_CONTROL = 0x00000000 (256-entry unsigned fixed-point)
// DC_LUT_RW_MODE = 0x00000000
// DC_LUT_RW_INDEX = 0x00000000
// DC_LUT_WRITE_EN_MASK = 0x00000007
// DC_LUT_30_COLOR = 0x00000000
// DC_LUT_RW_INDEX = 0x00000001
// DC_LUT_30_COLOR = 0x04812048
// DC_LUT_RW_INDEX = 0x00000002
// DC_LUT_30_COLOR = 0x05916459
// DC_LUT_RW_INDEX = 0x00000003
// DC_LUT_30_COLOR = 0x06519465
// ...
// DC_LUT_RW_INDEX = 0x000000FE
// DC_LUT_30_COLOR = 0x3FBFEFFB
// DC_LUT_RW_INDEX = 0x000000FF
// DC_LUT_30_COLOR = 0x3FFFFFFF
// DC_LUT_RW_INDEX = 0x00000100
//
// Another possible setup in 4B4F07FE is:
// DC_LUTA_CONTROL = 0x00000000 (256-entry unsigned fixed-point)
// DC_LUT_RW_MODE = 0x00000000
// DC_LUT_RW_INDEX = 0x00000000
// DC_LUT_WRITE_EN_MASK = 0x00000007
// DC_LUT_30_COLOR = 0x00000000
// DC_LUT_RW_INDEX = 0x00000001
// DC_LUT_30_COLOR = 0x01A0681A
// DC_LUT_RW_INDEX = 0x00000002
// DC_LUT_30_COLOR = 0x02709C27
// ...
// DC_LUT_RW_INDEX = 0x000000FE
// DC_LUT_30_COLOR = 0x3FBFEFFB
// DC_LUT_RW_INDEX = 0x000000FF
// DC_LUT_30_COLOR = 0x3FFFFFFF
// DC_LUT_RW_INDEX = 0x00000100
//
// In 4D5307E6, the 128-entry PWL gamma ramp for the 10bpc framebuffer, for
// VdGetCurrentDisplayGamma returning 1 (sRGB), is set up right after launching
// the game as:
// DC_LUTA_CONTROL = 0x00000003 (8-increment unsigned fixed-point)
// DC_LUT_RW_MODE = 0x00000001
// DC_LUT_RW_INDEX = 0x00000000
// DC_LUT_WRITE_EN_MASK = 0x00000007
// DC_LUT_PWL_DATA = 0x02000000
// DC_LUT_PWL_DATA = 0x02000000
// DC_LUT_PWL_DATA = 0x02000000
// DC_LUT_RW_INDEX = 0x00000001
// DC_LUT_PWL_DATA = 0x02000200
// DC_LUT_PWL_DATA = 0x02000200
// DC_LUT_PWL_DATA = 0x02000200
// DC_LUT_RW_INDEX = 0x00000002
// DC_LUT_PWL_DATA = 0x02000400
// DC_LUT_PWL_DATA = 0x02000400
// DC_LUT_PWL_DATA = 0x02000400
// ...
// DC_LUT_RW_INDEX = 0x0000007D
// DC_LUT_PWL_DATA = 0x0200FBC0
// DC_LUT_PWL_DATA = 0x0200FBC0
// DC_LUT_PWL_DATA = 0x0200FBC0
// DC_LUT_RW_INDEX = 0x0000007E
// DC_LUT_PWL_DATA = 0x0200FDC0
// DC_LUT_PWL_DATA = 0x0200FDC0
// DC_LUT_PWL_DATA = 0x0200FDC0
// DC_LUT_RW_INDEX = 0x0000007F
// DC_LUT_PWL_DATA = 0x0000FFC0
// DC_LUT_PWL_DATA = 0x0000FFC0
// DC_LUT_PWL_DATA = 0x0000FFC0
// DC_LUT_RW_INDEX = 0x00000080
//
// Later in 4D5307E6, for the game itself (apparently for conversion of the bit
// representation of 7e3 floating-point data in the front buffer to 10-bit fixed
// point, as the game draws the final passes to a 7e3 framebuffer), with
// VdGetCurrentDisplayGamma returning 1 (sRGB) and the normal brightness in the
// game settings, it's:
// DC_LUTA_CONTROL = 0x00000003 (8-increment unsigned fixed-point)
// DC_LUT_RW_MODE = 0x00000001
// DC_LUT_RW_INDEX = 0x00000000
// DC_LUT_WRITE_EN_MASK = 0x00000007
// DC_LUT_PWL_DATA = 0x05000000
// DC_LUT_PWL_DATA = 0x05000000
// DC_LUT_PWL_DATA = 0x05000000
// DC_LUT_RW_INDEX = 0x00000001
// DC_LUT_PWL_DATA = 0x02000500
// DC_LUT_PWL_DATA = 0x02000500
// DC_LUT_PWL_DATA = 0x02000500
// DC_LUT_RW_INDEX = 0x00000002
// DC_LUT_PWL_DATA = 0x01800740
// DC_LUT_PWL_DATA = 0x01800740
// DC_LUT_PWL_DATA = 0x01800740
// ...
// DC_LUT_RW_INDEX = 0x0000007D
// DC_LUT_PWL_DATA = 0x0440F340
// DC_LUT_PWL_DATA = 0x0440F340
// DC_LUT_PWL_DATA = 0x0440F340
// DC_LUT_RW_INDEX = 0x0000007E
// DC_LUT_PWL_DATA = 0x0400F780
// DC_LUT_PWL_DATA = 0x0400F780
// DC_LUT_PWL_DATA = 0x0400F780
// DC_LUT_RW_INDEX = 0x0000007F
// DC_LUT_PWL_DATA = 0x0400FBC0
// DC_LUT_PWL_DATA = 0x0400FBC0
// DC_LUT_PWL_DATA = 0x0400FBC0
// DC_LUT_RW_INDEX = 0x00000080
//
// In 535107D4, the 256-entry gamma ramp for the 8bpc framebuffer is
// configurable from the game's settings menu for each channel independently.
// For VdGetCurrentDisplayGamma returning 1 (sRGB), when in the settings, the
// red gamma is at the maximum of 5.56, green is at 1.00, and blue is at the
// minimum of 0.17, the setup is done as:
// DC_LUT_RW_MODE = 0x00000000
// DC_LUT_RW_INDEX = 0x00000000
// DC_LUT_WRITE_EN_MASK = 0x00000007
// DC_LUT_30_COLOR = 0x00000000
// DC_LUT_RW_INDEX = 0x00000001
// DC_LUT_30_COLOR = 0x17901000
// DC_LUT_RW_INDEX = 0x00000002
// DC_LUT_30_COLOR = 0x1AB02000
// ...
// DC_LUT_RW_INDEX = 0x000000FE
// DC_LUT_30_COLOR = 0x3FEFE3D2
// DC_LUT_RW_INDEX = 0x000000FF
// DC_LUT_30_COLOR = 0x3FFFF3E9
// DC_LUT_RW_INDEX = 0x00000100
// Read / write mode in bit 0: 0 - 256-entry table, 1 - PWL.
// Default: 0x00000000.
XE_GPU_REGISTER(0x1921, kDword, DC_LUT_RW_MODE)
// Read / write index. No lower and upper halves on the Xenos apparently, for
// the 256-entry table, the bits 0:7 are the index directly (unlike on the M56,
// not split into the index in 1:7 and the lower or upper 10 bits selection in
// 0:0, instead, on the Xenos, the index in 0:7 is just increased
// monotonically). For some reason though Direct3D 9 writes an index that
// overflows by one (0x100 for the 256-entry table, 0x80 for the 128-entry PWL
// gamma ramp) after setting up all the values. However, the index is 8-bit, and
// for PWL, according to the M56 documentation, the bit 7 is not used.
// Default: 0x00000000.
XE_GPU_REGISTER(0x1922, kDword, DC_LUT_RW_INDEX)
// Sequential 10-bit R, G, B host read / write for the 256-entry table. After
// reset or writing DC_LUT_RW_INDEX, the first access is for the red component,
// the second is for green, the third is for blue, and after blue is accessed,
// the LUT index is increased by 1 (without having to explicitly change
// DC_LUT_RW_INDEX). Bits 0:5 are hardwired to zero.
// Default: 0x00000000.
XE_GPU_REGISTER(0x1923, kDword, DC_LUT_SEQ_COLOR)
// Read / write, 0:15 - base, 16:31 - delta. Bits 0:5 of both the base and the
// delta are hardwired to zero. The LUT index is increased by 1 when
// DC_LUT_PWL_DATA is accessed, though three DC_LUT_PWL_DATA writes are done for
// one entry (the order is likely R, G, B, similar to DC_LUT_SEQ_COLOR, but this
// hasn't been verified yet as no games using the PWL gamma ramp with separate
// settings for each channel have been found yet).
// Default: 0x00000000.
XE_GPU_REGISTER(0x1924, kDword, DC_LUT_PWL_DATA)
// Read / write, 0:9 - blue, 10:19 - green, 20:29 - red. The LUT index is
// increased by 1 when DC_LUT_30_COLOR is accessed.
// Default: 0x00000000.
XE_GPU_REGISTER(0x1925, kDword, DC_LUT_30_COLOR)
// Only LUT pipe 1 on the Xenos apparently (Direct3D 9 sets DC_LUT_WRITE_EN_MASK
// to 0b111 before writing the gamma ramp), 3 bits set, rather than 6 on the
// M56.
// Bit 0 - blue write enable mask.
// Bit 1 - green write enable mask.
// Bit 2 - red write enable mask.
// Default: 0x00000007 (though 0x0000003F on the M56 where there are two pipes).
XE_GPU_REGISTER(0x1927, kDword, DC_LUT_WRITE_EN_MASK)
// Single set of parameters for all channels apparently unlike on the M56
// (4D5307E6 sets DC_LUTA_CONTROL to 0x00000003 for the data increment of 8 in
// the 128-entry PWL gamma ramp for a 10bpc framebuffer). Also set not only
// during setup, but also apparently during every swap by Direct3D 9, though not
// directly in all games (happens in 4B4F07FE and 4D5307E6 even without proper
// VdSwap emulation, but in 535107D4, with a fake VdSwap packet rather than the
// real ones, the register is not set at all, though the expected behavior is
// that of the value of 0x00000000).
// Default: 0x00000000.
XE_GPU_REGISTER(0x1930, kDword, DC_LUTA_CONTROL)
XE_GPU_REGISTER(0x1961, kDword, AVIVO_D1MODE_VIEWPORT_SIZE)

View File

@ -825,6 +825,68 @@ union alignas(uint32_t) RB_COPY_DEST_PITCH {
};
static_assert_size(RB_COPY_DEST_PITCH, sizeof(uint32_t));
/*******************************************************************************
___ ___ ___ ___ _ ___ __
| \_ _/ __| _ \ | /_\ \ / /
| |) | |\__ \ _/ |__ / _ \ V /
|___/___|___/_| |____/_/ \_\_|
___ ___ _ _ _____ ___ ___ _ _ ___ ___
/ __/ _ \| \| |_ _| _ \/ _ \| | | | | __| _ \
| (_| (_) | .` | | | | / (_) | |__| |__| _|| /
\___\___/|_|\_| |_| |_|_\\___/|____|____|___|_|_\
*******************************************************************************/
// Gamma ramp (LUT) read / write index register. Only the low 8 bits form the
// index.
union alignas(uint32_t) DC_LUT_RW_INDEX {
uint32_t value;
struct {
// Unlike in the M56 documentation, for the 256-entry table, this is the
// absolute index, without the lower or upper 10 bits selection in the
// bit 0. For PWL, the bit 7 is ignored.
uint32_t rw_index : 8; // +0
};
static constexpr Register register_index = XE_GPU_REG_DC_LUT_RW_INDEX;
};
static_assert_size(DC_LUT_RW_INDEX, sizeof(uint32_t));
// Register for sequential per-component access to the gamma ramp: a single
// 16-bit color value in the low half of the register.
union alignas(uint32_t) DC_LUT_SEQ_COLOR {
uint32_t value;
struct {
uint32_t seq_color : 16; // +0, bits 0:5 are hardwired to zero
};
static constexpr Register register_index = XE_GPU_REG_DC_LUT_SEQ_COLOR;
};
static_assert_size(DC_LUT_SEQ_COLOR, sizeof(uint32_t));
// One piecewise linear (PWL) gamma ramp value: a 16-bit base in the low half
// and a 16-bit delta in the high half of the register.
union alignas(uint32_t) DC_LUT_PWL_DATA {
uint32_t value;
struct {
// See the M56 DC_LUTA_CONTROL for information about the way these should be
// interpreted (`output = base + (multiplier * delta) / 2^increment`, where
// the increment is the value specified in DC_LUTA_CONTROL for the specific
// color channel, the base is 7 bits of the front buffer value above
// `increment` bits, the multiplier is the lower `increment` bits of it; the
// increment is nonzero, otherwise the 256-entry table should be used
// instead).
uint32_t base : 16; // +0, bits 0:5 are hardwired to zero
uint32_t delta : 16; // +16, bits 0:5 are hardwired to zero
};
static constexpr Register register_index = XE_GPU_REG_DC_LUT_PWL_DATA;
};
static_assert_size(DC_LUT_PWL_DATA, sizeof(uint32_t));
// One entry of the 256-entry gamma ramp table: three 10-bit color components
// packed with blue in the lowest bits, then green, then red; the top 2 bits
// are not part of any field.
union alignas(uint32_t) DC_LUT_30_COLOR {
uint32_t value;
struct {
uint32_t color_10_blue : 10; // +0
uint32_t color_10_green : 10; // +10
uint32_t color_10_red : 10; // +20
};
static constexpr Register register_index = XE_GPU_REG_DC_LUT_30_COLOR;
};
static_assert_size(DC_LUT_30_COLOR, sizeof(uint32_t));
} // namespace reg
} // namespace gpu

View File

@ -32,10 +32,9 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) {
}
// UNORM conversion according to the Direct3D 10+ rules.
uint3 input = uint3(xe_apply_gamma_source[xe_thread_id.xy] * 1023.0f + 0.5f);
// The ramp is BGR, not RGB.
float3 output = float3(XeApplyPWLGamma(input.r, 2u),
float3 output = float3(XeApplyPWLGamma(input.r, 0u),
XeApplyPWLGamma(input.g, 1u),
XeApplyPWLGamma(input.b, 0u));
XeApplyPWLGamma(input.b, 2u));
xe_apply_gamma_dest[xe_thread_id.xy] =
float4(output, XeApplyGammaGetAlpha(output));
}

View File

@ -14,7 +14,8 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) {
}
// UNORM conversion according to the Direct3D 10+ rules.
uint3 input = uint3(xe_apply_gamma_source[xe_thread_id.xy] * 255.0f + 0.5f);
// The ramp is BGR, not RGB.
// The ramp has blue in bits 0:9, green in 10:19, red in 20:29 - BGR passed as
// an R10G10B10A2 buffer.
float3 output = float3(xe_apply_gamma_ramp[input.r].b,
xe_apply_gamma_ramp[input.g].g,
xe_apply_gamma_ramp[input.b].r);

View File

@ -55,17 +55,17 @@ ld r0.xyz, r0.xyzw, T0[0].xyzw
mad r0.xyz, r0.xyzx, l(1023.000000, 1023.000000, 1023.000000, 0.000000), l(0.500000, 0.500000, 0.500000, 0.000000)
ftou r0.xyz, r0.xyzx
ushr r1.xyz, r0.xyzx, l(3, 3, 3, 0)
imul null, r0.w, r1.z, l(3)
imad r1.xy, r1.xyxx, l(3, 3, 0, 0), l(2, 1, 0, 0)
ld r1.xz, r1.xxxx, T1[1].xzyw
utof r1.x, r1.x
imul null, r0.w, r1.x, l(3)
ld r1.xw, r0.wwww, T1[1].xzwy
utof r0.w, r1.x
and r0.xyz, r0.xyzx, l(7, 7, 7, 0)
imul null, r0.x, r1.z, r0.x
imul null, r0.x, r1.w, r0.x
utof r0.x, r0.x
mad r0.x, r0.x, l(0.125000), r1.x
mad r0.x, r0.x, l(0.125000), r0.w
mul r0.x, r0.x, l(0.000015)
min r2.x, r0.x, l(1.000000)
ld r1.xy, r1.yyyy, T1[1].xyzw
imad r0.xw, r1.yyyz, l(3, 0, 0, 3), l(1, 0, 0, 2)
ld r1.xy, r0.xxxx, T1[1].xyzw
utof r0.x, r1.x
imul null, r0.y, r0.y, r1.y
utof r0.y, r0.y
@ -86,10 +86,10 @@ ret
const BYTE apply_gamma_pwl_cs[] =
{
68, 88, 66, 67, 180, 180,
222, 28, 4, 138, 188, 113,
52, 97, 214, 88, 116, 106,
105, 240, 1, 0, 0, 0,
68, 88, 66, 67, 134, 193,
189, 188, 150, 246, 151, 78,
29, 10, 33, 117, 212, 145,
204, 130, 1, 0, 0, 0,
128, 7, 0, 0, 5, 0,
0, 0, 52, 0, 0, 0,
24, 2, 0, 0, 40, 2,
@ -257,26 +257,16 @@ const BYTE apply_gamma_pwl_cs[] =
0, 0, 0, 0, 38, 0,
0, 8, 0, 208, 0, 0,
130, 0, 16, 0, 0, 0,
0, 0, 42, 0, 16, 0,
0, 0, 10, 0, 16, 0,
1, 0, 0, 0, 1, 64,
0, 0, 3, 0, 0, 0,
35, 0, 0, 15, 50, 0,
45, 0, 0, 8, 146, 0,
16, 0, 1, 0, 0, 0,
70, 0, 16, 0, 1, 0,
0, 0, 2, 64, 0, 0,
3, 0, 0, 0, 3, 0,
0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 2, 64,
0, 0, 2, 0, 0, 0,
1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0,
45, 0, 0, 8, 82, 0,
16, 0, 1, 0, 0, 0,
6, 0, 16, 0, 1, 0,
0, 0, 134, 125, 32, 0,
246, 15, 16, 0, 0, 0,
0, 0, 134, 119, 32, 0,
1, 0, 0, 0, 1, 0,
0, 0, 86, 0, 0, 5,
18, 0, 16, 0, 1, 0,
130, 0, 16, 0, 0, 0,
0, 0, 10, 0, 16, 0,
1, 0, 0, 0, 1, 0,
0, 10, 114, 0, 16, 0,
@ -288,7 +278,7 @@ const BYTE apply_gamma_pwl_cs[] =
0, 0, 38, 0, 0, 8,
0, 208, 0, 0, 18, 0,
16, 0, 0, 0, 0, 0,
42, 0, 16, 0, 1, 0,
58, 0, 16, 0, 1, 0,
0, 0, 10, 0, 16, 0,
0, 0, 0, 0, 86, 0,
0, 5, 18, 0, 16, 0,
@ -298,8 +288,8 @@ const BYTE apply_gamma_pwl_cs[] =
16, 0, 0, 0, 0, 0,
10, 0, 16, 0, 0, 0,
0, 0, 1, 64, 0, 0,
0, 0, 0, 62, 10, 0,
16, 0, 1, 0, 0, 0,
0, 0, 0, 62, 58, 0,
16, 0, 0, 0, 0, 0,
56, 0, 0, 7, 18, 0,
16, 0, 0, 0, 0, 0,
10, 0, 16, 0, 0, 0,
@ -309,10 +299,20 @@ const BYTE apply_gamma_pwl_cs[] =
2, 0, 0, 0, 10, 0,
16, 0, 0, 0, 0, 0,
1, 64, 0, 0, 0, 0,
128, 63, 45, 0, 0, 8,
128, 63, 35, 0, 0, 15,
146, 0, 16, 0, 0, 0,
0, 0, 86, 9, 16, 0,
1, 0, 0, 0, 2, 64,
0, 0, 3, 0, 0, 0,
0, 0, 0, 0, 0, 0,
0, 0, 3, 0, 0, 0,
2, 64, 0, 0, 1, 0,
0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 2, 0,
0, 0, 45, 0, 0, 8,
50, 0, 16, 0, 1, 0,
0, 0, 86, 5, 16, 0,
1, 0, 0, 0, 70, 126,
0, 0, 6, 0, 16, 0,
0, 0, 0, 0, 70, 126,
32, 0, 1, 0, 0, 0,
1, 0, 0, 0, 86, 0,
0, 5, 18, 0, 16, 0,

View File

@ -55,17 +55,17 @@ ld r0.xyz, r0.xyzw, T0[0].xyzw
mad r0.xyz, r0.xyzx, l(1023.000000, 1023.000000, 1023.000000, 0.000000), l(0.500000, 0.500000, 0.500000, 0.000000)
ftou r0.xyz, r0.xyzx
ushr r1.xyz, r0.xyzx, l(3, 3, 3, 0)
imul null, r0.w, r1.z, l(3)
imad r1.xy, r1.xyxx, l(3, 3, 0, 0), l(2, 1, 0, 0)
ld r1.xz, r1.xxxx, T1[1].xzyw
utof r1.x, r1.x
imul null, r0.w, r1.x, l(3)
ld r1.xw, r0.wwww, T1[1].xzwy
utof r0.w, r1.x
and r0.xyz, r0.xyzx, l(7, 7, 7, 0)
imul null, r0.x, r1.z, r0.x
imul null, r0.x, r1.w, r0.x
utof r0.x, r0.x
mad r0.x, r0.x, l(0.125000), r1.x
mad r0.x, r0.x, l(0.125000), r0.w
mul r0.x, r0.x, l(0.000015)
min r2.x, r0.x, l(1.000000)
ld r1.xy, r1.yyyy, T1[1].xyzw
imad r0.xw, r1.yyyz, l(3, 0, 0, 3), l(1, 0, 0, 2)
ld r1.xy, r0.xxxx, T1[1].xyzw
utof r0.x, r1.x
imul null, r0.y, r0.y, r1.y
utof r0.y, r0.y
@ -86,10 +86,10 @@ ret
const BYTE apply_gamma_pwl_fxaa_luma_cs[] =
{
68, 88, 66, 67, 165, 122,
242, 36, 160, 218, 193, 67,
37, 43, 138, 45, 109, 219,
226, 109, 1, 0, 0, 0,
68, 88, 66, 67, 115, 68,
69, 234, 116, 212, 118, 193,
71, 10, 44, 165, 244, 209,
63, 198, 1, 0, 0, 0,
148, 7, 0, 0, 5, 0,
0, 0, 52, 0, 0, 0,
24, 2, 0, 0, 40, 2,
@ -257,26 +257,16 @@ const BYTE apply_gamma_pwl_fxaa_luma_cs[] =
0, 0, 0, 0, 38, 0,
0, 8, 0, 208, 0, 0,
130, 0, 16, 0, 0, 0,
0, 0, 42, 0, 16, 0,
0, 0, 10, 0, 16, 0,
1, 0, 0, 0, 1, 64,
0, 0, 3, 0, 0, 0,
35, 0, 0, 15, 50, 0,
45, 0, 0, 8, 146, 0,
16, 0, 1, 0, 0, 0,
70, 0, 16, 0, 1, 0,
0, 0, 2, 64, 0, 0,
3, 0, 0, 0, 3, 0,
0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 2, 64,
0, 0, 2, 0, 0, 0,
1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0,
45, 0, 0, 8, 82, 0,
16, 0, 1, 0, 0, 0,
6, 0, 16, 0, 1, 0,
0, 0, 134, 125, 32, 0,
246, 15, 16, 0, 0, 0,
0, 0, 134, 119, 32, 0,
1, 0, 0, 0, 1, 0,
0, 0, 86, 0, 0, 5,
18, 0, 16, 0, 1, 0,
130, 0, 16, 0, 0, 0,
0, 0, 10, 0, 16, 0,
1, 0, 0, 0, 1, 0,
0, 10, 114, 0, 16, 0,
@ -288,7 +278,7 @@ const BYTE apply_gamma_pwl_fxaa_luma_cs[] =
0, 0, 38, 0, 0, 8,
0, 208, 0, 0, 18, 0,
16, 0, 0, 0, 0, 0,
42, 0, 16, 0, 1, 0,
58, 0, 16, 0, 1, 0,
0, 0, 10, 0, 16, 0,
0, 0, 0, 0, 86, 0,
0, 5, 18, 0, 16, 0,
@ -298,8 +288,8 @@ const BYTE apply_gamma_pwl_fxaa_luma_cs[] =
16, 0, 0, 0, 0, 0,
10, 0, 16, 0, 0, 0,
0, 0, 1, 64, 0, 0,
0, 0, 0, 62, 10, 0,
16, 0, 1, 0, 0, 0,
0, 0, 0, 62, 58, 0,
16, 0, 0, 0, 0, 0,
56, 0, 0, 7, 18, 0,
16, 0, 0, 0, 0, 0,
10, 0, 16, 0, 0, 0,
@ -309,10 +299,20 @@ const BYTE apply_gamma_pwl_fxaa_luma_cs[] =
2, 0, 0, 0, 10, 0,
16, 0, 0, 0, 0, 0,
1, 64, 0, 0, 0, 0,
128, 63, 45, 0, 0, 8,
128, 63, 35, 0, 0, 15,
146, 0, 16, 0, 0, 0,
0, 0, 86, 9, 16, 0,
1, 0, 0, 0, 2, 64,
0, 0, 3, 0, 0, 0,
0, 0, 0, 0, 0, 0,
0, 0, 3, 0, 0, 0,
2, 64, 0, 0, 1, 0,
0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 2, 0,
0, 0, 45, 0, 0, 8,
50, 0, 16, 0, 1, 0,
0, 0, 86, 5, 16, 0,
1, 0, 0, 0, 70, 126,
0, 0, 6, 0, 16, 0,
0, 0, 0, 0, 70, 126,
32, 0, 1, 0, 0, 0,
1, 0, 0, 0, 86, 0,
0, 5, 18, 0, 16, 0,

View File

@ -9,8 +9,11 @@
#include "xenia/gpu/trace_player.h"
#include <memory>
#include "xenia/gpu/command_processor.h"
#include "xenia/gpu/graphics_system.h"
#include "xenia/gpu/registers.h"
#include "xenia/gpu/xenos.h"
#include "xenia/memory.h"
@ -33,8 +36,6 @@ TracePlayer::TracePlayer(GraphicsSystem* graphics_system)
assert_not_null(playback_event_);
}
TracePlayer::~TracePlayer() { delete[] edram_snapshot_; }
const TraceReader::Frame* TracePlayer::current_frame() const {
if (current_frame_index_ >= frame_count()) {
return nullptr;
@ -197,13 +198,12 @@ void TracePlayer::PlayTraceOnThread(const uint8_t* trace_data,
case TraceCommandType::kEdramSnapshot: {
auto cmd = reinterpret_cast<const EdramSnapshotCommand*>(trace_ptr);
trace_ptr += sizeof(*cmd);
if (!edram_snapshot_) {
edram_snapshot_ = new uint8_t[xenos::kEdramSizeBytes];
}
std::unique_ptr<uint8_t[]> edram_snapshot(
new uint8_t[xenos::kEdramSizeBytes]);
DecompressMemory(cmd->encoding_format, trace_ptr, cmd->encoded_length,
edram_snapshot_, xenos::kEdramSizeBytes);
edram_snapshot.get(), xenos::kEdramSizeBytes);
trace_ptr += cmd->encoded_length;
command_processor->RestoreEdramSnapshot(edram_snapshot_);
command_processor->RestoreEdramSnapshot(edram_snapshot.get());
break;
}
case TraceCommandType::kEvent: {
@ -219,6 +219,34 @@ void TracePlayer::PlayTraceOnThread(const uint8_t* trace_data,
}
break;
}
case TraceCommandType::kRegisters: {
auto cmd = reinterpret_cast<const RegistersCommand*>(trace_ptr);
trace_ptr += sizeof(*cmd);
std::unique_ptr<uint32_t[]> register_values(
new uint32_t[cmd->register_count]);
DecompressMemory(cmd->encoding_format, trace_ptr, cmd->encoded_length,
register_values.get(),
sizeof(uint32_t) * cmd->register_count);
trace_ptr += cmd->encoded_length;
command_processor->RestoreRegisters(
cmd->first_register, register_values.get(), cmd->register_count,
cmd->execute_callbacks);
break;
}
case TraceCommandType::kGammaRamp: {
auto cmd = reinterpret_cast<const GammaRampCommand*>(trace_ptr);
trace_ptr += sizeof(*cmd);
std::unique_ptr<uint32_t[]> gamma_ramps(new uint32_t[256 + 3 * 128]);
DecompressMemory(cmd->encoding_format, trace_ptr, cmd->encoded_length,
gamma_ramps.get(), sizeof(uint32_t) * (256 + 3 * 128));
trace_ptr += cmd->encoded_length;
command_processor->RestoreGammaRamp(
reinterpret_cast<const reg::DC_LUT_30_COLOR*>(gamma_ramps.get()),
reinterpret_cast<const reg::DC_LUT_PWL_DATA*>(gamma_ramps.get() +
256),
cmd->rw_component);
break;
}
}
}

View File

@ -30,7 +30,6 @@ enum class TracePlaybackMode {
class TracePlayer : public TraceReader {
public:
TracePlayer(GraphicsSystem* graphics_system);
~TracePlayer() override;
GraphicsSystem* graphics_system() const { return graphics_system_; }
void SetPresentLastCopy(bool present_last_copy) {
@ -66,7 +65,6 @@ class TracePlayer : public TraceReader {
bool playing_trace_ = false;
std::atomic<uint32_t> playback_percent_ = {0};
std::unique_ptr<xe::threading::Event> playback_event_;
uint8_t* edram_snapshot_ = nullptr;
};
} // namespace gpu

View File

@ -53,6 +53,8 @@ enum class TraceCommandType : uint32_t {
kMemoryWrite,
kEdramSnapshot,
kEvent,
kRegisters,
kGammaRamp,
};
struct PrimaryBufferStartCommand {
@ -134,6 +136,40 @@ struct EventCommand {
Type event_type;
};
// Represents a range of registers. In the trace file this header is followed
// by encoded_length bytes which, once decoded, hold register_count 32-bit
// register values for the registers starting at first_register.
struct RegistersCommand {
// Command tag - TraceCommandType::kRegisters for this command.
TraceCommandType type;
// Index of the first register in the range.
uint32_t first_register;
// Number of 32-bit register values stored after this header.
uint32_t register_count;
// Whether to set the registers via WriteRegister, which may have side
// effects, rather than by copying them directly to the register file.
bool execute_callbacks;
// Encoding format of the values in the trace file.
MemoryEncodingFormat encoding_format;
// Number of bytes the values occupy in the trace file in their encoded form.
// If no encoding is used, this will be sizeof(uint32_t) * register_count.
uint32_t encoded_length;
};
// Represents a gamma ramp. In the trace file this header is followed by
// encoded_length bytes which, once decoded, hold 256 DC_LUT_30_COLOR values
// and then 128 interleaved RGB DC_LUT_PWL_DATA values (3 * 128 entries) - that
// is, 256 + 3 * 128 32-bit words in total.
// Assuming that all other gamma ramp state is saved as plain registers.
struct GammaRampCommand {
// Command tag - TraceCommandType::kGammaRamp for this command.
TraceCommandType type;
// The component index (0 = red, 1 = green, 2 = blue) for the next
// DC_LUT_SEQ_COLOR or DC_LUT_PWL_DATA read or write.
uint8_t rw_component;
// Encoding format of the ramps in the trace file.
MemoryEncodingFormat encoding_format;
// Number of bytes the ramps occupy in the trace file in their encoded form.
// If no encoding is used, this will be sizeof(uint32_t) * (256 + 3 * 128).
uint32_t encoded_length;
};
} // namespace gpu
} // namespace xe

View File

@ -205,6 +205,16 @@ void TraceReader::ParseTrace() {
}
break;
}
case TraceCommandType::kRegisters: {
auto cmd = reinterpret_cast<const RegistersCommand*>(trace_ptr);
trace_ptr += sizeof(*cmd) + cmd->encoded_length;
break;
}
case TraceCommandType::kGammaRamp: {
auto cmd = reinterpret_cast<const GammaRampCommand*>(trace_ptr);
trace_ptr += sizeof(*cmd) + cmd->encoded_length;
break;
}
default:
// Broken trace file?
assert_unhandled_case(type);
@ -218,8 +228,8 @@ void TraceReader::ParseTrace() {
}
bool TraceReader::DecompressMemory(MemoryEncodingFormat encoding_format,
const uint8_t* src, size_t src_size,
uint8_t* dest, size_t dest_size) {
const void* src, size_t src_size, void* dest,
size_t dest_size) {
switch (encoding_format) {
case MemoryEncodingFormat::kNone:
assert_true(src_size == dest_size);

View File

@ -135,9 +135,8 @@ class TraceReader {
protected:
void ParseTrace();
bool DecompressMemory(MemoryEncodingFormat encoding_format,
const uint8_t* src, size_t src_size, uint8_t* dest,
size_t dest_size);
bool DecompressMemory(MemoryEncodingFormat encoding_format, const void* src,
size_t src_size, void* dest, size_t dest_size);
std::unique_ptr<MappedMemory> mmap_;
const uint8_t* trace_data_ = nullptr;

View File

@ -10,6 +10,7 @@
#include "xenia/gpu/trace_writer.h"
#include <cstring>
#include <memory>
#include "third_party/snappy/snappy-sinksource.h"
#include "third_party/snappy/snappy.h"
@ -19,6 +20,7 @@
#include "xenia/base/filesystem.h"
#include "xenia/base/logging.h"
#include "xenia/base/string.h"
#include "xenia/gpu/registers.h"
#include "xenia/gpu/xenos.h"
namespace xe {
@ -194,7 +196,7 @@ class SnappySink : public snappy::Sink {
void TraceWriter::WriteMemoryCommand(TraceCommandType type, uint32_t base_ptr,
size_t length, const void* host_ptr) {
MemoryCommand cmd;
MemoryCommand cmd = {};
cmd.type = type;
cmd.base_ptr = base_ptr;
cmd.encoding_format = MemoryEncodingFormat::kNone;
@ -232,8 +234,9 @@ void TraceWriter::WriteMemoryCommand(TraceCommandType type, uint32_t base_ptr,
}
void TraceWriter::WriteEdramSnapshot(const void* snapshot) {
EdramSnapshotCommand cmd;
EdramSnapshotCommand cmd = {};
cmd.type = TraceCommandType::kEdramSnapshot;
if (compress_output_) {
// Write the header now so we reserve space in the buffer.
long header_position = std::ftell(file_);
@ -272,5 +275,93 @@ void TraceWriter::WriteEvent(EventCommand::Type event_type) {
fwrite(&cmd, 1, sizeof(cmd), file_);
}
void TraceWriter::WriteRegisters(uint32_t first_register,
const uint32_t* register_values,
uint32_t register_count,
bool execute_callbacks_on_play) {
RegistersCommand cmd = {};
cmd.type = TraceCommandType::kRegisters;
cmd.first_register = first_register;
cmd.register_count = register_count;
cmd.execute_callbacks = execute_callbacks_on_play;
uint32_t uncompressed_length = uint32_t(sizeof(uint32_t) * register_count);
if (compress_output_) {
// Write the header now so we reserve space in the buffer.
long header_position = std::ftell(file_);
cmd.encoding_format = MemoryEncodingFormat::kSnappy;
fwrite(&cmd, 1, sizeof(cmd), file_);
// Stream the content right to the buffer.
snappy::ByteArraySource snappy_source(
reinterpret_cast<const char*>(register_values), uncompressed_length);
SnappySink snappy_sink(file_);
cmd.encoded_length =
static_cast<uint32_t>(snappy::Compress(&snappy_source, &snappy_sink));
// Seek back and overwrite the header with our final size.
std::fseek(file_, header_position, SEEK_SET);
fwrite(&cmd, 1, sizeof(cmd), file_);
std::fseek(file_, header_position + sizeof(cmd) + cmd.encoded_length,
SEEK_SET);
} else {
// Uncompressed - write the values directly to the file.
cmd.encoding_format = MemoryEncodingFormat::kNone;
cmd.encoded_length = uncompressed_length;
fwrite(&cmd, 1, sizeof(cmd), file_);
fwrite(register_values, 1, uncompressed_length, file_);
}
}
// Appends a kGammaRamp command to the trace file: a GammaRampCommand header
// followed by the 256 DC_LUT_30_COLOR entries of gamma_ramp_256_entry_table
// and then the 3 * 128 DC_LUT_PWL_DATA entries of gamma_ramp_pwl_rgb, either
// raw or Snappy-compressed depending on compress_output_.
//
// gamma_ramp_rw_component is truncated to 8 bits and stored in the header's
// rw_component field.
void TraceWriter::WriteGammaRamp(
    const reg::DC_LUT_30_COLOR* gamma_ramp_256_entry_table,
    const reg::DC_LUT_PWL_DATA* gamma_ramp_pwl_rgb,
    uint32_t gamma_ramp_rw_component) {
  constexpr uint32_t kTableBytes = sizeof(reg::DC_LUT_30_COLOR) * 256;
  constexpr uint32_t kPWLBytes = sizeof(reg::DC_LUT_PWL_DATA) * 3 * 128;
  constexpr uint32_t kTotalBytes = kTableBytes + kPWLBytes;

  GammaRampCommand header = {};
  header.type = TraceCommandType::kGammaRamp;
  header.rw_component = uint8_t(gamma_ramp_rw_component);

  if (!compress_output_) {
    // Plain path - the header, then the table, then the PWL ramp, all raw.
    header.encoding_format = MemoryEncodingFormat::kNone;
    header.encoded_length = kTotalBytes;
    fwrite(&header, 1, sizeof(header), file_);
    fwrite(gamma_ramp_256_entry_table, 1, kTableBytes, file_);
    fwrite(gamma_ramp_pwl_rgb, 1, kPWLBytes, file_);
    return;
  }

  // Compressed path - the encoded size is only known once compression is
  // done, so a provisional header is emitted first to reserve its bytes.
  header.encoding_format = MemoryEncodingFormat::kSnappy;
  const long header_offset = std::ftell(file_);
  fwrite(&header, 1, sizeof(header), file_);

  {
    // Both ramps are compressed as a single stream, so they are first joined
    // (table first, PWL second) in a temporary buffer that only lives for
    // the duration of the compression.
    std::unique_ptr<char[]> staging(new char[kTotalBytes]);
    char* const table_dest = staging.get();
    char* const pwl_dest = table_dest + kTableBytes;
    std::memcpy(table_dest, gamma_ramp_256_entry_table, kTableBytes);
    std::memcpy(pwl_dest, gamma_ramp_pwl_rgb, kPWLBytes);

    snappy::ByteArraySource source(staging.get(), kTotalBytes);
    SnappySink sink(file_);
    header.encoded_length =
        static_cast<uint32_t>(snappy::Compress(&source, &sink));
  }

  // Go back and rewrite the header with the final size, then reposition the
  // file right past the encoded payload.
  std::fseek(file_, header_offset, SEEK_SET);
  fwrite(&header, 1, sizeof(header), file_);
  std::fseek(file_, header_offset + sizeof(header) + header.encoded_length,
             SEEK_SET);
}
} // namespace gpu
} // namespace xe

View File

@ -14,6 +14,7 @@
#include <set>
#include <string>
#include "xenia/gpu/registers.h"
#include "xenia/gpu/trace_protocol.h"
namespace xe {
@ -44,6 +45,11 @@ class TraceWriter {
const void* host_ptr = nullptr);
void WriteEdramSnapshot(const void* snapshot);
void WriteEvent(EventCommand::Type event_type);
// Writes a kRegisters command holding register_count 32-bit values from
// register_values for the registers starting at first_register.
// execute_callbacks_on_play is stored in the command's execute_callbacks
// field, which tells the player whether to set the registers via
// WriteRegister rather than copying them directly to the register file.
void WriteRegisters(uint32_t first_register, const uint32_t* register_values,
uint32_t register_count, bool execute_callbacks_on_play);
// Writes a kGammaRamp command holding 256 entries read from
// gamma_ramp_256_entry_table followed by 3 * 128 entries read from
// gamma_ramp_pwl_rgb. gamma_ramp_rw_component is stored (as 8 bits) in the
// command's rw_component field.
void WriteGammaRamp(const reg::DC_LUT_30_COLOR* gamma_ramp_256_entry_table,
const reg::DC_LUT_PWL_DATA* gamma_ramp_pwl_rgb,
uint32_t gamma_ramp_rw_component);
private:
void WriteMemoryCommand(TraceCommandType type, uint32_t base_ptr,

View File

@ -191,20 +191,6 @@ void VulkanCommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
offset ^= 0x1F;
dirty_loop_constants_ |= (1 << offset);
} else if (index == XE_GPU_REG_DC_LUT_PWL_DATA) {
UpdateGammaRampValue(GammaRampType::kPWL, value);
} else if (index == XE_GPU_REG_DC_LUT_30_COLOR) {
UpdateGammaRampValue(GammaRampType::kTable, value);
} else if (index >= XE_GPU_REG_DC_LUT_RW_MODE &&
index <= XE_GPU_REG_DC_LUTA_CONTROL) {
uint32_t offset = index - XE_GPU_REG_DC_LUT_RW_MODE;
offset ^= 0x05;
dirty_gamma_constants_ |= (1 << offset);
if (index == XE_GPU_REG_DC_LUT_RW_INDEX) {
gamma_ramp_rw_subindex_ = 0;
}
}
}
@ -1400,8 +1386,6 @@ bool VulkanCommandProcessor::IssueCopy() {
return true;
}
void VulkanCommandProcessor::InitializeTrace() {}
} // namespace vulkan
} // namespace gpu
} // namespace xe

View File

@ -98,8 +98,6 @@ class VulkanCommandProcessor : public CommandProcessor {
VulkanShader* pixel_shader);
bool IssueCopy() override;
void InitializeTrace() override;
uint64_t dirty_float_constants_ = 0; // Dirty float constants in blocks of 4
uint8_t dirty_bool_constants_ = 0;
uint32_t dirty_loop_constants_ = 0;