mirror of https://github.com/xemu-project/xemu.git
Merge branch 'master' into DeviceEmulation-SteelBattalionController
This commit is contained in:
commit
abba28216a
|
@ -4,6 +4,7 @@
|
|||
* Copyright (c) 2015 Jannik Vogel
|
||||
* Copyright (c) 2013 espes
|
||||
* Copyright (c) 2007-2010 The Nouveau Project.
|
||||
* Copyright (c) 2025 Matt Borgerson
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
|
@ -22,14 +23,27 @@
|
|||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
#include "qemu/osdep.h"
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "swizzle.h"
|
||||
|
||||
/* This should be pretty straightforward.
|
||||
* It creates a bit pattern like ..zyxzyxzyx from ..xxx, ..yyy and ..zzz
|
||||
* If there are no bits left from any component it will pack the other masks
|
||||
* more tightly (Example: zzxzxzyx = Fewer x than z and even fewer y)
|
||||
/*
|
||||
* Helpers for converting to and from swizzled (Z-ordered) texture formats.
|
||||
* Swizzled textures store pixels in a more cache-friendly layout for rendering
|
||||
* than linear textures.
|
||||
* Width, height, and depth must be powers of two.
|
||||
* See also:
|
||||
* https://en.wikipedia.org/wiki/Z-order_curve
|
||||
*/
|
||||
|
||||
/*
|
||||
* Create masks representing the interleaving of each linear texture dimension (x, y, z).
|
||||
* These can be used to map linear texture coordinates to a swizzled "Z" offset.
|
||||
* For example, a 2D 8x32 texture needs 3 bits for x, and 5 bits for y:
|
||||
* mask_x: 00010101
|
||||
* mask_y: 11101010
|
||||
* mask_z: 00000000
|
||||
* for "Z": yyyxyxyx
|
||||
*/
|
||||
static void generate_swizzle_masks(unsigned int width,
|
||||
unsigned int height,
|
||||
|
@ -49,41 +63,13 @@ static void generate_swizzle_masks(unsigned int width,
|
|||
if (bit < depth) { z |= mask_bit; mask_bit <<= 1; done = false; }
|
||||
bit <<= 1;
|
||||
} while(!done);
|
||||
assert((x ^ y ^ z) == (mask_bit - 1));
|
||||
assert((x ^ y ^ z) == (mask_bit - 1)); /* masks are mutually exclusive */
|
||||
*mask_x = x;
|
||||
*mask_y = y;
|
||||
*mask_z = z;
|
||||
}
|
||||
|
||||
/*
 * Scatter the low-order bits of `value` into the set-bit positions of
 * `pattern`, lowest bit first. If your value has bits abcd and your
 * pattern is 11010100100 this will return: 0a0b0c00d00
 *
 * Bits of `value` that do not fit (i.e. `value` has more significant bits
 * than `pattern` has set bits) are dropped. The scan stops once all 32 bit
 * positions have been visited, so the call always terminates.
 */
static uint32_t fill_pattern(uint32_t pattern, uint32_t value)
{
    uint32_t result = 0;

    /* `bit` wraps to 0 after bit 31 has been visited, which ends the loop. */
    for (uint32_t bit = 1; value != 0 && bit != 0; bit <<= 1) {
        if (pattern & bit) {
            /* Consume the next value bit and place it at this position. */
            if (value & 1) {
                result |= bit;
            }
            value >>= 1;
        }
    }

    return result;
}
|
||||
|
||||
/*
 * Compute the byte offset of texel (x, y, z) inside a swizzled texture.
 * Each coordinate is spread over the bit positions of its own mask, the
 * three results are OR-ed into one texel index, and that index is scaled
 * by bytes_per_pixel.
 */
static unsigned int get_swizzled_offset(
    unsigned int x, unsigned int y, unsigned int z,
    uint32_t mask_x, uint32_t mask_y, uint32_t mask_z,
    unsigned int bytes_per_pixel)
{
    const uint32_t part_x = fill_pattern(mask_x, x);
    const uint32_t part_y = fill_pattern(mask_y, y);
    const uint32_t part_z = fill_pattern(mask_z, z);
    const uint32_t texel_index = part_x | part_y | part_z;

    return bytes_per_pixel * texel_index;
}
|
||||
|
||||
void swizzle_box(
|
||||
static inline void swizzle_box_internal(
|
||||
const uint8_t *src_buf,
|
||||
unsigned int width,
|
||||
unsigned int height,
|
||||
|
@ -96,23 +82,40 @@ void swizzle_box(
|
|||
uint32_t mask_x, mask_y, mask_z;
|
||||
generate_swizzle_masks(width, height, depth, &mask_x, &mask_y, &mask_z);
|
||||
|
||||
/*
|
||||
* Map linear texture to swizzled texture using swizzle masks.
|
||||
* https://fgiesen.wordpress.com/2011/01/17/texture-tiling-and-swizzling/
|
||||
*/
|
||||
|
||||
int x, y, z;
|
||||
int off_z = 0;
|
||||
for (z = 0; z < depth; z++) {
|
||||
int off_y = 0;
|
||||
for (y = 0; y < height; y++) {
|
||||
int off_x = 0;
|
||||
const uint8_t *src_tmp = src_buf + y * row_pitch;
|
||||
uint8_t *dst_tmp = dst_buf + (off_y + off_z) * bytes_per_pixel;
|
||||
for (x = 0; x < width; x++) {
|
||||
const uint8_t *src = src_buf
|
||||
+ y * row_pitch + x * bytes_per_pixel;
|
||||
uint8_t *dst = dst_buf +
|
||||
get_swizzled_offset(x, y, z, mask_x, mask_y, mask_z,
|
||||
bytes_per_pixel);
|
||||
const uint8_t *src = src_tmp + x * bytes_per_pixel;
|
||||
uint8_t *dst = dst_tmp + off_x * bytes_per_pixel;
|
||||
memcpy(dst, src, bytes_per_pixel);
|
||||
|
||||
/*
|
||||
* Increment x offset, letting the increment
|
||||
* ripple through bits that aren't in the mask.
|
||||
* Equivalent to:
|
||||
* off_x = (off_x + (~mask_x + 1)) & mask_x;
|
||||
*/
|
||||
off_x = (off_x - mask_x) & mask_x;
|
||||
}
|
||||
off_y = (off_y - mask_y) & mask_y;
|
||||
}
|
||||
src_buf += slice_pitch;
|
||||
off_z = (off_z - mask_z) & mask_z;
|
||||
}
|
||||
}
|
||||
|
||||
void unswizzle_box(
|
||||
static inline void unswizzle_box_internal(
|
||||
const uint8_t *src_buf,
|
||||
unsigned int width,
|
||||
unsigned int height,
|
||||
|
@ -126,38 +129,56 @@ void unswizzle_box(
|
|||
generate_swizzle_masks(width, height, depth, &mask_x, &mask_y, &mask_z);
|
||||
|
||||
int x, y, z;
|
||||
int off_z = 0;
|
||||
for (z = 0; z < depth; z++) {
|
||||
int off_y = 0;
|
||||
for (y = 0; y < height; y++) {
|
||||
int off_x = 0;
|
||||
const uint8_t *src_tmp = src_buf + (off_y + off_z) * bytes_per_pixel;
|
||||
uint8_t *dst_tmp = dst_buf + y * row_pitch;
|
||||
for (x = 0; x < width; x++) {
|
||||
const uint8_t *src = src_buf
|
||||
+ get_swizzled_offset(x, y, z, mask_x, mask_y, mask_z,
|
||||
bytes_per_pixel);
|
||||
uint8_t *dst = dst_buf + y * row_pitch + x * bytes_per_pixel;
|
||||
const uint8_t *src = src_tmp + off_x * bytes_per_pixel;
|
||||
uint8_t *dst = dst_tmp + x * bytes_per_pixel;
|
||||
memcpy(dst, src, bytes_per_pixel);
|
||||
|
||||
off_x = (off_x - mask_x) & mask_x;
|
||||
}
|
||||
off_y = (off_y - mask_y) & mask_y;
|
||||
}
|
||||
dst_buf += slice_pitch;
|
||||
off_z = (off_z - mask_z) & mask_z;
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Unswizzle a 2D texture: forwards to unswizzle_box() with a depth of one
 * slice and a slice pitch of zero.
 */
void unswizzle_rect(
    const uint8_t *src_buf,
    unsigned int width,
    unsigned int height,
    uint8_t *dst_buf,
    unsigned int pitch,
    unsigned int bytes_per_pixel)
{
    const unsigned int depth = 1;
    const unsigned int slice_pitch = 0;

    unswizzle_box(src_buf, width, height, depth, dst_buf, pitch, slice_pitch,
                  bytes_per_pixel);
}
|
||||
/*
 * Multiversioned to optimize for common bytes_per_pixel.
 *
 * MULTIVERSION(m) defines the public function m() as a dispatcher over
 * m##_internal(): for 1 to 4 bytes per pixel the call passes a literal
 * (presumably so the compiler can specialize the inlined body), and any
 * other value is forwarded unchanged.
 */
#define C(m, bpp)                                                   \
    m##_internal(src_buf, width, height, depth, dst_buf, row_pitch, \
                 slice_pitch, bpp)
#define MULTIVERSION(m)                                                     \
    void m(const uint8_t *src_buf, unsigned int width, unsigned int height, \
           unsigned int depth, uint8_t *dst_buf, unsigned int row_pitch,    \
           unsigned int slice_pitch, unsigned int bytes_per_pixel)          \
    {                                                                       \
        switch (bytes_per_pixel) {                                          \
        case 1:                                                             \
            C(m, 1);                                                        \
            break;                                                          \
        case 2:                                                             \
            C(m, 2);                                                        \
            break;                                                          \
        case 3:                                                             \
            C(m, 3);                                                        \
            break;                                                          \
        case 4:                                                             \
            C(m, 4);                                                        \
            break;                                                          \
        default:                                                            \
            C(m, bytes_per_pixel);                                          \
            break;                                                          \
        }                                                                   \
    }
|
||||
|
||||
/*
 * Swizzle a 2D texture: forwards to swizzle_box() with a depth of one
 * slice and a slice pitch of zero.
 */
void swizzle_rect(
    const uint8_t *src_buf,
    unsigned int width,
    unsigned int height,
    uint8_t *dst_buf,
    unsigned int pitch,
    unsigned int bytes_per_pixel)
{
    const unsigned int depth = 1;
    const unsigned int slice_pitch = 0;

    swizzle_box(src_buf, width, height, depth, dst_buf, pitch, slice_pitch,
                bytes_per_pixel);
}
|
||||
MULTIVERSION(swizzle_box)
|
||||
MULTIVERSION(unswizzle_box)
|
||||
|
||||
#undef C
|
||||
#undef MULTIVERSION
|
||||
|
|
|
@ -43,20 +43,26 @@ void unswizzle_box(
|
|||
unsigned int slice_pitch,
|
||||
unsigned int bytes_per_pixel);
|
||||
|
||||
void unswizzle_rect(
|
||||
static inline void unswizzle_rect(
|
||||
const uint8_t *src_buf,
|
||||
unsigned int width,
|
||||
unsigned int height,
|
||||
uint8_t *dst_buf,
|
||||
unsigned int pitch,
|
||||
unsigned int bytes_per_pixel);
|
||||
unsigned int bytes_per_pixel)
|
||||
{
|
||||
unswizzle_box(src_buf, width, height, 1, dst_buf, pitch, 0, bytes_per_pixel);
|
||||
}
|
||||
|
||||
void swizzle_rect(
|
||||
static inline void swizzle_rect(
|
||||
const uint8_t *src_buf,
|
||||
unsigned int width,
|
||||
unsigned int height,
|
||||
uint8_t *dst_buf,
|
||||
unsigned int pitch,
|
||||
unsigned int bytes_per_pixel);
|
||||
unsigned int bytes_per_pixel)
|
||||
{
|
||||
swizzle_box(src_buf, width, height, 1, dst_buf, pitch, 0, bytes_per_pixel);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
|
@ -0,0 +1,24 @@
|
|||
# Build the swizzle crosscheck/benchmark tool.
# Note: the second CC assignment overrides the first, so gcc is used.
CC=clang
CC=gcc
CFLAGS=-O2 -Wall -g

swizzle-test: swizzle-test.o swizzle-a.o
	$(CC) -o $@ $^

swizzle-test.o: swizzle-test.c

# Rename the exported entry points so the test can link them as method "A".
swizzle-a.o: swizzle.o
	objcopy \
		--redefine-sym swizzle_box=swizzle_box_A \
		--redefine-sym unswizzle_box=unswizzle_box_A \
		$< $@

swizzle.o: ../../../hw/xbox/nv2a/pgraph/swizzle.c
	$(CC) -o $@ $(CFLAGS) -c $<

%.o: %.c
	$(CC) -o $@ $(CFLAGS) -c $<

.PHONY: clean
clean:
	rm -f swizzle-test swizzle-test.o swizzle.o swizzle-a.o
|
|
@ -0,0 +1,217 @@
|
|||
/*
|
||||
* Crosscheck and benchmark swizzle.
|
||||
*
|
||||
* Copyright (c) 2025 Matt Borgerson
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include <time.h>
|
||||
|
||||
#define X_METHODS \
|
||||
X(A)
|
||||
// X(B)
|
||||
|
||||
typedef void (*swizzle_box_handler)(
|
||||
const uint8_t *src_buf,
|
||||
unsigned int width,
|
||||
unsigned int height,
|
||||
unsigned int depth,
|
||||
uint8_t *dst_buf,
|
||||
unsigned int row_pitch,
|
||||
unsigned int slice_pitch,
|
||||
unsigned int bytes_per_pixel);
|
||||
|
||||
typedef struct Method {
|
||||
const char *name;
|
||||
swizzle_box_handler swizzle, unswizzle;
|
||||
} Method;
|
||||
|
||||
#define PROTO(m) \
|
||||
void m( \
|
||||
const uint8_t *src_buf, \
|
||||
unsigned int width, \
|
||||
unsigned int height, \
|
||||
unsigned int depth, \
|
||||
uint8_t *dst_buf, \
|
||||
unsigned int row_pitch, \
|
||||
unsigned int slice_pitch, \
|
||||
unsigned int bytes_per_pixel);
|
||||
|
||||
#define X(m) \
|
||||
PROTO(swizzle_box_ ## m) \
|
||||
PROTO(unswizzle_box_ ## m)
|
||||
X_METHODS
|
||||
#undef X
|
||||
|
||||
const Method methods[] = {
|
||||
#define X(m) { #m, swizzle_box_ ## m, unswizzle_box_ ## m},
|
||||
X_METHODS
|
||||
#undef X
|
||||
};
|
||||
|
||||
/* Number of elements in a true array (not valid for pointers). */
#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))

/* Parameter space swept by crosscheck(). */
int widths[] = { 1, 2, 4, 8, 16, 32 };
int heights[] = { 1, 2, 4, 8, 16, 32 };
int depths[] = { 1, 2, 4, 8, 16, 32 };
int bpps[] = { 1, 2, 3, 4 };
|
||||
|
||||
static void crosscheck(void)
|
||||
{
|
||||
assert(ARRAY_SIZE(methods) > 0);
|
||||
fprintf(stderr, "%s...", __func__);
|
||||
for (int row_pitch_adjust = 0; row_pitch_adjust < 4; row_pitch_adjust++)
|
||||
for (int slice_pitch_adjust = 0; slice_pitch_adjust < 4; slice_pitch_adjust++)
|
||||
for (int depth_idx = 0; depth_idx < ARRAY_SIZE(depths); depth_idx++)
|
||||
for (int width_idx = 0; width_idx < ARRAY_SIZE(widths); width_idx++)
|
||||
for (int height_idx = 0; height_idx < ARRAY_SIZE(heights); height_idx++)
|
||||
for (int bpp_idx = 0; bpp_idx < ARRAY_SIZE(bpps); bpp_idx++) {
|
||||
|
||||
int width = widths[width_idx];
|
||||
int height = heights[height_idx];
|
||||
int depth = depths[depth_idx];
|
||||
int bpp = bpps[bpp_idx];
|
||||
|
||||
size_t row_pitch = width * bpp + row_pitch_adjust;
|
||||
size_t slice_pitch = row_pitch * height;
|
||||
size_t size_bytes = slice_pitch * depth + slice_pitch_adjust;
|
||||
|
||||
uint8_t *original_data = malloc(size_bytes);
|
||||
for (int i = 0; i < size_bytes; i++) {
|
||||
original_data[i] = rand();
|
||||
}
|
||||
|
||||
void *swizzled_data_A = malloc(size_bytes);
|
||||
memcpy(swizzled_data_A, original_data, size_bytes);
|
||||
methods[0].swizzle(original_data, width, height, depth, swizzled_data_A,
|
||||
row_pitch, slice_pitch, bpp);
|
||||
|
||||
void *unswizzled_data_A = malloc(size_bytes);
|
||||
memcpy(unswizzled_data_A, original_data, size_bytes);
|
||||
methods[0].unswizzle(swizzled_data_A, width, height, depth,
|
||||
unswizzled_data_A, row_pitch, slice_pitch, bpp);
|
||||
assert(!memcmp(original_data, unswizzled_data_A, size_bytes));
|
||||
|
||||
for (int method_idx = 1;
|
||||
method_idx < ARRAY_SIZE(methods);
|
||||
method_idx++) {
|
||||
void *swizzled_data_B = malloc(size_bytes);
|
||||
memcpy(swizzled_data_B, original_data, size_bytes);
|
||||
methods[method_idx].swizzle(original_data, width, height, depth,
|
||||
swizzled_data_B, row_pitch, slice_pitch,
|
||||
bpp);
|
||||
assert(!memcmp(swizzled_data_B, swizzled_data_A, size_bytes));
|
||||
|
||||
void *unswizzled_data_B = malloc(size_bytes);
|
||||
memcpy(unswizzled_data_B, original_data, size_bytes);
|
||||
methods[method_idx].unswizzle(swizzled_data_B, width, height, depth,
|
||||
unswizzled_data_B, row_pitch,
|
||||
slice_pitch, bpp);
|
||||
assert(!memcmp(original_data, unswizzled_data_B, size_bytes));
|
||||
|
||||
free(unswizzled_data_B);
|
||||
free(swizzled_data_B);
|
||||
}
|
||||
|
||||
free(unswizzled_data_A);
|
||||
free(swizzled_data_A);
|
||||
free(original_data);
|
||||
|
||||
// fprintf(stderr, "w:%d, h:%d, d:%d, bpp:%d pitch:%d,%d\n", width, height, depth, bpp, row_pitch_adjust, slice_pitch_adjust);
|
||||
}
|
||||
|
||||
fprintf(stderr, "ok!\n");
|
||||
}
|
||||
|
||||
#define NUM_ITERATIONS 10
|
||||
|
||||
/*
 * qsort() comparator that orders `int` values ascending.
 *
 * Uses comparisons instead of subtraction so that operands of opposite sign
 * and large magnitude cannot overflow and flip the result.
 *
 * Returns a negative value, zero, or a positive value when *a is less than,
 * equal to, or greater than *b.
 */
static int compare_ints(const void *a, const void *b)
{
    const int lhs = *(const int *)a;
    const int rhs = *(const int *)b;

    return (lhs > rhs) - (lhs < rhs);
}
|
||||
|
||||
static void bench(void)
|
||||
{
|
||||
fprintf(stderr, "%s...", __func__);
|
||||
|
||||
int width = 256;
|
||||
int height = 256;
|
||||
int depth = 256;
|
||||
int bpp = 4;
|
||||
|
||||
size_t row_pitch = width * bpp;
|
||||
size_t slice_pitch = row_pitch * height;
|
||||
size_t size_bytes = slice_pitch * depth;
|
||||
size_t size_mib = size_bytes / (1024*1024);
|
||||
fprintf(stderr, "with w: %d, h: %d, d: %d, bpp: %d, "
|
||||
"size: %zu MiB, iterations: %d\n",
|
||||
width, height, depth, bpp, size_mib, NUM_ITERATIONS);
|
||||
|
||||
void *original_data = malloc(size_bytes);
|
||||
memset(original_data, 0, size_bytes);
|
||||
|
||||
void *swizzled_data = malloc(size_bytes);
|
||||
memset(swizzled_data, 0, size_bytes);
|
||||
|
||||
|
||||
for (int method_idx = 0; method_idx < ARRAY_SIZE(methods); method_idx++) {
|
||||
const Method * const method = &methods[method_idx];
|
||||
fprintf(stderr, "[%6s] ", method->name);
|
||||
|
||||
int samples[NUM_ITERATIONS];
|
||||
int sum = 0;
|
||||
|
||||
for (int iter = 0; iter < NUM_ITERATIONS; iter++ ) {
|
||||
struct timespec start, end;
|
||||
|
||||
clock_gettime(CLOCK_MONOTONIC, &start);
|
||||
method->swizzle(original_data, width, height, depth, swizzled_data, row_pitch, slice_pitch, bpp);
|
||||
clock_gettime(CLOCK_MONOTONIC, &end);
|
||||
|
||||
uint64_t start_ns = (uint64_t)start.tv_sec * (uint64_t)1000000000 + start.tv_nsec;
|
||||
uint64_t end_ns = (uint64_t)end.tv_sec * (uint64_t)1000000000 + end.tv_nsec;
|
||||
|
||||
samples[iter] = (end_ns - start_ns) / 1000;
|
||||
sum += samples[iter];
|
||||
}
|
||||
|
||||
qsort(samples, ARRAY_SIZE(samples), sizeof(samples[0]), compare_ints);
|
||||
|
||||
int min = samples[0],
|
||||
max = samples[ARRAY_SIZE(samples) - 1],
|
||||
avg = sum / ARRAY_SIZE(samples),
|
||||
med = samples[ARRAY_SIZE(samples) / 2];
|
||||
fprintf(stderr, "min: %6d us, max: %6d us, avg: %6d us, med: %6d us -- %.2g GiB/s\n",
|
||||
min, max, avg, med, (size_mib / 1024.0) / (med / 1000000.0));
|
||||
}
|
||||
|
||||
free(swizzled_data);
|
||||
free(original_data);
|
||||
}
|
||||
|
||||
/*
 * Entry point. Seeds rand() with a fixed value so crosscheck() fills its
 * buffers from the same pseudo-random sequence on every run, then runs the
 * crosscheck followed by the benchmark.
 */
int main(int argc, char const *argv[])
{
    /* Command-line arguments are not used. */
    (void)argc;
    (void)argv;

    srand(1337);
    crosscheck();
    bench();
    return 0;
}
|
Loading…
Reference in New Issue