rsx: Improve NV3089_IMAGE_IN_SIZE and use faster loop for swizzle conversions

This commit is contained in:
Jake 2015-12-01 07:55:15 -06:00
parent 83cb137721
commit 178bcfc8df
4 changed files with 214 additions and 131 deletions

View File

@ -61,17 +61,14 @@ writeTexelsSwizzled(const char *src, char *dst, size_t widthInBlock, size_t heig
Result.push_back(currentMipmapLevelInfo);
u32 *castedSrc, *castedDst;
u32 log2width, log2height;
castedSrc = (u32*)src + offsetInSrc;
castedDst = (u32*)dst + offsetInDst;
log2width = (u32)(logf((float)currentWidth) / logf(2.f));
log2height = (u32)(logf((float)currentHeight) / logf(2.f));
for (int row = 0; row < currentHeight; row++)
for (int j = 0; j < currentWidth; j++)
castedDst[(row * rowPitch / 4) + j] = castedSrc[rsx::linear_to_swizzle(j, row, 0, log2width, log2height, 0)];
std::unique_ptr<u32[]> tempSwizzled(new u32[currentHeight * currentWidth]);
rsx::convert_linear_swizzle<u32>(castedSrc, tempSwizzled.get(), currentWidth, currentHeight, true);
for (unsigned row = 0; row < currentHeight; row++)
memcpy((char*)dst + offsetInDst + row * rowPitch, (char*)tempSwizzled.get() + offsetInSrc + row * widthInBlock * blockSize, currentWidth * blockSize);
offsetInDst += currentHeight * rowPitch;
offsetInSrc += currentHeight * widthInBlock * blockSize;
@ -124,6 +121,7 @@ write16bTexelsSwizzled(const char *src, char *dst, size_t widthInBlock, size_t h
std::vector<MipmapLevelInfo> Result;
size_t offsetInDst = 0, offsetInSrc = 0;
size_t currentHeight = heightInBlock, currentWidth = widthInBlock;
size_t srcPitch = widthInBlock * blockSize;
for (unsigned mipLevel = 0; mipLevel < mipmapCount; mipLevel++)
{
size_t rowPitch = align(currentWidth * blockSize, 256);
@ -141,12 +139,14 @@ write16bTexelsSwizzled(const char *src, char *dst, size_t widthInBlock, size_t h
castedSrc = (u16*)src + offsetInSrc;
castedDst = (u16*)dst + offsetInDst;
log2width = (u32)(logf((float)currentWidth) / logf(2.f));
log2height = (u32)(logf((float)currentHeight) / logf(2.f));
for (int row = 0; row < currentHeight; row++)
std::unique_ptr<u16[]> tempSwizzled(new u16[currentHeight * currentWidth]);
rsx::convert_linear_swizzle<u16>(castedSrc, tempSwizzled.get(), currentWidth, currentHeight, true);
for (unsigned row = 0; row < heightInBlock; row++)
for (int j = 0; j < currentWidth; j++)
castedDst[(row * rowPitch / 2) + j] = castedSrc[rsx::linear_to_swizzle(j, row, 0, log2width, log2height, 0)];
{
u16 tmp = tempSwizzled[offsetInSrc / 2 + row * srcPitch / 2 + j];
castedDst[offsetInDst / 2 + row * rowPitch / 2 + j] = (tmp >> 8) | (tmp << 8);
}
offsetInDst += currentHeight * rowPitch;
offsetInSrc += currentHeight * widthInBlock * blockSize;

View File

@ -122,22 +122,19 @@ void GLTexture::init(rsx::texture& tex)
if (is_swizzled)
{
u32 *src, *dst;
u32 log2width, log2height;
u16 height = tex.height();
u16 width = tex.width();
unswizzledPixels = (u8*)malloc(tex.width() * tex.height() * 4);
unswizzledPixels = (u8*)malloc(width * height * 4);
src = (u32*)pixels;
dst = (u32*)unswizzledPixels;
log2width = (u32)log2(tex.width());
log2height = (u32)log2(tex.height());
for (int i = 0; i < tex.height(); i++)
if ((height & (height - 1)) || (width & (width - 1)))
{
for (int j = 0; j < tex.width(); j++)
{
dst[(i*tex.height()) + j] = src[rsx::linear_to_swizzle(j, i, 0, log2width, log2height, 0)];
}
LOG_ERROR(RSX, "Swizzle Texture: Width or height not power of 2! (h=%d,w=%d).", height, width);
}
rsx::convert_linear_swizzle<u32>(src, dst, width, height, true);
}
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, tex.width(), tex.height(), 0, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8, is_swizzled ? unswizzledPixels : pixels);

View File

@ -347,20 +347,38 @@ namespace rsx
{
never_inline void image_in(u32 arg)
{
const u16 width = method_registers[NV3089_IMAGE_IN_SIZE];
const u16 height = method_registers[NV3089_IMAGE_IN_SIZE] >> 16;
const u16 pitch = method_registers[NV3089_IMAGE_IN_FORMAT];
const u8 origin = method_registers[NV3089_IMAGE_IN_FORMAT] >> 16;
const u8 inter = method_registers[NV3089_IMAGE_IN_FORMAT] >> 24;
const u16 src_height = method_registers[NV3089_IMAGE_IN_SIZE] >> 16;
const u16 src_pitch = method_registers[NV3089_IMAGE_IN_FORMAT];
const u8 src_origin = method_registers[NV3089_IMAGE_IN_FORMAT] >> 16;
const u8 src_inter = method_registers[NV3089_IMAGE_IN_FORMAT] >> 24;
const u32 src_color_format = method_registers[NV3089_SET_COLOR_FORMAT];
const u32 operation = method_registers[NV3089_SET_OPERATION];
if (origin != 2 /* CELL_GCM_TRANSFER_ORIGIN_CORNER */)
{
LOG_ERROR(RSX, "NV3089_IMAGE_IN_SIZE: unknown origin (%d)", origin);
const u16 out_w = method_registers[NV3089_IMAGE_OUT_SIZE];
const u16 out_h = method_registers[NV3089_IMAGE_OUT_SIZE] >> 16;
// handle weird RSX quirk, doesn't report less than 16 pixels width in some cases
u16 src_width = method_registers[NV3089_IMAGE_IN_SIZE];
if (src_width == 16 && out_w < 16 && method_registers[NV3089_DS_DX] == (1 << 20)) {
src_width = out_w;
}
if (inter != 0 /* CELL_GCM_TRANSFER_INTERPOLATOR_ZOH */ && inter != 1 /* CELL_GCM_TRANSFER_INTERPOLATOR_FOH */)
const u16 u = method_registers[NV3089_IMAGE_IN]; // inX (currently ignored)
const u16 v = method_registers[NV3089_IMAGE_IN] >> 16; // inY (currently ignored)
if (src_origin != CELL_GCM_TRANSFER_ORIGIN_CORNER)
{
LOG_ERROR(RSX, "NV3089_IMAGE_IN_SIZE: unknown inter (%d)", inter);
LOG_ERROR(RSX, "NV3089_IMAGE_IN_SIZE: unknown origin (%d)", src_origin);
}
if (src_inter != CELL_GCM_TRANSFER_INTERPOLATOR_ZOH && src_inter != CELL_GCM_TRANSFER_INTERPOLATOR_FOH)
{
LOG_ERROR(RSX, "NV3089_IMAGE_IN_SIZE: unknown inter (%d)", src_inter);
}
if (operation != CELL_GCM_TRANSFER_OPERATION_SRCCOPY)
{
LOG_ERROR(RSX, "NV3089_IMAGE_IN_SIZE: unknown operation (%d)", operation);
}
const u32 src_offset = method_registers[NV3089_IMAGE_IN_OFFSET];
@ -368,17 +386,20 @@ namespace rsx
u32 dst_offset;
u32 dst_dma = 0;
u16 dst_color_format;
switch (method_registers[NV3089_SET_CONTEXT_SURFACE])
{
case CELL_GCM_CONTEXT_SURFACE2D:
dst_dma = method_registers[NV3062_SET_CONTEXT_DMA_IMAGE_DESTIN];
dst_offset = method_registers[NV3062_SET_OFFSET_DESTIN];
dst_color_format = method_registers[NV3062_SET_COLOR_FORMAT];
break;
case CELL_GCM_CONTEXT_SWIZZLE2D:
dst_dma = method_registers[NV309E_SET_CONTEXT_DMA_IMAGE];
dst_offset = method_registers[NV309E_SET_OFFSET];
dst_color_format = method_registers[NV309E_SET_FORMAT];
break;
default:
@ -394,95 +415,61 @@ namespace rsx
LOG_ERROR(RSX, "NV3089_IMAGE_IN_SIZE: src = 0x%x, dst = 0x%x", src_offset, dst_offset);
const u16 u = arg; // inX (currently ignored)
const u16 v = arg >> 16; // inY (currently ignored)
u8* pixels_src = vm::_ptr<u8>(get_address(src_offset, src_dma));
u8* pixels_dst = vm::_ptr<u8>(get_address(dst_offset, dst_dma));
if (method_registers[NV3062_SET_COLOR_FORMAT] != 4 /* CELL_GCM_TRANSFER_SURFACE_FORMAT_R5G6B5 */ &&
method_registers[NV3062_SET_COLOR_FORMAT] != 10 /* CELL_GCM_TRANSFER_SURFACE_FORMAT_A8R8G8B8 */)
if (dst_color_format != CELL_GCM_TRANSFER_SURFACE_FORMAT_R5G6B5 &&
dst_color_format != CELL_GCM_TRANSFER_SURFACE_FORMAT_A8R8G8B8)
{
LOG_ERROR(RSX, "NV3089_IMAGE_IN_SIZE: unknown m_color_format (%d)", method_registers[NV3062_SET_COLOR_FORMAT]);
LOG_ERROR(RSX, "NV3089_IMAGE_IN_SIZE: unknown dst_color_format (%d)", dst_color_format);
}
const u32 in_bpp = method_registers[NV3062_SET_COLOR_FORMAT] == 4 ? 2 : 4; // bytes per pixel
const u32 out_bpp = method_registers[NV3089_SET_COLOR_FORMAT] == 7 ? 2 : 4;
const s32 out_w = (s32)(u64(width) * (1 << 20) / method_registers[NV3089_DS_DX]);
const s32 out_h = (s32)(u64(height) * (1 << 20) / method_registers[NV3089_DT_DY]);
std::unique_ptr<u8[]> temp1, temp2;
if (method_registers[NV3089_SET_CONTEXT_SURFACE] == CELL_GCM_CONTEXT_SWIZZLE2D)
if (src_color_format != CELL_GCM_TRANSFER_SCALE_FORMAT_R5G6B5 &&
src_color_format != CELL_GCM_TRANSFER_SCALE_FORMAT_A8R8G8B8)
{
temp1.reset(new u8[in_bpp * width * height]);
u8* linear_pixels = pixels_src;
u8* swizzled_pixels = temp1.get();
int sw_width = 1 << (int)log2(width);
int sw_height = 1 << (int)log2(height);
for (int y = 0; y < sw_height; y++)
{
for (int x = 0; x < sw_width; x++)
{
switch (in_bpp)
{
case 1:
swizzled_pixels[linear_to_swizzle(x, y, 0, sw_width, sw_height, 0)] = linear_pixels[y * sw_height + x];
break;
case 2:
((u16*)swizzled_pixels)[linear_to_swizzle(x, y, 0, sw_width, sw_height, 0)] = ((u16*)linear_pixels)[y * sw_height + x];
break;
case 4:
((u32*)swizzled_pixels)[linear_to_swizzle(x, y, 0, sw_width, sw_height, 0)] = ((u32*)linear_pixels)[y * sw_height + x];
break;
}
}
}
pixels_src = swizzled_pixels;
LOG_ERROR(RSX, "NV3089_IMAGE_IN_SIZE: unknown src_color_format (%d)", src_color_format);
}
LOG_WARNING(RSX, "NV3089_IMAGE_IN_SIZE: SIZE=0x%08x, pitch=0x%x, offset=0x%x, scaleX=%f, scaleY=%f, CLIP_SIZE=0x%08x, OUT_SIZE=0x%08x",
method_registers[NV3089_IMAGE_IN_SIZE], pitch, src_offset, double(1 << 20) / (method_registers[NV3089_DS_DX]), double(1 << 20) / (method_registers[NV3089_DT_DY]),
method_registers[NV3089_IMAGE_IN_SIZE], src_pitch, src_offset, double(1 << 20) / (method_registers[NV3089_DS_DX]), double(1 << 20) / (method_registers[NV3089_DT_DY]),
method_registers[NV3089_CLIP_SIZE], method_registers[NV3089_IMAGE_OUT_SIZE]);
if (in_bpp != out_bpp && width != out_w && height != out_h)
const u32 in_bpp = src_color_format == CELL_GCM_TRANSFER_SCALE_FORMAT_R5G6B5 ? 2 : 4; // bytes per pixel
const u32 out_bpp = dst_color_format == CELL_GCM_TRANSFER_SURFACE_FORMAT_R5G6B5 ? 2 : 4;
std::unique_ptr<u8[]> temp1, temp2;
// resize/convert if necessary
if (in_bpp != out_bpp && src_width != out_w && src_height != out_h)
{
// resize/convert if necessary
temp1.reset(new u8[out_bpp * out_w * out_h]);
temp2.reset(new u8[out_bpp * out_w * out_h]);
AVPixelFormat in_format = src_color_format == CELL_GCM_TRANSFER_SCALE_FORMAT_R5G6B5 ? AV_PIX_FMT_RGB565BE : AV_PIX_FMT_ARGB;
AVPixelFormat out_format = dst_color_format == CELL_GCM_TRANSFER_SURFACE_FORMAT_R5G6B5 ? AV_PIX_FMT_RGB565BE : AV_PIX_FMT_ARGB;
AVPixelFormat in_format = method_registers[NV3062_SET_COLOR_FORMAT] == 4 ? AV_PIX_FMT_RGB565BE : AV_PIX_FMT_ARGB; // ???
AVPixelFormat out_format = method_registers[NV3089_SET_COLOR_FORMAT] == 7 ? AV_PIX_FMT_RGB565BE : AV_PIX_FMT_ARGB; // ???
std::unique_ptr<SwsContext, void(*)(SwsContext*)> sws(sws_getContext(src_width, src_height, in_format, out_w, out_h, out_format,
src_inter ? SWS_FAST_BILINEAR : SWS_POINT, NULL, NULL, NULL), sws_freeContext);
std::unique_ptr<SwsContext, void(*)(SwsContext*)> sws(sws_getContext(width, height, in_format, out_w, out_h, out_format,
inter ? SWS_FAST_BILINEAR : SWS_POINT, NULL, NULL, NULL), sws_freeContext);
int in_line = in_bpp * width;
u8* out_ptr = temp2.get();
int in_line = in_bpp * src_width;
u8* out_ptr = temp1.get();
int out_line = out_bpp * out_w;
sws_scale(sws.get(), &pixels_src, &in_line, 0, height, &out_ptr, &out_line);
sws_scale(sws.get(), &pixels_src, &in_line, 0, src_height, &out_ptr, &out_line);
pixels_src = out_ptr; // use resized image as a source
}
// clip if necessary
if (method_registers[NV3089_CLIP_SIZE] != method_registers[NV3089_IMAGE_OUT_SIZE] ||
method_registers[NV3089_IMAGE_OUT_SIZE] != (out_w | (out_h << 16)) ||
method_registers[NV3089_IMAGE_OUT_POINT] || method_registers[NV3089_CLIP_POINT])
method_registers[NV3089_CLIP_POINT] != method_registers[NV3089_IMAGE_OUT_POINT])
{
// clip if necessary
for (s32 y = method_registers[NV3089_CLIP_POINT] >> 16, dst_y = method_registers[NV3089_IMAGE_OUT_POINT] >> 16; y < out_h; y++, dst_y++)
temp2.reset(new u8[out_bpp * out_w * out_h]);
for (s32 y = (method_registers[NV3089_CLIP_POINT] >> 16), dst_y = (method_registers[NV3089_IMAGE_OUT_POINT] >> 16); y < out_h; y++, dst_y++)
{
if (dst_y >= 0 && dst_y < method_registers[NV3089_IMAGE_OUT_SIZE] >> 16)
{
// destination line
u8* dst_line = pixels_dst + dst_y * out_bpp * (method_registers[NV3089_IMAGE_OUT_SIZE] & 0xffff)
u8* dst_line = temp2.get() + dst_y * out_bpp * (method_registers[NV3089_IMAGE_OUT_SIZE] & 0xffff)
+ std::min<s32>(std::max<s32>(method_registers[NV3089_IMAGE_OUT_POINT] & 0xffff, 0), method_registers[NV3089_IMAGE_OUT_SIZE] & 0xffff);
size_t dst_max = std::min<s32>(
@ -513,8 +500,64 @@ namespace rsx
}
}
}
pixels_src = temp2.get();
}
else
// Swizzle texture last after scaling is done
if (method_registers[NV3089_SET_CONTEXT_SURFACE] == CELL_GCM_CONTEXT_SWIZZLE2D)
{
u8 sw_width_log2 = method_registers[NV309E_SET_FORMAT] >> 16;
u8 sw_height_log2 = method_registers[NV309E_SET_FORMAT] >> 24;
// 0 indicates height of 1 pixel
sw_height_log2 = sw_height_log2 == 0 ? 1 : sw_height_log2;
// swizzle based on destination size
u16 sw_width = 1 << sw_width_log2;
u16 sw_height = 1 << sw_height_log2;
std::unique_ptr<u8[]> sw_temp, sw_temp2;
sw_temp.reset(new u8[out_bpp * sw_width * sw_height]);
u8* linear_pixels = pixels_src;
u8* swizzled_pixels = sw_temp.get();
// Check and pad texture out if we are given non square texture for swizzle to be correct
if (sw_width != out_w || sw_height != out_h) {
sw_temp2.reset(new u8[out_bpp * sw_width * sw_height]());
switch (out_bpp) {
case 1:
pad_texture<u8>(linear_pixels, sw_temp2.get(), out_w, out_h, sw_width, sw_height);
break;
case 2:
pad_texture<u16>(linear_pixels, sw_temp2.get(), out_w, out_h, sw_width, sw_height);
break;
case 4:
pad_texture<u32>(linear_pixels, sw_temp2.get(), out_w, out_h, sw_width, sw_height);
break;
}
linear_pixels = sw_temp2.get();
}
switch (out_bpp)
{
case 1:
convert_linear_swizzle<u8>(linear_pixels, swizzled_pixels, sw_width, sw_height, false);
break;
case 2:
convert_linear_swizzle<u16>(linear_pixels, swizzled_pixels, sw_width, sw_height, false);
break;
case 4:
convert_linear_swizzle<u32>(linear_pixels, swizzled_pixels, sw_width, sw_height, false);
break;
}
std::memcpy(pixels_dst, swizzled_pixels, out_bpp * sw_width * sw_height);
}
else
{
std::memcpy(pixels_dst, pixels_src, out_w * out_h * out_bpp);
}
@ -752,39 +795,6 @@ namespace rsx
}
} __rsx_methods;
u32 linear_to_swizzle(u32 x, u32 y, u32 z, u32 log2_width, u32 log2_height, u32 log2_depth)
{
u32 offset = 0;
u32 shift_count = 0;
while (log2_width | log2_height | log2_depth)
{
if (log2_width)
{
offset |= (x & 0x01) << shift_count;
x >>= 1;
++shift_count;
--log2_width;
}
if (log2_height)
{
offset |= (y & 0x01) << shift_count;
y >>= 1;
++shift_count;
--log2_height;
}
if (log2_depth)
{
offset |= (z & 0x01) << shift_count;
z >>= 1;
++shift_count;
--log2_depth;
}
}
return offset;
}
u32 get_address(u32 offset, u32 location)
{
u32 res = 0;

View File

@ -121,10 +121,86 @@ namespace rsx
extern u32 method_registers[0x10000 >> 2];
u32 get_address(u32 offset, u32 location);
u32 linear_to_swizzle(u32 x, u32 y, u32 z, u32 log2_width, u32 log2_height, u32 log2_depth);
u32 get_vertex_type_size(u32 type);
u32 get_vertex_type_size(u32 type);
u32 get_address(u32 offset, u32 location);
template<typename T>
void pad_texture(void* inputPixels, void* outputPixels, u16 inputWidth, u16 inputHeight, u16 outputWidth, u16 outputHeight) {
T *src, *dst;
src = (T *)(inputPixels);
dst = (T *)(outputPixels);
for (u16 h = 0; h < inputHeight; ++h) {
const u32 paddedPos = h * outputWidth;
const u32 pos = h * inputWidth;
for (u16 w = 0; w < inputWidth; ++w) {
dst[paddedPos + w] = src[pos + w];
}
}
}
/* Note: What the ps3 calls swizzling in this case is actually z-ordering / morton ordering of pixels
* - Input can be swizzled or linear, bool flag handles conversion to and from
* - It will handle any width and height that are a power of 2, square or non square
* Restriction: It has mixed results if the height or width is not a power of 2
*/
template<typename T>
void convert_linear_swizzle(void* inputPixels, void* outputPixels, u16 width, u16 height, bool inputIsSwizzled)
{
u32 log2width, log2height;
log2width = log2(width);
log2height = log2(height);
// Max mask possible for square texture (should be 2^11, or 22 bits for x and y)
u32 x_mask = 0x555555;
u32 y_mask = 0xAAAAAA;
// We have to limit the masks to the lower of the two dimensions to allow for non-square textures
u32 limitMask = (log2width < log2height) ? log2width : log2height;
// double the limit mask to account for bits in both x and y
limitMask = 1 << (limitMask << 1);
//x_mask, bits above limit are 1's for x-carry
x_mask = (x_mask | ~(limitMask - 1));
//y_mask. bits above limit are 0'd, as we use a different method for y-carry over
y_mask = (y_mask & (limitMask - 1));
u32 offs_y = 0;
u32 offs_x = 0;
u32 offs_x0 = 0; //total y-carry offset for x
u32 y_incr = limitMask;
T *src, *dst;
if (inputIsSwizzled) {
for (int y = 0; y < height; ++y) {
src = (T *)((T*)inputPixels + y*width);
dst = (T *)((T*)outputPixels + offs_y);
offs_x = offs_x0;
for (int x = 0; x < width; ++x) {
dst[offs_x] = src[x];
offs_x = (offs_x - x_mask) & x_mask;
}
offs_y = (offs_y - y_mask) & y_mask;
if (offs_y == 0) offs_x0 += y_incr;
}
}
else {
for (int y = 0; y < height; ++y) {
src = (T *)((T*)inputPixels + offs_y);
dst = (T *)((T*)outputPixels + y*width);
offs_x = offs_x0;
for (int x = 0; x < width; ++x) {
dst[x] = src[offs_x];
offs_x = (offs_x - x_mask) & x_mask;
}
offs_y = (offs_y - y_mask) & y_mask;
if (offs_y == 0) offs_x0 += y_incr;
}
}
}
struct surface_info
{