mirror of https://github.com/xemu-project/xemu.git
util/bufferiszero: Reorganize for early test for acceleration
Test for length >= 256 inline, where it is often a constant. Before calling into the accelerated routine, sample three bytes from the buffer, which handles most non-zero buffers. Signed-off-by: Alexander Monakov <amonakov@ispras.ru> Signed-off-by: Mikhail Romanov <mmromanov@ispras.ru> Message-Id: <20240206204809.9859-3-amonakov@ispras.ru> [rth: Use __builtin_constant_p; move the indirect call out of line.] Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
This commit is contained in:
parent
d018425c32
commit
cbe3d52646
|
@ -187,9 +187,39 @@ char *freq_to_str(uint64_t freq_hz);
|
|||
/* used to print char* safely */
|
||||
#define STR_OR_NULL(str) ((str) ? (str) : "null")
|
||||
|
||||
bool buffer_is_zero(const void *buf, size_t len);
|
||||
/*
|
||||
* Check if a buffer is all zeroes.
|
||||
*/
|
||||
|
||||
bool buffer_is_zero_ool(const void *vbuf, size_t len);
|
||||
bool buffer_is_zero_ge256(const void *vbuf, size_t len);
|
||||
bool test_buffer_is_zero_next_accel(void);
|
||||
|
||||
/*
 * Cheaply reject most non-zero buffers by probing three bytes:
 * the first, the last and the middle one.
 *
 * @buf: start of the buffer to probe
 * @len: length of the buffer in bytes
 *
 * Returns true when every probed byte is zero (the remaining bytes are
 * NOT examined) or when @len is 0; returns false as soon as one probed
 * byte is found to be non-zero.
 */
static inline bool buffer_is_zero_sample3(const char *buf, size_t len)
{
    /*
     * An empty buffer has nothing to sample.  Without this guard,
     * buf[len - 1] would be indexed with a wrapped-around size_t.
     * When the length is a compile-time constant, or the caller has
     * already excluded zero, the compiler is expected to fold this
     * test away.
     */
    if (len == 0) {
        return true;
    }

    /*
     * For any reasonably sized buffer, these three samples come from
     * three different cachelines.  In qemu-img usage, we find that
     * each byte eliminates more than half of all buffer testing.
     * It is therefore critical to performance that the byte tests
     * short-circuit, so that we do not pull in additional cache lines.
     * Do not "optimize" this to !(a | b | c).
     */
    return !buf[0] && !buf[len - 1] && !buf[len / 2];
}
|
||||
|
||||
#ifdef __OPTIMIZE__
|
||||
/*
 * Inline front end for the zero-buffer check.
 *
 * When @len is known to the compiler as a constant of at least 256,
 * the three-byte sample is tested inline and only a buffer that passes
 * it is handed to buffer_is_zero_ge256().  Every other call goes to the
 * out-of-line buffer_is_zero_ool().
 */
static inline bool buffer_is_zero(const void *buf, size_t len)
{
    if (__builtin_constant_p(len) && len >= 256) {
        /* Sample first: a non-zero probe avoids the call entirely. */
        if (!buffer_is_zero_sample3(buf, len)) {
            return false;
        }
        return buffer_is_zero_ge256(buf, len);
    }

    return buffer_is_zero_ool(buf, len);
}
|
||||
#else
|
||||
#define buffer_is_zero buffer_is_zero_ool
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Implementation of ULEB128 (http://en.wikipedia.org/wiki/LEB128)
|
||||
* Input is limited to 14-bit numbers
|
||||
|
|
|
@ -26,8 +26,9 @@
|
|||
#include "qemu/bswap.h"
|
||||
#include "host/cpuinfo.h"
|
||||
|
||||
static bool
|
||||
buffer_zero_int(const void *buf, size_t len)
|
||||
static bool (*buffer_is_zero_accel)(const void *, size_t);
|
||||
|
||||
static bool buffer_is_zero_integer(const void *buf, size_t len)
|
||||
{
|
||||
if (unlikely(len < 8)) {
|
||||
/* For a very small buffer, simply accumulate all the bytes. */
|
||||
|
@ -128,60 +129,38 @@ buffer_zero_avx2(const void *buf, size_t len)
|
|||
}
|
||||
#endif /* CONFIG_AVX2_OPT */
|
||||
|
||||
/*
|
||||
* Make sure that these variables are appropriately initialized when
|
||||
* SSE2 is enabled on the compiler command-line, but the compiler is
|
||||
* too old to support CONFIG_AVX2_OPT.
|
||||
*/
|
||||
#if defined(CONFIG_AVX2_OPT)
|
||||
# define INIT_USED 0
|
||||
# define INIT_LENGTH 0
|
||||
# define INIT_ACCEL buffer_zero_int
|
||||
#else
|
||||
# ifndef __SSE2__
|
||||
# error "ISA selection confusion"
|
||||
# endif
|
||||
# define INIT_USED CPUINFO_SSE2
|
||||
# define INIT_LENGTH 64
|
||||
# define INIT_ACCEL buffer_zero_sse2
|
||||
#endif
|
||||
|
||||
static unsigned used_accel = INIT_USED;
|
||||
static unsigned length_to_accel = INIT_LENGTH;
|
||||
static bool (*buffer_accel)(const void *, size_t) = INIT_ACCEL;
|
||||
|
||||
static unsigned __attribute__((noinline))
|
||||
select_accel_cpuinfo(unsigned info)
|
||||
{
|
||||
/* Array is sorted in order of algorithm preference. */
|
||||
static const struct {
|
||||
unsigned bit;
|
||||
unsigned len;
|
||||
bool (*fn)(const void *, size_t);
|
||||
} all[] = {
|
||||
#ifdef CONFIG_AVX2_OPT
|
||||
{ CPUINFO_AVX2, 128, buffer_zero_avx2 },
|
||||
{ CPUINFO_AVX2, buffer_zero_avx2 },
|
||||
#endif
|
||||
{ CPUINFO_SSE2, 64, buffer_zero_sse2 },
|
||||
{ CPUINFO_ALWAYS, 0, buffer_zero_int },
|
||||
{ CPUINFO_SSE2, buffer_zero_sse2 },
|
||||
{ CPUINFO_ALWAYS, buffer_is_zero_integer },
|
||||
};
|
||||
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(all); ++i) {
|
||||
if (info & all[i].bit) {
|
||||
length_to_accel = all[i].len;
|
||||
buffer_accel = all[i].fn;
|
||||
buffer_is_zero_accel = all[i].fn;
|
||||
return all[i].bit;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
#if defined(CONFIG_AVX2_OPT)
|
||||
static unsigned used_accel;
|
||||
|
||||
static void __attribute__((constructor)) init_accel(void)
|
||||
{
|
||||
used_accel = select_accel_cpuinfo(cpuinfo_init());
|
||||
}
|
||||
#endif /* CONFIG_AVX2_OPT */
|
||||
|
||||
#define INIT_ACCEL NULL
|
||||
|
||||
bool test_buffer_is_zero_next_accel(void)
|
||||
{
|
||||
|
@ -194,36 +173,37 @@ bool test_buffer_is_zero_next_accel(void)
|
|||
used_accel |= used;
|
||||
return used;
|
||||
}
|
||||
|
||||
static bool select_accel_fn(const void *buf, size_t len)
|
||||
{
|
||||
if (likely(len >= length_to_accel)) {
|
||||
return buffer_accel(buf, len);
|
||||
}
|
||||
return buffer_zero_int(buf, len);
|
||||
}
|
||||
|
||||
#else
|
||||
#define select_accel_fn buffer_zero_int
|
||||
/*
 * Variant used when no accelerated implementations are compiled in:
 * there is never another implementation to advance to, so this
 * always reports false.
 */
bool test_buffer_is_zero_next_accel(void)
{
    return false;
}
|
||||
|
||||
#define INIT_ACCEL buffer_is_zero_integer
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Checks if a buffer is all zeroes
|
||||
*/
|
||||
bool buffer_is_zero(const void *buf, size_t len)
|
||||
static bool (*buffer_is_zero_accel)(const void *, size_t) = INIT_ACCEL;
|
||||
|
||||
bool buffer_is_zero_ool(const void *buf, size_t len)
|
||||
{
|
||||
if (unlikely(len == 0)) {
|
||||
return true;
|
||||
}
|
||||
if (!buffer_is_zero_sample3(buf, len)) {
|
||||
return false;
|
||||
}
|
||||
/* All bytes are covered for any len <= 3. */
|
||||
if (unlikely(len <= 3)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Fetch the beginning of the buffer while we select the accelerator. */
|
||||
__builtin_prefetch(buf);
|
||||
|
||||
/* Use an optimized zero check if possible. Note that this also
|
||||
includes a check for an unrolled loop over 64-bit integers. */
|
||||
return select_accel_fn(buf, len);
|
||||
if (likely(len >= 256)) {
|
||||
return buffer_is_zero_accel(buf, len);
|
||||
}
|
||||
return buffer_is_zero_integer(buf, len);
|
||||
}
|
||||
|
||||
/*
 * Out-of-line entry point that forwards straight to the selected
 * buffer_is_zero_accel routine.  No length or sample checks are done
 * here; the name suggests callers guarantee len >= 256, as the inline
 * buffer_is_zero() wrapper does before calling in.
 */
bool buffer_is_zero_ge256(const void *buf, size_t len)
{
    bool all_zero = buffer_is_zero_accel(buf, len);

    return all_zero;
}
|
||||
|
|
Loading…
Reference in New Issue