// Patch summary (describes only the hunks visible below):
//
// retroarch.c, check_rewind():
//   Wraps the pretro_serialize() call in RARCH_PERFORMANCE_INIT/START/STOP
//   using a counter named rewind_serialize.
//
// rewind.c:
//   * Adds #include "performance.h".
//   * Adds static find_mismatch(a, b, samples): returns the index of the first
//     uint32_t element at which a and b differ, or `samples` when all compared
//     elements are equal. Under `#if __SSE2__` it compares four elements per
//     iteration (_mm_cmpeq_epi32 + _mm_movemask_epi8); a movemask other than
//     0xffff means at least one lane differs, and __builtin_ctz(~mask) >> 2
//     converts the first clear byte-bit into a lane index. Leftover elements
//     (samples not a multiple of 4) and the #else build use a plain scalar loop.
//   * generate_delta(): the loop now calls find_mismatch() to jump straight to
//     the next differing word and breaks when none remain. For each differing
//     word it stores new_state[i] into old_state[i] (old_state points at
//     state->tmp_state and is no longer const) and pushes
//     ((uint64_t)i << 32) | xor_ into state->buffer. The index `i` changes
//     from uint64_t to size_t, hence the explicit cast before the shift.
//   * state_manager_push(): times generate_delta() with a gen_delta counter and
//     drops the memcpy() into state->tmp_state, since generate_delta() now
//     updates the differing words of tmp_state in place.
//
// NOTE(review): the bare `#include` directives below appear to have lost their
// <header> names; the SSE2 branch presumably needs <emmintrin.h> for the
// _mm_* intrinsics -- confirm against the original patch.
// NOTE(review): part of generate_delta() lies between hunks and is not visible
// here; the description above covers only the lines shown.
diff --git a/retroarch.c b/retroarch.c index 9bb97653c5..6c89fba7f1 100644 --- a/retroarch.c +++ b/retroarch.c @@ -2214,7 +2214,10 @@ static void check_rewind(void) if (cnt == 0) #endif { + RARCH_PERFORMANCE_INIT(rewind_serialize); + RARCH_PERFORMANCE_START(rewind_serialize); pretro_serialize(g_extern.state_buf, g_extern.state_size); + RARCH_PERFORMANCE_STOP(rewind_serialize); state_manager_push(g_extern.state_manager, g_extern.state_buf); } } diff --git a/rewind.c b/rewind.c index 5a7229f422..f0a39cf7cb 100644 --- a/rewind.c +++ b/rewind.c @@ -14,6 +14,7 @@ */ #include "rewind.h" +#include "performance.h" #include #include #include "boolean.h" @@ -147,11 +148,45 @@ static void reassign_bottom(state_manager_t *state) state->bottom_ptr = (state->bottom_ptr + 1) & state->buf_size_mask; } +#if __SSE2__ +#include +// There's no equivalent in libc, you'd think so ... std::mismatch exists, but it's not optimized at all. :( +static unsigned find_mismatch(const uint32_t *a, const uint32_t *b, unsigned samples) +{ + unsigned i; + unsigned sse_samples = samples & ~3; + for (i = 0; i < sse_samples; i += 4) + { + __m128i v0 = _mm_loadu_si128((const __m128i*)(a + i)); + __m128i v1 = _mm_loadu_si128((const __m128i*)(b + i)); + __m128i c = _mm_cmpeq_epi32(v0, v1); + uint32_t mask = _mm_movemask_epi8(c); + if (mask != 0xffff) // Something has changed, figure out where. 
+ return i + (__builtin_ctz(~mask) >> 2); + } + + for (; i < samples; i++) + if (a[i] != b[i]) + return i; + + return samples; +} +#else +static unsigned find_mismatch(const uint32_t *a, const uint32_t *b, unsigned samples) +{ + unsigned i; + for (i = 0; i < samples; i++) + if (a[i] != b[i]) + return i; + return samples; +} +#endif + static void generate_delta(state_manager_t *state, const void *data) { - uint64_t i; + size_t i; bool crossed = false; - const uint32_t *old_state = state->tmp_state; + uint32_t *old_state = state->tmp_state; const uint32_t *new_state = (const uint32_t*)data; state->buffer[state->top_ptr++] = 0; // For each separate delta, we have a 0 value sentinel in between. @@ -163,20 +198,25 @@ static void generate_delta(state_manager_t *state, const void *data) for (i = 0; i < state->state_size; i++) { - uint64_t xor_ = old_state[i] ^ new_state[i]; + unsigned avail = state->state_size - i; + unsigned pos = find_mismatch(old_state + i, new_state + i, avail); + if (pos == avail) + break; + + i += pos; // If the data differs (xor != 0), we push that xor on the stack with index and xor. // This can be reversed by reapplying the xor. // This, if states don't really differ much, we'll save lots of space :) // Hopefully this will work really well with save states. 
- if (xor_) - { - state->buffer[state->top_ptr] = (i << 32) | xor_; - state->top_ptr = (state->top_ptr + 1) & state->buf_size_mask; + uint32_t xor_ = old_state[i] ^ new_state[i]; + old_state[i] = new_state[i]; - if (state->top_ptr == state->bottom_ptr) - crossed = true; - } + state->buffer[state->top_ptr] = ((uint64_t)i << 32) | xor_; + state->top_ptr = (state->top_ptr + 1) & state->buf_size_mask; + + if (state->top_ptr == state->bottom_ptr) + crossed = true; } if (crossed) @@ -185,8 +225,11 @@ static void generate_delta(state_manager_t *state, const void *data) bool state_manager_push(state_manager_t *state, const void *data) { + RARCH_PERFORMANCE_INIT(gen_delta); + RARCH_PERFORMANCE_START(gen_delta); generate_delta(state, data); - memcpy(state->tmp_state, data, state->state_size * sizeof(uint32_t)); + RARCH_PERFORMANCE_STOP(gen_delta); + state->first_pop = true; return true;