From 0a83b497447563c17bbd44eed1ac890cd1e36723 Mon Sep 17 00:00:00 2001
From: Ben Vanik <ben.vanik@gmail.com>
Date: Fri, 29 May 2015 17:05:43 -0700
Subject: [PATCH] Speeding up inner loop.

---
 src/xenia/apu/audio_decoder.cc | 75 ++++++++++++++++++----------------
 src/xenia/apu/audio_system.cc  |  2 -
 2 files changed, 40 insertions(+), 37 deletions(-)

diff --git a/src/xenia/apu/audio_decoder.cc b/src/xenia/apu/audio_decoder.cc
index af75372f0..33d241a94 100644
--- a/src/xenia/apu/audio_decoder.cc
+++ b/src/xenia/apu/audio_decoder.cc
@@ -13,7 +13,7 @@
 #include "xenia/base/logging.h"
 
 extern "C" {
-  #include "libavcodec/avcodec.h"
+#include "libavcodec/avcodec.h"
 }
 
 // Credits for most of this code goes to:
@@ -22,8 +22,11 @@ extern "C" {
 namespace xe {
 namespace apu {
 
-AudioDecoder::AudioDecoder() : codec_(nullptr), context_(nullptr),
-                              decoded_frame_(nullptr), packet_(nullptr) {}
+AudioDecoder::AudioDecoder()
+    : codec_(nullptr),
+      context_(nullptr),
+      decoded_frame_(nullptr),
+      packet_(nullptr) {}
 
 AudioDecoder::~AudioDecoder() {
   if (context_) {
@@ -87,20 +90,21 @@ int AudioDecoder::Initialize(int bits) {
 
   // Current frame stuff whatever
   // samples per frame * 2 max channels * output bytes
-  current_frame_ = new uint8_t[XMAContextData::kSamplesPerFrame * 2 * (bits/8)];
+  current_frame_ =
+      new uint8_t[XMAContextData::kSamplesPerFrame * 2 * (bits / 8)];
   current_frame_pos_ = 0;
   frame_samples_size_ = 0;
 
-  *(short *)(context_->extradata) = 0x10; // bits per sample
-  *(int *)(context_->extradata + 2) = 1; // channel mask
-  *(short *)(context_->extradata + 14) = 0x10D6; // decode flags
+  *(short *)(context_->extradata) = 0x10;         // bits per sample
+  *(int *)(context_->extradata + 2) = 1;          // channel mask
+  *(short *)(context_->extradata + 14) = 0x10D6;  // decode flags
 
   // FYI: We're purposely not opening the context here. That is done later.
 
   return 0;
 }
 
-int AudioDecoder::PreparePacket(uint8_t* input, size_t seq_offset, size_t size,
+int AudioDecoder::PreparePacket(uint8_t *input, size_t seq_offset, size_t size,
                                 int sample_rate, int channels) {
   if (size != XMAContextData::kBytesPerBlock) {
     // Invalid packet size!
@@ -116,7 +120,7 @@ int AudioDecoder::PreparePacket(uint8_t* input, size_t seq_offset, size_t size,
 
   // Modify the packet header so it's WMAPro compatible
   *((int *)packet_data_) = (((seq_offset & 0x7800) | 0x400) >> 7) |
-                           (*((int*)packet_data_) & 0xFFFEFF08);
+                           (*((int *)packet_data_) & 0xFFFEFF08);
 
   packet_->data = packet_data_;
   packet_->size = XMAContextData::kBytesPerBlock;
@@ -146,7 +150,8 @@ void AudioDecoder::DiscardPacket() {
   }
 }
 
-int AudioDecoder::DecodePacket(uint8_t* output, size_t output_offset, size_t output_size) {
+int AudioDecoder::DecodePacket(uint8_t *output, size_t output_offset,
+                               size_t output_size) {
   size_t to_copy = 0;
   size_t original_offset = output_offset;
   uint32_t sample_size = bits_ / 8;
@@ -154,7 +159,8 @@ int AudioDecoder::DecodePacket(uint8_t* output, size_t output_offset, size_t out
   // We're holding onto an already-decoded frame. Copy it out.
   if (current_frame_pos_ != frame_samples_size_) {
     to_copy = std::min(output_size, frame_samples_size_ - current_frame_pos_);
-    memcpy(output + output_offset, current_frame_ + current_frame_pos_, to_copy);
+    memcpy(output + output_offset, current_frame_ + current_frame_pos_,
+           to_copy);
 
     current_frame_pos_ += to_copy;
     output_size -= to_copy;
@@ -165,7 +171,8 @@ int AudioDecoder::DecodePacket(uint8_t* output, size_t output_offset, size_t out
     int got_frame = 0;
 
     // Decode the current frame
-    int len = avcodec_decode_audio4(context_, decoded_frame_, &got_frame, packet_);
+    int len =
+        avcodec_decode_audio4(context_, decoded_frame_, &got_frame, packet_);
     if (len < 0) {
       // Error in codec (bad sample rate or something)
       return len;
@@ -188,37 +195,36 @@ int AudioDecoder::DecodePacket(uint8_t* output, size_t output_offset, size_t out
       // Check the returned buffer size
       if (av_samples_get_buffer_size(NULL, context_->channels,
                                      decoded_frame_->nb_samples,
-                                     context_->sample_fmt, 1)
-          != context_->channels * decoded_frame_->nb_samples * sizeof(float)) {
+                                     context_->sample_fmt, 1) !=
+          context_->channels * decoded_frame_->nb_samples * sizeof(float)) {
         return -4;
       }
 
       // Output sample array
-      float* sample_array = (float *)decoded_frame_->data[0];
+      float *sample_array = (float *)decoded_frame_->data[0];
 
-      // Loop through every sample, convert and drop it into the output array
-      for (int i = 0; i < decoded_frame_->nb_samples; i++) {
-        // Raw sample should be within [-1, 1]
-        float fRawSample = sample_array[i];
+      // Loop through every sample, convert and drop it into the output array.
+      if (sample_size == 2) {
+        for (int i = 0; i < decoded_frame_->nb_samples; i++) {
+          // Raw sample should be within [-1, 1].
+          // Clamp it, just in case.
+          float raw_sample = xe::saturate(sample_array[i]);
 
-        // Clamp it, just in case.
-        fRawSample = std::min( 1.f, fRawSample);
-        fRawSample = std::max(-1.f, fRawSample);
-
-        float fScaledSample = fRawSample * (1 << (bits_ - 1));
-
-        // Convert the sample and output it in big endian
-        int sample = (int)fScaledSample;
-        for (int32_t j = sample_size-1; j >= 0; j--) {
-          current_frame_[i * sample_size + j] = sample & 0xFF;
-          sample >>= 8;
+          // Convert the sample and output it in big endian.
+          float scaled_sample = raw_sample * (1 << (bits_ - 1));
+          int sample = static_cast<int>(scaled_sample);
+          xe::store_and_swap<uint16_t>(&current_frame_[i * 2],
+                                       sample & 0xFFFF);
         }
+      } else {
+        // Only 2-byte samples are handled; 1- or 4-byte sizes are not.
+        assert_unhandled_case(sample_size);
       }
       current_frame_pos_ = 0;
 
       // Total size of the frame's samples
-      frame_samples_size_ = context_->channels * decoded_frame_->nb_samples
-                          * sample_size;
+      frame_samples_size_ =
+          context_->channels * decoded_frame_->nb_samples * sample_size;
 
       to_copy = std::min(output_size, (size_t)(frame_samples_size_));
       std::memcpy(output + output_offset, current_frame_, to_copy);
@@ -233,6 +239,5 @@ int AudioDecoder::DecodePacket(uint8_t* output, size_t output_offset, size_t out
   return (int)(output_offset - original_offset);
 }
 
-
-} // namespace xe
-} // namespace apu
\ No newline at end of file
+}  // namespace xe
+}  // namespace apu
diff --git a/src/xenia/apu/audio_system.cc b/src/xenia/apu/audio_system.cc
index 49357aced..8eb30db7f 100644
--- a/src/xenia/apu/audio_system.cc
+++ b/src/xenia/apu/audio_system.cc
@@ -341,8 +341,6 @@ void AudioSystem::ProcessXmaContext(XMAContext& context, XMAContextData& data) {
                      : nullptr;
   uint8_t* out = memory()->TranslatePhysical(data.output_buffer_ptr);
 
-  assert(!in1);
-
   // What I see:
   // XMA outputs 2 bytes per sample
   // 512 samples per frame (128 per subframe)