diff --git a/src/xenia/apu/xma_context.cc b/src/xenia/apu/xma_context.cc
index 17f06cabc..87ac2afa8 100644
--- a/src/xenia/apu/xma_context.cc
+++ b/src/xenia/apu/xma_context.cc
@@ -14,6 +14,7 @@
 
 #include "xenia/apu/xma_decoder.h"
 #include "xenia/apu/xma_helpers.h"
+#include "xenia/base/bit_stream.h"
 #include "xenia/base/logging.h"
 #include "xenia/base/ring_buffer.h"
 #include "xenia/profiling.h"
@@ -87,6 +88,8 @@ int XmaContext::Setup(uint32_t id, Memory* memory, uint32_t guest_ptr) {
   context_->extradata_size = sizeof(extra_data_);
   context_->extradata = reinterpret_cast<uint8_t*>(&extra_data_);
 
+  partial_frame_buffer_.resize(2048);
+
   // Current frame stuff whatever
   // samples per frame * 2 max channels * output bytes
   current_frame_ = new uint8_t[kSamplesPerFrame * kBytesPerSample * 2];
@@ -98,11 +101,11 @@ int XmaContext::Setup(uint32_t id, Memory* memory, uint32_t guest_ptr) {
 }
 
 void XmaContext::Work() {
+  std::lock_guard<xe::mutex> lock(lock_);
   if (!is_allocated() || !is_enabled()) {
     return;
   }
 
-  std::lock_guard<xe::mutex> lock(lock_);
   set_is_enabled(false);
 
   auto context_ptr = memory()->TranslateVirtual(guest_ptr());
@@ -117,10 +120,11 @@ void XmaContext::Enable() {
   auto context_ptr = memory()->TranslateVirtual(guest_ptr());
   XMA_CONTEXT_DATA data(context_ptr);
 
-  XELOGAPU("XmaContext: kicking context %d (%d/%d bits)", id(),
-           data.input_buffer_read_offset, (data.input_buffer_0_packet_count +
-                                           data.input_buffer_1_packet_count) *
-                                              kBytesPerPacket * 8);
+  XELOGAPU("XmaContext: kicking context %d (buffer %d %d/%d bits)", id(),
+           data.current_buffer, data.input_buffer_read_offset,
+           (data.current_buffer == 0 ? data.input_buffer_0_packet_count
+                                     : data.input_buffer_1_packet_count) *
+               kBytesPerPacket * 8);
 
   data.Store(context_ptr);
 
@@ -142,8 +146,6 @@ void XmaContext::Clear() {
   std::lock_guard<xe::mutex> lock(lock_);
   XELOGAPU("XmaContext: reset context %d", id());
 
-  DiscardPacket();
-
   auto context_ptr = memory()->TranslateVirtual(guest_ptr());
   XMA_CONTEXT_DATA data(context_ptr);
 
@@ -171,8 +173,6 @@ void XmaContext::Release() {
   set_is_allocated(false);
   auto context_ptr = memory()->TranslateVirtual(guest_ptr());
   std::memset(context_ptr, 0, sizeof(XMA_CONTEXT_DATA));  // Zero it.
-
-  DiscardPacket();
 }
 
 int XmaContext::GetSampleRate(int id) {
@@ -190,6 +190,83 @@ int XmaContext::GetSampleRate(int id) {
   return 0;
 }
 
+// Copies frame bits from |packet| into partial_frame_buffer_. When |append| is
+// false, saves everything from frame_offset_bits to the end of the packet;
+// otherwise appends the bits still missing from a frame_size_bits-long frame.
+// Returns the number of bits copied.
+size_t XmaContext::SavePartial(uint8_t* packet, uint32_t frame_offset_bits,
+                               size_t frame_size_bits, bool append) {
+  uint8_t* buff = partial_frame_buffer_.data();
+  BitStream stream(packet, kBytesPerPacket * 8);
+  stream.SetOffset(frame_offset_bits);
+
+  if (!append) {
+    // Reset the buffer. TODO: Probably not necessary.
+    std::memset(buff, 0, partial_frame_buffer_.size());
+    size_t copy_bits = (kBytesPerPacket * 8) - frame_offset_bits;
+    partial_frame_start_offset_bits_ = stream.Copy(buff, copy_bits);
+    partial_frame_offset_bits_ = copy_bits;
+    return copy_bits;
+  }
+
+  if (frame_size_bits <= partial_frame_offset_bits_) {
+    // Nothing left to append. Also keeps the unsigned subtraction below from
+    // wrapping to a huge copy length when the size header is corrupt.
+    return 0;
+  }
+  size_t copy_bits = frame_size_bits - partial_frame_offset_bits_;
+  size_t dest_byte =
+      (partial_frame_offset_bits_ + partial_frame_start_offset_bits_) / 8;
+  stream.Copy(buff + dest_byte, copy_bits);
+  partial_frame_offset_bits_ += copy_bits;
+  return copy_bits;
+}
+
+// Returns true if frame_offset_bits (relative to the start of |block|) is the
+// start of a frame, found by walking the frames of the packet it falls in.
+bool XmaContext::ValidFrameOffset(uint8_t* block, size_t size_bytes,
+                                  size_t frame_offset_bits) {
+  uint32_t packet_num =
+      GetFramePacketNumber(block, size_bytes, frame_offset_bits);
+  uint8_t* packet = block + (packet_num * kBytesPerPacket);
+  size_t relative_offset_bits = frame_offset_bits % (kBytesPerPacket * 8);
+
+  uint32_t first_frame_offset = xma::GetPacketFrameOffset(packet);
+  if (first_frame_offset == static_cast<uint32_t>(-1)) {
+    // Packet only contains a partial frame, so no frames can start here.
+    return false;
+  }
+
+  BitStream stream(packet, kBytesPerPacket * 8);
+  stream.SetOffset(first_frame_offset);
+  while (true) {
+    if (stream.offset_bits() == relative_offset_bits) {
+      return true;
+    }
+
+    if (stream.BitsRemaining() < 15) {
+      // Not enough room for another frame header.
+      return false;
+    }
+
+    // Each frame is walked as 15 header bits + (size - 16) bits + 1 trailing
+    // bit, so a size below 16 cannot be stepped over without wrapping.
+    uint64_t size = stream.Read(15);
+    if (size == 0x7FFF || size < 16) {
+      // Invalid frame (and last of this packet), or a corrupt size.
+      return false;
+    } else if ((size - 15) > stream.BitsRemaining()) {
+      // Last frame.
+      return false;
+    }
+    stream.Advance(size - 16);
+    if (stream.Read(1) == 0) {
+      // Trailing bit clear: no frames follow.
+      return false;
+    }
+  }
+}
+
 void XmaContext::DecodePackets(XMA_CONTEXT_DATA* data) {
   SCOPE_profile_cpu_f("apu");
 
@@ -203,22 +280,16 @@ void XmaContext::DecodePackets(XMA_CONTEXT_DATA* data) {
   // 32bit header (big endian)
 
   // Frames are the smallest thing the SPUs can decode.
-  // They usually can span packets (libav handles this)
+  // They can and usually will span packets.
 
   // Sample rates (data.sample_rate):
-  // 0 - 24 kHz ?
+  // 0 - 24 kHz
   // 1 - 32 kHz
-  // 2 - 44.1 kHz ?
-  // 3 - 48 kHz ?
+  // 2 - 44.1 kHz
+  // 3 - 48 kHz
 
   // SPUs also support stereo decoding. (data.is_stereo)
 
-  // Quick die if there's no data.
-  if (!data->input_buffer_0_valid && !data->input_buffer_1_valid) {
-    XELOGAPU("Context %d: No valid input buffers!", id());
-    return;
-  }
-
   // Check the output buffer - we cannot decode anything else if it's
   // unavailable.
   if (!data->output_buffer_valid) {
@@ -236,25 +307,28 @@ void XmaContext::DecodePackets(XMA_CONTEXT_DATA* data) {
   // Translate pointers for future use.
   // Sometimes the game will use rolling input buffers. If they do, we cannot
   // assume they form a complete block! In addition, the buffers DO NOT have
-  // to be sequential!
-  // (bit.trip runner 2 does this)
-  // TODO: Collect partial frames into a buffer if the game uses rolling buffers,
-  // and present the full frame to libav when we get it.
+  // to be contiguous!
   uint8_t* in0 = data->input_buffer_0_valid
                      ? memory()->TranslatePhysical(data->input_buffer_0_ptr)
                      : nullptr;
   uint8_t* in1 = data->input_buffer_1_valid
                      ? memory()->TranslatePhysical(data->input_buffer_1_ptr)
                      : nullptr;
-  uint8_t* current_input_buffer = in0;
+  uint8_t* current_input_buffer = data->current_buffer ? in1 : in0;
+
+  XELOGAPU("Processing context %d (offset %d, buffer %d, ptr %p)", id(),
+           data->input_buffer_read_offset, data->current_buffer,
+           static_cast<void*>(current_input_buffer));
 
   size_t input_buffer_0_size =
       data->input_buffer_0_packet_count * kBytesPerPacket;
   size_t input_buffer_1_size =
       data->input_buffer_1_packet_count * kBytesPerPacket;
+  size_t input_total_size = input_buffer_0_size + input_buffer_1_size;
+
   size_t current_input_size =
       data->current_buffer ? input_buffer_1_size : input_buffer_0_size;
-  size_t input_total_size = input_buffer_0_size + input_buffer_1_size;
+  size_t current_input_packet_count = current_input_size / kBytesPerPacket;
 
   // Output buffers are in raw PCM samples, 256 bytes per block.
   // Output buffer is a ring buffer. We need to write from the write offset
@@ -272,14 +346,10 @@ void XmaContext::DecodePackets(XMA_CONTEXT_DATA* data) {
   output_rb.set_write_offset(output_write_offset);
 
   size_t output_remaining_bytes = output_rb.write_count();
+  bool output_written = false;
 
   // Decode until we can't write any more data.
   while (output_remaining_bytes > 0) {
-    if (!data->input_buffer_0_valid && !data->input_buffer_1_valid) {
-      // Out of data.
-      break;
-    }
-
     int num_channels = data->is_stereo ? 2 : 1;
 
     // Check if we have part of a frame waiting (and the game hasn't jumped
@@ -289,7 +359,10 @@ void XmaContext::DecodePackets(XMA_CONTEXT_DATA* data) {
       size_t to_write = std::min(
           output_remaining_bytes,
           ((size_t)kBytesPerFrame * num_channels - current_frame_pos_));
-      output_rb.Write(current_frame_, to_write);
+      output_rb.Write(current_frame_ + current_frame_pos_, to_write);
+      output_written = true;
+      XELOGAPU("XmaContext %d: wrote out %d bytes of left-over samples", id(),
+               (int)to_write);
 
       current_frame_pos_ += to_write;
       if (current_frame_pos_ >= kBytesPerFrame * num_channels) {
@@ -301,48 +374,189 @@ void XmaContext::DecodePackets(XMA_CONTEXT_DATA* data) {
       continue;
     }
 
+    if (!data->input_buffer_0_valid && !data->input_buffer_1_valid) {
+      // Out of data.
+      break;
+    }
+
+    if (data->input_buffer_read_offset == 0) {
+      // Invalid offset. Go ahead and set it.
+      uint32_t offset = xma::GetPacketFrameOffset(current_input_buffer);
+      if (offset == -1) {
+        // No more frames.
+        if (data->current_buffer == 0) {
+          data->input_buffer_0_valid = 0;
+          data->input_buffer_read_offset = 0;
+          data->current_buffer++;
+        } else if (data->current_buffer == 1) {
+          data->input_buffer_1_valid = 0;
+          data->input_buffer_read_offset = 0;
+          data->current_buffer--;
+        }
+
+        // Die if we have no partial saved.
+        if (!partial_frame_saved_) {
+          return;
+        }
+      } else {
+        data->input_buffer_read_offset = offset;
+      }
+    }
+
+    if (!ValidFrameOffset(current_input_buffer, current_input_size,
+                          data->input_buffer_read_offset)) {
+      XELOGAPU("XmaContext %d: Invalid read offset %d!", id(),
+               data->input_buffer_read_offset);
+      if (data->current_buffer == 0) {
+        data->current_buffer = 1;
+        data->input_buffer_0_valid = 0;
+      } else if (data->current_buffer == 1) {
+        data->current_buffer = 0;
+        data->input_buffer_1_valid = 0;
+      }
+
+      data->input_buffer_read_offset = 0;
+      return;
+    }
+
+    // Check if we need to save a partial frame.
+    if (data->input_buffer_read_offset != 0 && !partial_frame_saved_ &&
+        GetFramePacketNumber(current_input_buffer, current_input_size,
+                             data->input_buffer_read_offset) ==
+            current_input_packet_count - 1) {
+      BitStream stream(current_input_buffer, current_input_size * 8);
+      stream.SetOffset(data->input_buffer_read_offset);
+
+      if (stream.BitsRemaining() >= 15) {
+        uint64_t frame_size = stream.Read(15);
+        if (data->input_buffer_read_offset + frame_size >=
+                current_input_size * 8 &&
+            frame_size != 0x7FFF) {
+          uint32_t rel_offset = data->input_buffer_read_offset % (2048 * 8);
+
+          // Frame is cut off! Save and exit.
+          partial_frame_saved_ = true;
+          partial_frame_size_known_ = true;
+          partial_frame_total_size_bits_ = frame_size;
+          SavePartial(
+              current_input_buffer + (current_input_packet_count - 1) * 2048,
+              rel_offset, frame_size, false);
+        }
+      } else {
+        // Header cut in half :/
+        uint32_t rel_offset = data->input_buffer_read_offset % (2048 * 8);
+
+        partial_frame_saved_ = true;
+        partial_frame_size_known_ = false;
+        SavePartial(
+            current_input_buffer + (current_input_packet_count - 1) * 2048,
+            rel_offset, 0, false);
+      }
+
+      if (partial_frame_saved_) {
+        XELOGAPU("XmaContext %d: saved a partial frame", id());
+
+        if (data->current_buffer == 0) {
+          data->input_buffer_0_valid = 0;
+          data->input_buffer_read_offset = 0;
+          data->current_buffer++;
+        } else if (data->current_buffer == 1) {
+          data->input_buffer_1_valid = 0;
+          data->input_buffer_read_offset = 0;
+          data->current_buffer--;
+        }
+
+        return;
+      }
+    }
+
+    if (partial_frame_saved_ && !partial_frame_size_known_) {
+      // Append the rest of the header.
+      size_t offset = SavePartial(current_input_buffer, 32, 15, true);
+
+      // Read the frame size.
+      BitStream stream(partial_frame_buffer_.data(),
+                       15 + partial_frame_start_offset_bits_);
+      stream.SetOffset(partial_frame_start_offset_bits_);
+
+      uint64_t size = stream.Read(15);
+      partial_frame_size_known_ = true;
+      partial_frame_total_size_bits_ = size;
+
+      // Now append the rest of the frame.
+      SavePartial(current_input_buffer, 32 + (uint32_t)offset, size, true);
+    } else if (partial_frame_saved_) {
+      // Append the rest of the frame.
+      SavePartial(current_input_buffer, 32, partial_frame_total_size_bits_,
+                  true);
+    }
+
+    // Prepare the decoder. Reinitialize if any parameters have changed.
+    PrepareDecoder(current_input_buffer, current_input_size, data->sample_rate,
+                   num_channels);
+
+    bool partial = false;
+    size_t bit_offset = data->input_buffer_read_offset;
+    if (partial_frame_saved_) {
+      XELOGAPU("XmaContext %d: processing saved partial frame", id());
+      packet_->data = partial_frame_buffer_.data();
+      packet_->size = (int)partial_frame_buffer_.size();
+
+      bit_offset = partial_frame_start_offset_bits_;
+      partial = true;
+      partial_frame_saved_ = false;
+    } else {
+      packet_->data = current_input_buffer;
+      packet_->size = (int)current_input_size;
+    }
+
     int invalid_frame = 0;  // invalid frame?
     int got_frame = 0;      // successfully decoded a frame?
     int frame_size = 0;
-    packet_->data = current_input_buffer;
-    packet_->size = (int)current_input_size;
-    PrepareDecoder(in0, current_input_size, data->sample_rate, num_channels);
-    int len = xma2_decode_frame(context_, packet_, decoded_frame_, &got_frame,
-                                &invalid_frame, &frame_size, 1,
-                                data->input_buffer_read_offset);
-    if (invalid_frame) {
-      // Invalid frame/packet: length header is 0x7FFF
-      // Sometimes there's frames in the middle of the stream flagged as
-      // invalid.
-      // Double-check to make sure we're not in the middle.
-      uint32_t frame_byte_offset = data->input_buffer_read_offset >> 3;
-      uint32_t packet_number = frame_byte_offset / 2048;
-      if (packet_number < data->input_buffer_0_packet_count - 1) {
-        // Okay. Skip to the beginning of the next packet.
-        packet_number++;
-        data->input_buffer_read_offset = (packet_number * 2048 * 8) + 32;
-        continue;
-      }
-
-      // Last frame of the block. Swap buffers if necessary.
-      if (data->current_buffer == 0) {
-        if (data->input_buffer_1_valid) {
-          data->current_buffer++;
-        } else {
-          // End of input.
-          data->input_buffer_read_offset = input_total_size * 8;
+    int len =
+        xma2_decode_frame(context_, packet_, decoded_frame_, &got_frame,
+                          &invalid_frame, &frame_size, !partial, bit_offset);
+    if (!partial && len == 0) {
+      // Got the last frame of a packet. Advance the read offset to the next
+      // packet.
+      uint32_t packet_number =
+          GetFramePacketNumber(current_input_buffer, current_input_size,
+                               data->input_buffer_read_offset);
+      if (packet_number == current_input_packet_count - 1) {
+        // Last packet.
+        if (data->current_buffer == 0) {
+          data->input_buffer_0_valid = 0;
+          data->input_buffer_read_offset = 0;
+          data->current_buffer = 1;
+        } else if (data->current_buffer == 1) {
+          data->input_buffer_1_valid = 0;
+          data->input_buffer_read_offset = 0;
+          data->current_buffer = 0;
         }
-
-        data->input_buffer_0_valid = 0;
-        return;
       } else {
-        // End of input.
-        data->current_buffer = 0;
-        data->input_buffer_1_valid = 0;
-        data->input_buffer_read_offset = input_total_size * 8;
-        return;
+        // Advance the read offset.
+        packet_number++;
+        uint8_t* packet = current_input_buffer + (packet_number * 2048);
+        uint32_t first_frame_offset = xma::GetPacketFrameOffset(packet);
+        if (first_frame_offset == -1) {
+          // Invalid packet (only contained a frame partial). Out of input.
+          if (data->current_buffer == 0) {
+            data->input_buffer_0_valid = 0;
+            data->current_buffer = 1;
+          } else if (data->current_buffer == 1) {
+            data->input_buffer_1_valid = 0;
+            data->current_buffer = 0;
+          }
+
+          data->input_buffer_read_offset = 0;
+        } else {
+          data->input_buffer_read_offset =
+              packet_number * 2048 * 8 + first_frame_offset;
+        }
       }
-    } else if (got_frame && len > 0) {
+    }
+
+    if (got_frame) {
       // Valid frame.
       // Check and see if we need to loop back to any spot.
       if (data->loop_count > 0 &&
@@ -352,53 +566,28 @@ void XmaContext::DecodePackets(XMA_CONTEXT_DATA* data) {
         if (data->loop_count < 255) {
           data->loop_count--;
         }
-      } else {
+      } else if (!partial && len > 0) {
         data->input_buffer_read_offset += len;
-        if (data->current_buffer == 0 &&
-            data->input_buffer_read_offset > input_buffer_0_size * 8) {
-          // Overflow? Setup next buffer.
-          data->current_buffer++;
-          data->input_buffer_0_valid = 0;
-        } else if (data->input_buffer_read_offset > input_total_size * 8) {
-          // Overflow! The game will fix up the read offset.
-          data->current_buffer = 0;
-          data->input_buffer_0_valid = 0;
-          data->input_buffer_1_valid = 0;
-        }
       }
-    }
-
-    if ((len < 0 || !got_frame) && frame_size != 0) {
-      // Oh no! Skip the frame and hope everything works.
-      data->input_buffer_read_offset += frame_size;
-      data->input_buffer_read_offset = (uint32_t)xma2_correct_frame_offset(
-          in0, input_buffer_0_size, data->input_buffer_read_offset);
-
-      continue;
-    } else if (len < 0 || !got_frame) {
-      // Did not get frame and could not get frame size.
-      data->input_buffer_0_valid = 0;
-      data->input_buffer_1_valid = 0;
+    } else if (len < 0) {
+      // Did not get frame
+      XELOGAPU("libav failed to decode a frame!");
+      if (frame_size && frame_size != 0x7FFF) {
+        data->input_buffer_read_offset += frame_size;
+      } else {
+        data->input_buffer_0_valid = 0;
+        data->input_buffer_1_valid = 0;
+      }
       return;
     }
 
-    // Sometimes we may run up to <15 bits before the next packet. If this
-    // happens, we need to automatically advance to the next frame.
-    // We'll ask the XMA2 decoder to do this for us, since it's more qualified.
-    data->input_buffer_read_offset = (uint32_t)xma2_correct_frame_offset(
-        in0, input_buffer_0_size, data->input_buffer_read_offset);
     last_input_read_pos_ = data->input_buffer_read_offset;
 
-    if (data->input_buffer_read_offset == 0) {
-      // Invalid offset. Out of data.
-      data->input_buffer_0_valid = 0;
-      data->input_buffer_1_valid = 0;
-    }
-
-    // Copy to the output buffer.
-    // Successfully decoded a frame.
-    size_t written_bytes = 0;
     if (got_frame) {
+      // Successfully decoded a frame.
+      // Copy to the output buffer.
+      size_t written_bytes = 0;
+
 #ifdef DEBUG
       // Validity checks.
       if (decoded_frame_->nb_samples > kSamplesPerFrame) {
@@ -419,7 +608,7 @@ void XmaContext::DecodePackets(XMA_CONTEXT_DATA* data) {
 #endif
 
       // Convert the frame.
-      ConvertFrame((const float**)decoded_frame_->data, context_->channels,
+      ConvertFrame((const uint8_t**)decoded_frame_->data, context_->channels,
                    decoded_frame_->nb_samples, current_frame_);
       current_frame_pos_ = 0;
 
@@ -435,15 +624,18 @@ void XmaContext::DecodePackets(XMA_CONTEXT_DATA* data) {
 
         written_bytes = kBytesPerFrame * num_channels;
       }
-    }
 
-    output_remaining_bytes -= written_bytes;
-    data->output_buffer_write_offset = output_rb.write_offset() / 256;
+      output_written = true;
+      output_remaining_bytes -= written_bytes;
+      data->output_buffer_write_offset = output_rb.write_offset() / 256;
+    }
   }
 
   // The game will kick us again with a new output buffer later.
   // It's important that we only invalidate this if we actually wrote to it!!
-  data->output_buffer_valid = 0;
+  if (output_written) {
+    data->output_buffer_valid = 0;
+  }
 }
 
 uint32_t XmaContext::GetFramePacketNumber(uint8_t* block, size_t size,
@@ -490,7 +682,7 @@ int XmaContext::PrepareDecoder(uint8_t* block, size_t size, int sample_rate,
   return 0;
 }
 
-bool XmaContext::ConvertFrame(const float** samples, int num_channels,
+bool XmaContext::ConvertFrame(const uint8_t** samples, int num_channels,
                               int num_samples, uint8_t* output_buffer) {
   // Loop through every sample, convert and drop it into the output array.
   // If more than one channel, we need to interleave the samples from each
@@ -500,7 +692,7 @@ bool XmaContext::ConvertFrame(const float** samples, int num_channels,
   for (int i = 0; i < num_samples; i++) {
     for (int j = 0; j < num_channels; j++) {
       // Select the appropriate array based on the current channel.
-      auto sample_array = samples[j];
+      auto sample_array = reinterpret_cast<const float*>(samples[j]);
 
       // Raw sample should be within [-1, 1].
       // Clamp it, just in case.
@@ -516,216 +708,5 @@ bool XmaContext::ConvertFrame(const float** samples, int num_channels,
   return true;
 }
 
-int XmaContext::StartPacket(XMA_CONTEXT_DATA* data) {
-  // Translate pointers for future use.
-  uint8_t* in0 = data->input_buffer_0_valid
-                     ? memory()->TranslatePhysical(data->input_buffer_0_ptr)
-                     : nullptr;
-  uint8_t* in1 = data->input_buffer_1_valid
-                     ? memory()->TranslatePhysical(data->input_buffer_1_ptr)
-                     : nullptr;
-
-  int sample_rate = GetSampleRate(data->sample_rate);
-  int channels = data->is_stereo ? 2 : 1;
-
-  // See if we've finished with the input.
-  // Block count is in packets, so expand by packet size.
-  uint32_t input_size_0_bytes = data->input_buffer_0_valid
-                                    ? (data->input_buffer_0_packet_count) * 2048
-                                    : 0;
-  uint32_t input_size_1_bytes = data->input_buffer_1_valid
-                                    ? (data->input_buffer_1_packet_count) * 2048
-                                    : 0;
-
-  // Total input size
-  uint32_t input_size_bytes = input_size_0_bytes + input_size_1_bytes;
-
-  // Calculate the first frame offset we need to decode.
-  uint32_t frame_offset_bits = (data->input_buffer_read_offset % (2048 * 8));
-
-  // Input read offset is in bits. Typically starts at 32 (4 bytes).
-  // "Sequence" offset - used internally for WMA Pro decoder.
-  // Just the read offset.
-  // NOTE: Read offset may not be at the first frame in a packet!
-  uint32_t packet_offset_bytes = (data->input_buffer_read_offset & ~0x7FF) / 8;
-  if (packet_offset_bytes % 2048 != 0) {
-    packet_offset_bytes -= packet_offset_bytes % 2048;
-  }
-  uint32_t input_remaining_bytes = input_size_bytes - packet_offset_bytes;
-
-  if (packet_offset_bytes >= input_size_bytes) {
-    // No more data available and no packet prepared.
-    return -1;
-  }
-
-  // Setup input offset and input buffer.
-  uint32_t input_offset_bytes = packet_offset_bytes;
-  auto input_buffer = in0;
-
-  if (packet_offset_bytes >= input_size_0_bytes && input_size_1_bytes) {
-    // Size overlap, select input buffer 1.
-    // TODO(DrChat): This needs testing.
-    input_offset_bytes -= input_size_0_bytes;
-    input_buffer = in1;
-  }
-
-  // Still have data to read.
-  auto packet = input_buffer + input_offset_bytes;
-  assert_true(input_offset_bytes % 2048 == 0);
-  PreparePacket(packet, packet_offset_bytes, kBytesPerPacket, sample_rate,
-                channels);
-
-  data->input_buffer_read_offset += kBytesPerPacket * 8;
-
-  input_remaining_bytes -= kBytesPerPacket;
-  if (input_remaining_bytes <= 0) {
-    // Used the last of the data but prepared a packet.
-    return 0;
-  }
-
-  return input_remaining_bytes;
-}
-
-int XmaContext::PreparePacket(uint8_t* input, size_t seq_offset, size_t size,
-                              int sample_rate, int channels) {
-  if (size != kBytesPerPacket) {
-    // Invalid packet size!
-    assert_always();
-    return 1;
-  }
-  if (packet_->size > 0 || current_frame_pos_ != frame_samples_size_) {
-    // Haven't finished parsing another packet.
-    return 1;
-  }
-
-  // Packet metadata is always 1 for XMA2
-  assert_true((input[2] & 0x7) == 1);
-
-  packet_->data = input;
-  packet_->size = (int)size;
-
-  // Re-initialize the context with new sample rate and channels.
-  if (context_->sample_rate != sample_rate || context_->channels != channels) {
-    // We have to reopen the codec so it'll realloc whatever data it needs.
-    // TODO(DrChat): Find a better way.
-    avcodec_close(context_);
-
-    context_->sample_rate = sample_rate;
-    context_->channels = channels;
-    extra_data_.channel_mask =
-        channels == 2 ? AV_CH_LAYOUT_STEREO : AV_CH_LAYOUT_MONO;
-
-    if (avcodec_open2(context_, codec_, NULL) < 0) {
-      XELOGE("XmaContext: Failed to reopen libav context");
-      return 1;
-    }
-  }
-
-  return 0;
-}
-
-void XmaContext::DiscardPacket() {
-  if (packet_->size > 0 || current_frame_pos_ != frame_samples_size_) {
-    packet_->data = 0;
-    packet_->size = 0;
-    current_frame_pos_ = frame_samples_size_;
-  }
-}
-
-int XmaContext::DecodePacket(uint8_t* output, size_t output_offset,
-                             size_t output_size, size_t* read_bytes) {
-  size_t to_copy = 0;
-  size_t original_offset = output_offset;
-  if (read_bytes) {
-    *read_bytes = 0;
-  }
-
-  // We're holding onto an already-decoded frame. Copy it out.
-  if (current_frame_pos_ != frame_samples_size_) {
-    to_copy = std::min(output_size, frame_samples_size_ - current_frame_pos_);
-    memcpy(output + output_offset, current_frame_ + current_frame_pos_,
-           to_copy);
-
-    current_frame_pos_ += to_copy;
-    output_size -= to_copy;
-    output_offset += to_copy;
-  }
-
-  while (output_size > 0 && packet_->size > 0) {
-    int got_frame = 0;
-
-    // Decode the current frame.
-    int len =
-        avcodec_decode_audio4(context_, decoded_frame_, &got_frame, packet_);
-    if (len < 0) {
-      // Error in codec (bad sample rate or something).
-      return len;
-    }
-
-    if (read_bytes) {
-      *read_bytes += len;
-    }
-
-    // Offset by decoded length.
-    packet_->size -= len;
-    packet_->data += len;
-    packet_->dts = packet_->pts = AV_NOPTS_VALUE;
-
-    // Successfully decoded a frame.
-    if (got_frame) {
-      // Validity checks.
-      if (decoded_frame_->nb_samples > kSamplesPerFrame) {
-        return -2;
-      } else if (context_->sample_fmt != AV_SAMPLE_FMT_FLTP) {
-        return -3;
-      }
-
-      // Check the returned buffer size.
-      if (av_samples_get_buffer_size(NULL, context_->channels,
-                                     decoded_frame_->nb_samples,
-                                     context_->sample_fmt, 1) !=
-          context_->channels * decoded_frame_->nb_samples * sizeof(float)) {
-        return -4;
-      }
-
-      // Loop through every sample, convert and drop it into the output array.
-      // If more than one channel, the game wants the samples from each channel
-      // interleaved next to each other.
-      uint32_t o = 0;
-      for (int i = 0; i < decoded_frame_->nb_samples; i++) {
-        for (int j = 0; j < context_->channels; j++) {
-          // Select the appropriate array based on the current channel.
-          auto sample_array = reinterpret_cast<float*>(decoded_frame_->data[j]);
-
-          // Raw sample should be within [-1, 1].
-          // Clamp it, just in case.
-          float raw_sample = xe::saturate(sample_array[i]);
-
-          // Convert the sample and output it in big endian.
-          float scaled_sample = raw_sample * ((1 << 15) - 1);
-          int sample = static_cast<int>(scaled_sample);
-          xe::store_and_swap<uint16_t>(&current_frame_[o++ * 2],
-                                       sample & 0xFFFF);
-        }
-      }
-      current_frame_pos_ = 0;
-
-      // Total size of the frame's samples.
-      // Magic number 2 is sizeof an output sample.
-      frame_samples_size_ = context_->channels * decoded_frame_->nb_samples * 2;
-
-      to_copy = std::min(output_size, (size_t)(frame_samples_size_));
-      std::memcpy(output + output_offset, current_frame_, to_copy);
-
-      current_frame_pos_ += to_copy;
-      output_size -= to_copy;
-      output_offset += to_copy;
-    }
-  }
-
-  // Return number of bytes written.
-  return static_cast<int>(output_offset - original_offset);
-}
-
 }  // namespace apu
 }  // namespace xe
diff --git a/src/xenia/apu/xma_context.h b/src/xenia/apu/xma_context.h
index a0f769f38..86eca8c1c 100644
--- a/src/xenia/apu/xma_context.h
+++ b/src/xenia/apu/xma_context.h
@@ -167,12 +167,14 @@ class XmaContext {
  private:
   static int GetSampleRate(int id);
 
+  size_t SavePartial(uint8_t* packet, uint32_t frame_offset_bits, size_t frame_size_bits, bool append);
+  bool ValidFrameOffset(uint8_t* block, size_t size_bytes, size_t frame_offset_bits);
   void DecodePackets(XMA_CONTEXT_DATA* data);
   uint32_t GetFramePacketNumber(uint8_t* block, size_t size, size_t bit_offset);
   int PrepareDecoder(uint8_t* block, size_t size, int sample_rate,
                      int channels);
 
-  bool ConvertFrame(const float** samples, int num_channels, int num_samples,
+  bool ConvertFrame(const uint8_t** samples, int num_channels, int num_samples,
                     uint8_t* output_buffer);
 
   int StartPacket(XMA_CONTEXT_DATA* data);
@@ -199,6 +201,13 @@ class XmaContext {
   AVPacket* packet_ = nullptr;
   WmaProExtraData extra_data_;
 
+  bool partial_frame_saved_ = false;       // A cut-off frame is buffered.
+  bool partial_frame_size_known_ = false;  // False if its header was cut.
+  size_t partial_frame_total_size_bits_ = 0;    // Frame size from its header.
+  size_t partial_frame_start_offset_bits_ = 0;  // Frame start bit in buffer.
+  size_t partial_frame_offset_bits_ = 0;  // Bits saved so far; internal.
+  std::vector<uint8_t> partial_frame_buffer_;  // Holds the buffered frame.
+
   // If we didn't finish writing a frame to the output buffer, this is the offset.
   size_t current_frame_pos_ = 0;
   uint32_t last_input_read_pos_ = 0; // Last seen read buffer pos
diff --git a/src/xenia/apu/xma_decoder.cc b/src/xenia/apu/xma_decoder.cc
index 766ce231b..aaa0edf90 100644
--- a/src/xenia/apu/xma_decoder.cc
+++ b/src/xenia/apu/xma_decoder.cc
@@ -82,7 +82,7 @@ void av_log_callback(void* avcl, int level, const char* fmt, va_list va) {
 
   StringBuffer buff;
   buff.AppendVarargs(fmt, va);
-  xe::LogLineVarargs(level_char, "libav: %s", buff.GetString());
+  xe::LogLineFormat(level_char, "libav: %s", buff.GetString());
 }
 
 X_STATUS XmaDecoder::Setup(kernel::KernelState* kernel_state) {
@@ -113,7 +113,7 @@ X_STATUS XmaDecoder::Setup(kernel::KernelState* kernel_state) {
   }
   registers_.next_context = 1;
 
-  //worker_running_ = true;
+  worker_running_ = true;
   worker_thread_ = kernel::object_ref<kernel::XHostThread>(
       new kernel::XHostThread(kernel_state, 128 * 1024, 0, [this]() {
         WorkerThreadMain();
@@ -131,6 +131,11 @@ void XmaDecoder::WorkerThreadMain() {
     for (uint32_t n = 0; n < kContextCount; n++) {
       XmaContext& context = contexts_[n];
       context.Work();
+
+      // TODO: Need thread safety to do this.
+      // Probably not too important though.
+      //registers_.current_context = n;
+      //registers_.next_context = (n + 1) % kContextCount;
     }
   }
 }
@@ -209,7 +214,6 @@ uint32_t XmaDecoder::ReadRegister(uint32_t addr) {
     // number
     registers_.current_context = registers_.next_context;
     registers_.next_context = (registers_.next_context + 1) % kContextCount;
-    value = registers_.current_context;
   }
 
   value = xe::byte_swap(value);
@@ -240,7 +244,6 @@ void XmaDecoder::WriteRegister(uint32_t addr, uint32_t value) {
         uint32_t context_id = base_context_id + i;
         XmaContext& context = contexts_[context_id];
         context.Enable();
-        context.Work();
       }
     }
 
diff --git a/src/xenia/apu/xma_helpers.h b/src/xenia/apu/xma_helpers.h
index 8efc9d1d1..bf0f593ac 100644
--- a/src/xenia/apu/xma_helpers.h
+++ b/src/xenia/apu/xma_helpers.h
@@ -25,7 +25,12 @@ uint32_t GetPacketFrameCount(uint8_t* packet) {
 
 // Get the first frame offset in bits
 uint32_t GetPacketFrameOffset(uint8_t* packet) {
-  return (uint16_t)((packet[0] << 13) | (packet[1] << 5) | (packet[2] >> 3)) + 32;
+  const uint32_t raw = static_cast<uint16_t>(
+      ((packet[0] & 0x3) << 13) | (packet[1] << 5) | (packet[2] >> 3));
+  if (raw != 0x7FFF) {
+    return raw + 32;
+  }
+  return static_cast<uint32_t>(-1);  // 0x7FFF sentinel maps to 0xFFFFFFFF
 }
 
 uint32_t GetPacketMetadata(uint8_t* packet) {
diff --git a/src/xenia/base/bit_stream.cc b/src/xenia/base/bit_stream.cc
new file mode 100644
index 000000000..2a073109e
--- /dev/null
+++ b/src/xenia/base/bit_stream.cc
@@ -0,0 +1,143 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project                                 *
+ ******************************************************************************
+ * Copyright 2015 Ben Vanik. All rights reserved.                             *
+ * Released under the BSD license - see LICENSE in the root for more details. *
+ ******************************************************************************
+ */
+
+#include "xenia/base/bit_stream.h"
+
+#include <algorithm>
+
+#include "xenia/base/assert.h"
+#include "xenia/base/byte_order.h"
+
+namespace xe {
+BitStream::BitStream(uint8_t* buffer, size_t size_in_bits)
+    : buffer_{buffer}, size_bits_{size_in_bits} {}
+
+BitStream::~BitStream() = default;
+
+void BitStream::SetOffset(size_t offset_bits) {
+  assert_false(offset_bits > size_bits_);  // Flag seeks past the end...
+  offset_bits_ = (offset_bits < size_bits_) ? offset_bits : size_bits_;
+}
+
+size_t BitStream::BitsRemaining() { return size_bits() - offset_bits(); }
+
+uint64_t BitStream::Peek(size_t num_bits) {
+  // Returns the next num_bits bits (MSB-first) without moving the offset.
+  // At most 57 bits per call: 57 = 7 * 8 + 1, so together with a bit offset
+  // of up to 7 inside the first byte the field never spans more than 8 bytes.
+  assert_false(num_bits > 57);
+  assert_false(offset_bits_ + num_bits > size_bits_);
+  if (num_bits == 0) {
+    return 0;  // Avoids a 64-bit shift by 64 below, which is undefined.
+  }
+
+  size_t offset_bytes = offset_bits_ >> 3;
+  size_t rel_offset_bits = offset_bits_ - (offset_bytes << 3);
+  size_t size_bytes = (size_bits_ + 7) >> 3;
+
+  // Gather up to 8 bytes, first byte most significant. Building the word
+  // bytewise avoids an unaligned load, never reads past the end of the
+  // buffer (missing bytes count as zero) and is host-endianness independent.
+  uint64_t bits = 0;
+  for (size_t i = 0; i < 8; ++i) {
+    size_t index = offset_bytes + i;
+    bits = (bits << 8) | (index < size_bytes ? buffer_[index] : 0);
+  }
+
+  // Drop the bits after the field, then mask off the ones before it. The
+  // mask uses a 64-bit one: `1 << num_bits` is an int shift, broken past 30.
+  bits >>= 64 - (rel_offset_bits + num_bits);
+  return bits & ((uint64_t{1} << num_bits) - 1);
+}
+
+uint64_t BitStream::Read(size_t num_bits) {
+  // Fetch the bits first, then consume them by moving the offset forward.
+  const uint64_t fetched = Peek(num_bits);
+  SetOffset(offset_bits_ + num_bits);
+  return fetched;
+}
+
+// Writes the low num_bits bits of val at the current offset, MSB-first (the
+// layout Peek reads back), then advances. TODO: This is totally not tested!
+bool BitStream::Write(uint64_t val, size_t num_bits) {
+  assert_false(num_bits > 57);
+  assert_false(offset_bits_ + num_bits > size_bits_);
+  if (num_bits == 0) {
+    return true;  // Nothing to store; also avoids shifting by 64 below.
+  }
+
+  size_t offset_bytes = offset_bits_ >> 3;
+  size_t rel_offset_bits = offset_bits_ - (offset_bytes << 3);
+  size_t size_bytes = (size_bits_ + 7) >> 3;
+  size_t shift = 64 - (rel_offset_bits + num_bits);
+
+  // 64-bit mask covering the field once it is shifted into position.
+  // (`1 << num_bits` would be an int shift and is wrong for num_bits > 30.)
+  uint64_t mask = ((uint64_t{1} << num_bits) - 1) << shift;
+
+  // Align the value with the mask and drop any stray high bits of val.
+  val = (val << shift) & mask;
+
+  // Read-modify-write one byte at a time, first byte most significant.
+  // This matches Peek's bit order on any host, keeps the neighbouring bits
+  // intact and never touches memory past the end of the buffer.
+  for (size_t i = 0; i < 8 && offset_bytes + i < size_bytes; ++i) {
+    size_t lane = 56 - 8 * i;
+    uint8_t keep = static_cast<uint8_t>(~(mask >> lane));
+    uint8_t fill = static_cast<uint8_t>(val >> lane);
+    uint8_t& byte = buffer_[offset_bytes + i];
+    byte = static_cast<uint8_t>((byte & keep) | fill);
+  }
+
+  // Advance the bitstream forward.
+  Advance(num_bits);
+  return true;
+}
+
+size_t BitStream::Copy(uint8_t* dest_buffer, size_t num_bits) {
+  // Copies num_bits bits to dest_buffer, keeping their bit position within
+  // the first byte. Partial bytes are OR-ed in, so stale bits there survive.
+  size_t offset_bytes = offset_bits_ >> 3;
+  size_t rel_offset_bits = offset_bits_ - (offset_bytes << 3);
+  size_t bits_left = num_bits;
+  size_t out_offset_bytes = 0;
+
+  // First: Copy the first few bits up to a byte boundary, capped at num_bits
+  // (an uncapped subtraction used to wrap bits_left around on short copies).
+  if (rel_offset_bits && bits_left) {
+    size_t head_bits = std::min<size_t>(8 - rel_offset_bits, bits_left);
+    uint64_t bits = Peek(head_bits) << (8 - rel_offset_bits - head_bits);
+    dest_buffer[out_offset_bytes++] |= static_cast<uint8_t>(bits);
+    bits_left -= head_bits;
+    Advance(head_bits);
+  }
+
+  // Second: Bulk-copy the whole bytes (std::copy_n is in <algorithm>).
+  if (size_t whole_bytes = bits_left / 8) {
+    std::copy_n(buffer_ + offset_bytes + out_offset_bytes, whole_bytes,
+                dest_buffer + out_offset_bytes);
+    out_offset_bytes += whole_bytes;
+    Advance(whole_bytes * 8);
+    bits_left -= whole_bytes * 8;
+  }
+
+  // Third: Copy the last few bits into the top of the final byte.
+  if (bits_left) {
+    uint64_t bits = Peek(bits_left) << (8 - bits_left);
+    dest_buffer[out_offset_bytes] |= static_cast<uint8_t>(bits);
+    Advance(bits_left);
+  }
+
+  // Return the bit offset to the copied bits.
+  return rel_offset_bits;
+}
+
+// Skips num_bits bits; the end-of-stream clamp lives in SetOffset.
+void BitStream::Advance(size_t num_bits) { SetOffset(num_bits + offset_bits_); }
+}  // namespace xe
\ No newline at end of file
diff --git a/src/xenia/base/bit_stream.h b/src/xenia/base/bit_stream.h
new file mode 100644
index 000000000..64cc2704b
--- /dev/null
+++ b/src/xenia/base/bit_stream.h
@@ -0,0 +1,44 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project                                 *
+ ******************************************************************************
+ * Copyright 2015 Ben Vanik. All rights reserved.                             *
+ * Released under the BSD license - see LICENSE in the root for more details. *
+ ******************************************************************************
+ */
+
+#ifndef XENIA_BASE_BIT_STREAM_H_
+#define XENIA_BASE_BIT_STREAM_H_
+
+#include <cstdint>
+
+namespace xe {
+class BitStream {  // Bit-granular cursor over a byte buffer.
+ public:
+  BitStream(uint8_t* buffer, size_t size_in_bits);
+  ~BitStream();
+
+  const uint8_t* buffer() const { return buffer_; }
+  uint8_t* buffer() { return buffer_; }
+  size_t offset_bits() const { return offset_bits_; }  // current position
+  size_t size_bits() const { return size_bits_; }      // total length
+
+  void Advance(size_t num_bits);       // relative move, via SetOffset
+  void SetOffset(size_t offset_bits);  // absolute move, clamped to the size
+  size_t BitsRemaining();              // size_bits - offset_bits
+
+  // Note: num_bits MUST be in the range 0-57 (inclusive)
+  uint64_t Peek(size_t num_bits);  // reads without moving the offset
+  uint64_t Read(size_t num_bits);  // Peek followed by Advance
+  bool Write(uint64_t val, size_t num_bits); // TODO: Not tested!
+
+  size_t Copy(uint8_t* dest_buffer, size_t num_bits);  // returns bit offset
+
+ private:
+  uint8_t* buffer_ = nullptr;  // the destructor leaves this memory alone
+  size_t offset_bits_ = 0;     // cursor, in bits from the start of buffer_
+  size_t size_bits_ = 0;       // stream length in bits
+};
+}  // namespace xe
+
+#endif  // XENIA_BASE_BIT_STREAM_H_
\ No newline at end of file
diff --git a/third_party/libav.lua b/third_party/libav.lua
new file mode 100644
index 000000000..3341a696c
--- /dev/null
+++ b/third_party/libav.lua
@@ -0,0 +1,3 @@
+group("third_party")
+ include("libav/libavcodec/premake5.lua")  -- libavcodec project script
+ include("libav/libavutil/premake5.lua")   -- libavutil project script