WIP Audio decoder

Remove some old comments
2015-05-22 21:39:26 -05:00 · 2015-05-22 21:39:26 -05:00 · f31b34256b
parent bd8db4810a
commit f31b34256b
7 changed files with 587 additions and 73 deletions
--- a/.gitmodules
+++ b/.gitmodules
@ -22,3 +22,6 @@
 [submodule "third_party/binutils-ppc-cygwin"]
 	path = third_party/binutils-ppc-cygwin
 	url = https://github.com/benvanik/binutils-ppc-cygwin
+[submodule "third_party/libav-bin"]
+	path = third_party/libav-bin
+	url = https://github.com/DrChat/xenia-libav-bin.git
--- a/libxenia.vcxproj
+++ b/libxenia.vcxproj
@ -16,6 +16,7 @@
  </ItemGroup>
  <ItemGroup>
    <ClCompile Include="src\xenia\apu\apu.cc" />
+    <ClCompile Include="src\xenia\apu\audio_decoder.cpp" />
    <ClCompile Include="src\xenia\apu\audio_driver.cc" />
    <ClCompile Include="src\xenia\apu\audio_system.cc" />
    <ClCompile Include="src\xenia\apu\nop\nop_apu.cc" />
@ -203,6 +204,7 @@
  <ItemGroup>
    <ClInclude Include="src\xenia\apu\apu-private.h" />
    <ClInclude Include="src\xenia\apu\apu.h" />
+    <ClInclude Include="src\xenia\apu\audio_decoder.h" />
    <ClInclude Include="src\xenia\apu\audio_driver.h" />
    <ClInclude Include="src\xenia\apu\audio_system.h" />
    <ClInclude Include="src\xenia\apu\nop\nop_apu-private.h" />
@ -483,14 +485,15 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>BEA_ENGINE_STATIC=1;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(SolutionDir)\third_party\beaengine\include\;$(SolutionDir)\third_party\llvm\include\;$(SolutionDir)\third_party\gflags\src\;$(SolutionDir)\src\;$(SolutionDir)\third_party;$(SolutionDir)\</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>$(SolutionDir)\third_party\libav-bin\include\;$(SolutionDir)\third_party\beaengine\include\;$(SolutionDir)\third_party\llvm\include\;$(SolutionDir)\third_party\gflags\src\;$(SolutionDir)\src\;$(SolutionDir)\third_party;$(SolutionDir)\</AdditionalIncludeDirectories>
    </ClCompile>
    <Link>
      <SubSystem>Windows</SubSystem>
      <GenerateDebugInformation>true</GenerateDebugInformation>
    </Link>
    <Lib>
-      <AdditionalDependencies>libgflags.lib;libglew.lib</AdditionalDependencies>
+      <AdditionalDependencies>libavcodec.a;libavutil.a;libgflags.lib;libglew.lib</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(SolutionDir)third_party\libav-bin\lib\$(Configuration);$(SolutionDir)build\bin\$(Configuration)\</AdditionalLibraryDirectories>
    </Lib>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Checked|x64'">
@ -500,14 +503,15 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>BEA_ENGINE_STATIC=1;WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(SolutionDir)\third_party\beaengine\include\;$(SolutionDir)\third_party\llvm\include\;$(SolutionDir)\third_party\gflags\src\;$(SolutionDir)\src\;$(SolutionDir)\third_party;$(SolutionDir)\</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>$(SolutionDir)\third_party\libav-bin\include\;$(SolutionDir)\third_party\beaengine\include\;$(SolutionDir)\third_party\llvm\include\;$(SolutionDir)\third_party\gflags\src\;$(SolutionDir)\src\;$(SolutionDir)\third_party;$(SolutionDir)\</AdditionalIncludeDirectories>
    </ClCompile>
    <Link>
      <SubSystem>Windows</SubSystem>
      <GenerateDebugInformation>true</GenerateDebugInformation>
    </Link>
    <Lib>
-      <AdditionalDependencies>libgflags.lib;libglew.lib</AdditionalDependencies>
+      <AdditionalDependencies>libavcodec.a;libavutil.a;libgflags.lib;libglew.lib</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(SolutionDir)third_party\libav-bin\lib\$(Configuration);$(SolutionDir)build\bin\$(Configuration)\</AdditionalLibraryDirectories>
    </Lib>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
@ -519,7 +523,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>BEA_ENGINE_STATIC=1;WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(SolutionDir)\third_party\beaengine\include\;$(SolutionDir)\third_party\llvm\include\;$(SolutionDir)\third_party\gflags\src\;$(SolutionDir)\src\;$(SolutionDir)\third_party;$(SolutionDir)\</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>$(SolutionDir)\third_party\libav-bin\include\;$(SolutionDir)\third_party\beaengine\include\;$(SolutionDir)\third_party\llvm\include\;$(SolutionDir)\third_party\gflags\src\;$(SolutionDir)\src\;$(SolutionDir)\third_party;$(SolutionDir)\</AdditionalIncludeDirectories>
    </ClCompile>
    <Link>
      <SubSystem>Windows</SubSystem>
@ -528,7 +532,8 @@
      <OptimizeReferences>true</OptimizeReferences>
    </Link>
    <Lib>
-      <AdditionalDependencies>libgflags.lib;libglew.lib</AdditionalDependencies>
+      <AdditionalDependencies>libavcodec.a;libavutil.a;libgflags.lib;libglew.lib</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(SolutionDir)third_party\libav-bin\lib\$(Configuration);$(SolutionDir)build\bin\$(Configuration)\</AdditionalLibraryDirectories>
    </Lib>
  </ItemDefinitionGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
--- a/libxenia.vcxproj.filters
+++ b/libxenia.vcxproj.filters
@ -700,6 +700,9 @@
    <ClCompile Include="src\xenia\kernel\xboxkrnl_error.cc">
      <Filter>src\xenia\kernel</Filter>
    </ClCompile>
+    <ClCompile Include="src\xenia\apu\audio_decoder.cpp">
+      <Filter>src\xenia\apu</Filter>
+    </ClCompile>
    <ClCompile Include="src\xenia\cpu\backend\x64\x64_code_cache.cc">
      <Filter>src\xenia\cpu\backend\x64</Filter>
    </ClCompile>
@ -1341,6 +1344,9 @@
    <ClInclude Include="src\xenia\kernel\xboxkrnl_error.h">
      <Filter>src\xenia\kernel</Filter>
    </ClInclude>
+    <ClInclude Include="src\xenia\apu\audio_decoder.h">
+      <Filter>src\xenia\apu</Filter>
+    </ClInclude>
  </ItemGroup>
  <ItemGroup>
    <None Include="src\xenia\cpu\backend\x64\x64_sequence.inl">
--- a/src/xenia/apu/audio_decoder.cpp
+++ b/src/xenia/apu/audio_decoder.cpp
@ -0,0 +1,228 @@
+/**
+******************************************************************************
+* Xenia : Xbox 360 Emulator Research Project                                 *
+******************************************************************************
+* Copyright 2015 Ben Vanik. All rights reserved.                             *
+* Released under the BSD license - see LICENSE in the root for more details. *
+******************************************************************************
+*/
+
+#include "xenia/apu/audio_decoder.h"
+
+#include "xenia/apu/audio_system.h"
+#include "xenia/base/logging.h"
+
+extern "C" {
+  #include "libavcodec/avcodec.h"
+  #include "libavutil/mem.h"
+}
+
+// Credit for most of this code goes to:
+// https://github.com/koolkdev/libertyv/blob/master/libav_wrapper/xma2dec.c
+
+namespace xe {
+namespace apu {
+
+// Builds an idle decoder. Every owned pointer starts out null so Cleanup()
+// is safe to run even when Initialize() was never called or bailed out early.
+AudioDecoder::AudioDecoder()
+    : codec_(nullptr),
+      context_(nullptr),
+      decoded_frame_(nullptr),
+      packet_(nullptr),
+      bits_per_frame_(0),
+      bits_(0),
+      current_frame_pos_(0),
+      current_frame_(nullptr),
+      frame_samples_size_(0),
+      offset_(0) {
+  std::memset(packet_data_, 0, sizeof(packet_data_));
+}
+
+// Releases everything Initialize() allocated.
+AudioDecoder::~AudioDecoder() {
+  Cleanup();
+}
+
+// Allocates the libav WMA Pro decoder state and the scratch frame buffer.
+//
+// bits: output bits per sample; must be a multiple of 8 in the range [8, 32].
+//
+// Returns 0 on success and 1 on failure. On failure nothing stays allocated.
+// The codec itself is not opened here; PreparePacket() opens it once the
+// sample rate and channel count of the stream are known.
+int AudioDecoder::Initialize(int bits) {
+  // NOTE(review): this flag is not synchronized; it assumes Initialize() is
+  // never entered from two threads at once.
+  static bool avcodec_initialized = false;
+  if (!avcodec_initialized) {
+    avcodec_register_all();
+    avcodec_initialized = true;
+  }
+
+  if (bits <= 0 || bits > 32 || (bits % 8) != 0) {
+    assert_always();
+    return 1;
+  }
+
+  // Drop anything left over from an earlier Initialize() call.
+  Cleanup();
+
+  // Output bits per sample
+  bits_ = bits;
+
+  codec_ = avcodec_find_decoder(AV_CODEC_ID_WMAPRO);
+  if (!codec_) {
+    return 1;
+  }
+
+  context_ = avcodec_alloc_context3(codec_);
+  if (!context_) {
+    Cleanup();
+    return 1;
+  }
+
+  decoded_frame_ = av_frame_alloc();
+  if (!decoded_frame_) {
+    Cleanup();
+    return 1;
+  }
+
+  // Value-initialized so data/size start at null/0 ("no packet pending").
+  packet_ = new AVPacket();
+  av_init_packet(packet_);
+
+  // Unknown until the first packet arrives: PreparePacket() fills in the real
+  // values and (re)opens the codec whenever they change.
+  context_->channels = 0;
+  context_->sample_rate = 0;
+  context_->block_align = XMAContextData::kBytesPerBlock;
+
+  // Extra data passed to the decoder. libav requires this buffer to come from
+  // the av_malloc() family and to carry FF_INPUT_BUFFER_PADDING_SIZE spare
+  // bytes; av_mallocz() also zeroes the fields we do not set explicitly.
+  const int kExtraDataSize = 18;
+  context_->extradata = static_cast<uint8_t*>(
+      av_mallocz(kExtraDataSize + FF_INPUT_BUFFER_PADDING_SIZE));
+  if (!context_->extradata) {
+    Cleanup();
+    return 1;
+  }
+  context_->extradata_size = kExtraDataSize;
+
+  // Fields are written byte by byte in little-endian order, which avoids the
+  // misaligned 32-bit store at offset 2.
+  context_->extradata[0] = 0x10;   // bits per sample (0x0010)
+  context_->extradata[2] = 0x01;   // channel mask (0x00000001)
+  context_->extradata[14] = 0xD6;  // decode flags (0x10D6), low byte
+  context_->extradata[15] = 0x10;  // decode flags (0x10D6), high byte
+
+  // Holds one converted frame:
+  // samples per frame * 2 max channels * output bytes per sample
+  current_frame_ =
+      new uint8_t[XMAContextData::kSamplesPerFrame * 2 * (bits / 8)]();
+  current_frame_pos_ = 0;
+  frame_samples_size_ = 0;
+
+  // FYI: We're purposely not opening the context here. That is done later.
+
+  return 0;
+}
+
+// Frees all decoder state. Safe to call repeatedly and on a decoder that was
+// never (or only partially) initialized.
+void AudioDecoder::Cleanup() {
+  if (context_) {
+    // avcodec_close() leaves a decoder's extradata alone, so free it here.
+    avcodec_close(context_);
+    av_freep(&context_->extradata);
+    context_->extradata_size = 0;
+    av_free(context_);
+    context_ = nullptr;
+  }
+
+  if (decoded_frame_) {
+    av_frame_free(&decoded_frame_);  // also nulls the pointer
+  }
+
+  // packet_->data only ever points into packet_data_, so there is no payload
+  // to release.
+  delete packet_;
+  packet_ = nullptr;
+
+  delete[] current_frame_;
+  current_frame_ = nullptr;
+  current_frame_pos_ = 0;
+  frame_samples_size_ = 0;
+
+  codec_ = nullptr;
+}
+
+// Stages one XMA packet for decoding.
+//
+// input:       pointer to exactly XMAContextData::kBytesPerBlock bytes.
+// size:        must equal XMAContextData::kBytesPerBlock.
+// sample_rate: stream sample rate in Hz.
+// channels:    stream channel count.
+//
+// Returns 0 on success. Returns 1 if the arguments are invalid, if a previous
+// packet or decoded frame has not been fully consumed yet, or if the codec
+// could not be (re)opened for the requested format.
+int AudioDecoder::PreparePacket(uint8_t* input, size_t size,
+                                int sample_rate, int channels) {
+  if (size != XMAContextData::kBytesPerBlock) {
+    // Invalid packet size!
+    assert_always();
+    return 1;
+  }
+  if (!input) {
+    return 1;
+  }
+  if (packet_->size > 0 || current_frame_pos_ != frame_samples_size_) {
+    // Haven't finished parsing another packet
+    return 1;
+  }
+
+  // Re-initialize the context with new sample rate and channels
+  if (context_->sample_rate != sample_rate || context_->channels != channels) {
+    context_->sample_rate = sample_rate;
+    context_->channels = channels;
+
+    // We have to reopen the codec so it'll realloc whatever data it needs.
+    // TODO: Find a better way.
+    avcodec_close(context_);
+    if (avcodec_open2(context_, codec_, nullptr) < 0) {
+      // Forget the cached format so the next packet retries the open instead
+      // of decoding against a closed codec.
+      context_->sample_rate = 0;
+      context_->channels = 0;
+      return 1;
+    }
+  }
+
+  std::memcpy(packet_data_, input, size);
+
+  // Modify the packet header so it's WMAPro compatible. The first 32-bit word
+  // is read and written in host byte order via memcpy rather than through a
+  // type-punned pointer.
+  uint32_t header = 0;
+  std::memcpy(&header, packet_data_, sizeof(header));
+  header = static_cast<uint32_t>(((offset_ & 0x7800) | 0x400) >> 7) |
+           (header & 0xFFFEFF08);
+  std::memcpy(packet_data_, &header, sizeof(header));
+
+  // Sequence number. Only bits 11-14 are ever used above, so wrapping at
+  // 15 bits keeps the same sequence while preventing signed overflow.
+  offset_ = (offset_ + static_cast<int>(XMAContextData::kBytesPerBlock)) &
+            0x7FFF;
+
+  packet_->data = packet_data_;
+  packet_->size = XMAContextData::kBytesPerBlock;
+
+  return 0;
+}
+
+// Throws away the staged packet and any partially consumed decoded frame so
+// that PreparePacket() will accept new input. Does nothing when idle.
+void AudioDecoder::DiscardPacket() {
+  const bool packet_pending = packet_->size > 0;
+  const bool frame_pending = current_frame_pos_ != frame_samples_size_;
+  if (!packet_pending && !frame_pending) {
+    return;
+  }
+
+  packet_->data = nullptr;
+  packet_->size = 0;
+  // Mark the held frame as fully read.
+  current_frame_pos_ = frame_samples_size_;
+}
+
+// Decodes from the staged packet into `output`.
+//
+// output:        destination buffer.
+// output_offset: byte offset into `output` at which to start writing.
+// output_size:   maximum number of bytes to write.
+//
+// Returns the number of bytes written (0 when there is nothing to decode), or
+// a negative value on error: the libav error code if the codec failed, -2 if
+// the frame holds more samples than XMAContextData::kSamplesPerFrame, -3 if
+// the sample format is not planar float, -4 if libav's buffer size does not
+// match, -5 if the channel count does not fit the frame buffer.
+//
+// Samples are emitted as signed integers of bits_ bits, least significant
+// byte first, with channels interleaved.
+// NOTE(review): confirm the byte order and interleaving the guest expects.
+int AudioDecoder::DecodePacket(uint8_t* output, size_t output_offset,
+                               size_t output_size) {
+  const size_t original_offset = output_offset;
+  const uint32_t sample_size = bits_ / 8;
+
+  // We're holding onto an already-decoded frame. Copy it out.
+  if (current_frame_pos_ != frame_samples_size_) {
+    const size_t held =
+        static_cast<size_t>(frame_samples_size_) - current_frame_pos_;
+    const size_t to_copy = std::min(output_size, held);
+    std::memcpy(output + output_offset, current_frame_ + current_frame_pos_,
+                to_copy);
+
+    current_frame_pos_ += to_copy;
+    output_size -= to_copy;
+    output_offset += to_copy;
+  }
+
+  // Integer limits for the output width; 64-bit so 32-bit output cannot
+  // overflow the shift.
+  const int64_t max_sample = (static_cast<int64_t>(1) << (bits_ - 1)) - 1;
+  const int64_t min_sample = -max_sample - 1;
+  const float scale =
+      static_cast<float>(static_cast<int64_t>(1) << (bits_ - 1));
+
+  while (output_size > 0 && packet_->size > 0) {
+    int got_frame = 0;
+
+    // Decode the current frame
+    const int len =
+        avcodec_decode_audio4(context_, decoded_frame_, &got_frame, packet_);
+    if (len < 0) {
+      // Error in codec (bad sample rate or something)
+      return len;
+    }
+    if (len == 0 && !got_frame) {
+      // The codec made no progress; drop the rest of the packet instead of
+      // spinning on it forever.
+      packet_->data = nullptr;
+      packet_->size = 0;
+      break;
+    }
+
+    // Offset by decoded length (never past the end of the packet).
+    const int consumed = std::min(len, packet_->size);
+    packet_->size -= consumed;
+    packet_->data += consumed;
+    packet_->dts = packet_->pts = AV_NOPTS_VALUE;
+
+    if (!got_frame) {
+      continue;
+    }
+
+    // Successfully decoded a frame
+    const int channels = context_->channels;
+    const int nb_samples = decoded_frame_->nb_samples;
+    if (nb_samples < 0 ||
+        static_cast<int64_t>(nb_samples) >
+            static_cast<int64_t>(XMAContextData::kSamplesPerFrame)) {
+      return -2;
+    } else if (context_->sample_fmt != AV_SAMPLE_FMT_FLTP) {
+      return -3;
+    }
+
+    // Check the returned buffer size
+    if (av_samples_get_buffer_size(nullptr, channels, nb_samples,
+                                   context_->sample_fmt, 1) !=
+        static_cast<int>(channels * nb_samples * sizeof(float))) {
+      return -4;
+    }
+
+    // current_frame_ is sized for at most two channels.
+    if (channels < 1 || channels > 2) {
+      return -5;
+    }
+
+    // Planar float: one plane per channel.
+    const float* planes[2] = {nullptr, nullptr};
+    for (int ch = 0; ch < channels; ++ch) {
+      planes[ch] = reinterpret_cast<const float*>(decoded_frame_->data[ch]);
+    }
+
+    // Convert every sample of every channel and interleave them so the whole
+    // channels * nb_samples region that gets copied out holds valid data.
+    uint8_t* dest = current_frame_;
+    for (int i = 0; i < nb_samples; ++i) {
+      for (int ch = 0; ch < channels; ++ch) {
+        // Round half away from zero.
+        float scaled = planes[ch][i] * scale;
+        scaled += (scaled >= 0) ? 0.5f : -0.5f;
+
+        // Weird problem: Sometimes the samples are outside [-1,1], so clamp
+        // on both sides before narrowing.
+        int64_t sample;
+        if (scaled != scaled) {
+          sample = 0;  // NaN
+        } else if (scaled >= static_cast<float>(max_sample)) {
+          sample = max_sample;
+        } else if (scaled <= static_cast<float>(min_sample)) {
+          sample = min_sample;
+        } else {
+          sample = static_cast<int64_t>(scaled);
+        }
+
+        // Emit least significant byte first.
+        for (uint32_t j = 0; j < sample_size; ++j) {
+          *dest++ = static_cast<uint8_t>(sample & 0xFF);
+          sample >>= 8;
+        }
+      }
+    }
+
+    // Total size of the frame's samples
+    frame_samples_size_ =
+        static_cast<uint32_t>(channels * nb_samples * sample_size);
+    current_frame_pos_ = 0;
+
+    const size_t to_copy =
+        std::min(output_size, static_cast<size_t>(frame_samples_size_));
+    std::memcpy(output + output_offset, current_frame_, to_copy);
+
+    current_frame_pos_ += to_copy;
+    output_size -= to_copy;
+    output_offset += to_copy;
+  }
+
+  // Return number of bytes written (typically 2048)
+  return static_cast<int>(output_offset - original_offset);
+}
+
+
+} // namespace apu
+} // namespace xe
--- a/src/xenia/apu/audio_decoder.h
+++ b/src/xenia/apu/audio_decoder.h
@ -0,0 +1,67 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project                                 *
+ ******************************************************************************
+ * Copyright 2015 Ben Vanik. All rights reserved.                             *
+ * Released under the BSD license - see LICENSE in the root for more details. *
+ ******************************************************************************
+ */
+
+#ifndef XENIA_APU_AUDIO_DECODER_H_
+#define XENIA_APU_AUDIO_DECODER_H_
+
+#include "xenia/apu/audio_system.h"
+
+// XMA audio format:
+// From research, XMA appears to be based on WMA Pro with
+// a few (very slight) modifications.
+// XMA2 is fully backwards-compatible with XMA1.
+
+// Helpful resources:
+// https://github.com/koolkdev/libertyv/blob/master/libav_wrapper/xma2dec.c
+// http://hcs64.com/mboard/forum.php?showthread=14818
+// https://github.com/hrydgard/minidx9/blob/master/Include/xma2defs.h
+
+// Forward declarations
+struct AVCodec;
+struct AVCodecContext;
+struct AVFrame;
+struct AVPacket;
+
+namespace xe {
+namespace apu {
+
+// Wraps a libav WMA Pro decoder to turn XMA packets into integer PCM.
+// Typical use: Initialize() once, then alternate PreparePacket() and
+// DecodePacket() until the latter returns 0.
+class AudioDecoder {
+  public:
+    AudioDecoder();
+    ~AudioDecoder();
+
+    // The decoder holds raw libav and heap pointers, so copies would alias
+    // them.
+    AudioDecoder(const AudioDecoder&) = delete;
+    AudioDecoder& operator=(const AudioDecoder&) = delete;
+
+    // Allocates decoder state for `bits` output bits per sample (a multiple
+    // of 8, at most 32). Returns 0 on success, 1 on failure.
+    int Initialize(int bits);
+    void Cleanup();
+
+    // Stages one packet of exactly XMAContextData::kBytesPerBlock bytes.
+    // Returns 0 on success, 1 if the packet was rejected.
+    int PreparePacket(uint8_t* input, size_t size, int sample_rate, int channels);
+    // Drops the staged packet and any partially read decoded frame.
+    void DiscardPacket();
+
+    // Writes up to `size` decoded bytes to output + offset. Returns the number
+    // of bytes written, or a negative value on error.
+    int DecodePacket(uint8_t* output, size_t offset, size_t size);
+
+  private:
+    // libav's decode API requires input buffers to be over-allocated, because
+    // its bitstream readers may read a little past the end of the data.
+    // NOTE(review): must stay >= libav's FF_INPUT_BUFFER_PADDING_SIZE.
+    static const size_t kPacketPaddingBytes = 64;
+
+    AVCodec* codec_;
+    AVCodecContext* context_;
+    AVFrame* decoded_frame_;
+    AVPacket* packet_;             // Packet currently being consumed.
+
+    uint8_t bits_per_frame_;       // Not referenced by audio_decoder.cpp.
+    uint32_t bits_;                // Output bits per sample.
+    size_t current_frame_pos_;     // Read position within current_frame_.
+    uint8_t* current_frame_;       // Last converted frame.
+    uint32_t frame_samples_size_;  // Valid bytes in current_frame_.
+    int offset_;                   // Running packet sequence offset.
+
+    uint8_t packet_data_[XMAContextData::kBytesPerBlock + kPacketPaddingBytes];
+};
+
+} // namespace apu
+} // namespace xe
+
+
+#endif  // XENIA_APU_AUDIO_DECODER_H_
--- a/src/xenia/apu/audio_system.cc
+++ b/src/xenia/apu/audio_system.cc
@ -10,6 +10,7 @@
 #include "xenia/apu/audio_system.h"

 #include "xenia/apu/audio_driver.h"
+#include "xenia/apu/audio_decoder.h"
 #include "xenia/base/logging.h"
 #include "xenia/base/math.h"
 #include "xenia/cpu/processor.h"
@ -18,6 +19,8 @@
 #include "xenia/kernel/objects/xthread.h"
 #include "xenia/profiling.h"

+#include "libavutil/log.h"
+
 // As with normal Microsoft, there are like twelve different ways to access
 // the audio APIs. Early games use XMA*() methods almost exclusively to touch
 // decoders. Later games use XAudio*() and direct memory writes to the XMA
@ -55,7 +58,8 @@ const uint32_t kXmaContextSize = 64;
 const uint32_t kXmaContextCount = 320;

 AudioSystem::AudioSystem(Emulator* emulator)
-    : emulator_(emulator), memory_(emulator->memory()), worker_running_(false) {
+    : emulator_(emulator), memory_(emulator->memory()), worker_running_(false),
+    decoder_running_(false) {
  memset(clients_, 0, sizeof(clients_));
  for (size_t i = 0; i < maximum_client_count_; ++i) {
    unused_clients_.push(i);
@ -85,8 +89,16 @@ X_STATUS AudioSystem::Setup() {
      kXmaContextSize * kXmaContextCount, 256, kSystemHeapPhysical);
  // Add all contexts to the free list.
  for (int i = kXmaContextCount - 1; i >= 0; --i) {
-    xma_context_free_list_.push_back(registers_.xma_context_array_ptr +
-                                     i * kXmaContextSize);
+    uint32_t ptr = registers_.xma_context_array_ptr + i * kXmaContextSize;
+
+    // Initialize it
+    xma_context_array_[i].guest_ptr = ptr;
+    xma_context_array_[i].in_use = false;
+
+    // Create a new decoder per context
+    // Needed because some data needs to be persisted across calls
+    xma_context_array_[i].decoder = new AudioDecoder();
+    xma_context_array_[i].decoder->Initialize(16);
  }
  registers_.next_context = 1;

@ -98,6 +110,9 @@ X_STATUS AudioSystem::Setup() {
  });
  worker_thread_->Create();

+  decoder_running_ = true;
+  decoder_thread_ = std::thread(std::bind(&AudioSystem::DecoderThreadMain, this));
+
  return X_STATUS_SUCCESS;
 }

@ -128,6 +143,7 @@ void AudioSystem::WorkerThreadMain() {
        uint32_t client_callback = clients_[index].callback;
        uint32_t client_callback_arg = clients_[index].wrapped_callback_arg;
        lock_.unlock();
+
        if (client_callback) {
          uint64_t args[] = {client_callback_arg};
          processor->Execute(worker_thread_->thread_state(), client_callback,
@ -154,6 +170,156 @@ void AudioSystem::WorkerThreadMain() {
  // TODO(benvanik): call module API to kill?
 }

+// Body of the "Audio Decoder" host thread. Each time the fence is signaled it
+// walks every XMA context and, for those in use with a valid input buffer,
+// decodes as much input as fits into the context's output buffer, then writes
+// the updated context back to guest memory.
+void AudioSystem::DecoderThreadMain() {
+  xe::threading::set_name("Audio Decoder");
+  xe::Profiler::ThreadEnter("Audio Decoder");
+
+  while (decoder_running_) {
+    // Wait for the fence
+    // FIXME: This actually does nothing once signaled once
+    decoder_fence_.Wait();
+
+    // Check to see if we're supposed to exit
+    if (!decoder_running_) {
+      break;
+    }
+
+    // Okay, let's loop through XMA contexts to find ones we need to decode!
+    for (uint32_t n = 0; n < kXmaContextCount; n++) {
+      XMAContext& context = xma_context_array_[n];
+
+      // Skip contexts someone else is working on. The guard releases the lock
+      // on every path out of this iteration.
+      std::unique_lock<std::mutex> context_guard(context.lock,
+                                                 std::try_to_lock);
+      if (!context_guard.owns_lock()) {
+        continue;
+      }
+
+      // Skip unused contexts
+      if (!context.in_use) {
+        continue;
+      }
+
+      uint8_t* ptr = memory()->TranslatePhysical(context.guest_ptr);
+      auto data = XMAContextData(ptr);
+
+      if (!data.input_buffer_0_valid && !data.input_buffer_1_valid) {
+        // Nothing queued for this context.
+        continue;
+      }
+
+      // A buffer is valid. Run the decoder!
+
+      // Reset valid flags
+      data.input_buffer_0_valid = 0;
+      data.input_buffer_1_valid = 0;
+      data.output_buffer_valid = 0;
+
+      // Translate pointers for future use.
+      auto in0 = memory()->TranslatePhysical(data.input_buffer_0_ptr);
+      auto out = memory()->TranslatePhysical(data.output_buffer_ptr);
+
+      // I haven't seen this be used yet.
+      assert(!data.input_buffer_1_block_count);
+
+      // What I see:
+      // XMA outputs 2 bytes per sample
+      // 512 samples per frame (128 per subframe)
+      // Max output size is data.output_buffer_block_count * 256
+
+      // This decoder is fed packets (max 4095 per buffer)
+      // Packets contain "some" frames
+      // 32bit header (big endian)
+
+      // Frames are the smallest thing the SPUs can decode.
+      // They usually can span packets (libav handles this)
+
+      // Sample rates (data.sample_rate):
+      // 0 - 24 kHz ?
+      // 1 - 32 kHz
+      // 2 - 44.1 kHz ?
+      // 3 - 48 kHz ?
+
+      // SPUs also support stereo decoding. (data.is_stereo)
+
+      const size_t block_size = XMAContextData::kBytesPerBlock;
+      const size_t input_size = (data.input_buffer_0_block_count +
+                                 data.input_buffer_1_block_count) * block_size;
+
+      while (true) {
+        // Check the output buffer first.
+        const size_t output_size = data.output_buffer_block_count * 256;
+        const size_t output_offset = data.output_buffer_write_offset * 256;
+        if (output_offset >= output_size) {
+          // Can't write any more data. Break.
+          // The game will kick us again with a new output buffer later.
+          break;
+        }
+        const size_t output_remaining = output_size - output_offset;
+
+        // This'll copy audio samples into the output buffer.
+        // The samples need to be 2 bytes long!
+        // Copies one frame at a time, so keep calling this until it returns 0.
+        // Decoding runs before the input check so a packet staged on the
+        // previous pass is drained even when the input is already used up.
+        const int read = context.decoder->DecodePacket(out, output_offset,
+                                                       output_remaining);
+        if (read < 0) {
+          XELOGAPU("APU failed to decode packet (returned %.8X)", -read);
+          context.decoder->DiscardPacket();
+
+          // TODO: Set error state
+
+          break;
+        }
+
+        if (read > 0) {
+          data.output_buffer_write_offset = (output_offset + read) / 256;
+          continue;
+        }
+
+        // The decoder is empty: feed it the next packet, if any is left.
+        // input_buffer_read_offset appears to be in bits and to include the
+        // 4-byte packet header. NOTE(review): confirm; offsets below 4 bytes
+        // are treated as the start of the buffer instead of underflowing.
+        const size_t read_bytes = data.input_buffer_read_offset / 8;
+        const size_t input_offset = read_bytes >= 4 ? read_bytes - 4 : 0;
+        if (input_offset + block_size > input_size) {
+          // We're finished!
+          break;
+        }
+
+        // Select sample rate.
+        int sample_rate = 0;
+        switch (data.sample_rate) {
+          case 0:
+            // TODO: Test this
+            sample_rate = 24000;
+            break;
+          case 1:
+            sample_rate = 32000;
+            break;
+          case 2:
+            // TODO: Test this
+            sample_rate = 44100;
+            break;
+          case 3:
+            // TODO: Test this
+            sample_rate = 48000;
+            break;
+        }
+
+        // Channels
+        const int channels = (data.is_stereo == 1) ? 2 : 1;
+
+        // New packet time.
+        // TODO: Select input buffer 1 if necessary.
+        const int prepared = context.decoder->PreparePacket(
+            in0 + input_offset, block_size, sample_rate, channels);
+
+        // Copy the advanced read position back to the context.
+        data.input_buffer_read_offset = (input_offset + block_size + 4) * 8;
+
+        if (prepared != 0) {
+          XELOGAPU("APU failed to prepare packet (returned %d)", prepared);
+          context.decoder->DiscardPacket();
+
+          // TODO: Set error state
+
+          break;
+        }
+      }
+
+      data.Store(ptr);
+    }
+  }
+
+  xe::Profiler::ThreadExit();
+}
+
 void AudioSystem::Initialize() {}

 void AudioSystem::Shutdown() {
@ -162,30 +328,44 @@ void AudioSystem::Shutdown() {
  worker_thread_->Wait(0, 0, 0, nullptr);
  worker_thread_->Release();

+  decoder_running_ = false;
+  decoder_fence_.Signal();
+
  memory()->SystemHeapFree(registers_.xma_context_array_ptr);
 }

 uint32_t AudioSystem::AllocateXmaContext() {
  std::lock_guard<std::mutex> lock(lock_);
-  if (xma_context_free_list_.empty()) {
-    // No contexts available.
-    return 0;
+
+  for (uint32_t n = 0; n < kXmaContextCount; n++) {
+    XMAContext& context = xma_context_array_[n];
+    if (!context.in_use) {
+      context.in_use = true;
+      return context.guest_ptr;
+    }
  }

-  auto guest_ptr = xma_context_free_list_.back();
-  xma_context_free_list_.pop_back();
-  auto context_ptr = memory()->TranslateVirtual(guest_ptr);
-
-  return guest_ptr;
+  return 0;
 }

 void AudioSystem::ReleaseXmaContext(uint32_t guest_ptr) {
  std::lock_guard<std::mutex> lock(lock_);

-  auto context_ptr = memory()->TranslateVirtual(guest_ptr);
-  std::memset(context_ptr, 0, kXmaContextSize);
+  // Find it in the list.
+  for (uint32_t n = 0; n < kXmaContextCount; n++) {
+    XMAContext& context = xma_context_array_[n];
+    if (context.guest_ptr == guest_ptr) {
+      // Found it!
+      // Lock it in case the decoder thread is working on it now
+      context.lock.lock();

-  xma_context_free_list_.push_back(guest_ptr);
+      context.in_use = false;
+      auto context_ptr = memory()->TranslateVirtual(guest_ptr);
+      std::memset(context_ptr, 0, kXmaContextSize); // Zero it.
+      
+      context.lock.unlock();
+    }
+  }
 }

 X_STATUS AudioSystem::RegisterClient(uint32_t callback, uint32_t callback_arg,
@ -280,42 +460,49 @@ void AudioSystem::WriteRegister(uint32_t addr, uint64_t value) {
  if (r >= 0x1940 && r <= 0x1940 + 9 * 4) {
    // Context kick command.
    // This will kick off the given hardware contexts.
+    // Basically, this kicks the SPU and says "hey, decode that audio!"
+    // XMAEnableContext
+
+    // The context ID is a bit in the range of the entire context array
    for (int i = 0; value && i < 32; ++i) {
      if (value & 1) {
        uint32_t context_id = i + (r - 0x1940) / 4 * 32;
-        XELOGAPU("AudioSystem: kicking context %d", context_id);
-        // Games check bits 20/21 of context[0].
-        // If both bits are set buffer full, otherwise room available.
-        // Right after a kick we always set buffers to invalid so games keep
-        // feeding data.
-        uint32_t guest_ptr =
-            registers_.xma_context_array_ptr + context_id * kXmaContextSize;
-        auto context_ptr = memory()->TranslateVirtual(guest_ptr);
+        XMAContext& context = xma_context_array_[context_id];
+
+        context.lock.lock();
+        auto context_ptr = memory()->TranslateVirtual(context.guest_ptr);
        XMAContextData data(context_ptr);

-        bool has_valid_input = data.input_buffer_0_valid ||
-                               data.input_buffer_1_valid;
-        if (has_valid_input) {
-          // Invalidate the buffers.
-          data.input_buffer_0_valid = 0;
-          data.input_buffer_1_valid = 0;
+        XELOGAPU("AudioSystem: kicking context %d (%d/%d bytes)", context_id,
+                 data.input_buffer_read_offset, data.input_buffer_0_block_count
+                 * XMAContextData::kBytesPerBlock);

-          // Set output buffer to invalid.
-          data.output_buffer_valid = false;
-        }
+        // Reset valid flags so our audio decoder knows to process this one
+        data.input_buffer_0_valid = data.input_buffer_0_ptr != 0;
+        data.input_buffer_1_valid = data.input_buffer_1_ptr != 0;
+        data.output_buffer_write_offset = 0;

        data.Store(context_ptr);
+        context.lock.unlock();
+
+        // Signal the decoder thread
+        decoder_fence_.Signal();
      }
      value >>= 1;
    }
  } else if (r >= 0x1A40 && r <= 0x1A40 + 9 * 4) {
    // Context lock command.
    // This requests a lock by flagging the context.
+    // XMADisableContext
    for (int i = 0; value && i < 32; ++i) {
      if (value & 1) {
        uint32_t context_id = i + (r - 0x1A40) / 4 * 32;
        XELOGAPU("AudioSystem: set context lock %d", context_id);
-        // TODO(benvanik): set lock?
+
+        // TODO: Find the correct way to lock/unlock this.
+        // I thought we could lock it here, unlock it in the kick but that
+        // doesn't seem to work
+        XMAContext& context = xma_context_array_[context_id];
      }
      value >>= 1;
    }
@ -326,15 +513,11 @@ void AudioSystem::WriteRegister(uint32_t addr, uint64_t value) {
      if (value & 1) {
        uint32_t context_id = i + (r - 0x1A80) / 4 * 32;
        XELOGAPU("AudioSystem: reset context %d", context_id);
+
        // TODO(benvanik): something?
        uint32_t guest_ptr =
            registers_.xma_context_array_ptr + context_id * kXmaContextSize;
        auto context_ptr = memory()->TranslateVirtual(guest_ptr);
-        XMAContextData data(context_ptr);
-
-
-
-        data.Store(context_ptr);
      }
      value >>= 1;
    }
--- a/src/xenia/apu/audio_system.h
+++ b/src/xenia/apu/audio_system.h
@ -27,10 +27,16 @@ namespace xe {
 namespace apu {

 class AudioDriver;
+class AudioDecoder;

 // This is stored in guest space in big-endian order.
 // We load and swap the whole thing to splat here so that we can
 // use bitfields.
+// This could be important:
+// http://www.fmod.org/questions/question/forum-15859
+// Appears to be dumped in order (for the most part)
+
+// http://pastebin.com/9amqJ2kQ
 struct XMAContextData {
  static const uint32_t kSize = 64;
  static const uint32_t kBytesPerBlock = 2048;
@ -39,53 +45,54 @@ struct XMAContextData {

  // DWORD 0
  uint32_t input_buffer_0_block_count : 12;  // XMASetInputBuffer0, number of
-                                             // 2KB blocks.
-  uint32_t loop_count : 8;                   // +12bit, XMASetLoopData
+                                             // 2KB blocks. AKA SizeRead0
+                                             // Maximum 4095 packets.
+  uint32_t loop_count : 8;                   // +12bit, XMASetLoopData NumLoops
  uint32_t input_buffer_0_valid : 1;         // +20bit, XMAIsInputBuffer0Valid
  uint32_t input_buffer_1_valid : 1;         // +21bit, XMAIsInputBuffer1Valid
-  uint32_t output_buffer_block_count : 5;    // +22bit
-  uint32_t
-    output_buffer_write_offset : 5;  // +27bit, XMAGetOutputBufferWriteOffset
+  uint32_t output_buffer_block_count : 5;    // +22bit SizeWrite 256byte blocks
+  uint32_t output_buffer_write_offset : 5;   // +27bit, XMAGetOutputBufferWriteOffset
+                                             // AKA OffsetWrite

-                                     // DWORD 1
+  // DWORD 1
  uint32_t input_buffer_1_block_count : 12;  // XMASetInputBuffer1, number of
                                             // 2KB blocks.
  uint32_t loop_subframe_end : 2;            // +12bit, XMASetLoopData
-  uint32_t unk_dword_1_a : 3;                // ?
-  uint32_t loop_subframe_skip : 3;           // +17bit, XMASetLoopData
-  uint32_t subframe_decode_count : 4;        // +20bit
-  uint32_t unk_dword_1_b : 3;                // ?
-  uint32_t sample_rate : 2;                  // +27bit
-  uint32_t is_stereo : 1;                    // +29bit
-  uint32_t unk_dword_1_c : 1;                // ?
+  uint32_t unk_dword_1_a : 3;                // ? might be loop_subframe_skip
+  uint32_t loop_subframe_skip : 3;           // +17bit, XMASetLoopData might be subframe_decode_count
+  uint32_t subframe_decode_count : 4;        // +20bit might be subframe_skip_count
+  uint32_t unk_dword_1_b : 3;                // ? NumSubframesToSkip/NumChannels(?)
+  uint32_t sample_rate : 2;                  // +27bit multiplied by something?
+  uint32_t is_stereo : 1;                    // +29bit might be NumChannels
+  uint32_t unk_dword_1_c : 1;                // ? NumChannels?
  uint32_t output_buffer_valid : 1;          // +31bit, XMAIsOutputBufferValid

-                                             // DWORD 2
-  uint32_t input_buffer_read_offset : 30;  // XMAGetInputBufferReadOffset
-  uint32_t unk_dword_2 : 2;                // ?
+  // DWORD 2
+  uint32_t input_buffer_read_offset : 30;   // XMAGetInputBufferReadOffset
+  uint32_t unk_dword_2 : 2;                 // ErrorStatus/ErrorSet (?)

-                                           // DWORD 3
-  uint32_t loop_start : 26;  // XMASetLoopData
-  uint32_t unk_dword_3 : 6;  // ?
+  // DWORD 3
+  uint32_t loop_start : 26;  // XMASetLoopData LoopStartOffset
+  uint32_t unk_dword_3 : 6;  // ? ParserErrorStatus/ParserErrorSet(?)

-                             // DWORD 4
-  uint32_t loop_end : 26;        // XMASetLoopData
+  // DWORD 4
+  uint32_t loop_end : 26;        // XMASetLoopData LoopEndOffset
  uint32_t packet_metadata : 5;  // XMAGetPacketMetadata
  uint32_t current_buffer : 1;   // ?

-                                // DWORD 5
+  // DWORD 5
  uint32_t input_buffer_0_ptr;  // physical address
-                                // DWORD 6
+  // DWORD 6
  uint32_t input_buffer_1_ptr;  // physical address
-                                // DWORD 7
+  // DWORD 7
  uint32_t output_buffer_ptr;   // physical address
-                                // DWORD 8
-  uint32_t unk_dword_8;         // Some kind of pointer like output_buffer_ptr
+  // DWORD 8
+  uint32_t overlap_add_ptr;     // PtrOverlapAdd(?)

-                                // DWORD 9
-  uint32_t
-    output_buffer_read_offset : 5;  // +0bit, XMAGetOutputBufferReadOffset
-  uint32_t unk_dword_9 : 27;
+  // DWORD 9
+  // +0bit, XMAGetOutputBufferReadOffset AKA WriteBufferOffsetRead
+  uint32_t output_buffer_read_offset : 5;  
+  uint32_t unk_dword_9 : 27; // StopWhenDone/InterruptWhenDone(?)

  XMAContextData(const void* ptr) {
    xe::copy_and_swap_32_aligned(reinterpret_cast<uint32_t*>(this),
@ -135,6 +142,7 @@ class AudioSystem {

 private:
  void WorkerThreadMain();
+  void DecoderThreadMain();

  static uint64_t MMIOReadRegisterThunk(AudioSystem* as, uint32_t addr) {
    return as->ReadRegister(addr);
@ -154,6 +162,10 @@ class AudioSystem {
  std::atomic<bool> worker_running_;
  kernel::XHostThread* worker_thread_;

+  std::atomic<bool> decoder_running_;
+  std::thread decoder_thread_;
+  xe::threading::Fence decoder_fence_;
+
  std::mutex lock_;

  // Stored little endian, accessed through 0x7FEA....
@ -176,7 +188,17 @@ class AudioSystem {
    } registers_;
    uint32_t register_file_[0xFFFF / 4];
  };
+  // Host-side bookkeeping for one slot of the guest XMA context array.
+  // Members get safe defaults so a slot is never read in an indeterminate
+  // state before Setup() fills it in.
+  struct XMAContext {
+    uint32_t    guest_ptr = 0;   // Guest address of this context's data.
+    std::mutex  lock;            // Held while the context is read or written.
+    bool        in_use = false;  // True while allocated to the guest.
+
+    AudioDecoder* decoder = nullptr;  // Per-context decoder state.
+  };
+
+  XMAContext xma_context_array_[320];
  std::vector<uint32_t> xma_context_free_list_;
+  std::vector<uint32_t> xma_context_used_list_; // XMA contexts in use

  static const size_t maximum_client_count_ = 8;