From 8a144a735f4116650f59e6931fed86c1c00fa03c Mon Sep 17 00:00:00 2001
From: Pokechu22 <Pokechu022@gmail.com>
Date: Tue, 14 Jun 2022 16:25:37 -0700
Subject: [PATCH] DSPHLE: Add HLE version of libasnd ucode

---
 Source/Core/Core/CMakeLists.txt              |   2 +
 Source/Core/Core/HW/DSPHLE/UCodes/ASnd.cpp   | 568 +++++++++++++++++++
 Source/Core/Core/HW/DSPHLE/UCodes/ASnd.h     | 113 ++++
 Source/Core/Core/HW/DSPHLE/UCodes/UCodes.cpp |   8 +
 Source/Core/DolphinLib.props                 |   2 +
 5 files changed, 693 insertions(+)
 create mode 100644 Source/Core/Core/HW/DSPHLE/UCodes/ASnd.cpp
 create mode 100644 Source/Core/Core/HW/DSPHLE/UCodes/ASnd.h
diff --git a/Source/Core/Core/CMakeLists.txt b/Source/Core/Core/CMakeLists.txt
index f958f7dd70..8a4db4c961 100644
--- a/Source/Core/Core/CMakeLists.txt
+++ b/Source/Core/Core/CMakeLists.txt
@@ -142,6 +142,8 @@ add_library(core
   HW/DSPHLE/DSPHLE.h
   HW/DSPHLE/MailHandler.cpp
   HW/DSPHLE/MailHandler.h
+  HW/DSPHLE/UCodes/ASnd.cpp
+  HW/DSPHLE/UCodes/ASnd.h
   HW/DSPHLE/UCodes/AX.cpp
   HW/DSPHLE/UCodes/AX.h
   HW/DSPHLE/UCodes/AXStructs.h
diff --git a/Source/Core/Core/HW/DSPHLE/UCodes/ASnd.cpp b/Source/Core/Core/HW/DSPHLE/UCodes/ASnd.cpp
new file mode 100644
index 0000000000..44fd55b00a
--- /dev/null
+++ b/Source/Core/Core/HW/DSPHLE/UCodes/ASnd.cpp
@@ -0,0 +1,568 @@
+// Copyright 2022 Dolphin Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+// High-level emulation for the libasnd ucode, used by older homebrew
+// libasnd is copyright 2008 Hermes <www.entuwii.net> and released under the BSD-3-Clause license
+
+#include "Core/HW/DSPHLE/UCodes/ASnd.h"
+
+#include <algorithm>
+
+#include "Common/ChunkFile.h"
+#include "Common/Logging/Log.h"
+#include "Common/Swap.h"
+#include "Core/HW/DSP.h"
+#include "Core/HW/DSPHLE/DSPHLE.h"
+#include "Core/HW/DSPHLE/MailHandler.h"
+#include "Core/HW/DSPHLE/UCodes/UCodes.h"
+
+namespace DSP::HLE
+{
+// "fill the internal sample buffer and process the voice internally"
+constexpr u32 MAIL_INPUT_SAMPLES = 0x0111;
+// "get samples from the external buffer to the internal buffer and process the voice mixing the
+// samples internally" (not used)
+constexpr u32 MAIL_INPUT_SAMPLES_2 = 0x0112;
+// "get the address of the voice datas buffer (CHANNEL DATAS)" (actually sets it)
+constexpr u32 MAIL_SET_VOICE_DATA_BUFFER = 0x0123;
+// "process the voice mixing the samples internally"
+constexpr u32 MAIL_INPUT_NEXT_SAMPLES = 0x0222;
+// "send the samples for the internal buffer to the external buffer"
+constexpr u32 MAIN_SEND_SAMPLES = 0x0666;
+// "special: to dump the IROM Datas (remember disable others functions from the interrupt vector to
+// use) (CMBH+0x8000) countain the address of IROM" (not used)
+constexpr u32 MAIL_ROM_DUMP_WORD = 0x0777;
+// "Used for test" (not used)
+constexpr u32 MAIL_TEST = 0x0888;
+constexpr u32 MAIL_TERMINATE = 0x0999;
+
+// Note that there are additional flags used on the powerpc side (UPDATEADD, UPDATE, and VOLUPDATE)
+// that are not relevant to the DSP side.
+// The old flags are used in the 2008 and 2009 versions, while the new ones are used in the 2011 and
+// 2020 versions.
+constexpr u32 OLD_FLAGS_VOICE_PAUSE = 1 << 5;
+constexpr u32 OLD_FLAGS_VOICE_LOOP = 1 << 2;
+constexpr u32 OLD_FLAGS_SAMPLE_FORMAT_MASK = 3;
+
+constexpr u32 NEW_FLAGS_VOICE_PAUSE = 1 << 9;
+constexpr u32 NEW_FLAGS_VOICE_LOOP = 1 << 8;
+constexpr u32 NEW_FLAGS_SAMPLE_FORMAT_MASK = 7;
+// Used on the PowerPC side as an enabled flag (if 0, then the voice isn't active).
+// Used on the DSP side as the step in bytes after each read.
+constexpr u32 FLAGS_SAMPLE_FORMAT_BYTES_MASK = 0xffff0000;
+constexpr u32 FLAGS_SAMPLE_FORMAT_BYTES_SHIFT = 16;
+
+// November 14, 2008 version (padded to 0x05a0 bytes) - initial release
+// https://github.com/devkitPro/libogc/compare/c76d8b851fafc11b0a5debc0b40842929d5a5825~...353a44f038e75e5982eb550173ec8127ab35e3e3
+constexpr u32 HASH_2008 = 0x8d69a19b;
+// February 5, 2009 version (0x05ac bytes) - added MAIL_TERMINATE
+// https://github.com/devkitPro/libogc/compare/1925217ffb4c97cbee5cf21fa3c0231029b340e2~...3b1f018dbe372859a43bff8560e2525f6efa4433
+constexpr u32 HASH_2009 = 0xcc2fd441;
+// June 11, 2011 version (padded to 0x0620 bytes) - added new sample formats, which shifted flags
+// Note that the source include in the repo does not match the compiled binary exactly; the compiled
+// version differs by using asl instead of lsl, $acc1 instead of $acc0, and $ac0.l instead of $ac0.m
+// in various locations, as well as having the "jmp out_samp" line uncommented in stereo_16bits_le.
+// None of these result in a behavior difference, from the source, though.
+// Note that gcdsptool was also updated, which results in some differences in the source that don't
+// actually correspond to different instructions (e.g. s40 was renamed to s16)
+// https://github.com/devkitPro/libogc/commit/b1b8ecab3af3745c8df0b401abd512bdf5fcc011
+constexpr u32 HASH_2011 = 0xa81582e2;
+// June 12, 2020 version (0x0606 bytes) - libogc switched to compiling the ucode at build time
+// instead of including a pre-compiled version in the repo, so this now corresponds to the code
+// provided in the repo. There appear to be no behavior differences from the 2011 version.
+// https://github.com/devkitPro/libogc/compare/bfb705fe1607a3031d18b65d603975b68a1cffd4~...d20f9bdcfb43260c6c759f4fb98d724931443f93
+constexpr u32 HASH_2020 = 0xdbbeeb61;
+
+constexpr u32 SAMPLE_RATE = 48000;
+
+ASndUCode::ASndUCode(DSPHLE* dsphle, u32 crc) : UCodeInterface(dsphle, crc)
+{
+}
+
+void ASndUCode::Initialize()
+{
+  m_mail_handler.PushMail(DSP_INIT);
+}
+
+void ASndUCode::Update()
+{
+  // This is dubious in general, since we set the interrupt parameter on m_mail_handler.PushMail
+  if (!m_mail_handler.IsEmpty())
+  {
+    DSP::GenerateDSPInterruptFromDSPEmu(DSP::INT_DSP);
+  }
+}
+
+void ASndUCode::HandleMail(u32 mail)
+{
+  if (m_upload_setup_in_progress)
+  {
+    PrepareBootUCode(mail);
+  }
+  else if (m_next_command_is_voice_addr)
+  {
+    m_voice_addr = mail;
+    INFO_LOG_FMT(DSPHLE, "ASndUCode - Voice data is at {:08x}", mail);
+    m_next_command_is_voice_addr = false;
+    // No mail is sent in response
+  }
+  else if ((mail & TASK_MAIL_MASK) == TASK_MAIL_TO_DSP)
+  {
+    switch (mail)
+    {
+    case MAIL_NEW_UCODE:
+      m_upload_setup_in_progress = true;
+      break;
+    case MAIL_RESET:
+      m_dsphle->SetUCode(UCODE_ROM);
+      break;
+    default:
+      WARN_LOG_FMT(DSPHLE, "ASndUCode - unknown 0xcdd1 command: {:08x}", mail);
+      break;
+    }
+    // No mail is sent in response to any of these
+  }
+  else
+  {
+    // The ucode only checks the lower word (as long as the upper word is not 0xCDD1).
+    switch (mail & 0xffff)
+    {
+    case MAIL_INPUT_SAMPLES:
+      DEBUG_LOG_FMT(DSPHLE, "ASndUCode - MAIL_INPUT_SAMPLES: {:08x}", mail);
+      // input_samples
+      DMAInVoiceData();
+      // loop_get1
+      m_output_buffer.fill(0);
+      DoMixing(DSP_SYNC);
+      // Mail is handled by DoMixing()
+      break;
+    case MAIL_INPUT_SAMPLES_2:
+      WARN_LOG_FMT(DSPHLE, "ASndUCode - MAIL_INPUT_SAMPLES_2: {:08x} - not normally used", mail);
+      // input_samples2
+      DMAInVoiceData();  // first do_dma call
+      // second do_dma call
+      for (u32 i = 0; i < NUM_OUTPUT_SAMPLES * 2; i++)
+      {
+        m_output_buffer[i] = HLEMemory_Read_U16(m_current_voice.out_buf + i * sizeof(u16));
+      }
+      DoMixing(DSP_SYNC);
+      // Mail is handled by DoMixing()
+      break;
+    case MAIL_SET_VOICE_DATA_BUFFER:
+      DEBUG_LOG_FMT(DSPHLE, "ASndUCode - MAIL_SET_VOICE_DATA_BUFFER: {:08x}", mail);
+      m_next_command_is_voice_addr = true;
+      // No mail is sent in response
+      break;
+    case MAIL_INPUT_NEXT_SAMPLES:
+      DEBUG_LOG_FMT(DSPHLE, "ASndUCode - MAIL_INPUT_NEXT_SAMPLES: {:08x}", mail);
+      // input_next_samples
+      DMAInVoiceData();
+      // called via fallthrough
+      DoMixing(DSP_SYNC);
+      // Mail is handled by DoMixing()
+      break;
+    case MAIN_SEND_SAMPLES:
+      DEBUG_LOG_FMT(DSPHLE, "ASndUCode - MAIN_SEND_SAMPLES: {:08x}", mail);
+      for (u32 i = 0; i < NUM_OUTPUT_SAMPLES * 2; i++)
+      {
+        HLEMemory_Write_U16(m_current_voice.out_buf + i * sizeof(u16), m_output_buffer[i]);
+      }
+      m_mail_handler.PushMail(DSP_SYNC, true);
+      break;
+    case MAIL_ROM_DUMP_WORD:
+      WARN_LOG_FMT(DSPHLE, "ASndUCode - MAIL_ROM_DUMP_WORD: {:08x} - not normally used", mail);
+      // Reads instruction at 0x8000 | (mail >> 16), and sends it back in DMBL. DMBH is 0.
+      m_mail_handler.PushMail(0x0000'0000, false);  // DIRQ is not set
+      break;
+    case MAIL_TEST:
+      WARN_LOG_FMT(DSPHLE, "ASndUCode - MAIL_TEST: {:08x} - not normally used", mail);
+      // Runs `lri $ac0.m, #0x0` and `andf $ac0.m, #0x1`
+      // and then sends $sr in DMBH and $ac0.m in DMBL
+      // The exact value of $sr will vary, but this isn't used by anything in practice.
+      m_mail_handler.PushMail(0x2264'0000, false);  // DIRQ is not set
+      break;
+    case MAIL_TERMINATE:
+      if (m_crc != HASH_2008)
+      {
+        INFO_LOG_FMT(DSPHLE, "ASndUCode - MAIL_TERMINATE: {:08x}", mail);
+        // This doesn't actually change the state of the system.
+        m_mail_handler.PushMail(DSP_DONE, true);
+      }
+      else
+      {
+        WARN_LOG_FMT(DSPHLE, "ASndUCode - MAIL_TERMINATE is not supported in this version: {:08x}",
+                     mail);
+        m_mail_handler.PushMail(DSP_SYNC, true);
+      }
+      break;
+    default:
+      WARN_LOG_FMT(DSPHLE, "ASndUCode - unknown command: {:08x}", mail);
+      m_mail_handler.PushMail(DSP_SYNC, true);
+      break;
+    }
+  }
+}
+
+void ASndUCode::DMAInVoiceData()
+{
+  m_current_voice.out_buf = HLEMemory_Read_U32(m_voice_addr);
+  m_current_voice.delay_samples = HLEMemory_Read_U32(m_voice_addr + 4);
+  u32 new_flags = HLEMemory_Read_U32(m_voice_addr + 8);
+  if (m_current_voice.flags != new_flags)
+    DEBUG_LOG_FMT(DSPHLE, "ASndUCode - flags: {:08x}", new_flags);
+  m_current_voice.flags = new_flags;
+  m_current_voice.start_addr = HLEMemory_Read_U32(m_voice_addr + 12);
+  m_current_voice.end_addr = HLEMemory_Read_U32(m_voice_addr + 16);
+  m_current_voice.freq = HLEMemory_Read_U32(m_voice_addr + 20);
+  m_current_voice.left = HLEMemory_Read_U16(m_voice_addr + 24);
+  m_current_voice.right = HLEMemory_Read_U16(m_voice_addr + 26);
+  m_current_voice.counter = HLEMemory_Read_U32(m_voice_addr + 28);
+  m_current_voice.volume_l = HLEMemory_Read_U16(m_voice_addr + 32);
+  m_current_voice.volume_r = HLEMemory_Read_U16(m_voice_addr + 34);
+  m_current_voice.start_addr2 = HLEMemory_Read_U32(m_voice_addr + 36);
+  m_current_voice.end_addr2 = HLEMemory_Read_U32(m_voice_addr + 40);
+  m_current_voice.volume2_l = HLEMemory_Read_U16(m_voice_addr + 44);
+  m_current_voice.volume2_r = HLEMemory_Read_U16(m_voice_addr + 46);
+  m_current_voice.backup_addr = HLEMemory_Read_U32(m_voice_addr + 48);
+  m_current_voice.tick_counter = HLEMemory_Read_U32(m_voice_addr + 52);
+  m_current_voice.cb = HLEMemory_Read_U32(m_voice_addr + 56);
+  m_current_voice._pad = HLEMemory_Read_U32(m_voice_addr + 60);
+}
+
+void ASndUCode::DMAOutVoiceData()
+{
+  HLEMemory_Write_U32(m_voice_addr, m_current_voice.out_buf);
+  HLEMemory_Write_U32(m_voice_addr + 4, m_current_voice.delay_samples);
+  HLEMemory_Write_U32(m_voice_addr + 8, m_current_voice.flags);
+  HLEMemory_Write_U32(m_voice_addr + 12, m_current_voice.start_addr);
+  HLEMemory_Write_U32(m_voice_addr + 16, m_current_voice.end_addr);
+  HLEMemory_Write_U32(m_voice_addr + 20, m_current_voice.freq);
+  HLEMemory_Write_U16(m_voice_addr + 24, m_current_voice.left);
+  HLEMemory_Write_U16(m_voice_addr + 26, m_current_voice.right);
+  HLEMemory_Write_U32(m_voice_addr + 28, m_current_voice.counter);
+  HLEMemory_Write_U16(m_voice_addr + 32, m_current_voice.volume_l);
+  HLEMemory_Write_U16(m_voice_addr + 34, m_current_voice.volume_r);
+  HLEMemory_Write_U32(m_voice_addr + 36, m_current_voice.start_addr2);
+  HLEMemory_Write_U32(m_voice_addr + 40, m_current_voice.end_addr2);
+  HLEMemory_Write_U16(m_voice_addr + 44, m_current_voice.volume2_l);
+  HLEMemory_Write_U16(m_voice_addr + 46, m_current_voice.volume2_r);
+  HLEMemory_Write_U32(m_voice_addr + 48, m_current_voice.backup_addr);
+  HLEMemory_Write_U32(m_voice_addr + 52, m_current_voice.tick_counter);
+  HLEMemory_Write_U32(m_voice_addr + 56, m_current_voice.cb);
+  HLEMemory_Write_U32(m_voice_addr + 60, m_current_voice._pad);
+}
+
+void ASndUCode::DoMixing(u32 return_mail)
+{
+  // Note: return_mail is set to DSP_SYNC by all callers, but then changed here... to DSP_SYNC.
+  // This doesn't really make sense, but I'm keeping that behavior in case it helps with porting
+  // older versions.
+
+  // start_main
+
+  const u32 sample_format_mask = (m_crc == HASH_2011 || m_crc == HASH_2020) ?
+                                     NEW_FLAGS_SAMPLE_FORMAT_MASK :
+                                     OLD_FLAGS_SAMPLE_FORMAT_MASK;
+  const u32 sample_format = m_current_voice.flags & sample_format_mask;
+  const u32 sample_format_step =
+      (m_current_voice.flags & FLAGS_SAMPLE_FORMAT_BYTES_MASK) >> FLAGS_SAMPLE_FORMAT_BYTES_SHIFT;
+
+  // sample_selector jump table
+  static constexpr std::array<std::pair<s16, s16> (ASndUCode::*)() const, 8> sample_selector{
+      &ASndUCode::ReadSampleMono8Bits,           &ASndUCode::ReadSampleMono16Bits,
+      &ASndUCode::ReadSampleStereo8Bits,         &ASndUCode::ReadSampleStereo16Bits,
+      &ASndUCode::ReadSampleMono8BitsUnsigned,   &ASndUCode::ReadSampleMono16BitsLittleEndian,
+      &ASndUCode::ReadSampleStereo8BitsUnsigned, &ASndUCode::ReadSampleStereo16BitsLittleEndian,
+  };
+  const auto sample_function = sample_selector[sample_format];
+
+  const u32 pause_mask =
+      (m_crc == HASH_2011 || m_crc == HASH_2020) ? NEW_FLAGS_VOICE_PAUSE : OLD_FLAGS_VOICE_PAUSE;
+
+  if ((m_current_voice.flags & pause_mask) == 0)
+  {
+    if (m_current_voice.start_addr == 0)
+    {
+      // "set return as "change of buffer""
+      // (which doesn't make sense, since we're changing the return value to what it already was)
+      return_mail = DSP_SYNC;
+      ChangeBuffer();
+      if (m_current_voice.start_addr == 0)
+      {
+        // Jump to save_datas_end, but since we write to start_addr directly in ChangeBuffer and
+        // can't have changed prev_r or prev_l here, there's nothing that needs to be saved.
+        // Thus, we can jump to end_main instead. This is still pretty ugly, but I don't think
+        // there's any way to tidy it further without making the structure much more convoluted.
+        goto end_main;
+      }
+    }
+    // do_not_change1
+
+    u32 buffer_offset = 0;
+    // "delay time section"
+    u16 remaining_samples = NUM_OUTPUT_SAMPLES;
+
+    // Note: We don't handle masking/sign extension from use of $ac0.h and $ac1.h, as this isn't
+    // something that should happen if the PowerPC code is using the library normally.
+    // This applies throughout.
+    if (m_current_voice.delay_samples != 0)
+    {
+      m_current_voice.right = 0;
+      m_current_voice.left = 0;
+      u16 remaining_samples_delay = remaining_samples;
+      // l_delay
+      while (true)
+      {
+        buffer_offset += 2;
+        remaining_samples_delay--;
+        if (remaining_samples_delay == 0)
+        {
+          // exit_delay1
+          remaining_samples = 0;
+          break;
+        }
+        m_current_voice.delay_samples--;
+        if (m_current_voice.delay_samples == 0)
+        {
+          // exit_delay2
+          // commented with "load remanent samples to be processed in AXL0",
+          // but they use "MRR $AX0.L, $AC1.L" when the remaining samples are in $AC1.M.
+          remaining_samples = 0;
+          break;
+        }
+      }
+    }
+    // no_delay
+    // "bucle de generacion de samples", i.e. sample generation loop
+    DMAInSampleData();
+    // "loops for samples to be processed", corresponding to BLOOP $ax0.l, loop_end
+    while (remaining_samples > 0)
+    {
+      remaining_samples--;
+      // "Mix right sample section"
+      s32 sample_r = static_cast<s16>(m_output_buffer[buffer_offset]);
+      sample_r += m_current_voice.right;
+      sample_r = std::clamp(sample_r, -32768, 32767);
+      m_output_buffer[buffer_offset++] = sample_r;
+      // "Mix left sample section"
+      s32 sample_l = static_cast<s16>(m_output_buffer[buffer_offset]);
+      sample_l += m_current_voice.left;
+      sample_l = std::clamp(sample_l, -32768, 32767);
+      m_output_buffer[buffer_offset++] = sample_l;
+      // "adds the counter with the voice frequency and test if it >=48000 to get the next sample"
+      m_current_voice.counter += m_current_voice.freq;
+      if (m_current_voice.counter >= SAMPLE_RATE)
+      {
+        if (m_current_voice.freq <= SAMPLE_RATE)
+        {
+          // get_sample: "fast method"
+          m_current_voice.counter -= SAMPLE_RATE;
+          m_current_voice.start_addr += sample_format_step;
+        }
+        else
+        {
+          // get_sample2: "slow method"
+          while (m_current_voice.counter >= SAMPLE_RATE)
+          {
+            m_current_voice.counter -= SAMPLE_RATE;
+            m_current_voice.start_addr += sample_format_step;
+            if ((m_current_voice.start_addr & INPUT_SAMPLE_BUFFER_BYTE_MASK) == 0)
+            {
+              DMAInSampleData();
+            }
+          }
+        }
+
+        if (m_current_voice.start_addr >= m_current_voice.end_addr)
+        {
+          // get_new_buffer (the comparison/jump is performed separately in get_sample and
+          // get_sample2, but the same comparison is done for both)
+          return_mail = DSP_SYNC;
+          ChangeBuffer();
+          if (m_current_voice.start_addr == 0)
+          {
+            // zero_samples
+            // This directly jumps to out_samp as well, bypassing the code below.
+            // Fortunately we can replicate this behavior with `continue`, and don't need to handle
+            // the multiplication logic in out_samp since the samples are zero.
+            m_current_voice.right = 0;
+            m_current_voice.left = 0;
+            continue;
+          }
+          DMAInSampleData();
+        }
+        // jump_load_smp_addr
+        const u16 input_sample_offset =
+            (m_current_voice.start_addr >> 1) & INPUT_SAMPLE_BUFFER_WORD_MASK;
+        if (input_sample_offset == 0)
+        {
+          // jump_load_smp_dma
+          DMAInSampleDataAssumeAligned();
+        }
+        // Both paths jmpr $AR3, which is an index into sample_selector
+
+        const auto [new_r, new_l] = (this->*sample_function)();
+        // out_samp: "multiply sample x volume" - left is put in $ax0.h, right is put in $ax1.h
+
+        // All functions jumped to from sample_selector jump or fall through here (zero_samples also
+        // jumps here, but is handled separately above for structural reasons)
+        const s32 temp_l = static_cast<s32>(new_l) * m_current_voice.volume_l;
+        m_current_voice.left = static_cast<s16>(temp_l >> 8);
+        const s32 temp_r = static_cast<s32>(new_r) * m_current_voice.volume_r;
+        m_current_voice.right = static_cast<s16>(temp_r >> 8);
+      }
+      // loop_end, remember that the labeled instruction is the last instruction in the loop
+      // so jumping to the NOP here causes the loop to repeat (and decrement its counter)
+    }
+    // end_process (not actually jumped to)
+    if (m_current_voice.start_addr == 0)
+    {
+      return_mail = DSP_SYNC;
+      ChangeBuffer();
+    }
+    // save_datas_end (we mutate m_current_voice.start_addr, left, and right in place,
+    // so nothing needs to happen here here)
+  }
+end_main:  // end_main
+  DMAOutVoiceData();
+
+  m_mail_handler.PushMail(return_mail, true);
+}
+
+void ASndUCode::ChangeBuffer()
+{
+  // change_buffer
+  m_current_voice.volume_l = m_current_voice.volume2_l;
+  m_current_voice.volume_r = m_current_voice.volume2_r;
+  m_current_voice.end_addr = m_current_voice.end_addr2;
+  m_current_voice.start_addr = m_current_voice.start_addr2;
+  m_current_voice.backup_addr = m_current_voice.start_addr2;
+
+  const u32 loop_mask =
+      (m_crc == HASH_2011 || m_crc == HASH_2020) ? NEW_FLAGS_VOICE_LOOP : OLD_FLAGS_VOICE_LOOP;
+
+  if ((m_current_voice.flags & loop_mask) == 0)
+  {
+    m_current_voice.start_addr2 = 0;
+    m_current_voice.end_addr2 = 0;
+  }
+}
+
+void ASndUCode::DMAInSampleData()
+{
+  // load_smp_addr_align
+  // This is its own function, and it contains its own copy of the DMA logic.
+  // The only difference is that this one forces the address to be aligned, while when
+  // jump_load_smp_dma is used, the address is expected to already be aligned.
+  const u32 addr = m_current_voice.start_addr & ~INPUT_SAMPLE_BUFFER_BYTE_MASK;
+  for (u16 i = 0; i < INPUT_SAMPLE_BUFFER_SIZE_WORDS; i++)
+  {
+    m_input_sample_buffer[i] = HLEMemory_Read_U16(addr + i * sizeof(u16));
+  }
+}
+
+void ASndUCode::DMAInSampleDataAssumeAligned()
+{
+  // jump_load_smp_dma
+  // This is technically not a function, but instead is directly jumped to and then jumps to $ar3
+  // (which is set to an address from sample_selector). We can just treat it as a function though.
+  const u32 addr = m_current_voice.start_addr;
+  for (u16 i = 0; i < INPUT_SAMPLE_BUFFER_SIZE_WORDS; i++)
+  {
+    m_input_sample_buffer[i] = HLEMemory_Read_U16(addr + i * sizeof(u16));
+  }
+}
+
+std::pair<s16, s16> ASndUCode::ReadSampleMono8Bits() const
+{
+  // mono_8bits
+  const u32 index = (m_current_voice.start_addr >> 1) & INPUT_SAMPLE_BUFFER_WORD_MASK;
+  s16 result = m_input_sample_buffer[index];
+  if ((m_current_voice.start_addr & 1) == 0)
+    result >>= 8;
+  result <<= 8;
+  return {result, result};
+}
+
+std::pair<s16, s16> ASndUCode::ReadSampleStereo8Bits() const
+{
+  // stereo_8bits
+  const u32 index = (m_current_voice.start_addr >> 1) & INPUT_SAMPLE_BUFFER_WORD_MASK;
+  const u16 sample = m_input_sample_buffer[index];
+  const s16 right = sample & 0xff00;
+  const s16 left = sample << 8;
+  return {right, left};
+}
+std::pair<s16, s16> ASndUCode::ReadSampleMono16Bits() const
+{
+  // mono_16bits
+  const u32 index = (m_current_voice.start_addr >> 1) & INPUT_SAMPLE_BUFFER_WORD_MASK;
+  const s16 result = m_input_sample_buffer[index];
+  return {result, result};
+}
+
+std::pair<s16, s16> ASndUCode::ReadSampleStereo16Bits() const
+{
+  // stereo_16bits
+  const u32 index = (m_current_voice.start_addr >> 1) & INPUT_SAMPLE_BUFFER_WORD_MASK;
+  const s16 right = m_input_sample_buffer[index];
+  // Note that 1 is added to index after the masking - meaning that theoretically an out-of-bounds
+  // index 0x10 can be read (but the buffer is oversized both here and in the actual uCode, so the
+  // data at the out-of-bounds index will instead be 0).
+  const s16 left = m_input_sample_buffer[index + 1];
+  return {right, left};
+}
+
+std::pair<s16, s16> ASndUCode::ReadSampleMono8BitsUnsigned() const
+{
+  // mono_8bits_unsigned
+  const u32 index = (m_current_voice.start_addr >> 1) & INPUT_SAMPLE_BUFFER_WORD_MASK;
+  s16 result = m_input_sample_buffer[index];
+  if ((m_current_voice.start_addr & 1) == 0)
+    result >>= 8;
+  result <<= 8;
+  result ^= 0x8000;  // Flip the sign bit - effectively adding 0x8000
+  return {result, result};
+}
+
+std::pair<s16, s16> ASndUCode::ReadSampleMono16BitsLittleEndian() const
+{
+  // mono_16bits_le
+  const u32 index = (m_current_voice.start_addr >> 1) & INPUT_SAMPLE_BUFFER_WORD_MASK;
+  // Actual implementation is u32 result_l = result | result << 16; result = result_l >> 8;
+  const s16 result = Common::swap16(m_input_sample_buffer[index]);
+  return {result, result};
+}
+
+std::pair<s16, s16> ASndUCode::ReadSampleStereo8BitsUnsigned() const
+{
+  // stereo_8bits_unsigned
+  const u32 index = (m_current_voice.start_addr >> 1) & INPUT_SAMPLE_BUFFER_WORD_MASK;
+  u16 sample = m_input_sample_buffer[index] ^ 0x8080;
+  const s16 right = sample & 0xff00;
+  const s16 left = sample << 8;
+  return {right, left};
+}
+
+std::pair<s16, s16> ASndUCode::ReadSampleStereo16BitsLittleEndian() const
+{
+  // stereo_16bits_le
+  const u32 index = (m_current_voice.start_addr >> 1) & INPUT_SAMPLE_BUFFER_WORD_MASK;
+  const s16 right = Common::swap16(m_input_sample_buffer[index]);
+  // Note that 1 is added to index after the masking - meaning that theoretically an out-of-bounds
+  // index 0x10 can be read (but the buffer is oversized both here and in the actual uCode, so the
+  // data at the out-of-bounds index will instead be 0).
+  const s16 left = Common::swap16(m_input_sample_buffer[index + 1]);
+  return {right, left};
+}
+
+void ASndUCode::DoState(PointerWrap& p)
+{
+  DoStateShared(p);
+  p.Do(m_next_command_is_voice_addr);
+  p.Do(m_voice_addr);
+  p.Do(m_current_voice);
+  p.Do(m_input_sample_buffer);
+  p.Do(m_output_buffer);
+}
+}  // namespace DSP::HLE
diff --git a/Source/Core/Core/HW/DSPHLE/UCodes/ASnd.h b/Source/Core/Core/HW/DSPHLE/UCodes/ASnd.h
new file mode 100644
index 0000000000..b30bf32ec8
--- /dev/null
+++ b/Source/Core/Core/HW/DSPHLE/UCodes/ASnd.h
@@ -0,0 +1,113 @@
+// Copyright 2022 Dolphin Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#pragma once
+
+#include <array>
+#include <utility>
+
+#include "Common/CommonTypes.h"
+#include "Core/HW/DSPHLE/UCodes/UCodes.h"
+
+namespace DSP::HLE
+{
+class DSPHLE;
+
+class ASndUCode final : public UCodeInterface
+{
+public:
+  ASndUCode(DSPHLE* dsphle, u32 crc);
+
+  void Initialize() override;
+  void HandleMail(u32 mail) override;
+  void Update() override;
+  void DoState(PointerWrap& p) override;
+
+private:
+  void DMAInVoiceData();
+  void DMAOutVoiceData();
+  void DMAInSampleData();
+  void DMAInSampleDataAssumeAligned();
+  void ChangeBuffer();
+  void DoMixing(u32 return_mail);
+
+  std::pair<s16, s16> ReadSampleMono8Bits() const;
+  std::pair<s16, s16> ReadSampleStereo8Bits() const;
+  std::pair<s16, s16> ReadSampleMono16Bits() const;
+  std::pair<s16, s16> ReadSampleStereo16Bits() const;
+  std::pair<s16, s16> ReadSampleMono8BitsUnsigned() const;
+  std::pair<s16, s16> ReadSampleMono16BitsLittleEndian() const;
+  std::pair<s16, s16> ReadSampleStereo8BitsUnsigned() const;
+  std::pair<s16, s16> ReadSampleStereo16BitsLittleEndian() const;
+
+  // Copied from libasnd/asndlib.c's t_sound_data
+  struct VoiceData
+  {
+    // output buffer 4096 bytes aligned to 32
+    u32 out_buf;
+    // samples per delay to start (48000 == 1sec)
+    u32 delay_samples;
+    // 2008/2009 versions: (step<<16) | (loop<<2) | (type & 3) used in DSP side
+    // 2011/2020 versions: (step<<16) | (statuses<<8) | (type & 7) used in DSP side
+    u32 flags;
+    // internal addr counter
+    u32 start_addr;
+    // end voice physical pointer (bytes without alignament, but remember it reads in blocks of 32
+    // bytes (use padding to the end))
+    u32 end_addr;
+    // freq operation
+    u32 freq;
+    // internally used to store de last sample played
+    s16 left, right;
+    // internally used to convert freq to 48000Hz samples
+    u32 counter;
+    // volume (from 0 to 256)
+    u16 volume_l, volume_r;
+    // initial voice2 physical pointer (bytes aligned 32 bytes) (to do a ring)
+    u32 start_addr2;
+    // end voice2 physical pointer (bytes without alignament, but remember it reads in blocks of 32
+    // bytes (use padding to the end))
+    u32 end_addr2;
+    // volume (from 0 to 256) for voice 2
+    u16 volume2_l, volume2_r;
+    // initial voice physical pointer backup (bytes aligned to 32 bytes): It is used for test
+    // pointers purpose
+    u32 backup_addr;
+    // voice tick counter - not used by DSP code
+    u32 tick_counter;
+    // callback - not used by DSP code
+    u32 cb;
+    u32 _pad;
+  };
+  static_assert(sizeof(VoiceData) == sizeof(u16) * 0x20);
+
+  bool m_next_command_is_voice_addr = false;
+  u32 m_voice_addr = 0;
+
+  VoiceData m_current_voice{};
+
+  // Number of bytes in the input sample buffer.
+  static constexpr u32 INPUT_SAMPLE_BUFFER_SIZE_BYTES = 0x20;
+  // Mask used for addresses for the sample buffer - note that the sample buffer is also assumed to
+  // be 0x20-aligned in main memory.
+  static constexpr u32 INPUT_SAMPLE_BUFFER_BYTE_MASK = INPUT_SAMPLE_BUFFER_SIZE_BYTES - 1;
+  // The DSP itself operates on 16-bit words instead of individual bytes, meaning the size is 0x10.
+  static constexpr u32 INPUT_SAMPLE_BUFFER_SIZE_WORDS = INPUT_SAMPLE_BUFFER_SIZE_BYTES / 2;
+  // ... and thus uses a different mask of 0x0f.
+  static constexpr u32 INPUT_SAMPLE_BUFFER_WORD_MASK = INPUT_SAMPLE_BUFFER_SIZE_WORDS - 1;
+  // Lastly, the uCode actually allocates 0x20 words, not 0x10 words, for the sample buffer,
+  // but only the first 0x10 words are used except for an edge-case where they mask with 0xf and
+  // then add 1 in ReadSampleStereo16Bits and ReadSampleStereo16BitsLittleEndian - this results in
+  // index 0x10 possibly being read, which is otherwise unused (but this probably doesn't happen in
+  // practice, as it would require the buffer address starting at 2 instead of 0). We use the same
+  // oversized buffer to accurately emulate this behavior without actually reading invalid memory.
+  static constexpr u32 INPUT_SAMPLE_BUFFER_SIZE_WORDS_ACTUAL = INPUT_SAMPLE_BUFFER_SIZE_WORDS * 2;
+
+  std::array<u16, INPUT_SAMPLE_BUFFER_SIZE_WORDS_ACTUAL> m_input_sample_buffer{};
+
+  // Number of 16-bit stereo samples in the output buffer
+  static constexpr u32 NUM_OUTPUT_SAMPLES = 1024;
+
+  std::array<u16, NUM_OUTPUT_SAMPLES * 2> m_output_buffer{};
+};
+}  // namespace DSP::HLE
diff --git a/Source/Core/Core/HW/DSPHLE/UCodes/UCodes.cpp b/Source/Core/Core/HW/DSPHLE/UCodes/UCodes.cpp
index 97a33a1f8b..892c67dad7 100644
--- a/Source/Core/Core/HW/DSPHLE/UCodes/UCodes.cpp
+++ b/Source/Core/Core/HW/DSPHLE/UCodes/UCodes.cpp
@@ -20,6 +20,7 @@
 #include "Core/ConfigManager.h"
 #include "Core/DSP/DSPCodeUtil.h"
 #include "Core/HW/DSPHLE/DSPHLE.h"
+#include "Core/HW/DSPHLE/UCodes/ASnd.h"
 #include "Core/HW/DSPHLE/UCodes/AX.h"
 #include "Core/HW/DSPHLE/UCodes/AXWii.h"
 #include "Core/HW/DSPHLE/UCodes/CARD.h"
@@ -282,6 +283,13 @@ std::unique_ptr<UCodeInterface> UCodeFactory(u32 crc, DSPHLE* dsphle, bool wii)
     INFO_LOG_FMT(DSPHLE, "CRC {:08x}: Wii - AXWii chosen", crc);
     return std::make_unique<AXWiiUCode>(dsphle, crc);
 
+  case 0x8d69a19b:
+  case 0xcc2fd441:
+  case 0xa81582e2:
+  case 0xdbbeeb61:
+    INFO_LOG_FMT(DSPHLE, "CRC {:08x}: ASnd chosen (Homebrew)", crc);
+    return std::make_unique<ASndUCode>(dsphle, crc);
+
   default:
     if (wii)
     {
diff --git a/Source/Core/DolphinLib.props b/Source/Core/DolphinLib.props
index ea6c9705dc..c1635eb6fb 100644
--- a/Source/Core/DolphinLib.props
+++ b/Source/Core/DolphinLib.props
@@ -236,6 +236,7 @@
     <ClInclude Include="Core\HW\DSP.h" />
     <ClInclude Include="Core\HW\DSPHLE\DSPHLE.h" />
     <ClInclude Include="Core\HW\DSPHLE\MailHandler.h" />
+    <ClInclude Include="Core\HW\DSPHLE\UCodes\ASnd.h" />
     <ClInclude Include="Core\HW\DSPHLE\UCodes\AX.h" />
     <ClInclude Include="Core\HW\DSPHLE\UCodes\AXStructs.h" />
     <ClInclude Include="Core\HW\DSPHLE\UCodes\AXVoice.h" />
@@ -836,6 +837,7 @@
     <ClCompile Include="Core\HW\DSP.cpp" />
     <ClCompile Include="Core\HW\DSPHLE\DSPHLE.cpp" />
     <ClCompile Include="Core\HW\DSPHLE\MailHandler.cpp" />
+    <ClCompile Include="Core\HW\DSPHLE\UCodes\ASnd.cpp" />
     <ClCompile Include="Core\HW\DSPHLE\UCodes\AX.cpp" />
     <ClCompile Include="Core\HW\DSPHLE\UCodes\AXWii.cpp" />
     <ClCompile Include="Core\HW\DSPHLE\UCodes\CARD.cpp" />