// Copyright 2023 Dolphin Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later #include "Common/Assembler/GekkoLexer.h" #include "Common/Assert.h" #include #include namespace Common::GekkoAssembler::detail { namespace { constexpr bool IsOctal(char c) { return c >= '0' && c <= '7'; } constexpr bool IsBinary(char c) { return c == '0' || c == '1'; } template constexpr T ConvertNib(char c) { if (c >= 'a' && c <= 'f') { return static_cast(c - 'a' + 10); } if (c >= 'A' && c <= 'F') { return static_cast(c - 'A' + 10); } return static_cast(c - '0'); } constexpr TokenType SingleCharToken(char ch) { switch (ch) { case ',': return TokenType::Comma; case '(': return TokenType::Lparen; case ')': return TokenType::Rparen; case '|': return TokenType::Pipe; case '^': return TokenType::Caret; case '&': return TokenType::Ampersand; case '+': return TokenType::Plus; case '-': return TokenType::Minus; case '*': return TokenType::Star; case '/': return TokenType::Slash; case '~': return TokenType::Tilde; case '@': return TokenType::At; case ':': return TokenType::Colon; case '`': return TokenType::Grave; case '.': return TokenType::Dot; case '\0': return TokenType::Eof; case '\n': return TokenType::Eol; default: return TokenType::Invalid; } } // Convert a string literal into its raw-data form template void ConvertStringLiteral(std::string_view literal, std::back_insert_iterator out_it) { for (size_t i = 1; i < literal.size() - 1;) { if (literal[i] == '\\') { ++i; if (IsOctal(literal[i])) { // Octal escape char octal_escape = 0; for (char c = literal[i]; IsOctal(c); c = literal[++i]) { octal_escape = (octal_escape << 3) + (c - '0'); } out_it = static_cast(octal_escape); } else if (literal[i] == 'x') { // Hex escape char hex_escape = 0; for (char c = literal[++i]; std::isxdigit(c); c = literal[++i]) { hex_escape = (hex_escape << 4) + ConvertNib(c); } out_it = static_cast(hex_escape); } else { char simple_escape; switch (literal[i]) { case '\'': simple_escape = '\x27'; break; case '"': simple_escape = '\x22'; break; case '?': simple_escape = '\x3f'; break; case '\\': simple_escape = '\x5c'; break; case 'a': simple_escape = '\x07'; break; case 'b': simple_escape = '\x08'; break; case 'f': simple_escape = '\x0c'; break; case 'n': simple_escape = '\x0a'; break; case 'r': simple_escape = '\x0d'; break; case 't': simple_escape = '\x09'; break; case 'v': simple_escape = '\x0b'; break; default: simple_escape = literal[i]; break; } out_it = static_cast(simple_escape); ++i; } } else { out_it = static_cast(literal[i]); ++i; } } } template std::optional EvalIntegral(TokenType tp, std::string_view val) { constexpr auto hex_step = [](T acc, char c) { return acc << 4 | ConvertNib(c); }; constexpr auto dec_step = [](T acc, char c) { return acc * 10 + (c - '0'); }; constexpr auto oct_step = [](T acc, char c) { return acc << 3 | (c - '0'); }; constexpr auto bin_step = [](T acc, char c) { return acc << 1 | (c - '0'); }; switch (tp) { case TokenType::HexadecimalLit: return std::accumulate(val.begin() + 2, val.end(), T{0}, hex_step); case TokenType::DecimalLit: return std::accumulate(val.begin(), val.end(), T{0}, dec_step); case TokenType::OctalLit: return std::accumulate(val.begin() + 1, val.end(), T{0}, oct_step); case TokenType::BinaryLit: return std::accumulate(val.begin() + 2, val.end(), T{0}, bin_step); case TokenType::GPR: case TokenType::FPR: return std::accumulate(val.begin() + 1, val.end(), T{0}, dec_step); case TokenType::CRField: return std::accumulate(val.begin() + 2, val.end(), T{0}, dec_step); case TokenType::SPR: return static_cast(*sprg_map.Find(val)); case TokenType::Lt: return T{0}; case TokenType::Gt: return T{1}; case TokenType::Eq: return T{2}; case TokenType::So: return T{3}; default: return std::nullopt; } } } // namespace void ConvertStringLiteral(std::string_view literal, std::vector* out_vec) { ConvertStringLiteral(literal, std::back_inserter(*out_vec)); } std::string_view TokenTypeToStr(TokenType tp) { switch (tp) { case TokenType::GPR: return "GPR"; case TokenType::FPR: return "FPR"; case TokenType::SPR: return "SPR"; case TokenType::CRField: return "CR Field"; case TokenType::Lt: case TokenType::Gt: case TokenType::Eq: case TokenType::So: return "CR Bit"; case TokenType::Identifier: return "Identifier"; case TokenType::StringLit: return "String Literal"; case TokenType::DecimalLit: return "Decimal Literal"; case TokenType::BinaryLit: return "Binary Literal"; case TokenType::HexadecimalLit: return "Hexadecimal Literal"; case TokenType::OctalLit: return "Octal Literal"; case TokenType::FloatLit: return "Float Literal"; case TokenType::Invalid: return "Invalid"; case TokenType::Lsh: return "<<"; case TokenType::Rsh: return ">>"; case TokenType::Comma: return ","; case TokenType::Lparen: return "("; case TokenType::Rparen: return ")"; case TokenType::Pipe: return "|"; case TokenType::Caret: return "^"; case TokenType::Ampersand: return "&"; case TokenType::Plus: return "+"; case TokenType::Minus: return "-"; case TokenType::Star: return "*"; case TokenType::Slash: return "/"; case TokenType::Tilde: return "~"; case TokenType::At: return "@"; case TokenType::Colon: return ":"; case TokenType::Grave: return "`"; case TokenType::Dot: return "."; case TokenType::Eof: return "End of File"; case TokenType::Eol: return "End of Line"; default: return ""; } } std::string_view AssemblerToken::TypeStr() const { return TokenTypeToStr(token_type); } std::string_view AssemblerToken::ValStr() const { switch (token_type) { case TokenType::Eol: return ""; case TokenType::Eof: return ""; default: return token_val; } } template <> std::optional AssemblerToken::EvalToken() const { if (token_type == TokenType::FloatLit) { return std::stof(std::string(token_val)); } return std::nullopt; } template <> std::optional AssemblerToken::EvalToken() const { if (token_type == TokenType::FloatLit) { return std::stod(std::string(token_val)); } return std::nullopt; } template <> std::optional AssemblerToken::EvalToken() const { return EvalIntegral(token_type, token_val); } template <> std::optional AssemblerToken::EvalToken() const { return EvalIntegral(token_type, token_val); } template <> std::optional AssemblerToken::EvalToken() const { return EvalIntegral(token_type, token_val); } template <> std::optional AssemblerToken::EvalToken() const { return EvalIntegral(token_type, token_val); } size_t Lexer::LineNumber() const { return m_lexed_tokens.empty() ? m_pos.line : TagOf(m_lexed_tokens.front()).line; } size_t Lexer::ColNumber() const { return m_lexed_tokens.empty() ? m_pos.col : TagOf(m_lexed_tokens.front()).col; } std::string_view Lexer::CurrentLine() const { const size_t line_index = m_lexed_tokens.empty() ? m_pos.index : TagOf(m_lexed_tokens.front()).index; size_t begin_index = line_index == 0 ? 0 : line_index - 1; for (; begin_index > 0; begin_index--) { if (m_lex_string[begin_index] == '\n') { begin_index++; break; } } size_t end_index = begin_index; for (; end_index < m_lex_string.size(); end_index++) { if (m_lex_string[end_index] == '\n') { end_index++; break; } } return m_lex_string.substr(begin_index, end_index - begin_index); } void Lexer::SetIdentifierMatchRule(IdentifierMatchRule set) { FeedbackTokens(); m_match_rule = set; } const Tagged& Lexer::LookaheadTagRef(size_t num_fwd) const { while (m_lexed_tokens.size() < num_fwd) { LookaheadRef(); } return m_lexed_tokens[num_fwd]; } AssemblerToken Lexer::Lookahead() const { if (m_lexed_tokens.empty()) { CursorPosition pos_pre = m_pos; m_lexed_tokens.emplace_back(pos_pre, LexSingle()); } return ValueOf(m_lexed_tokens.front()); } const AssemblerToken& Lexer::LookaheadRef() const { if (m_lexed_tokens.empty()) { CursorPosition pos_pre = m_pos; m_lexed_tokens.emplace_back(pos_pre, LexSingle()); } return ValueOf(m_lexed_tokens.front()); } TokenType Lexer::LookaheadType() const { return LookaheadRef().token_type; } AssemblerToken Lexer::LookaheadFloat() const { FeedbackTokens(); SkipWs(); CursorPosition pos_pre = m_pos; ScanStart(); std::optional failure_reason = RunDfa(float_dfa); // Special case: lex at least a single char for no matches for errors to make sense if (m_scan_pos.index == pos_pre.index) { Step(); } std::string_view tok_str = ScanFinishOut(); AssemblerToken tok; if (!failure_reason) { tok = AssemblerToken{ TokenType::FloatLit, tok_str, "", Interval{0, 0}, }; } else { tok = AssemblerToken{ TokenType::Invalid, tok_str, *failure_reason, Interval{0, tok_str.length()}, }; } m_lexed_tokens.emplace_back(pos_pre, tok); return tok; } void Lexer::Eat() { if (m_lexed_tokens.empty()) { LexSingle(); } else { m_lexed_tokens.pop_front(); } } void Lexer::EatAndReset() { Eat(); SetIdentifierMatchRule(IdentifierMatchRule::Typical); } std::optional Lexer::RunDfa(const std::vector& dfa) const { size_t dfa_index = 0; bool transition_found; do { transition_found = false; if (Peek() == '\0') { break; } const DfaNode& n = dfa[dfa_index]; for (auto&& edge : n.edges) { if (edge.first(Peek())) { transition_found = true; dfa_index = edge.second; break; } } if (transition_found) { Step(); } } while (transition_found); return dfa[dfa_index].match_failure_reason; } void Lexer::SkipWs() const { ScanStart(); for (char c = Peek(); std::isspace(c) && c != '\n'; c = Step().Peek()) { } if (Peek() == '#') { while (Peek() != '\n' && Peek() != '\0') { Step(); } } ScanFinish(); } void Lexer::FeedbackTokens() const { if (m_lexed_tokens.empty()) { return; } m_pos = m_scan_pos = TagOf(m_lexed_tokens.front()); m_lexed_tokens.clear(); } bool Lexer::IdentifierHeadExtra(char h) const { switch (m_match_rule) { case IdentifierMatchRule::Typical: case IdentifierMatchRule::Mnemonic: return false; case IdentifierMatchRule::Directive: return std::isdigit(h); } return false; } bool Lexer::IdentifierExtra(char c) const { switch (m_match_rule) { case IdentifierMatchRule::Typical: case IdentifierMatchRule::Directive: return false; case IdentifierMatchRule::Mnemonic: return c == '+' || c == '-' || c == '.'; } return false; } void Lexer::ScanStart() const { m_scan_pos = m_pos; } void Lexer::ScanFinish() const { m_pos = m_scan_pos; } std::string_view Lexer::ScanFinishOut() const { const size_t start = m_pos.index; m_pos = m_scan_pos; return m_lex_string.substr(start, m_scan_pos.index - start); } char Lexer::Peek() const { if (m_scan_pos.index >= m_lex_string.length()) { return 0; } return m_lex_string[m_scan_pos.index]; } const Lexer& Lexer::Step() const { if (m_scan_pos.index >= m_lex_string.length()) { return *this; } if (Peek() == '\n') { m_scan_pos.line++; m_scan_pos.col = 0; } else { m_scan_pos.col++; } m_scan_pos.index++; return *this; } TokenType Lexer::LexStringLit(std::string_view& invalid_reason, Interval& invalid_region) const { // The open quote has alread been matched const size_t string_start = m_scan_pos.index - 1; TokenType token_type = TokenType::StringLit; std::optional failure_reason = RunDfa(string_dfa); if (failure_reason) { token_type = TokenType::Invalid; invalid_reason = *failure_reason; invalid_region = Interval{0, m_scan_pos.index - string_start}; } return token_type; } TokenType Lexer::ClassifyAlnum() const { const std::string_view alnum = m_lex_string.substr(m_pos.index, m_scan_pos.index - m_pos.index); constexpr auto valid_regnum = [](std::string_view rn) { if (rn.length() == 1 && std::isdigit(rn[0])) { return true; } else if (rn.length() == 2 && std::isdigit(rn[0]) && std::isdigit(rn[1])) { if (rn[0] == '1' || rn[0] == '2') { return true; } if (rn[0] == '3') { return rn[1] <= '2'; } } return false; }; constexpr auto eq_nocase = [](std::string_view str, std::string_view lwr) { auto it_l = str.cbegin(), it_r = lwr.cbegin(); for (; it_l != str.cend() && it_r != lwr.cend(); it_l++, it_r++) { if (std::tolower(*it_l) != *it_r) { return false; } } return it_l == str.end() && it_r == lwr.end(); }; if (std::tolower(alnum[0]) == 'r' && valid_regnum(alnum.substr(1))) { return TokenType::GPR; } else if (std::tolower(alnum[0]) == 'f' && valid_regnum(alnum.substr(1))) { return TokenType::FPR; } else if (alnum.length() == 3 && eq_nocase(alnum.substr(0, 2), "cr") && alnum[2] >= '0' && alnum[2] <= '7') { return TokenType::CRField; } else if (eq_nocase(alnum, "lt")) { return TokenType::Lt; } else if (eq_nocase(alnum, "gt")) { return TokenType::Gt; } else if (eq_nocase(alnum, "eq")) { return TokenType::Eq; } else if (eq_nocase(alnum, "so")) { return TokenType::So; } else if (sprg_map.Find(alnum) != nullptr) { return TokenType::SPR; } return TokenType::Identifier; } AssemblerToken Lexer::LexSingle() const { SkipWs(); ScanStart(); const char h = Peek(); TokenType token_type; std::string_view invalid_reason = ""; Interval invalid_region = Interval{0, 0}; Step(); if (std::isalpha(h) || h == '_' || IdentifierHeadExtra(h)) { for (char c = Peek(); std::isalnum(c) || c == '_' || IdentifierExtra(c); c = Step().Peek()) { } token_type = ClassifyAlnum(); } else if (h == '"') { token_type = LexStringLit(invalid_reason, invalid_region); } else if (h == '0') { const char imm_type = Peek(); if (imm_type == 'x') { token_type = TokenType::HexadecimalLit; Step(); for (char c = Peek(); std::isxdigit(c); c = Step().Peek()) { } } else if (imm_type == 'b') { token_type = TokenType::BinaryLit; Step(); for (char c = Peek(); IsBinary(c); c = Step().Peek()) { } } else if (IsOctal(imm_type)) { token_type = TokenType::OctalLit; for (char c = Peek(); IsOctal(c); c = Step().Peek()) { } } else { token_type = TokenType::DecimalLit; } } else if (std::isdigit(h)) { for (char c = Peek(); std::isdigit(c); c = Step().Peek()) { } token_type = TokenType::DecimalLit; } else if (h == '<' || h == '>') { // Special case for two-character operators const char second_ch = Peek(); if (second_ch == h) { Step(); token_type = second_ch == '<' ? TokenType::Lsh : TokenType::Rsh; } else { token_type = TokenType::Invalid; invalid_reason = "Unrecognized character"; invalid_region = Interval{0, 1}; } } else { token_type = SingleCharToken(h); if (token_type == TokenType::Invalid) { invalid_reason = "Unrecognized character"; invalid_region = Interval{0, 1}; } } AssemblerToken new_tok = {token_type, ScanFinishOut(), invalid_reason, invalid_region}; SkipWs(); return new_tok; } } // namespace Common::GekkoAssembler::detail