diff --git a/README.md b/README.md index 6cdc19d9e..61377f5d1 100644 --- a/README.md +++ b/README.md @@ -4,8 +4,11 @@ Xenia - Xbox 360 Emulator Research Project Xenia is an experimental emulator for the Xbox 360. For more information see the [main xenia website](https://xenia.jp/). +**Interested in supporting the core contributors? +[Xenia Project on Patreon](https://www.patreon.com/xenia_project).** + Come chat with us about **emulator-related topics** on [Discord](https://discord.gg/Q9mxZf9). -For developer chat join `#dev` but stay on topic. Lurking is fine. +For developer chat join `#dev` but stay on topic. Lurking is not only fine, but encouraged! Please check the [frequently asked questions](https://xenia.jp/faq/) page before asking questions. We've got jobs/lives/etc, so don't expect instant answers. diff --git a/assets/icon/LICENSE b/assets/icon/LICENSE new file mode 100644 index 000000000..fe8dbc50f --- /dev/null +++ b/assets/icon/LICENSE @@ -0,0 +1,428 @@ +Attribution-ShareAlike 4.0 International + +======================================================================= + +Creative Commons Corporation ("Creative Commons") is not a law firm and +does not provide legal services or legal advice. Distribution of +Creative Commons public licenses does not create a lawyer-client or +other relationship. Creative Commons makes its licenses and related +information available on an "as-is" basis. Creative Commons gives no +warranties regarding its licenses, any material licensed under their +terms and conditions, or any related information. Creative Commons +disclaims all liability for damages resulting from their use to the +fullest extent possible. + +Using Creative Commons Public Licenses + +Creative Commons public licenses provide a standard set of terms and +conditions that creators and other rights holders may use to share +original works of authorship and other material subject to copyright +and certain other rights specified in the public license below. 
The +following considerations are for informational purposes only, are not +exhaustive, and do not form part of our licenses. + + Considerations for licensors: Our public licenses are + intended for use by those authorized to give the public + permission to use material in ways otherwise restricted by + copyright and certain other rights. Our licenses are + irrevocable. Licensors should read and understand the terms + and conditions of the license they choose before applying it. + Licensors should also secure all rights necessary before + applying our licenses so that the public can reuse the + material as expected. Licensors should clearly mark any + material not subject to the license. This includes other CC- + licensed material, or material used under an exception or + limitation to copyright. More considerations for licensors: + wiki.creativecommons.org/Considerations_for_licensors + + Considerations for the public: By using one of our public + licenses, a licensor grants the public permission to use the + licensed material under specified terms and conditions. If + the licensor's permission is not necessary for any reason--for + example, because of any applicable exception or limitation to + copyright--then that use is not regulated by the license. Our + licenses grant only permissions under copyright and certain + other rights that a licensor has authority to grant. Use of + the licensed material may still be restricted for other + reasons, including because others have copyright or other + rights in the material. A licensor may make special requests, + such as asking that all changes be marked or described. + Although not required by our licenses, you are encouraged to + respect those requests where reasonable. 
More considerations + for the public: + wiki.creativecommons.org/Considerations_for_licensees + +======================================================================= + +Creative Commons Attribution-ShareAlike 4.0 International Public +License + +By exercising the Licensed Rights (defined below), You accept and agree +to be bound by the terms and conditions of this Creative Commons +Attribution-ShareAlike 4.0 International Public License ("Public +License"). To the extent this Public License may be interpreted as a +contract, You are granted the Licensed Rights in consideration of Your +acceptance of these terms and conditions, and the Licensor grants You +such rights in consideration of benefits the Licensor receives from +making the Licensed Material available under these terms and +conditions. + + +Section 1 -- Definitions. + + a. Adapted Material means material subject to Copyright and Similar + Rights that is derived from or based upon the Licensed Material + and in which the Licensed Material is translated, altered, + arranged, transformed, or otherwise modified in a manner requiring + permission under the Copyright and Similar Rights held by the + Licensor. For purposes of this Public License, where the Licensed + Material is a musical work, performance, or sound recording, + Adapted Material is always produced where the Licensed Material is + synched in timed relation with a moving image. + + b. Adapter's License means the license You apply to Your Copyright + and Similar Rights in Your contributions to Adapted Material in + accordance with the terms and conditions of this Public License. + + c. BY-SA Compatible License means a license listed at + creativecommons.org/compatiblelicenses, approved by Creative + Commons as essentially the equivalent of this Public License. + + d. 
Copyright and Similar Rights means copyright and/or similar rights + closely related to copyright including, without limitation, + performance, broadcast, sound recording, and Sui Generis Database + Rights, without regard to how the rights are labeled or + categorized. For purposes of this Public License, the rights + specified in Section 2(b)(1)-(2) are not Copyright and Similar + Rights. + + e. Effective Technological Measures means those measures that, in the + absence of proper authority, may not be circumvented under laws + fulfilling obligations under Article 11 of the WIPO Copyright + Treaty adopted on December 20, 1996, and/or similar international + agreements. + + f. Exceptions and Limitations means fair use, fair dealing, and/or + any other exception or limitation to Copyright and Similar Rights + that applies to Your use of the Licensed Material. + + g. License Elements means the license attributes listed in the name + of a Creative Commons Public License. The License Elements of this + Public License are Attribution and ShareAlike. + + h. Licensed Material means the artistic or literary work, database, + or other material to which the Licensor applied this Public + License. + + i. Licensed Rights means the rights granted to You subject to the + terms and conditions of this Public License, which are limited to + all Copyright and Similar Rights that apply to Your use of the + Licensed Material and that the Licensor has authority to license. + + j. Licensor means the individual(s) or entity(ies) granting rights + under this Public License. + + k. 
Share means to provide material to the public by any means or + process that requires permission under the Licensed Rights, such + as reproduction, public display, public performance, distribution, + dissemination, communication, or importation, and to make material + available to the public including in ways that members of the + public may access the material from a place and at a time + individually chosen by them. + + l. Sui Generis Database Rights means rights other than copyright + resulting from Directive 96/9/EC of the European Parliament and of + the Council of 11 March 1996 on the legal protection of databases, + as amended and/or succeeded, as well as other essentially + equivalent rights anywhere in the world. + + m. You means the individual or entity exercising the Licensed Rights + under this Public License. Your has a corresponding meaning. + + +Section 2 -- Scope. + + a. License grant. + + 1. Subject to the terms and conditions of this Public License, + the Licensor hereby grants You a worldwide, royalty-free, + non-sublicensable, non-exclusive, irrevocable license to + exercise the Licensed Rights in the Licensed Material to: + + a. reproduce and Share the Licensed Material, in whole or + in part; and + + b. produce, reproduce, and Share Adapted Material. + + 2. Exceptions and Limitations. For the avoidance of doubt, where + Exceptions and Limitations apply to Your use, this Public + License does not apply, and You do not need to comply with + its terms and conditions. + + 3. Term. The term of this Public License is specified in Section + 6(a). + + 4. Media and formats; technical modifications allowed. The + Licensor authorizes You to exercise the Licensed Rights in + all media and formats whether now known or hereafter created, + and to make technical modifications necessary to do so. 
The + Licensor waives and/or agrees not to assert any right or + authority to forbid You from making technical modifications + necessary to exercise the Licensed Rights, including + technical modifications necessary to circumvent Effective + Technological Measures. For purposes of this Public License, + simply making modifications authorized by this Section 2(a) + (4) never produces Adapted Material. + + 5. Downstream recipients. + + a. Offer from the Licensor -- Licensed Material. Every + recipient of the Licensed Material automatically + receives an offer from the Licensor to exercise the + Licensed Rights under the terms and conditions of this + Public License. + + b. Additional offer from the Licensor -- Adapted Material. + Every recipient of Adapted Material from You + automatically receives an offer from the Licensor to + exercise the Licensed Rights in the Adapted Material + under the conditions of the Adapter's License You apply. + + c. No downstream restrictions. You may not offer or impose + any additional or different terms or conditions on, or + apply any Effective Technological Measures to, the + Licensed Material if doing so restricts exercise of the + Licensed Rights by any recipient of the Licensed + Material. + + 6. No endorsement. Nothing in this Public License constitutes or + may be construed as permission to assert or imply that You + are, or that Your use of the Licensed Material is, connected + with, or sponsored, endorsed, or granted official status by, + the Licensor or others designated to receive attribution as + provided in Section 3(a)(1)(A)(i). + + b. Other rights. + + 1. 
Moral rights, such as the right of integrity, are not + licensed under this Public License, nor are publicity, + privacy, and/or other similar personality rights; however, to + the extent possible, the Licensor waives and/or agrees not to + assert any such rights held by the Licensor to the limited + extent necessary to allow You to exercise the Licensed + Rights, but not otherwise. + + 2. Patent and trademark rights are not licensed under this + Public License. + + 3. To the extent possible, the Licensor waives any right to + collect royalties from You for the exercise of the Licensed + Rights, whether directly or through a collecting society + under any voluntary or waivable statutory or compulsory + licensing scheme. In all other cases the Licensor expressly + reserves any right to collect such royalties. + + +Section 3 -- License Conditions. + +Your exercise of the Licensed Rights is expressly made subject to the +following conditions. + + a. Attribution. + + 1. If You Share the Licensed Material (including in modified + form), You must: + + a. retain the following if it is supplied by the Licensor + with the Licensed Material: + + i. identification of the creator(s) of the Licensed + Material and any others designated to receive + attribution, in any reasonable manner requested by + the Licensor (including by pseudonym if + designated); + + ii. a copyright notice; + + iii. a notice that refers to this Public License; + + iv. a notice that refers to the disclaimer of + warranties; + + v. a URI or hyperlink to the Licensed Material to the + extent reasonably practicable; + + b. indicate if You modified the Licensed Material and + retain an indication of any previous modifications; and + + c. indicate the Licensed Material is licensed under this + Public License, and include the text of, or the URI or + hyperlink to, this Public License. + + 2. 
You may satisfy the conditions in Section 3(a)(1) in any + reasonable manner based on the medium, means, and context in + which You Share the Licensed Material. For example, it may be + reasonable to satisfy the conditions by providing a URI or + hyperlink to a resource that includes the required + information. + + 3. If requested by the Licensor, You must remove any of the + information required by Section 3(a)(1)(A) to the extent + reasonably practicable. + + b. ShareAlike. + + In addition to the conditions in Section 3(a), if You Share + Adapted Material You produce, the following conditions also apply. + + 1. The Adapter's License You apply must be a Creative Commons + license with the same License Elements, this version or + later, or a BY-SA Compatible License. + + 2. You must include the text of, or the URI or hyperlink to, the + Adapter's License You apply. You may satisfy this condition + in any reasonable manner based on the medium, means, and + context in which You Share Adapted Material. + + 3. You may not offer or impose any additional or different terms + or conditions on, or apply any Effective Technological + Measures to, Adapted Material that restrict exercise of the + rights granted under the Adapter's License You apply. + + +Section 4 -- Sui Generis Database Rights. + +Where the Licensed Rights include Sui Generis Database Rights that +apply to Your use of the Licensed Material: + + a. for the avoidance of doubt, Section 2(a)(1) grants You the right + to extract, reuse, reproduce, and Share all or a substantial + portion of the contents of the database; + + b. if You include all or a substantial portion of the database + contents in a database in which You have Sui Generis Database + Rights, then the database in which You have Sui Generis Database + Rights (but not its individual contents) is Adapted Material, + + including for purposes of Section 3(b); and + c. 
You must comply with the conditions in Section 3(a) if You Share + all or a substantial portion of the contents of the database. + +For the avoidance of doubt, this Section 4 supplements and does not +replace Your obligations under this Public License where the Licensed +Rights include other Copyright and Similar Rights. + + +Section 5 -- Disclaimer of Warranties and Limitation of Liability. + + a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE + EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS + AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF + ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, + IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, + WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR + PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, + ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT + KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT + ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. + + b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE + TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, + NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, + INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, + COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR + USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN + ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR + DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR + IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. + + c. The disclaimer of warranties and limitation of liability provided + above shall be interpreted in a manner that, to the extent + possible, most closely approximates an absolute disclaimer and + waiver of all liability. + + +Section 6 -- Term and Termination. + + a. This Public License applies for the term of the Copyright and + Similar Rights licensed here. 
However, if You fail to comply with + this Public License, then Your rights under this Public License + terminate automatically. + + b. Where Your right to use the Licensed Material has terminated under + Section 6(a), it reinstates: + + 1. automatically as of the date the violation is cured, provided + it is cured within 30 days of Your discovery of the + violation; or + + 2. upon express reinstatement by the Licensor. + + For the avoidance of doubt, this Section 6(b) does not affect any + right the Licensor may have to seek remedies for Your violations + of this Public License. + + c. For the avoidance of doubt, the Licensor may also offer the + Licensed Material under separate terms or conditions or stop + distributing the Licensed Material at any time; however, doing so + will not terminate this Public License. + + d. Sections 1, 5, 6, 7, and 8 survive termination of this Public + License. + + +Section 7 -- Other Terms and Conditions. + + a. The Licensor shall not be bound by any additional or different + terms or conditions communicated by You unless expressly agreed. + + b. Any arrangements, understandings, or agreements regarding the + Licensed Material not stated herein are separate from and + independent of the terms and conditions of this Public License. + + +Section 8 -- Interpretation. + + a. For the avoidance of doubt, this Public License does not, and + shall not be interpreted to, reduce, limit, restrict, or impose + conditions on any use of the Licensed Material that could lawfully + be made without permission under this Public License. + + b. To the extent possible, if any provision of this Public License is + deemed unenforceable, it shall be automatically reformed to the + minimum extent necessary to make it enforceable. If the provision + cannot be reformed, it shall be severed from this Public License + without affecting the enforceability of the remaining terms and + conditions. + + c. 
No term or condition of this Public License will be waived and no + failure to comply consented to unless expressly agreed to by the + Licensor. + + d. Nothing in this Public License constitutes or may be interpreted + as a limitation upon, or waiver of, any privileges and immunities + that apply to the Licensor or You, including from the legal + processes of any jurisdiction or authority. + + +======================================================================= + +Creative Commons is not a party to its public +licenses. Notwithstanding, Creative Commons may elect to apply one of +its public licenses to material it publishes and in those instances +will be considered the “Licensor.” The text of the Creative Commons +public licenses is dedicated to the public domain under the CC0 Public +Domain Dedication. Except for the limited purpose of indicating that +material is shared under a Creative Commons public license or as +otherwise permitted by the Creative Commons policies published at +creativecommons.org/policies, Creative Commons does not authorize the +use of the trademark "Creative Commons" or any other trademark or logo +of Creative Commons without its prior written consent including, +without limitation, in connection with any unauthorized modifications +to any of its public licenses or any other arrangements, +understandings, or agreements concerning use of licensed material. For +the avoidance of doubt, this paragraph does not form part of the +public licenses. + +Creative Commons may be contacted at creativecommons.org. 
+ diff --git a/assets/icon/icon.ico b/assets/icon/icon.ico index 3f0fd9aeb..26cd5ad33 100644 Binary files a/assets/icon/icon.ico and b/assets/icon/icon.ico differ diff --git a/premake5.lua b/premake5.lua index 6a9095cb4..185e0e206 100644 --- a/premake5.lua +++ b/premake5.lua @@ -233,6 +233,7 @@ solution("xenia") include("third_party/glslang-spirv.lua") include("third_party/imgui.lua") include("third_party/libav.lua") + include("third_party/mspack.lua") include("third_party/snappy.lua") include("third_party/spirv-tools.lua") include("third_party/volk.lua") diff --git a/src/xenia/app/premake5.lua b/src/xenia/app/premake5.lua index 4e2dc399f..8b734294f 100644 --- a/src/xenia/app/premake5.lua +++ b/src/xenia/app/premake5.lua @@ -16,6 +16,7 @@ project("xenia-app") "imgui", "libavcodec", "libavutil", + "mspack", "snappy", "spirv-tools", "volk", diff --git a/src/xenia/cpu/backend/x64/x64_backend.cc b/src/xenia/cpu/backend/x64/x64_backend.cc index e45141eac..4e6356959 100644 --- a/src/xenia/cpu/backend/x64/x64_backend.cc +++ b/src/xenia/cpu/backend/x64/x64_backend.cc @@ -42,6 +42,15 @@ class X64ThunkEmitter : public X64Emitter { HostToGuestThunk EmitHostToGuestThunk(); GuestToHostThunk EmitGuestToHostThunk(); ResolveFunctionThunk EmitResolveFunctionThunk(); + + private: + // The following four functions provide save/load functionality for registers. + // They assume at least StackLayout::THUNK_STACK_SIZE bytes have been + // allocated on the stack. + void EmitSaveVolatileRegs(); + void EmitLoadVolatileRegs(); + void EmitSaveNonvolatileRegs(); + void EmitLoadNonvolatileRegs(); }; X64Backend::X64Backend() : Backend(), code_cache_(nullptr) { @@ -73,8 +82,6 @@ bool X64Backend::Initialize(Processor* processor) { return false; } - RegisterSequences(); - // Need movbe to do advanced LOAD/STORE tricks. 
if (FLAGS_enable_haswell_instructions) { machine_info_.supports_extended_load_store = @@ -406,6 +413,117 @@ HostToGuestThunk X64ThunkEmitter::EmitHostToGuestThunk() { mov(qword[rsp + 8 * 1], rcx); sub(rsp, stack_size); + // Save nonvolatile registers. + EmitSaveNonvolatileRegs(); + + mov(rax, rcx); + mov(rsi, rdx); // context + mov(rcx, r8); // return address + call(rax); + + EmitLoadNonvolatileRegs(); + + add(rsp, stack_size); + mov(rcx, qword[rsp + 8 * 1]); + mov(rdx, qword[rsp + 8 * 2]); + mov(r8, qword[rsp + 8 * 3]); + ret(); + + void* fn = Emplace(stack_size); + return (HostToGuestThunk)fn; +} + +GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() { + // rcx = target function + // rdx = arg0 + // r8 = arg1 + // r9 = arg2 + + const size_t stack_size = StackLayout::THUNK_STACK_SIZE; + // rsp + 0 = return address + sub(rsp, stack_size); + + // Save off volatile registers. + EmitSaveVolatileRegs(); + + mov(rax, rcx); // function + mov(rcx, GetContextReg()); // context + call(rax); + + EmitLoadVolatileRegs(); + + add(rsp, stack_size); + ret(); + + void* fn = Emplace(stack_size); + return (GuestToHostThunk)fn; +} + +// X64Emitter handles actually resolving functions. +extern "C" uint64_t ResolveFunction(void* raw_context, uint32_t target_address); + +ResolveFunctionThunk X64ThunkEmitter::EmitResolveFunctionThunk() { + // ebx = target PPC address + // rcx = context + const size_t stack_size = StackLayout::THUNK_STACK_SIZE; + + // rsp + 0 = return address + sub(rsp, stack_size); + + // Save volatile registers + EmitSaveVolatileRegs(); + + mov(rcx, rsi); // context + mov(rdx, rbx); + mov(rax, uint64_t(&ResolveFunction)); + call(rax); + + EmitLoadVolatileRegs(); + + add(rsp, stack_size); + jmp(rax); + + void* fn = Emplace(stack_size); + return (ResolveFunctionThunk)fn; +} + +void X64ThunkEmitter::EmitSaveVolatileRegs() { + // Save off volatile registers. 
+ // mov(qword[rsp + offsetof(StackLayout::Thunk, r[0])], rax); + mov(qword[rsp + offsetof(StackLayout::Thunk, r[1])], rcx); + mov(qword[rsp + offsetof(StackLayout::Thunk, r[2])], rdx); + mov(qword[rsp + offsetof(StackLayout::Thunk, r[3])], r8); + mov(qword[rsp + offsetof(StackLayout::Thunk, r[4])], r9); + mov(qword[rsp + offsetof(StackLayout::Thunk, r[5])], r10); + mov(qword[rsp + offsetof(StackLayout::Thunk, r[6])], r11); + + // movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[0])], xmm0); + movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[1])], xmm1); + movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[2])], xmm2); + movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[3])], xmm3); + movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[4])], xmm4); + movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[5])], xmm5); +} + +void X64ThunkEmitter::EmitLoadVolatileRegs() { + // Load volatile registers from our stack frame. + // movaps(xmm0, qword[rsp + offsetof(StackLayout::Thunk, xmm[0])]); + movaps(xmm1, qword[rsp + offsetof(StackLayout::Thunk, xmm[1])]); + movaps(xmm2, qword[rsp + offsetof(StackLayout::Thunk, xmm[2])]); + movaps(xmm3, qword[rsp + offsetof(StackLayout::Thunk, xmm[3])]); + movaps(xmm4, qword[rsp + offsetof(StackLayout::Thunk, xmm[4])]); + movaps(xmm5, qword[rsp + offsetof(StackLayout::Thunk, xmm[5])]); + + // mov(rax, qword[rsp + offsetof(StackLayout::Thunk, r[0])]); + mov(rcx, qword[rsp + offsetof(StackLayout::Thunk, r[1])]); + mov(rdx, qword[rsp + offsetof(StackLayout::Thunk, r[2])]); + mov(r8, qword[rsp + offsetof(StackLayout::Thunk, r[3])]); + mov(r9, qword[rsp + offsetof(StackLayout::Thunk, r[4])]); + mov(r10, qword[rsp + offsetof(StackLayout::Thunk, r[5])]); + mov(r11, qword[rsp + offsetof(StackLayout::Thunk, r[6])]); +} + +void X64ThunkEmitter::EmitSaveNonvolatileRegs() { // Preserve nonvolatile registers. 
mov(qword[rsp + offsetof(StackLayout::Thunk, r[0])], rbx); mov(qword[rsp + offsetof(StackLayout::Thunk, r[1])], rcx); @@ -427,12 +545,9 @@ HostToGuestThunk X64ThunkEmitter::EmitHostToGuestThunk() { movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[7])], xmm13); movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[8])], xmm14); movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[9])], xmm15); +} - mov(rax, rcx); - mov(rsi, rdx); // context - mov(rcx, r8); // return address - call(rax); - +void X64ThunkEmitter::EmitLoadNonvolatileRegs() { movaps(xmm6, qword[rsp + offsetof(StackLayout::Thunk, xmm[0])]); movaps(xmm7, qword[rsp + offsetof(StackLayout::Thunk, xmm[1])]); movaps(xmm8, qword[rsp + offsetof(StackLayout::Thunk, xmm[2])]); @@ -453,100 +568,6 @@ HostToGuestThunk X64ThunkEmitter::EmitHostToGuestThunk() { mov(r13, qword[rsp + offsetof(StackLayout::Thunk, r[6])]); mov(r14, qword[rsp + offsetof(StackLayout::Thunk, r[7])]); mov(r15, qword[rsp + offsetof(StackLayout::Thunk, r[8])]); - - add(rsp, stack_size); - mov(rcx, qword[rsp + 8 * 1]); - mov(rdx, qword[rsp + 8 * 2]); - mov(r8, qword[rsp + 8 * 3]); - ret(); - - void* fn = Emplace(stack_size); - return (HostToGuestThunk)fn; -} - -GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() { - // rcx = context - // rdx = target function - // r8 = arg0 - // r9 = arg1 - // r10 = arg2 - - const size_t stack_size = StackLayout::THUNK_STACK_SIZE; - // rsp + 0 = return address - mov(qword[rsp + 8 * 2], rdx); - mov(qword[rsp + 8 * 1], rcx); - sub(rsp, stack_size); - - // Save off volatile registers. - // TODO(DrChat): Enable this when we actually need this. 
- // mov(qword[rsp + offsetof(StackLayout::Thunk, r[0])], rcx); - // mov(qword[rsp + offsetof(StackLayout::Thunk, r[1])], rdx); - // mov(qword[rsp + offsetof(StackLayout::Thunk, r[2])], r8); - // mov(qword[rsp + offsetof(StackLayout::Thunk, r[3])], r9); - // mov(qword[rsp + offsetof(StackLayout::Thunk, r[4])], r10); - // mov(qword[rsp + offsetof(StackLayout::Thunk, r[5])], r11); - - // movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[1])], xmm1); - // movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[2])], xmm2); - // movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[3])], xmm3); - // movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[4])], xmm4); - // movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[5])], xmm5); - - mov(rax, rdx); - mov(rcx, rsi); // context - mov(rdx, r8); - mov(r8, r9); - mov(r9, r10); - call(rax); - - // movaps(xmm1, qword[rsp + offsetof(StackLayout::Thunk, xmm[1])]); - // movaps(xmm2, qword[rsp + offsetof(StackLayout::Thunk, xmm[2])]); - // movaps(xmm3, qword[rsp + offsetof(StackLayout::Thunk, xmm[3])]); - // movaps(xmm4, qword[rsp + offsetof(StackLayout::Thunk, xmm[4])]); - // movaps(xmm5, qword[rsp + offsetof(StackLayout::Thunk, xmm[5])]); - - // mov(rcx, qword[rsp + offsetof(StackLayout::Thunk, r[0])]); - // mov(rdx, qword[rsp + offsetof(StackLayout::Thunk, r[1])]); - // mov(r8, qword[rsp + offsetof(StackLayout::Thunk, r[2])]); - // mov(r9, qword[rsp + offsetof(StackLayout::Thunk, r[3])]); - // mov(r10, qword[rsp + offsetof(StackLayout::Thunk, r[4])]); - // mov(r11, qword[rsp + offsetof(StackLayout::Thunk, r[5])]); - - add(rsp, stack_size); - mov(rcx, qword[rsp + 8 * 1]); - mov(rdx, qword[rsp + 8 * 2]); - ret(); - - void* fn = Emplace(stack_size); - return (GuestToHostThunk)fn; -} - -// X64Emitter handles actually resolving functions. 
-extern "C" uint64_t ResolveFunction(void* raw_context, uint32_t target_address); - -ResolveFunctionThunk X64ThunkEmitter::EmitResolveFunctionThunk() { - // ebx = target PPC address - // rcx = context - - uint32_t stack_size = 0x18; - - // rsp + 0 = return address - mov(qword[rsp + 8 * 2], rdx); - mov(qword[rsp + 8 * 1], rcx); - sub(rsp, stack_size); - - mov(rcx, rsi); // context - mov(rdx, rbx); - mov(rax, uint64_t(&ResolveFunction)); - call(rax); - - add(rsp, stack_size); - mov(rcx, qword[rsp + 8 * 1]); - mov(rdx, qword[rsp + 8 * 2]); - jmp(rax); - - void* fn = Emplace(stack_size); - return (ResolveFunctionThunk)fn; } } // namespace x64 diff --git a/src/xenia/cpu/backend/x64/x64_code_cache.cc b/src/xenia/cpu/backend/x64/x64_code_cache.cc index b258f2658..e4a23248e 100644 --- a/src/xenia/cpu/backend/x64/x64_code_cache.cc +++ b/src/xenia/cpu/backend/x64/x64_code_cache.cc @@ -174,15 +174,17 @@ void* X64CodeCache::PlaceGuestCode(uint32_t guest_address, void* machine_code, // If we are going above the high water mark of committed memory, commit // some more. It's ok if multiple threads do this, as redundant commits // aren't harmful. - size_t old_commit_mark = generated_code_commit_mark_; - if (high_mark > old_commit_mark) { - size_t new_commit_mark = old_commit_mark + 16 * 1024 * 1024; + size_t old_commit_mark, new_commit_mark; + do { + old_commit_mark = generated_code_commit_mark_; + if (high_mark <= old_commit_mark) break; + + new_commit_mark = old_commit_mark + 16 * 1024 * 1024; xe::memory::AllocFixed(generated_code_base_, new_commit_mark, xe::memory::AllocationType::kCommit, xe::memory::PageAccess::kExecuteReadWrite); - generated_code_commit_mark_.compare_exchange_strong(old_commit_mark, - new_commit_mark); - } + } while (generated_code_commit_mark_.compare_exchange_weak( + old_commit_mark, new_commit_mark)); // Copy code. 
std::memcpy(code_address, machine_code, code_size); @@ -248,15 +250,17 @@ uint32_t X64CodeCache::PlaceData(const void* data, size_t length) { // If we are going above the high water mark of committed memory, commit some // more. It's ok if multiple threads do this, as redundant commits aren't // harmful. - size_t old_commit_mark = generated_code_commit_mark_; - if (high_mark > old_commit_mark) { - size_t new_commit_mark = old_commit_mark + 16 * 1024 * 1024; + size_t old_commit_mark, new_commit_mark; + do { + old_commit_mark = generated_code_commit_mark_; + if (high_mark <= old_commit_mark) break; + + new_commit_mark = old_commit_mark + 16 * 1024 * 1024; xe::memory::AllocFixed(generated_code_base_, new_commit_mark, xe::memory::AllocationType::kCommit, xe::memory::PageAccess::kExecuteReadWrite); - generated_code_commit_mark_.compare_exchange_strong(old_commit_mark, - new_commit_mark); - } + } while (generated_code_commit_mark_.compare_exchange_weak(old_commit_mark, + new_commit_mark)); // Copy code. 
std::memcpy(data_address, data, length); diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc index 0c6957acc..7ffd7b582 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.cc +++ b/src/xenia/cpu/backend/x64/x64_emitter.cc @@ -56,12 +56,13 @@ static const size_t kStashOffset = 32; // static const size_t kStashOffsetHigh = 32 + 32; const uint32_t X64Emitter::gpr_reg_map_[X64Emitter::GPR_COUNT] = { - Xbyak::Operand::RBX, Xbyak::Operand::R12, Xbyak::Operand::R13, - Xbyak::Operand::R14, Xbyak::Operand::R15, + Xbyak::Operand::RBX, Xbyak::Operand::R10, Xbyak::Operand::R11, + Xbyak::Operand::R12, Xbyak::Operand::R13, Xbyak::Operand::R14, + Xbyak::Operand::R15, }; const uint32_t X64Emitter::xmm_reg_map_[X64Emitter::XMM_COUNT] = { - 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, }; X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator) @@ -148,11 +149,13 @@ bool X64Emitter::Emit(HIRBuilder* builder, size_t* out_stack_size) { for (auto it = locals.begin(); it != locals.end(); ++it) { auto slot = *it; size_t type_size = GetTypeSize(slot->type); + // Align to natural size. stack_offset = xe::align(stack_offset, type_size); slot->set_constant((uint32_t)stack_offset); stack_offset += type_size; } + // Ensure 16b alignment. stack_offset -= StackLayout::GUEST_STACK_SIZE; stack_offset = xe::align(stack_offset, static_cast(16)); @@ -160,7 +163,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, size_t* out_stack_size) { // Function prolog. // Must be 16b aligned. // Windows is very strict about the form of this and the epilog: - // https://msdn.microsoft.com/en-us/library/tawsa7cb.aspx + // https://docs.microsoft.com/en-us/cpp/build/prolog-and-epilog?view=vs-2017 // IMPORTANT: any changes to the prolog must be kept in sync with // X64CodeCache, which dynamically generates exception information. // Adding or changing anything here must be matched! 
@@ -168,6 +171,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, size_t* out_stack_size) { assert_true((stack_size + 8) % 16 == 0); *out_stack_size = stack_size; stack_size_ = stack_size; + sub(rsp, (uint32_t)stack_size); mov(qword[rsp + StackLayout::GUEST_CTX_HOME], GetContextReg()); mov(qword[rsp + StackLayout::GUEST_RET_ADDR], rcx); @@ -221,6 +225,8 @@ bool X64Emitter::Emit(HIRBuilder* builder, size_t* out_stack_size) { const Instr* new_tail = instr; if (!SelectSequence(this, instr, &new_tail)) { // No sequence found! + // NOTE: If you encounter this after adding a new instruction, do a full + // rebuild! assert_always(); XELOGE("Unable to process HIR opcode %s", instr->opcode->name); break; @@ -340,13 +346,14 @@ void X64Emitter::UnimplementedInstr(const hir::Instr* i) { // This is used by the X64ThunkEmitter's ResolveFunctionThunk. extern "C" uint64_t ResolveFunction(void* raw_context, - uint32_t target_address) { + uint64_t target_address) { auto thread_state = *reinterpret_cast(raw_context); // TODO(benvanik): required? assert_not_zero(target_address); - auto fn = thread_state->processor()->ResolveFunction(target_address); + auto fn = + thread_state->processor()->ResolveFunction((uint32_t)target_address); assert_not_null(fn); auto x64_fn = static_cast(fn); uint64_t addr = reinterpret_cast(x64_fn->machine_code()); @@ -373,10 +380,7 @@ void X64Emitter::Call(const hir::Instr* instr, GuestFunction* function) { // Old-style resolve. // Not too important because indirection table is almost always available. // TODO: Overwrite the call-site with a straight call. - mov(rax, reinterpret_cast(ResolveFunction)); - mov(rcx, GetContextReg()); - mov(rdx, function->address()); - call(rax); + CallNative(&ResolveFunction, function->address()); } // Actually jump/call to rax. 
@@ -457,16 +461,15 @@ void X64Emitter::CallExtern(const hir::Instr* instr, const Function* function) { auto builtin_function = static_cast(function); if (builtin_function->handler()) { undefined = false; - // rcx = context - // rdx = target host function - // r8 = arg0 - // r9 = arg1 - mov(rcx, GetContextReg()); - mov(rdx, reinterpret_cast(builtin_function->handler())); - mov(r8, reinterpret_cast(builtin_function->arg0())); - mov(r9, reinterpret_cast(builtin_function->arg1())); + // rcx = target function + // rdx = arg0 + // r8 = arg1 + // r9 = arg2 auto thunk = backend()->guest_to_host_thunk(); mov(rax, reinterpret_cast(thunk)); + mov(rcx, reinterpret_cast(builtin_function->handler())); + mov(rdx, reinterpret_cast(builtin_function->arg0())); + mov(r8, reinterpret_cast(builtin_function->arg1())); call(rax); // rax = host return } @@ -474,13 +477,15 @@ void X64Emitter::CallExtern(const hir::Instr* instr, const Function* function) { auto extern_function = static_cast(function); if (extern_function->extern_handler()) { undefined = false; - // rcx = context - // rdx = target host function - mov(rcx, GetContextReg()); - mov(rdx, reinterpret_cast(extern_function->extern_handler())); - mov(r8, qword[GetContextReg() + offsetof(ppc::PPCContext, kernel_state)]); + // rcx = target function + // rdx = arg0 + // r8 = arg1 + // r9 = arg2 auto thunk = backend()->guest_to_host_thunk(); mov(rax, reinterpret_cast(thunk)); + mov(rcx, reinterpret_cast(extern_function->extern_handler())); + mov(rdx, + qword[GetContextReg() + offsetof(ppc::PPCContext, kernel_state)]); call(rax); // rax = host return } @@ -490,42 +495,30 @@ void X64Emitter::CallExtern(const hir::Instr* instr, const Function* function) { } } -void X64Emitter::CallNative(void* fn) { - mov(rax, reinterpret_cast(fn)); - mov(rcx, GetContextReg()); - call(rax); -} +void X64Emitter::CallNative(void* fn) { CallNativeSafe(fn); } void X64Emitter::CallNative(uint64_t (*fn)(void* raw_context)) { - mov(rax, reinterpret_cast(fn)); - 
mov(rcx, GetContextReg()); - call(rax); + CallNativeSafe(reinterpret_cast(fn)); } void X64Emitter::CallNative(uint64_t (*fn)(void* raw_context, uint64_t arg0)) { - mov(rax, reinterpret_cast(fn)); - mov(rcx, GetContextReg()); - call(rax); + CallNativeSafe(reinterpret_cast(fn)); } void X64Emitter::CallNative(uint64_t (*fn)(void* raw_context, uint64_t arg0), uint64_t arg0) { - mov(rax, reinterpret_cast(fn)); - mov(rcx, GetContextReg()); - mov(rdx, arg0); - call(rax); + mov(GetNativeParam(0), arg0); + CallNativeSafe(reinterpret_cast(fn)); } void X64Emitter::CallNativeSafe(void* fn) { - // rcx = context - // rdx = target function - // r8 = arg0 - // r9 = arg1 - // r10 = arg2 + // rcx = target function + // rdx = arg0 + // r8 = arg1 + // r9 = arg2 auto thunk = backend()->guest_to_host_thunk(); mov(rax, reinterpret_cast(thunk)); - mov(rcx, GetContextReg()); - mov(rdx, reinterpret_cast(fn)); + mov(rcx, reinterpret_cast(fn)); call(rax); // rax = host return } @@ -535,6 +528,18 @@ void X64Emitter::SetReturnAddress(uint64_t value) { mov(qword[rsp + StackLayout::GUEST_CALL_RET_ADDR], rax); } +Xbyak::Reg64 X64Emitter::GetNativeParam(uint32_t param) { + if (param == 0) + return rdx; + else if (param == 1) + return r8; + else if (param == 2) + return r9; + + assert_always(); + return r9; +} + // Important: If you change these, you must update the thunks in x64_backend.cc! Xbyak::Reg64 X64Emitter::GetContextReg() { return rsi; } Xbyak::Reg64 X64Emitter::GetMembaseReg() { return rdi; } diff --git a/src/xenia/cpu/backend/x64/x64_emitter.h b/src/xenia/cpu/backend/x64/x64_emitter.h index 33ce2c0a2..a35c2d2b0 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.h +++ b/src/xenia/cpu/backend/x64/x64_emitter.h @@ -139,13 +139,13 @@ class X64Emitter : public Xbyak::CodeGenerator { std::vector* out_source_map); public: - // Reserved: rsp + // Reserved: rsp, rsi, rdi // Scratch: rax/rcx/rdx // xmm0-2 - // Available: rbx, r12-r15 (save to get r8-r11, rbp, rsi, rdi?) 
- // xmm6-xmm15 (save to get xmm3-xmm5) - static const int GPR_COUNT = 5; - static const int XMM_COUNT = 10; + // Available: rbx, r10-r15 + // xmm4-xmm15 (save to get xmm3) + static const int GPR_COUNT = 7; + static const int XMM_COUNT = 12; static void SetupReg(const hir::Value* v, Xbyak::Reg8& r) { auto idx = gpr_reg_map_[v->reg.index]; @@ -187,6 +187,8 @@ class X64Emitter : public Xbyak::CodeGenerator { void CallNativeSafe(void* fn); void SetReturnAddress(uint64_t value); + Xbyak::Reg64 GetNativeParam(uint32_t param); + Xbyak::Reg64 GetContextReg(); Xbyak::Reg64 GetMembaseReg(); void ReloadContext(); diff --git a/src/xenia/cpu/backend/x64/x64_op.h b/src/xenia/cpu/backend/x64/x64_op.h new file mode 100644 index 000000000..f71338304 --- /dev/null +++ b/src/xenia/cpu/backend/x64/x64_op.h @@ -0,0 +1,629 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2018 Xenia Developers. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ +#ifndef XENIA_CPU_BACKEND_X64_X64_OP_H_ +#define XENIA_CPU_BACKEND_X64_X64_OP_H_ + +#include "xenia/cpu/backend/x64/x64_emitter.h" + +#include "xenia/cpu/hir/instr.h" + +namespace xe { +namespace cpu { +namespace backend { +namespace x64 { + +// TODO(benvanik): direct usings. +using namespace xe::cpu; +using namespace xe::cpu::hir; +using namespace Xbyak; + +// Selects the right byte/word/etc from a vector. We need to flip logical +// indices (0,1,2,3,4,5,6,7,...) = (3,2,1,0,7,6,5,4,...) 
+#define VEC128_B(n) ((n) ^ 0x3) +#define VEC128_W(n) ((n) ^ 0x1) +#define VEC128_D(n) (n) +#define VEC128_F(n) (n) + +enum KeyType { + KEY_TYPE_X = OPCODE_SIG_TYPE_X, + KEY_TYPE_L = OPCODE_SIG_TYPE_L, + KEY_TYPE_O = OPCODE_SIG_TYPE_O, + KEY_TYPE_S = OPCODE_SIG_TYPE_S, + KEY_TYPE_V_I8 = OPCODE_SIG_TYPE_V + INT8_TYPE, + KEY_TYPE_V_I16 = OPCODE_SIG_TYPE_V + INT16_TYPE, + KEY_TYPE_V_I32 = OPCODE_SIG_TYPE_V + INT32_TYPE, + KEY_TYPE_V_I64 = OPCODE_SIG_TYPE_V + INT64_TYPE, + KEY_TYPE_V_F32 = OPCODE_SIG_TYPE_V + FLOAT32_TYPE, + KEY_TYPE_V_F64 = OPCODE_SIG_TYPE_V + FLOAT64_TYPE, + KEY_TYPE_V_V128 = OPCODE_SIG_TYPE_V + VEC128_TYPE, +}; + +#pragma pack(push, 1) +union InstrKey { + struct { + uint32_t opcode : 8; + uint32_t dest : 5; + uint32_t src1 : 5; + uint32_t src2 : 5; + uint32_t src3 : 5; + uint32_t reserved : 4; + }; + uint32_t value; + + operator uint32_t() const { return value; } + + InstrKey() : value(0) {} + InstrKey(uint32_t v) : value(v) {} + InstrKey(const Instr* i) : value(0) { + opcode = i->opcode->num; + uint32_t sig = i->opcode->signature; + dest = + GET_OPCODE_SIG_TYPE_DEST(sig) ? 
OPCODE_SIG_TYPE_V + i->dest->type : 0; + src1 = GET_OPCODE_SIG_TYPE_SRC1(sig); + if (src1 == OPCODE_SIG_TYPE_V) { + src1 += i->src1.value->type; + } + src2 = GET_OPCODE_SIG_TYPE_SRC2(sig); + if (src2 == OPCODE_SIG_TYPE_V) { + src2 += i->src2.value->type; + } + src3 = GET_OPCODE_SIG_TYPE_SRC3(sig); + if (src3 == OPCODE_SIG_TYPE_V) { + src3 += i->src3.value->type; + } + } + + template + struct Construct { + static const uint32_t value = + (OPCODE) | (DEST << 8) | (SRC1 << 13) | (SRC2 << 18) | (SRC3 << 23); + }; +}; +#pragma pack(pop) +static_assert(sizeof(InstrKey) <= 4, "Key must be 4 bytes"); + +template +struct CombinedStruct; +template <> +struct CombinedStruct<> {}; +template +struct CombinedStruct : T, CombinedStruct {}; + +struct OpBase {}; + +template +struct Op : OpBase { + static const KeyType key_type = KEY_TYPE; +}; + +struct VoidOp : Op { + protected: + template + friend struct Op; + template + friend struct I; + void Load(const Instr::Op& op) {} +}; + +struct OffsetOp : Op { + uint64_t value; + + protected: + template + friend struct Op; + template + friend struct I; + void Load(const Instr::Op& op) { this->value = op.offset; } +}; + +struct SymbolOp : Op { + Function* value; + + protected: + template + friend struct Op; + template + friend struct I; + bool Load(const Instr::Op& op) { + this->value = op.symbol; + return true; + } +}; + +struct LabelOp : Op { + hir::Label* value; + + protected: + template + friend struct Op; + template + friend struct I; + void Load(const Instr::Op& op) { this->value = op.label; } +}; + +template +struct ValueOp : Op, KEY_TYPE> { + typedef REG_TYPE reg_type; + const Value* value; + bool is_constant; + virtual bool ConstantFitsIn32Reg() const { return true; } + const REG_TYPE& reg() const { + assert_true(!is_constant); + return reg_; + } + operator const REG_TYPE&() const { return reg(); } + bool IsEqual(const T& b) const { + if (is_constant && b.is_constant) { + return reinterpret_cast(this)->constant() == b.constant(); 
+ } else if (!is_constant && !b.is_constant) { + return reg_.getIdx() == b.reg_.getIdx(); + } else { + return false; + } + } + bool IsEqual(const Xbyak::Reg& b) const { + if (is_constant) { + return false; + } else if (!is_constant) { + return reg_.getIdx() == b.getIdx(); + } else { + return false; + } + } + bool operator==(const T& b) const { return IsEqual(b); } + bool operator!=(const T& b) const { return !IsEqual(b); } + bool operator==(const Xbyak::Reg& b) const { return IsEqual(b); } + bool operator!=(const Xbyak::Reg& b) const { return !IsEqual(b); } + void Load(const Instr::Op& op) { + value = op.value; + is_constant = value->IsConstant(); + if (!is_constant) { + X64Emitter::SetupReg(value, reg_); + } + } + + protected: + REG_TYPE reg_; +}; + +struct I8Op : ValueOp { + typedef ValueOp BASE; + const int8_t constant() const { + assert_true(BASE::is_constant); + return BASE::value->constant.i8; + } +}; +struct I16Op : ValueOp { + typedef ValueOp BASE; + const int16_t constant() const { + assert_true(BASE::is_constant); + return BASE::value->constant.i16; + } +}; +struct I32Op : ValueOp { + typedef ValueOp BASE; + const int32_t constant() const { + assert_true(BASE::is_constant); + return BASE::value->constant.i32; + } +}; +struct I64Op : ValueOp { + typedef ValueOp BASE; + const int64_t constant() const { + assert_true(BASE::is_constant); + return BASE::value->constant.i64; + } + bool ConstantFitsIn32Reg() const override { + int64_t v = BASE::value->constant.i64; + if ((v & ~0x7FFFFFFF) == 0) { + // Fits under 31 bits, so just load using normal mov. + return true; + } else if ((v & ~0x7FFFFFFF) == ~0x7FFFFFFF) { + // Negative number that fits in 32bits. 
+ return true; + } + return false; + } +}; +struct F32Op : ValueOp { + typedef ValueOp BASE; + const float constant() const { + assert_true(BASE::is_constant); + return BASE::value->constant.f32; + } +}; +struct F64Op : ValueOp { + typedef ValueOp BASE; + const double constant() const { + assert_true(BASE::is_constant); + return BASE::value->constant.f64; + } +}; +struct V128Op : ValueOp { + typedef ValueOp BASE; + const vec128_t& constant() const { + assert_true(BASE::is_constant); + return BASE::value->constant.v128; + } +}; + +template +struct DestField; +template +struct DestField { + DEST dest; + + protected: + bool LoadDest(const Instr* i) { + Instr::Op op; + op.value = i->dest; + dest.Load(op); + return true; + } +}; +template <> +struct DestField { + protected: + bool LoadDest(const Instr* i) { return true; } +}; + +template +struct I; +template +struct I : DestField { + typedef DestField BASE; + static const hir::Opcode opcode = OPCODE; + static const uint32_t key = + InstrKey::Construct::value; + static const KeyType dest_type = DEST::key_type; + const Instr* instr; + + protected: + template + friend struct Sequence; + bool Load(const Instr* i) { + if (InstrKey(i).value == key && BASE::LoadDest(i)) { + instr = i; + return true; + } + return false; + } +}; +template +struct I : DestField { + typedef DestField BASE; + static const hir::Opcode opcode = OPCODE; + static const uint32_t key = + InstrKey::Construct::value; + static const KeyType dest_type = DEST::key_type; + static const KeyType src1_type = SRC1::key_type; + const Instr* instr; + SRC1 src1; + + protected: + template + friend struct Sequence; + bool Load(const Instr* i) { + if (InstrKey(i).value == key && BASE::LoadDest(i)) { + instr = i; + src1.Load(i->src1); + return true; + } + return false; + } +}; +template +struct I : DestField { + typedef DestField BASE; + static const hir::Opcode opcode = OPCODE; + static const uint32_t key = + InstrKey::Construct::value; + static const KeyType dest_type 
= DEST::key_type; + static const KeyType src1_type = SRC1::key_type; + static const KeyType src2_type = SRC2::key_type; + const Instr* instr; + SRC1 src1; + SRC2 src2; + + protected: + template + friend struct Sequence; + bool Load(const Instr* i) { + if (InstrKey(i).value == key && BASE::LoadDest(i)) { + instr = i; + src1.Load(i->src1); + src2.Load(i->src2); + return true; + } + return false; + } +}; +template +struct I : DestField { + typedef DestField BASE; + static const hir::Opcode opcode = OPCODE; + static const uint32_t key = + InstrKey::Construct::value; + static const KeyType dest_type = DEST::key_type; + static const KeyType src1_type = SRC1::key_type; + static const KeyType src2_type = SRC2::key_type; + static const KeyType src3_type = SRC3::key_type; + const Instr* instr; + SRC1 src1; + SRC2 src2; + SRC3 src3; + + protected: + template + friend struct Sequence; + bool Load(const Instr* i) { + if (InstrKey(i).value == key && BASE::LoadDest(i)) { + instr = i; + src1.Load(i->src1); + src2.Load(i->src2); + src3.Load(i->src3); + return true; + } + return false; + } +}; + +template +static const T GetTempReg(X64Emitter& e); +template <> +const Reg8 GetTempReg(X64Emitter& e) { + return e.al; +} +template <> +const Reg16 GetTempReg(X64Emitter& e) { + return e.ax; +} +template <> +const Reg32 GetTempReg(X64Emitter& e) { + return e.eax; +} +template <> +const Reg64 GetTempReg(X64Emitter& e) { + return e.rax; +} + +template +struct Sequence { + typedef T EmitArgType; + + static constexpr uint32_t head_key() { return T::key; } + + static bool Select(X64Emitter& e, const Instr* i) { + T args; + if (!args.Load(i)) { + return false; + } + SEQ::Emit(e, args); + return true; + } + + template + static void EmitUnaryOp(X64Emitter& e, const EmitArgType& i, + const REG_FN& reg_fn) { + if (i.src1.is_constant) { + e.mov(i.dest, i.src1.constant()); + reg_fn(e, i.dest); + } else { + if (i.dest != i.src1) { + e.mov(i.dest, i.src1); + } + reg_fn(e, i.dest); + } + } + + template + 
static void EmitCommutativeBinaryOp(X64Emitter& e, const EmitArgType& i, + const REG_REG_FN& reg_reg_fn, + const REG_CONST_FN& reg_const_fn) { + if (i.src1.is_constant) { + if (i.src2.is_constant) { + // Both constants. + if (i.src1.ConstantFitsIn32Reg()) { + e.mov(i.dest, i.src2.constant()); + reg_const_fn(e, i.dest, static_cast(i.src1.constant())); + } else if (i.src2.ConstantFitsIn32Reg()) { + e.mov(i.dest, i.src1.constant()); + reg_const_fn(e, i.dest, static_cast(i.src2.constant())); + } else { + e.mov(i.dest, i.src1.constant()); + auto temp = GetTempReg(e); + e.mov(temp, i.src2.constant()); + reg_reg_fn(e, i.dest, temp); + } + } else { + // src1 constant. + if (i.dest == i.src2) { + if (i.src1.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.dest, static_cast(i.src1.constant())); + } else { + auto temp = GetTempReg(e); + e.mov(temp, i.src1.constant()); + reg_reg_fn(e, i.dest, temp); + } + } else { + e.mov(i.dest, i.src1.constant()); + reg_reg_fn(e, i.dest, i.src2); + } + } + } else if (i.src2.is_constant) { + if (i.dest == i.src1) { + if (i.src2.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.dest, static_cast(i.src2.constant())); + } else { + auto temp = GetTempReg(e); + e.mov(temp, i.src2.constant()); + reg_reg_fn(e, i.dest, temp); + } + } else { + e.mov(i.dest, i.src2.constant()); + reg_reg_fn(e, i.dest, i.src1); + } + } else { + if (i.dest == i.src1) { + reg_reg_fn(e, i.dest, i.src2); + } else if (i.dest == i.src2) { + reg_reg_fn(e, i.dest, i.src1); + } else { + e.mov(i.dest, i.src1); + reg_reg_fn(e, i.dest, i.src2); + } + } + } + template + static void EmitAssociativeBinaryOp(X64Emitter& e, const EmitArgType& i, + const REG_REG_FN& reg_reg_fn, + const REG_CONST_FN& reg_const_fn) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + if (i.dest == i.src2) { + auto temp = GetTempReg(e); + e.mov(temp, i.src2); + e.mov(i.dest, i.src1.constant()); + reg_reg_fn(e, i.dest, temp); + } else { + e.mov(i.dest, i.src1.constant()); + reg_reg_fn(e, i.dest, 
i.src2); + } + } else if (i.src2.is_constant) { + if (i.dest == i.src1) { + if (i.src2.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.dest, static_cast(i.src2.constant())); + } else { + auto temp = GetTempReg(e); + e.mov(temp, i.src2.constant()); + reg_reg_fn(e, i.dest, temp); + } + } else { + e.mov(i.dest, i.src1); + if (i.src2.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.dest, static_cast(i.src2.constant())); + } else { + auto temp = GetTempReg(e); + e.mov(temp, i.src2.constant()); + reg_reg_fn(e, i.dest, temp); + } + } + } else { + if (i.dest == i.src1) { + reg_reg_fn(e, i.dest, i.src2); + } else if (i.dest == i.src2) { + auto temp = GetTempReg(e); + e.mov(temp, i.src2); + e.mov(i.dest, i.src1); + reg_reg_fn(e, i.dest, temp); + } else { + e.mov(i.dest, i.src1); + reg_reg_fn(e, i.dest, i.src2); + } + } + } + + template + static void EmitCommutativeBinaryXmmOp(X64Emitter& e, const EmitArgType& i, + const FN& fn) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.LoadConstantXmm(e.xmm0, i.src1.constant()); + fn(e, i.dest, e.xmm0, i.src2); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + fn(e, i.dest, i.src1, e.xmm0); + } else { + fn(e, i.dest, i.src1, i.src2); + } + } + + template + static void EmitAssociativeBinaryXmmOp(X64Emitter& e, const EmitArgType& i, + const FN& fn) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.LoadConstantXmm(e.xmm0, i.src1.constant()); + fn(e, i.dest, e.xmm0, i.src2); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + fn(e, i.dest, i.src1, e.xmm0); + } else { + fn(e, i.dest, i.src1, i.src2); + } + } + + template + static void EmitCommutativeCompareOp(X64Emitter& e, const EmitArgType& i, + const REG_REG_FN& reg_reg_fn, + const REG_CONST_FN& reg_const_fn) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + if (i.src1.ConstantFitsIn32Reg()) { + 
reg_const_fn(e, i.src2, static_cast(i.src1.constant())); + } else { + auto temp = GetTempReg(e); + e.mov(temp, i.src1.constant()); + reg_reg_fn(e, i.src2, temp); + } + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + if (i.src2.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.src1, static_cast(i.src2.constant())); + } else { + auto temp = GetTempReg(e); + e.mov(temp, i.src2.constant()); + reg_reg_fn(e, i.src1, temp); + } + } else { + reg_reg_fn(e, i.src1, i.src2); + } + } + template + static void EmitAssociativeCompareOp(X64Emitter& e, const EmitArgType& i, + const REG_REG_FN& reg_reg_fn, + const REG_CONST_FN& reg_const_fn) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + if (i.src1.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.dest, i.src2, static_cast(i.src1.constant()), + true); + } else { + auto temp = GetTempReg(e); + e.mov(temp, i.src1.constant()); + reg_reg_fn(e, i.dest, i.src2, temp, true); + } + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + if (i.src2.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.dest, i.src1, static_cast(i.src2.constant()), + false); + } else { + auto temp = GetTempReg(e); + e.mov(temp, i.src2.constant()); + reg_reg_fn(e, i.dest, i.src1, temp, false); + } + } else { + reg_reg_fn(e, i.dest, i.src1, i.src2, false); + } + } +}; + +} // namespace x64 +} // namespace backend +} // namespace cpu +} // namespace xe + +#endif // XENIA_CPU_BACKEND_X64_X64_OP_H_ diff --git a/src/xenia/cpu/backend/x64/x64_seq_control.cc b/src/xenia/cpu/backend/x64/x64_seq_control.cc new file mode 100644 index 000000000..80eeeebc7 --- /dev/null +++ b/src/xenia/cpu/backend/x64/x64_seq_control.cc @@ -0,0 +1,553 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2018 Xenia Developers. All rights reserved. 
* + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include "xenia/cpu/backend/x64/x64_sequences.h" + +#include +#include + +#include "xenia/cpu/backend/x64/x64_op.h" + +namespace xe { +namespace cpu { +namespace backend { +namespace x64 { + +volatile int anchor_control = 0; + +// ============================================================================ +// OPCODE_DEBUG_BREAK +// ============================================================================ +struct DEBUG_BREAK : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { e.DebugBreak(); } +}; +EMITTER_OPCODE_TABLE(OPCODE_DEBUG_BREAK, DEBUG_BREAK); + +// ============================================================================ +// OPCODE_DEBUG_BREAK_TRUE +// ============================================================================ +struct DEBUG_BREAK_TRUE_I8 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.DebugBreak(); + e.L(skip); + } +}; +struct DEBUG_BREAK_TRUE_I16 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.DebugBreak(); + e.L(skip); + } +}; +struct DEBUG_BREAK_TRUE_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.DebugBreak(); + e.L(skip); + } +}; +struct DEBUG_BREAK_TRUE_I64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.DebugBreak(); + e.L(skip); + } +}; +struct DEBUG_BREAK_TRUE_F32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.DebugBreak(); + e.L(skip); + } +}; +struct DEBUG_BREAK_TRUE_F64 + : 
Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.DebugBreak(); + e.L(skip); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_DEBUG_BREAK_TRUE, DEBUG_BREAK_TRUE_I8, + DEBUG_BREAK_TRUE_I16, DEBUG_BREAK_TRUE_I32, + DEBUG_BREAK_TRUE_I64, DEBUG_BREAK_TRUE_F32, + DEBUG_BREAK_TRUE_F64); + +// ============================================================================ +// OPCODE_TRAP +// ============================================================================ +struct TRAP : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.Trap(i.instr->flags); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_TRAP, TRAP); + +// ============================================================================ +// OPCODE_TRAP_TRUE +// ============================================================================ +struct TRAP_TRUE_I8 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Trap(i.instr->flags); + e.L(skip); + } +}; +struct TRAP_TRUE_I16 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Trap(i.instr->flags); + e.L(skip); + } +}; +struct TRAP_TRUE_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Trap(i.instr->flags); + e.L(skip); + } +}; +struct TRAP_TRUE_I64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Trap(i.instr->flags); + e.L(skip); + } +}; +struct TRAP_TRUE_F32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Trap(i.instr->flags); + e.L(skip); + } +}; +struct TRAP_TRUE_F64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { 
+ e.vptest(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Trap(i.instr->flags); + e.L(skip); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_TRAP_TRUE, TRAP_TRUE_I8, TRAP_TRUE_I16, + TRAP_TRUE_I32, TRAP_TRUE_I64, TRAP_TRUE_F32, + TRAP_TRUE_F64); + +// ============================================================================ +// OPCODE_CALL +// ============================================================================ +struct CALL : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + assert_true(i.src1.value->is_guest()); + e.Call(i.instr, static_cast(i.src1.value)); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_CALL, CALL); + +// ============================================================================ +// OPCODE_CALL_TRUE +// ============================================================================ +struct CALL_TRUE_I8 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->is_guest()); + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Call(i.instr, static_cast(i.src2.value)); + e.L(skip); + } +}; +struct CALL_TRUE_I16 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->is_guest()); + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Call(i.instr, static_cast(i.src2.value)); + e.L(skip); + } +}; +struct CALL_TRUE_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->is_guest()); + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Call(i.instr, static_cast(i.src2.value)); + e.L(skip); + } +}; +struct CALL_TRUE_I64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->is_guest()); + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Call(i.instr, static_cast(i.src2.value)); + e.L(skip); + } +}; +struct CALL_TRUE_F32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + 
assert_true(i.src2.value->is_guest()); + e.vptest(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Call(i.instr, static_cast(i.src2.value)); + e.L(skip); + } +}; +struct CALL_TRUE_F64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->is_guest()); + e.vptest(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Call(i.instr, static_cast(i.src2.value)); + e.L(skip); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_CALL_TRUE, CALL_TRUE_I8, CALL_TRUE_I16, + CALL_TRUE_I32, CALL_TRUE_I64, CALL_TRUE_F32, + CALL_TRUE_F64); + +// ============================================================================ +// OPCODE_CALL_INDIRECT +// ============================================================================ +struct CALL_INDIRECT + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.CallIndirect(i.instr, i.src1); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_CALL_INDIRECT, CALL_INDIRECT); + +// ============================================================================ +// OPCODE_CALL_INDIRECT_TRUE +// ============================================================================ +struct CALL_INDIRECT_TRUE_I8 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip, CodeGenerator::T_NEAR); + e.CallIndirect(i.instr, i.src2); + e.L(skip); + } +}; +struct CALL_INDIRECT_TRUE_I16 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip, CodeGenerator::T_NEAR); + e.CallIndirect(i.instr, i.src2); + e.L(skip); + } +}; +struct CALL_INDIRECT_TRUE_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip, CodeGenerator::T_NEAR); + e.CallIndirect(i.instr, i.src2); + e.L(skip); + } +}; +struct CALL_INDIRECT_TRUE_I64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + 
e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip, CodeGenerator::T_NEAR); + e.CallIndirect(i.instr, i.src2); + e.L(skip); + } +}; +struct CALL_INDIRECT_TRUE_F32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip, CodeGenerator::T_NEAR); + e.CallIndirect(i.instr, i.src2); + e.L(skip); + } +}; +struct CALL_INDIRECT_TRUE_F64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip, CodeGenerator::T_NEAR); + e.CallIndirect(i.instr, i.src2); + e.L(skip); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_CALL_INDIRECT_TRUE, CALL_INDIRECT_TRUE_I8, + CALL_INDIRECT_TRUE_I16, CALL_INDIRECT_TRUE_I32, + CALL_INDIRECT_TRUE_I64, CALL_INDIRECT_TRUE_F32, + CALL_INDIRECT_TRUE_F64); + +// ============================================================================ +// OPCODE_CALL_EXTERN +// ============================================================================ +struct CALL_EXTERN + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.CallExtern(i.instr, i.src1.value); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_CALL_EXTERN, CALL_EXTERN); + +// ============================================================================ +// OPCODE_RETURN +// ============================================================================ +struct RETURN : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // If this is the last instruction in the last block, just let us + // fall through. 
+ if (i.instr->next || i.instr->block->next) { + e.jmp(e.epilog_label(), CodeGenerator::T_NEAR); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_RETURN, RETURN); + +// ============================================================================ +// OPCODE_RETURN_TRUE +// ============================================================================ +struct RETURN_TRUE_I8 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jnz(e.epilog_label(), CodeGenerator::T_NEAR); + } +}; +struct RETURN_TRUE_I16 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jnz(e.epilog_label(), CodeGenerator::T_NEAR); + } +}; +struct RETURN_TRUE_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jnz(e.epilog_label(), CodeGenerator::T_NEAR); + } +}; +struct RETURN_TRUE_I64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jnz(e.epilog_label(), CodeGenerator::T_NEAR); + } +}; +struct RETURN_TRUE_F32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + e.jnz(e.epilog_label(), CodeGenerator::T_NEAR); + } +}; +struct RETURN_TRUE_F64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + e.jnz(e.epilog_label(), CodeGenerator::T_NEAR); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_RETURN_TRUE, RETURN_TRUE_I8, RETURN_TRUE_I16, + RETURN_TRUE_I32, RETURN_TRUE_I64, RETURN_TRUE_F32, + RETURN_TRUE_F64); + +// ============================================================================ +// OPCODE_SET_RETURN_ADDRESS +// ============================================================================ +struct SET_RETURN_ADDRESS + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.SetReturnAddress(i.src1.constant()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_SET_RETURN_ADDRESS, SET_RETURN_ADDRESS); 
+ +// ============================================================================ +// OPCODE_BRANCH +// ============================================================================ +struct BRANCH : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.jmp(i.src1.value->name, e.T_NEAR); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_BRANCH, BRANCH); + +// ============================================================================ +// OPCODE_BRANCH_TRUE +// ============================================================================ +struct BRANCH_TRUE_I8 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jnz(i.src2.value->name, e.T_NEAR); + } +}; +struct BRANCH_TRUE_I16 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jnz(i.src2.value->name, e.T_NEAR); + } +}; +struct BRANCH_TRUE_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jnz(i.src2.value->name, e.T_NEAR); + } +}; +struct BRANCH_TRUE_I64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jnz(i.src2.value->name, e.T_NEAR); + } +}; +struct BRANCH_TRUE_F32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + e.jnz(i.src2.value->name, e.T_NEAR); + } +}; +struct BRANCH_TRUE_F64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + e.jnz(i.src2.value->name, e.T_NEAR); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_BRANCH_TRUE, BRANCH_TRUE_I8, BRANCH_TRUE_I16, + BRANCH_TRUE_I32, BRANCH_TRUE_I64, BRANCH_TRUE_F32, + BRANCH_TRUE_F64); + +// ============================================================================ +// OPCODE_BRANCH_FALSE +// ============================================================================ +struct BRANCH_FALSE_I8 + : Sequence> { + static void Emit(X64Emitter& e, const 
EmitArgType& i) { + e.test(i.src1, i.src1); + e.jz(i.src2.value->name, e.T_NEAR); + } +}; +struct BRANCH_FALSE_I16 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jz(i.src2.value->name, e.T_NEAR); + } +}; +struct BRANCH_FALSE_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jz(i.src2.value->name, e.T_NEAR); + } +}; +struct BRANCH_FALSE_I64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jz(i.src2.value->name, e.T_NEAR); + } +}; +struct BRANCH_FALSE_F32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + e.jz(i.src2.value->name, e.T_NEAR); + } +}; +struct BRANCH_FALSE_F64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + e.jz(i.src2.value->name, e.T_NEAR); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_BRANCH_FALSE, BRANCH_FALSE_I8, BRANCH_FALSE_I16, + BRANCH_FALSE_I32, BRANCH_FALSE_I64, BRANCH_FALSE_F32, + BRANCH_FALSE_F64); + +} // namespace x64 +} // namespace backend +} // namespace cpu +} // namespace xe \ No newline at end of file diff --git a/src/xenia/cpu/backend/x64/x64_seq_memory.cc b/src/xenia/cpu/backend/x64/x64_seq_memory.cc new file mode 100644 index 000000000..7526d1fc8 --- /dev/null +++ b/src/xenia/cpu/backend/x64/x64_seq_memory.cc @@ -0,0 +1,1053 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2018 Xenia Developers. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#include "xenia/cpu/backend/x64/x64_sequences.h" + +#include +#include + +#include "xenia/cpu/backend/x64/x64_op.h" +#include "xenia/cpu/backend/x64/x64_tracers.h" + +namespace xe { +namespace cpu { +namespace backend { +namespace x64 { + +volatile int anchor_memory = 0; + +// Note: all types are always aligned in the context. +RegExp ComputeContextAddress(X64Emitter& e, const OffsetOp& offset) { + return e.GetContextReg() + offset.value; +} + +template +RegExp ComputeMemoryAddressOffset(X64Emitter& e, const T& guest, + const T& offset) { + assert_true(offset.is_constant); + int32_t offset_const = static_cast(offset.constant()); + + if (guest.is_constant) { + uint32_t address = static_cast(guest.constant()); + address += offset_const; + if (address < 0x80000000) { + return e.GetMembaseReg() + address; + } else { + e.mov(e.eax, address); + return e.GetMembaseReg() + e.rax; + } + } else { + // Clear the top 32 bits, as they are likely garbage. + // TODO(benvanik): find a way to avoid doing this. + e.mov(e.eax, guest.reg().cvt32()); + return e.GetMembaseReg() + e.rax + offset_const; + } +} + +// Note: most *should* be aligned, but needs to be checked! +template +RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) { + if (guest.is_constant) { + // TODO(benvanik): figure out how to do this without a temp. + // Since the constant is often 0x8... if we tried to use that as a + // displacement it would be sign extended and mess things up. + uint32_t address = static_cast(guest.constant()); + if (address < 0x80000000) { + return e.GetMembaseReg() + address; + } else { + e.mov(e.eax, address); + return e.GetMembaseReg() + e.rax; + } + } else { + // Clear the top 32 bits, as they are likely garbage. + // TODO(benvanik): find a way to avoid doing this. 
+ e.mov(e.eax, guest.reg().cvt32()); + return e.GetMembaseReg() + e.rax; + } +} + +// ============================================================================ +// OPCODE_ATOMIC_EXCHANGE +// ============================================================================ +// Note that the address we use here is a real, host address! +// This is weird, and should be fixed. +template +void EmitAtomicExchangeXX(X64Emitter& e, const ARGS& i) { + if (i.dest == i.src1) { + e.mov(e.rax, i.src1); + if (i.dest != i.src2) { + if (i.src2.is_constant) { + e.mov(i.dest, i.src2.constant()); + } else { + e.mov(i.dest, i.src2); + } + } + e.lock(); + e.xchg(e.dword[e.rax], i.dest); + } else { + if (i.dest != i.src2) { + if (i.src2.is_constant) { + e.mov(i.dest, i.src2.constant()); + } else { + e.mov(i.dest, i.src2); + } + } + e.lock(); + e.xchg(e.dword[i.src1.reg()], i.dest); + } +} +struct ATOMIC_EXCHANGE_I8 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitAtomicExchangeXX(e, i); + } +}; +struct ATOMIC_EXCHANGE_I16 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitAtomicExchangeXX(e, i); + } +}; +struct ATOMIC_EXCHANGE_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitAtomicExchangeXX(e, i); + } +}; +struct ATOMIC_EXCHANGE_I64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitAtomicExchangeXX(e, i); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_ATOMIC_EXCHANGE, ATOMIC_EXCHANGE_I8, + ATOMIC_EXCHANGE_I16, ATOMIC_EXCHANGE_I32, + ATOMIC_EXCHANGE_I64); + +// ============================================================================ +// OPCODE_ATOMIC_COMPARE_EXCHANGE +// ============================================================================ +struct ATOMIC_COMPARE_EXCHANGE_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.mov(e.eax, i.src2); + e.mov(e.ecx, i.src1.reg().cvt32()); + e.lock(); + e.cmpxchg(e.dword[e.GetMembaseReg() + 
e.rcx], i.src3); + e.sete(i.dest); + } +}; +struct ATOMIC_COMPARE_EXCHANGE_I64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.mov(e.rax, i.src2); + e.mov(e.ecx, i.src1.reg().cvt32()); + e.lock(); + e.cmpxchg(e.qword[e.GetMembaseReg() + e.rcx], i.src3); + e.sete(i.dest); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_ATOMIC_COMPARE_EXCHANGE, + ATOMIC_COMPARE_EXCHANGE_I32, ATOMIC_COMPARE_EXCHANGE_I64); + +// ============================================================================ +// OPCODE_LOAD_LOCAL +// ============================================================================ +// Note: all types are always aligned on the stack. +struct LOAD_LOCAL_I8 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.mov(i.dest, e.byte[e.rsp + i.src1.constant()]); + // e.TraceLoadI8(DATA_LOCAL, i.src1.constant, i.dest); + } +}; +struct LOAD_LOCAL_I16 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.mov(i.dest, e.word[e.rsp + i.src1.constant()]); + // e.TraceLoadI16(DATA_LOCAL, i.src1.constant, i.dest); + } +}; +struct LOAD_LOCAL_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.mov(i.dest, e.dword[e.rsp + i.src1.constant()]); + // e.TraceLoadI32(DATA_LOCAL, i.src1.constant, i.dest); + } +}; +struct LOAD_LOCAL_I64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.mov(i.dest, e.qword[e.rsp + i.src1.constant()]); + // e.TraceLoadI64(DATA_LOCAL, i.src1.constant, i.dest); + } +}; +struct LOAD_LOCAL_F32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vmovss(i.dest, e.dword[e.rsp + i.src1.constant()]); + // e.TraceLoadF32(DATA_LOCAL, i.src1.constant, i.dest); + } +}; +struct LOAD_LOCAL_F64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vmovsd(i.dest, e.qword[e.rsp + i.src1.constant()]); + // e.TraceLoadF64(DATA_LOCAL, i.src1.constant, i.dest); + } +}; +struct LOAD_LOCAL_V128 + : Sequence> 
{ + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vmovaps(i.dest, e.ptr[e.rsp + i.src1.constant()]); + // e.TraceLoadV128(DATA_LOCAL, i.src1.constant, i.dest); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_LOAD_LOCAL, LOAD_LOCAL_I8, LOAD_LOCAL_I16, + LOAD_LOCAL_I32, LOAD_LOCAL_I64, LOAD_LOCAL_F32, + LOAD_LOCAL_F64, LOAD_LOCAL_V128); + +// ============================================================================ +// OPCODE_STORE_LOCAL +// ============================================================================ +// Note: all types are always aligned on the stack. +struct STORE_LOCAL_I8 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // e.TraceStoreI8(DATA_LOCAL, i.src1.constant, i.src2); + e.mov(e.byte[e.rsp + i.src1.constant()], i.src2); + } +}; +struct STORE_LOCAL_I16 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // e.TraceStoreI16(DATA_LOCAL, i.src1.constant, i.src2); + e.mov(e.word[e.rsp + i.src1.constant()], i.src2); + } +}; +struct STORE_LOCAL_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // e.TraceStoreI32(DATA_LOCAL, i.src1.constant, i.src2); + e.mov(e.dword[e.rsp + i.src1.constant()], i.src2); + } +}; +struct STORE_LOCAL_I64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // e.TraceStoreI64(DATA_LOCAL, i.src1.constant, i.src2); + e.mov(e.qword[e.rsp + i.src1.constant()], i.src2); + } +}; +struct STORE_LOCAL_F32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // e.TraceStoreF32(DATA_LOCAL, i.src1.constant, i.src2); + e.vmovss(e.dword[e.rsp + i.src1.constant()], i.src2); + } +}; +struct STORE_LOCAL_F64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // e.TraceStoreF64(DATA_LOCAL, i.src1.constant, i.src2); + e.vmovsd(e.qword[e.rsp + i.src1.constant()], i.src2); + } +}; +struct STORE_LOCAL_V128 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // 
e.TraceStoreV128(DATA_LOCAL, i.src1.constant, i.src2); + e.vmovaps(e.ptr[e.rsp + i.src1.constant()], i.src2); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_STORE_LOCAL, STORE_LOCAL_I8, STORE_LOCAL_I16, + STORE_LOCAL_I32, STORE_LOCAL_I64, STORE_LOCAL_F32, + STORE_LOCAL_F64, STORE_LOCAL_V128); + +// ============================================================================ +// OPCODE_LOAD_CONTEXT +// ============================================================================ +struct LOAD_CONTEXT_I8 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeContextAddress(e, i.src1); + e.mov(i.dest, e.byte[addr]); + if (IsTracingData()) { + e.mov(e.GetNativeParam(0), i.src1.value); + e.mov(e.GetNativeParam(1), e.byte[addr]); + e.CallNative(reinterpret_cast(TraceContextLoadI8)); + } + } +}; +struct LOAD_CONTEXT_I16 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeContextAddress(e, i.src1); + e.mov(i.dest, e.word[addr]); + if (IsTracingData()) { + e.mov(e.GetNativeParam(1), e.word[addr]); + e.mov(e.GetNativeParam(0), i.src1.value); + e.CallNative(reinterpret_cast(TraceContextLoadI16)); + } + } +}; +struct LOAD_CONTEXT_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeContextAddress(e, i.src1); + e.mov(i.dest, e.dword[addr]); + if (IsTracingData()) { + e.mov(e.GetNativeParam(1), e.dword[addr]); + e.mov(e.GetNativeParam(0), i.src1.value); + e.CallNative(reinterpret_cast(TraceContextLoadI32)); + } + } +}; +struct LOAD_CONTEXT_I64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeContextAddress(e, i.src1); + e.mov(i.dest, e.qword[addr]); + if (IsTracingData()) { + e.mov(e.GetNativeParam(1), e.qword[addr]); + e.mov(e.GetNativeParam(0), i.src1.value); + e.CallNative(reinterpret_cast(TraceContextLoadI64)); + } + } +}; +struct LOAD_CONTEXT_F32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& 
i) { + auto addr = ComputeContextAddress(e, i.src1); + e.vmovss(i.dest, e.dword[addr]); + if (IsTracingData()) { + e.lea(e.GetNativeParam(1), e.dword[addr]); + e.mov(e.GetNativeParam(0), i.src1.value); + e.CallNative(reinterpret_cast(TraceContextLoadF32)); + } + } +}; +struct LOAD_CONTEXT_F64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeContextAddress(e, i.src1); + e.vmovsd(i.dest, e.qword[addr]); + if (IsTracingData()) { + e.lea(e.GetNativeParam(1), e.qword[addr]); + e.mov(e.GetNativeParam(0), i.src1.value); + e.CallNative(reinterpret_cast(TraceContextLoadF64)); + } + } +}; +struct LOAD_CONTEXT_V128 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeContextAddress(e, i.src1); + e.vmovaps(i.dest, e.ptr[addr]); + if (IsTracingData()) { + e.lea(e.GetNativeParam(1), e.ptr[addr]); + e.mov(e.GetNativeParam(0), i.src1.value); + e.CallNative(reinterpret_cast(TraceContextLoadV128)); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_LOAD_CONTEXT, LOAD_CONTEXT_I8, LOAD_CONTEXT_I16, + LOAD_CONTEXT_I32, LOAD_CONTEXT_I64, LOAD_CONTEXT_F32, + LOAD_CONTEXT_F64, LOAD_CONTEXT_V128); + +// ============================================================================ +// OPCODE_STORE_CONTEXT +// ============================================================================ +// Note: all types are always aligned on the stack. 
+struct STORE_CONTEXT_I8 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeContextAddress(e, i.src1); + if (i.src2.is_constant) { + e.mov(e.byte[addr], i.src2.constant()); + } else { + e.mov(e.byte[addr], i.src2); + } + if (IsTracingData()) { + e.mov(e.GetNativeParam(1), e.byte[addr]); + e.mov(e.GetNativeParam(0), i.src1.value); + e.CallNative(reinterpret_cast(TraceContextStoreI8)); + } + } +}; +struct STORE_CONTEXT_I16 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeContextAddress(e, i.src1); + if (i.src2.is_constant) { + e.mov(e.word[addr], i.src2.constant()); + } else { + e.mov(e.word[addr], i.src2); + } + if (IsTracingData()) { + e.mov(e.GetNativeParam(1), e.word[addr]); + e.mov(e.GetNativeParam(0), i.src1.value); + e.CallNative(reinterpret_cast(TraceContextStoreI16)); + } + } +}; +struct STORE_CONTEXT_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeContextAddress(e, i.src1); + if (i.src2.is_constant) { + e.mov(e.dword[addr], i.src2.constant()); + } else { + e.mov(e.dword[addr], i.src2); + } + if (IsTracingData()) { + e.mov(e.GetNativeParam(1), e.dword[addr]); + e.mov(e.GetNativeParam(0), i.src1.value); + e.CallNative(reinterpret_cast(TraceContextStoreI32)); + } + } +}; +struct STORE_CONTEXT_I64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeContextAddress(e, i.src1); + if (i.src2.is_constant) { + e.MovMem64(addr, i.src2.constant()); + } else { + e.mov(e.qword[addr], i.src2); + } + if (IsTracingData()) { + e.mov(e.GetNativeParam(1), e.qword[addr]); + e.mov(e.GetNativeParam(0), i.src1.value); + e.CallNative(reinterpret_cast(TraceContextStoreI64)); + } + } +}; +struct STORE_CONTEXT_F32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeContextAddress(e, i.src1); + if (i.src2.is_constant) { + e.mov(e.dword[addr], 
i.src2.value->constant.i32); + } else { + e.vmovss(e.dword[addr], i.src2); + } + if (IsTracingData()) { + e.lea(e.GetNativeParam(1), e.dword[addr]); + e.mov(e.GetNativeParam(0), i.src1.value); + e.CallNative(reinterpret_cast(TraceContextStoreF32)); + } + } +}; +struct STORE_CONTEXT_F64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeContextAddress(e, i.src1); + if (i.src2.is_constant) { + e.MovMem64(addr, i.src2.value->constant.i64); + } else { + e.vmovsd(e.qword[addr], i.src2); + } + if (IsTracingData()) { + e.lea(e.GetNativeParam(1), e.qword[addr]); + e.mov(e.GetNativeParam(0), i.src1.value); + e.CallNative(reinterpret_cast(TraceContextStoreF64)); + } + } +}; +struct STORE_CONTEXT_V128 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeContextAddress(e, i.src1); + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.vmovaps(e.ptr[addr], e.xmm0); + } else { + e.vmovaps(e.ptr[addr], i.src2); + } + if (IsTracingData()) { + e.lea(e.GetNativeParam(1), e.ptr[addr]); + e.mov(e.GetNativeParam(0), i.src1.value); + e.CallNative(reinterpret_cast(TraceContextStoreV128)); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_STORE_CONTEXT, STORE_CONTEXT_I8, STORE_CONTEXT_I16, + STORE_CONTEXT_I32, STORE_CONTEXT_I64, STORE_CONTEXT_F32, + STORE_CONTEXT_F64, STORE_CONTEXT_V128); + +// ============================================================================ +// OPCODE_LOAD_MMIO +// ============================================================================ +// Note: all types are always aligned in the context. 
+struct LOAD_MMIO_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // uint64_t (context, addr) + auto mmio_range = reinterpret_cast(i.src1.value); + auto read_address = uint32_t(i.src2.value); + e.mov(e.GetNativeParam(0), uint64_t(mmio_range->callback_context)); + e.mov(e.GetNativeParam(1).cvt32(), read_address); + e.CallNativeSafe(reinterpret_cast(mmio_range->read)); + e.bswap(e.eax); + e.mov(i.dest, e.eax); + if (IsTracingData()) { + e.mov(e.GetNativeParam(0), i.dest); + e.mov(e.edx, read_address); + e.CallNative(reinterpret_cast(TraceContextLoadI32)); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_LOAD_MMIO, LOAD_MMIO_I32); + +// ============================================================================ +// OPCODE_STORE_MMIO +// ============================================================================ +// Note: all types are always aligned on the stack. +struct STORE_MMIO_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // void (context, addr, value) + auto mmio_range = reinterpret_cast(i.src1.value); + auto write_address = uint32_t(i.src2.value); + e.mov(e.GetNativeParam(0), uint64_t(mmio_range->callback_context)); + e.mov(e.GetNativeParam(1).cvt32(), write_address); + if (i.src3.is_constant) { + e.mov(e.GetNativeParam(2).cvt32(), xe::byte_swap(i.src3.constant())); + } else { + e.mov(e.GetNativeParam(2).cvt32(), i.src3); + e.bswap(e.GetNativeParam(2).cvt32()); + } + e.CallNativeSafe(reinterpret_cast(mmio_range->write)); + if (IsTracingData()) { + if (i.src3.is_constant) { + e.mov(e.GetNativeParam(0).cvt32(), i.src3.constant()); + } else { + e.mov(e.GetNativeParam(0).cvt32(), i.src3); + } + e.mov(e.edx, write_address); + e.CallNative(reinterpret_cast(TraceContextStoreI32)); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_STORE_MMIO, STORE_MMIO_I32); + +// ============================================================================ +// OPCODE_LOAD_OFFSET +// 
============================================================================ +struct LOAD_OFFSET_I8 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); + e.mov(i.dest, e.byte[addr]); + } +}; + +struct LOAD_OFFSET_I16 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + if (e.IsFeatureEnabled(kX64EmitMovbe)) { + e.movbe(i.dest, e.word[addr]); + } else { + e.mov(i.dest, e.word[addr]); + e.ror(i.dest, 8); + } + } else { + e.mov(i.dest, e.word[addr]); + } + } +}; + +struct LOAD_OFFSET_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + if (e.IsFeatureEnabled(kX64EmitMovbe)) { + e.movbe(i.dest, e.dword[addr]); + } else { + e.mov(i.dest, e.dword[addr]); + e.bswap(i.dest); + } + } else { + e.mov(i.dest, e.dword[addr]); + } + } +}; + +struct LOAD_OFFSET_I64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + if (e.IsFeatureEnabled(kX64EmitMovbe)) { + e.movbe(i.dest, e.qword[addr]); + } else { + e.mov(i.dest, e.qword[addr]); + e.bswap(i.dest); + } + } else { + e.mov(i.dest, e.qword[addr]); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_LOAD_OFFSET, LOAD_OFFSET_I8, LOAD_OFFSET_I16, + LOAD_OFFSET_I32, LOAD_OFFSET_I64); + +// ============================================================================ +// OPCODE_STORE_OFFSET +// ============================================================================ +struct STORE_OFFSET_I8 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); + 
if (i.src3.is_constant) { + e.mov(e.byte[addr], i.src3.constant()); + } else { + e.mov(e.byte[addr], i.src3); + } + } +}; + +struct STORE_OFFSET_I16 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_false(i.src3.is_constant); + if (e.IsFeatureEnabled(kX64EmitMovbe)) { + e.movbe(e.word[addr], i.src3); + } else { + assert_always("not implemented"); + } + } else { + if (i.src3.is_constant) { + e.mov(e.word[addr], i.src3.constant()); + } else { + e.mov(e.word[addr], i.src3); + } + } + } +}; + +struct STORE_OFFSET_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_false(i.src3.is_constant); + if (e.IsFeatureEnabled(kX64EmitMovbe)) { + e.movbe(e.dword[addr], i.src3); + } else { + assert_always("not implemented"); + } + } else { + if (i.src3.is_constant) { + e.mov(e.dword[addr], i.src3.constant()); + } else { + e.mov(e.dword[addr], i.src3); + } + } + } +}; + +struct STORE_OFFSET_I64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_false(i.src3.is_constant); + if (e.IsFeatureEnabled(kX64EmitMovbe)) { + e.movbe(e.qword[addr], i.src3); + } else { + assert_always("not implemented"); + } + } else { + if (i.src3.is_constant) { + e.MovMem64(addr, i.src3.constant()); + } else { + e.mov(e.qword[addr], i.src3); + } + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_STORE_OFFSET, STORE_OFFSET_I8, STORE_OFFSET_I16, + STORE_OFFSET_I32, STORE_OFFSET_I64); + +// ============================================================================ +// OPCODE_LOAD +// 
============================================================================ +struct LOAD_I8 : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + e.mov(i.dest, e.byte[addr]); + if (IsTracingData()) { + e.mov(e.GetNativeParam(1).cvt8(), i.dest); + e.lea(e.GetNativeParam(0), e.ptr[addr]); + e.CallNative(reinterpret_cast(TraceMemoryLoadI8)); + } + } +}; +struct LOAD_I16 : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + if (e.IsFeatureEnabled(kX64EmitMovbe)) { + e.movbe(i.dest, e.word[addr]); + } else { + e.mov(i.dest, e.word[addr]); + e.ror(i.dest, 8); + } + } else { + e.mov(i.dest, e.word[addr]); + } + if (IsTracingData()) { + e.mov(e.GetNativeParam(1).cvt16(), i.dest); + e.lea(e.GetNativeParam(0), e.ptr[addr]); + e.CallNative(reinterpret_cast(TraceMemoryLoadI16)); + } + } +}; +struct LOAD_I32 : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + if (e.IsFeatureEnabled(kX64EmitMovbe)) { + e.movbe(i.dest, e.dword[addr]); + } else { + e.mov(i.dest, e.dword[addr]); + e.bswap(i.dest); + } + } else { + e.mov(i.dest, e.dword[addr]); + } + if (IsTracingData()) { + e.mov(e.GetNativeParam(1).cvt32(), i.dest); + e.lea(e.GetNativeParam(0), e.ptr[addr]); + e.CallNative(reinterpret_cast(TraceMemoryLoadI32)); + } + } +}; +struct LOAD_I64 : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + if (e.IsFeatureEnabled(kX64EmitMovbe)) { + e.movbe(i.dest, e.qword[addr]); + } else { + e.mov(i.dest, e.qword[addr]); + e.bswap(i.dest); + } + } else { + e.mov(i.dest, e.qword[addr]); + } + if (IsTracingData()) { + e.mov(e.GetNativeParam(1), 
i.dest); + e.lea(e.GetNativeParam(0), e.ptr[addr]); + e.CallNative(reinterpret_cast(TraceMemoryLoadI64)); + } + } +}; +struct LOAD_F32 : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + e.vmovss(i.dest, e.dword[addr]); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_always("not implemented yet"); + } + if (IsTracingData()) { + e.lea(e.GetNativeParam(1), e.dword[addr]); + e.lea(e.GetNativeParam(0), e.ptr[addr]); + e.CallNative(reinterpret_cast(TraceMemoryLoadF32)); + } + } +}; +struct LOAD_F64 : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + e.vmovsd(i.dest, e.qword[addr]); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_always("not implemented yet"); + } + if (IsTracingData()) { + e.lea(e.GetNativeParam(1), e.qword[addr]); + e.lea(e.GetNativeParam(0), e.ptr[addr]); + e.CallNative(reinterpret_cast(TraceMemoryLoadF64)); + } + } +}; +struct LOAD_V128 : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + // TODO(benvanik): we should try to stick to movaps if possible. + e.vmovups(i.dest, e.ptr[addr]); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + // TODO(benvanik): find a way to do this without the memory load. + e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteSwapMask)); + } + if (IsTracingData()) { + e.lea(e.GetNativeParam(1), e.ptr[addr]); + e.lea(e.GetNativeParam(0), e.ptr[addr]); + e.CallNative(reinterpret_cast(TraceMemoryLoadV128)); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_LOAD, LOAD_I8, LOAD_I16, LOAD_I32, LOAD_I64, + LOAD_F32, LOAD_F64, LOAD_V128); + +// ============================================================================ +// OPCODE_STORE +// ============================================================================ +// Note: most *should* be aligned, but needs to be checked! 
+struct STORE_I8 : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + if (i.src2.is_constant) { + e.mov(e.byte[addr], i.src2.constant()); + } else { + e.mov(e.byte[addr], i.src2); + } + if (IsTracingData()) { + addr = ComputeMemoryAddress(e, i.src1); + e.mov(e.GetNativeParam(1).cvt8(), e.byte[addr]); + e.lea(e.GetNativeParam(0), e.ptr[addr]); + e.CallNative(reinterpret_cast(TraceMemoryStoreI8)); + } + } +}; +struct STORE_I16 : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_false(i.src2.is_constant); + if (e.IsFeatureEnabled(kX64EmitMovbe)) { + e.movbe(e.word[addr], i.src2); + } else { + assert_always("not implemented"); + } + } else { + if (i.src2.is_constant) { + e.mov(e.word[addr], i.src2.constant()); + } else { + e.mov(e.word[addr], i.src2); + } + } + if (IsTracingData()) { + addr = ComputeMemoryAddress(e, i.src1); + e.mov(e.GetNativeParam(1).cvt16(), e.word[addr]); + e.lea(e.GetNativeParam(0), e.ptr[addr]); + e.CallNative(reinterpret_cast(TraceMemoryStoreI16)); + } + } +}; +struct STORE_I32 : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_false(i.src2.is_constant); + if (e.IsFeatureEnabled(kX64EmitMovbe)) { + e.movbe(e.dword[addr], i.src2); + } else { + assert_always("not implemented"); + } + } else { + if (i.src2.is_constant) { + e.mov(e.dword[addr], i.src2.constant()); + } else { + e.mov(e.dword[addr], i.src2); + } + } + if (IsTracingData()) { + addr = ComputeMemoryAddress(e, i.src1); + e.mov(e.GetNativeParam(1).cvt32(), e.dword[addr]); + e.lea(e.GetNativeParam(0), e.ptr[addr]); + e.CallNative(reinterpret_cast(TraceMemoryStoreI32)); + } + } +}; +struct STORE_I64 : Sequence> { + static void Emit(X64Emitter& e, 
const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_false(i.src2.is_constant); + if (e.IsFeatureEnabled(kX64EmitMovbe)) { + e.movbe(e.qword[addr], i.src2); + } else { + assert_always("not implemented"); + } + } else { + if (i.src2.is_constant) { + e.MovMem64(addr, i.src2.constant()); + } else { + e.mov(e.qword[addr], i.src2); + } + } + if (IsTracingData()) { + addr = ComputeMemoryAddress(e, i.src1); + e.mov(e.GetNativeParam(1), e.qword[addr]); + e.lea(e.GetNativeParam(0), e.ptr[addr]); + e.CallNative(reinterpret_cast(TraceMemoryStoreI64)); + } + } +}; +struct STORE_F32 : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_false(i.src2.is_constant); + assert_always("not yet implemented"); + } else { + if (i.src2.is_constant) { + e.mov(e.dword[addr], i.src2.value->constant.i32); + } else { + e.vmovss(e.dword[addr], i.src2); + } + } + if (IsTracingData()) { + addr = ComputeMemoryAddress(e, i.src1); + e.lea(e.GetNativeParam(1), e.ptr[addr]); + e.lea(e.GetNativeParam(0), e.ptr[addr]); + e.CallNative(reinterpret_cast(TraceMemoryStoreF32)); + } + } +}; +struct STORE_F64 : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_false(i.src2.is_constant); + assert_always("not yet implemented"); + } else { + if (i.src2.is_constant) { + e.MovMem64(addr, i.src2.value->constant.i64); + } else { + e.vmovsd(e.qword[addr], i.src2); + } + } + if (IsTracingData()) { + addr = ComputeMemoryAddress(e, i.src1); + e.lea(e.GetNativeParam(1), e.ptr[addr]); + e.lea(e.GetNativeParam(0), e.ptr[addr]); + e.CallNative(reinterpret_cast(TraceMemoryStoreF64)); + } + } +}; +struct STORE_V128 + : Sequence> { + static void Emit(X64Emitter& e, 
const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_false(i.src2.is_constant); + e.vpshufb(e.xmm0, i.src2, e.GetXmmConstPtr(XMMByteSwapMask)); + e.vmovaps(e.ptr[addr], e.xmm0); + } else { + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.vmovaps(e.ptr[addr], e.xmm0); + } else { + e.vmovaps(e.ptr[addr], i.src2); + } + } + if (IsTracingData()) { + addr = ComputeMemoryAddress(e, i.src1); + e.lea(e.GetNativeParam(1), e.ptr[addr]); + e.lea(e.GetNativeParam(0), e.ptr[addr]); + e.CallNative(reinterpret_cast(TraceMemoryStoreV128)); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_STORE, STORE_I8, STORE_I16, STORE_I32, STORE_I64, + STORE_F32, STORE_F64, STORE_V128); + +// ============================================================================ +// OPCODE_PREFETCH +// ============================================================================ +struct PREFETCH + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // TODO(benvanik): prefetch addr -> length. 
+ } +}; +EMITTER_OPCODE_TABLE(OPCODE_PREFETCH, PREFETCH); + +// ============================================================================ +// OPCODE_MEMORY_BARRIER +// ============================================================================ +struct MEMORY_BARRIER + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { e.mfence(); } +}; +EMITTER_OPCODE_TABLE(OPCODE_MEMORY_BARRIER, MEMORY_BARRIER); + +// ============================================================================ +// OPCODE_MEMSET +// ============================================================================ +struct MEMSET_I64_I8_I64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.is_constant); + assert_true(i.src3.is_constant); + assert_true(i.src2.constant() == 0); + e.vpxor(e.xmm0, e.xmm0); + auto addr = ComputeMemoryAddress(e, i.src1); + switch (i.src3.constant()) { + case 32: + e.vmovaps(e.ptr[addr + 0 * 16], e.xmm0); + e.vmovaps(e.ptr[addr + 1 * 16], e.xmm0); + break; + case 128: + e.vmovaps(e.ptr[addr + 0 * 16], e.xmm0); + e.vmovaps(e.ptr[addr + 1 * 16], e.xmm0); + e.vmovaps(e.ptr[addr + 2 * 16], e.xmm0); + e.vmovaps(e.ptr[addr + 3 * 16], e.xmm0); + e.vmovaps(e.ptr[addr + 4 * 16], e.xmm0); + e.vmovaps(e.ptr[addr + 5 * 16], e.xmm0); + e.vmovaps(e.ptr[addr + 6 * 16], e.xmm0); + e.vmovaps(e.ptr[addr + 7 * 16], e.xmm0); + break; + default: + assert_unhandled_case(i.src3.constant()); + break; + } + if (IsTracingData()) { + addr = ComputeMemoryAddress(e, i.src1); + e.mov(e.GetNativeParam(2), i.src3.constant()); + e.mov(e.GetNativeParam(1), i.src2.constant()); + e.lea(e.GetNativeParam(0), e.ptr[addr]); + e.CallNative(reinterpret_cast(TraceMemset)); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_MEMSET, MEMSET_I64_I8_I64); + +} // namespace x64 +} // namespace backend +} // namespace cpu +} // namespace xe diff --git a/src/xenia/cpu/backend/x64/x64_seq_vector.cc b/src/xenia/cpu/backend/x64/x64_seq_vector.cc new file mode 100644 index 
000000000..89d3bee14 --- /dev/null +++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc @@ -0,0 +1,2553 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2018 Xenia Developers. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include "xenia/cpu/backend/x64/x64_sequences.h" + +#include +#include + +#include "xenia/cpu/backend/x64/x64_op.h" + +// For OPCODE_PACK/OPCODE_UNPACK +#include "third_party/half/include/half.hpp" + +namespace xe { +namespace cpu { +namespace backend { +namespace x64 { + +volatile int anchor_vector = 0; + +// ============================================================================ +// OPCODE_VECTOR_CONVERT_I2F +// ============================================================================ +struct VECTOR_CONVERT_I2F + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // flags = ARITHMETIC_UNSIGNED + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + // xmm0 = mask of positive values + e.vpcmpgtd(e.xmm0, i.src1, e.GetXmmConstPtr(XMMFFFF)); + + // scale any values >= (unsigned)INT_MIN back to [0, INT_MAX] + e.vpsubd(e.xmm1, i.src1, e.GetXmmConstPtr(XMMSignMaskI32)); + e.vblendvps(e.xmm1, e.xmm1, i.src1, e.xmm0); + + // xmm1 = [0, INT_MAX] + e.vcvtdq2ps(i.dest, e.xmm1); + + // scale values back above [INT_MIN, UINT_MAX] + e.vpandn(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMPosIntMinPS)); + e.vaddps(i.dest, i.dest, e.xmm0); + } else { + e.vcvtdq2ps(i.dest, i.src1); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_CONVERT_I2F, VECTOR_CONVERT_I2F); + +// ============================================================================ +// OPCODE_VECTOR_CONVERT_F2I +// 
============================================================================ +struct VECTOR_CONVERT_F2I + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + // clamp to min 0 + e.vmaxps(e.xmm0, i.src1, e.GetXmmConstPtr(XMMZero)); + + // xmm1 = mask of values >= (unsigned)INT_MIN + e.vcmpgeps(e.xmm1, e.xmm0, e.GetXmmConstPtr(XMMPosIntMinPS)); + + // scale any values >= (unsigned)INT_MIN back to [0, ...] + e.vsubps(e.xmm2, e.xmm0, e.GetXmmConstPtr(XMMPosIntMinPS)); + e.vblendvps(e.xmm0, e.xmm0, e.xmm2, e.xmm1); + + // xmm0 = [0, INT_MAX] + // this may still contain values > INT_MAX (if src has vals > UINT_MAX) + e.vcvttps2dq(i.dest, e.xmm0); + + // xmm0 = mask of values that need saturation + e.vpcmpeqd(e.xmm0, i.dest, e.GetXmmConstPtr(XMMIntMin)); + + // scale values back above [INT_MIN, UINT_MAX] + e.vpand(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMIntMin)); + e.vpaddd(i.dest, i.dest, e.xmm1); + + // saturate values > UINT_MAX + e.vpor(i.dest, i.dest, e.xmm0); + } else { + // xmm2 = NaN mask + e.vcmpunordps(e.xmm2, i.src1, i.src1); + + // convert packed floats to packed dwords + e.vcvttps2dq(e.xmm0, i.src1); + + // (high bit) xmm1 = dest is indeterminate and i.src1 >= 0 + e.vpcmpeqd(e.xmm1, e.xmm0, e.GetXmmConstPtr(XMMIntMin)); + e.vpandn(e.xmm1, i.src1, e.xmm1); + + // saturate positive values + e.vblendvps(i.dest, e.xmm0, e.GetXmmConstPtr(XMMIntMax), e.xmm1); + + // mask NaNs + e.vpandn(i.dest, e.xmm2, i.dest); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_CONVERT_F2I, VECTOR_CONVERT_F2I); + +// ============================================================================ +// OPCODE_LOAD_VECTOR_SHL +// ============================================================================ +static const vec128_t lvsl_table[16] = { + vec128b(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), + vec128b(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), + vec128b(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 
15, 16, 17), + vec128b(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18), + vec128b(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19), + vec128b(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20), + vec128b(6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21), + vec128b(7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22), + vec128b(8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23), + vec128b(9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24), + vec128b(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25), + vec128b(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26), + vec128b(12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27), + vec128b(13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28), + vec128b(14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29), + vec128b(15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30), +}; +struct LOAD_VECTOR_SHL_I8 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (i.src1.is_constant) { + auto sh = i.src1.constant(); + assert_true(sh < xe::countof(lvsl_table)); + e.mov(e.rax, (uintptr_t)&lvsl_table[sh]); + e.vmovaps(i.dest, e.ptr[e.rax]); + } else { + // TODO(benvanik): find a cheaper way of doing this. 
+ e.movzx(e.rdx, i.src1); + e.and_(e.dx, 0xF); + e.shl(e.dx, 4); + e.mov(e.rax, (uintptr_t)lvsl_table); + e.vmovaps(i.dest, e.ptr[e.rax + e.rdx]); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_LOAD_VECTOR_SHL, LOAD_VECTOR_SHL_I8); + +// ============================================================================ +// OPCODE_LOAD_VECTOR_SHR +// ============================================================================ +static const vec128_t lvsr_table[16] = { + vec128b(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), + vec128b(15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30), + vec128b(14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29), + vec128b(13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28), + vec128b(12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27), + vec128b(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26), + vec128b(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25), + vec128b(9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24), + vec128b(8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23), + vec128b(7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22), + vec128b(6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21), + vec128b(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20), + vec128b(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19), + vec128b(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18), + vec128b(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17), + vec128b(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), +}; +struct LOAD_VECTOR_SHR_I8 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (i.src1.is_constant) { + auto sh = i.src1.constant(); + assert_true(sh < xe::countof(lvsr_table)); + e.mov(e.rax, (uintptr_t)&lvsr_table[sh]); + e.vmovaps(i.dest, e.ptr[e.rax]); + } else { + // TODO(benvanik): find a cheaper way of doing this. 
+ e.movzx(e.rdx, i.src1); + e.and_(e.dx, 0xF); + e.shl(e.dx, 4); + e.mov(e.rax, (uintptr_t)lvsr_table); + e.vmovaps(i.dest, e.ptr[e.rax + e.rdx]); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_LOAD_VECTOR_SHR, LOAD_VECTOR_SHR_I8); + +// ============================================================================ +// OPCODE_VECTOR_MAX +// ============================================================================ +struct VECTOR_MAX + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryXmmOp( + e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + uint32_t part_type = i.instr->flags >> 8; + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + switch (part_type) { + case INT8_TYPE: + e.vpmaxub(dest, src1, src2); + break; + case INT16_TYPE: + e.vpmaxuw(dest, src1, src2); + break; + case INT32_TYPE: + e.vpmaxud(dest, src1, src2); + break; + default: + assert_unhandled_case(part_type); + break; + } + } else { + switch (part_type) { + case INT8_TYPE: + e.vpmaxsb(dest, src1, src2); + break; + case INT16_TYPE: + e.vpmaxsw(dest, src1, src2); + break; + case INT32_TYPE: + e.vpmaxsd(dest, src1, src2); + break; + default: + assert_unhandled_case(part_type); + break; + } + } + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_MAX, VECTOR_MAX); + +// ============================================================================ +// OPCODE_VECTOR_MIN +// ============================================================================ +struct VECTOR_MIN + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryXmmOp( + e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + uint32_t part_type = i.instr->flags >> 8; + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + switch (part_type) { + case INT8_TYPE: + e.vpminub(dest, src1, src2); + break; + case INT16_TYPE: + e.vpminuw(dest, src1, src2); + break; + case INT32_TYPE: + e.vpminud(dest, src1, src2); + break; + default: + assert_unhandled_case(part_type); + break; 
+ } + } else { + switch (part_type) { + case INT8_TYPE: + e.vpminsb(dest, src1, src2); + break; + case INT16_TYPE: + e.vpminsw(dest, src1, src2); + break; + case INT32_TYPE: + e.vpminsd(dest, src1, src2); + break; + default: + assert_unhandled_case(part_type); + break; + } + } + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_MIN, VECTOR_MIN); + +// ============================================================================ +// OPCODE_VECTOR_COMPARE_EQ +// ============================================================================ +struct VECTOR_COMPARE_EQ_V128 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryXmmOp( + e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + switch (i.instr->flags) { + case INT8_TYPE: + e.vpcmpeqb(dest, src1, src2); + break; + case INT16_TYPE: + e.vpcmpeqw(dest, src1, src2); + break; + case INT32_TYPE: + e.vpcmpeqd(dest, src1, src2); + break; + case FLOAT32_TYPE: + e.vcmpeqps(dest, src1, src2); + break; + } + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_EQ, VECTOR_COMPARE_EQ_V128); + +// ============================================================================ +// OPCODE_VECTOR_COMPARE_SGT +// ============================================================================ +struct VECTOR_COMPARE_SGT_V128 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitAssociativeBinaryXmmOp( + e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + switch (i.instr->flags) { + case INT8_TYPE: + e.vpcmpgtb(dest, src1, src2); + break; + case INT16_TYPE: + e.vpcmpgtw(dest, src1, src2); + break; + case INT32_TYPE: + e.vpcmpgtd(dest, src1, src2); + break; + case FLOAT32_TYPE: + e.vcmpgtps(dest, src1, src2); + break; + } + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_SGT, VECTOR_COMPARE_SGT_V128); + +// ============================================================================ +// OPCODE_VECTOR_COMPARE_SGE +// 
============================================================================ +struct VECTOR_COMPARE_SGE_V128 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitAssociativeBinaryXmmOp( + e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + switch (i.instr->flags) { + case INT8_TYPE: + e.vpcmpeqb(e.xmm0, src1, src2); + e.vpcmpgtb(dest, src1, src2); + e.vpor(dest, e.xmm0); + break; + case INT16_TYPE: + e.vpcmpeqw(e.xmm0, src1, src2); + e.vpcmpgtw(dest, src1, src2); + e.vpor(dest, e.xmm0); + break; + case INT32_TYPE: + e.vpcmpeqd(e.xmm0, src1, src2); + e.vpcmpgtd(dest, src1, src2); + e.vpor(dest, e.xmm0); + break; + case FLOAT32_TYPE: + e.vcmpgeps(dest, src1, src2); + break; + } + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_SGE, VECTOR_COMPARE_SGE_V128); + +// ============================================================================ +// OPCODE_VECTOR_COMPARE_UGT +// ============================================================================ +struct VECTOR_COMPARE_UGT_V128 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + Xbyak::Address sign_addr = e.ptr[e.rax]; // dummy + switch (i.instr->flags) { + case INT8_TYPE: + sign_addr = e.GetXmmConstPtr(XMMSignMaskI8); + break; + case INT16_TYPE: + sign_addr = e.GetXmmConstPtr(XMMSignMaskI16); + break; + case INT32_TYPE: + sign_addr = e.GetXmmConstPtr(XMMSignMaskI32); + break; + case FLOAT32_TYPE: + sign_addr = e.GetXmmConstPtr(XMMSignMaskF32); + break; + default: + assert_always(); + break; + } + if (i.src1.is_constant) { + // TODO(benvanik): make this constant. + e.LoadConstantXmm(e.xmm0, i.src1.constant()); + e.vpxor(e.xmm0, sign_addr); + } else { + e.vpxor(e.xmm0, i.src1, sign_addr); + } + if (i.src2.is_constant) { + // TODO(benvanik): make this constant. 
+ e.LoadConstantXmm(e.xmm1, i.src2.constant()); + e.vpxor(e.xmm1, sign_addr); + } else { + e.vpxor(e.xmm1, i.src2, sign_addr); + } + switch (i.instr->flags) { + case INT8_TYPE: + e.vpcmpgtb(i.dest, e.xmm0, e.xmm1); + break; + case INT16_TYPE: + e.vpcmpgtw(i.dest, e.xmm0, e.xmm1); + break; + case INT32_TYPE: + e.vpcmpgtd(i.dest, e.xmm0, e.xmm1); + break; + case FLOAT32_TYPE: + e.vcmpgtps(i.dest, e.xmm0, e.xmm1); + break; + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_UGT, VECTOR_COMPARE_UGT_V128); + +// ============================================================================ +// OPCODE_VECTOR_COMPARE_UGE +// ============================================================================ +struct VECTOR_COMPARE_UGE_V128 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + Xbyak::Address sign_addr = e.ptr[e.rax]; // dummy + switch (i.instr->flags) { + case INT8_TYPE: + sign_addr = e.GetXmmConstPtr(XMMSignMaskI8); + break; + case INT16_TYPE: + sign_addr = e.GetXmmConstPtr(XMMSignMaskI16); + break; + case INT32_TYPE: + sign_addr = e.GetXmmConstPtr(XMMSignMaskI32); + break; + case FLOAT32_TYPE: + sign_addr = e.GetXmmConstPtr(XMMSignMaskF32); + break; + } + if (i.src1.is_constant) { + // TODO(benvanik): make this constant. + e.LoadConstantXmm(e.xmm0, i.src1.constant()); + e.vpxor(e.xmm0, sign_addr); + } else { + e.vpxor(e.xmm0, i.src1, sign_addr); + } + if (i.src2.is_constant) { + // TODO(benvanik): make this constant. 
+ e.LoadConstantXmm(e.xmm1, i.src2.constant()); + e.vpxor(e.xmm1, sign_addr); + } else { + e.vpxor(e.xmm1, i.src2, sign_addr); + } + switch (i.instr->flags) { + case INT8_TYPE: + e.vpcmpeqb(e.xmm2, e.xmm0, e.xmm1); + e.vpcmpgtb(i.dest, e.xmm0, e.xmm1); + e.vpor(i.dest, e.xmm2); + break; + case INT16_TYPE: + e.vpcmpeqw(e.xmm2, e.xmm0, e.xmm1); + e.vpcmpgtw(i.dest, e.xmm0, e.xmm1); + e.vpor(i.dest, e.xmm2); + break; + case INT32_TYPE: + e.vpcmpeqd(e.xmm2, e.xmm0, e.xmm1); + e.vpcmpgtd(i.dest, e.xmm0, e.xmm1); + e.vpor(i.dest, e.xmm2); + break; + case FLOAT32_TYPE: + e.vcmpgeps(i.dest, e.xmm0, e.xmm1); + break; + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_UGE, VECTOR_COMPARE_UGE_V128); + +// ============================================================================ +// OPCODE_VECTOR_ADD +// ============================================================================ +struct VECTOR_ADD + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryXmmOp( + e, i, [&i](X64Emitter& e, const Xmm& dest, Xmm src1, Xmm src2) { + const TypeName part_type = + static_cast(i.instr->flags & 0xFF); + const uint32_t arithmetic_flags = i.instr->flags >> 8; + bool is_unsigned = !!(arithmetic_flags & ARITHMETIC_UNSIGNED); + bool saturate = !!(arithmetic_flags & ARITHMETIC_SATURATE); + switch (part_type) { + case INT8_TYPE: + if (saturate) { + // TODO(benvanik): trace DID_SATURATE + if (is_unsigned) { + e.vpaddusb(dest, src1, src2); + } else { + e.vpaddsb(dest, src1, src2); + } + } else { + e.vpaddb(dest, src1, src2); + } + break; + case INT16_TYPE: + if (saturate) { + // TODO(benvanik): trace DID_SATURATE + if (is_unsigned) { + e.vpaddusw(dest, src1, src2); + } else { + e.vpaddsw(dest, src1, src2); + } + } else { + e.vpaddw(dest, src1, src2); + } + break; + case INT32_TYPE: + if (saturate) { + if (is_unsigned) { + // xmm0 is the only temp register that can be used by + // src1/src2. 
+ e.vpaddd(e.xmm1, src1, src2); + + // If result is smaller than either of the inputs, we've + // overflowed (only need to check one input) + // if (src1 > res) then overflowed + // http://locklessinc.com/articles/sat_arithmetic/ + e.vpxor(e.xmm2, src1, e.GetXmmConstPtr(XMMSignMaskI32)); + e.vpxor(e.xmm0, e.xmm1, e.GetXmmConstPtr(XMMSignMaskI32)); + e.vpcmpgtd(e.xmm0, e.xmm2, e.xmm0); + e.vpor(dest, e.xmm1, e.xmm0); + } else { + e.vpaddd(e.xmm1, src1, src2); + + // Overflow results if two inputs are the same sign and the + // result isn't the same sign. if ((s32b)(~(src1 ^ src2) & + // (src1 ^ res)) < 0) then overflowed + // http://locklessinc.com/articles/sat_arithmetic/ + e.vpxor(e.xmm2, src1, src2); + e.vpxor(e.xmm3, src1, e.xmm1); + e.vpandn(e.xmm2, e.xmm2, e.xmm3); + + // Set any negative overflowed elements of src1 to INT_MIN + e.vpand(e.xmm3, src1, e.xmm2); + e.vblendvps(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMSignMaskI32), + e.xmm3); + + // Set any positive overflowed elements of src1 to INT_MAX + e.vpandn(e.xmm3, src1, e.xmm2); + e.vblendvps(dest, e.xmm1, e.GetXmmConstPtr(XMMAbsMaskPS), + e.xmm3); + } + } else { + e.vpaddd(dest, src1, src2); + } + break; + case FLOAT32_TYPE: + assert_false(is_unsigned); + assert_false(saturate); + e.vaddps(dest, src1, src2); + break; + default: + assert_unhandled_case(part_type); + break; + } + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_ADD, VECTOR_ADD); + +// ============================================================================ +// OPCODE_VECTOR_SUB +// ============================================================================ +struct VECTOR_SUB + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryXmmOp( + e, i, [&i](X64Emitter& e, const Xmm& dest, Xmm src1, Xmm src2) { + const TypeName part_type = + static_cast(i.instr->flags & 0xFF); + const uint32_t arithmetic_flags = i.instr->flags >> 8; + bool is_unsigned = !!(arithmetic_flags & ARITHMETIC_UNSIGNED); + bool 
saturate = !!(arithmetic_flags & ARITHMETIC_SATURATE); + switch (part_type) { + case INT8_TYPE: + if (saturate) { + // TODO(benvanik): trace DID_SATURATE + if (is_unsigned) { + e.vpsubusb(dest, src1, src2); + } else { + e.vpsubsb(dest, src1, src2); + } + } else { + e.vpsubb(dest, src1, src2); + } + break; + case INT16_TYPE: + if (saturate) { + // TODO(benvanik): trace DID_SATURATE + if (is_unsigned) { + e.vpsubusw(dest, src1, src2); + } else { + e.vpsubsw(dest, src1, src2); + } + } else { + e.vpsubw(dest, src1, src2); + } + break; + case INT32_TYPE: + if (saturate) { + if (is_unsigned) { + // xmm0 is the only temp register that can be used by + // src1/src2. + e.vpsubd(e.xmm1, src1, src2); + + // If result is greater than either of the inputs, we've + // underflowed (only need to check one input) + // if (res > src1) then underflowed + // http://locklessinc.com/articles/sat_arithmetic/ + e.vpxor(e.xmm2, src1, e.GetXmmConstPtr(XMMSignMaskI32)); + e.vpxor(e.xmm0, e.xmm1, e.GetXmmConstPtr(XMMSignMaskI32)); + e.vpcmpgtd(e.xmm0, e.xmm0, e.xmm2); + e.vpandn(dest, e.xmm0, e.xmm1); + } else { + e.vpsubd(e.xmm1, src1, src2); + + // We can only overflow if the signs of the operands are + // opposite. If signs are opposite and result sign isn't the + // same as src1's sign, we've overflowed. 
if ((s32b)((src1 ^ + // src2) & (src1 ^ res)) < 0) then overflowed + // http://locklessinc.com/articles/sat_arithmetic/ + e.vpxor(e.xmm2, src1, src2); + e.vpxor(e.xmm3, src1, e.xmm1); + e.vpand(e.xmm2, e.xmm2, e.xmm3); + + // Set any negative overflowed elements of src1 to INT_MIN + e.vpand(e.xmm3, src1, e.xmm2); + e.vblendvps(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMSignMaskI32), + e.xmm3); + + // Set any positive overflowed elements of src1 to INT_MAX + e.vpandn(e.xmm3, src1, e.xmm2); + e.vblendvps(dest, e.xmm1, e.GetXmmConstPtr(XMMAbsMaskPS), + e.xmm3); + } + } else { + e.vpsubd(dest, src1, src2); + } + break; + case FLOAT32_TYPE: + e.vsubps(dest, src1, src2); + break; + default: + assert_unhandled_case(part_type); + break; + } + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SUB, VECTOR_SUB); + +// ============================================================================ +// OPCODE_VECTOR_SHL +// ============================================================================ +template ::value, int> = 0> +static __m128i EmulateVectorShl(void*, __m128i src1, __m128i src2) { + alignas(16) T value[16 / sizeof(T)]; + alignas(16) T shamt[16 / sizeof(T)]; + + // Load SSE registers into a C array. + _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); + _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); + + for (size_t i = 0; i < (16 / sizeof(T)); ++i) { + value[i] = value[i] << (shamt[i] & ((sizeof(T) * 8) - 1)); + } + + // Store result and return it. + return _mm_load_si128(reinterpret_cast<__m128i*>(value)); +} + +struct VECTOR_SHL_V128 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags) { + case INT8_TYPE: + EmitInt8(e, i); + break; + case INT16_TYPE: + EmitInt16(e, i); + break; + case INT32_TYPE: + EmitInt32(e, i); + break; + default: + assert_always(); + break; + } + } + + static void EmitInt8(X64Emitter& e, const EmitArgType& i) { + // TODO(benvanik): native version (with shift magic). 
+ if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.GetNativeParam(1), e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); + } + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShl)); + e.vmovaps(i.dest, e.xmm0); + } + + static void EmitInt16(X64Emitter& e, const EmitArgType& i) { + Xmm src1; + if (i.src1.is_constant) { + src1 = e.xmm2; + e.LoadConstantXmm(src1, i.src1.constant()); + } else { + src1 = i.src1; + } + + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 8 - n; ++n) { + if (shamt.u16[n] != shamt.u16[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use vpsllw. + e.vpsllw(i.dest, src1, shamt.u16[0] & 0xF); + return; + } + } + + // Shift 8 words in src1 by amount specified in src2. + Xbyak::Label emu, end; + + // Only bother with this check if shift amt isn't constant. + if (!i.src2.is_constant) { + // See if the shift is equal first for a shortcut. + e.vpshuflw(e.xmm0, i.src2, 0b00000000); + e.vpshufd(e.xmm0, e.xmm0, 0b00000000); + e.vpxor(e.xmm1, e.xmm0, i.src2); + e.vptest(e.xmm1, e.xmm1); + e.jnz(emu); + + // Equal. Shift using vpsllw. + e.mov(e.rax, 0xF); + e.vmovq(e.xmm1, e.rax); + e.vpand(e.xmm0, e.xmm0, e.xmm1); + e.vpsllw(i.dest, src1, e.xmm0); + e.jmp(end); + } + + // TODO(benvanik): native version (with shift magic). 
+ e.L(emu); + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.GetNativeParam(1), e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); + } + e.lea(e.GetNativeParam(0), e.StashXmm(0, src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShl)); + e.vmovaps(i.dest, e.xmm0); + + e.L(end); + } + + static void EmitInt32(X64Emitter& e, const EmitArgType& i) { + Xmm src1; + if (i.src1.is_constant) { + src1 = e.xmm2; + e.LoadConstantXmm(src1, i.src1.constant()); + } else { + src1 = i.src1; + } + + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 4 - n; ++n) { + if (shamt.u32[n] != shamt.u32[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use vpslld. + e.vpslld(i.dest, src1, shamt.u8[0] & 0x1F); + return; + } + } + + if (e.IsFeatureEnabled(kX64EmitAVX2)) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + // Counts differ, so pre-mask and load constant. + vec128_t masked = i.src2.constant(); + for (size_t n = 0; n < 4; ++n) { + masked.u32[n] &= 0x1F; + } + e.LoadConstantXmm(e.xmm0, masked); + e.vpsllvd(i.dest, src1, e.xmm0); + } else { + // Fully variable shift. + // src shift mask may have values >31, and x86 sets to zero when + // that happens so we mask. + e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS)); + e.vpsllvd(i.dest, src1, e.xmm0); + } + } else { + // Shift 4 words in src1 by amount specified in src2. + Xbyak::Label emu, end; + + // See if the shift is equal first for a shortcut. + // Only bother with this check if shift amt isn't constant. + if (!i.src2.is_constant) { + e.vpshufd(e.xmm0, i.src2, 0b00000000); + e.vpxor(e.xmm1, e.xmm0, i.src2); + e.vptest(e.xmm1, e.xmm1); + e.jnz(emu); + + // Equal. Shift using vpsrad. 
+ e.mov(e.rax, 0x1F); + e.vmovq(e.xmm1, e.rax); + e.vpand(e.xmm0, e.xmm0, e.xmm1); + e.vpslld(i.dest, src1, e.xmm0); + e.jmp(end); + } + + // TODO(benvanik): native version (with shift magic). + e.L(emu); + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.GetNativeParam(1), e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); + } + e.lea(e.GetNativeParam(0), e.StashXmm(0, src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShl)); + e.vmovaps(i.dest, e.xmm0); + + e.L(end); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHL, VECTOR_SHL_V128); + +// ============================================================================ +// OPCODE_VECTOR_SHR +// ============================================================================ +template ::value, int> = 0> +static __m128i EmulateVectorShr(void*, __m128i src1, __m128i src2) { + alignas(16) T value[16 / sizeof(T)]; + alignas(16) T shamt[16 / sizeof(T)]; + + // Load SSE registers into a C array. + _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); + _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); + + for (size_t i = 0; i < (16 / sizeof(T)); ++i) { + value[i] = value[i] >> (shamt[i] & ((sizeof(T) * 8) - 1)); + } + + // Store result and return it. + return _mm_load_si128(reinterpret_cast<__m128i*>(value)); +} + +struct VECTOR_SHR_V128 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags) { + case INT8_TYPE: + EmitInt8(e, i); + break; + case INT16_TYPE: + EmitInt16(e, i); + break; + case INT32_TYPE: + EmitInt32(e, i); + break; + default: + assert_always(); + break; + } + } + + static void EmitInt8(X64Emitter& e, const EmitArgType& i) { + // TODO(benvanik): native version (with shift magic). 
+ if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.GetNativeParam(1), e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); + } + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShr)); + e.vmovaps(i.dest, e.xmm0); + } + + static void EmitInt16(X64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 8 - n; ++n) { + if (shamt.u16[n] != shamt.u16[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use vpsllw. + e.vpsrlw(i.dest, i.src1, shamt.u16[0] & 0xF); + return; + } + } + + // Shift 8 words in src1 by amount specified in src2. + Xbyak::Label emu, end; + + // See if the shift is equal first for a shortcut. + // Only bother with this check if shift amt isn't constant. + if (!i.src2.is_constant) { + e.vpshuflw(e.xmm0, i.src2, 0b00000000); + e.vpshufd(e.xmm0, e.xmm0, 0b00000000); + e.vpxor(e.xmm1, e.xmm0, i.src2); + e.vptest(e.xmm1, e.xmm1); + e.jnz(emu); + + // Equal. Shift using vpsrlw. + e.mov(e.rax, 0xF); + e.vmovq(e.xmm1, e.rax); + e.vpand(e.xmm0, e.xmm0, e.xmm1); + e.vpsrlw(i.dest, i.src1, e.xmm0); + e.jmp(end); + } + + // TODO(benvanik): native version (with shift magic). 
+ e.L(emu); + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.GetNativeParam(1), e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); + } + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShr)); + e.vmovaps(i.dest, e.xmm0); + + e.L(end); + } + + static void EmitInt32(X64Emitter& e, const EmitArgType& i) { + Xmm src1; + if (i.src1.is_constant) { + src1 = e.xmm2; + e.LoadConstantXmm(src1, i.src1.constant()); + } else { + src1 = i.src1; + } + + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 4 - n; ++n) { + if (shamt.u32[n] != shamt.u32[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use vpsrld. + e.vpsrld(i.dest, src1, shamt.u8[0] & 0x1F); + return; + } else { + if (e.IsFeatureEnabled(kX64EmitAVX2)) { + // Counts differ, so pre-mask and load constant. + vec128_t masked = i.src2.constant(); + for (size_t n = 0; n < 4; ++n) { + masked.u32[n] &= 0x1F; + } + e.LoadConstantXmm(e.xmm0, masked); + e.vpsrlvd(i.dest, src1, e.xmm0); + return; + } + } + } + + if (e.IsFeatureEnabled(kX64EmitAVX2)) { + // Fully variable shift. + // src shift mask may have values >31, and x86 sets to zero when + // that happens so we mask. + e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS)); + e.vpsrlvd(i.dest, src1, e.xmm0); + } else { + // Shift 4 words in src1 by amount specified in src2. + Xbyak::Label emu, end; + + // See if the shift is equal first for a shortcut. + // Only bother with this check if shift amt isn't constant. + if (!i.src2.is_constant) { + e.vpshufd(e.xmm0, i.src2, 0b00000000); + e.vpxor(e.xmm1, e.xmm0, i.src2); + e.vptest(e.xmm1, e.xmm1); + e.jnz(emu); + + // Equal. Shift using vpsrld. 
+ e.mov(e.rax, 0x1F); + e.vmovq(e.xmm1, e.rax); + e.vpand(e.xmm0, e.xmm0, e.xmm1); + e.vpsrld(i.dest, src1, e.xmm0); + e.jmp(end); + } + + // TODO(benvanik): native version. + e.L(emu); + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.GetNativeParam(1), e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); + } + e.lea(e.GetNativeParam(0), e.StashXmm(0, src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShr)); + e.vmovaps(i.dest, e.xmm0); + + e.L(end); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHR, VECTOR_SHR_V128); + +// ============================================================================ +// OPCODE_VECTOR_SHA +// ============================================================================ +struct VECTOR_SHA_V128 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags) { + case INT8_TYPE: + EmitInt8(e, i); + break; + case INT16_TYPE: + EmitInt16(e, i); + break; + case INT32_TYPE: + EmitInt32(e, i); + break; + default: + assert_always(); + break; + } + } + + static void EmitInt8(X64Emitter& e, const EmitArgType& i) { + // TODO(benvanik): native version (with shift magic). + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.GetNativeParam(1), e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); + } + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShr)); + e.vmovaps(i.dest, e.xmm0); + } + + static void EmitInt16(X64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 8 - n; ++n) { + if (shamt.u16[n] != shamt.u16[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use vpsraw. 
+ e.vpsraw(i.dest, i.src1, shamt.u16[0] & 0xF); + return; + } + } + + // Shift 8 words in src1 by amount specified in src2. + Xbyak::Label emu, end; + + // See if the shift is equal first for a shortcut. + // Only bother with this check if shift amt isn't constant. + if (!i.src2.is_constant) { + e.vpshuflw(e.xmm0, i.src2, 0b00000000); + e.vpshufd(e.xmm0, e.xmm0, 0b00000000); + e.vpxor(e.xmm1, e.xmm0, i.src2); + e.vptest(e.xmm1, e.xmm1); + e.jnz(emu); + + // Equal. Shift using vpsraw. + e.mov(e.rax, 0xF); + e.vmovq(e.xmm1, e.rax); + e.vpand(e.xmm0, e.xmm0, e.xmm1); + e.vpsraw(i.dest, i.src1, e.xmm0); + e.jmp(end); + } + + // TODO(benvanik): native version (with shift magic). + e.L(emu); + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.GetNativeParam(1), e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); + } + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShr)); + e.vmovaps(i.dest, e.xmm0); + + e.L(end); + } + + static void EmitInt32(X64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 4 - n; ++n) { + if (shamt.u32[n] != shamt.u32[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use vpsrad. + e.vpsrad(i.dest, i.src1, shamt.u32[0] & 0x1F); + return; + } + } + + if (e.IsFeatureEnabled(kX64EmitAVX2)) { + // src shift mask may have values >31, and x86 sets to zero when + // that happens so we mask. + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.vandps(e.xmm0, e.GetXmmConstPtr(XMMShiftMaskPS)); + } else { + e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS)); + } + e.vpsravd(i.dest, i.src1, e.xmm0); + } else { + // Shift 4 words in src1 by amount specified in src2. + Xbyak::Label emu, end; + + // See if the shift is equal first for a shortcut. 
+ // Only bother with this check if shift amt isn't constant. + if (!i.src2.is_constant) { + e.vpshufd(e.xmm0, i.src2, 0b00000000); + e.vpxor(e.xmm1, e.xmm0, i.src2); + e.vptest(e.xmm1, e.xmm1); + e.jnz(emu); + + // Equal. Shift using vpsrad. + e.mov(e.rax, 0x1F); + e.vmovq(e.xmm1, e.rax); + e.vpand(e.xmm0, e.xmm0, e.xmm1); + e.vpsrad(i.dest, i.src1, e.xmm0); + e.jmp(end); + } + + // TODO(benvanik): native version. + e.L(emu); + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.GetNativeParam(1), e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); + } + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShr)); + e.vmovaps(i.dest, e.xmm0); + + e.L(end); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHA, VECTOR_SHA_V128); + +// ============================================================================ +// OPCODE_VECTOR_ROTATE_LEFT +// ============================================================================ +template ::value, int> = 0> +static __m128i EmulateVectorRotateLeft(void*, __m128i src1, __m128i src2) { + alignas(16) T value[16 / sizeof(T)]; + alignas(16) T shamt[16 / sizeof(T)]; + + // Load SSE registers into a C array. + _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); + _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); + + for (size_t i = 0; i < (16 / sizeof(T)); ++i) { + value[i] = xe::rotate_left(value[i], shamt[i] & ((sizeof(T) * 8) - 1)); + } + + // Store result and return it. + return _mm_load_si128(reinterpret_cast<__m128i*>(value)); +} + +// TODO(benvanik): AVX512 has a native variable rotate (rolv). +struct VECTOR_ROTATE_LEFT_V128 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags) { + case INT8_TYPE: + // TODO(benvanik): native version (with shift magic). 
+ e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.GetNativeParam(1), e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); + } + e.CallNativeSafe( + reinterpret_cast(EmulateVectorRotateLeft)); + e.vmovaps(i.dest, e.xmm0); + break; + case INT16_TYPE: + // TODO(benvanik): native version (with shift magic). + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.GetNativeParam(1), e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); + } + e.CallNativeSafe( + reinterpret_cast(EmulateVectorRotateLeft)); + e.vmovaps(i.dest, e.xmm0); + break; + case INT32_TYPE: { + if (e.IsFeatureEnabled(kX64EmitAVX2)) { + Xmm temp = i.dest; + if (i.dest == i.src1 || i.dest == i.src2) { + temp = e.xmm2; + } + // Shift left (to get high bits): + e.vpand(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS)); + e.vpsllvd(e.xmm1, i.src1, e.xmm0); + // Shift right (to get low bits): + e.vmovaps(temp, e.GetXmmConstPtr(XMMPI32)); + e.vpsubd(temp, e.xmm0); + e.vpsrlvd(i.dest, i.src1, temp); + // Merge: + e.vpor(i.dest, e.xmm1); + } else { + // TODO(benvanik): non-AVX2 native version. 
+ e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.GetNativeParam(1), e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); + } + e.CallNativeSafe( + reinterpret_cast(EmulateVectorRotateLeft)); + e.vmovaps(i.dest, e.xmm0); + } + break; + } + default: + assert_always(); + break; + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_ROTATE_LEFT, VECTOR_ROTATE_LEFT_V128); + +// ============================================================================ +// OPCODE_VECTOR_AVERAGE +// ============================================================================ +template ::value, int> = 0> +static __m128i EmulateVectorAverage(void*, __m128i src1, __m128i src2) { + alignas(16) T src1v[16 / sizeof(T)]; + alignas(16) T src2v[16 / sizeof(T)]; + alignas(16) T value[16 / sizeof(T)]; + + // Load SSE registers into a C array. + _mm_store_si128(reinterpret_cast<__m128i*>(src1v), src1); + _mm_store_si128(reinterpret_cast<__m128i*>(src2v), src2); + + for (size_t i = 0; i < (16 / sizeof(T)); ++i) { + auto t = (uint64_t(src1v[i]) + uint64_t(src2v[i]) + 1) / 2; + value[i] = T(t); + } + + // Store result and return it. + return _mm_load_si128(reinterpret_cast<__m128i*>(value)); +} + +struct VECTOR_AVERAGE + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryXmmOp( + e, i, + [&i](X64Emitter& e, const Xmm& dest, const Xmm& src1, const Xmm& src2) { + const TypeName part_type = + static_cast(i.instr->flags & 0xFF); + const uint32_t arithmetic_flags = i.instr->flags >> 8; + bool is_unsigned = !!(arithmetic_flags & ARITHMETIC_UNSIGNED); + switch (part_type) { + case INT8_TYPE: + if (is_unsigned) { + e.vpavgb(dest, src1, src2); + } else { + assert_always(); + } + break; + case INT16_TYPE: + if (is_unsigned) { + e.vpavgw(dest, src1, src2); + } else { + assert_always(); + } + break; + case INT32_TYPE: + // No 32bit averages in AVX. 
+ if (is_unsigned) { + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.GetNativeParam(1), e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); + } + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); + e.CallNativeSafe( + reinterpret_cast(EmulateVectorAverage)); + e.vmovaps(i.dest, e.xmm0); + } else { + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.GetNativeParam(1), e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); + } + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); + e.CallNativeSafe( + reinterpret_cast(EmulateVectorAverage)); + e.vmovaps(i.dest, e.xmm0); + } + break; + default: + assert_unhandled_case(part_type); + break; + } + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_AVERAGE, VECTOR_AVERAGE); + +// ============================================================================ +// OPCODE_INSERT +// ============================================================================ +struct INSERT_I8 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.is_constant); + e.vpinsrb(i.dest, i.src3.reg().cvt32(), i.src2.constant() ^ 0x3); + } +}; +struct INSERT_I16 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.is_constant); + e.vpinsrw(i.dest, i.src3.reg().cvt32(), i.src2.constant() ^ 0x1); + } +}; +struct INSERT_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.is_constant); + e.vpinsrd(i.dest, i.src3, i.src2.constant()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_INSERT, INSERT_I8, INSERT_I16, INSERT_I32); + +// ============================================================================ +// OPCODE_EXTRACT +// ============================================================================ +// TODO(benvanik): sequence extract/splat: +// v0.i32 = extract v0.v128, 0 +// v0.v128 = splat v0.i32 
+// This can be a single broadcast. +struct EXTRACT_I8 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + e.vpextrb(i.dest.reg().cvt32(), i.src1, VEC128_B(i.src2.constant())); + } else { + e.mov(e.eax, 0x00000003); + e.xor_(e.al, i.src2); + e.and_(e.al, 0x1F); + e.vmovd(e.xmm0, e.eax); + e.vpshufb(e.xmm0, i.src1, e.xmm0); + e.vmovd(i.dest.reg().cvt32(), e.xmm0); + e.and_(i.dest, uint8_t(0xFF)); + } + } +}; +struct EXTRACT_I16 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + e.vpextrw(i.dest.reg().cvt32(), i.src1, VEC128_W(i.src2.constant())); + } else { + e.mov(e.al, i.src2); + e.xor_(e.al, 0x01); + e.shl(e.al, 1); + e.mov(e.ah, e.al); + e.add(e.ah, 1); + e.vmovd(e.xmm0, e.eax); + e.vpshufb(e.xmm0, i.src1, e.xmm0); + e.vmovd(i.dest.reg().cvt32(), e.xmm0); + e.and_(i.dest.reg().cvt32(), 0xFFFFu); + } + } +}; +struct EXTRACT_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + static const vec128_t extract_table_32[4] = { + vec128b(3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), + vec128b(7, 6, 5, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), + vec128b(11, 10, 9, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), + vec128b(15, 14, 13, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), + }; + if (i.src2.is_constant) { + // TODO(gibbed): add support to constant propagation pass for + // OPCODE_EXTRACT. 
+ Xmm src1; + if (i.src1.is_constant) { + src1 = e.xmm0; + e.LoadConstantXmm(src1, i.src1.constant()); + } else { + src1 = i.src1; + } + if (i.src2.constant() == 0) { + e.vmovd(i.dest, src1); + } else { + e.vpextrd(i.dest, src1, VEC128_D(i.src2.constant())); + } + } else { + // TODO(benvanik): try out hlide's version: + // e.mov(e.eax, 3); + // e.and_(e.al, i.src2); // eax = [(i&3), 0, 0, 0] + // e.imul(e.eax, 0x04040404); // [(i&3)*4, (i&3)*4, (i&3)*4, (i&3)*4] + // e.add(e.eax, 0x00010203); // [((i&3)*4)+3, ((i&3)*4)+2, ((i&3)*4)+1, + // ((i&3)*4)+0] + // e.vmovd(e.xmm0, e.eax); + // e.vpshufb(e.xmm0, i.src1, e.xmm0); + // e.vmovd(i.dest.reg().cvt32(), e.xmm0); + // Get the desired word in xmm0, then extract that. + Xmm src1; + if (i.src1.is_constant) { + src1 = e.xmm1; + e.LoadConstantXmm(src1, i.src1.constant()); + } else { + src1 = i.src1.reg(); + } + + e.xor_(e.rax, e.rax); + e.mov(e.al, i.src2); + e.and_(e.al, 0x03); + e.shl(e.al, 4); + e.mov(e.rdx, reinterpret_cast(extract_table_32)); + e.vmovaps(e.xmm0, e.ptr[e.rdx + e.rax]); + e.vpshufb(e.xmm0, src1, e.xmm0); + e.vpextrd(i.dest, e.xmm0, 0); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_EXTRACT, EXTRACT_I8, EXTRACT_I16, EXTRACT_I32); + +// ============================================================================ +// OPCODE_SPLAT +// ============================================================================ +// Copy a value into all elements of a vector +struct SPLAT_I8 : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (i.src1.is_constant) { + // TODO(benvanik): faster constant splats. 
+ e.mov(e.eax, i.src1.constant()); + e.vmovd(e.xmm0, e.eax); + } else { + e.vmovd(e.xmm0, i.src1.reg().cvt32()); + } + + if (e.IsFeatureEnabled(kX64EmitAVX2)) { + e.vpbroadcastb(i.dest, e.xmm0); + } else { + e.vpunpcklbw(e.xmm0, e.xmm0); + e.vpunpcklwd(e.xmm0, e.xmm0); + e.vpshufd(i.dest, e.xmm0, 0); + } + } +}; +struct SPLAT_I16 : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (i.src1.is_constant) { + // TODO(benvanik): faster constant splats. + e.mov(e.eax, i.src1.constant()); + e.vmovd(e.xmm0, e.eax); + } else { + e.vmovd(e.xmm0, i.src1.reg().cvt32()); + } + + if (e.IsFeatureEnabled(kX64EmitAVX2)) { + e.vpbroadcastw(i.dest, e.xmm0); + } else { + e.vpunpcklwd(e.xmm0, e.xmm0); // unpack low word data + e.vpshufd(i.dest, e.xmm0, 0); + } + } +}; +struct SPLAT_I32 : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (i.src1.is_constant) { + // TODO(benvanik): faster constant splats. + e.mov(e.eax, i.src1.constant()); + e.vmovd(e.xmm0, e.eax); + } else { + e.vmovd(e.xmm0, i.src1); + } + + if (e.IsFeatureEnabled(kX64EmitAVX2)) { + e.vpbroadcastd(i.dest, e.xmm0); + } else { + e.vpshufd(i.dest, e.xmm0, 0); + } + } +}; +struct SPLAT_F32 : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (e.IsFeatureEnabled(kX64EmitAVX2)) { + if (i.src1.is_constant) { + // TODO(benvanik): faster constant splats. 
+ e.mov(e.eax, i.src1.value->constant.i32); + e.vmovd(e.xmm0, e.eax); + e.vbroadcastss(i.dest, e.xmm0); + } else { + e.vbroadcastss(i.dest, i.src1); + } + } else { + if (i.src1.is_constant) { + e.mov(e.eax, i.src1.value->constant.i32); + e.vmovd(i.dest, e.eax); + e.vshufps(i.dest, i.dest, i.dest, 0); + } else { + e.vshufps(i.dest, i.src1, i.src1, 0); + } + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_SPLAT, SPLAT_I8, SPLAT_I16, SPLAT_I32, SPLAT_F32); + +// ============================================================================ +// OPCODE_PERMUTE +// ============================================================================ +struct PERMUTE_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + assert_true(i.instr->flags == INT32_TYPE); + // Permute words between src2 and src3. + // TODO(benvanik): check src3 for zero. if 0, we can use pshufb. + if (i.src1.is_constant) { + uint32_t control = i.src1.constant(); + // Shuffle things into the right places in dest & xmm0, + // then we blend them together. + uint32_t src_control = + (((control >> 24) & 0x3) << 6) | (((control >> 16) & 0x3) << 4) | + (((control >> 8) & 0x3) << 2) | (((control >> 0) & 0x3) << 0); + + uint32_t blend_control = 0; + if (e.IsFeatureEnabled(kX64EmitAVX2)) { + // Blender for vpblendd + blend_control = + (((control >> 26) & 0x1) << 3) | (((control >> 18) & 0x1) << 2) | + (((control >> 10) & 0x1) << 1) | (((control >> 2) & 0x1) << 0); + } else { + // Blender for vpblendw + blend_control = + (((control >> 26) & 0x1) << 6) | (((control >> 18) & 0x1) << 4) | + (((control >> 10) & 0x1) << 2) | (((control >> 2) & 0x1) << 0); + blend_control |= blend_control << 1; + } + + // TODO(benvanik): if src2/src3 are constants, shuffle now! 
+ Xmm src2; + if (i.src2.is_constant) { + src2 = e.xmm1; + e.LoadConstantXmm(src2, i.src2.constant()); + } else { + src2 = i.src2; + } + Xmm src3; + if (i.src3.is_constant) { + src3 = e.xmm2; + e.LoadConstantXmm(src3, i.src3.constant()); + } else { + src3 = i.src3; + } + if (i.dest != src3) { + e.vpshufd(i.dest, src2, src_control); + e.vpshufd(e.xmm0, src3, src_control); + } else { + e.vmovaps(e.xmm0, src3); + e.vpshufd(i.dest, src2, src_control); + e.vpshufd(e.xmm0, e.xmm0, src_control); + } + + if (e.IsFeatureEnabled(kX64EmitAVX2)) { + e.vpblendd(i.dest, e.xmm0, blend_control); // $0 = $1 $2 + } else { + e.vpblendw(i.dest, e.xmm0, blend_control); // $0 = $1 $2 + } + } else { + // Permute by non-constant. + assert_always(); + } + } +}; +struct PERMUTE_V128 + : Sequence> { + static void EmitByInt8(X64Emitter& e, const EmitArgType& i) { + // TODO(benvanik): find out how to do this with only one temp register! + // Permute bytes between src2 and src3. + // src1 is an array of indices corresponding to positions within src2 and + // src3. + if (i.src3.value->IsConstantZero()) { + // Permuting with src2/zero, so just shuffle/mask. + if (i.src2.value->IsConstantZero()) { + // src2 & src3 are zero, so result will always be zero. + e.vpxor(i.dest, i.dest); + } else { + // Control mask needs to be shuffled. + if (i.src1.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src1.constant()); + e.vxorps(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMSwapWordMask)); + } else { + e.vxorps(e.xmm0, i.src1, e.GetXmmConstPtr(XMMSwapWordMask)); + } + e.vpand(e.xmm0, e.GetXmmConstPtr(XMMPermuteByteMask)); + if (i.src2.is_constant) { + e.LoadConstantXmm(i.dest, i.src2.constant()); + e.vpshufb(i.dest, i.dest, e.xmm0); + } else { + e.vpshufb(i.dest, i.src2, e.xmm0); + } + // Build a mask with values in src2 having 0 and values in src3 having + // 1. + e.vpcmpgtb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMPermuteControl15)); + e.vpandn(i.dest, e.xmm0, i.dest); + } + } else { + // General permute. 
+ // Control mask needs to be shuffled. + // TODO(benvanik): do constants here instead of in generated code. + if (i.src1.is_constant) { + e.LoadConstantXmm(e.xmm2, i.src1.constant()); + e.vxorps(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMSwapWordMask)); + } else { + e.vxorps(e.xmm2, i.src1, e.GetXmmConstPtr(XMMSwapWordMask)); + } + e.vpand(e.xmm2, e.GetXmmConstPtr(XMMPermuteByteMask)); + Xmm src2_shuf = e.xmm0; + if (i.src2.value->IsConstantZero()) { + e.vpxor(src2_shuf, src2_shuf); + } else if (i.src2.is_constant) { + e.LoadConstantXmm(src2_shuf, i.src2.constant()); + e.vpshufb(src2_shuf, src2_shuf, e.xmm2); + } else { + e.vpshufb(src2_shuf, i.src2, e.xmm2); + } + Xmm src3_shuf = e.xmm1; + if (i.src3.value->IsConstantZero()) { + e.vpxor(src3_shuf, src3_shuf); + } else if (i.src3.is_constant) { + e.LoadConstantXmm(src3_shuf, i.src3.constant()); + e.vpshufb(src3_shuf, src3_shuf, e.xmm2); + } else { + e.vpshufb(src3_shuf, i.src3, e.xmm2); + } + // Build a mask with values in src2 having 0 and values in src3 having 1. + e.vpcmpgtb(i.dest, e.xmm2, e.GetXmmConstPtr(XMMPermuteControl15)); + e.vpblendvb(i.dest, src2_shuf, src3_shuf, i.dest); + } + } + + static void EmitByInt16(X64Emitter& e, const EmitArgType& i) { + // src1 is an array of indices corresponding to positions within src2 and + // src3. + assert_true(i.src1.is_constant); + vec128_t perm = (i.src1.constant() & vec128s(0xF)) ^ vec128s(0x1); + vec128_t perm_ctrl = vec128b(0); + for (int i = 0; i < 8; i++) { + perm_ctrl.i16[i] = perm.i16[i] > 7 ? 
-1 : 0; + + auto v = uint8_t(perm.u16[i]); + perm.u8[i * 2] = v * 2; + perm.u8[i * 2 + 1] = v * 2 + 1; + } + e.LoadConstantXmm(e.xmm0, perm); + + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm1, i.src2.constant()); + } else { + e.vmovdqa(e.xmm1, i.src2); + } + if (i.src3.is_constant) { + e.LoadConstantXmm(e.xmm2, i.src3.constant()); + } else { + e.vmovdqa(e.xmm2, i.src3); + } + + e.vpshufb(e.xmm1, e.xmm1, e.xmm0); + e.vpshufb(e.xmm2, e.xmm2, e.xmm0); + + uint8_t mask = 0; + for (int i = 0; i < 8; i++) { + if (perm_ctrl.i16[i] == 0) { + mask |= 1 << (7 - i); + } + } + e.vpblendw(i.dest, e.xmm1, e.xmm2, mask); + } + + static void EmitByInt32(X64Emitter& e, const EmitArgType& i) { + assert_always(); + } + + static void Emit(X64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags) { + case INT8_TYPE: + EmitByInt8(e, i); + break; + case INT16_TYPE: + EmitByInt16(e, i); + break; + case INT32_TYPE: + EmitByInt32(e, i); + break; + default: + assert_unhandled_case(i.instr->flags); + return; + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_PERMUTE, PERMUTE_I32, PERMUTE_V128); + +// ============================================================================ +// OPCODE_SWIZZLE +// ============================================================================ +struct SWIZZLE + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto element_type = i.instr->flags; + if (element_type == INT8_TYPE) { + assert_always(); + } else if (element_type == INT16_TYPE) { + assert_always(); + } else if (element_type == INT32_TYPE || element_type == FLOAT32_TYPE) { + uint8_t swizzle_mask = static_cast(i.src2.value); + Xmm src1; + if (i.src1.is_constant) { + src1 = e.xmm0; + e.LoadConstantXmm(src1, i.src1.constant()); + } else { + src1 = i.src1; + } + e.vpshufd(i.dest, src1, swizzle_mask); + } else if (element_type == INT64_TYPE || element_type == FLOAT64_TYPE) { + assert_always(); + } else { + assert_always(); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_SWIZZLE, 
SWIZZLE); + +// ============================================================================ +// OPCODE_PACK +// ============================================================================ +struct PACK : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags & PACK_TYPE_MODE) { + case PACK_TYPE_D3DCOLOR: + EmitD3DCOLOR(e, i); + break; + case PACK_TYPE_FLOAT16_2: + EmitFLOAT16_2(e, i); + break; + case PACK_TYPE_FLOAT16_4: + EmitFLOAT16_4(e, i); + break; + case PACK_TYPE_SHORT_2: + EmitSHORT_2(e, i); + break; + case PACK_TYPE_SHORT_4: + EmitSHORT_4(e, i); + break; + case PACK_TYPE_UINT_2101010: + EmitUINT_2101010(e, i); + break; + case PACK_TYPE_8_IN_16: + Emit8_IN_16(e, i, i.instr->flags); + break; + case PACK_TYPE_16_IN_32: + Emit16_IN_32(e, i, i.instr->flags); + break; + default: + assert_unhandled_case(i.instr->flags); + break; + } + } + static void EmitD3DCOLOR(X64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->IsConstantZero()); + Xmm src; + if (i.src1.is_constant) { + src = i.dest; + e.LoadConstantXmm(src, i.src1.constant()); + } else { + src = i.src1; + } + // Saturate to [3,3....] so that only values between 3...[00] and 3...[FF] + // are valid. + e.vminps(i.dest, src, e.GetXmmConstPtr(XMMPackD3DCOLORSat)); + e.vmaxps(i.dest, i.dest, e.GetXmmConstPtr(XMM3333)); + // Extract bytes. 
+ // RGBA (XYZW) -> ARGB (WXYZ) + // w = ((src1.uw & 0xFF) << 24) | ((src1.ux & 0xFF) << 16) | + // ((src1.uy & 0xFF) << 8) | (src1.uz & 0xFF) + e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackD3DCOLOR)); + } + static __m128i EmulateFLOAT16_2(void*, __m128 src1) { + alignas(16) float a[4]; + alignas(16) uint16_t b[8]; + _mm_store_ps(a, src1); + std::memset(b, 0, sizeof(b)); + + for (int i = 0; i < 2; i++) { + b[7 - i] = half_float::detail::float2half(a[i]); + } + + return _mm_load_si128(reinterpret_cast<__m128i*>(b)); + } + static void EmitFLOAT16_2(X64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->IsConstantZero()); + // http://blogs.msdn.com/b/chuckw/archive/2012/09/11/directxmath-f16c-and-fma.aspx + // dest = [(src1.x | src1.y), 0, 0, 0] + + Xmm src; + if (e.IsFeatureEnabled(kX64EmitF16C)) { + if (i.src1.is_constant) { + src = i.dest; + e.LoadConstantXmm(src, i.src1.constant()); + } else { + src = i.src1; + } + // 0|0|0|0|W|Z|Y|X + e.vcvtps2ph(i.dest, src, 0b00000011); + // Shuffle to X|Y|0|0|0|0|0|0 + e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackFLOAT16_2)); + } else { + if (i.src1.is_constant) { + src = e.xmm0; + e.LoadConstantXmm(src, i.src1.constant()); + } else { + src = i.src1; + } + e.lea(e.GetNativeParam(0), e.StashXmm(0, src)); + e.CallNativeSafe(reinterpret_cast(EmulateFLOAT16_2)); + e.vmovaps(i.dest, e.xmm0); + } + } + static __m128i EmulateFLOAT16_4(void*, __m128 src1) { + alignas(16) float a[4]; + alignas(16) uint16_t b[8]; + _mm_store_ps(a, src1); + std::memset(b, 0, sizeof(b)); + + for (int i = 0; i < 4; i++) { + b[7 - i] = half_float::detail::float2half(a[i]); + } + + return _mm_load_si128(reinterpret_cast<__m128i*>(b)); + } + static void EmitFLOAT16_4(X64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->IsConstantZero()); + // dest = [(src1.x | src1.y), (src1.z | src1.w), 0, 0] + + Xmm src; + if (e.IsFeatureEnabled(kX64EmitF16C)) { + if (i.src1.is_constant) { + src = i.dest; + e.LoadConstantXmm(src, 
i.src1.constant()); + } else { + src = i.src1; + } + // 0|0|0|0|W|Z|Y|X + e.vcvtps2ph(i.dest, src, 0b00000011); + // Shuffle to X|Y|Z|W|0|0|0|0 + e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackFLOAT16_4)); + } else { + if (i.src1.is_constant) { + src = e.xmm0; + e.LoadConstantXmm(src, i.src1.constant()); + } else { + src = i.src1; + } + e.lea(e.GetNativeParam(0), e.StashXmm(0, src)); + e.CallNativeSafe(reinterpret_cast(EmulateFLOAT16_4)); + e.vmovaps(i.dest, e.xmm0); + } + } + static void EmitSHORT_2(X64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->IsConstantZero()); + Xmm src; + if (i.src1.is_constant) { + src = i.dest; + e.LoadConstantXmm(src, i.src1.constant()); + } else { + src = i.src1; + } + // Saturate. + e.vmaxps(i.dest, src, e.GetXmmConstPtr(XMMPackSHORT_Min)); + e.vminps(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_Max)); + // Pack. + e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_2)); + } + static void EmitSHORT_4(X64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->IsConstantZero()); + Xmm src; + if (i.src1.is_constant) { + src = i.dest; + e.LoadConstantXmm(src, i.src1.constant()); + } else { + src = i.src1; + } + // Saturate. + e.vmaxps(i.dest, src, e.GetXmmConstPtr(XMMPackSHORT_Min)); + e.vminps(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_Max)); + // Pack. + e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_4)); + } + static void EmitUINT_2101010(X64Emitter& e, const EmitArgType& i) { + // https://www.opengl.org/registry/specs/ARB/vertex_type_2_10_10_10_rev.txt + // XYZ are 10 bits, signed and saturated. + // W is 2 bits, unsigned and saturated. + Xmm src; + if (i.src1.is_constant) { + src = i.dest; + e.LoadConstantXmm(src, i.src1.constant()); + } else { + src = i.src1; + } + // Saturate. + e.vmaxps(i.dest, src, e.GetXmmConstPtr(XMMPackUINT_2101010_MinUnpacked)); + e.vminps(i.dest, i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MaxUnpacked)); + // Remove the unneeded bits of the floats. 
+ e.vpand(i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MaskUnpacked)); + if (e.IsFeatureEnabled(kX64EmitAVX2)) { + // Shift the components up. + e.vpsllvd(i.dest, i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_Shift)); + } else { + // Duplicate all the components into bits 10-19. + e.vpslld(e.xmm0, i.dest, 10); + e.vpor(i.dest, e.xmm0); + // Duplicate all the components into bits 20-39 + // (so alpha will be in 30-31). + e.vpslld(e.xmm0, i.dest, 20); + e.vpor(i.dest, e.xmm0); + // Leave only the needed components. + e.vpand(i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MaskPacked)); + } + // Combine the components. + e.vshufps(e.xmm0, i.dest, i.dest, _MM_SHUFFLE(2, 3, 0, 1)); + e.vorps(i.dest, e.xmm0); + e.vshufps(e.xmm0, i.dest, i.dest, _MM_SHUFFLE(1, 0, 3, 2)); + e.vorps(i.dest, e.xmm0); + } + static __m128i EmulatePack8_IN_16_UN_UN_SAT(void*, __m128i src1, + __m128i src2) { + alignas(16) uint16_t a[8]; + alignas(16) uint16_t b[8]; + alignas(16) uint8_t c[16]; + _mm_store_si128(reinterpret_cast<__m128i*>(a), src1); + _mm_store_si128(reinterpret_cast<__m128i*>(b), src2); + for (int i = 0; i < 8; ++i) { + c[i] = uint8_t(std::max(uint16_t(0), std::min(uint16_t(255), a[i]))); + c[i + 8] = uint8_t(std::max(uint16_t(0), std::min(uint16_t(255), b[i]))); + } + return _mm_load_si128(reinterpret_cast<__m128i*>(c)); + } + static __m128i EmulatePack8_IN_16_UN_UN(void*, __m128i src1, __m128i src2) { + alignas(16) uint8_t a[16]; + alignas(16) uint8_t b[16]; + alignas(16) uint8_t c[16]; + _mm_store_si128(reinterpret_cast<__m128i*>(a), src1); + _mm_store_si128(reinterpret_cast<__m128i*>(b), src2); + for (int i = 0; i < 8; ++i) { + c[i] = a[i * 2]; + c[i + 8] = b[i * 2]; + } + return _mm_load_si128(reinterpret_cast<__m128i*>(c)); + } + static void Emit8_IN_16(X64Emitter& e, const EmitArgType& i, uint32_t flags) { + // TODO(benvanik): handle src2 (or src1) being constant zero + if (IsPackInUnsigned(flags)) { + if (IsPackOutUnsigned(flags)) { + if (IsPackOutSaturate(flags)) { + // 
unsigned -> unsigned + saturate + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.GetNativeParam(1), e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); + } + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); + e.CallNativeSafe( + reinterpret_cast(EmulatePack8_IN_16_UN_UN_SAT)); + e.vmovaps(i.dest, e.xmm0); + e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteOrderMask)); + } else { + // unsigned -> unsigned + e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulatePack8_IN_16_UN_UN)); + e.vmovaps(i.dest, e.xmm0); + e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteOrderMask)); + } + } else { + if (IsPackOutSaturate(flags)) { + // unsigned -> signed + saturate + assert_always(); + } else { + // unsigned -> signed + assert_always(); + } + } + } else { + if (IsPackOutUnsigned(flags)) { + if (IsPackOutSaturate(flags)) { + // signed -> unsigned + saturate + // PACKUSWB / SaturateSignedWordToUnsignedByte + Xbyak::Xmm src2 = i.src2.is_constant ? e.xmm0 : i.src2; + if (i.src2.is_constant) { + e.LoadConstantXmm(src2, i.src2.constant()); + } + + e.vpackuswb(i.dest, i.src1, src2); + e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteOrderMask)); + } else { + // signed -> unsigned + assert_always(); + } + } else { + if (IsPackOutSaturate(flags)) { + // signed -> signed + saturate + // PACKSSWB / SaturateSignedWordToSignedByte + e.vpacksswb(i.dest, i.src1, i.src2); + e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteOrderMask)); + } else { + // signed -> signed + assert_always(); + } + } + } + } + // Pack 2 32-bit vectors into a 16-bit vector. 
+ static void Emit16_IN_32(X64Emitter& e, const EmitArgType& i, + uint32_t flags) { + // TODO(benvanik): handle src2 (or src1) being constant zero + if (IsPackInUnsigned(flags)) { + if (IsPackOutUnsigned(flags)) { + if (IsPackOutSaturate(flags)) { + // unsigned -> unsigned + saturate + // Construct a saturation max value + e.mov(e.eax, 0xFFFFu); + e.vmovd(e.xmm0, e.eax); + e.vpshufd(e.xmm0, e.xmm0, 0b00000000); + + if (!i.src1.is_constant) { + e.vpminud(e.xmm1, i.src1, e.xmm0); // Saturate src1 + e.vpshuflw(e.xmm1, e.xmm1, 0b00100010); + e.vpshufhw(e.xmm1, e.xmm1, 0b00100010); + e.vpshufd(e.xmm1, e.xmm1, 0b00001000); + } else { + // TODO(DrChat): Non-zero constants + assert_true(i.src1.constant().u64[0] == 0 && + i.src1.constant().u64[1] == 0); + e.vpxor(e.xmm1, e.xmm1); + } + + if (!i.src2.is_constant) { + e.vpminud(i.dest, i.src2, e.xmm0); // Saturate src2 + e.vpshuflw(i.dest, i.dest, 0b00100010); + e.vpshufhw(i.dest, i.dest, 0b00100010); + e.vpshufd(i.dest, i.dest, 0b10000000); + } else { + // TODO(DrChat): Non-zero constants + assert_true(i.src2.constant().u64[0] == 0 && + i.src2.constant().u64[1] == 0); + e.vpxor(i.dest, i.dest); + } + + e.vpblendw(i.dest, i.dest, e.xmm1, 0b00001111); + } else { + // unsigned -> unsigned + e.vmovaps(e.xmm0, i.src1); + e.vpshuflw(e.xmm0, e.xmm0, 0b00100010); + e.vpshufhw(e.xmm0, e.xmm0, 0b00100010); + e.vpshufd(e.xmm0, e.xmm0, 0b00001000); + + e.vmovaps(i.dest, i.src2); + e.vpshuflw(i.dest, i.dest, 0b00100010); + e.vpshufhw(i.dest, i.dest, 0b00100010); + e.vpshufd(i.dest, i.dest, 0b10000000); + + e.vpblendw(i.dest, i.dest, e.xmm0, 0b00001111); + } + } else { + if (IsPackOutSaturate(flags)) { + // unsigned -> signed + saturate + assert_always(); + } else { + // unsigned -> signed + assert_always(); + } + } + } else { + if (IsPackOutUnsigned(flags)) { + if (IsPackOutSaturate(flags)) { + // signed -> unsigned + saturate + // PACKUSDW + // TMP[15:0] <- (DEST[31:0] < 0) ? 0 : DEST[15:0]; + // DEST[15:0] <- (DEST[31:0] > FFFFH) ? 
FFFFH : TMP[15:0]; + e.vpackusdw(i.dest, i.src1, i.src2); + e.vpshuflw(i.dest, i.dest, 0b10110001); + e.vpshufhw(i.dest, i.dest, 0b10110001); + } else { + // signed -> unsigned + assert_always(); + } + } else { + if (IsPackOutSaturate(flags)) { + // signed -> signed + saturate + // PACKSSDW / SaturateSignedDwordToSignedWord + Xmm src2; + if (!i.src2.is_constant) { + src2 = i.src2; + } else { + assert_false(i.src1 == e.xmm0); + src2 = e.xmm0; + e.LoadConstantXmm(src2, i.src2.constant()); + } + e.vpackssdw(i.dest, i.src1, src2); + e.vpshuflw(i.dest, i.dest, 0b10110001); + e.vpshufhw(i.dest, i.dest, 0b10110001); + } else { + // signed -> signed + assert_always(); + } + } + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_PACK, PACK); + +// ============================================================================ +// OPCODE_UNPACK +// ============================================================================ +struct UNPACK : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags & PACK_TYPE_MODE) { + case PACK_TYPE_D3DCOLOR: + EmitD3DCOLOR(e, i); + break; + case PACK_TYPE_FLOAT16_2: + EmitFLOAT16_2(e, i); + break; + case PACK_TYPE_FLOAT16_4: + EmitFLOAT16_4(e, i); + break; + case PACK_TYPE_SHORT_2: + EmitSHORT_2(e, i); + break; + case PACK_TYPE_SHORT_4: + EmitSHORT_4(e, i); + break; + case PACK_TYPE_UINT_2101010: + EmitUINT_2101010(e, i); + break; + case PACK_TYPE_8_IN_16: + Emit8_IN_16(e, i, i.instr->flags); + break; + case PACK_TYPE_16_IN_32: + Emit16_IN_32(e, i, i.instr->flags); + break; + default: + assert_unhandled_case(i.instr->flags); + break; + } + } + static void EmitD3DCOLOR(X64Emitter& e, const EmitArgType& i) { + // ARGB (WXYZ) -> RGBA (XYZW) + Xmm src; + if (i.src1.is_constant) { + if (i.src1.value->IsConstantZero()) { + e.vmovaps(i.dest, e.GetXmmConstPtr(XMMOne)); + return; + } + src = i.dest; + e.LoadConstantXmm(src, i.src1.constant()); + } else { + src = i.src1; + } + // src = ZZYYXXWW + // Unpack to 
000000ZZ,000000YY,000000XX,000000WW + e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackD3DCOLOR)); + // Add 1.0f to each. + e.vpor(i.dest, e.GetXmmConstPtr(XMMOne)); + // To convert to 0 to 1, games multiply by 0x47008081 and add 0xC7008081. + } + static __m128 EmulateFLOAT16_2(void*, __m128i src1) { + alignas(16) uint16_t a[8]; + alignas(16) float b[4]; + _mm_store_si128(reinterpret_cast<__m128i*>(a), src1); + + for (int i = 0; i < 2; i++) { + b[i] = half_float::detail::half2float(a[VEC128_W(6 + i)]); + } + + // Constants, or something + b[2] = 0.f; + b[3] = 1.f; + + return _mm_load_ps(b); + } + static void EmitFLOAT16_2(X64Emitter& e, const EmitArgType& i) { + // 1 bit sign, 5 bit exponent, 10 bit mantissa + // D3D10 half float format + // TODO(benvanik): + // http://blogs.msdn.com/b/chuckw/archive/2012/09/11/directxmath-f16c-and-fma.aspx + // Use _mm_cvtph_ps -- requires very modern processors (SSE5+) + // Unpacking half floats: + // http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/ + // Packing half floats: https://gist.github.com/rygorous/2156668 + // Load source, move from tight pack of X16Y16.... to X16...Y16... + // Also zero out the high end. + // TODO(benvanik): special case constant unpacks that just get 0/1/etc. 
+ + Xmm src; + if (e.IsFeatureEnabled(kX64EmitF16C)) { + if (i.src1.is_constant) { + src = i.dest; + e.LoadConstantXmm(src, i.src1.constant()); + } else { + src = i.src1; + } + // sx = src.iw >> 16; + // sy = src.iw & 0xFFFF; + // dest = { XMConvertHalfToFloat(sx), + // XMConvertHalfToFloat(sy), + // 0.0, + // 1.0 }; + // Shuffle to 0|0|0|0|0|0|Y|X + e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackFLOAT16_2)); + e.vcvtph2ps(i.dest, i.dest); + e.vpshufd(i.dest, i.dest, 0b10100100); + e.vpor(i.dest, e.GetXmmConstPtr(XMM0001)); + } else { + if (i.src1.is_constant) { + src = e.xmm0; + e.LoadConstantXmm(src, i.src1.constant()); + } else { + src = i.src1; + } + e.lea(e.GetNativeParam(0), e.StashXmm(0, src)); + e.CallNativeSafe(reinterpret_cast(EmulateFLOAT16_2)); + e.vmovaps(i.dest, e.xmm0); + } + } + static __m128 EmulateFLOAT16_4(void*, __m128i src1) { + alignas(16) uint16_t a[8]; + alignas(16) float b[4]; + _mm_store_si128(reinterpret_cast<__m128i*>(a), src1); + + for (int i = 0; i < 4; i++) { + b[i] = half_float::detail::half2float(a[VEC128_W(4 + i)]); + } + + return _mm_load_ps(b); + } + static void EmitFLOAT16_4(X64Emitter& e, const EmitArgType& i) { + // src = [(dest.x | dest.y), (dest.z | dest.w), 0, 0] + Xmm src; + if (e.IsFeatureEnabled(kX64EmitF16C)) { + if (i.src1.is_constant) { + src = i.dest; + e.LoadConstantXmm(src, i.src1.constant()); + } else { + src = i.src1; + } + // Shuffle to 0|0|0|0|W|Z|Y|X + e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackFLOAT16_4)); + e.vcvtph2ps(i.dest, i.dest); + } else { + if (i.src1.is_constant) { + src = e.xmm0; + e.LoadConstantXmm(src, i.src1.constant()); + } else { + src = i.src1; + } + e.lea(e.GetNativeParam(0), e.StashXmm(0, src)); + e.CallNativeSafe(reinterpret_cast(EmulateFLOAT16_4)); + e.vmovaps(i.dest, e.xmm0); + } + } + static void EmitSHORT_2(X64Emitter& e, const EmitArgType& i) { + // (VD.x) = 3.0 + (VB.x>>16)*2^-22 + // (VD.y) = 3.0 + (VB.x)*2^-22 + // (VD.z) = 0.0 + // (VD.w) = 1.0 (games splat W after 
unpacking to get vectors of 1.0f) + // src is (xx,xx,xx,VALUE) + Xmm src; + if (i.src1.is_constant) { + if (i.src1.value->IsConstantZero()) { + e.vmovdqa(i.dest, e.GetXmmConstPtr(XMM3301)); + return; + } + // TODO(benvanik): check other common constants/perform shuffle/or here. + src = i.dest; + e.LoadConstantXmm(src, i.src1.constant()); + } else { + src = i.src1; + } + // Shuffle bytes. + e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackSHORT_2)); + // If negative, make smaller than 3 - sign extend before adding. + e.vpslld(i.dest, 16); + e.vpsrad(i.dest, 16); + // Add 3,3,0,1. + e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3301)); + // Return quiet NaNs in case of negative overflow. + e.vcmpeqps(e.xmm0, i.dest, e.GetXmmConstPtr(XMMUnpackSHORT_Overflow)); + e.vblendvps(i.dest, i.dest, e.GetXmmConstPtr(XMMUnpackOverflowNaN), e.xmm0); + } + static void EmitSHORT_4(X64Emitter& e, const EmitArgType& i) { + // (VD.x) = 3.0 + (VB.x>>16)*2^-22 + // (VD.y) = 3.0 + (VB.x)*2^-22 + // (VD.z) = 3.0 + (VB.y>>16)*2^-22 + // (VD.w) = 3.0 + (VB.y)*2^-22 + // src is (xx,xx,VALUE,VALUE) + Xmm src; + if (i.src1.is_constant) { + if (i.src1.value->IsConstantZero()) { + e.vmovdqa(i.dest, e.GetXmmConstPtr(XMM3333)); + return; + } + // TODO(benvanik): check other common constants/perform shuffle/or here. + src = i.dest; + e.LoadConstantXmm(src, i.src1.constant()); + } else { + src = i.src1; + } + // Shuffle bytes. + e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackSHORT_4)); + // If negative, make smaller than 3 - sign extend before adding. + e.vpslld(i.dest, 16); + e.vpsrad(i.dest, 16); + // Add 3,3,3,3. + e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3333)); + // Return quiet NaNs in case of negative overflow. 
+ e.vcmpeqps(e.xmm0, i.dest, e.GetXmmConstPtr(XMMUnpackSHORT_Overflow)); + e.vblendvps(i.dest, i.dest, e.GetXmmConstPtr(XMMUnpackOverflowNaN), e.xmm0); + } + static void EmitUINT_2101010(X64Emitter& e, const EmitArgType& i) { + Xmm src; + if (i.src1.is_constant) { + if (i.src1.value->IsConstantZero()) { + e.vmovdqa(i.dest, e.GetXmmConstPtr(XMM3331)); + return; + } + src = i.dest; + e.LoadConstantXmm(src, i.src1.constant()); + } else { + src = i.src1; + } + // Splat W. + e.vshufps(i.dest, src, src, _MM_SHUFFLE(3, 3, 3, 3)); + // Keep only the needed components. + // Red in 0-9 now, green in 10-19, blue in 20-29, alpha in 30-31. + e.vpand(i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MaskPacked)); + if (e.IsFeatureEnabled(kX64EmitAVX2)) { + // Shift the components down. + e.vpsrlvd(i.dest, i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_Shift)); + } else { + // Duplicate green in 0-9 and alpha in 20-21. + e.vpsrld(e.xmm0, i.dest, 10); + e.vpor(i.dest, e.xmm0); + // Duplicate blue in 0-9 and alpha in 0-1. + e.vpsrld(e.xmm0, i.dest, 20); + e.vpor(i.dest, e.xmm0); + // Remove higher duplicate components. + e.vpand(i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MaskUnpacked)); + } + // If XYZ are negative, make smaller than 3 - sign extend XYZ before adding. + // W is unsigned. + e.vpslld(i.dest, 22); + e.vpsrad(i.dest, 22); + // Add 3,3,3,1. + e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3331)); + // Return quiet NaNs in case of negative overflow. + e.vcmpeqps(e.xmm0, i.dest, + e.GetXmmConstPtr(XMMUnpackUINT_2101010_Overflow)); + e.vblendvps(i.dest, i.dest, e.GetXmmConstPtr(XMMUnpackOverflowNaN), e.xmm0); + // To convert XYZ to -1 to 1, games multiply by 0x46004020 & sub 0x46C06030. + // For W to 0 to 1, they multiply by and subtract 0x4A2AAAAB. 
+ } + static void Emit8_IN_16(X64Emitter& e, const EmitArgType& i, uint32_t flags) { + assert_false(IsPackOutSaturate(flags)); + Xmm src; + if (i.src1.is_constant) { + src = i.dest; + e.LoadConstantXmm(src, i.src1.constant()); + } else { + src = i.src1; + } + if (IsPackToLo(flags)) { + // Unpack to LO. + if (IsPackInUnsigned(flags)) { + if (IsPackOutUnsigned(flags)) { + // unsigned -> unsigned + assert_always(); + } else { + // unsigned -> signed + assert_always(); + } + } else { + if (IsPackOutUnsigned(flags)) { + // signed -> unsigned + assert_always(); + } else { + // signed -> signed + e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMByteOrderMask)); + e.vpunpckhbw(i.dest, i.dest, i.dest); + e.vpsraw(i.dest, 8); + } + } + } else { + // Unpack to HI. + if (IsPackInUnsigned(flags)) { + if (IsPackOutUnsigned(flags)) { + // unsigned -> unsigned + assert_always(); + } else { + // unsigned -> signed + assert_always(); + } + } else { + if (IsPackOutUnsigned(flags)) { + // signed -> unsigned + assert_always(); + } else { + // signed -> signed + e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMByteOrderMask)); + e.vpunpcklbw(i.dest, i.dest, i.dest); + e.vpsraw(i.dest, 8); + } + } + } + } + static void Emit16_IN_32(X64Emitter& e, const EmitArgType& i, + uint32_t flags) { + assert_false(IsPackOutSaturate(flags)); + Xmm src; + if (i.src1.is_constant) { + src = i.dest; + e.LoadConstantXmm(src, i.src1.constant()); + } else { + src = i.src1; + } + if (IsPackToLo(flags)) { + // Unpack to LO. + if (IsPackInUnsigned(flags)) { + if (IsPackOutUnsigned(flags)) { + // unsigned -> unsigned + assert_always(); + } else { + // unsigned -> signed + assert_always(); + } + } else { + if (IsPackOutUnsigned(flags)) { + // signed -> unsigned + assert_always(); + } else { + // signed -> signed + e.vpunpckhwd(i.dest, src, src); + e.vpsrad(i.dest, 16); + } + } + } else { + // Unpack to HI. 
+ if (IsPackInUnsigned(flags)) { + if (IsPackOutUnsigned(flags)) { + // unsigned -> unsigned + assert_always(); + } else { + // unsigned -> signed + assert_always(); + } + } else { + if (IsPackOutUnsigned(flags)) { + // signed -> unsigned + assert_always(); + } else { + // signed -> signed + e.vpunpcklwd(i.dest, src, src); + e.vpsrad(i.dest, 16); + } + } + } + e.vpshufd(i.dest, i.dest, 0xB1); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_UNPACK, UNPACK); + +} // namespace x64 +} // namespace backend +} // namespace cpu +} // namespace xe \ No newline at end of file diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index 5c2118fc7..7d18cb4d0 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -33,13 +33,11 @@ #include "xenia/base/logging.h" #include "xenia/base/threading.h" #include "xenia/cpu/backend/x64/x64_emitter.h" +#include "xenia/cpu/backend/x64/x64_op.h" #include "xenia/cpu/backend/x64/x64_tracers.h" #include "xenia/cpu/hir/hir_builder.h" #include "xenia/cpu/processor.h" -// For OPCODE_PACK/OPCODE_UNPACK -#include "third_party/half/include/half.hpp" - namespace xe { namespace cpu { namespace backend { @@ -56,616 +54,6 @@ using xe::cpu::hir::Instr; typedef bool (*SequenceSelectFn)(X64Emitter&, const Instr*); std::unordered_map sequence_table; -// Selects the right byte/word/etc from a vector. We need to flip logical -// indices (0,1,2,3,4,5,6,7,...) = (3,2,1,0,7,6,5,4,...) 
-#define VEC128_B(n) ((n) ^ 0x3) -#define VEC128_W(n) ((n) ^ 0x1) -#define VEC128_D(n) (n) -#define VEC128_F(n) (n) - -enum KeyType { - KEY_TYPE_X = OPCODE_SIG_TYPE_X, - KEY_TYPE_L = OPCODE_SIG_TYPE_L, - KEY_TYPE_O = OPCODE_SIG_TYPE_O, - KEY_TYPE_S = OPCODE_SIG_TYPE_S, - KEY_TYPE_V_I8 = OPCODE_SIG_TYPE_V + INT8_TYPE, - KEY_TYPE_V_I16 = OPCODE_SIG_TYPE_V + INT16_TYPE, - KEY_TYPE_V_I32 = OPCODE_SIG_TYPE_V + INT32_TYPE, - KEY_TYPE_V_I64 = OPCODE_SIG_TYPE_V + INT64_TYPE, - KEY_TYPE_V_F32 = OPCODE_SIG_TYPE_V + FLOAT32_TYPE, - KEY_TYPE_V_F64 = OPCODE_SIG_TYPE_V + FLOAT64_TYPE, - KEY_TYPE_V_V128 = OPCODE_SIG_TYPE_V + VEC128_TYPE, -}; - -#pragma pack(push, 1) -union InstrKey { - struct { - uint32_t opcode : 8; - uint32_t dest : 5; - uint32_t src1 : 5; - uint32_t src2 : 5; - uint32_t src3 : 5; - uint32_t reserved : 4; - }; - uint32_t value; - - operator uint32_t() const { return value; } - - InstrKey() : value(0) {} - InstrKey(uint32_t v) : value(v) {} - InstrKey(const Instr* i) : value(0) { - opcode = i->opcode->num; - uint32_t sig = i->opcode->signature; - dest = - GET_OPCODE_SIG_TYPE_DEST(sig) ? 
OPCODE_SIG_TYPE_V + i->dest->type : 0; - src1 = GET_OPCODE_SIG_TYPE_SRC1(sig); - if (src1 == OPCODE_SIG_TYPE_V) { - src1 += i->src1.value->type; - } - src2 = GET_OPCODE_SIG_TYPE_SRC2(sig); - if (src2 == OPCODE_SIG_TYPE_V) { - src2 += i->src2.value->type; - } - src3 = GET_OPCODE_SIG_TYPE_SRC3(sig); - if (src3 == OPCODE_SIG_TYPE_V) { - src3 += i->src3.value->type; - } - } - - template - struct Construct { - static const uint32_t value = - (OPCODE) | (DEST << 8) | (SRC1 << 13) | (SRC2 << 18) | (SRC3 << 23); - }; -}; -#pragma pack(pop) -static_assert(sizeof(InstrKey) <= 4, "Key must be 4 bytes"); - -template -struct CombinedStruct; -template <> -struct CombinedStruct<> {}; -template -struct CombinedStruct : T, CombinedStruct {}; - -struct OpBase {}; - -template -struct Op : OpBase { - static const KeyType key_type = KEY_TYPE; -}; - -struct VoidOp : Op { - protected: - template - friend struct Op; - template - friend struct I; - void Load(const Instr::Op& op) {} -}; - -struct OffsetOp : Op { - uint64_t value; - - protected: - template - friend struct Op; - template - friend struct I; - void Load(const Instr::Op& op) { this->value = op.offset; } -}; - -struct SymbolOp : Op { - Function* value; - - protected: - template - friend struct Op; - template - friend struct I; - bool Load(const Instr::Op& op) { - this->value = op.symbol; - return true; - } -}; - -struct LabelOp : Op { - hir::Label* value; - - protected: - template - friend struct Op; - template - friend struct I; - void Load(const Instr::Op& op) { this->value = op.label; } -}; - -template -struct ValueOp : Op, KEY_TYPE> { - typedef REG_TYPE reg_type; - const Value* value; - bool is_constant; - virtual bool ConstantFitsIn32Reg() const { return true; } - const REG_TYPE& reg() const { - assert_true(!is_constant); - return reg_; - } - operator const REG_TYPE&() const { return reg(); } - bool IsEqual(const T& b) const { - if (is_constant && b.is_constant) { - return reinterpret_cast(this)->constant() == b.constant(); 
- } else if (!is_constant && !b.is_constant) { - return reg_.getIdx() == b.reg_.getIdx(); - } else { - return false; - } - } - bool IsEqual(const Xbyak::Reg& b) const { - if (is_constant) { - return false; - } else if (!is_constant) { - return reg_.getIdx() == b.getIdx(); - } else { - return false; - } - } - bool operator==(const T& b) const { return IsEqual(b); } - bool operator!=(const T& b) const { return !IsEqual(b); } - bool operator==(const Xbyak::Reg& b) const { return IsEqual(b); } - bool operator!=(const Xbyak::Reg& b) const { return !IsEqual(b); } - void Load(const Instr::Op& op) { - value = op.value; - is_constant = value->IsConstant(); - if (!is_constant) { - X64Emitter::SetupReg(value, reg_); - } - } - - protected: - REG_TYPE reg_; -}; - -struct I8Op : ValueOp { - typedef ValueOp BASE; - const int8_t constant() const { - assert_true(BASE::is_constant); - return BASE::value->constant.i8; - } -}; -struct I16Op : ValueOp { - typedef ValueOp BASE; - const int16_t constant() const { - assert_true(BASE::is_constant); - return BASE::value->constant.i16; - } -}; -struct I32Op : ValueOp { - typedef ValueOp BASE; - const int32_t constant() const { - assert_true(BASE::is_constant); - return BASE::value->constant.i32; - } -}; -struct I64Op : ValueOp { - typedef ValueOp BASE; - const int64_t constant() const { - assert_true(BASE::is_constant); - return BASE::value->constant.i64; - } - bool ConstantFitsIn32Reg() const override { - int64_t v = BASE::value->constant.i64; - if ((v & ~0x7FFFFFFF) == 0) { - // Fits under 31 bits, so just load using normal mov. - return true; - } else if ((v & ~0x7FFFFFFF) == ~0x7FFFFFFF) { - // Negative number that fits in 32bits. 
- return true; - } - return false; - } -}; -struct F32Op : ValueOp { - typedef ValueOp BASE; - const float constant() const { - assert_true(BASE::is_constant); - return BASE::value->constant.f32; - } -}; -struct F64Op : ValueOp { - typedef ValueOp BASE; - const double constant() const { - assert_true(BASE::is_constant); - return BASE::value->constant.f64; - } -}; -struct V128Op : ValueOp { - typedef ValueOp BASE; - const vec128_t& constant() const { - assert_true(BASE::is_constant); - return BASE::value->constant.v128; - } -}; - -template -struct DestField; -template -struct DestField { - DEST dest; - - protected: - bool LoadDest(const Instr* i) { - Instr::Op op; - op.value = i->dest; - dest.Load(op); - return true; - } -}; -template <> -struct DestField { - protected: - bool LoadDest(const Instr* i) { return true; } -}; - -template -struct I; -template -struct I : DestField { - typedef DestField BASE; - static const hir::Opcode opcode = OPCODE; - static const uint32_t key = - InstrKey::Construct::value; - static const KeyType dest_type = DEST::key_type; - const Instr* instr; - - protected: - template - friend struct Sequence; - bool Load(const Instr* i) { - if (InstrKey(i).value == key && BASE::LoadDest(i)) { - instr = i; - return true; - } - return false; - } -}; -template -struct I : DestField { - typedef DestField BASE; - static const hir::Opcode opcode = OPCODE; - static const uint32_t key = - InstrKey::Construct::value; - static const KeyType dest_type = DEST::key_type; - static const KeyType src1_type = SRC1::key_type; - const Instr* instr; - SRC1 src1; - - protected: - template - friend struct Sequence; - bool Load(const Instr* i) { - if (InstrKey(i).value == key && BASE::LoadDest(i)) { - instr = i; - src1.Load(i->src1); - return true; - } - return false; - } -}; -template -struct I : DestField { - typedef DestField BASE; - static const hir::Opcode opcode = OPCODE; - static const uint32_t key = - InstrKey::Construct::value; - static const KeyType dest_type 
= DEST::key_type; - static const KeyType src1_type = SRC1::key_type; - static const KeyType src2_type = SRC2::key_type; - const Instr* instr; - SRC1 src1; - SRC2 src2; - - protected: - template - friend struct Sequence; - bool Load(const Instr* i) { - if (InstrKey(i).value == key && BASE::LoadDest(i)) { - instr = i; - src1.Load(i->src1); - src2.Load(i->src2); - return true; - } - return false; - } -}; -template -struct I : DestField { - typedef DestField BASE; - static const hir::Opcode opcode = OPCODE; - static const uint32_t key = - InstrKey::Construct::value; - static const KeyType dest_type = DEST::key_type; - static const KeyType src1_type = SRC1::key_type; - static const KeyType src2_type = SRC2::key_type; - static const KeyType src3_type = SRC3::key_type; - const Instr* instr; - SRC1 src1; - SRC2 src2; - SRC3 src3; - - protected: - template - friend struct Sequence; - bool Load(const Instr* i) { - if (InstrKey(i).value == key && BASE::LoadDest(i)) { - instr = i; - src1.Load(i->src1); - src2.Load(i->src2); - src3.Load(i->src3); - return true; - } - return false; - } -}; - -template -const T GetTempReg(X64Emitter& e); -template <> -const Reg8 GetTempReg(X64Emitter& e) { - return e.al; -} -template <> -const Reg16 GetTempReg(X64Emitter& e) { - return e.ax; -} -template <> -const Reg32 GetTempReg(X64Emitter& e) { - return e.eax; -} -template <> -const Reg64 GetTempReg(X64Emitter& e) { - return e.rax; -} - -template -struct Sequence { - typedef T EmitArgType; - - static constexpr uint32_t head_key() { return T::key; } - - static bool Select(X64Emitter& e, const Instr* i) { - T args; - if (!args.Load(i)) { - return false; - } - SEQ::Emit(e, args); - return true; - } - - template - static void EmitUnaryOp(X64Emitter& e, const EmitArgType& i, - const REG_FN& reg_fn) { - if (i.src1.is_constant) { - e.mov(i.dest, i.src1.constant()); - reg_fn(e, i.dest); - } else { - if (i.dest != i.src1) { - e.mov(i.dest, i.src1); - } - reg_fn(e, i.dest); - } - } - - template - static 
void EmitCommutativeBinaryOp(X64Emitter& e, const EmitArgType& i, - const REG_REG_FN& reg_reg_fn, - const REG_CONST_FN& reg_const_fn) { - if (i.src1.is_constant) { - if (i.src2.is_constant) { - // Both constants. - if (i.src1.ConstantFitsIn32Reg()) { - e.mov(i.dest, i.src2.constant()); - reg_const_fn(e, i.dest, static_cast(i.src1.constant())); - } else if (i.src2.ConstantFitsIn32Reg()) { - e.mov(i.dest, i.src1.constant()); - reg_const_fn(e, i.dest, static_cast(i.src2.constant())); - } else { - e.mov(i.dest, i.src1.constant()); - auto temp = GetTempReg(e); - e.mov(temp, i.src2.constant()); - reg_reg_fn(e, i.dest, temp); - } - } else { - // src1 constant. - if (i.dest == i.src2) { - if (i.src1.ConstantFitsIn32Reg()) { - reg_const_fn(e, i.dest, static_cast(i.src1.constant())); - } else { - auto temp = GetTempReg(e); - e.mov(temp, i.src1.constant()); - reg_reg_fn(e, i.dest, temp); - } - } else { - e.mov(i.dest, i.src1.constant()); - reg_reg_fn(e, i.dest, i.src2); - } - } - } else if (i.src2.is_constant) { - if (i.dest == i.src1) { - if (i.src2.ConstantFitsIn32Reg()) { - reg_const_fn(e, i.dest, static_cast(i.src2.constant())); - } else { - auto temp = GetTempReg(e); - e.mov(temp, i.src2.constant()); - reg_reg_fn(e, i.dest, temp); - } - } else { - e.mov(i.dest, i.src2.constant()); - reg_reg_fn(e, i.dest, i.src1); - } - } else { - if (i.dest == i.src1) { - reg_reg_fn(e, i.dest, i.src2); - } else if (i.dest == i.src2) { - reg_reg_fn(e, i.dest, i.src1); - } else { - e.mov(i.dest, i.src1); - reg_reg_fn(e, i.dest, i.src2); - } - } - } - template - static void EmitAssociativeBinaryOp(X64Emitter& e, const EmitArgType& i, - const REG_REG_FN& reg_reg_fn, - const REG_CONST_FN& reg_const_fn) { - if (i.src1.is_constant) { - assert_true(!i.src2.is_constant); - if (i.dest == i.src2) { - auto temp = GetTempReg(e); - e.mov(temp, i.src2); - e.mov(i.dest, i.src1.constant()); - reg_reg_fn(e, i.dest, temp); - } else { - e.mov(i.dest, i.src1.constant()); - reg_reg_fn(e, i.dest, i.src2); - } 
- } else if (i.src2.is_constant) { - if (i.dest == i.src1) { - if (i.src2.ConstantFitsIn32Reg()) { - reg_const_fn(e, i.dest, static_cast(i.src2.constant())); - } else { - auto temp = GetTempReg(e); - e.mov(temp, i.src2.constant()); - reg_reg_fn(e, i.dest, temp); - } - } else { - e.mov(i.dest, i.src1); - if (i.src2.ConstantFitsIn32Reg()) { - reg_const_fn(e, i.dest, static_cast(i.src2.constant())); - } else { - auto temp = GetTempReg(e); - e.mov(temp, i.src2.constant()); - reg_reg_fn(e, i.dest, temp); - } - } - } else { - if (i.dest == i.src1) { - reg_reg_fn(e, i.dest, i.src2); - } else if (i.dest == i.src2) { - auto temp = GetTempReg(e); - e.mov(temp, i.src2); - e.mov(i.dest, i.src1); - reg_reg_fn(e, i.dest, temp); - } else { - e.mov(i.dest, i.src1); - reg_reg_fn(e, i.dest, i.src2); - } - } - } - - template - static void EmitCommutativeBinaryXmmOp(X64Emitter& e, const EmitArgType& i, - const FN& fn) { - if (i.src1.is_constant) { - assert_true(!i.src2.is_constant); - e.LoadConstantXmm(e.xmm0, i.src1.constant()); - fn(e, i.dest, e.xmm0, i.src2); - } else if (i.src2.is_constant) { - assert_true(!i.src1.is_constant); - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - fn(e, i.dest, i.src1, e.xmm0); - } else { - fn(e, i.dest, i.src1, i.src2); - } - } - - template - static void EmitAssociativeBinaryXmmOp(X64Emitter& e, const EmitArgType& i, - const FN& fn) { - if (i.src1.is_constant) { - assert_true(!i.src2.is_constant); - e.LoadConstantXmm(e.xmm0, i.src1.constant()); - fn(e, i.dest, e.xmm0, i.src2); - } else if (i.src2.is_constant) { - assert_true(!i.src1.is_constant); - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - fn(e, i.dest, i.src1, e.xmm0); - } else { - fn(e, i.dest, i.src1, i.src2); - } - } - - template - static void EmitCommutativeCompareOp(X64Emitter& e, const EmitArgType& i, - const REG_REG_FN& reg_reg_fn, - const REG_CONST_FN& reg_const_fn) { - if (i.src1.is_constant) { - assert_true(!i.src2.is_constant); - if (i.src1.ConstantFitsIn32Reg()) { - reg_const_fn(e, 
i.src2, static_cast(i.src1.constant())); - } else { - auto temp = GetTempReg(e); - e.mov(temp, i.src1.constant()); - reg_reg_fn(e, i.src2, temp); - } - } else if (i.src2.is_constant) { - assert_true(!i.src1.is_constant); - if (i.src2.ConstantFitsIn32Reg()) { - reg_const_fn(e, i.src1, static_cast(i.src2.constant())); - } else { - auto temp = GetTempReg(e); - e.mov(temp, i.src2.constant()); - reg_reg_fn(e, i.src1, temp); - } - } else { - reg_reg_fn(e, i.src1, i.src2); - } - } - template - static void EmitAssociativeCompareOp(X64Emitter& e, const EmitArgType& i, - const REG_REG_FN& reg_reg_fn, - const REG_CONST_FN& reg_const_fn) { - if (i.src1.is_constant) { - assert_true(!i.src2.is_constant); - if (i.src1.ConstantFitsIn32Reg()) { - reg_const_fn(e, i.dest, i.src2, static_cast(i.src1.constant()), - true); - } else { - auto temp = GetTempReg(e); - e.mov(temp, i.src1.constant()); - reg_reg_fn(e, i.dest, i.src2, temp, true); - } - } else if (i.src2.is_constant) { - assert_true(!i.src1.is_constant); - if (i.src2.ConstantFitsIn32Reg()) { - reg_const_fn(e, i.dest, i.src1, static_cast(i.src2.constant()), - false); - } else { - auto temp = GetTempReg(e); - e.mov(temp, i.src2.constant()); - reg_reg_fn(e, i.dest, i.src1, temp, false); - } - } else { - reg_reg_fn(e, i.dest, i.src1, i.src2, false); - } - } -}; - -template -void Register() { - sequence_table.insert({T::head_key(), T::Select}); -} -template -void Register() { - Register(); - Register(); -} -#define EMITTER_OPCODE_TABLE(name, ...) 
\ - void Register_##name() { Register<__VA_ARGS__>(); } - // ============================================================================ // OPCODE_COMMENT // ============================================================================ @@ -702,532 +90,6 @@ struct SOURCE_OFFSET }; EMITTER_OPCODE_TABLE(OPCODE_SOURCE_OFFSET, SOURCE_OFFSET); -// ============================================================================ -// OPCODE_DEBUG_BREAK -// ============================================================================ -struct DEBUG_BREAK : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { e.DebugBreak(); } -}; -EMITTER_OPCODE_TABLE(OPCODE_DEBUG_BREAK, DEBUG_BREAK); - -// ============================================================================ -// OPCODE_DEBUG_BREAK_TRUE -// ============================================================================ -struct DEBUG_BREAK_TRUE_I8 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.DebugBreak(); - e.L(skip); - } -}; -struct DEBUG_BREAK_TRUE_I16 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.DebugBreak(); - e.L(skip); - } -}; -struct DEBUG_BREAK_TRUE_I32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.DebugBreak(); - e.L(skip); - } -}; -struct DEBUG_BREAK_TRUE_I64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.DebugBreak(); - e.L(skip); - } -}; -struct DEBUG_BREAK_TRUE_F32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.DebugBreak(); - e.L(skip); - } -}; -struct DEBUG_BREAK_TRUE_F64 - : Sequence> { - static void Emit(X64Emitter& e, const 
EmitArgType& i) { - e.vptest(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.DebugBreak(); - e.L(skip); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_DEBUG_BREAK_TRUE, DEBUG_BREAK_TRUE_I8, - DEBUG_BREAK_TRUE_I16, DEBUG_BREAK_TRUE_I32, - DEBUG_BREAK_TRUE_I64, DEBUG_BREAK_TRUE_F32, - DEBUG_BREAK_TRUE_F64); - -// ============================================================================ -// OPCODE_TRAP -// ============================================================================ -struct TRAP : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.Trap(i.instr->flags); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_TRAP, TRAP); - -// ============================================================================ -// OPCODE_TRAP_TRUE -// ============================================================================ -struct TRAP_TRUE_I8 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.Trap(i.instr->flags); - e.L(skip); - } -}; -struct TRAP_TRUE_I16 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.Trap(i.instr->flags); - e.L(skip); - } -}; -struct TRAP_TRUE_I32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.Trap(i.instr->flags); - e.L(skip); - } -}; -struct TRAP_TRUE_I64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.Trap(i.instr->flags); - e.L(skip); - } -}; -struct TRAP_TRUE_F32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.Trap(i.instr->flags); - e.L(skip); - } -}; -struct TRAP_TRUE_F64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - Xbyak::Label skip; - 
e.jz(skip); - e.Trap(i.instr->flags); - e.L(skip); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_TRAP_TRUE, TRAP_TRUE_I8, TRAP_TRUE_I16, - TRAP_TRUE_I32, TRAP_TRUE_I64, TRAP_TRUE_F32, - TRAP_TRUE_F64); - -// ============================================================================ -// OPCODE_CALL -// ============================================================================ -struct CALL : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - assert_true(i.src1.value->is_guest()); - e.Call(i.instr, static_cast(i.src1.value)); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_CALL, CALL); - -// ============================================================================ -// OPCODE_CALL_TRUE -// ============================================================================ -struct CALL_TRUE_I8 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - assert_true(i.src2.value->is_guest()); - e.test(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.Call(i.instr, static_cast(i.src2.value)); - e.L(skip); - } -}; -struct CALL_TRUE_I16 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - assert_true(i.src2.value->is_guest()); - e.test(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.Call(i.instr, static_cast(i.src2.value)); - e.L(skip); - } -}; -struct CALL_TRUE_I32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - assert_true(i.src2.value->is_guest()); - e.test(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.Call(i.instr, static_cast(i.src2.value)); - e.L(skip); - } -}; -struct CALL_TRUE_I64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - assert_true(i.src2.value->is_guest()); - e.test(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.Call(i.instr, static_cast(i.src2.value)); - e.L(skip); - } -}; -struct CALL_TRUE_F32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - assert_true(i.src2.value->is_guest()); - e.vptest(i.src1, 
i.src1); - Xbyak::Label skip; - e.jz(skip); - e.Call(i.instr, static_cast(i.src2.value)); - e.L(skip); - } -}; -struct CALL_TRUE_F64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - assert_true(i.src2.value->is_guest()); - e.vptest(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.Call(i.instr, static_cast(i.src2.value)); - e.L(skip); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_CALL_TRUE, CALL_TRUE_I8, CALL_TRUE_I16, - CALL_TRUE_I32, CALL_TRUE_I64, CALL_TRUE_F32, - CALL_TRUE_F64); - -// ============================================================================ -// OPCODE_CALL_INDIRECT -// ============================================================================ -struct CALL_INDIRECT - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.CallIndirect(i.instr, i.src1); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_CALL_INDIRECT, CALL_INDIRECT); - -// ============================================================================ -// OPCODE_CALL_INDIRECT_TRUE -// ============================================================================ -struct CALL_INDIRECT_TRUE_I8 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip, CodeGenerator::T_NEAR); - e.CallIndirect(i.instr, i.src2); - e.L(skip); - } -}; -struct CALL_INDIRECT_TRUE_I16 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip, CodeGenerator::T_NEAR); - e.CallIndirect(i.instr, i.src2); - e.L(skip); - } -}; -struct CALL_INDIRECT_TRUE_I32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip, CodeGenerator::T_NEAR); - e.CallIndirect(i.instr, i.src2); - e.L(skip); - } -}; -struct CALL_INDIRECT_TRUE_I64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip, 
CodeGenerator::T_NEAR); - e.CallIndirect(i.instr, i.src2); - e.L(skip); - } -}; -struct CALL_INDIRECT_TRUE_F32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip, CodeGenerator::T_NEAR); - e.CallIndirect(i.instr, i.src2); - e.L(skip); - } -}; -struct CALL_INDIRECT_TRUE_F64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip, CodeGenerator::T_NEAR); - e.CallIndirect(i.instr, i.src2); - e.L(skip); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_CALL_INDIRECT_TRUE, CALL_INDIRECT_TRUE_I8, - CALL_INDIRECT_TRUE_I16, CALL_INDIRECT_TRUE_I32, - CALL_INDIRECT_TRUE_I64, CALL_INDIRECT_TRUE_F32, - CALL_INDIRECT_TRUE_F64); - -// ============================================================================ -// OPCODE_CALL_EXTERN -// ============================================================================ -struct CALL_EXTERN - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.CallExtern(i.instr, i.src1.value); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_CALL_EXTERN, CALL_EXTERN); - -// ============================================================================ -// OPCODE_RETURN -// ============================================================================ -struct RETURN : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - // If this is the last instruction in the last block, just let us - // fall through. 
- if (i.instr->next || i.instr->block->next) { - e.jmp(e.epilog_label(), CodeGenerator::T_NEAR); - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_RETURN, RETURN); - -// ============================================================================ -// OPCODE_RETURN_TRUE -// ============================================================================ -struct RETURN_TRUE_I8 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - e.jnz(e.epilog_label(), CodeGenerator::T_NEAR); - } -}; -struct RETURN_TRUE_I16 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - e.jnz(e.epilog_label(), CodeGenerator::T_NEAR); - } -}; -struct RETURN_TRUE_I32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - e.jnz(e.epilog_label(), CodeGenerator::T_NEAR); - } -}; -struct RETURN_TRUE_I64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - e.jnz(e.epilog_label(), CodeGenerator::T_NEAR); - } -}; -struct RETURN_TRUE_F32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - e.jnz(e.epilog_label(), CodeGenerator::T_NEAR); - } -}; -struct RETURN_TRUE_F64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - e.jnz(e.epilog_label(), CodeGenerator::T_NEAR); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_RETURN_TRUE, RETURN_TRUE_I8, RETURN_TRUE_I16, - RETURN_TRUE_I32, RETURN_TRUE_I64, RETURN_TRUE_F32, - RETURN_TRUE_F64); - -// ============================================================================ -// OPCODE_SET_RETURN_ADDRESS -// ============================================================================ -struct SET_RETURN_ADDRESS - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.SetReturnAddress(i.src1.constant()); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_SET_RETURN_ADDRESS, SET_RETURN_ADDRESS); 
- -// ============================================================================ -// OPCODE_BRANCH -// ============================================================================ -struct BRANCH : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.jmp(i.src1.value->name, e.T_NEAR); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_BRANCH, BRANCH); - -// ============================================================================ -// OPCODE_BRANCH_TRUE -// ============================================================================ -struct BRANCH_TRUE_I8 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - e.jnz(i.src2.value->name, e.T_NEAR); - } -}; -struct BRANCH_TRUE_I16 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - e.jnz(i.src2.value->name, e.T_NEAR); - } -}; -struct BRANCH_TRUE_I32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - e.jnz(i.src2.value->name, e.T_NEAR); - } -}; -struct BRANCH_TRUE_I64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - e.jnz(i.src2.value->name, e.T_NEAR); - } -}; -struct BRANCH_TRUE_F32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - e.jnz(i.src2.value->name, e.T_NEAR); - } -}; -struct BRANCH_TRUE_F64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - e.jnz(i.src2.value->name, e.T_NEAR); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_BRANCH_TRUE, BRANCH_TRUE_I8, BRANCH_TRUE_I16, - BRANCH_TRUE_I32, BRANCH_TRUE_I64, BRANCH_TRUE_F32, - BRANCH_TRUE_F64); - -// ============================================================================ -// OPCODE_BRANCH_FALSE -// ============================================================================ -struct BRANCH_FALSE_I8 - : Sequence> { - static void Emit(X64Emitter& e, const 
EmitArgType& i) { - e.test(i.src1, i.src1); - e.jz(i.src2.value->name, e.T_NEAR); - } -}; -struct BRANCH_FALSE_I16 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - e.jz(i.src2.value->name, e.T_NEAR); - } -}; -struct BRANCH_FALSE_I32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - e.jz(i.src2.value->name, e.T_NEAR); - } -}; -struct BRANCH_FALSE_I64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - e.jz(i.src2.value->name, e.T_NEAR); - } -}; -struct BRANCH_FALSE_F32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - e.jz(i.src2.value->name, e.T_NEAR); - } -}; -struct BRANCH_FALSE_F64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - e.jz(i.src2.value->name, e.T_NEAR); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_BRANCH_FALSE, BRANCH_FALSE_I8, BRANCH_FALSE_I16, - BRANCH_FALSE_I32, BRANCH_FALSE_I64, BRANCH_FALSE_F32, - BRANCH_FALSE_F64); - // ============================================================================ // OPCODE_ASSIGN // ============================================================================ @@ -1573,169 +435,6 @@ struct ROUND_V128 : Sequence> { }; EMITTER_OPCODE_TABLE(OPCODE_ROUND, ROUND_F32, ROUND_F64, ROUND_V128); -// ============================================================================ -// OPCODE_VECTOR_CONVERT_I2F -// ============================================================================ -struct VECTOR_CONVERT_I2F - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - // flags = ARITHMETIC_UNSIGNED - if (i.instr->flags & ARITHMETIC_UNSIGNED) { - // xmm0 = mask of positive values - e.vpcmpgtd(e.xmm0, i.src1, e.GetXmmConstPtr(XMMFFFF)); - - // scale any values >= (unsigned)INT_MIN back to [0, INT_MAX] - e.vpsubd(e.xmm1, i.src1, 
e.GetXmmConstPtr(XMMSignMaskI32)); - e.vblendvps(e.xmm1, e.xmm1, i.src1, e.xmm0); - - // xmm1 = [0, INT_MAX] - e.vcvtdq2ps(i.dest, e.xmm1); - - // scale values back above [INT_MIN, UINT_MAX] - e.vpandn(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMPosIntMinPS)); - e.vaddps(i.dest, i.dest, e.xmm0); - } else { - e.vcvtdq2ps(i.dest, i.src1); - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_VECTOR_CONVERT_I2F, VECTOR_CONVERT_I2F); - -// ============================================================================ -// OPCODE_VECTOR_CONVERT_F2I -// ============================================================================ -struct VECTOR_CONVERT_F2I - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - if (i.instr->flags & ARITHMETIC_UNSIGNED) { - // clamp to min 0 - e.vmaxps(e.xmm0, i.src1, e.GetXmmConstPtr(XMMZero)); - - // xmm1 = mask of values >= (unsigned)INT_MIN - e.vcmpgeps(e.xmm1, e.xmm0, e.GetXmmConstPtr(XMMPosIntMinPS)); - - // scale any values >= (unsigned)INT_MIN back to [0, ...] 
- e.vsubps(e.xmm2, e.xmm0, e.GetXmmConstPtr(XMMPosIntMinPS)); - e.vblendvps(e.xmm0, e.xmm0, e.xmm2, e.xmm1); - - // xmm0 = [0, INT_MAX] - // this may still contain values > INT_MAX (if src has vals > UINT_MAX) - e.vcvttps2dq(i.dest, e.xmm0); - - // xmm0 = mask of values that need saturation - e.vpcmpeqd(e.xmm0, i.dest, e.GetXmmConstPtr(XMMIntMin)); - - // scale values back above [INT_MIN, UINT_MAX] - e.vpand(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMIntMin)); - e.vpaddd(i.dest, i.dest, e.xmm1); - - // saturate values > UINT_MAX - e.vpor(i.dest, i.dest, e.xmm0); - } else { - // xmm2 = NaN mask - e.vcmpunordps(e.xmm2, i.src1, i.src1); - - // convert packed floats to packed dwords - e.vcvttps2dq(e.xmm0, i.src1); - - // (high bit) xmm1 = dest is indeterminate and i.src1 >= 0 - e.vpcmpeqd(e.xmm1, e.xmm0, e.GetXmmConstPtr(XMMIntMin)); - e.vpandn(e.xmm1, i.src1, e.xmm1); - - // saturate positive values - e.vblendvps(i.dest, e.xmm0, e.GetXmmConstPtr(XMMIntMax), e.xmm1); - - // mask NaNs - e.vpandn(i.dest, e.xmm2, i.dest); - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_VECTOR_CONVERT_F2I, VECTOR_CONVERT_F2I); - -// ============================================================================ -// OPCODE_LOAD_VECTOR_SHL -// ============================================================================ -static const vec128_t lvsl_table[16] = { - vec128b(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), - vec128b(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), - vec128b(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17), - vec128b(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18), - vec128b(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19), - vec128b(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20), - vec128b(6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21), - vec128b(7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22), - vec128b(8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23), - vec128b(9, 10, 11, 12, 13, 14, 15, 
16, 17, 18, 19, 20, 21, 22, 23, 24), - vec128b(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25), - vec128b(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26), - vec128b(12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27), - vec128b(13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28), - vec128b(14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29), - vec128b(15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30), -}; -struct LOAD_VECTOR_SHL_I8 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - if (i.src1.is_constant) { - auto sh = i.src1.constant(); - assert_true(sh < xe::countof(lvsl_table)); - e.mov(e.rax, (uintptr_t)&lvsl_table[sh]); - e.vmovaps(i.dest, e.ptr[e.rax]); - } else { - // TODO(benvanik): find a cheaper way of doing this. - e.movzx(e.rdx, i.src1); - e.and_(e.dx, 0xF); - e.shl(e.dx, 4); - e.mov(e.rax, (uintptr_t)lvsl_table); - e.vmovaps(i.dest, e.ptr[e.rax + e.rdx]); - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_LOAD_VECTOR_SHL, LOAD_VECTOR_SHL_I8); - -// ============================================================================ -// OPCODE_LOAD_VECTOR_SHR -// ============================================================================ -static const vec128_t lvsr_table[16] = { - vec128b(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), - vec128b(15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30), - vec128b(14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29), - vec128b(13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28), - vec128b(12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27), - vec128b(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26), - vec128b(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25), - vec128b(9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24), - vec128b(8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23), - vec128b(7, 8, 9, 10, 11, 
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22), - vec128b(6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21), - vec128b(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20), - vec128b(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19), - vec128b(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18), - vec128b(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17), - vec128b(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), -}; -struct LOAD_VECTOR_SHR_I8 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - if (i.src1.is_constant) { - auto sh = i.src1.constant(); - assert_true(sh < xe::countof(lvsr_table)); - e.mov(e.rax, (uintptr_t)&lvsr_table[sh]); - e.vmovaps(i.dest, e.ptr[e.rax]); - } else { - // TODO(benvanik): find a cheaper way of doing this. - e.movzx(e.rdx, i.src1); - e.and_(e.dx, 0xF); - e.shl(e.dx, 4); - e.mov(e.rax, (uintptr_t)lvsr_table); - e.vmovaps(i.dest, e.ptr[e.rax + e.rdx]); - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_LOAD_VECTOR_SHR, LOAD_VECTOR_SHR_I8); - // ============================================================================ // OPCODE_LOAD_CLOCK // ============================================================================ @@ -1751,343 +450,6 @@ struct LOAD_CLOCK : Sequence> { }; EMITTER_OPCODE_TABLE(OPCODE_LOAD_CLOCK, LOAD_CLOCK); -// ============================================================================ -// OPCODE_LOAD_LOCAL -// ============================================================================ -// Note: all types are always aligned on the stack. 
-struct LOAD_LOCAL_I8 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.mov(i.dest, e.byte[e.rsp + i.src1.constant()]); - // e.TraceLoadI8(DATA_LOCAL, i.src1.constant, i.dest); - } -}; -struct LOAD_LOCAL_I16 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.mov(i.dest, e.word[e.rsp + i.src1.constant()]); - // e.TraceLoadI16(DATA_LOCAL, i.src1.constant, i.dest); - } -}; -struct LOAD_LOCAL_I32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.mov(i.dest, e.dword[e.rsp + i.src1.constant()]); - // e.TraceLoadI32(DATA_LOCAL, i.src1.constant, i.dest); - } -}; -struct LOAD_LOCAL_I64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.mov(i.dest, e.qword[e.rsp + i.src1.constant()]); - // e.TraceLoadI64(DATA_LOCAL, i.src1.constant, i.dest); - } -}; -struct LOAD_LOCAL_F32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vmovss(i.dest, e.dword[e.rsp + i.src1.constant()]); - // e.TraceLoadF32(DATA_LOCAL, i.src1.constant, i.dest); - } -}; -struct LOAD_LOCAL_F64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vmovsd(i.dest, e.qword[e.rsp + i.src1.constant()]); - // e.TraceLoadF64(DATA_LOCAL, i.src1.constant, i.dest); - } -}; -struct LOAD_LOCAL_V128 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vmovaps(i.dest, e.ptr[e.rsp + i.src1.constant()]); - // e.TraceLoadV128(DATA_LOCAL, i.src1.constant, i.dest); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_LOAD_LOCAL, LOAD_LOCAL_I8, LOAD_LOCAL_I16, - LOAD_LOCAL_I32, LOAD_LOCAL_I64, LOAD_LOCAL_F32, - LOAD_LOCAL_F64, LOAD_LOCAL_V128); - -// ============================================================================ -// OPCODE_STORE_LOCAL -// ============================================================================ -// Note: all types are always aligned on the stack. 
-struct STORE_LOCAL_I8 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - // e.TraceStoreI8(DATA_LOCAL, i.src1.constant, i.src2); - e.mov(e.byte[e.rsp + i.src1.constant()], i.src2); - } -}; -struct STORE_LOCAL_I16 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - // e.TraceStoreI16(DATA_LOCAL, i.src1.constant, i.src2); - e.mov(e.word[e.rsp + i.src1.constant()], i.src2); - } -}; -struct STORE_LOCAL_I32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - // e.TraceStoreI32(DATA_LOCAL, i.src1.constant, i.src2); - e.mov(e.dword[e.rsp + i.src1.constant()], i.src2); - } -}; -struct STORE_LOCAL_I64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - // e.TraceStoreI64(DATA_LOCAL, i.src1.constant, i.src2); - e.mov(e.qword[e.rsp + i.src1.constant()], i.src2); - } -}; -struct STORE_LOCAL_F32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - // e.TraceStoreF32(DATA_LOCAL, i.src1.constant, i.src2); - e.vmovss(e.dword[e.rsp + i.src1.constant()], i.src2); - } -}; -struct STORE_LOCAL_F64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - // e.TraceStoreF64(DATA_LOCAL, i.src1.constant, i.src2); - e.vmovsd(e.qword[e.rsp + i.src1.constant()], i.src2); - } -}; -struct STORE_LOCAL_V128 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - // e.TraceStoreV128(DATA_LOCAL, i.src1.constant, i.src2); - e.vmovaps(e.ptr[e.rsp + i.src1.constant()], i.src2); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_STORE_LOCAL, STORE_LOCAL_I8, STORE_LOCAL_I16, - STORE_LOCAL_I32, STORE_LOCAL_I64, STORE_LOCAL_F32, - STORE_LOCAL_F64, STORE_LOCAL_V128); - -// ============================================================================ -// OPCODE_LOAD_CONTEXT -// ============================================================================ -// Note: all types are always aligned in the context. 
-RegExp ComputeContextAddress(X64Emitter& e, const OffsetOp& offset) { - return e.GetContextReg() + offset.value; -} -struct LOAD_CONTEXT_I8 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeContextAddress(e, i.src1); - e.mov(i.dest, e.byte[addr]); - if (IsTracingData()) { - e.mov(e.r8, e.byte[addr]); - e.mov(e.rdx, i.src1.value); - e.CallNative(reinterpret_cast(TraceContextLoadI8)); - } - } -}; -struct LOAD_CONTEXT_I16 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeContextAddress(e, i.src1); - e.mov(i.dest, e.word[addr]); - if (IsTracingData()) { - e.mov(e.r8, e.word[addr]); - e.mov(e.rdx, i.src1.value); - e.CallNative(reinterpret_cast(TraceContextLoadI16)); - } - } -}; -struct LOAD_CONTEXT_I32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeContextAddress(e, i.src1); - e.mov(i.dest, e.dword[addr]); - if (IsTracingData()) { - e.mov(e.r8, e.dword[addr]); - e.mov(e.rdx, i.src1.value); - e.CallNative(reinterpret_cast(TraceContextLoadI32)); - } - } -}; -struct LOAD_CONTEXT_I64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeContextAddress(e, i.src1); - e.mov(i.dest, e.qword[addr]); - if (IsTracingData()) { - e.mov(e.r8, e.qword[addr]); - e.mov(e.rdx, i.src1.value); - e.CallNative(reinterpret_cast(TraceContextLoadI64)); - } - } -}; -struct LOAD_CONTEXT_F32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeContextAddress(e, i.src1); - e.vmovss(i.dest, e.dword[addr]); - if (IsTracingData()) { - e.lea(e.r8, e.dword[addr]); - e.mov(e.rdx, i.src1.value); - e.CallNative(reinterpret_cast(TraceContextLoadF32)); - } - } -}; -struct LOAD_CONTEXT_F64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeContextAddress(e, i.src1); - e.vmovsd(i.dest, e.qword[addr]); - if (IsTracingData()) { - e.lea(e.r8, 
e.qword[addr]); - e.mov(e.rdx, i.src1.value); - e.CallNative(reinterpret_cast(TraceContextLoadF64)); - } - } -}; -struct LOAD_CONTEXT_V128 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeContextAddress(e, i.src1); - e.vmovaps(i.dest, e.ptr[addr]); - if (IsTracingData()) { - e.lea(e.r8, e.ptr[addr]); - e.mov(e.rdx, i.src1.value); - e.CallNative(reinterpret_cast(TraceContextLoadV128)); - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_LOAD_CONTEXT, LOAD_CONTEXT_I8, LOAD_CONTEXT_I16, - LOAD_CONTEXT_I32, LOAD_CONTEXT_I64, LOAD_CONTEXT_F32, - LOAD_CONTEXT_F64, LOAD_CONTEXT_V128); - -// ============================================================================ -// OPCODE_STORE_CONTEXT -// ============================================================================ -// Note: all types are always aligned on the stack. -struct STORE_CONTEXT_I8 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeContextAddress(e, i.src1); - if (i.src2.is_constant) { - e.mov(e.byte[addr], i.src2.constant()); - } else { - e.mov(e.byte[addr], i.src2); - } - if (IsTracingData()) { - e.mov(e.r8, e.byte[addr]); - e.mov(e.rdx, i.src1.value); - e.CallNative(reinterpret_cast(TraceContextStoreI8)); - } - } -}; -struct STORE_CONTEXT_I16 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeContextAddress(e, i.src1); - if (i.src2.is_constant) { - e.mov(e.word[addr], i.src2.constant()); - } else { - e.mov(e.word[addr], i.src2); - } - if (IsTracingData()) { - e.mov(e.r8, e.word[addr]); - e.mov(e.rdx, i.src1.value); - e.CallNative(reinterpret_cast(TraceContextStoreI16)); - } - } -}; -struct STORE_CONTEXT_I32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeContextAddress(e, i.src1); - if (i.src2.is_constant) { - e.mov(e.dword[addr], i.src2.constant()); - } else { - e.mov(e.dword[addr], i.src2); - } - if (IsTracingData()) { - e.mov(e.r8, 
e.dword[addr]); - e.mov(e.rdx, i.src1.value); - e.CallNative(reinterpret_cast(TraceContextStoreI32)); - } - } -}; -struct STORE_CONTEXT_I64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeContextAddress(e, i.src1); - if (i.src2.is_constant) { - e.MovMem64(addr, i.src2.constant()); - } else { - e.mov(e.qword[addr], i.src2); - } - if (IsTracingData()) { - e.mov(e.r8, e.qword[addr]); - e.mov(e.rdx, i.src1.value); - e.CallNative(reinterpret_cast(TraceContextStoreI64)); - } - } -}; -struct STORE_CONTEXT_F32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeContextAddress(e, i.src1); - if (i.src2.is_constant) { - e.mov(e.dword[addr], i.src2.value->constant.i32); - } else { - e.vmovss(e.dword[addr], i.src2); - } - if (IsTracingData()) { - e.lea(e.r8, e.dword[addr]); - e.mov(e.rdx, i.src1.value); - e.CallNative(reinterpret_cast(TraceContextStoreF32)); - } - } -}; -struct STORE_CONTEXT_F64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeContextAddress(e, i.src1); - if (i.src2.is_constant) { - e.MovMem64(addr, i.src2.value->constant.i64); - } else { - e.vmovsd(e.qword[addr], i.src2); - } - if (IsTracingData()) { - e.lea(e.r8, e.qword[addr]); - e.mov(e.rdx, i.src1.value); - e.CallNative(reinterpret_cast(TraceContextStoreF64)); - } - } -}; -struct STORE_CONTEXT_V128 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeContextAddress(e, i.src1); - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.vmovaps(e.ptr[addr], e.xmm0); - } else { - e.vmovaps(e.ptr[addr], i.src2); - } - if (IsTracingData()) { - e.lea(e.r8, e.ptr[addr]); - e.mov(e.rdx, i.src1.value); - e.CallNative(reinterpret_cast(TraceContextStoreV128)); - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_STORE_CONTEXT, STORE_CONTEXT_I8, STORE_CONTEXT_I16, - STORE_CONTEXT_I32, STORE_CONTEXT_I64, STORE_CONTEXT_F32, - 
STORE_CONTEXT_F64, STORE_CONTEXT_V128); - // ============================================================================ // OPCODE_CONTEXT_BARRIER // ============================================================================ @@ -2097,601 +459,6 @@ struct CONTEXT_BARRIER }; EMITTER_OPCODE_TABLE(OPCODE_CONTEXT_BARRIER, CONTEXT_BARRIER); -// ============================================================================ -// OPCODE_LOAD_MMIO -// ============================================================================ -// Note: all types are always aligned in the context. -struct LOAD_MMIO_I32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - // uint64_t (context, addr) - auto mmio_range = reinterpret_cast(i.src1.value); - auto read_address = uint32_t(i.src2.value); - e.mov(e.r8, uint64_t(mmio_range->callback_context)); - e.mov(e.r9d, read_address); - e.CallNativeSafe(reinterpret_cast(mmio_range->read)); - e.bswap(e.eax); - e.mov(i.dest, e.eax); - if (IsTracingData()) { - e.mov(e.r8, i.dest); - e.mov(e.edx, read_address); - e.CallNative(reinterpret_cast(TraceContextLoadI32)); - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_LOAD_MMIO, LOAD_MMIO_I32); - -// ============================================================================ -// OPCODE_STORE_MMIO -// ============================================================================ -// Note: all types are always aligned on the stack. 
-struct STORE_MMIO_I32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - // void (context, addr, value) - auto mmio_range = reinterpret_cast(i.src1.value); - auto write_address = uint32_t(i.src2.value); - e.mov(e.r8, uint64_t(mmio_range->callback_context)); - e.mov(e.r9d, write_address); - if (i.src3.is_constant) { - e.mov(e.r10d, xe::byte_swap(i.src3.constant())); - } else { - e.mov(e.r10d, i.src3); - e.bswap(e.r10d); - } - e.CallNativeSafe(reinterpret_cast(mmio_range->write)); - if (IsTracingData()) { - if (i.src3.is_constant) { - e.mov(e.r8d, i.src3.constant()); - } else { - e.mov(e.r8d, i.src3); - } - e.mov(e.edx, write_address); - e.CallNative(reinterpret_cast(TraceContextStoreI32)); - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_STORE_MMIO, STORE_MMIO_I32); - -// ============================================================================ -// OPCODE_LOAD_OFFSET -// ============================================================================ -template -RegExp ComputeMemoryAddressOffset(X64Emitter& e, const T& guest, - const T& offset) { - int32_t offset_const = static_cast(offset.constant()); - - if (guest.is_constant) { - uint32_t address = static_cast(guest.constant()); - address += static_cast(offset.constant()); - if (address < 0x80000000) { - return e.GetMembaseReg() + address; - } else { - e.mov(e.eax, address); - return e.GetMembaseReg() + e.rax; - } - } else { - // Clear the top 32 bits, as they are likely garbage. - // TODO(benvanik): find a way to avoid doing this. 
- e.mov(e.eax, guest.reg().cvt32()); - return e.GetMembaseReg() + e.rax + offset_const; - } -} - -struct LOAD_OFFSET_I8 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); - e.mov(i.dest, e.byte[addr]); - } -}; - -struct LOAD_OFFSET_I16 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - if (e.IsFeatureEnabled(kX64EmitMovbe)) { - e.movbe(i.dest, e.word[addr]); - } else { - e.mov(i.dest, e.word[addr]); - e.ror(i.dest, 8); - } - } else { - e.mov(i.dest, e.word[addr]); - } - } -}; - -struct LOAD_OFFSET_I32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - if (e.IsFeatureEnabled(kX64EmitMovbe)) { - e.movbe(i.dest, e.dword[addr]); - } else { - e.mov(i.dest, e.dword[addr]); - e.bswap(i.dest); - } - } else { - e.mov(i.dest, e.dword[addr]); - } - } -}; - -struct LOAD_OFFSET_I64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - if (e.IsFeatureEnabled(kX64EmitMovbe)) { - e.movbe(i.dest, e.qword[addr]); - } else { - e.mov(i.dest, e.qword[addr]); - e.bswap(i.dest); - } - } else { - e.mov(i.dest, e.qword[addr]); - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_LOAD_OFFSET, LOAD_OFFSET_I8, LOAD_OFFSET_I16, - LOAD_OFFSET_I32, LOAD_OFFSET_I64); - -// ============================================================================ -// OPCODE_STORE_OFFSET -// ============================================================================ -struct STORE_OFFSET_I8 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddressOffset(e, 
i.src1, i.src2); - if (i.src3.is_constant) { - e.mov(e.byte[addr], i.src3.constant()); - } else { - e.mov(e.byte[addr], i.src3); - } - } -}; - -struct STORE_OFFSET_I16 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - assert_false(i.src3.is_constant); - if (e.IsFeatureEnabled(kX64EmitMovbe)) { - e.movbe(e.word[addr], i.src3); - } else { - assert_always("not implemented"); - } - } else { - if (i.src3.is_constant) { - e.mov(e.word[addr], i.src3.constant()); - } else { - e.mov(e.word[addr], i.src3); - } - } - } -}; - -struct STORE_OFFSET_I32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - assert_false(i.src3.is_constant); - if (e.IsFeatureEnabled(kX64EmitMovbe)) { - e.movbe(e.dword[addr], i.src3); - } else { - assert_always("not implemented"); - } - } else { - if (i.src3.is_constant) { - e.mov(e.dword[addr], i.src3.constant()); - } else { - e.mov(e.dword[addr], i.src3); - } - } - } -}; - -struct STORE_OFFSET_I64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - assert_false(i.src3.is_constant); - if (e.IsFeatureEnabled(kX64EmitMovbe)) { - e.movbe(e.qword[addr], i.src3); - } else { - assert_always("not implemented"); - } - } else { - if (i.src3.is_constant) { - e.MovMem64(addr, i.src3.constant()); - } else { - e.mov(e.qword[addr], i.src3); - } - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_STORE_OFFSET, STORE_OFFSET_I8, STORE_OFFSET_I16, - STORE_OFFSET_I32, STORE_OFFSET_I64); - -// ============================================================================ -// OPCODE_LOAD -// 
============================================================================ -// Note: most *should* be aligned, but needs to be checked! -template -RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) { - if (guest.is_constant) { - // TODO(benvanik): figure out how to do this without a temp. - // Since the constant is often 0x8... if we tried to use that as a - // displacement it would be sign extended and mess things up. - uint32_t address = static_cast(guest.constant()); - if (address < 0x80000000) { - return e.GetMembaseReg() + address; - } else { - e.mov(e.eax, address); - return e.GetMembaseReg() + e.rax; - } - } else { - // Clear the top 32 bits, as they are likely garbage. - // TODO(benvanik): find a way to avoid doing this. - e.mov(e.eax, guest.reg().cvt32()); - return e.GetMembaseReg() + e.rax; - } -} -struct LOAD_I8 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddress(e, i.src1); - e.mov(i.dest, e.byte[addr]); - if (IsTracingData()) { - e.mov(e.r8b, i.dest); - e.lea(e.rdx, e.ptr[addr]); - e.CallNative(reinterpret_cast(TraceMemoryLoadI8)); - } - } -}; -struct LOAD_I16 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddress(e, i.src1); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - if (e.IsFeatureEnabled(kX64EmitMovbe)) { - e.movbe(i.dest, e.word[addr]); - } else { - e.mov(i.dest, e.word[addr]); - e.ror(i.dest, 8); - } - } else { - e.mov(i.dest, e.word[addr]); - } - if (IsTracingData()) { - e.mov(e.r8w, i.dest); - e.lea(e.rdx, e.ptr[addr]); - e.CallNative(reinterpret_cast(TraceMemoryLoadI16)); - } - } -}; -struct LOAD_I32 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddress(e, i.src1); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - if (e.IsFeatureEnabled(kX64EmitMovbe)) { - e.movbe(i.dest, e.dword[addr]); - } else { - e.mov(i.dest, e.dword[addr]); - 
e.bswap(i.dest); - } - } else { - e.mov(i.dest, e.dword[addr]); - } - if (IsTracingData()) { - e.mov(e.r8d, i.dest); - e.lea(e.rdx, e.ptr[addr]); - e.CallNative(reinterpret_cast(TraceMemoryLoadI32)); - } - } -}; -struct LOAD_I64 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddress(e, i.src1); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - if (e.IsFeatureEnabled(kX64EmitMovbe)) { - e.movbe(i.dest, e.qword[addr]); - } else { - e.mov(i.dest, e.qword[addr]); - e.bswap(i.dest); - } - } else { - e.mov(i.dest, e.qword[addr]); - } - if (IsTracingData()) { - e.mov(e.r8, i.dest); - e.lea(e.rdx, e.ptr[addr]); - e.CallNative(reinterpret_cast(TraceMemoryLoadI64)); - } - } -}; -struct LOAD_F32 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddress(e, i.src1); - e.vmovss(i.dest, e.dword[addr]); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - assert_always("not implemented yet"); - } - if (IsTracingData()) { - e.lea(e.r8, e.dword[addr]); - e.lea(e.rdx, e.ptr[addr]); - e.CallNative(reinterpret_cast(TraceMemoryLoadF32)); - } - } -}; -struct LOAD_F64 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddress(e, i.src1); - e.vmovsd(i.dest, e.qword[addr]); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - assert_always("not implemented yet"); - } - if (IsTracingData()) { - e.lea(e.r8, e.qword[addr]); - e.lea(e.rdx, e.ptr[addr]); - e.CallNative(reinterpret_cast(TraceMemoryLoadF64)); - } - } -}; -struct LOAD_V128 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddress(e, i.src1); - // TODO(benvanik): we should try to stick to movaps if possible. - e.vmovups(i.dest, e.ptr[addr]); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - // TODO(benvanik): find a way to do this without the memory load. 
- e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteSwapMask)); - } - if (IsTracingData()) { - e.lea(e.r8, e.ptr[addr]); - e.lea(e.rdx, e.ptr[addr]); - e.CallNative(reinterpret_cast(TraceMemoryLoadV128)); - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_LOAD, LOAD_I8, LOAD_I16, LOAD_I32, LOAD_I64, - LOAD_F32, LOAD_F64, LOAD_V128); - -// ============================================================================ -// OPCODE_STORE -// ============================================================================ -// Note: most *should* be aligned, but needs to be checked! -struct STORE_I8 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddress(e, i.src1); - if (i.src2.is_constant) { - e.mov(e.byte[addr], i.src2.constant()); - } else { - e.mov(e.byte[addr], i.src2); - } - if (IsTracingData()) { - addr = ComputeMemoryAddress(e, i.src1); - e.mov(e.r8b, e.byte[addr]); - e.lea(e.rdx, e.ptr[addr]); - e.CallNative(reinterpret_cast(TraceMemoryStoreI8)); - } - } -}; -struct STORE_I16 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddress(e, i.src1); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - assert_false(i.src2.is_constant); - if (e.IsFeatureEnabled(kX64EmitMovbe)) { - e.movbe(e.word[addr], i.src2); - } else { - assert_always("not implemented"); - } - } else { - if (i.src2.is_constant) { - e.mov(e.word[addr], i.src2.constant()); - } else { - e.mov(e.word[addr], i.src2); - } - } - if (IsTracingData()) { - addr = ComputeMemoryAddress(e, i.src1); - e.mov(e.r8w, e.word[addr]); - e.lea(e.rdx, e.ptr[addr]); - e.CallNative(reinterpret_cast(TraceMemoryStoreI16)); - } - } -}; -struct STORE_I32 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddress(e, i.src1); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - assert_false(i.src2.is_constant); - if (e.IsFeatureEnabled(kX64EmitMovbe)) { - 
e.movbe(e.dword[addr], i.src2); - } else { - assert_always("not implemented"); - } - } else { - if (i.src2.is_constant) { - e.mov(e.dword[addr], i.src2.constant()); - } else { - e.mov(e.dword[addr], i.src2); - } - } - if (IsTracingData()) { - addr = ComputeMemoryAddress(e, i.src1); - e.mov(e.r8d, e.dword[addr]); - e.lea(e.rdx, e.ptr[addr]); - e.CallNative(reinterpret_cast(TraceMemoryStoreI32)); - } - } -}; -struct STORE_I64 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddress(e, i.src1); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - assert_false(i.src2.is_constant); - if (e.IsFeatureEnabled(kX64EmitMovbe)) { - e.movbe(e.qword[addr], i.src2); - } else { - assert_always("not implemented"); - } - } else { - if (i.src2.is_constant) { - e.MovMem64(addr, i.src2.constant()); - } else { - e.mov(e.qword[addr], i.src2); - } - } - if (IsTracingData()) { - addr = ComputeMemoryAddress(e, i.src1); - e.mov(e.r8, e.qword[addr]); - e.lea(e.rdx, e.ptr[addr]); - e.CallNative(reinterpret_cast(TraceMemoryStoreI64)); - } - } -}; -struct STORE_F32 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddress(e, i.src1); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - assert_false(i.src2.is_constant); - assert_always("not yet implemented"); - } else { - if (i.src2.is_constant) { - e.mov(e.dword[addr], i.src2.value->constant.i32); - } else { - e.vmovss(e.dword[addr], i.src2); - } - } - if (IsTracingData()) { - addr = ComputeMemoryAddress(e, i.src1); - e.lea(e.r8, e.ptr[addr]); - e.lea(e.rdx, e.ptr[addr]); - e.CallNative(reinterpret_cast(TraceMemoryStoreF32)); - } - } -}; -struct STORE_F64 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddress(e, i.src1); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - assert_false(i.src2.is_constant); - assert_always("not yet implemented"); - } else { - if 
(i.src2.is_constant) { - e.MovMem64(addr, i.src2.value->constant.i64); - } else { - e.vmovsd(e.qword[addr], i.src2); - } - } - if (IsTracingData()) { - addr = ComputeMemoryAddress(e, i.src1); - e.lea(e.r8, e.ptr[addr]); - e.lea(e.rdx, e.ptr[addr]); - e.CallNative(reinterpret_cast(TraceMemoryStoreF64)); - } - } -}; -struct STORE_V128 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddress(e, i.src1); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - assert_false(i.src2.is_constant); - e.vpshufb(e.xmm0, i.src2, e.GetXmmConstPtr(XMMByteSwapMask)); - e.vmovaps(e.ptr[addr], e.xmm0); - } else { - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.vmovaps(e.ptr[addr], e.xmm0); - } else { - e.vmovaps(e.ptr[addr], i.src2); - } - } - if (IsTracingData()) { - addr = ComputeMemoryAddress(e, i.src1); - e.lea(e.r8, e.ptr[addr]); - e.lea(e.rdx, e.ptr[addr]); - e.CallNative(reinterpret_cast(TraceMemoryStoreV128)); - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_STORE, STORE_I8, STORE_I16, STORE_I32, STORE_I64, - STORE_F32, STORE_F64, STORE_V128); - -// ============================================================================ -// OPCODE_PREFETCH -// ============================================================================ -struct PREFETCH - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - // TODO(benvanik): prefetch addr -> length. 
- } -}; -EMITTER_OPCODE_TABLE(OPCODE_PREFETCH, PREFETCH); - -// ============================================================================ -// OPCODE_MEMORY_BARRIER -// ============================================================================ -struct MEMORY_BARRIER - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { e.mfence(); } -}; -EMITTER_OPCODE_TABLE(OPCODE_MEMORY_BARRIER, MEMORY_BARRIER); - -// ============================================================================ -// OPCODE_MEMSET -// ============================================================================ -struct MEMSET_I64_I8_I64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - assert_true(i.src2.is_constant); - assert_true(i.src3.is_constant); - assert_true(i.src2.constant() == 0); - e.vpxor(e.xmm0, e.xmm0); - auto addr = ComputeMemoryAddress(e, i.src1); - switch (i.src3.constant()) { - case 32: - e.vmovaps(e.ptr[addr + 0 * 16], e.xmm0); - e.vmovaps(e.ptr[addr + 1 * 16], e.xmm0); - break; - case 128: - e.vmovaps(e.ptr[addr + 0 * 16], e.xmm0); - e.vmovaps(e.ptr[addr + 1 * 16], e.xmm0); - e.vmovaps(e.ptr[addr + 2 * 16], e.xmm0); - e.vmovaps(e.ptr[addr + 3 * 16], e.xmm0); - e.vmovaps(e.ptr[addr + 4 * 16], e.xmm0); - e.vmovaps(e.ptr[addr + 5 * 16], e.xmm0); - e.vmovaps(e.ptr[addr + 6 * 16], e.xmm0); - e.vmovaps(e.ptr[addr + 7 * 16], e.xmm0); - break; - default: - assert_unhandled_case(i.src3.constant()); - break; - } - if (IsTracingData()) { - addr = ComputeMemoryAddress(e, i.src1); - e.mov(e.r9, i.src3.constant()); - e.mov(e.r8, i.src2.constant()); - e.lea(e.rdx, e.ptr[addr]); - e.CallNative(reinterpret_cast(TraceMemset)); - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_MEMSET, MEMSET_I64_I8_I64); - // ============================================================================ // OPCODE_MAX // ============================================================================ @@ -2721,51 +488,6 @@ struct MAX_V128 : Sequence> { }; 
EMITTER_OPCODE_TABLE(OPCODE_MAX, MAX_F32, MAX_F64, MAX_V128); -// ============================================================================ -// OPCODE_VECTOR_MAX -// ============================================================================ -struct VECTOR_MAX - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitCommutativeBinaryXmmOp( - e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - uint32_t part_type = i.instr->flags >> 8; - if (i.instr->flags & ARITHMETIC_UNSIGNED) { - switch (part_type) { - case INT8_TYPE: - e.vpmaxub(dest, src1, src2); - break; - case INT16_TYPE: - e.vpmaxuw(dest, src1, src2); - break; - case INT32_TYPE: - e.vpmaxud(dest, src1, src2); - break; - default: - assert_unhandled_case(part_type); - break; - } - } else { - switch (part_type) { - case INT8_TYPE: - e.vpmaxsb(dest, src1, src2); - break; - case INT16_TYPE: - e.vpmaxsw(dest, src1, src2); - break; - case INT32_TYPE: - e.vpmaxsd(dest, src1, src2); - break; - default: - assert_unhandled_case(part_type); - break; - } - } - }); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_VECTOR_MAX, VECTOR_MAX); - // ============================================================================ // OPCODE_MIN // ============================================================================ @@ -2856,51 +578,6 @@ struct MIN_V128 : Sequence> { EMITTER_OPCODE_TABLE(OPCODE_MIN, MIN_I8, MIN_I16, MIN_I32, MIN_I64, MIN_F32, MIN_F64, MIN_V128); -// ============================================================================ -// OPCODE_VECTOR_MIN -// ============================================================================ -struct VECTOR_MIN - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitCommutativeBinaryXmmOp( - e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - uint32_t part_type = i.instr->flags >> 8; - if (i.instr->flags & ARITHMETIC_UNSIGNED) { - switch (part_type) { - case INT8_TYPE: - e.vpminub(dest, src1, src2); - break; - case 
INT16_TYPE: - e.vpminuw(dest, src1, src2); - break; - case INT32_TYPE: - e.vpminud(dest, src1, src2); - break; - default: - assert_unhandled_case(part_type); - break; - } - } else { - switch (part_type) { - case INT8_TYPE: - e.vpminsb(dest, src1, src2); - break; - case INT16_TYPE: - e.vpminsw(dest, src1, src2); - break; - case INT32_TYPE: - e.vpminsd(dest, src1, src2); - break; - default: - assert_unhandled_case(part_type); - break; - } - } - }); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_VECTOR_MIN, VECTOR_MIN); - // ============================================================================ // OPCODE_SELECT // ============================================================================ @@ -3417,213 +1094,6 @@ struct DID_SATURATE }; EMITTER_OPCODE_TABLE(OPCODE_DID_SATURATE, DID_SATURATE); -// ============================================================================ -// OPCODE_VECTOR_COMPARE_EQ -// ============================================================================ -struct VECTOR_COMPARE_EQ_V128 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitCommutativeBinaryXmmOp( - e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - switch (i.instr->flags) { - case INT8_TYPE: - e.vpcmpeqb(dest, src1, src2); - break; - case INT16_TYPE: - e.vpcmpeqw(dest, src1, src2); - break; - case INT32_TYPE: - e.vpcmpeqd(dest, src1, src2); - break; - case FLOAT32_TYPE: - e.vcmpeqps(dest, src1, src2); - break; - } - }); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_EQ, VECTOR_COMPARE_EQ_V128); - -// ============================================================================ -// OPCODE_VECTOR_COMPARE_SGT -// ============================================================================ -struct VECTOR_COMPARE_SGT_V128 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitAssociativeBinaryXmmOp( - e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - switch (i.instr->flags) { - case INT8_TYPE: - e.vpcmpgtb(dest, 
src1, src2); - break; - case INT16_TYPE: - e.vpcmpgtw(dest, src1, src2); - break; - case INT32_TYPE: - e.vpcmpgtd(dest, src1, src2); - break; - case FLOAT32_TYPE: - e.vcmpgtps(dest, src1, src2); - break; - } - }); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_SGT, VECTOR_COMPARE_SGT_V128); - -// ============================================================================ -// OPCODE_VECTOR_COMPARE_SGE -// ============================================================================ -struct VECTOR_COMPARE_SGE_V128 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitAssociativeBinaryXmmOp( - e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - switch (i.instr->flags) { - case INT8_TYPE: - e.vpcmpeqb(e.xmm0, src1, src2); - e.vpcmpgtb(dest, src1, src2); - e.vpor(dest, e.xmm0); - break; - case INT16_TYPE: - e.vpcmpeqw(e.xmm0, src1, src2); - e.vpcmpgtw(dest, src1, src2); - e.vpor(dest, e.xmm0); - break; - case INT32_TYPE: - e.vpcmpeqd(e.xmm0, src1, src2); - e.vpcmpgtd(dest, src1, src2); - e.vpor(dest, e.xmm0); - break; - case FLOAT32_TYPE: - e.vcmpgeps(dest, src1, src2); - break; - } - }); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_SGE, VECTOR_COMPARE_SGE_V128); - -// ============================================================================ -// OPCODE_VECTOR_COMPARE_UGT -// ============================================================================ -struct VECTOR_COMPARE_UGT_V128 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - Xbyak::Address sign_addr = e.ptr[e.rax]; // dummy - switch (i.instr->flags) { - case INT8_TYPE: - sign_addr = e.GetXmmConstPtr(XMMSignMaskI8); - break; - case INT16_TYPE: - sign_addr = e.GetXmmConstPtr(XMMSignMaskI16); - break; - case INT32_TYPE: - sign_addr = e.GetXmmConstPtr(XMMSignMaskI32); - break; - case FLOAT32_TYPE: - sign_addr = e.GetXmmConstPtr(XMMSignMaskF32); - break; - default: - assert_always(); - break; - } - if (i.src1.is_constant) { - // TODO(benvanik): 
make this constant. - e.LoadConstantXmm(e.xmm0, i.src1.constant()); - e.vpxor(e.xmm0, sign_addr); - } else { - e.vpxor(e.xmm0, i.src1, sign_addr); - } - if (i.src2.is_constant) { - // TODO(benvanik): make this constant. - e.LoadConstantXmm(e.xmm1, i.src2.constant()); - e.vpxor(e.xmm1, sign_addr); - } else { - e.vpxor(e.xmm1, i.src2, sign_addr); - } - switch (i.instr->flags) { - case INT8_TYPE: - e.vpcmpgtb(i.dest, e.xmm0, e.xmm1); - break; - case INT16_TYPE: - e.vpcmpgtw(i.dest, e.xmm0, e.xmm1); - break; - case INT32_TYPE: - e.vpcmpgtd(i.dest, e.xmm0, e.xmm1); - break; - case FLOAT32_TYPE: - e.vcmpgtps(i.dest, e.xmm0, e.xmm1); - break; - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_UGT, VECTOR_COMPARE_UGT_V128); - -// ============================================================================ -// OPCODE_VECTOR_COMPARE_UGE -// ============================================================================ -struct VECTOR_COMPARE_UGE_V128 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - Xbyak::Address sign_addr = e.ptr[e.rax]; // dummy - switch (i.instr->flags) { - case INT8_TYPE: - sign_addr = e.GetXmmConstPtr(XMMSignMaskI8); - break; - case INT16_TYPE: - sign_addr = e.GetXmmConstPtr(XMMSignMaskI16); - break; - case INT32_TYPE: - sign_addr = e.GetXmmConstPtr(XMMSignMaskI32); - break; - case FLOAT32_TYPE: - sign_addr = e.GetXmmConstPtr(XMMSignMaskF32); - break; - } - if (i.src1.is_constant) { - // TODO(benvanik): make this constant. - e.LoadConstantXmm(e.xmm0, i.src1.constant()); - e.vpxor(e.xmm0, sign_addr); - } else { - e.vpxor(e.xmm0, i.src1, sign_addr); - } - if (i.src2.is_constant) { - // TODO(benvanik): make this constant. 
- e.LoadConstantXmm(e.xmm1, i.src2.constant()); - e.vpxor(e.xmm1, sign_addr); - } else { - e.vpxor(e.xmm1, i.src2, sign_addr); - } - switch (i.instr->flags) { - case INT8_TYPE: - e.vpcmpeqb(e.xmm2, e.xmm0, e.xmm1); - e.vpcmpgtb(i.dest, e.xmm0, e.xmm1); - e.vpor(i.dest, e.xmm2); - break; - case INT16_TYPE: - e.vpcmpeqw(e.xmm2, e.xmm0, e.xmm1); - e.vpcmpgtw(i.dest, e.xmm0, e.xmm1); - e.vpor(i.dest, e.xmm2); - break; - case INT32_TYPE: - e.vpcmpeqd(e.xmm2, e.xmm0, e.xmm1); - e.vpcmpgtd(i.dest, e.xmm0, e.xmm1); - e.vpor(i.dest, e.xmm2); - break; - case FLOAT32_TYPE: - e.vcmpgeps(i.dest, e.xmm0, e.xmm1); - break; - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_UGE, VECTOR_COMPARE_UGE_V128); - // ============================================================================ // OPCODE_ADD // ============================================================================ @@ -3746,98 +1216,6 @@ struct ADD_CARRY_I64 EMITTER_OPCODE_TABLE(OPCODE_ADD_CARRY, ADD_CARRY_I8, ADD_CARRY_I16, ADD_CARRY_I32, ADD_CARRY_I64); -// ============================================================================ -// OPCODE_VECTOR_ADD -// ============================================================================ -struct VECTOR_ADD - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitCommutativeBinaryXmmOp( - e, i, [&i](X64Emitter& e, const Xmm& dest, Xmm src1, Xmm src2) { - const TypeName part_type = - static_cast(i.instr->flags & 0xFF); - const uint32_t arithmetic_flags = i.instr->flags >> 8; - bool is_unsigned = !!(arithmetic_flags & ARITHMETIC_UNSIGNED); - bool saturate = !!(arithmetic_flags & ARITHMETIC_SATURATE); - switch (part_type) { - case INT8_TYPE: - if (saturate) { - // TODO(benvanik): trace DID_SATURATE - if (is_unsigned) { - e.vpaddusb(dest, src1, src2); - } else { - e.vpaddsb(dest, src1, src2); - } - } else { - e.vpaddb(dest, src1, src2); - } - break; - case INT16_TYPE: - if (saturate) { - // TODO(benvanik): trace DID_SATURATE - if (is_unsigned) { - 
e.vpaddusw(dest, src1, src2); - } else { - e.vpaddsw(dest, src1, src2); - } - } else { - e.vpaddw(dest, src1, src2); - } - break; - case INT32_TYPE: - if (saturate) { - if (is_unsigned) { - // xmm0 is the only temp register that can be used by - // src1/src2. - e.vpaddd(e.xmm1, src1, src2); - - // If result is smaller than either of the inputs, we've - // overflowed (only need to check one input) - // if (src1 > res) then overflowed - // https://locklessinc.com/articles/sat_arithmetic/ - e.vpxor(e.xmm2, src1, e.GetXmmConstPtr(XMMSignMaskI32)); - e.vpxor(e.xmm0, e.xmm1, e.GetXmmConstPtr(XMMSignMaskI32)); - e.vpcmpgtd(e.xmm0, e.xmm2, e.xmm0); - e.vpor(dest, e.xmm1, e.xmm0); - } else { - e.vpaddd(e.xmm1, src1, src2); - - // Overflow results if two inputs are the same sign and the - // result isn't the same sign. if ((s32b)(~(src1 ^ src2) & - // (src1 ^ res)) < 0) then overflowed - // https://locklessinc.com/articles/sat_arithmetic/ - e.vpxor(e.xmm2, src1, src2); - e.vpxor(e.xmm3, src1, e.xmm1); - e.vpandn(e.xmm2, e.xmm2, e.xmm3); - - // Set any negative overflowed elements of src1 to INT_MIN - e.vpand(e.xmm3, src1, e.xmm2); - e.vblendvps(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMSignMaskI32), - e.xmm3); - - // Set any positive overflowed elements of src1 to INT_MAX - e.vpandn(e.xmm3, src1, e.xmm2); - e.vblendvps(dest, e.xmm1, e.GetXmmConstPtr(XMMAbsMaskPS), - e.xmm3); - } - } else { - e.vpaddd(dest, src1, src2); - } - break; - case FLOAT32_TYPE: - assert_false(is_unsigned); - assert_false(saturate); - e.vaddps(dest, src1, src2); - break; - default: - assert_unhandled_case(part_type); - break; - } - }); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_VECTOR_ADD, VECTOR_ADD); - // ============================================================================ // OPCODE_SUB // ============================================================================ @@ -3903,97 +1281,6 @@ struct SUB_V128 : Sequence> { EMITTER_OPCODE_TABLE(OPCODE_SUB, SUB_I8, SUB_I16, SUB_I32, SUB_I64, SUB_F32, SUB_F64, 
SUB_V128); -// ============================================================================ -// OPCODE_VECTOR_SUB -// ============================================================================ -struct VECTOR_SUB - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitCommutativeBinaryXmmOp( - e, i, [&i](X64Emitter& e, const Xmm& dest, Xmm src1, Xmm src2) { - const TypeName part_type = - static_cast(i.instr->flags & 0xFF); - const uint32_t arithmetic_flags = i.instr->flags >> 8; - bool is_unsigned = !!(arithmetic_flags & ARITHMETIC_UNSIGNED); - bool saturate = !!(arithmetic_flags & ARITHMETIC_SATURATE); - switch (part_type) { - case INT8_TYPE: - if (saturate) { - // TODO(benvanik): trace DID_SATURATE - if (is_unsigned) { - e.vpsubusb(dest, src1, src2); - } else { - e.vpsubsb(dest, src1, src2); - } - } else { - e.vpsubb(dest, src1, src2); - } - break; - case INT16_TYPE: - if (saturate) { - // TODO(benvanik): trace DID_SATURATE - if (is_unsigned) { - e.vpsubusw(dest, src1, src2); - } else { - e.vpsubsw(dest, src1, src2); - } - } else { - e.vpsubw(dest, src1, src2); - } - break; - case INT32_TYPE: - if (saturate) { - if (is_unsigned) { - // xmm0 is the only temp register that can be used by - // src1/src2. - e.vpsubd(e.xmm1, src1, src2); - - // If result is greater than either of the inputs, we've - // underflowed (only need to check one input) - // if (res > src1) then underflowed - // https://locklessinc.com/articles/sat_arithmetic/ - e.vpxor(e.xmm2, src1, e.GetXmmConstPtr(XMMSignMaskI32)); - e.vpxor(e.xmm0, e.xmm1, e.GetXmmConstPtr(XMMSignMaskI32)); - e.vpcmpgtd(e.xmm0, e.xmm0, e.xmm2); - e.vpandn(dest, e.xmm0, e.xmm1); - } else { - e.vpsubd(e.xmm1, src1, src2); - - // We can only overflow if the signs of the operands are - // opposite. If signs are opposite and result sign isn't the - // same as src1's sign, we've overflowed. 
if ((s32b)((src1 ^ - // src2) & (src1 ^ res)) < 0) then overflowed - // https://locklessinc.com/articles/sat_arithmetic/ - e.vpxor(e.xmm2, src1, src2); - e.vpxor(e.xmm3, src1, e.xmm1); - e.vpand(e.xmm2, e.xmm2, e.xmm3); - - // Set any negative overflowed elements of src1 to INT_MIN - e.vpand(e.xmm3, src1, e.xmm2); - e.vblendvps(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMSignMaskI32), - e.xmm3); - - // Set any positive overflowed elements of src1 to INT_MAX - e.vpandn(e.xmm3, src1, e.xmm2); - e.vblendvps(dest, e.xmm1, e.GetXmmConstPtr(XMMAbsMaskPS), - e.xmm3); - } - } else { - e.vpsubd(dest, src1, src2); - } - break; - case FLOAT32_TYPE: - e.vsubps(dest, src1, src2); - break; - default: - assert_unhandled_case(part_type); - break; - } - }); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SUB, VECTOR_SUB); - // ============================================================================ // OPCODE_MUL // ============================================================================ @@ -5073,7 +2360,7 @@ struct POW2_F32 : Sequence> { } static void Emit(X64Emitter& e, const EmitArgType& i) { assert_always(); - e.lea(e.r8, e.StashXmm(0, i.src1)); + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); e.CallNativeSafe(reinterpret_cast(EmulatePow2)); e.vmovaps(i.dest, e.xmm0); } @@ -5087,7 +2374,7 @@ struct POW2_F64 : Sequence> { } static void Emit(X64Emitter& e, const EmitArgType& i) { assert_always(); - e.lea(e.r8, e.StashXmm(0, i.src1)); + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); e.CallNativeSafe(reinterpret_cast(EmulatePow2)); e.vmovaps(i.dest, e.xmm0); } @@ -5102,7 +2389,7 @@ struct POW2_V128 : Sequence> { return _mm_load_ps(values); } static void Emit(X64Emitter& e, const EmitArgType& i) { - e.lea(e.r8, e.StashXmm(0, i.src1)); + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); e.CallNativeSafe(reinterpret_cast(EmulatePow2)); e.vmovaps(i.dest, e.xmm0); } @@ -5124,7 +2411,7 @@ struct LOG2_F32 : Sequence> { } static void Emit(X64Emitter& e, const EmitArgType& i) { 
assert_always(); - e.lea(e.r8, e.StashXmm(0, i.src1)); + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); e.CallNativeSafe(reinterpret_cast(EmulateLog2)); e.vmovaps(i.dest, e.xmm0); } @@ -5138,7 +2425,7 @@ struct LOG2_F64 : Sequence> { } static void Emit(X64Emitter& e, const EmitArgType& i) { assert_always(); - e.lea(e.r8, e.StashXmm(0, i.src1)); + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); e.CallNativeSafe(reinterpret_cast(EmulateLog2)); e.vmovaps(i.dest, e.xmm0); } @@ -5153,7 +2440,7 @@ struct LOG2_V128 : Sequence> { return _mm_load_ps(values); } static void Emit(X64Emitter& e, const EmitArgType& i) { - e.lea(e.r8, e.StashXmm(0, i.src1)); + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); e.CallNativeSafe(reinterpret_cast(EmulateLog2)); e.vmovaps(i.dest, e.xmm0); } @@ -5418,11 +2705,11 @@ struct SHL_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { // TODO(benvanik): native version (with shift magic). if (i.src2.is_constant) { - e.mov(e.r9, i.src2.constant()); + e.mov(e.GetNativeParam(1), i.src2.constant()); } else { - e.mov(e.r9, i.src2); + e.mov(e.GetNativeParam(1), i.src2); } - e.lea(e.r8, e.StashXmm(0, i.src1)); + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); e.CallNativeSafe(reinterpret_cast(EmulateShlV128)); e.vmovaps(i.dest, e.xmm0); } @@ -5495,11 +2782,11 @@ struct SHR_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { // TODO(benvanik): native version (with shift magic). 
if (i.src2.is_constant) { - e.mov(e.r9, i.src2.constant()); + e.mov(e.GetNativeParam(1), i.src2.constant()); } else { - e.mov(e.r9, i.src2); + e.mov(e.GetNativeParam(1), i.src2); } - e.lea(e.r8, e.StashXmm(0, i.src1)); + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); e.CallNativeSafe(reinterpret_cast(EmulateShrV128)); e.vmovaps(i.dest, e.xmm0); } @@ -5568,590 +2855,6 @@ struct SHA_I64 : Sequence> { }; EMITTER_OPCODE_TABLE(OPCODE_SHA, SHA_I8, SHA_I16, SHA_I32, SHA_I64); -// ============================================================================ -// OPCODE_VECTOR_SHL -// ============================================================================ -struct VECTOR_SHL_V128 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - switch (i.instr->flags) { - case INT8_TYPE: - EmitInt8(e, i); - break; - case INT16_TYPE: - EmitInt16(e, i); - break; - case INT32_TYPE: - EmitInt32(e, i); - break; - default: - assert_always(); - break; - } - } - static __m128i EmulateVectorShlI8(void*, __m128i src1, __m128i src2) { - alignas(16) uint8_t value[16]; - alignas(16) uint8_t shamt[16]; - _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); - for (size_t i = 0; i < 16; ++i) { - value[i] = value[i] << (shamt[i] & 0x7); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); - } - static void EmitInt8(X64Emitter& e, const EmitArgType& i) { - // TODO(benvanik): native version (with shift magic). 
- if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); - } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); - } - e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShlI8)); - e.vmovaps(i.dest, e.xmm0); - } - static __m128i EmulateVectorShlI16(void*, __m128i src1, __m128i src2) { - alignas(16) uint16_t value[8]; - alignas(16) uint16_t shamt[8]; - _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); - for (size_t i = 0; i < 8; ++i) { - value[i] = value[i] << (shamt[i] & 0xF); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); - } - static void EmitInt16(X64Emitter& e, const EmitArgType& i) { - Xmm src1; - if (i.src1.is_constant) { - src1 = e.xmm2; - e.LoadConstantXmm(src1, i.src1.constant()); - } else { - src1 = i.src1; - } - - if (i.src2.is_constant) { - const auto& shamt = i.src2.constant(); - bool all_same = true; - for (size_t n = 0; n < 8 - n; ++n) { - if (shamt.u16[n] != shamt.u16[n + 1]) { - all_same = false; - break; - } - } - if (all_same) { - // Every count is the same, so we can use vpsllw. - e.vpsllw(i.dest, src1, shamt.u16[0] & 0xF); - return; - } - } - - // Shift 8 words in src1 by amount specified in src2. - Xbyak::Label emu, end; - - // Only bother with this check if shift amt isn't constant. - if (!i.src2.is_constant) { - // See if the shift is equal first for a shortcut. - e.vpshuflw(e.xmm0, i.src2, 0b00000000); - e.vpshufd(e.xmm0, e.xmm0, 0b00000000); - e.vpxor(e.xmm1, e.xmm0, i.src2); - e.vptest(e.xmm1, e.xmm1); - e.jnz(emu); - - // Equal. Shift using vpsllw. - e.mov(e.rax, 0xF); - e.vmovq(e.xmm1, e.rax); - e.vpand(e.xmm0, e.xmm0, e.xmm1); - e.vpsllw(i.dest, src1, e.xmm0); - e.jmp(end); - } - - // TODO(benvanik): native version (with shift magic). 
- e.L(emu); - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); - } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); - } - e.lea(e.r8, e.StashXmm(0, src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShlI16)); - e.vmovaps(i.dest, e.xmm0); - - e.L(end); - } - static __m128i EmulateVectorShlI32(void*, __m128i src1, __m128i src2) { - alignas(16) uint32_t value[4]; - alignas(16) uint32_t shamt[4]; - _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); - for (size_t i = 0; i < 4; ++i) { - value[i] = value[i] << (shamt[i] & 0x1F); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); - } - static void EmitInt32(X64Emitter& e, const EmitArgType& i) { - Xmm src1; - if (i.src1.is_constant) { - src1 = e.xmm2; - e.LoadConstantXmm(src1, i.src1.constant()); - } - else { - src1 = i.src1; - } - - if (i.src2.is_constant) { - const auto& shamt = i.src2.constant(); - bool all_same = true; - for (size_t n = 0; n < 4 - n; ++n) { - if (shamt.u32[n] != shamt.u32[n + 1]) { - all_same = false; - break; - } - } - if (all_same) { - // Every count is the same, so we can use vpslld. - e.vpslld(i.dest, src1, shamt.u8[0] & 0x1F); - return; - } - } - - if (e.IsFeatureEnabled(kX64EmitAVX2)) { - if (i.src2.is_constant) { - const auto& shamt = i.src2.constant(); - // Counts differ, so pre-mask and load constant. - vec128_t masked = i.src2.constant(); - for (size_t n = 0; n < 4; ++n) { - masked.u32[n] &= 0x1F; - } - e.LoadConstantXmm(e.xmm0, masked); - e.vpsllvd(i.dest, src1, e.xmm0); - } else { - // Fully variable shift. - // src shift mask may have values >31, and x86 sets to zero when - // that happens so we mask. - e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS)); - e.vpsllvd(i.dest, src1, e.xmm0); - } - } else { - // Shift 4 words in src1 by amount specified in src2. - Xbyak::Label emu, end; - - // See if the shift is equal first for a shortcut. 
- // Only bother with this check if shift amt isn't constant. - if (!i.src2.is_constant) { - e.vpshufd(e.xmm0, i.src2, 0b00000000); - e.vpxor(e.xmm1, e.xmm0, i.src2); - e.vptest(e.xmm1, e.xmm1); - e.jnz(emu); - - // Equal. Shift using vpsrad. - e.mov(e.rax, 0x1F); - e.vmovq(e.xmm1, e.rax); - e.vpand(e.xmm0, e.xmm0, e.xmm1); - - e.vpslld(i.dest, src1, e.xmm0); - e.jmp(end); - } - - // TODO(benvanik): native version (with shift magic). - e.L(emu); - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); - } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); - } - e.lea(e.r8, e.StashXmm(0, src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShlI32)); - e.vmovaps(i.dest, e.xmm0); - - e.L(end); - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHL, VECTOR_SHL_V128); - -// ============================================================================ -// OPCODE_VECTOR_SHR -// ============================================================================ -struct VECTOR_SHR_V128 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - switch (i.instr->flags) { - case INT8_TYPE: - EmitInt8(e, i); - break; - case INT16_TYPE: - EmitInt16(e, i); - break; - case INT32_TYPE: - EmitInt32(e, i); - break; - default: - assert_always(); - break; - } - } - static __m128i EmulateVectorShrI8(void*, __m128i src1, __m128i src2) { - alignas(16) uint8_t value[16]; - alignas(16) uint8_t shamt[16]; - _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); - for (size_t i = 0; i < 16; ++i) { - value[i] = value[i] >> (shamt[i] & 0x7); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); - } - static void EmitInt8(X64Emitter& e, const EmitArgType& i) { - // TODO(benvanik): native version (with shift magic). 
- if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); - } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); - } - e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShrI8)); - e.vmovaps(i.dest, e.xmm0); - } - static __m128i EmulateVectorShrI16(void*, __m128i src1, __m128i src2) { - alignas(16) uint16_t value[8]; - alignas(16) uint16_t shamt[8]; - _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); - for (size_t i = 0; i < 8; ++i) { - value[i] = value[i] >> (shamt[i] & 0xF); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); - } - static void EmitInt16(X64Emitter& e, const EmitArgType& i) { - if (i.src2.is_constant) { - const auto& shamt = i.src2.constant(); - bool all_same = true; - for (size_t n = 0; n < 8 - n; ++n) { - if (shamt.u16[n] != shamt.u16[n + 1]) { - all_same = false; - break; - } - } - if (all_same) { - // Every count is the same, so we can use vpsllw. - e.vpsrlw(i.dest, i.src1, shamt.u16[0] & 0xF); - return; - } - } - - // Shift 8 words in src1 by amount specified in src2. - Xbyak::Label emu, end; - - // See if the shift is equal first for a shortcut. - // Only bother with this check if shift amt isn't constant. - if (!i.src2.is_constant) { - e.vpshuflw(e.xmm0, i.src2, 0b00000000); - e.vpshufd(e.xmm0, e.xmm0, 0b00000000); - e.vpxor(e.xmm1, e.xmm0, i.src2); - e.vptest(e.xmm1, e.xmm1); - e.jnz(emu); - - // Equal. Shift using vpsrlw. - e.mov(e.rax, 0xF); - e.vmovq(e.xmm1, e.rax); - e.vpand(e.xmm0, e.xmm0, e.xmm1); - e.vpsrlw(i.dest, i.src1, e.xmm0); - e.jmp(end); - } - - // TODO(benvanik): native version (with shift magic). 
- e.L(emu); - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); - } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); - } - e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShrI16)); - e.vmovaps(i.dest, e.xmm0); - - e.L(end); - } - static __m128i EmulateVectorShrI32(void*, __m128i src1, __m128i src2) { - alignas(16) uint32_t value[4]; - alignas(16) uint32_t shamt[4]; - _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); - for (size_t i = 0; i < 4; ++i) { - value[i] = value[i] >> (shamt[i] & 0x1F); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); - } - static void EmitInt32(X64Emitter& e, const EmitArgType& i) { - Xmm src1; - if (i.src1.is_constant) { - src1 = e.xmm2; - e.LoadConstantXmm(src1, i.src1.constant()); - } - else { - src1 = i.src1; - } - - if (i.src2.is_constant) { - const auto& shamt = i.src2.constant(); - bool all_same = true; - for (size_t n = 0; n < 4 - n; ++n) { - if (shamt.u32[n] != shamt.u32[n + 1]) { - all_same = false; - break; - } - } - if (all_same) { - // Every count is the same, so we can use vpsrld. - e.vpsrld(i.dest, src1, shamt.u8[0] & 0x1F); - return; - } else { - if (e.IsFeatureEnabled(kX64EmitAVX2)) { - // Counts differ, so pre-mask and load constant. - vec128_t masked = i.src2.constant(); - for (size_t n = 0; n < 4; ++n) { - masked.u32[n] &= 0x1F; - } - e.LoadConstantXmm(e.xmm0, masked); - e.vpsrlvd(i.dest, src1, e.xmm0); - return; - } - } - } - - if (e.IsFeatureEnabled(kX64EmitAVX2)) { - // Fully variable shift. - // src shift mask may have values >31, and x86 sets to zero when - // that happens so we mask. - e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS)); - e.vpsrlvd(i.dest, src1, e.xmm0); - } else { - // Shift 4 words in src1 by amount specified in src2. - Xbyak::Label emu, end; - - // See if the shift is equal first for a shortcut. 
- // Only bother with this check if shift amt isn't constant. - if (!i.src2.is_constant) { - e.vpshufd(e.xmm0, i.src2, 0b00000000); - e.vpxor(e.xmm1, e.xmm0, i.src2); - e.vptest(e.xmm1, e.xmm1); - e.jnz(emu); - - // Equal. Shift using vpsrld. - e.mov(e.rax, 0x1F); - e.vmovq(e.xmm1, e.rax); - e.vpand(e.xmm0, e.xmm0, e.xmm1); - e.vpsrld(i.dest, src1, e.xmm0); - e.jmp(end); - } - - // TODO(benvanik): native version. - e.L(emu); - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); - } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); - } - e.lea(e.r8, e.StashXmm(0, src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShrI32)); - e.vmovaps(i.dest, e.xmm0); - - e.L(end); - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHR, VECTOR_SHR_V128); - -// ============================================================================ -// OPCODE_VECTOR_SHA -// ============================================================================ -struct VECTOR_SHA_V128 - : Sequence> { - static __m128i EmulateVectorShaI8(void*, __m128i src1, __m128i src2) { - alignas(16) int8_t value[16]; - alignas(16) int8_t shamt[16]; - _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); - for (size_t i = 0; i < 16; ++i) { - value[i] = value[i] >> (shamt[i] & 0x7); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); - } - - static void EmitInt8(X64Emitter& e, const EmitArgType& i) { - // TODO(benvanik): native version (with shift magic). 
- if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); - } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); - } - e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShaI8)); - e.vmovaps(i.dest, e.xmm0); - } - - static __m128i EmulateVectorShaI16(void*, __m128i src1, __m128i src2) { - alignas(16) int16_t value[8]; - alignas(16) int16_t shamt[8]; - _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); - for (size_t i = 0; i < 8; ++i) { - value[i] = value[i] >> (shamt[i] & 0xF); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); - } - - static void EmitInt16(X64Emitter& e, const EmitArgType& i) { - if (i.src2.is_constant) { - const auto& shamt = i.src2.constant(); - bool all_same = true; - for (size_t n = 0; n < 8 - n; ++n) { - if (shamt.u16[n] != shamt.u16[n + 1]) { - all_same = false; - break; - } - } - if (all_same) { - // Every count is the same, so we can use vpsraw. - e.vpsraw(i.dest, i.src1, shamt.u16[0] & 0xF); - return; - } - } - - // Shift 8 words in src1 by amount specified in src2. - Xbyak::Label emu, end; - - // See if the shift is equal first for a shortcut. - // Only bother with this check if shift amt isn't constant. - if (!i.src2.is_constant) { - e.vpshuflw(e.xmm0, i.src2, 0b00000000); - e.vpshufd(e.xmm0, e.xmm0, 0b00000000); - e.vpxor(e.xmm1, e.xmm0, i.src2); - e.vptest(e.xmm1, e.xmm1); - e.jnz(emu); - - // Equal. Shift using vpsraw. - e.mov(e.rax, 0xF); - e.vmovq(e.xmm1, e.rax); - e.vpand(e.xmm0, e.xmm0, e.xmm1); - e.vpsraw(i.dest, i.src1, e.xmm0); - e.jmp(end); - } - - // TODO(benvanik): native version (with shift magic). 
- e.L(emu); - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); - } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); - } - e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShaI16)); - e.vmovaps(i.dest, e.xmm0); - - e.L(end); - } - - static __m128i EmulateVectorShaI32(void*, __m128i src1, __m128i src2) { - alignas(16) int32_t value[4]; - alignas(16) int32_t shamt[4]; - _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); - for (size_t i = 0; i < 4; ++i) { - value[i] = value[i] >> (shamt[i] & 0x1F); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); - } - - static void EmitInt32(X64Emitter& e, const EmitArgType& i) { - if (i.src2.is_constant) { - const auto& shamt = i.src2.constant(); - bool all_same = true; - for (size_t n = 0; n < 4 - n; ++n) { - if (shamt.u32[n] != shamt.u32[n + 1]) { - all_same = false; - break; - } - } - if (all_same) { - // Every count is the same, so we can use vpsrad. - e.vpsrad(i.dest, i.src1, shamt.u32[0] & 0x1F); - return; - } - } - - if (e.IsFeatureEnabled(kX64EmitAVX2)) { - // src shift mask may have values >31, and x86 sets to zero when - // that happens so we mask. - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.vandps(e.xmm0, e.GetXmmConstPtr(XMMShiftMaskPS)); - } else { - e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS)); - } - e.vpsravd(i.dest, i.src1, e.xmm0); - } else { - // Shift 4 words in src1 by amount specified in src2. - Xbyak::Label emu, end; - - // See if the shift is equal first for a shortcut. - // Only bother with this check if shift amt isn't constant. - if (!i.src2.is_constant) { - e.vpshufd(e.xmm0, i.src2, 0b00000000); - e.vpxor(e.xmm1, e.xmm0, i.src2); - e.vptest(e.xmm1, e.xmm1); - e.jnz(emu); - - // Equal. Shift using vpsrad. 
- e.mov(e.rax, 0x1F); - e.vmovq(e.xmm1, e.rax); - e.vpand(e.xmm0, e.xmm0, e.xmm1); - e.vpsrad(i.dest, i.src1, e.xmm0); - e.jmp(end); - } - - // TODO(benvanik): native version. - e.L(emu); - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); - } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); - } - e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShaI32)); - e.vmovaps(i.dest, e.xmm0); - - e.L(end); - } - } - - static void Emit(X64Emitter& e, const EmitArgType& i) { - switch (i.instr->flags) { - case INT8_TYPE: - EmitInt8(e, i); - break; - case INT16_TYPE: - EmitInt16(e, i); - break; - case INT32_TYPE: - EmitInt32(e, i); - break; - default: - assert_always(); - break; - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHA, VECTOR_SHA_V128); - // ============================================================================ // OPCODE_ROTATE_LEFT // ============================================================================ @@ -6210,196 +2913,6 @@ struct ROTATE_LEFT_I64 EMITTER_OPCODE_TABLE(OPCODE_ROTATE_LEFT, ROTATE_LEFT_I8, ROTATE_LEFT_I16, ROTATE_LEFT_I32, ROTATE_LEFT_I64); -// ============================================================================ -// OPCODE_VECTOR_ROTATE_LEFT -// ============================================================================ -// TODO(benvanik): AVX512 has a native variable rotate (rolv). 
-struct VECTOR_ROTATE_LEFT_V128 - : Sequence> { - static __m128i EmulateVectorRotateLeftI8(void*, __m128i src1, __m128i src2) { - alignas(16) uint8_t value[16]; - alignas(16) uint8_t shamt[16]; - _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); - for (size_t i = 0; i < 16; ++i) { - value[i] = xe::rotate_left(value[i], shamt[i] & 0x7); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); - } - static __m128i EmulateVectorRotateLeftI16(void*, __m128i src1, __m128i src2) { - alignas(16) uint16_t value[8]; - alignas(16) uint16_t shamt[8]; - _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); - for (size_t i = 0; i < 8; ++i) { - value[i] = xe::rotate_left(value[i], shamt[i] & 0xF); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); - } - static __m128i EmulateVectorRotateLeftI32(void*, __m128i src1, __m128i src2) { - alignas(16) uint32_t value[4]; - alignas(16) uint32_t shamt[4]; - _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); - for (size_t i = 0; i < 4; ++i) { - value[i] = xe::rotate_left(value[i], shamt[i] & 0x1F); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); - } - static void Emit(X64Emitter& e, const EmitArgType& i) { - switch (i.instr->flags) { - case INT8_TYPE: - // TODO(benvanik): native version (with shift magic). - e.lea(e.r8, e.StashXmm(0, i.src1)); - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); - } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); - } - e.CallNativeSafe(reinterpret_cast(EmulateVectorRotateLeftI8)); - e.vmovaps(i.dest, e.xmm0); - break; - case INT16_TYPE: - // TODO(benvanik): native version (with shift magic). 
- e.lea(e.r8, e.StashXmm(0, i.src1)); - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); - } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); - } - e.CallNativeSafe(reinterpret_cast(EmulateVectorRotateLeftI16)); - e.vmovaps(i.dest, e.xmm0); - break; - case INT32_TYPE: { - if (e.IsFeatureEnabled(kX64EmitAVX2)) { - Xmm temp = i.dest; - if (i.dest == i.src1 || i.dest == i.src2) { - temp = e.xmm2; - } - // Shift left (to get high bits): - e.vpand(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS)); - e.vpsllvd(e.xmm1, i.src1, e.xmm0); - // Shift right (to get low bits): - e.vmovaps(temp, e.GetXmmConstPtr(XMMPI32)); - e.vpsubd(temp, e.xmm0); - e.vpsrlvd(i.dest, i.src1, temp); - // Merge: - e.vpor(i.dest, e.xmm1); - } else { - // TODO(benvanik): non-AVX2 native version. - e.lea(e.r8, e.StashXmm(0, i.src1)); - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); - } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); - } - e.CallNativeSafe(reinterpret_cast(EmulateVectorRotateLeftI32)); - e.vmovaps(i.dest, e.xmm0); - } - break; - } - default: - assert_always(); - break; - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_VECTOR_ROTATE_LEFT, VECTOR_ROTATE_LEFT_V128); - -// ============================================================================ -// OPCODE_VECTOR_AVERAGE -// ============================================================================ -struct VECTOR_AVERAGE - : Sequence> { - static __m128i EmulateVectorAverageUnsignedI32(void*, __m128i src1, - __m128i src2) { - alignas(16) uint32_t src1v[4]; - alignas(16) uint32_t src2v[4]; - alignas(16) uint32_t value[4]; - _mm_store_si128(reinterpret_cast<__m128i*>(src1v), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(src2v), src2); - for (size_t i = 0; i < 4; ++i) { - auto t = (uint64_t(src1v[i]) + uint64_t(src2v[i]) + 1) >> 1; - value[i] = uint32_t(t); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); 
- } - static __m128i EmulateVectorAverageSignedI32(void*, __m128i src1, - __m128i src2) { - alignas(16) int32_t src1v[4]; - alignas(16) int32_t src2v[4]; - alignas(16) int32_t value[4]; - _mm_store_si128(reinterpret_cast<__m128i*>(src1v), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(src2v), src2); - for (size_t i = 0; i < 4; ++i) { - auto t = (int64_t(src1v[i]) + int64_t(src2v[i]) + 1) >> 1; - value[i] = int32_t(t); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); - } - static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitCommutativeBinaryXmmOp( - e, i, - [&i](X64Emitter& e, const Xmm& dest, const Xmm& src1, const Xmm& src2) { - const TypeName part_type = - static_cast(i.instr->flags & 0xFF); - const uint32_t arithmetic_flags = i.instr->flags >> 8; - bool is_unsigned = !!(arithmetic_flags & ARITHMETIC_UNSIGNED); - switch (part_type) { - case INT8_TYPE: - if (is_unsigned) { - e.vpavgb(dest, src1, src2); - } else { - assert_always(); - } - break; - case INT16_TYPE: - if (is_unsigned) { - e.vpavgw(dest, src1, src2); - } else { - assert_always(); - } - break; - case INT32_TYPE: - // No 32bit averages in AVX. 
- if (is_unsigned) { - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); - } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); - } - e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe( - reinterpret_cast(EmulateVectorAverageUnsignedI32)); - e.vmovaps(i.dest, e.xmm0); - } else { - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); - } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); - } - e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe( - reinterpret_cast(EmulateVectorAverageSignedI32)); - e.vmovaps(i.dest, e.xmm0); - } - break; - default: - assert_unhandled_case(part_type); - break; - } - }); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_VECTOR_AVERAGE, VECTOR_AVERAGE); - // ============================================================================ // OPCODE_BYTE_SWAP // ============================================================================ @@ -6525,1275 +3038,6 @@ struct CNTLZ_I64 : Sequence> { }; EMITTER_OPCODE_TABLE(OPCODE_CNTLZ, CNTLZ_I8, CNTLZ_I16, CNTLZ_I32, CNTLZ_I64); -// ============================================================================ -// OPCODE_INSERT -// ============================================================================ -struct INSERT_I8 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - assert_true(i.src2.is_constant); - e.vpinsrb(i.dest, i.src3.reg().cvt32(), i.src2.constant() ^ 0x3); - } -}; -struct INSERT_I16 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - assert_true(i.src2.is_constant); - e.vpinsrw(i.dest, i.src3.reg().cvt32(), i.src2.constant() ^ 0x1); - } -}; -struct INSERT_I32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - assert_true(i.src2.is_constant); - e.vpinsrd(i.dest, i.src3, i.src2.constant()); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_INSERT, INSERT_I8, INSERT_I16, INSERT_I32); - -// 
============================================================================ -// OPCODE_EXTRACT -// ============================================================================ -// TODO(benvanik): sequence extract/splat: -// v0.i32 = extract v0.v128, 0 -// v0.v128 = splat v0.i32 -// This can be a single broadcast. -struct EXTRACT_I8 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - if (i.src2.is_constant) { - e.vpextrb(i.dest.reg().cvt32(), i.src1, VEC128_B(i.src2.constant())); - } else { - e.mov(e.eax, 0x00000003); - e.xor_(e.al, i.src2); - e.and_(e.al, 0x1F); - e.vmovd(e.xmm0, e.eax); - e.vpshufb(e.xmm0, i.src1, e.xmm0); - e.vmovd(i.dest.reg().cvt32(), e.xmm0); - e.and_(i.dest, uint8_t(0xFF)); - } - } -}; -struct EXTRACT_I16 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - if (i.src2.is_constant) { - e.vpextrw(i.dest.reg().cvt32(), i.src1, VEC128_W(i.src2.constant())); - } else { - e.mov(e.al, i.src2); - e.xor_(e.al, 0x01); - e.shl(e.al, 1); - e.mov(e.ah, e.al); - e.add(e.ah, 1); - e.vmovd(e.xmm0, e.eax); - e.vpshufb(e.xmm0, i.src1, e.xmm0); - e.vmovd(i.dest.reg().cvt32(), e.xmm0); - e.and_(i.dest.reg().cvt32(), 0xFFFFu); - } - } -}; -struct EXTRACT_I32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - static const vec128_t extract_table_32[4] = { - vec128b(3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), - vec128b(7, 6, 5, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), - vec128b(11, 10, 9, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), - vec128b(15, 14, 13, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), - }; - if (i.src2.is_constant) { - // TODO(gibbed): add support to constant propagation pass for - // OPCODE_EXTRACT. 
- Xmm src1; - if (i.src1.is_constant) { - src1 = e.xmm0; - e.LoadConstantXmm(src1, i.src1.constant()); - } else { - src1 = i.src1; - } - if (i.src2.constant() == 0) { - e.vmovd(i.dest, src1); - } else { - e.vpextrd(i.dest, src1, VEC128_D(i.src2.constant())); - } - } else { - // TODO(benvanik): try out hlide's version: - // e.mov(e.eax, 3); - // e.and_(e.al, i.src2); // eax = [(i&3), 0, 0, 0] - // e.imul(e.eax, 0x04040404); // [(i&3)*4, (i&3)*4, (i&3)*4, (i&3)*4] - // e.add(e.eax, 0x00010203); // [((i&3)*4)+3, ((i&3)*4)+2, ((i&3)*4)+1, - // ((i&3)*4)+0] - // e.vmovd(e.xmm0, e.eax); - // e.vpshufb(e.xmm0, i.src1, e.xmm0); - // e.vmovd(i.dest.reg().cvt32(), e.xmm0); - // Get the desired word in xmm0, then extract that. - Xmm src1; - if (i.src1.is_constant) { - src1 = e.xmm1; - e.LoadConstantXmm(src1, i.src1.constant()); - } else { - src1 = i.src1.reg(); - } - - e.xor_(e.rax, e.rax); - e.mov(e.al, i.src2); - e.and_(e.al, 0x03); - e.shl(e.al, 4); - e.mov(e.rdx, reinterpret_cast(extract_table_32)); - e.vmovaps(e.xmm0, e.ptr[e.rdx + e.rax]); - e.vpshufb(e.xmm0, src1, e.xmm0); - e.vpextrd(i.dest, e.xmm0, 0); - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_EXTRACT, EXTRACT_I8, EXTRACT_I16, EXTRACT_I32); - -// ============================================================================ -// OPCODE_SPLAT -// ============================================================================ -// Copy a value into all elements of a vector -struct SPLAT_I8 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - if (i.src1.is_constant) { - // TODO(benvanik): faster constant splats. 
- e.mov(e.eax, i.src1.constant()); - e.vmovd(e.xmm0, e.eax); - } else { - e.vmovd(e.xmm0, i.src1.reg().cvt32()); - } - - if (e.IsFeatureEnabled(kX64EmitAVX2)) { - e.vpbroadcastb(i.dest, e.xmm0); - } else { - e.vpunpcklbw(e.xmm0, e.xmm0); - e.vpunpcklwd(e.xmm0, e.xmm0); - e.vpshufd(i.dest, e.xmm0, 0); - } - } -}; -struct SPLAT_I16 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - if (i.src1.is_constant) { - // TODO(benvanik): faster constant splats. - e.mov(e.eax, i.src1.constant()); - e.vmovd(e.xmm0, e.eax); - } else { - e.vmovd(e.xmm0, i.src1.reg().cvt32()); - } - - if (e.IsFeatureEnabled(kX64EmitAVX2)) { - e.vpbroadcastw(i.dest, e.xmm0); - } else { - e.vpunpcklwd(e.xmm0, e.xmm0); // unpack low word data - e.vpshufd(i.dest, e.xmm0, 0); - } - } -}; -struct SPLAT_I32 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - if (i.src1.is_constant) { - // TODO(benvanik): faster constant splats. - e.mov(e.eax, i.src1.constant()); - e.vmovd(e.xmm0, e.eax); - } else { - e.vmovd(e.xmm0, i.src1); - } - - if (e.IsFeatureEnabled(kX64EmitAVX2)) { - e.vpbroadcastd(i.dest, e.xmm0); - } else { - e.vpshufd(i.dest, e.xmm0, 0); - } - } -}; -struct SPLAT_F32 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - if (e.IsFeatureEnabled(kX64EmitAVX2)) { - if (i.src1.is_constant) { - // TODO(benvanik): faster constant splats. 
- e.mov(e.eax, i.src1.value->constant.i32); - e.vmovd(e.xmm0, e.eax); - e.vbroadcastss(i.dest, e.xmm0); - } else { - e.vbroadcastss(i.dest, i.src1); - } - } else { - if (i.src1.is_constant) { - e.mov(e.eax, i.src1.value->constant.i32); - e.vmovd(i.dest, e.eax); - e.vshufps(i.dest, i.dest, i.dest, 0); - } else { - e.vshufps(i.dest, i.src1, i.src1, 0); - } - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_SPLAT, SPLAT_I8, SPLAT_I16, SPLAT_I32, SPLAT_F32); - -// ============================================================================ -// OPCODE_PERMUTE -// ============================================================================ -struct PERMUTE_I32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - assert_true(i.instr->flags == INT32_TYPE); - // Permute words between src2 and src3. - // TODO(benvanik): check src3 for zero. if 0, we can use pshufb. - if (i.src1.is_constant) { - uint32_t control = i.src1.constant(); - // Shuffle things into the right places in dest & xmm0, - // then we blend them together. - uint32_t src_control = - (((control >> 24) & 0x3) << 6) | (((control >> 16) & 0x3) << 4) | - (((control >> 8) & 0x3) << 2) | (((control >> 0) & 0x3) << 0); - - uint32_t blend_control = 0; - if (e.IsFeatureEnabled(kX64EmitAVX2)) { - // Blender for vpblendd - blend_control = - (((control >> 26) & 0x1) << 3) | (((control >> 18) & 0x1) << 2) | - (((control >> 10) & 0x1) << 1) | (((control >> 2) & 0x1) << 0); - } else { - // Blender for vpblendw - blend_control = - (((control >> 26) & 0x1) << 6) | (((control >> 18) & 0x1) << 4) | - (((control >> 10) & 0x1) << 2) | (((control >> 2) & 0x1) << 0); - blend_control |= blend_control << 1; - } - - // TODO(benvanik): if src2/src3 are constants, shuffle now! 
- Xmm src2; - if (i.src2.is_constant) { - src2 = e.xmm1; - e.LoadConstantXmm(src2, i.src2.constant()); - } else { - src2 = i.src2; - } - Xmm src3; - if (i.src3.is_constant) { - src3 = e.xmm2; - e.LoadConstantXmm(src3, i.src3.constant()); - } else { - src3 = i.src3; - } - if (i.dest != src3) { - e.vpshufd(i.dest, src2, src_control); - e.vpshufd(e.xmm0, src3, src_control); - } else { - e.vmovaps(e.xmm0, src3); - e.vpshufd(i.dest, src2, src_control); - e.vpshufd(e.xmm0, e.xmm0, src_control); - } - - if (e.IsFeatureEnabled(kX64EmitAVX2)) { - e.vpblendd(i.dest, e.xmm0, blend_control); // $0 = $1 $2 - } else { - e.vpblendw(i.dest, e.xmm0, blend_control); // $0 = $1 $2 - } - } else { - // Permute by non-constant. - assert_always(); - } - } -}; -struct PERMUTE_V128 - : Sequence> { - static void EmitByInt8(X64Emitter& e, const EmitArgType& i) { - // TODO(benvanik): find out how to do this with only one temp register! - // Permute bytes between src2 and src3. - // src1 is an array of indices corresponding to positions within src2 and - // src3. - if (i.src3.value->IsConstantZero()) { - // Permuting with src2/zero, so just shuffle/mask. - if (i.src2.value->IsConstantZero()) { - // src2 & src3 are zero, so result will always be zero. - e.vpxor(i.dest, i.dest); - } else { - // Control mask needs to be shuffled. - if (i.src1.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src1.constant()); - e.vxorps(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMSwapWordMask)); - } else { - e.vxorps(e.xmm0, i.src1, e.GetXmmConstPtr(XMMSwapWordMask)); - } - e.vpand(e.xmm0, e.GetXmmConstPtr(XMMPermuteByteMask)); - if (i.src2.is_constant) { - e.LoadConstantXmm(i.dest, i.src2.constant()); - e.vpshufb(i.dest, i.dest, e.xmm0); - } else { - e.vpshufb(i.dest, i.src2, e.xmm0); - } - // Build a mask with values in src2 having 0 and values in src3 having - // 1. - e.vpcmpgtb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMPermuteControl15)); - e.vpandn(i.dest, e.xmm0, i.dest); - } - } else { - // General permute. 
- // Control mask needs to be shuffled. - // TODO(benvanik): do constants here instead of in generated code. - if (i.src1.is_constant) { - e.LoadConstantXmm(e.xmm2, i.src1.constant()); - e.vxorps(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMSwapWordMask)); - } else { - e.vxorps(e.xmm2, i.src1, e.GetXmmConstPtr(XMMSwapWordMask)); - } - e.vpand(e.xmm2, e.GetXmmConstPtr(XMMPermuteByteMask)); - Xmm src2_shuf = e.xmm0; - if (i.src2.value->IsConstantZero()) { - e.vpxor(src2_shuf, src2_shuf); - } else if (i.src2.is_constant) { - e.LoadConstantXmm(src2_shuf, i.src2.constant()); - e.vpshufb(src2_shuf, src2_shuf, e.xmm2); - } else { - e.vpshufb(src2_shuf, i.src2, e.xmm2); - } - Xmm src3_shuf = e.xmm1; - if (i.src3.value->IsConstantZero()) { - e.vpxor(src3_shuf, src3_shuf); - } else if (i.src3.is_constant) { - e.LoadConstantXmm(src3_shuf, i.src3.constant()); - e.vpshufb(src3_shuf, src3_shuf, e.xmm2); - } else { - e.vpshufb(src3_shuf, i.src3, e.xmm2); - } - // Build a mask with values in src2 having 0 and values in src3 having 1. - e.vpcmpgtb(i.dest, e.xmm2, e.GetXmmConstPtr(XMMPermuteControl15)); - e.vpblendvb(i.dest, src2_shuf, src3_shuf, i.dest); - } - } - - static void EmitByInt16(X64Emitter& e, const EmitArgType& i) { - // src1 is an array of indices corresponding to positions within src2 and - // src3. - assert_true(i.src1.is_constant); - vec128_t perm = (i.src1.constant() & vec128s(0xF)) ^ vec128s(0x1); - vec128_t perm_ctrl = vec128b(0); - for (int i = 0; i < 8; i++) { - perm_ctrl.i16[i] = perm.i16[i] > 7 ? 
-1 : 0; - - auto v = uint8_t(perm.u16[i]); - perm.u8[i * 2] = v * 2; - perm.u8[i * 2 + 1] = v * 2 + 1; - } - e.LoadConstantXmm(e.xmm0, perm); - - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm1, i.src2.constant()); - } else { - e.vmovdqa(e.xmm1, i.src2); - } - if (i.src3.is_constant) { - e.LoadConstantXmm(e.xmm2, i.src3.constant()); - } else { - e.vmovdqa(e.xmm2, i.src3); - } - - e.vpshufb(e.xmm1, e.xmm1, e.xmm0); - e.vpshufb(e.xmm2, e.xmm2, e.xmm0); - - uint8_t mask = 0; - for (int i = 0; i < 8; i++) { - if (perm_ctrl.i16[i] == 0) { - mask |= 1 << (7 - i); - } - } - e.vpblendw(i.dest, e.xmm1, e.xmm2, mask); - } - - static void EmitByInt32(X64Emitter& e, const EmitArgType& i) { - assert_always(); - } - - static void Emit(X64Emitter& e, const EmitArgType& i) { - switch (i.instr->flags) { - case INT8_TYPE: - EmitByInt8(e, i); - break; - case INT16_TYPE: - EmitByInt16(e, i); - break; - case INT32_TYPE: - EmitByInt32(e, i); - break; - default: - assert_unhandled_case(i.instr->flags); - return; - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_PERMUTE, PERMUTE_I32, PERMUTE_V128); - -// ============================================================================ -// OPCODE_SWIZZLE -// ============================================================================ -struct SWIZZLE - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto element_type = i.instr->flags; - if (element_type == INT8_TYPE) { - assert_always(); - } else if (element_type == INT16_TYPE) { - assert_always(); - } else if (element_type == INT32_TYPE || element_type == FLOAT32_TYPE) { - uint8_t swizzle_mask = static_cast(i.src2.value); - Xmm src1; - if (i.src1.is_constant) { - src1 = e.xmm0; - e.LoadConstantXmm(src1, i.src1.constant()); - } else { - src1 = i.src1; - } - e.vpshufd(i.dest, src1, swizzle_mask); - } else if (element_type == INT64_TYPE || element_type == FLOAT64_TYPE) { - assert_always(); - } else { - assert_always(); - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_SWIZZLE, 
SWIZZLE); - -// ============================================================================ -// OPCODE_PACK -// ============================================================================ -struct PACK : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - switch (i.instr->flags & PACK_TYPE_MODE) { - case PACK_TYPE_D3DCOLOR: - EmitD3DCOLOR(e, i); - break; - case PACK_TYPE_FLOAT16_2: - EmitFLOAT16_2(e, i); - break; - case PACK_TYPE_FLOAT16_4: - EmitFLOAT16_4(e, i); - break; - case PACK_TYPE_SHORT_2: - EmitSHORT_2(e, i); - break; - case PACK_TYPE_SHORT_4: - EmitSHORT_4(e, i); - break; - case PACK_TYPE_UINT_2101010: - EmitUINT_2101010(e, i); - break; - case PACK_TYPE_8_IN_16: - Emit8_IN_16(e, i, i.instr->flags); - break; - case PACK_TYPE_16_IN_32: - Emit16_IN_32(e, i, i.instr->flags); - break; - default: - assert_unhandled_case(i.instr->flags); - break; - } - } - static void EmitD3DCOLOR(X64Emitter& e, const EmitArgType& i) { - assert_true(i.src2.value->IsConstantZero()); - Xmm src; - if (i.src1.is_constant) { - src = i.dest; - e.LoadConstantXmm(src, i.src1.constant()); - } else { - src = i.src1; - } - // Saturate to [3,3....] so that only values between 3...[00] and 3...[FF] - // are valid. - e.vminps(i.dest, src, e.GetXmmConstPtr(XMMPackD3DCOLORSat)); - e.vmaxps(i.dest, i.dest, e.GetXmmConstPtr(XMM3333)); - // Extract bytes. 
- // RGBA (XYZW) -> ARGB (WXYZ) - // w = ((src1.uw & 0xFF) << 24) | ((src1.ux & 0xFF) << 16) | - // ((src1.uy & 0xFF) << 8) | (src1.uz & 0xFF) - e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackD3DCOLOR)); - } - static __m128i EmulateFLOAT16_2(void*, __m128 src1) { - alignas(16) float a[4]; - alignas(16) uint16_t b[8]; - _mm_store_ps(a, src1); - std::memset(b, 0, sizeof(b)); - - for (int i = 0; i < 2; i++) { - b[7 - i] = half_float::detail::float2half(a[i]); - } - - return _mm_load_si128(reinterpret_cast<__m128i*>(b)); - } - static void EmitFLOAT16_2(X64Emitter& e, const EmitArgType& i) { - assert_true(i.src2.value->IsConstantZero()); - // https://blogs.msdn.com/b/chuckw/archive/2012/09/11/directxmath-f16c-and-fma.aspx - // dest = [(src1.x | src1.y), 0, 0, 0] - - Xmm src; - if (e.IsFeatureEnabled(kX64EmitF16C)) { - if (i.src1.is_constant) { - src = i.dest; - e.LoadConstantXmm(src, i.src1.constant()); - } else { - src = i.src1; - } - // 0|0|0|0|W|Z|Y|X - e.vcvtps2ph(i.dest, src, 0b00000011); - // Shuffle to X|Y|0|0|0|0|0|0 - e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackFLOAT16_2)); - } else { - if (i.src1.is_constant) { - src = e.xmm0; - e.LoadConstantXmm(src, i.src1.constant()); - } else { - src = i.src1; - } - e.lea(e.r8, e.StashXmm(0, src)); - e.CallNativeSafe(reinterpret_cast(EmulateFLOAT16_2)); - e.vmovaps(i.dest, e.xmm0); - } - } - static __m128i EmulateFLOAT16_4(void*, __m128 src1) { - alignas(16) float a[4]; - alignas(16) uint16_t b[8]; - _mm_store_ps(a, src1); - std::memset(b, 0, sizeof(b)); - - for (int i = 0; i < 4; i++) { - b[7 - i] = half_float::detail::float2half(a[i]); - } - - return _mm_load_si128(reinterpret_cast<__m128i*>(b)); - } - static void EmitFLOAT16_4(X64Emitter& e, const EmitArgType& i) { - assert_true(i.src2.value->IsConstantZero()); - // dest = [(src1.x | src1.y), (src1.z | src1.w), 0, 0] - - Xmm src; - if (e.IsFeatureEnabled(kX64EmitF16C)) { - if (i.src1.is_constant) { - src = i.dest; - e.LoadConstantXmm(src, i.src1.constant()); - 
} else { - src = i.src1; - } - // 0|0|0|0|W|Z|Y|X - e.vcvtps2ph(i.dest, src, 0b00000011); - // Shuffle to X|Y|Z|W|0|0|0|0 - e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackFLOAT16_4)); - } else { - if (i.src1.is_constant) { - src = e.xmm0; - e.LoadConstantXmm(src, i.src1.constant()); - } else { - src = i.src1; - } - e.lea(e.r8, e.StashXmm(0, src)); - e.CallNativeSafe(reinterpret_cast(EmulateFLOAT16_4)); - e.vmovaps(i.dest, e.xmm0); - } - } - static void EmitSHORT_2(X64Emitter& e, const EmitArgType& i) { - assert_true(i.src2.value->IsConstantZero()); - Xmm src; - if (i.src1.is_constant) { - src = i.dest; - e.LoadConstantXmm(src, i.src1.constant()); - } else { - src = i.src1; - } - // Saturate. - e.vmaxps(i.dest, src, e.GetXmmConstPtr(XMMPackSHORT_Min)); - e.vminps(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_Max)); - // Pack. - e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_2)); - } - static void EmitSHORT_4(X64Emitter& e, const EmitArgType& i) { - assert_true(i.src2.value->IsConstantZero()); - Xmm src; - if (i.src1.is_constant) { - src = i.dest; - e.LoadConstantXmm(src, i.src1.constant()); - } else { - src = i.src1; - } - // Saturate. - e.vmaxps(i.dest, src, e.GetXmmConstPtr(XMMPackSHORT_Min)); - e.vminps(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_Max)); - // Pack. - e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_4)); - } - static void EmitUINT_2101010(X64Emitter& e, const EmitArgType& i) { - // https://www.opengl.org/registry/specs/ARB/vertex_type_2_10_10_10_rev.txt - // XYZ are 10 bits, signed and saturated. - // W is 2 bits, unsigned and saturated. - Xmm src; - if (i.src1.is_constant) { - src = i.dest; - e.LoadConstantXmm(src, i.src1.constant()); - } else { - src = i.src1; - } - // Saturate. - e.vmaxps(i.dest, src, e.GetXmmConstPtr(XMMPackUINT_2101010_MinUnpacked)); - e.vminps(i.dest, i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MaxUnpacked)); - // Remove the unneeded bits of the floats. 
- e.vpand(i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MaskUnpacked)); - if (e.IsFeatureEnabled(kX64EmitAVX2)) { - // Shift the components up. - e.vpsllvd(i.dest, i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_Shift)); - } else { - // Duplicate all the components into bits 10-19. - e.vpslld(e.xmm0, i.dest, 10); - e.vpor(i.dest, e.xmm0); - // Duplicate all the components into bits 20-39 - // (so alpha will be in 30-31). - e.vpslld(e.xmm0, i.dest, 20); - e.vpor(i.dest, e.xmm0); - // Leave only the needed components. - e.vpand(i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MaskPacked)); - } - // Combine the components. - e.vshufps(e.xmm0, i.dest, i.dest, _MM_SHUFFLE(2, 3, 0, 1)); - e.vorps(i.dest, e.xmm0); - e.vshufps(e.xmm0, i.dest, i.dest, _MM_SHUFFLE(1, 0, 3, 2)); - e.vorps(i.dest, e.xmm0); - } - static __m128i EmulatePack8_IN_16_UN_UN_SAT(void*, __m128i src1, - __m128i src2) { - alignas(16) uint16_t a[8]; - alignas(16) uint16_t b[8]; - alignas(16) uint8_t c[16]; - _mm_store_si128(reinterpret_cast<__m128i*>(a), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(b), src2); - for (int i = 0; i < 8; ++i) { - c[i] = uint8_t(std::max(uint16_t(0), std::min(uint16_t(255), a[i]))); - c[i + 8] = uint8_t(std::max(uint16_t(0), std::min(uint16_t(255), b[i]))); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(c)); - } - static __m128i EmulatePack8_IN_16_UN_UN(void*, __m128i src1, __m128i src2) { - alignas(16) uint8_t a[16]; - alignas(16) uint8_t b[16]; - alignas(16) uint8_t c[16]; - _mm_store_si128(reinterpret_cast<__m128i*>(a), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(b), src2); - for (int i = 0; i < 8; ++i) { - c[i] = a[i * 2]; - c[i + 8] = b[i * 2]; - } - return _mm_load_si128(reinterpret_cast<__m128i*>(c)); - } - static void Emit8_IN_16(X64Emitter& e, const EmitArgType& i, uint32_t flags) { - // TODO(benvanik): handle src2 (or src1) being constant zero - if (IsPackInUnsigned(flags)) { - if (IsPackOutUnsigned(flags)) { - if (IsPackOutSaturate(flags)) { - // 
unsigned -> unsigned + saturate - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); - } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); - } - e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe( - reinterpret_cast(EmulatePack8_IN_16_UN_UN_SAT)); - e.vmovaps(i.dest, e.xmm0); - e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteOrderMask)); - } else { - // unsigned -> unsigned - e.lea(e.r9, e.StashXmm(1, i.src2)); - e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe(reinterpret_cast(EmulatePack8_IN_16_UN_UN)); - e.vmovaps(i.dest, e.xmm0); - e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteOrderMask)); - } - } else { - if (IsPackOutSaturate(flags)) { - // unsigned -> signed + saturate - assert_always(); - } else { - // unsigned -> signed - assert_always(); - } - } - } else { - if (IsPackOutUnsigned(flags)) { - if (IsPackOutSaturate(flags)) { - // signed -> unsigned + saturate - // PACKUSWB / SaturateSignedWordToUnsignedByte - Xbyak::Xmm src2 = i.src2.is_constant ? e.xmm0 : i.src2; - if (i.src2.is_constant) { - e.LoadConstantXmm(src2, i.src2.constant()); - } - - e.vpackuswb(i.dest, i.src1, src2); - e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteOrderMask)); - } else { - // signed -> unsigned - assert_always(); - } - } else { - if (IsPackOutSaturate(flags)) { - // signed -> signed + saturate - // PACKSSWB / SaturateSignedWordToSignedByte - e.vpacksswb(i.dest, i.src1, i.src2); - e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteOrderMask)); - } else { - // signed -> signed - assert_always(); - } - } - } - } - // Pack 2 32-bit vectors into a 16-bit vector. 
- static void Emit16_IN_32(X64Emitter& e, const EmitArgType& i, - uint32_t flags) { - // TODO(benvanik): handle src2 (or src1) being constant zero - if (IsPackInUnsigned(flags)) { - if (IsPackOutUnsigned(flags)) { - if (IsPackOutSaturate(flags)) { - // unsigned -> unsigned + saturate - // Construct a saturation max value - e.mov(e.eax, 0xFFFFu); - e.vmovd(e.xmm0, e.eax); - e.vpshufd(e.xmm0, e.xmm0, 0b00000000); - - if (!i.src1.is_constant) { - e.vpminud(e.xmm1, i.src1, e.xmm0); // Saturate src1 - e.vpshuflw(e.xmm1, e.xmm1, 0b00100010); - e.vpshufhw(e.xmm1, e.xmm1, 0b00100010); - e.vpshufd(e.xmm1, e.xmm1, 0b00001000); - } else { - // TODO(DrChat): Non-zero constants - assert_true(i.src1.constant().u64[0] == 0 && - i.src1.constant().u64[1] == 0); - e.vpxor(e.xmm1, e.xmm1); - } - - if (!i.src2.is_constant) { - e.vpminud(i.dest, i.src2, e.xmm0); // Saturate src2 - e.vpshuflw(i.dest, i.dest, 0b00100010); - e.vpshufhw(i.dest, i.dest, 0b00100010); - e.vpshufd(i.dest, i.dest, 0b10000000); - } else { - // TODO(DrChat): Non-zero constants - assert_true(i.src2.constant().u64[0] == 0 && - i.src2.constant().u64[1] == 0); - e.vpxor(i.dest, i.dest); - } - - e.vpblendw(i.dest, i.dest, e.xmm1, 0b00001111); - } else { - // unsigned -> unsigned - e.vmovaps(e.xmm0, i.src1); - e.vpshuflw(e.xmm0, e.xmm0, 0b00100010); - e.vpshufhw(e.xmm0, e.xmm0, 0b00100010); - e.vpshufd(e.xmm0, e.xmm0, 0b00001000); - - e.vmovaps(i.dest, i.src2); - e.vpshuflw(i.dest, i.dest, 0b00100010); - e.vpshufhw(i.dest, i.dest, 0b00100010); - e.vpshufd(i.dest, i.dest, 0b10000000); - - e.vpblendw(i.dest, i.dest, e.xmm0, 0b00001111); - } - } else { - if (IsPackOutSaturate(flags)) { - // unsigned -> signed + saturate - assert_always(); - } else { - // unsigned -> signed - assert_always(); - } - } - } else { - if (IsPackOutUnsigned(flags)) { - if (IsPackOutSaturate(flags)) { - // signed -> unsigned + saturate - // PACKUSDW - // TMP[15:0] <- (DEST[31:0] < 0) ? 0 : DEST[15:0]; - // DEST[15:0] <- (DEST[31:0] > FFFFH) ? 
FFFFH : TMP[15:0]; - e.vpackusdw(i.dest, i.src1, i.src2); - e.vpshuflw(i.dest, i.dest, 0b10110001); - e.vpshufhw(i.dest, i.dest, 0b10110001); - } else { - // signed -> unsigned - assert_always(); - } - } else { - if (IsPackOutSaturate(flags)) { - // signed -> signed + saturate - // PACKSSDW / SaturateSignedDwordToSignedWord - Xmm src2; - if (!i.src2.is_constant) { - src2 = i.src2; - } else { - assert_false(i.src1 == e.xmm0); - src2 = e.xmm0; - e.LoadConstantXmm(src2, i.src2.constant()); - } - e.vpackssdw(i.dest, i.src1, src2); - e.vpshuflw(i.dest, i.dest, 0b10110001); - e.vpshufhw(i.dest, i.dest, 0b10110001); - } else { - // signed -> signed - assert_always(); - } - } - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_PACK, PACK); - -// ============================================================================ -// OPCODE_UNPACK -// ============================================================================ -struct UNPACK : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - switch (i.instr->flags & PACK_TYPE_MODE) { - case PACK_TYPE_D3DCOLOR: - EmitD3DCOLOR(e, i); - break; - case PACK_TYPE_FLOAT16_2: - EmitFLOAT16_2(e, i); - break; - case PACK_TYPE_FLOAT16_4: - EmitFLOAT16_4(e, i); - break; - case PACK_TYPE_SHORT_2: - EmitSHORT_2(e, i); - break; - case PACK_TYPE_SHORT_4: - EmitSHORT_4(e, i); - break; - case PACK_TYPE_UINT_2101010: - EmitUINT_2101010(e, i); - break; - case PACK_TYPE_8_IN_16: - Emit8_IN_16(e, i, i.instr->flags); - break; - case PACK_TYPE_16_IN_32: - Emit16_IN_32(e, i, i.instr->flags); - break; - default: - assert_unhandled_case(i.instr->flags); - break; - } - } - static void EmitD3DCOLOR(X64Emitter& e, const EmitArgType& i) { - // ARGB (WXYZ) -> RGBA (XYZW) - Xmm src; - if (i.src1.is_constant) { - if (i.src1.value->IsConstantZero()) { - e.vmovaps(i.dest, e.GetXmmConstPtr(XMMOne)); - return; - } - src = i.dest; - e.LoadConstantXmm(src, i.src1.constant()); - } else { - src = i.src1; - } - // src = ZZYYXXWW - // Unpack to 
000000ZZ,000000YY,000000XX,000000WW - e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackD3DCOLOR)); - // Add 1.0f to each. - e.vpor(i.dest, e.GetXmmConstPtr(XMMOne)); - // To convert to 0 to 1, games multiply by 0x47008081 and add 0xC7008081. - } - static __m128 EmulateFLOAT16_2(void*, __m128i src1) { - alignas(16) uint16_t a[8]; - alignas(16) float b[4]; - _mm_store_si128(reinterpret_cast<__m128i*>(a), src1); - - for (int i = 0; i < 2; i++) { - b[i] = half_float::detail::half2float(a[VEC128_W(6 + i)]); - } - - // Constants, or something - b[2] = 0.f; - b[3] = 1.f; - - return _mm_load_ps(b); - } - static void EmitFLOAT16_2(X64Emitter& e, const EmitArgType& i) { - // 1 bit sign, 5 bit exponent, 10 bit mantissa - // D3D10 half float format - // TODO(benvanik): - // https://blogs.msdn.com/b/chuckw/archive/2012/09/11/directxmath-f16c-and-fma.aspx - // Use _mm_cvtph_ps -- requires very modern processors (SSE5+) - // Unpacking half floats: - // https://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/ - // Packing half floats: https://gist.github.com/rygorous/2156668 - // Load source, move from tight pack of X16Y16.... to X16...Y16... - // Also zero out the high end. - // TODO(benvanik): special case constant unpacks that just get 0/1/etc. 
- - Xmm src; - if (e.IsFeatureEnabled(kX64EmitF16C)) { - if (i.src1.is_constant) { - src = i.dest; - e.LoadConstantXmm(src, i.src1.constant()); - } else { - src = i.src1; - } - // sx = src.iw >> 16; - // sy = src.iw & 0xFFFF; - // dest = { XMConvertHalfToFloat(sx), - // XMConvertHalfToFloat(sy), - // 0.0, - // 1.0 }; - // Shuffle to 0|0|0|0|0|0|Y|X - e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackFLOAT16_2)); - e.vcvtph2ps(i.dest, i.dest); - e.vpshufd(i.dest, i.dest, 0b10100100); - e.vpor(i.dest, e.GetXmmConstPtr(XMM0001)); - } else { - if (i.src1.is_constant) { - src = e.xmm0; - e.LoadConstantXmm(src, i.src1.constant()); - } else { - src = i.src1; - } - e.lea(e.r8, e.StashXmm(0, src)); - e.CallNativeSafe(reinterpret_cast(EmulateFLOAT16_2)); - e.vmovaps(i.dest, e.xmm0); - } - } - static __m128 EmulateFLOAT16_4(void*, __m128i src1) { - alignas(16) uint16_t a[8]; - alignas(16) float b[4]; - _mm_store_si128(reinterpret_cast<__m128i*>(a), src1); - - for (int i = 0; i < 4; i++) { - b[i] = half_float::detail::half2float(a[VEC128_W(4 + i)]); - } - - return _mm_load_ps(b); - } - static void EmitFLOAT16_4(X64Emitter& e, const EmitArgType& i) { - // src = [(dest.x | dest.y), (dest.z | dest.w), 0, 0] - Xmm src; - if (e.IsFeatureEnabled(kX64EmitF16C)) { - if (i.src1.is_constant) { - src = i.dest; - e.LoadConstantXmm(src, i.src1.constant()); - } else { - src = i.src1; - } - // Shuffle to 0|0|0|0|W|Z|Y|X - e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackFLOAT16_4)); - e.vcvtph2ps(i.dest, i.dest); - } else { - if (i.src1.is_constant) { - src = e.xmm0; - e.LoadConstantXmm(src, i.src1.constant()); - } else { - src = i.src1; - } - e.lea(e.r8, e.StashXmm(0, src)); - e.CallNativeSafe(reinterpret_cast(EmulateFLOAT16_4)); - e.vmovaps(i.dest, e.xmm0); - } - } - static void EmitSHORT_2(X64Emitter& e, const EmitArgType& i) { - // (VD.x) = 3.0 + (VB.x>>16)*2^-22 - // (VD.y) = 3.0 + (VB.x)*2^-22 - // (VD.z) = 0.0 - // (VD.w) = 1.0 (games splat W after unpacking to get vectors of 1.0f) - 
// src is (xx,xx,xx,VALUE) - Xmm src; - if (i.src1.is_constant) { - if (i.src1.value->IsConstantZero()) { - e.vmovdqa(i.dest, e.GetXmmConstPtr(XMM3301)); - return; - } - // TODO(benvanik): check other common constants/perform shuffle/or here. - src = i.dest; - e.LoadConstantXmm(src, i.src1.constant()); - } else { - src = i.src1; - } - // Shuffle bytes. - e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackSHORT_2)); - // If negative, make smaller than 3 - sign extend before adding. - e.vpslld(i.dest, 16); - e.vpsrad(i.dest, 16); - // Add 3,3,0,1. - e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3301)); - // Return quiet NaNs in case of negative overflow. - e.vcmpeqps(e.xmm0, i.dest, e.GetXmmConstPtr(XMMUnpackSHORT_Overflow)); - e.vblendvps(i.dest, i.dest, e.GetXmmConstPtr(XMMUnpackOverflowNaN), e.xmm0); - } - static void EmitSHORT_4(X64Emitter& e, const EmitArgType& i) { - // (VD.x) = 3.0 + (VB.x>>16)*2^-22 - // (VD.y) = 3.0 + (VB.x)*2^-22 - // (VD.z) = 3.0 + (VB.y>>16)*2^-22 - // (VD.w) = 3.0 + (VB.y)*2^-22 - // src is (xx,xx,VALUE,VALUE) - Xmm src; - if (i.src1.is_constant) { - if (i.src1.value->IsConstantZero()) { - e.vmovdqa(i.dest, e.GetXmmConstPtr(XMM3333)); - return; - } - // TODO(benvanik): check other common constants/perform shuffle/or here. - src = i.dest; - e.LoadConstantXmm(src, i.src1.constant()); - } else { - src = i.src1; - } - // Shuffle bytes. - e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackSHORT_4)); - // If negative, make smaller than 3 - sign extend before adding. - e.vpslld(i.dest, 16); - e.vpsrad(i.dest, 16); - // Add 3,3,3,3. - e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3333)); - // Return quiet NaNs in case of negative overflow. 
- e.vcmpeqps(e.xmm0, i.dest, e.GetXmmConstPtr(XMMUnpackSHORT_Overflow)); - e.vblendvps(i.dest, i.dest, e.GetXmmConstPtr(XMMUnpackOverflowNaN), e.xmm0); - } - static void EmitUINT_2101010(X64Emitter& e, const EmitArgType& i) { - Xmm src; - if (i.src1.is_constant) { - if (i.src1.value->IsConstantZero()) { - e.vmovdqa(i.dest, e.GetXmmConstPtr(XMM3331)); - return; - } - src = i.dest; - e.LoadConstantXmm(src, i.src1.constant()); - } else { - src = i.src1; - } - // Splat W. - e.vshufps(i.dest, src, src, _MM_SHUFFLE(3, 3, 3, 3)); - // Keep only the needed components. - // Red in 0-9 now, green in 10-19, blue in 20-29, alpha in 30-31. - e.vpand(i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MaskPacked)); - if (e.IsFeatureEnabled(kX64EmitAVX2)) { - // Shift the components down. - e.vpsrlvd(i.dest, i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_Shift)); - } else { - // Duplicate green in 0-9 and alpha in 20-21. - e.vpsrld(e.xmm0, i.dest, 10); - e.vpor(i.dest, e.xmm0); - // Duplicate blue in 0-9 and alpha in 0-1. - e.vpsrld(e.xmm0, i.dest, 20); - e.vpor(i.dest, e.xmm0); - // Remove higher duplicate components. - e.vpand(i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MaskUnpacked)); - } - // If XYZ are negative, make smaller than 3 - sign extend XYZ before adding. - // W is unsigned. - e.vpslld(i.dest, 22); - e.vpsrad(i.dest, 22); - // Add 3,3,3,1. - e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3331)); - // Return quiet NaNs in case of negative overflow. - e.vcmpeqps(e.xmm0, i.dest, - e.GetXmmConstPtr(XMMUnpackUINT_2101010_Overflow)); - e.vblendvps(i.dest, i.dest, e.GetXmmConstPtr(XMMUnpackOverflowNaN), e.xmm0); - // To convert XYZ to -1 to 1, games multiply by 0x46004020 & sub 0x46C06030. - // For W to 0 to 1, they multiply by and subtract 0x4A2AAAAB. 
- } - static void Emit8_IN_16(X64Emitter& e, const EmitArgType& i, uint32_t flags) { - assert_false(IsPackOutSaturate(flags)); - Xmm src; - if (i.src1.is_constant) { - src = i.dest; - e.LoadConstantXmm(src, i.src1.constant()); - } else { - src = i.src1; - } - if (IsPackToLo(flags)) { - // Unpack to LO. - if (IsPackInUnsigned(flags)) { - if (IsPackOutUnsigned(flags)) { - // unsigned -> unsigned - assert_always(); - } else { - // unsigned -> signed - assert_always(); - } - } else { - if (IsPackOutUnsigned(flags)) { - // signed -> unsigned - assert_always(); - } else { - // signed -> signed - e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMByteOrderMask)); - e.vpunpckhbw(i.dest, i.dest, i.dest); - e.vpsraw(i.dest, 8); - } - } - } else { - // Unpack to HI. - if (IsPackInUnsigned(flags)) { - if (IsPackOutUnsigned(flags)) { - // unsigned -> unsigned - assert_always(); - } else { - // unsigned -> signed - assert_always(); - } - } else { - if (IsPackOutUnsigned(flags)) { - // signed -> unsigned - assert_always(); - } else { - // signed -> signed - e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMByteOrderMask)); - e.vpunpcklbw(i.dest, i.dest, i.dest); - e.vpsraw(i.dest, 8); - } - } - } - } - static void Emit16_IN_32(X64Emitter& e, const EmitArgType& i, - uint32_t flags) { - assert_false(IsPackOutSaturate(flags)); - Xmm src; - if (i.src1.is_constant) { - src = i.dest; - e.LoadConstantXmm(src, i.src1.constant()); - } else { - src = i.src1; - } - if (IsPackToLo(flags)) { - // Unpack to LO. - if (IsPackInUnsigned(flags)) { - if (IsPackOutUnsigned(flags)) { - // unsigned -> unsigned - assert_always(); - } else { - // unsigned -> signed - assert_always(); - } - } else { - if (IsPackOutUnsigned(flags)) { - // signed -> unsigned - assert_always(); - } else { - // signed -> signed - e.vpunpckhwd(i.dest, src, src); - e.vpsrad(i.dest, 16); - } - } - } else { - // Unpack to HI. 
- if (IsPackInUnsigned(flags)) { - if (IsPackOutUnsigned(flags)) { - // unsigned -> unsigned - assert_always(); - } else { - // unsigned -> signed - assert_always(); - } - } else { - if (IsPackOutUnsigned(flags)) { - // signed -> unsigned - assert_always(); - } else { - // signed -> signed - e.vpunpcklwd(i.dest, src, src); - e.vpsrad(i.dest, 16); - } - } - } - e.vpshufd(i.dest, i.dest, 0xB1); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_UNPACK, UNPACK); - -// ============================================================================ -// OPCODE_ATOMIC_EXCHANGE -// ============================================================================ -// Note that the address we use here is a real, host address! -// This is weird, and should be fixed. -template -void EmitAtomicExchangeXX(X64Emitter& e, const ARGS& i) { - if (i.dest == i.src1) { - e.mov(e.rax, i.src1); - if (i.dest != i.src2) { - if (i.src2.is_constant) { - e.mov(i.dest, i.src2.constant()); - } else { - e.mov(i.dest, i.src2); - } - } - e.lock(); - e.xchg(e.dword[e.rax], i.dest); - } else { - if (i.dest != i.src2) { - if (i.src2.is_constant) { - e.mov(i.dest, i.src2.constant()); - } else { - e.mov(i.dest, i.src2); - } - } - e.lock(); - e.xchg(e.dword[i.src1.reg()], i.dest); - } -} -struct ATOMIC_EXCHANGE_I8 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitAtomicExchangeXX(e, i); - } -}; -struct ATOMIC_EXCHANGE_I16 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitAtomicExchangeXX(e, i); - } -}; -struct ATOMIC_EXCHANGE_I32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitAtomicExchangeXX(e, i); - } -}; -struct ATOMIC_EXCHANGE_I64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitAtomicExchangeXX(e, i); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_ATOMIC_EXCHANGE, ATOMIC_EXCHANGE_I8, - ATOMIC_EXCHANGE_I16, ATOMIC_EXCHANGE_I32, - ATOMIC_EXCHANGE_I64); - -// 
============================================================================ -// OPCODE_ATOMIC_COMPARE_EXCHANGE -// ============================================================================ -struct ATOMIC_COMPARE_EXCHANGE_I32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.mov(e.eax, i.src2); - e.mov(e.ecx, i.src1.reg().cvt32()); - e.lock(); - e.cmpxchg(e.dword[e.GetMembaseReg() + e.rcx], i.src3); - e.sete(i.dest); - } -}; -struct ATOMIC_COMPARE_EXCHANGE_I64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.mov(e.rax, i.src2); - e.mov(e.ecx, i.src1.reg().cvt32()); - e.lock(); - e.cmpxchg(e.qword[e.GetMembaseReg() + e.rcx], i.src3); - e.sete(i.dest); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_ATOMIC_COMPARE_EXCHANGE, - ATOMIC_COMPARE_EXCHANGE_I32, ATOMIC_COMPARE_EXCHANGE_I64); - // ============================================================================ // OPCODE_SET_ROUNDING_MODE // ============================================================================ @@ -7813,128 +3057,15 @@ struct SET_ROUNDING_MODE_I32 }; EMITTER_OPCODE_TABLE(OPCODE_SET_ROUNDING_MODE, SET_ROUNDING_MODE_I32); -void RegisterSequences() { - Register_OPCODE_COMMENT(); - Register_OPCODE_NOP(); - Register_OPCODE_SOURCE_OFFSET(); - Register_OPCODE_DEBUG_BREAK(); - Register_OPCODE_DEBUG_BREAK_TRUE(); - Register_OPCODE_TRAP(); - Register_OPCODE_TRAP_TRUE(); - Register_OPCODE_CALL(); - Register_OPCODE_CALL_TRUE(); - Register_OPCODE_CALL_INDIRECT(); - Register_OPCODE_CALL_INDIRECT_TRUE(); - Register_OPCODE_CALL_EXTERN(); - Register_OPCODE_RETURN(); - Register_OPCODE_RETURN_TRUE(); - Register_OPCODE_SET_RETURN_ADDRESS(); - Register_OPCODE_BRANCH(); - Register_OPCODE_BRANCH_TRUE(); - Register_OPCODE_BRANCH_FALSE(); - Register_OPCODE_ASSIGN(); - Register_OPCODE_CAST(); - Register_OPCODE_ZERO_EXTEND(); - Register_OPCODE_SIGN_EXTEND(); - Register_OPCODE_TRUNCATE(); - Register_OPCODE_CONVERT(); - Register_OPCODE_ROUND(); - 
Register_OPCODE_VECTOR_CONVERT_I2F(); - Register_OPCODE_VECTOR_CONVERT_F2I(); - Register_OPCODE_LOAD_VECTOR_SHL(); - Register_OPCODE_LOAD_VECTOR_SHR(); - Register_OPCODE_LOAD_CLOCK(); - Register_OPCODE_LOAD_LOCAL(); - Register_OPCODE_STORE_LOCAL(); - Register_OPCODE_LOAD_CONTEXT(); - Register_OPCODE_STORE_CONTEXT(); - Register_OPCODE_CONTEXT_BARRIER(); - Register_OPCODE_LOAD_MMIO(); - Register_OPCODE_STORE_MMIO(); - Register_OPCODE_LOAD_OFFSET(); - Register_OPCODE_STORE_OFFSET(); - Register_OPCODE_LOAD(); - Register_OPCODE_STORE(); - Register_OPCODE_MEMSET(); - Register_OPCODE_PREFETCH(); - Register_OPCODE_MEMORY_BARRIER(); - Register_OPCODE_MAX(); - Register_OPCODE_VECTOR_MAX(); - Register_OPCODE_MIN(); - Register_OPCODE_VECTOR_MIN(); - Register_OPCODE_SELECT(); - Register_OPCODE_IS_TRUE(); - Register_OPCODE_IS_FALSE(); - Register_OPCODE_IS_NAN(); - Register_OPCODE_COMPARE_EQ(); - Register_OPCODE_COMPARE_NE(); - Register_OPCODE_COMPARE_SLT(); - Register_OPCODE_COMPARE_SLE(); - Register_OPCODE_COMPARE_SGT(); - Register_OPCODE_COMPARE_SGE(); - Register_OPCODE_COMPARE_ULT(); - Register_OPCODE_COMPARE_ULE(); - Register_OPCODE_COMPARE_UGT(); - Register_OPCODE_COMPARE_UGE(); - Register_OPCODE_COMPARE_SLT_FLT(); - Register_OPCODE_COMPARE_SLE_FLT(); - Register_OPCODE_COMPARE_SGT_FLT(); - Register_OPCODE_COMPARE_SGE_FLT(); - Register_OPCODE_COMPARE_ULT_FLT(); - Register_OPCODE_COMPARE_ULE_FLT(); - Register_OPCODE_COMPARE_UGT_FLT(); - Register_OPCODE_COMPARE_UGE_FLT(); - Register_OPCODE_DID_SATURATE(); - Register_OPCODE_VECTOR_COMPARE_EQ(); - Register_OPCODE_VECTOR_COMPARE_SGT(); - Register_OPCODE_VECTOR_COMPARE_SGE(); - Register_OPCODE_VECTOR_COMPARE_UGT(); - Register_OPCODE_VECTOR_COMPARE_UGE(); - Register_OPCODE_ADD(); - Register_OPCODE_ADD_CARRY(); - Register_OPCODE_VECTOR_ADD(); - Register_OPCODE_SUB(); - Register_OPCODE_VECTOR_SUB(); - Register_OPCODE_MUL(); - Register_OPCODE_MUL_HI(); - Register_OPCODE_DIV(); - Register_OPCODE_MUL_ADD(); - Register_OPCODE_MUL_SUB(); 
- Register_OPCODE_NEG(); - Register_OPCODE_ABS(); - Register_OPCODE_SQRT(); - Register_OPCODE_RSQRT(); - Register_OPCODE_RECIP(); - Register_OPCODE_POW2(); - Register_OPCODE_LOG2(); - Register_OPCODE_DOT_PRODUCT_3(); - Register_OPCODE_DOT_PRODUCT_4(); - Register_OPCODE_AND(); - Register_OPCODE_OR(); - Register_OPCODE_XOR(); - Register_OPCODE_NOT(); - Register_OPCODE_SHL(); - Register_OPCODE_SHR(); - Register_OPCODE_SHA(); - Register_OPCODE_VECTOR_SHL(); - Register_OPCODE_VECTOR_SHR(); - Register_OPCODE_VECTOR_SHA(); - Register_OPCODE_ROTATE_LEFT(); - Register_OPCODE_VECTOR_ROTATE_LEFT(); - Register_OPCODE_VECTOR_AVERAGE(); - Register_OPCODE_BYTE_SWAP(); - Register_OPCODE_CNTLZ(); - Register_OPCODE_INSERT(); - Register_OPCODE_EXTRACT(); - Register_OPCODE_SPLAT(); - Register_OPCODE_PERMUTE(); - Register_OPCODE_SWIZZLE(); - Register_OPCODE_PACK(); - Register_OPCODE_UNPACK(); - Register_OPCODE_ATOMIC_EXCHANGE(); - Register_OPCODE_ATOMIC_COMPARE_EXCHANGE(); - Register_OPCODE_SET_ROUNDING_MODE(); -} +// Include anchors to other sequence sources so they get included in the build. 
+extern volatile int anchor_control; +static int anchor_control_dest = anchor_control; + +extern volatile int anchor_memory; +static int anchor_memory_dest = anchor_memory; + +extern volatile int anchor_vector; +static int anchor_vector_dest = anchor_vector; bool SelectSequence(X64Emitter* e, const Instr* i, const Instr** new_tail) { const InstrKey key(i); diff --git a/src/xenia/cpu/backend/x64/x64_sequences.h b/src/xenia/cpu/backend/x64/x64_sequences.h index a0103fca5..07b264ab2 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.h +++ b/src/xenia/cpu/backend/x64/x64_sequences.h @@ -12,6 +12,8 @@ #include "xenia/cpu/hir/instr.h" +#include + namespace xe { namespace cpu { namespace backend { @@ -19,7 +21,25 @@ namespace x64 { class X64Emitter; -void RegisterSequences(); +typedef bool (*SequenceSelectFn)(X64Emitter&, const hir::Instr*); +extern std::unordered_map sequence_table; + +template +bool Register() { + sequence_table.insert({T::head_key(), T::Select}); + return true; +} + +template +static bool Register() { + bool b = true; + b = b && Register(); // Call the above function + b = b && Register(); // Call ourself again (recursively) + return b; +} +#define EMITTER_OPCODE_TABLE(name, ...) 
\ + const auto X64_INSTR_##name = Register<__VA_ARGS__>(); + bool SelectSequence(X64Emitter* e, const hir::Instr* i, const hir::Instr** new_tail); diff --git a/src/xenia/cpu/compiler/compiler_passes.h b/src/xenia/cpu/compiler/compiler_passes.h index 6b81d1fb5..fc58ec710 100644 --- a/src/xenia/cpu/compiler/compiler_passes.h +++ b/src/xenia/cpu/compiler/compiler_passes.h @@ -10,6 +10,8 @@ #ifndef XENIA_CPU_COMPILER_COMPILER_PASSES_H_ #define XENIA_CPU_COMPILER_COMPILER_PASSES_H_ +#include "xenia/cpu/compiler/passes/conditional_group_pass.h" +#include "xenia/cpu/compiler/passes/conditional_group_subpass.h" #include "xenia/cpu/compiler/passes/constant_propagation_pass.h" #include "xenia/cpu/compiler/passes/context_promotion_pass.h" #include "xenia/cpu/compiler/passes/control_flow_analysis_pass.h" diff --git a/src/xenia/cpu/compiler/passes/conditional_group_pass.cc b/src/xenia/cpu/compiler/passes/conditional_group_pass.cc new file mode 100644 index 000000000..ef84991e8 --- /dev/null +++ b/src/xenia/cpu/compiler/passes/conditional_group_pass.cc @@ -0,0 +1,85 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2013 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include "xenia/cpu/compiler/passes/conditional_group_pass.h" + +#include + +#include "xenia/base/profiling.h" +#include "xenia/cpu/compiler/compiler.h" +#include "xenia/cpu/ppc/ppc_context.h" +#include "xenia/cpu/processor.h" + +namespace xe { +namespace cpu { +namespace compiler { +namespace passes { + +// TODO(benvanik): remove when enums redefined. 
+using namespace xe::cpu::hir; + +using xe::cpu::hir::Block; +using xe::cpu::hir::HIRBuilder; +using xe::cpu::hir::Instr; +using xe::cpu::hir::Value; + +ConditionalGroupPass::ConditionalGroupPass() : CompilerPass() {} + +ConditionalGroupPass::~ConditionalGroupPass() {} + +bool ConditionalGroupPass::Initialize(Compiler* compiler) { + if (!CompilerPass::Initialize(compiler)) { + return false; + } + + for (size_t i = 0; i < passes_.size(); ++i) { + auto& pass = passes_[i]; + if (!pass->Initialize(compiler)) { + return false; + } + } + + return true; +} + +bool ConditionalGroupPass::Run(HIRBuilder* builder) { + bool dirty; + int loops = 0; + do { + assert_true(loops < 20); // arbitrary number + dirty = false; + for (size_t i = 0; i < passes_.size(); ++i) { + scratch_arena()->Reset(); + auto& pass = passes_[i]; + auto subpass = dynamic_cast(pass.get()); + if (!subpass) { + if (!pass->Run(builder)) { + return false; + } + } else { + bool result = false; + if (!subpass->Run(builder, result)) { + return false; + } + dirty |= result; + } + } + loops++; + } while (dirty); + return true; +} + +void ConditionalGroupPass::AddPass(std::unique_ptr pass) { + passes_.push_back(std::move(pass)); +} + +} // namespace passes +} // namespace compiler +} // namespace cpu +} // namespace xe diff --git a/src/xenia/cpu/compiler/passes/conditional_group_pass.h b/src/xenia/cpu/compiler/passes/conditional_group_pass.h new file mode 100644 index 000000000..7421fe1b5 --- /dev/null +++ b/src/xenia/cpu/compiler/passes/conditional_group_pass.h @@ -0,0 +1,45 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2013 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#ifndef XENIA_CPU_COMPILER_PASSES_CONDITIONAL_GROUP_PASS_H_ +#define XENIA_CPU_COMPILER_PASSES_CONDITIONAL_GROUP_PASS_H_ + +#include +#include + +#include "xenia/base/platform.h" +#include "xenia/cpu/compiler/compiler_pass.h" +#include "xenia/cpu/compiler/passes/conditional_group_subpass.h" + +namespace xe { +namespace cpu { +namespace compiler { +namespace passes { + +class ConditionalGroupPass : public CompilerPass { + public: + ConditionalGroupPass(); + virtual ~ConditionalGroupPass() override; + + bool Initialize(Compiler* compiler) override; + + bool Run(hir::HIRBuilder* builder) override; + + void AddPass(std::unique_ptr pass); + + private: + std::vector> passes_; +}; + +} // namespace passes +} // namespace compiler +} // namespace cpu +} // namespace xe + +#endif // XENIA_CPU_COMPILER_PASSES_CONDITIONAL_GROUP_PASS_H_ diff --git a/src/xenia/cpu/compiler/passes/conditional_group_subpass.cc b/src/xenia/cpu/compiler/passes/conditional_group_subpass.cc new file mode 100644 index 000000000..39780e2f5 --- /dev/null +++ b/src/xenia/cpu/compiler/passes/conditional_group_subpass.cc @@ -0,0 +1,26 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2013 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#include "xenia/cpu/compiler/passes/conditional_group_subpass.h" + +#include "xenia/cpu/compiler/compiler.h" + +namespace xe { +namespace cpu { +namespace compiler { +namespace passes { + +ConditionalGroupSubpass::ConditionalGroupSubpass() : CompilerPass() {} + +ConditionalGroupSubpass::~ConditionalGroupSubpass() = default; + +} // namespace passes +} // namespace compiler +} // namespace cpu +} // namespace xe diff --git a/src/xenia/cpu/compiler/passes/conditional_group_subpass.h b/src/xenia/cpu/compiler/passes/conditional_group_subpass.h new file mode 100644 index 000000000..f62c50ed3 --- /dev/null +++ b/src/xenia/cpu/compiler/passes/conditional_group_subpass.h @@ -0,0 +1,47 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2013 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#ifndef XENIA_CPU_COMPILER_PASSES_CONDITIONAL_GROUP_SUBPASS_H_ +#define XENIA_CPU_COMPILER_PASSES_CONDITIONAL_GROUP_SUBPASS_H_ + +#include "xenia/base/arena.h" +#include "xenia/cpu/compiler/compiler_pass.h" +#include "xenia/cpu/hir/hir_builder.h" + +namespace xe { +namespace cpu { +class Processor; +} // namespace cpu +} // namespace xe + +namespace xe { +namespace cpu { +namespace compiler { +class Compiler; +namespace passes { + +class ConditionalGroupSubpass : public CompilerPass { + public: + ConditionalGroupSubpass(); + virtual ~ConditionalGroupSubpass(); + + bool Run(hir::HIRBuilder* builder) override { + bool dummy; + return Run(builder, dummy); + } + + virtual bool Run(hir::HIRBuilder* builder, bool& result) = 0; +}; + +} // namespace passes +} // namespace compiler +} // namespace cpu +} // namespace xe + +#endif // XENIA_CPU_COMPILER_PASSES_CONDITIONAL_GROUP_SUBPASS_H_ diff --git a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc index 3db8e99d6..3a399cefd 100644 --- a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc +++ b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc @@ -31,11 +31,12 @@ using xe::cpu::hir::HIRBuilder; using xe::cpu::hir::TypeName; using xe::cpu::hir::Value; -ConstantPropagationPass::ConstantPropagationPass() : CompilerPass() {} +ConstantPropagationPass::ConstantPropagationPass() + : ConditionalGroupSubpass() {} ConstantPropagationPass::~ConstantPropagationPass() {} -bool ConstantPropagationPass::Run(HIRBuilder* builder) { +bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) { // Once ContextPromotion has run there will likely be a whole slew of // constants that can be pushed through the function. 
// Example: @@ -63,6 +64,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { // v1 = 19 // v2 = 0 + result = false; auto block = builder->first_block(); while (block) { auto i = block->instr_head; @@ -76,6 +78,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { } else { i->Remove(); } + result = true; } break; @@ -86,6 +89,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { } else { i->Remove(); } + result = true; } break; @@ -98,6 +102,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { } else { i->Remove(); } + result = true; } break; case OPCODE_CALL_INDIRECT: @@ -109,6 +114,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { } i->Replace(&OPCODE_CALL_info, i->flags); i->src1.symbol = function; + result = true; } break; case OPCODE_CALL_INDIRECT_TRUE: @@ -120,6 +126,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { } else { i->Remove(); } + result = true; } break; @@ -132,6 +139,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { } else { i->Remove(); } + result = true; } break; case OPCODE_BRANCH_FALSE: @@ -143,6 +151,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { } else { i->Remove(); } + result = true; } break; @@ -152,6 +161,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->Cast(target_type); i->Remove(); + result = true; } break; case OPCODE_CONVERT: @@ -160,6 +170,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->Convert(target_type, RoundMode(i->flags)); i->Remove(); + result = true; } break; case OPCODE_ROUND: @@ -167,6 +178,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->Round(RoundMode(i->flags)); i->Remove(); + result = true; } break; case OPCODE_ZERO_EXTEND: @@ -175,6 +187,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->ZeroExtend(target_type); i->Remove(); + result = true; } break; case 
OPCODE_SIGN_EXTEND: @@ -183,6 +196,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->SignExtend(target_type); i->Remove(); + result = true; } break; case OPCODE_TRUNCATE: @@ -191,6 +205,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->Truncate(target_type); i->Remove(); + result = true; } break; @@ -210,6 +225,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { i->Replace(&OPCODE_LOAD_MMIO_info, 0); i->src1.offset = reinterpret_cast(mmio_range); i->src2.offset = address; + result = true; } else { auto heap = memory->LookupHeap(address); uint32_t protect; @@ -222,18 +238,22 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { case INT8_TYPE: v->set_constant(xe::load(host_addr)); i->Remove(); + result = true; break; case INT16_TYPE: v->set_constant(xe::load(host_addr)); i->Remove(); + result = true; break; case INT32_TYPE: v->set_constant(xe::load(host_addr)); i->Remove(); + result = true; break; case INT64_TYPE: v->set_constant(xe::load(host_addr)); i->Remove(); + result = true; break; case VEC128_TYPE: vec128_t val; @@ -241,6 +261,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { val.high = xe::load(host_addr + 8); v->set_constant(val); i->Remove(); + result = true; break; default: assert_unhandled_case(v->type); @@ -270,6 +291,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { i->src1.offset = reinterpret_cast(mmio_range); i->src2.offset = address; i->set_src3(value); + result = true; } } break; @@ -281,10 +303,12 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { auto src2 = i->src2.value; i->Replace(&OPCODE_ASSIGN_info, 0); i->set_src1(src2); + result = true; } else if (i->src1.value->IsConstantFalse()) { auto src3 = i->src3.value; i->Replace(&OPCODE_ASSIGN_info, 0); i->set_src1(src3); + result = true; } else if (i->src2.value->IsConstant() && i->src3.value->IsConstant()) { // TODO: Select @@ -305,6 +329,7 @@ bool 
ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_constant(uint8_t(0)); } i->Remove(); + result = true; } break; case OPCODE_IS_FALSE: @@ -315,6 +340,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_constant(uint8_t(0)); } i->Remove(); + result = true; } break; case OPCODE_IS_NAN: @@ -329,6 +355,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_constant(uint8_t(0)); } i->Remove(); + result = true; } break; @@ -338,6 +365,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { bool value = i->src1.value->IsConstantEQ(i->src2.value); i->dest->set_constant(uint8_t(value)); i->Remove(); + result = true; } break; case OPCODE_COMPARE_NE: @@ -345,6 +373,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { bool value = i->src1.value->IsConstantNE(i->src2.value); i->dest->set_constant(uint8_t(value)); i->Remove(); + result = true; } break; case OPCODE_COMPARE_SLT: @@ -352,6 +381,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { bool value = i->src1.value->IsConstantSLT(i->src2.value); i->dest->set_constant(uint8_t(value)); i->Remove(); + result = true; } break; case OPCODE_COMPARE_SLE: @@ -359,6 +389,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { bool value = i->src1.value->IsConstantSLE(i->src2.value); i->dest->set_constant(uint8_t(value)); i->Remove(); + result = true; } break; case OPCODE_COMPARE_SGT: @@ -366,6 +397,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { bool value = i->src1.value->IsConstantSGT(i->src2.value); i->dest->set_constant(uint8_t(value)); i->Remove(); + result = true; } break; case OPCODE_COMPARE_SGE: @@ -373,6 +405,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { bool value = i->src1.value->IsConstantSGE(i->src2.value); i->dest->set_constant(uint8_t(value)); i->Remove(); + result = true; } break; case OPCODE_COMPARE_ULT: @@ -380,6 +413,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { bool value = 
i->src1.value->IsConstantULT(i->src2.value); i->dest->set_constant(uint8_t(value)); i->Remove(); + result = true; } break; case OPCODE_COMPARE_ULE: @@ -387,6 +421,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { bool value = i->src1.value->IsConstantULE(i->src2.value); i->dest->set_constant(uint8_t(value)); i->Remove(); + result = true; } break; case OPCODE_COMPARE_UGT: @@ -394,6 +429,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { bool value = i->src1.value->IsConstantUGT(i->src2.value); i->dest->set_constant(uint8_t(value)); i->Remove(); + result = true; } break; case OPCODE_COMPARE_UGE: @@ -401,6 +437,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { bool value = i->src1.value->IsConstantUGE(i->src2.value); i->dest->set_constant(uint8_t(value)); i->Remove(); + result = true; } break; @@ -413,6 +450,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->Add(i->src2.value); i->Remove(); + result = true; } break; case OPCODE_ADD_CARRY: @@ -433,6 +471,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { i->set_src1(ca); } } + result = true; } break; case OPCODE_SUB: @@ -440,6 +479,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->Sub(i->src2.value); i->Remove(); + result = true; } break; case OPCODE_MUL: @@ -447,6 +487,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->Mul(i->src2.value); i->Remove(); + result = true; } else if (i->src1.value->IsConstant() || i->src2.value->IsConstant()) { // Reorder the sources to make things simpler. 
@@ -460,12 +501,14 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { if (s2->type != VEC128_TYPE && s2->IsConstantOne()) { i->Replace(&OPCODE_ASSIGN_info, 0); i->set_src1(s1); + result = true; } else if (s2->type == VEC128_TYPE) { auto& c = s2->constant; if (c.v128.f32[0] == 1.f && c.v128.f32[1] == 1.f && c.v128.f32[2] == 1.f && c.v128.f32[3] == 1.f) { i->Replace(&OPCODE_ASSIGN_info, 0); i->set_src1(s1); + result = true; } } } @@ -475,6 +518,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->MulHi(i->src2.value, (i->flags & ARITHMETIC_UNSIGNED) != 0); i->Remove(); + result = true; } break; case OPCODE_DIV: @@ -482,6 +526,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->Div(i->src2.value, (i->flags & ARITHMETIC_UNSIGNED) != 0); i->Remove(); + result = true; } else if (i->src2.value->IsConstant()) { // Division by one = no-op. Value* src1 = i->src1.value; @@ -489,12 +534,14 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { i->src2.value->IsConstantOne()) { i->Replace(&OPCODE_ASSIGN_info, 0); i->set_src1(src1); + result = true; } else if (i->src2.value->type == VEC128_TYPE) { auto& c = i->src2.value->constant; if (c.v128.f32[0] == 1.f && c.v128.f32[1] == 1.f && c.v128.f32[2] == 1.f && c.v128.f32[3] == 1.f) { i->Replace(&OPCODE_ASSIGN_info, 0); i->set_src1(src1); + result = true; } } } @@ -505,6 +552,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); Value::MulAdd(v, i->src1.value, i->src2.value, i->src3.value); i->Remove(); + result = true; } else { // Multiply part is constant. 
Value* mul = builder->AllocValue(); @@ -515,6 +563,8 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { i->Replace(&OPCODE_ADD_info, 0); i->set_src1(mul); i->set_src2(add); + + result = true; } } break; @@ -525,6 +575,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); Value::MulSub(v, i->src1.value, i->src2.value, i->src3.value); i->Remove(); + result = true; } else { // Multiply part is constant. Value* mul = builder->AllocValue(); @@ -535,6 +586,8 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { i->Replace(&OPCODE_SUB_info, 0); i->set_src1(mul); i->set_src2(add); + + result = true; } } break; @@ -543,6 +596,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->Max(i->src2.value); i->Remove(); + result = true; } break; case OPCODE_NEG: @@ -550,6 +604,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->Neg(); i->Remove(); + result = true; } break; case OPCODE_ABS: @@ -557,6 +612,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->Abs(); i->Remove(); + result = true; } break; case OPCODE_SQRT: @@ -564,6 +620,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->Sqrt(); i->Remove(); + result = true; } break; case OPCODE_RSQRT: @@ -571,6 +628,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->RSqrt(); i->Remove(); + result = true; } break; case OPCODE_RECIP: @@ -578,6 +636,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->Recip(); i->Remove(); + result = true; } break; case OPCODE_AND: @@ -585,6 +644,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->And(i->src2.value); i->Remove(); + result = true; } break; case OPCODE_OR: @@ -592,6 +652,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); 
v->Or(i->src2.value); i->Remove(); + result = true; } break; case OPCODE_XOR: @@ -599,11 +660,13 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->Xor(i->src2.value); i->Remove(); + result = true; } else if (!i->src1.value->IsConstant() && !i->src2.value->IsConstant() && i->src1.value == i->src2.value) { v->set_zero(v->type); i->Remove(); + result = true; } break; case OPCODE_NOT: @@ -611,6 +674,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->Not(); i->Remove(); + result = true; } break; case OPCODE_SHL: @@ -618,10 +682,12 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->Shl(i->src2.value); i->Remove(); + result = true; } else if (i->src2.value->IsConstantZero()) { auto src1 = i->src1.value; i->Replace(&OPCODE_ASSIGN_info, 0); i->set_src1(src1); + result = true; } break; case OPCODE_SHR: @@ -629,10 +695,12 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->Shr(i->src2.value); i->Remove(); + result = true; } else if (i->src2.value->IsConstantZero()) { auto src1 = i->src1.value; i->Replace(&OPCODE_ASSIGN_info, 0); i->set_src1(src1); + result = true; } break; case OPCODE_SHA: @@ -640,6 +708,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->Sha(i->src2.value); i->Remove(); + result = true; } break; // TODO(benvanik): ROTATE_LEFT @@ -648,6 +717,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->ByteSwap(); i->Remove(); + result = true; } break; case OPCODE_CNTLZ: @@ -655,6 +725,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_zero(v->type); v->CountLeadingZeros(i->src1.value); i->Remove(); + result = true; } break; // TODO(benvanik): INSERT/EXTRACT @@ -664,6 +735,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_zero(v->type); v->Extract(i->src1.value, i->src2.value); i->Remove(); + 
result = true; } break; case OPCODE_SPLAT: @@ -671,6 +743,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_zero(v->type); v->Splat(i->src1.value); i->Remove(); + result = true; } break; case OPCODE_VECTOR_COMPARE_EQ: @@ -678,6 +751,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->VectorCompareEQ(i->src2.value, hir::TypeName(i->flags)); i->Remove(); + result = true; } break; case OPCODE_VECTOR_COMPARE_SGT: @@ -685,6 +759,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->VectorCompareSGT(i->src2.value, hir::TypeName(i->flags)); i->Remove(); + result = true; } break; case OPCODE_VECTOR_COMPARE_SGE: @@ -692,6 +767,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->VectorCompareSGE(i->src2.value, hir::TypeName(i->flags)); i->Remove(); + result = true; } break; case OPCODE_VECTOR_COMPARE_UGT: @@ -699,6 +775,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->VectorCompareUGT(i->src2.value, hir::TypeName(i->flags)); i->Remove(); + result = true; } break; case OPCODE_VECTOR_COMPARE_UGE: @@ -706,6 +783,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->VectorCompareUGE(i->src2.value, hir::TypeName(i->flags)); i->Remove(); + result = true; } break; case OPCODE_VECTOR_CONVERT_F2I: @@ -714,6 +792,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->VectorConvertF2I(i->src1.value, !!(i->flags & ARITHMETIC_UNSIGNED)); i->Remove(); + result = true; } break; case OPCODE_VECTOR_CONVERT_I2F: @@ -722,6 +801,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->VectorConvertI2F(i->src1.value, !!(i->flags & ARITHMETIC_UNSIGNED)); i->Remove(); + result = true; } break; case OPCODE_VECTOR_SHL: @@ -729,6 +809,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->VectorShl(i->src2.value, 
hir::TypeName(i->flags)); i->Remove(); + result = true; } break; case OPCODE_VECTOR_SHR: @@ -736,6 +817,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->VectorShr(i->src2.value, hir::TypeName(i->flags)); i->Remove(); + result = true; } break; case OPCODE_VECTOR_ROTATE_LEFT: @@ -743,6 +825,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->VectorRol(i->src2.value, hir::TypeName(i->flags)); i->Remove(); + result = true; } break; case OPCODE_VECTOR_ADD: @@ -753,6 +836,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { !!(arith_flags & ARITHMETIC_UNSIGNED), !!(arith_flags & ARITHMETIC_SATURATE)); i->Remove(); + result = true; } break; case OPCODE_VECTOR_SUB: @@ -763,6 +847,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { !!(arith_flags & ARITHMETIC_UNSIGNED), !!(arith_flags & ARITHMETIC_SATURATE)); i->Remove(); + result = true; } break; @@ -771,6 +856,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->DotProduct3(i->src2.value); i->Remove(); + result = true; } break; @@ -779,6 +865,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->DotProduct4(i->src2.value); i->Remove(); + result = true; } break; @@ -790,6 +877,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { !!(arith_flags & ARITHMETIC_UNSIGNED), !!(arith_flags & ARITHMETIC_SATURATE)); i->Remove(); + result = true; } break; diff --git a/src/xenia/cpu/compiler/passes/constant_propagation_pass.h b/src/xenia/cpu/compiler/passes/constant_propagation_pass.h index 021bdc981..08bd25b4a 100644 --- a/src/xenia/cpu/compiler/passes/constant_propagation_pass.h +++ b/src/xenia/cpu/compiler/passes/constant_propagation_pass.h @@ -10,19 +10,19 @@ #ifndef XENIA_CPU_COMPILER_PASSES_CONSTANT_PROPAGATION_PASS_H_ #define XENIA_CPU_COMPILER_PASSES_CONSTANT_PROPAGATION_PASS_H_ -#include "xenia/cpu/compiler/compiler_pass.h" +#include 
"xenia/cpu/compiler/passes/conditional_group_subpass.h" namespace xe { namespace cpu { namespace compiler { namespace passes { -class ConstantPropagationPass : public CompilerPass { +class ConstantPropagationPass : public ConditionalGroupSubpass { public: ConstantPropagationPass(); ~ConstantPropagationPass() override; - bool Run(hir::HIRBuilder* builder) override; + bool Run(hir::HIRBuilder* builder, bool& result) override; private: }; diff --git a/src/xenia/cpu/compiler/passes/simplification_pass.cc b/src/xenia/cpu/compiler/passes/simplification_pass.cc index 3278ab7c6..3569887a4 100644 --- a/src/xenia/cpu/compiler/passes/simplification_pass.cc +++ b/src/xenia/cpu/compiler/passes/simplification_pass.cc @@ -23,17 +23,18 @@ using xe::cpu::hir::HIRBuilder; using xe::cpu::hir::Instr; using xe::cpu::hir::Value; -SimplificationPass::SimplificationPass() : CompilerPass() {} +SimplificationPass::SimplificationPass() : ConditionalGroupSubpass() {} SimplificationPass::~SimplificationPass() {} -bool SimplificationPass::Run(HIRBuilder* builder) { - EliminateConversions(builder); - SimplifyAssignments(builder); +bool SimplificationPass::Run(HIRBuilder* builder, bool& result) { + result = false; + result |= EliminateConversions(builder); + result |= SimplifyAssignments(builder); return true; } -void SimplificationPass::EliminateConversions(HIRBuilder* builder) { +bool SimplificationPass::EliminateConversions(HIRBuilder* builder) { // First, we check for truncates/extensions that can be skipped. // This generates some assignments which then the second step will clean up. 
// Both zero/sign extends can be skipped: @@ -43,6 +44,7 @@ void SimplificationPass::EliminateConversions(HIRBuilder* builder) { // v1.i64 = zero/sign_extend v0.i32 (may be dead code removed later) // v2.i32 = v0.i32 + bool result = false; auto block = builder->first_block(); while (block) { auto i = block->instr_head; @@ -51,20 +53,21 @@ void SimplificationPass::EliminateConversions(HIRBuilder* builder) { // back to definition). if (i->opcode == &OPCODE_TRUNCATE_info) { // Matches zero/sign_extend + truncate. - CheckTruncate(i); + result |= CheckTruncate(i); } else if (i->opcode == &OPCODE_BYTE_SWAP_info) { // Matches byte swap + byte swap. // This is pretty rare within the same basic block, but is in the // memcpy hot path and (probably) worth it. Maybe. - CheckByteSwap(i); + result |= CheckByteSwap(i); } i = i->next; } block = block->next; } + return result; } -void SimplificationPass::CheckTruncate(Instr* i) { +bool SimplificationPass::CheckTruncate(Instr* i) { // Walk backward up src's chain looking for an extend. We may have // assigns, so skip those. auto src = i->src1.value; @@ -80,6 +83,7 @@ void SimplificationPass::CheckTruncate(Instr* i) { // Types match, use original by turning this into an assign. i->Replace(&OPCODE_ASSIGN_info, 0); i->set_src1(def->src1.value); + return true; } } else if (def->opcode == &OPCODE_ZERO_EXTEND_info) { // Value comes from a zero extend. @@ -87,12 +91,14 @@ void SimplificationPass::CheckTruncate(Instr* i) { // Types match, use original by turning this into an assign. i->Replace(&OPCODE_ASSIGN_info, 0); i->set_src1(def->src1.value); + return true; } } } + return false; } -void SimplificationPass::CheckByteSwap(Instr* i) { +bool SimplificationPass::CheckByteSwap(Instr* i) { // Walk backward up src's chain looking for a byte swap. We may have // assigns, so skip those. auto src = i->src1.value; @@ -107,11 +113,13 @@ void SimplificationPass::CheckByteSwap(Instr* i) { // Types match, use original by turning this into an assign. 
i->Replace(&OPCODE_ASSIGN_info, 0); i->set_src1(def->src1.value); + return true; } } + return false; } -void SimplificationPass::SimplifyAssignments(HIRBuilder* builder) { +bool SimplificationPass::SimplifyAssignments(HIRBuilder* builder) { // Run over the instructions and rename assigned variables: // v1 = v0 // v2 = v1 @@ -129,27 +137,35 @@ void SimplificationPass::SimplifyAssignments(HIRBuilder* builder) { // of that instr. Because we may have chains, we do this recursively until // we find a non-assign def. + bool result = false; auto block = builder->first_block(); while (block) { auto i = block->instr_head; while (i) { uint32_t signature = i->opcode->signature; if (GET_OPCODE_SIG_TYPE_SRC1(signature) == OPCODE_SIG_TYPE_V) { - i->set_src1(CheckValue(i->src1.value)); + bool modified = false; + i->set_src1(CheckValue(i->src1.value, modified)); + result |= modified; } if (GET_OPCODE_SIG_TYPE_SRC2(signature) == OPCODE_SIG_TYPE_V) { - i->set_src2(CheckValue(i->src2.value)); + bool modified = false; + i->set_src2(CheckValue(i->src2.value, modified)); + result |= modified; } if (GET_OPCODE_SIG_TYPE_SRC3(signature) == OPCODE_SIG_TYPE_V) { - i->set_src3(CheckValue(i->src3.value)); + bool modified = false; + i->set_src3(CheckValue(i->src3.value, modified)); + result |= modified; } i = i->next; } block = block->next; } + return result; } -Value* SimplificationPass::CheckValue(Value* value) { +Value* SimplificationPass::CheckValue(Value* value, bool& result) { auto def = value->def; if (def && def->opcode == &OPCODE_ASSIGN_info) { // Value comes from an assignment - recursively find if it comes from @@ -162,8 +178,10 @@ Value* SimplificationPass::CheckValue(Value* value) { } replacement = def->src1.value; } + result = true; return replacement; } + result = false; return value; } diff --git a/src/xenia/cpu/compiler/passes/simplification_pass.h b/src/xenia/cpu/compiler/passes/simplification_pass.h index 70275f8b4..2ba6efad7 100644 --- 
a/src/xenia/cpu/compiler/passes/simplification_pass.h +++ b/src/xenia/cpu/compiler/passes/simplification_pass.h @@ -10,27 +10,27 @@ #ifndef XENIA_CPU_COMPILER_PASSES_SIMPLIFICATION_PASS_H_ #define XENIA_CPU_COMPILER_PASSES_SIMPLIFICATION_PASS_H_ -#include "xenia/cpu/compiler/compiler_pass.h" +#include "xenia/cpu/compiler/passes/conditional_group_subpass.h" namespace xe { namespace cpu { namespace compiler { namespace passes { -class SimplificationPass : public CompilerPass { +class SimplificationPass : public ConditionalGroupSubpass { public: SimplificationPass(); ~SimplificationPass() override; - bool Run(hir::HIRBuilder* builder) override; + bool Run(hir::HIRBuilder* builder, bool& result) override; private: - void EliminateConversions(hir::HIRBuilder* builder); - void CheckTruncate(hir::Instr* i); - void CheckByteSwap(hir::Instr* i); + bool EliminateConversions(hir::HIRBuilder* builder); + bool CheckTruncate(hir::Instr* i); + bool CheckByteSwap(hir::Instr* i); - void SimplifyAssignments(hir::HIRBuilder* builder); - hir::Value* CheckValue(hir::Value* value); + bool SimplifyAssignments(hir::HIRBuilder* builder); + hir::Value* CheckValue(hir::Value* value, bool& result); }; } // namespace passes diff --git a/src/xenia/cpu/hir/value.h b/src/xenia/cpu/hir/value.h index ff41edf3b..dcc95ca8c 100644 --- a/src/xenia/cpu/hir/value.h +++ b/src/xenia/cpu/hir/value.h @@ -170,6 +170,7 @@ class Value { constant.v128 = value; } void set_from(const Value* other) { + assert_true(other->IsConstant()); type = other->type; flags = other->flags; constant.v128 = other->constant.v128; diff --git a/src/xenia/cpu/ppc/ppc_translator.cc b/src/xenia/cpu/ppc/ppc_translator.cc index ec1768163..d408f75b1 100644 --- a/src/xenia/cpu/ppc/ppc_translator.cc +++ b/src/xenia/cpu/ppc/ppc_translator.cc @@ -53,15 +53,16 @@ PPCTranslator::PPCTranslator(PPCFrontend* frontend) : frontend_(frontend) { if (validate) compiler_->AddPass(std::make_unique()); compiler_->AddPass(std::make_unique()); if (validate) 
compiler_->AddPass(std::make_unique()); - // TODO(gibbed): loop until these passes stop making changes? - for (int i = 0; i < 5; ++i) { - compiler_->AddPass(std::make_unique()); - if (validate) - compiler_->AddPass(std::make_unique()); - compiler_->AddPass(std::make_unique()); - if (validate) - compiler_->AddPass(std::make_unique()); - } + + // Grouped simplification + constant propagation. + // Loops until no changes are made. + auto sap = std::make_unique(); + sap->AddPass(std::make_unique()); + if (validate) sap->AddPass(std::make_unique()); + sap->AddPass(std::make_unique()); + if (validate) sap->AddPass(std::make_unique()); + compiler_->AddPass(std::move(sap)); + if (backend->machine_info()->supports_extended_load_store) { // Backend supports the advanced LOAD/STORE instructions. // These will save us a lot of HIR opcodes. diff --git a/src/xenia/cpu/ppc/testing/premake5.lua b/src/xenia/cpu/ppc/testing/premake5.lua index 78874ecc4..d2d5549cd 100644 --- a/src/xenia/cpu/ppc/testing/premake5.lua +++ b/src/xenia/cpu/ppc/testing/premake5.lua @@ -13,6 +13,7 @@ project("xenia-cpu-ppc-tests") "xenia-base", "gflags", "capstone", -- cpu-backend-x64 + "mspack", }) files({ "ppc_testing_main.cc", diff --git a/src/xenia/cpu/premake5.lua b/src/xenia/cpu/premake5.lua index 08fd41c0d..96a41f6e0 100644 --- a/src/xenia/cpu/premake5.lua +++ b/src/xenia/cpu/premake5.lua @@ -8,6 +8,7 @@ project("xenia-cpu") language("C++") links({ "xenia-base", + "mspack", }) includedirs({ project_root.."/third_party/llvm/include", diff --git a/src/xenia/cpu/xex_module.cc b/src/xenia/cpu/xex_module.cc index 8569e7a9a..6f8537970 100644 --- a/src/xenia/cpu/xex_module.cc +++ b/src/xenia/cpu/xex_module.cc @@ -25,7 +25,6 @@ #include "third_party/crypto/rijndael-alg-fst.c" #include "third_party/crypto/rijndael-alg-fst.h" #include "third_party/mspack/lzx.h" -#include "third_party/mspack/lzxd.c" #include "third_party/mspack/mspack.h" #include "third_party/pe/pe_image.h" @@ -120,7 +119,7 @@ int 
lzx_decompress(const void* lzx_data, size_t lzx_len, void* dest, mspack_memory_file* lzxdst = mspack_memory_open(sys, dest, dest_len); lzxd_stream* lzxd = lzxd_init(sys, (struct mspack_file*)lzxsrc, (struct mspack_file*)lzxdst, - window_bits, 0, 0x8000, (off_t)dest_len); + window_bits, 0, 0x8000, (off_t)dest_len, 0); if (lzxd) { if (window_data) { @@ -1120,23 +1119,23 @@ bool XexModule::LoadContinue() { processor_->backend()->CommitExecutableRange(low_address_, high_address_); // Add all imports (variables/functions). - xex2_opt_import_libraries* opt_import_header = nullptr; - GetOptHeader(XEX_HEADER_IMPORT_LIBRARIES, &opt_import_header); + xex2_opt_import_libraries* opt_import_libraries = nullptr; + GetOptHeader(XEX_HEADER_IMPORT_LIBRARIES, &opt_import_libraries); - if (opt_import_header) { + if (opt_import_libraries) { // FIXME: Don't know if 32 is the actual limit, but haven't seen more than // 2. const char* string_table[32]; std::memset(string_table, 0, sizeof(string_table)); - size_t max_string_table_index = 0; // Parse the string table - for (size_t i = 0; i < opt_import_header->string_table_size; - ++max_string_table_index) { - assert_true(max_string_table_index < xe::countof(string_table)); - const char* str = opt_import_header->string_table + i; + for (size_t i = 0, o = 0; i < opt_import_libraries->string_table.size && + o < opt_import_libraries->string_table.count; + ++o) { + assert_true(o < xe::countof(string_table)); + const char* str = &opt_import_libraries->string_table.data[i]; - string_table[max_string_table_index] = str; + string_table[o] = str; i += std::strlen(str) + 1; // Padding @@ -1145,15 +1144,19 @@ bool XexModule::LoadContinue() { } } - auto libraries_ptr = reinterpret_cast(opt_import_header) + - opt_import_header->string_table_size + 12; + auto library_data = reinterpret_cast(opt_import_libraries) + + opt_import_libraries->string_table.size + 12; uint32_t library_offset = 0; - uint32_t library_count = opt_import_header->library_count; - 
for (uint32_t i = 0; i < library_count; i++) { - auto library = reinterpret_cast(libraries_ptr + - library_offset); + while (library_offset < opt_import_libraries->size) { + auto library = + reinterpret_cast(library_data + library_offset); + if (!library->size) { + break; + } size_t library_name_index = library->name_index & 0xFF; - assert_true(library_name_index < max_string_table_index); + assert_true(library_name_index < + opt_import_libraries->string_table.count); + assert_not_null(string_table[library_name_index]); SetupLibraryImports(string_table[library_name_index], library); library_offset += library->size; } @@ -1313,10 +1316,12 @@ bool XexModule::SetupLibraryImports(const char* name, var_info->set_status(Symbol::Status::kDefined); } else if (record_type == 1) { // Thunk. - assert_true(library_info.imports.size() > 0); - auto& prev_import = library_info.imports[library_info.imports.size() - 1]; - assert_true(prev_import.ordinal == ordinal); - prev_import.thunk_address = record_addr; + if (library_info.imports.size() > 0) { + auto& prev_import = + library_info.imports[library_info.imports.size() - 1]; + assert_true(prev_import.ordinal == ordinal); + prev_import.thunk_address = record_addr; + } if (kernel_export) { import_name.AppendFormat("%s", kernel_export->name); diff --git a/src/xenia/gpu/vulkan/premake5.lua b/src/xenia/gpu/vulkan/premake5.lua index b399914a8..f8e9903fe 100644 --- a/src/xenia/gpu/vulkan/premake5.lua +++ b/src/xenia/gpu/vulkan/premake5.lua @@ -38,6 +38,7 @@ project("xenia-gpu-vulkan-trace-viewer") "imgui", "libavcodec", "libavutil", + "mspack", "snappy", "spirv-tools", "volk", @@ -110,6 +111,7 @@ project("xenia-gpu-vulkan-trace-dump") "imgui", "libavcodec", "libavutil", + "mspack", "snappy", "spirv-tools", "volk", diff --git a/src/xenia/kernel/user_module.cc b/src/xenia/kernel/user_module.cc index a79091d3c..d36aa67ad 100644 --- a/src/xenia/kernel/user_module.cc +++ b/src/xenia/kernel/user_module.cc @@ -486,29 +486,33 @@ void 
UserModule::Dump() { std::memset(string_table, 0, sizeof(string_table)); // Parse the string table - for (size_t l = 0, j = 0; l < opt_import_libraries->string_table_size; - j++) { - assert_true(j < xe::countof(string_table)); - const char* str = opt_import_libraries->string_table + l; + for (size_t j = 0, o = 0; j < opt_import_libraries->string_table.size && + o < opt_import_libraries->string_table.count; + o++) { + assert_true(o < xe::countof(string_table)); + const char* str = &opt_import_libraries->string_table.data[j]; - string_table[j] = str; - l += std::strlen(str) + 1; + string_table[o] = str; + j += std::strlen(str) + 1; // Padding - if ((l % 4) != 0) { - l += 4 - (l % 4); + if ((j % 4) != 0) { + j += 4 - (j % 4); } } - auto libraries = + auto library_data = reinterpret_cast(opt_import_libraries) + - opt_import_libraries->string_table_size + 12; + opt_import_libraries->string_table.size + 12; uint32_t library_offset = 0; - uint32_t library_count = opt_import_libraries->library_count; - for (uint32_t l = 0; l < library_count; l++) { + while (library_offset < opt_import_libraries->size) { auto library = reinterpret_cast( - libraries + library_offset); + library_data + library_offset); + if (!library->size) { + break; + } auto name = string_table[library->name_index & 0xFF]; + assert_not_null(name); sb.AppendFormat(" %s - %d imports\n", name, (uint16_t)library->count); @@ -786,11 +790,11 @@ void UserModule::Dump() { } if (kernel_export && kernel_export->type == cpu::Export::Type::kVariable) { - sb.AppendFormat(" V %.8X %.3X (%3d) %s %s\n", + sb.AppendFormat(" V %.8X %.3X (%4d) %s %s\n", info->value_address, info->ordinal, info->ordinal, implemented ? " " : "!!", name); } else if (info->thunk_address) { - sb.AppendFormat(" F %.8X %.8X %.3X (%3d) %s %s\n", + sb.AppendFormat(" F %.8X %.8X %.3X (%4d) %s %s\n", info->value_address, info->thunk_address, info->ordinal, info->ordinal, implemented ? 
" " : "!!", name); diff --git a/src/xenia/kernel/util/xex2_info.h b/src/xenia/kernel/util/xex2_info.h index 23aa62524..f91b7c30f 100644 --- a/src/xenia/kernel/util/xex2_info.h +++ b/src/xenia/kernel/util/xex2_info.h @@ -474,10 +474,12 @@ struct xex2_opt_execution_info { static_assert_size(xex2_opt_execution_info, 0x18); struct xex2_opt_import_libraries { - xe::be section_size; // 0x0 - xe::be string_table_size; // 0x4 - xe::be library_count; // 0x8 - char string_table[1]; // 0xC string_table_size bytes + xe::be size; // 0x0 + struct { + xe::be size; // 0x4 + xe::be count; // 0x8 + char data[1]; // 0xC string_table_size bytes + } string_table; }; struct xex2_import_library { diff --git a/src/xenia/kernel/xam/xam_content.cc b/src/xenia/kernel/xam/xam_content.cc index 5a3c718c0..570e29947 100644 --- a/src/xenia/kernel/xam/xam_content.cc +++ b/src/xenia/kernel/xam/xam_content.cc @@ -23,7 +23,7 @@ struct DeviceInfo { uint32_t device_type; uint64_t total_bytes; uint64_t free_bytes; - std::wstring name; + wchar_t name[28]; }; static const DeviceInfo dummy_device_info_ = { 0xF00D0000, @@ -57,7 +57,7 @@ dword_result_t XamContentGetDeviceName(dword_t device_id, return X_ERROR_DEVICE_NOT_CONNECTED; } - if (name_capacity < dummy_device_info_.name.size() + 1) { + if (name_capacity < wcslen(dummy_device_info_.name) + 1) { return X_ERROR_INSUFFICIENT_BUFFER; } @@ -174,6 +174,35 @@ dword_result_t XamContentCreateEnumerator(dword_t user_index, dword_t device_id, } DECLARE_XAM_EXPORT1(XamContentCreateEnumerator, kContent, kImplemented); +dword_result_t XamContentCreateDeviceEnumerator(dword_t content_type, + dword_t content_flags, + dword_t max_count, + lpdword_t buffer_size_ptr, + lpdword_t handle_out) { + assert_not_null(handle_out); + + if (buffer_size_ptr) { + *buffer_size_ptr = sizeof(DeviceInfo) * max_count; + } + + auto e = new XStaticEnumerator(kernel_state(), max_count, sizeof(DeviceInfo)); + e->Initialize(); + + // Copy our dummy device into the enumerator + DeviceInfo* 
dev = (DeviceInfo*)e->AppendItem(); + if (dev) { + xe::store_and_swap(&dev->device_id, dummy_device_info_.device_id); + xe::store_and_swap(&dev->device_type, dummy_device_info_.device_type); + xe::store_and_swap(&dev->total_bytes, dummy_device_info_.total_bytes); + xe::store_and_swap(&dev->free_bytes, dummy_device_info_.free_bytes); + xe::copy_and_swap(dev->name, dummy_device_info_.name, 28); + } + + *handle_out = e->handle(); + return X_ERROR_SUCCESS; +} +DECLARE_XAM_EXPORT1(XamContentCreateDeviceEnumerator, kNone, kImplemented); + dword_result_t XamContentCreateEx(dword_t user_index, lpstring_t root_name, lpvoid_t content_data_ptr, dword_t flags, lpdword_t disposition_ptr, diff --git a/src/xenia/kernel/xam/xam_info.cc b/src/xenia/kernel/xam/xam_info.cc index 38fdc04d0..997bbe33f 100644 --- a/src/xenia/kernel/xam/xam_info.cc +++ b/src/xenia/kernel/xam/xam_info.cc @@ -17,6 +17,10 @@ #include "xenia/kernel/xthread.h" #include "xenia/xbox.h" +#if XE_PLATFORM_WIN32 +#include "xenia/base/platform_win.h" +#endif + namespace xe { namespace kernel { namespace xam { @@ -24,6 +28,152 @@ namespace xam { constexpr uint32_t X_LANGUAGE_ENGLISH = 1; constexpr uint32_t X_LANGUAGE_JAPANESE = 2; +dword_result_t XamGetOnlineSchema() { + static uint32_t schema_guest = 0; + static uint32_t schema_ptr_guest = 0; + + if (!schema_guest) { + // create a dummy schema, 8 bytes of 0 seems to work fine + // (with another 8 bytes for schema ptr/schema size) + schema_guest = kernel_state()->memory()->SystemHeapAlloc(16); + schema_ptr_guest = schema_guest + 8; + + auto schema = kernel_state()->memory()->TranslateVirtual(schema_guest); + memset(schema, 0, 16); + + // store schema ptr + size + xe::store_and_swap(schema + 0x8, schema_guest); + xe::store_and_swap(schema + 0xC, 0x8); + } + + // return pointer to the schema ptr/schema size struct + return schema_ptr_guest; +} +DECLARE_XAM_EXPORT2(XamGetOnlineSchema, kNone, kImplemented, kSketchy); + +void XamFormatDateString(dword_t unk, qword_t 
filetime, lpvoid_t buffer, + dword_t buffer_length) { + std::memset(buffer, 0, buffer_length * 2); + +// TODO: implement this for other platforms +#if XE_PLATFORM_WIN32 + FILETIME t; + t.dwHighDateTime = filetime >> 32; + t.dwLowDateTime = (uint32_t)filetime; + + SYSTEMTIME st; + SYSTEMTIME stLocal; + + FileTimeToSystemTime(&t, &st); + SystemTimeToTzSpecificLocalTime(NULL, &st, &stLocal); + + wchar_t buf[256]; + // TODO: format this depending on users locale? + swprintf(buf, 256, L"%02d/%02d/%d", stLocal.wMonth, stLocal.wDay, + stLocal.wYear); + + xe::copy_and_swap((wchar_t*)buffer.host_address(), buf, buffer_length); +#else + assert_always(); +#endif +} +DECLARE_XAM_EXPORT1(XamFormatDateString, kNone, kImplemented); + +void XamFormatTimeString(dword_t unk, qword_t filetime, lpvoid_t buffer, + dword_t buffer_length) { + std::memset(buffer, 0, buffer_length * 2); + +// TODO: implement this for other platforms +#if XE_PLATFORM_WIN32 + FILETIME t; + t.dwHighDateTime = filetime >> 32; + t.dwLowDateTime = (uint32_t)filetime; + + SYSTEMTIME st; + SYSTEMTIME stLocal; + + FileTimeToSystemTime(&t, &st); + SystemTimeToTzSpecificLocalTime(NULL, &st, &stLocal); + + wchar_t buf[256]; + swprintf(buf, 256, L"%02d:%02d", stLocal.wHour, stLocal.wMinute); + + xe::copy_and_swap((wchar_t*)buffer.host_address(), buf, buffer_length); +#else + assert_always(); +#endif +} +DECLARE_XAM_EXPORT1(XamFormatTimeString, kNone, kImplemented); + +dword_result_t keXamBuildResourceLocator(uint64_t module, + const wchar_t* container, + const wchar_t* resource, + lpvoid_t buffer, + uint32_t buffer_length) { + wchar_t buf[256]; + + if (!module) { + swprintf(buf, 256, L"file://media:/%s.xzp#%s", container, resource); + XELOGD( + "XamBuildResourceLocator(%ws) returning locator to local file %ws.xzp", + container, container); + } else { + swprintf(buf, 256, L"section://%X,%s#%s", (uint32_t)module, container, + resource); + } + + xe::copy_and_swap((wchar_t*)buffer.host_address(), buf, buffer_length); + 
return 0; +} + +dword_result_t XamBuildResourceLocator(qword_t module, lpwstring_t container, + lpwstring_t resource, lpvoid_t buffer, + dword_t buffer_length) { + return keXamBuildResourceLocator(module, container.value().c_str(), + resource.value().c_str(), buffer, + buffer_length); +} +DECLARE_XAM_EXPORT1(XamBuildResourceLocator, kNone, kImplemented); + +dword_result_t XamBuildGamercardResourceLocator(lpwstring_t filename, + lpvoid_t buffer, + dword_t buffer_length) { + // On an actual xbox these funcs would return a locator to xam.xex resources, + // but for Xenia we can return a locator to the resources as local files. (big + // thanks to MS for letting XamBuildResourceLocator return local file + // locators!) + + // If you're running an app that'll need them, make sure to extract xam.xex + // resources with xextool ("xextool -d . xam.xex") and add a .xzp extension. + + return keXamBuildResourceLocator(0, L"gamercrd", filename.value().c_str(), + buffer, buffer_length); +} +DECLARE_XAM_EXPORT1(XamBuildGamercardResourceLocator, kNone, kImplemented); + +dword_result_t XamBuildSharedSystemResourceLocator(lpwstring_t filename, + lpvoid_t buffer, + dword_t buffer_length) { + // see notes inside XamBuildGamercardResourceLocator above + return keXamBuildResourceLocator(0, L"shrdres", filename.value().c_str(), + buffer, buffer_length); +} +DECLARE_XAM_EXPORT1(XamBuildSharedSystemResourceLocator, kNone, kImplemented); + +dword_result_t XamBuildLegacySystemResourceLocator(lpwstring_t filename, + lpvoid_t buffer, + dword_t buffer_length) { + return XamBuildSharedSystemResourceLocator(filename, buffer, buffer_length); +} +DECLARE_XAM_EXPORT1(XamBuildLegacySystemResourceLocator, kNone, kImplemented); + +dword_result_t XamBuildXamResourceLocator(lpwstring_t filename, lpvoid_t buffer, + dword_t buffer_length) { + return keXamBuildResourceLocator(0, L"xam", filename.value().c_str(), buffer, + buffer_length); +} +DECLARE_XAM_EXPORT1(XamBuildXamResourceLocator, kNone, 
kImplemented); + dword_result_t XamGetSystemVersion() { // eh, just picking one. If we go too low we may break new games, but // this value seems to be used for conditionally loading symbols and if diff --git a/src/xenia/kernel/xam/xam_notify.cc b/src/xenia/kernel/xam/xam_notify.cc index 1a7337ab8..e3765af25 100644 --- a/src/xenia/kernel/xam/xam_notify.cc +++ b/src/xenia/kernel/xam/xam_notify.cc @@ -18,7 +18,8 @@ namespace xe { namespace kernel { namespace xam { -dword_result_t XamNotifyCreateListener(qword_t mask, dword_t one) { +dword_result_t XamNotifyCreateListenerInternal(qword_t mask, dword_t unk, + dword_t one) { // r4=1 may indicate user process? auto listener = @@ -30,6 +31,12 @@ dword_result_t XamNotifyCreateListener(qword_t mask, dword_t one) { return handle; } +DECLARE_XAM_EXPORT2(XamNotifyCreateListenerInternal, kNone, kImplemented, + kSketchy); + +dword_result_t XamNotifyCreateListener(qword_t mask, dword_t one) { + return XamNotifyCreateListenerInternal(mask, 0, one); +} DECLARE_XAM_EXPORT1(XamNotifyCreateListener, kNone, kImplemented); // https://github.com/CodeAsm/ffplay360/blob/master/Common/AtgSignIn.cpp diff --git a/src/xenia/kernel/xam/xam_table.inc b/src/xenia/kernel/xam/xam_table.inc index cec253c69..0bc4f02e6 100644 --- a/src/xenia/kernel/xam/xam_table.inc +++ b/src/xenia/kernel/xam/xam_table.inc @@ -588,7 +588,7 @@ XE_EXPORT(xam, 0x00000318, XamVoiceGetMicArrayStatus, XE_EXPORT(xam, 0x00000319, XamVoiceSetAudioCaptureRoutine, kFunction), XE_EXPORT(xam, 0x0000031A, XamVoiceGetDirectionalData, kFunction), XE_EXPORT(xam, 0x0000031B, XamBuildResourceLocator, kFunction), -XE_EXPORT(xam, 0x0000031C, XamBuildSharedSystemResourceLocator_, kFunction), +XE_EXPORT(xam, 0x0000031C, XamBuildLegacySystemResourceLocator, kFunction), XE_EXPORT(xam, 0x0000031D, XamBuildGamercardResourceLocator, kFunction), XE_EXPORT(xam, 0x0000031E, XamBuildDynamicResourceLocator, kFunction), XE_EXPORT(xam, 0x0000031F, XamBuildXamResourceLocator, kFunction), diff --git 
a/src/xenia/kernel/xboxkrnl/xboxkrnl_module.cc b/src/xenia/kernel/xboxkrnl/xboxkrnl_module.cc index 706c58ca8..f38d27d35 100644 --- a/src/xenia/kernel/xboxkrnl/xboxkrnl_module.cc +++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_module.cc @@ -159,6 +159,14 @@ XboxkrnlModule::XboxkrnlModule(Emulator* emulator, KernelState* kernel_state) xe::store_and_swap(lpXboxHardwareInfo + 4, 0x06); // cpu count // Remaining 11b are zeroes? + // ExConsoleGameRegion, probably same values as keyvault region uses? + // Just return all 0xFF, should satisfy anything that checks it + uint32_t pExConsoleGameRegion = memory_->SystemHeapAlloc(4); + auto lpExConsoleGameRegion = memory_->TranslateVirtual(pExConsoleGameRegion); + export_resolver_->SetVariableMapping( + "xboxkrnl.exe", ordinals::ExConsoleGameRegion, pExConsoleGameRegion); + xe::store(lpExConsoleGameRegion, 0xFFFFFFFF); + // XexExecutableModuleHandle (?**) // Games try to dereference this to get a pointer to some module struct. // So far it seems like it's just in loader code, and only used to look up diff --git a/src/xenia/kernel/xboxkrnl/xboxkrnl_strings.cc b/src/xenia/kernel/xboxkrnl/xboxkrnl_strings.cc index 2067f29e6..aa1bbf245 100644 --- a/src/xenia/kernel/xboxkrnl/xboxkrnl_strings.cc +++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_strings.cc @@ -1009,6 +1009,46 @@ SHIM_CALL _vsnprintf_shim(PPCContext* ppc_context, KernelState* kernel_state) { SHIM_SET_RETURN_32(count); } +// https://msdn.microsoft.com/en-us/library/1kt27hek.aspx +SHIM_CALL _vsnwprintf_shim(PPCContext* ppc_context, KernelState* kernel_state) { + uint32_t buffer_ptr = SHIM_GET_ARG_32(0); + int32_t buffer_count = SHIM_GET_ARG_32(1); + uint32_t format_ptr = SHIM_GET_ARG_32(2); + uint32_t arg_ptr = SHIM_GET_ARG_32(3); + + XELOGD("_vsnwprintf(%08X, %i, %08X, %08X)", buffer_ptr, buffer_count, + format_ptr, arg_ptr); + + if (buffer_ptr == 0 || buffer_count <= 0 || format_ptr == 0) { + SHIM_SET_RETURN_32(-1); + return; + } + + auto buffer = (uint16_t*)SHIM_MEM_ADDR(buffer_ptr); 
+ auto format = (const uint16_t*)SHIM_MEM_ADDR(format_ptr); + + ArrayArgList args(ppc_context, arg_ptr); + WideStringFormatData data(format); + + int32_t count = format_core(ppc_context, data, args, true); + if (count < 0) { + // Error. + if (buffer_count > 0) { + buffer[0] = '\0'; // write a null, just to be safe + } + } else if (count <= buffer_count) { + // Fit within the buffer. + xe::copy_and_swap(buffer, (uint16_t*)data.wstr().c_str(), count); + if (count < buffer_count) { + buffer[count] = '\0'; + } + } else { + // Overflowed buffer. We still return the count we would have written. + xe::copy_and_swap(buffer, (uint16_t*)data.wstr().c_str(), buffer_count); + } + SHIM_SET_RETURN_32(count); +} + // https://msdn.microsoft.com/en-us/library/28d5ce15.aspx SHIM_CALL vsprintf_shim(PPCContext* ppc_context, KernelState* kernel_state) { uint32_t buffer_ptr = SHIM_GET_ARG_32(0); @@ -1100,6 +1140,7 @@ void RegisterStringExports(xe::cpu::ExportResolver* export_resolver, SHIM_SET_MAPPING("xboxkrnl.exe", vsprintf, state); SHIM_SET_MAPPING("xboxkrnl.exe", _vscwprintf, state); SHIM_SET_MAPPING("xboxkrnl.exe", vswprintf, state); + SHIM_SET_MAPPING("xboxkrnl.exe", _vsnwprintf, state); } } // namespace xboxkrnl diff --git a/third_party/mspack.lua b/third_party/mspack.lua new file mode 100644 index 000000000..85b6bc08f --- /dev/null +++ b/third_party/mspack.lua @@ -0,0 +1,33 @@ +group("third_party") +project("mspack") + uuid("0881692A-75A1-4E7B-87D8-BB9108CEDEA4") + kind("StaticLib") + language("C") + + defines({ + "_LIB", + "HAVE_CONFIG_H", + }) + removedefines({ + "_UNICODE", + "UNICODE", + }) + includedirs({ + "mspack", + }) + files({ + "mspack/lzx.h", + "mspack/lzxd.c", + "mspack/mspack.h", + "mspack/readbits.h", + "mspack/readhuff.h", + "mspack/system.c", + "mspack/system.h", + }) + + filter("platforms:Windows") + defines({ + }) + filter("platforms:Linux") + defines({ + }) diff --git a/third_party/mspack/COPYING.LIB b/third_party/mspack/COPYING.LIB new file mode 100644 index 
000000000..b1e3f5a26 --- /dev/null +++ b/third_party/mspack/COPYING.LIB @@ -0,0 +1,504 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 2.1, February 1999 + + Copyright (C) 1991, 1999 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + +[This is the first released version of the Lesser GPL. It also counts + as the successor of the GNU Library Public License, version 2, hence + the version number 2.1.] + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +Licenses are intended to guarantee your freedom to share and change +free software--to make sure the software is free for all its users. + + This license, the Lesser General Public License, applies to some +specially designated software packages--typically libraries--of the +Free Software Foundation and other authors who decide to use it. You +can use it too, but we suggest you first think carefully about whether +this license or the ordinary General Public License is the better +strategy to use in any particular case, based on the explanations below. + + When we speak of free software, we are referring to freedom of use, +not price. Our General Public Licenses are designed to make sure that +you have the freedom to distribute copies of free software (and charge +for this service if you wish); that you receive source code or can get +it if you want it; that you can change the software and use pieces of +it in new free programs; and that you are informed that you can do +these things. + + To protect your rights, we need to make restrictions that forbid +distributors to deny you these rights or to ask you to surrender these +rights. These restrictions translate to certain responsibilities for +you if you distribute copies of the library or if you modify it. 
+ + For example, if you distribute copies of the library, whether gratis +or for a fee, you must give the recipients all the rights that we gave +you. You must make sure that they, too, receive or can get the source +code. If you link other code with the library, you must provide +complete object files to the recipients, so that they can relink them +with the library after making changes to the library and recompiling +it. And you must show them these terms so they know their rights. + + We protect your rights with a two-step method: (1) we copyright the +library, and (2) we offer you this license, which gives you legal +permission to copy, distribute and/or modify the library. + + To protect each distributor, we want to make it very clear that +there is no warranty for the free library. Also, if the library is +modified by someone else and passed on, the recipients should know +that what they have is not the original version, so that the original +author's reputation will not be affected by problems that might be +introduced by others. + + Finally, software patents pose a constant threat to the existence of +any free program. We wish to make sure that a company cannot +effectively restrict the users of a free program by obtaining a +restrictive license from a patent holder. Therefore, we insist that +any patent license obtained for a version of the library must be +consistent with the full freedom of use specified in this license. + + Most GNU software, including some libraries, is covered by the +ordinary GNU General Public License. This license, the GNU Lesser +General Public License, applies to certain designated libraries, and +is quite different from the ordinary General Public License. We use +this license for certain libraries in order to permit linking those +libraries into non-free programs. 
+ + When a program is linked with a library, whether statically or using +a shared library, the combination of the two is legally speaking a +combined work, a derivative of the original library. The ordinary +General Public License therefore permits such linking only if the +entire combination fits its criteria of freedom. The Lesser General +Public License permits more lax criteria for linking other code with +the library. + + We call this license the "Lesser" General Public License because it +does Less to protect the user's freedom than the ordinary General +Public License. It also provides other free software developers Less +of an advantage over competing non-free programs. These disadvantages +are the reason we use the ordinary General Public License for many +libraries. However, the Lesser license provides advantages in certain +special circumstances. + + For example, on rare occasions, there may be a special need to +encourage the widest possible use of a certain library, so that it becomes +a de-facto standard. To achieve this, non-free programs must be +allowed to use the library. A more frequent case is that a free +library does the same job as widely used non-free libraries. In this +case, there is little to gain by limiting the free library to free +software only, so we use the Lesser General Public License. + + In other cases, permission to use a particular library in non-free +programs enables a greater number of people to use a large body of +free software. For example, permission to use the GNU C Library in +non-free programs enables many more people to use the whole GNU +operating system, as well as its variant, the GNU/Linux operating +system. + + Although the Lesser General Public License is Less protective of the +users' freedom, it does ensure that the user of a program that is +linked with the Library has the freedom and the wherewithal to run +that program using a modified version of the Library. 
+ + The precise terms and conditions for copying, distribution and +modification follow. Pay close attention to the difference between a +"work based on the library" and a "work that uses the library". The +former contains code derived from the library, whereas the latter must +be combined with the library in order to run. + + GNU LESSER GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License Agreement applies to any software library or other +program which contains a notice placed by the copyright holder or +other authorized party saying it may be distributed under the terms of +this Lesser General Public License (also called "this License"). +Each licensee is addressed as "you". + + A "library" means a collection of software functions and/or data +prepared so as to be conveniently linked with application programs +(which use some of those functions and data) to form executables. + + The "Library", below, refers to any such software library or work +which has been distributed under these terms. A "work based on the +Library" means either the Library or any derivative work under +copyright law: that is to say, a work containing the Library or a +portion of it, either verbatim or with modifications and/or translated +straightforwardly into another language. (Hereinafter, translation is +included without limitation in the term "modification".) + + "Source code" for a work means the preferred form of the work for +making modifications to it. For a library, complete source code means +all the source code for all modules it contains, plus any associated +interface definition files, plus the scripts used to control compilation +and installation of the library. + + Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. 
The act of +running a program using the Library is not restricted, and output from +such a program is covered only if its contents constitute a work based +on the Library (independent of the use of the Library in a tool for +writing it). Whether that is true depends on what the Library does +and what the program that uses the Library does. + + 1. You may copy and distribute verbatim copies of the Library's +complete source code as you receive it, in any medium, provided that +you conspicuously and appropriately publish on each copy an +appropriate copyright notice and disclaimer of warranty; keep intact +all the notices that refer to this License and to the absence of any +warranty; and distribute a copy of this License along with the +Library. + + You may charge a fee for the physical act of transferring a copy, +and you may at your option offer warranty protection in exchange for a +fee. + + 2. You may modify your copy or copies of the Library or any portion +of it, thus forming a work based on the Library, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) The modified work must itself be a software library. + + b) You must cause the files modified to carry prominent notices + stating that you changed the files and the date of any change. + + c) You must cause the whole of the work to be licensed at no + charge to all third parties under the terms of this License. + + d) If a facility in the modified Library refers to a function or a + table of data to be supplied by an application program that uses + the facility, other than as an argument passed when the facility + is invoked, then you must make a good faith effort to ensure that, + in the event an application does not supply such function or + table, the facility still operates, and performs whatever part of + its purpose remains meaningful. 
+ + (For example, a function in a library to compute square roots has + a purpose that is entirely well-defined independent of the + application. Therefore, Subsection 2d requires that any + application-supplied function or table used by this function must + be optional: if the application does not supply it, the square + root function must still compute square roots.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Library, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Library, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote +it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Library. + +In addition, mere aggregation of another work not based on the Library +with the Library (or with a work based on the Library) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may opt to apply the terms of the ordinary GNU General Public +License instead of this License to a given copy of the Library. To do +this, you must alter all the notices that refer to this License, so +that they refer to the ordinary GNU General Public License, version 2, +instead of to this License. (If a newer version than version 2 of the +ordinary GNU General Public License has appeared, then you can specify +that version instead if you wish.) Do not make any other change in +these notices. 
+ + Once this change is made in a given copy, it is irreversible for +that copy, so the ordinary GNU General Public License applies to all +subsequent copies and derivative works made from that copy. + + This option is useful when you wish to copy part of the code of +the Library into a program that is not a library. + + 4. You may copy and distribute the Library (or a portion or +derivative of it, under Section 2) in object code or executable form +under the terms of Sections 1 and 2 above provided that you accompany +it with the complete corresponding machine-readable source code, which +must be distributed under the terms of Sections 1 and 2 above on a +medium customarily used for software interchange. + + If distribution of object code is made by offering access to copy +from a designated place, then offering equivalent access to copy the +source code from the same place satisfies the requirement to +distribute the source code, even though third parties are not +compelled to copy the source along with the object code. + + 5. A program that contains no derivative of any portion of the +Library, but is designed to work with the Library by being compiled or +linked with it, is called a "work that uses the Library". Such a +work, in isolation, is not a derivative work of the Library, and +therefore falls outside the scope of this License. + + However, linking a "work that uses the Library" with the Library +creates an executable that is a derivative of the Library (because it +contains portions of the Library), rather than a "work that uses the +library". The executable is therefore covered by this License. +Section 6 states terms for distribution of such executables. + + When a "work that uses the Library" uses material from a header file +that is part of the Library, the object code for the work may be a +derivative work of the Library even though the source code is not. 
+Whether this is true is especially significant if the work can be +linked without the Library, or if the work is itself a library. The +threshold for this to be true is not precisely defined by law. + + If such an object file uses only numerical parameters, data +structure layouts and accessors, and small macros and small inline +functions (ten lines or less in length), then the use of the object +file is unrestricted, regardless of whether it is legally a derivative +work. (Executables containing this object code plus portions of the +Library will still fall under Section 6.) + + Otherwise, if the work is a derivative of the Library, you may +distribute the object code for the work under the terms of Section 6. +Any executables containing that work also fall under Section 6, +whether or not they are linked directly with the Library itself. + + 6. As an exception to the Sections above, you may also combine or +link a "work that uses the Library" with the Library to produce a +work containing portions of the Library, and distribute that work +under terms of your choice, provided that the terms permit +modification of the work for the customer's own use and reverse +engineering for debugging such modifications. + + You must give prominent notice with each copy of the work that the +Library is used in it and that the Library and its use are covered by +this License. You must supply a copy of this License. If the work +during execution displays copyright notices, you must include the +copyright notice for the Library among them, as well as a reference +directing the user to the copy of this License. 
Also, you must do one +of these things: + + a) Accompany the work with the complete corresponding + machine-readable source code for the Library including whatever + changes were used in the work (which must be distributed under + Sections 1 and 2 above); and, if the work is an executable linked + with the Library, with the complete machine-readable "work that + uses the Library", as object code and/or source code, so that the + user can modify the Library and then relink to produce a modified + executable containing the modified Library. (It is understood + that the user who changes the contents of definitions files in the + Library will not necessarily be able to recompile the application + to use the modified definitions.) + + b) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (1) uses at run time a + copy of the library already present on the user's computer system, + rather than copying library functions into the executable, and (2) + will operate properly with a modified version of the library, if + the user installs one, as long as the modified version is + interface-compatible with the version that the work was made with. + + c) Accompany the work with a written offer, valid for at + least three years, to give the same user the materials + specified in Subsection 6a, above, for a charge no more + than the cost of performing this distribution. + + d) If distribution of the work is made by offering access to copy + from a designated place, offer equivalent access to copy the above + specified materials from the same place. + + e) Verify that the user has already received a copy of these + materials or that you have already sent this user a copy. + + For an executable, the required form of the "work that uses the +Library" must include any data and utility programs needed for +reproducing the executable from it. 
However, as a special exception, +the materials to be distributed need not include anything that is +normally distributed (in either source or binary form) with the major +components (compiler, kernel, and so on) of the operating system on +which the executable runs, unless that component itself accompanies +the executable. + + It may happen that this requirement contradicts the license +restrictions of other proprietary libraries that do not normally +accompany the operating system. Such a contradiction means you cannot +use both them and the Library together in an executable that you +distribute. + + 7. You may place library facilities that are a work based on the +Library side-by-side in a single library together with other library +facilities not covered by this License, and distribute such a combined +library, provided that the separate distribution of the work based on +the Library and of the other library facilities is otherwise +permitted, and provided that you do these two things: + + a) Accompany the combined library with a copy of the same work + based on the Library, uncombined with any other library + facilities. This must be distributed under the terms of the + Sections above. + + b) Give prominent notice with the combined library of the fact + that part of it is a work based on the Library, and explaining + where to find the accompanying uncombined form of the same work. + + 8. You may not copy, modify, sublicense, link with, or distribute +the Library except as expressly provided under this License. Any +attempt otherwise to copy, modify, sublicense, link with, or +distribute the Library is void, and will automatically terminate your +rights under this License. However, parties who have received copies, +or rights, from you under this License will not have their licenses +terminated so long as such parties remain in full compliance. + + 9. You are not required to accept this License, since you have not +signed it. 
However, nothing else grants you permission to modify or +distribute the Library or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Library (or any work based on the +Library), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Library or works based on it. + + 10. Each time you redistribute the Library (or any work based on the +Library), the recipient automatically receives a license from the +original licensor to copy, distribute, link with or modify the Library +subject to these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties with +this License. + + 11. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Library at all. For example, if a patent +license would not permit royalty-free redistribution of the Library by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Library. + +If any portion of this section is held invalid or unenforceable under any +particular circumstance, the balance of the section is intended to apply, +and the section as a whole is intended to apply in other circumstances. 
+ +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 12. If the distribution and/or use of the Library is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Library under this License may add +an explicit geographical distribution limitation excluding those countries, +so that distribution is permitted only in or among countries not thus +excluded. In such case, this License incorporates the limitation as if +written in the body of this License. + + 13. The Free Software Foundation may publish revised and/or new +versions of the Lesser General Public License from time to time. +Such new versions will be similar in spirit to the present version, +but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Library +specifies a version number of this License which applies to it and +"any later version", you have the option of following the terms and +conditions either of that version or of any later version published by +the Free Software Foundation. If the Library does not specify a +license version number, you may choose any version ever published by +the Free Software Foundation. + + 14. 
If you wish to incorporate parts of the Library into other free +programs whose distribution conditions are incompatible with these, +write to the author to ask for permission. For software which is +copyrighted by the Free Software Foundation, write to the Free +Software Foundation; we sometimes make exceptions for this. Our +decision will be guided by the two goals of preserving the free status +of all derivatives of our free software and of promoting the sharing +and reuse of software generally. + + NO WARRANTY + + 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO +WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. +EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR +OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY +KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE +LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME +THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN +WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY +AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU +FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR +CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE +LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING +RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A +FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF +SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGES. 
+ + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Libraries + + If you develop a new library, and you want it to be of the greatest +possible use to the public, we recommend making it free software that +everyone can redistribute and change. You can do so by permitting +redistribution under these terms (or, alternatively, under the terms of the +ordinary General Public License). + + To apply these terms, attach the following notices to the library. It is +safest to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least the +"copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +Also add information on how to contact you by electronic and paper mail. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the library, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the + library `Frob' (a library for tweaking knobs) written by James Random Hacker. + + , 1 April 1990 + Ty Coon, President of Vice + +That's all there is to it! 
+ + diff --git a/third_party/mspack/config.h b/third_party/mspack/config.h new file mode 100644 index 000000000..c4d21f9f7 --- /dev/null +++ b/third_party/mspack/config.h @@ -0,0 +1,114 @@ +/* config.h.in. Generated from configure.ac by autoheader. */ + +/* Turn debugging mode on? */ +#undef DEBUG + +/* Define to 1 if you have the header file. */ +#undef HAVE_DLFCN_H + +/* Define to 1 if fseeko (and presumably ftello) exists and is declared. */ +#undef HAVE_FSEEKO + +/* Define to 1 if you have the header file. */ +#define HAVE_INTTYPES_H 1 + +/* Define to 1 if you have the header file. */ +#undef HAVE_MEMORY_H + +/* Define to 1 if you have the `mkdir' function. */ +#undef HAVE_MKDIR + +/* Define to 1 if you have the header file. */ +#define HAVE_STDINT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDLIB_H 1 + +/* Define to 1 if you have the header file. */ +#undef HAVE_STRINGS_H + +/* Define to 1 if you have the header file. */ +#define HAVE_STRING_H 1 + +/* Define to 1 if you have the header file. */ +#undef HAVE_SYS_STAT_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_SYS_TYPES_H + +/* Define to 1 if you have the `towlower' function. */ +#undef HAVE_TOWLOWER + +/* Define to 1 if you have the header file. */ +#undef HAVE_UNISTD_H + +/* Define to 1 if you have the `_mkdir' function. */ +#undef HAVE__MKDIR + +/* Define to the sub-directory where libtool stores uninstalled libraries. */ +#undef LT_OBJDIR + +/* Define if mkdir takes only one argument. */ +#undef MKDIR_TAKES_ONE_ARG + +/* Name of package */ +#undef PACKAGE + +/* Define to the address where bug reports for this package should be sent. */ +#undef PACKAGE_BUGREPORT + +/* Define to the full name of this package. */ +#undef PACKAGE_NAME + +/* Define to the full name and version of this package. */ +#undef PACKAGE_STRING + +/* Define to the one symbol short name of this package. */ +#undef PACKAGE_TARNAME + +/* Define to the home page for this package. 
*/ +#undef PACKAGE_URL + +/* Define to the version of this package. */ +#undef PACKAGE_VERSION + +/* The size of `off_t', as computed by sizeof. */ +#undef SIZEOF_OFF_T + +/* Define to 1 if you have the ANSI C header files. */ +#undef STDC_HEADERS + +/* Version number of package */ +#undef VERSION + +/* Enable large inode numbers on Mac OS X 10.5. */ +#ifndef _DARWIN_USE_64_BIT_INODE +# define _DARWIN_USE_64_BIT_INODE 1 +#endif + +/* Number of bits in a file offset, on hosts where this is settable. */ +#undef _FILE_OFFSET_BITS + +/* Define to 1 to make fseeko visible on some hosts (e.g. glibc 2.2). */ +#undef _LARGEFILE_SOURCE + +/* Define for large files, on AIX-style hosts. */ +#undef _LARGE_FILES + +/* Define to empty if `const' does not conform to ANSI C. */ +#undef const + +/* Define to `__inline__' or `__inline' if that's what the C compiler + calls it, or to nothing if 'inline' is not supported under any name. */ +#ifndef __cplusplus +#undef inline +#endif + +/* Define to `int' if does not define. */ +#undef mode_t + +/* Define to `long int' if does not define. */ +#undef off_t + +/* Define to `unsigned int' if does not define. */ +#undef size_t diff --git a/third_party/mspack/lzx.h b/third_party/mspack/lzx.h index e9eda0fbb..a6152f622 100644 --- a/third_party/mspack/lzx.h +++ b/third_party/mspack/lzx.h @@ -1,5 +1,5 @@ /* This file is part of libmspack. - * (C) 2003-2004 Stuart Caie. + * (C) 2003-2013 Stuart Caie. * * The LZX method was created by Jonathan Forbes and Tomi Poutanen, adapted * by Microsoft Corporation. 
@@ -13,6 +13,10 @@ #ifndef MSPACK_LZX_H #define MSPACK_LZX_H 1 +#ifdef __cplusplus +extern "C" { +#endif + /* LZX compression / decompression definitions */ /* some constants defined by the LZX specification */ @@ -31,7 +35,7 @@ /* LZX huffman defines: tweak tablebits as desired */ #define LZX_PRETREE_MAXSYMBOLS (LZX_PRETREE_NUM_ELEMENTS) #define LZX_PRETREE_TABLEBITS (6) -#define LZX_MAINTREE_MAXSYMBOLS (LZX_NUM_CHARS + 50*8) +#define LZX_MAINTREE_MAXSYMBOLS (LZX_NUM_CHARS + 290*8) #define LZX_MAINTREE_TABLEBITS (12) #define LZX_LENGTH_MAXSYMBOLS (LZX_NUM_SECONDARY_LENGTHS+1) #define LZX_LENGTH_TABLEBITS (12) @@ -51,6 +55,8 @@ struct lzxd_stream { unsigned char *window; /* decoding window */ unsigned int window_size; /* window size */ + unsigned int ref_data_size; /* LZX DELTA reference data size */ + unsigned int num_offsets; /* number of match_offset entries in table */ unsigned int window_posn; /* decompression offset within window */ unsigned int frame_posn; /* current frame offset within in window */ unsigned int frame; /* the number of 32kb frames processed */ @@ -66,8 +72,8 @@ struct lzxd_stream { unsigned char intel_started; /* has intel E8 decoding started? */ unsigned char block_type; /* type of the current block */ unsigned char header_read; /* have we started decoding at all yet? */ - unsigned char posn_slots; /* how many posn slots in stream? */ unsigned char input_end; /* have we reached the end of input? */ + unsigned char is_delta; /* does stream follow LZX DELTA spec? 
*/ int error; @@ -83,85 +89,133 @@ struct lzxd_stream { /* huffman decoding tables */ unsigned short PRETREE_table [(1 << LZX_PRETREE_TABLEBITS) + - (LZX_PRETREE_MAXSYMBOLS * 2)]; + (LZX_PRETREE_MAXSYMBOLS * 2)]; unsigned short MAINTREE_table[(1 << LZX_MAINTREE_TABLEBITS) + - (LZX_MAINTREE_MAXSYMBOLS * 2)]; + (LZX_MAINTREE_MAXSYMBOLS * 2)]; unsigned short LENGTH_table [(1 << LZX_LENGTH_TABLEBITS) + - (LZX_LENGTH_MAXSYMBOLS * 2)]; + (LZX_LENGTH_MAXSYMBOLS * 2)]; unsigned short ALIGNED_table [(1 << LZX_ALIGNED_TABLEBITS) + - (LZX_ALIGNED_MAXSYMBOLS * 2)]; + (LZX_ALIGNED_MAXSYMBOLS * 2)]; + unsigned char LENGTH_empty; /* this is used purely for doing the intel E8 transform */ unsigned char e8_buf[LZX_FRAME_SIZE]; }; -/* allocates LZX decompression state for decoding the given stream. +/** + * Allocates and initialises LZX decompression state for decoding an LZX + * stream. * - * - returns NULL if window_bits is outwith the range 15 to 21 (inclusive). + * This routine uses system->alloc() to allocate memory. If memory + * allocation fails, or the parameters to this function are invalid, + * NULL is returned. * - * - uses system->alloc() to allocate memory - * - * - returns NULL if not enough memory - * - * - window_bits is the size of the LZX window, from 32Kb (15) to 2Mb (21). - * - * - reset_interval is how often the bitstream is reset, measured in - * multiples of 32Kb bytes output. For CAB LZX streams, this is always 0 - * (does not occur). - * - * - input_buffer_size is how many bytes to use as an input bitstream buffer - * - * - output_length is the length in bytes of the entirely decompressed - * output stream, if known in advance. It is used to correctly perform - * the Intel E8 transformation, which must stop 6 bytes before the very - * end of the decompressed stream. It is not otherwise used or adhered - * to. If the full decompressed length is known in advance, set it here. 
- * If it is NOT known, use the value 0, and call lzxd_set_output_length() - * once it is known. If never set, 4 of the final 6 bytes of the output - * stream may be incorrect. + * @param system an mspack_system structure used to read from + * the input stream and write to the output + * stream, also to allocate and free memory. + * @param input an input stream with the LZX data. + * @param output an output stream to write the decoded data to. + * @param window_bits the size of the decoding window, which must be + * between 15 and 21 inclusive for regular LZX + * data, or between 17 and 25 inclusive for + * LZX DELTA data. + * @param reset_interval the interval at which the LZX bitstream is + * reset, in multiples of LZX frames (32678 + * bytes), e.g. a value of 2 indicates the input + * stream resets after every 65536 output bytes. + * A value of 0 indicates that the bitstream never + * resets, such as in CAB LZX streams. + * @param input_buffer_size the number of bytes to use as an input + * bitstream buffer. + * @param output_length the length in bytes of the entirely + * decompressed output stream, if known in + * advance. It is used to correctly perform the + * Intel E8 transformation, which must stop 6 + * bytes before the very end of the + * decompressed stream. It is not otherwise used + * or adhered to. If the full decompressed + * length is known in advance, set it here. + * If it is NOT known, use the value 0, and call + * lzxd_set_output_length() once it is + * known. If never set, 4 of the final 6 bytes + * of the output stream may be incorrect. + * @param is_delta should be zero for all regular LZX data, + * non-zero for LZX DELTA encoded data. + * @return a pointer to an initialised lzxd_stream structure, or NULL if + * there was not enough memory or parameters to the function were wrong. 
*/ extern struct lzxd_stream *lzxd_init(struct mspack_system *system, - struct mspack_file *input, - struct mspack_file *output, - int window_bits, - int reset_interval, - int input_buffer_size, - off_t output_length); + struct mspack_file *input, + struct mspack_file *output, + int window_bits, + int reset_interval, + int input_buffer_size, + off_t output_length, + char is_delta); /* see description of output_length in lzxd_init() */ extern void lzxd_set_output_length(struct lzxd_stream *lzx, - off_t output_length); + off_t output_length); -/* decompresses, or decompresses more of, an LZX stream. +/** + * Reads LZX DELTA reference data into the window and allows + * lzxd_decompress() to reference it. * - * - out_bytes of data will be decompressed and the function will return - * with an MSPACK_ERR_OK return code. + * Call this before the first call to lzxd_decompress(). + + * @param lzx the LZX stream to apply this reference data to + * @param system an mspack_system implementation to use with the + * input param. Only read() will be called. + * @param input an input file handle to read reference data using + * system->read(). + * @param length the length of the reference data. Cannot be longer + * than the LZX window size. + * @return an error code, or MSPACK_ERR_OK if successful + */ +extern int lzxd_set_reference_data(struct lzxd_stream *lzx, + struct mspack_system *system, + struct mspack_file *input, + unsigned int length); + +/** + * Decompresses entire or partial LZX streams. * - * - decompressing will stop as soon as out_bytes is reached. if the true - * amount of bytes decoded spills over that amount, they will be kept for - * a later invocation of lzxd_decompress(). + * The number of bytes of data that should be decompressed is given as the + * out_bytes parameter. If more bytes are decoded than are needed, they + * will be kept over for a later invocation. 
* - * - the output bytes will be passed to the system->write() function given in - * lzxd_init(), using the output file handle given in lzxd_init(). More - * than one call may be made to system->write(). + * The output bytes will be passed to the system->write() function given in + * lzxd_init(), using the output file handle given in lzxd_init(). More than + * one call may be made to system->write(). + + * Input bytes will be read in as necessary using the system->read() + * function given in lzxd_init(), using the input file handle given in + * lzxd_init(). This will continue until system->read() returns 0 bytes, + * or an error. Errors will be passed out of the function as + * MSPACK_ERR_READ errors. Input streams should convey an "end of input + * stream" by refusing to supply all the bytes that LZX asks for when they + * reach the end of the stream, rather than return an error code. * - * - LZX will read input bytes as necessary using the system->read() function - * given in lzxd_init(), using the input file handle given in lzxd_init(). - * This will continue until system->read() returns 0 bytes, or an error. - * input streams should convey an "end of input stream" by refusing to - * supply all the bytes that LZX asks for when they reach the end of the - * stream, rather than return an error code. + * If any error code other than MSPACK_ERR_OK is returned, the stream + * should be considered unusable and lzxd_decompress() should not be + * called again on this stream. * - * - if an error code other than MSPACK_ERR_OK is returned, the stream should - * be considered unusable and lzxd_decompress() should not be called again - * on this stream. + * @param lzx LZX decompression state, as allocated by lzxd_init(). + * @param out_bytes the number of bytes of data to decompress. 
+ * @return an error code, or MSPACK_ERR_OK if successful */ extern int lzxd_decompress(struct lzxd_stream *lzx, off_t out_bytes); -/* frees all state associated with an LZX data stream +/** + * Frees all state associated with an LZX data stream. This will call + * system->free() using the system pointer given in lzxd_init(). * - * - calls system->free() using the system pointer given in lzxd_init() + * @param lzx LZX decompression state to free. */ void lzxd_free(struct lzxd_stream *lzx); +#ifdef __cplusplus +} +#endif + #endif diff --git a/third_party/mspack/lzxd.c b/third_party/mspack/lzxd.c index 2fdf23e80..6cc33df08 100644 --- a/third_party/mspack/lzxd.c +++ b/third_party/mspack/lzxd.c @@ -1,5 +1,5 @@ /* This file is part of libmspack. - * (C) 2003-2004 Stuart Caie. + * (C) 2003-2013 Stuart Caie. * * The LZX method was created by Jonathan Forbes and Tomi Poutanen, adapted * by Microsoft Corporation. @@ -12,11 +12,11 @@ /* LZX decompression implementation */ -#include "mspack.h" -#include "lzx.h" +#include +#include -/* Microsoft's LZX document and their implementation of the - * com.ms.util.cab Java package do not concur. +/* Microsoft's LZX document (in cab-sdk.exe) and their implementation + * of the com.ms.util.cab Java package do not concur. * * In the LZX document, there is a table showing the correlation between * window size and the number of position slots. It states that the 1MB @@ -58,240 +58,85 @@ * least one element. However, many CAB files contain blocks where the * length tree is completely empty (because there are no matches), and * this is expected to succeed. + * + * The errors in LZX documentation appear have been corrected in the + * new documentation for the LZX DELTA format. + * + * http://msdn.microsoft.com/en-us/library/cc483133.aspx + * + * However, this is a different format, an extension of regular LZX. + * I have noticed the following differences, there may be more: + * + * The maximum window size has increased from 2MB to 32MB. 
This also + * increases the maximum number of position slots, etc. + * + * If the match length is 257 (the maximum possible), this signals + * a further length decoding step, that allows for matches up to + * 33024 bytes long. + * + * The format now allows for "reference data", supplied by the caller. + * If match offsets go further back than the number of bytes + * decompressed so far, that is them accessing the reference data. */ - -/* LZX decompressor input macros - * - * STORE_BITS stores bitstream state in lzxd_stream structure - * RESTORE_BITS restores bitstream state from lzxd_stream structure - * READ_BITS(var,n) takes N bits from the buffer and puts them in var - * ENSURE_BITS(n) ensures there are at least N bits in the bit buffer. - * PEEK_BITS(n) extracts without removing N bits from the bit buffer - * REMOVE_BITS(n) removes N bits from the bit buffer - * - * These bit access routines work by using the area beyond the MSB and the - * LSB as a free source of zeroes when shifting. This avoids having to - * mask any bits. So we have to know the bit width of the bit buffer - * variable. - * - * The bit buffer datatype should be at least 32 bits wide: it must be - * possible to ENSURE_BITS(16), so it must be possible to add 16 new bits - * to the bit buffer when the bit buffer already has 1 to 15 bits left. 
- */ - -#include -#ifndef CHAR_BIT -# define CHAR_BIT (8) -#endif -#define BITBUF_WIDTH (sizeof(bit_buffer) * CHAR_BIT) - -#ifdef LZXDEBUG -# include -# define D(x) do { printf("%s:%d (%s) ",__FILE__, __LINE__, __FUNCTION__); \ - printf x ; fputc('\n', stdout); fflush(stdout);} while (0); -#else -# define D(x) -#endif - -#define STORE_BITS do { \ - lzx->i_ptr = i_ptr; \ - lzx->i_end = i_end; \ - lzx->bit_buffer = bit_buffer; \ - lzx->bits_left = bits_left; \ +/* import bit-reading macros and code */ +#define BITS_TYPE struct lzxd_stream +#define BITS_VAR lzx +#define BITS_ORDER_MSB +#define READ_BYTES do { \ + unsigned char b0, b1; \ + READ_IF_NEEDED; b0 = *i_ptr++; \ + READ_IF_NEEDED; b1 = *i_ptr++; \ + INJECT_BITS((b1 << 8) | b0, 16); \ } while (0) +#include -#define RESTORE_BITS do { \ - i_ptr = lzx->i_ptr; \ - i_end = lzx->i_end; \ - bit_buffer = lzx->bit_buffer; \ - bits_left = lzx->bits_left; \ -} while (0) - -#define ENSURE_BITS(nbits) \ - while (bits_left < (nbits)) { \ - if (i_ptr >= i_end) { \ - if (lzxd_read_input(lzx)) return lzx->error; \ - i_ptr = lzx->i_ptr; \ - i_end = lzx->i_end; \ - } \ - bit_buffer |= ((i_ptr[1] << 8) | i_ptr[0]) \ - << (BITBUF_WIDTH - 16 - bits_left); \ - bits_left += 16; \ - i_ptr += 2; \ - } - -#define PEEK_BITS(nbits) (bit_buffer >> (BITBUF_WIDTH - (nbits))) - -#define REMOVE_BITS(nbits) ((bit_buffer <<= (nbits)), (bits_left -= (nbits))) - -#define READ_BITS(val, nbits) do { \ - ENSURE_BITS(nbits); \ - (val) = PEEK_BITS(nbits); \ - REMOVE_BITS(nbits); \ -} while (0) - -static int lzxd_read_input(struct lzxd_stream *lzx) { - int read = lzx->sys->read(lzx->input, &lzx->inbuf[0], (int)lzx->inbuf_size); - if (read < 0) return lzx->error = MSPACK_ERR_READ; - - /* huff decode's ENSURE_BYTES(16) might overrun the input stream, even - * if those bits aren't used, so fake 2 more bytes */ - if (read == 0) { - if (lzx->input_end) { - D(("out of input bytes")) - return lzx->error = MSPACK_ERR_READ; - } - else { - read = 2; - 
lzx->inbuf[0] = lzx->inbuf[1] = 0; - lzx->input_end = 1; - } - } - - lzx->i_ptr = &lzx->inbuf[0]; - lzx->i_end = &lzx->inbuf[read]; - - return MSPACK_ERR_OK; -} - -/* Huffman decoding macros */ - -/* READ_HUFFSYM(tablename, var) decodes one huffman symbol from the - * bitstream using the stated table and puts it in var. - */ -#define READ_HUFFSYM(tbl, var) do { \ - /* huffman symbols can be up to 16 bits long */ \ - ENSURE_BITS(16); \ - /* immediate table lookup of [tablebits] bits of the code */ \ - sym = lzx->tbl##_table[PEEK_BITS(LZX_##tbl##_TABLEBITS)]; \ - /* is the symbol is longer than [tablebits] bits? (i=node index) */ \ - if (sym >= LZX_##tbl##_MAXSYMBOLS) { \ - /* decode remaining bits by tree traversal */ \ - i = 1 << (BITBUF_WIDTH - LZX_##tbl##_TABLEBITS); \ - do { \ - /* one less bit. error if we run out of bits before decode */ \ - i >>= 1; \ - if (i == 0) { \ - D(("out of bits in huffman decode")) \ - return lzx->error = MSPACK_ERR_DECRUNCH; \ - } \ - /* double node index and add 0 (left branch) or 1 (right) */ \ - sym <<= 1; sym |= (bit_buffer & i) ? 
1 : 0; \ - /* hop to next node index / decoded symbol */ \ - sym = lzx->tbl##_table[sym]; \ - /* while we are still in node indicies, not decoded symbols */ \ - } while (sym >= LZX_##tbl##_MAXSYMBOLS); \ - } \ - /* result */ \ - (var) = sym; \ - /* look up the code length of that symbol and discard those bits */ \ - i = lzx->tbl##_len[sym]; \ - REMOVE_BITS(i); \ -} while (0) +/* import huffman-reading macros and code */ +#define TABLEBITS(tbl) LZX_##tbl##_TABLEBITS +#define MAXSYMBOLS(tbl) LZX_##tbl##_MAXSYMBOLS +#define HUFF_TABLE(tbl,idx) lzx->tbl##_table[idx] +#define HUFF_LEN(tbl,idx) lzx->tbl##_len[idx] +#define HUFF_ERROR return lzx->error = MSPACK_ERR_DECRUNCH +#include /* BUILD_TABLE(tbl) builds a huffman lookup table from code lengths */ #define BUILD_TABLE(tbl) \ - if (make_decode_table(LZX_##tbl##_MAXSYMBOLS, LZX_##tbl##_TABLEBITS, \ - &lzx->tbl##_len[0], &lzx->tbl##_table[0])) \ - { \ - D(("failed to build %s table", #tbl)) \ - return lzx->error = MSPACK_ERR_DECRUNCH; \ - } - -/* make_decode_table(nsyms, nbits, length[], table[]) - * - * This function was coded by David Tritscher. It builds a fast huffman - * decoding table from a canonical huffman code lengths table. - * - * nsyms = total number of symbols in this huffman tree. - * nbits = any symbols with a code length of nbits or less can be decoded - * in one lookup of the table. - * length = A table to get code lengths from [0 to syms-1] - * table = The table to fill up with decoded symbols and pointers. 
- * - * Returns 0 for OK or 1 for error - */ - -static int make_decode_table(unsigned int nsyms, unsigned int nbits, - unsigned char *length, unsigned short *table) -{ - unsigned short sym; - unsigned int leaf, fill; - unsigned char bit_num; - unsigned int pos = 0; /* the current position in the decode table */ - unsigned int table_mask = 1 << nbits; - unsigned int bit_mask = table_mask >> 1; /* don't do 0 length codes */ - unsigned int next_symbol = bit_mask; /* base of allocation for long codes */ - - /* fill entries for codes short enough for a direct mapping */ - for (bit_num = 1; bit_num <= nbits; bit_num++) { - for (sym = 0; sym < nsyms; sym++) { - if (length[sym] != bit_num) continue; - leaf = pos; - if((pos += bit_mask) > table_mask) return 1; /* table overrun */ - /* fill all possible lookups of this symbol with the symbol itself */ - for (fill = bit_mask; fill-- > 0;) table[leaf++] = sym; + if (make_decode_table(MAXSYMBOLS(tbl), TABLEBITS(tbl), \ + &HUFF_LEN(tbl,0), &HUFF_TABLE(tbl,0))) \ + { \ + D(("failed to build %s table", #tbl)) \ + return lzx->error = MSPACK_ERR_DECRUNCH; \ } - bit_mask >>= 1; - } - - /* full table already? 
*/ - if (pos == table_mask) return 0; - - /* clear the remainder of the table */ - for (sym = pos; sym < table_mask; sym++) table[sym] = 0xFFFF; - - /* allow codes to be up to nbits+16 long, instead of nbits */ - pos <<= 16; - table_mask <<= 16; - bit_mask = 1 << 15; - - for (bit_num = nbits+1; bit_num <= 16; bit_num++) { - for (sym = 0; sym < nsyms; sym++) { - if (length[sym] != bit_num) continue; - - leaf = pos >> 16; - for (fill = 0; fill < bit_num - nbits; fill++) { - /* if this path hasn't been taken yet, 'allocate' two entries */ - if (table[leaf] == 0xFFFF) { - table[(next_symbol << 1)] = 0xFFFF; - table[(next_symbol << 1) + 1] = 0xFFFF; - table[leaf] = next_symbol++; - } - /* follow the path and select either left or right for next bit */ - leaf = table[leaf] << 1; - if ((pos >> (15-fill)) & 1) leaf++; - } - table[leaf] = sym; - - if ((pos += bit_mask) > table_mask) return 1; /* table overflow */ - } - bit_mask >>= 1; - } - - /* full table? */ - if (pos == table_mask) return 0; - - /* either erroneous table, or all elements are 0 - let's find out. */ - for (sym = 0; sym < nsyms; sym++) if (length[sym]) return 1; - return 0; -} +#define BUILD_TABLE_MAYBE_EMPTY(tbl) do { \ + lzx->tbl##_empty = 0; \ + if (make_decode_table(MAXSYMBOLS(tbl), TABLEBITS(tbl), \ + &HUFF_LEN(tbl,0), &HUFF_TABLE(tbl,0))) \ + { \ + for (i = 0; i < MAXSYMBOLS(tbl); i++) { \ + if (HUFF_LEN(tbl, i) > 0) { \ + D(("failed to build %s table", #tbl)) \ + return lzx->error = MSPACK_ERR_DECRUNCH; \ + } \ + } \ + /* empty tree - allow it, but don't decode symbols with it */ \ + lzx->tbl##_empty = 1; \ + } \ +} while (0) /* READ_LENGTHS(tablename, first, last) reads in code lengths for symbols * first to last in the given table. The code lengths are stored in their * own special LZX way. 
*/ -#define READ_LENGTHS(tbl, first, last) do { \ - STORE_BITS; \ - if (lzxd_read_lens(lzx, &lzx->tbl##_len[0], (first), \ - (unsigned int)(last))) return lzx->error; \ - RESTORE_BITS; \ +#define READ_LENGTHS(tbl, first, last) do { \ + STORE_BITS; \ + if (lzxd_read_lens(lzx, &HUFF_LEN(tbl, 0), (first), \ + (unsigned int)(last))) return lzx->error; \ + RESTORE_BITS; \ } while (0) static int lzxd_read_lens(struct lzxd_stream *lzx, unsigned char *lens, - unsigned int first, unsigned int last) + unsigned int first, unsigned int last) { /* bit buffer and huffman symbol decode variables */ unsigned int bit_buffer; @@ -348,27 +193,71 @@ static int lzxd_read_lens(struct lzxd_stream *lzx, unsigned char *lens, * a small 'position slot' number and a small offset from that slot are * encoded instead of one large offset. * + * The number of slots is decided by how many are needed to encode the + * largest offset for a given window size. This is easy when the gap between + * slots is less than 128Kb, it's a linear relationship. But when extra_bits + * reaches its limit of 17 (because LZX can only ensure reading 17 bits of + * data at a time), we can only jump 128Kb at a time and have to start + * using more and more position slots as each window size doubles. + * * position_base[] is an index to the position slot bases * * extra_bits[] states how many bits of offset-from-base data is needed. + * + * They are calculated as follows: + * extra_bits[i] = 0 where i < 4 + * extra_bits[i] = floor(i/2)-1 where i >= 4 && i < 36 + * extra_bits[i] = 17 where i >= 36 + * position_base[0] = 0 + * position_base[i] = position_base[i-1] + (1 << extra_bits[i-1]) */ -static unsigned int position_base[51]; -static unsigned char extra_bits[51]; - -static void lzxd_static_init() { - int i, j; - - for (i = 0, j = 0; i < 51; i += 2) { - extra_bits[i] = j; /* 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7... */ - extra_bits[i+1] = j; - if ((i != 0) && (j < 17)) j++; /* 0,0,1,2,3,4...15,16,17,17,17,17... 
*/ - } - - for (i = 0, j = 0; i < 51; i++) { - position_base[i] = j; /* 0,1,2,3,4,6,8,12,16,24,32,... */ - j += 1 << extra_bits[i]; /* 1,1,1,1,2,2,4,4,8,8,16,16,32,32,... */ - } -} +static const unsigned int position_slots[11] = { + 30, 32, 34, 36, 38, 42, 50, 66, 98, 162, 290 +}; +static const unsigned char extra_bits[36] = { + 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, + 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16 +}; +static const unsigned int position_base[290] = { + 0, 1, 2, 3, 4, 6, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 384, 512, + 768, 1024, 1536, 2048, 3072, 4096, 6144, 8192, 12288, 16384, 24576, 32768, + 49152, 65536, 98304, 131072, 196608, 262144, 393216, 524288, 655360, + 786432, 917504, 1048576, 1179648, 1310720, 1441792, 1572864, 1703936, + 1835008, 1966080, 2097152, 2228224, 2359296, 2490368, 2621440, 2752512, + 2883584, 3014656, 3145728, 3276800, 3407872, 3538944, 3670016, 3801088, + 3932160, 4063232, 4194304, 4325376, 4456448, 4587520, 4718592, 4849664, + 4980736, 5111808, 5242880, 5373952, 5505024, 5636096, 5767168, 5898240, + 6029312, 6160384, 6291456, 6422528, 6553600, 6684672, 6815744, 6946816, + 7077888, 7208960, 7340032, 7471104, 7602176, 7733248, 7864320, 7995392, + 8126464, 8257536, 8388608, 8519680, 8650752, 8781824, 8912896, 9043968, + 9175040, 9306112, 9437184, 9568256, 9699328, 9830400, 9961472, 10092544, + 10223616, 10354688, 10485760, 10616832, 10747904, 10878976, 11010048, + 11141120, 11272192, 11403264, 11534336, 11665408, 11796480, 11927552, + 12058624, 12189696, 12320768, 12451840, 12582912, 12713984, 12845056, + 12976128, 13107200, 13238272, 13369344, 13500416, 13631488, 13762560, + 13893632, 14024704, 14155776, 14286848, 14417920, 14548992, 14680064, + 14811136, 14942208, 15073280, 15204352, 15335424, 15466496, 15597568, + 15728640, 15859712, 15990784, 16121856, 16252928, 16384000, 16515072, + 16646144, 16777216, 16908288, 17039360, 17170432, 17301504, 17432576, + 17563648, 17694720, 
17825792, 17956864, 18087936, 18219008, 18350080, + 18481152, 18612224, 18743296, 18874368, 19005440, 19136512, 19267584, + 19398656, 19529728, 19660800, 19791872, 19922944, 20054016, 20185088, + 20316160, 20447232, 20578304, 20709376, 20840448, 20971520, 21102592, + 21233664, 21364736, 21495808, 21626880, 21757952, 21889024, 22020096, + 22151168, 22282240, 22413312, 22544384, 22675456, 22806528, 22937600, + 23068672, 23199744, 23330816, 23461888, 23592960, 23724032, 23855104, + 23986176, 24117248, 24248320, 24379392, 24510464, 24641536, 24772608, + 24903680, 25034752, 25165824, 25296896, 25427968, 25559040, 25690112, + 25821184, 25952256, 26083328, 26214400, 26345472, 26476544, 26607616, + 26738688, 26869760, 27000832, 27131904, 27262976, 27394048, 27525120, + 27656192, 27787264, 27918336, 28049408, 28180480, 28311552, 28442624, + 28573696, 28704768, 28835840, 28966912, 29097984, 29229056, 29360128, + 29491200, 29622272, 29753344, 29884416, 30015488, 30146560, 30277632, + 30408704, 30539776, 30670848, 30801920, 30932992, 31064064, 31195136, + 31326208, 31457280, 31588352, 31719424, 31850496, 31981568, 32112640, + 32243712, 32374784, 32505856, 32636928, 32768000, 32899072, 33030144, + 33161216, 33292288, 33423360 +}; static void lzxd_reset_state(struct lzxd_stream *lzx) { int i; @@ -388,35 +277,46 @@ static void lzxd_reset_state(struct lzxd_stream *lzx) { /*-------- main LZX code --------*/ struct lzxd_stream *lzxd_init(struct mspack_system *system, - struct mspack_file *input, - struct mspack_file *output, - int window_bits, - int reset_interval, - int input_buffer_size, - off_t output_length) + struct mspack_file *input, + struct mspack_file *output, + int window_bits, + int reset_interval, + int input_buffer_size, + off_t output_length, + char is_delta) { unsigned int window_size = 1 << window_bits; struct lzxd_stream *lzx; if (!system) return NULL; - /* LZX supports window sizes of 2^15 (32Kb) through 2^21 (2Mb) */ - if (window_bits < 15 || window_bits > 21) 
return NULL; + /* LZX DELTA window sizes are between 2^17 (128KiB) and 2^25 (32MiB), + * regular LZX windows are between 2^15 (32KiB) and 2^21 (2MiB) + */ + if (is_delta) { + if (window_bits < 17 || window_bits > 25) return NULL; + } + else { + if (window_bits < 15 || window_bits > 21) return NULL; + } + if (reset_interval < 0 || output_length < 0) { + D(("reset interval or output length < 0")) + return NULL; + } + + /* round up input buffer size to multiple of two */ input_buffer_size = (input_buffer_size + 1) & -2; - if (!input_buffer_size) return NULL; - - /* initialise static data */ - lzxd_static_init(); + if (input_buffer_size < 2) return NULL; /* allocate decompression state */ - if (!(lzx = (struct lzxd_stream *)system->alloc(system, sizeof(struct lzxd_stream)))) { + if (!(lzx = (struct lzxd_stream *) system->alloc(system, sizeof(struct lzxd_stream)))) { return NULL; } /* allocate decompression window and input buffer */ - lzx->window = (unsigned char *)system->alloc(system, (size_t) window_size); - lzx->inbuf = (unsigned char *)system->alloc(system, (size_t) input_buffer_size); + lzx->window = (unsigned char *) system->alloc(system, (size_t) window_size); + lzx->inbuf = (unsigned char *) system->alloc(system, (size_t) input_buffer_size); if (!lzx->window || !lzx->inbuf) { system->free(lzx->window); system->free(lzx->inbuf); @@ -433,43 +333,73 @@ struct lzxd_stream *lzxd_init(struct mspack_system *system, lzx->inbuf_size = input_buffer_size; lzx->window_size = 1 << window_bits; + lzx->ref_data_size = 0; lzx->window_posn = 0; lzx->frame_posn = 0; lzx->frame = 0; lzx->reset_interval = reset_interval; lzx->intel_filesize = 0; lzx->intel_curpos = 0; - - /* window bits: 15 16 17 18 19 20 21 - * position slots: 30 32 34 36 38 42 50 */ - lzx->posn_slots = ((window_bits == 21) ? 50 : - ((window_bits == 20) ? 
42 : (window_bits << 1))); lzx->intel_started = 0; - lzx->input_end = 0; + lzx->error = MSPACK_ERR_OK; + lzx->num_offsets = position_slots[window_bits - 15] << 3; + lzx->is_delta = is_delta; - lzx->error = MSPACK_ERR_OK; - - lzx->i_ptr = lzx->i_end = &lzx->inbuf[0]; lzx->o_ptr = lzx->o_end = &lzx->e8_buf[0]; - lzx->bit_buffer = lzx->bits_left = 0; - lzxd_reset_state(lzx); + INIT_BITS; return lzx; } +int lzxd_set_reference_data(struct lzxd_stream *lzx, + struct mspack_system *system, + struct mspack_file *input, + unsigned int length) +{ + if (!lzx) return MSPACK_ERR_ARGS; + + if (!lzx->is_delta) { + D(("only LZX DELTA streams support reference data")) + return MSPACK_ERR_ARGS; + } + if (lzx->offset) { + D(("too late to set reference data after decoding starts")) + return MSPACK_ERR_ARGS; + } + if (length > lzx->window_size) { + D(("reference length (%u) is longer than the window", length)) + return MSPACK_ERR_ARGS; + } + if (length > 0 && (!system || !input)) { + D(("length > 0 but no system or input")) + return MSPACK_ERR_ARGS; + } + + lzx->ref_data_size = length; + if (length > 0) { + /* copy reference data */ + unsigned char *pos = &lzx->window[lzx->window_size - length]; + int bytes = system->read(input, pos, length); + /* length can't be more than 2^25, so no signedness problem */ + if (bytes < (int)length) return MSPACK_ERR_READ; + } + lzx->ref_data_size = length; + return MSPACK_ERR_OK; +} + void lzxd_set_output_length(struct lzxd_stream *lzx, off_t out_bytes) { - if (lzx) lzx->length = out_bytes; + if (lzx && out_bytes > 0) lzx->length = out_bytes; } int lzxd_decompress(struct lzxd_stream *lzx, off_t out_bytes) { - /* bitstream reading and huffman variables */ + /* bitstream and huffman reading variables */ unsigned int bit_buffer; int bits_left, i=0; - unsigned short sym; unsigned char *i_ptr, *i_end; + unsigned short sym; int match_length, length_footer, extra, verbatim_bits, bytes_todo; - int this_run, main_element, aligned_bits, j; + int this_run, 
main_element, aligned_bits, j, warned = 0; unsigned char *window, *runsrc, *rundest, buf[12]; unsigned int frame_size=0, end_frame, match_offset, window_posn; unsigned int R0, R1, R2; @@ -505,12 +435,25 @@ int lzxd_decompress(struct lzxd_stream *lzx, off_t out_bytes) { /* have we reached the reset interval? (if there is one?) */ if (lzx->reset_interval && ((lzx->frame % lzx->reset_interval) == 0)) { if (lzx->block_remaining) { - D(("%d bytes remaining at reset interval", lzx->block_remaining)) - return lzx->error = MSPACK_ERR_DECRUNCH; + /* this is a file format error, we can make a best effort to extract what we can */ + D(("%d bytes remaining at reset interval", lzx->block_remaining)) + if (!warned) { + lzx->sys->message(NULL, "WARNING; invalid reset interval detected during LZX decompression"); + warned++; + } } /* re-read the intel header and reset the huffman lengths */ lzxd_reset_state(lzx); + R0 = lzx->R0; + R1 = lzx->R1; + R2 = lzx->R2; + } + + /* LZX DELTA format has chunk_size, not present in LZX format */ + if (lzx->is_delta) { + ENSURE_BITS(16); + REMOVE_BITS(16); } /* read header if necessary */ @@ -527,7 +470,7 @@ int lzxd_decompress(struct lzxd_stream *lzx, off_t out_bytes) { * has been filled in. 
*/ frame_size = LZX_FRAME_SIZE; if (lzx->length && (lzx->length - lzx->offset) < (off_t)frame_size) { - frame_size = (unsigned int)(lzx->length - lzx->offset); + frame_size = lzx->length - lzx->offset; } /* decode until one more frame is available */ @@ -535,70 +478,61 @@ int lzxd_decompress(struct lzxd_stream *lzx, off_t out_bytes) { while (bytes_todo > 0) { /* initialise new block, if one is needed */ if (lzx->block_remaining == 0) { - /* realign if previous block was an odd-sized UNCOMPRESSED block */ - if ((lzx->block_type == LZX_BLOCKTYPE_UNCOMPRESSED) && - (lzx->block_length & 1)) - { - if (i_ptr == i_end) { - if (lzxd_read_input(lzx)) return lzx->error; - i_ptr = lzx->i_ptr; - i_end = lzx->i_end; - } - i_ptr++; - } + /* realign if previous block was an odd-sized UNCOMPRESSED block */ + if ((lzx->block_type == LZX_BLOCKTYPE_UNCOMPRESSED) && + (lzx->block_length & 1)) + { + READ_IF_NEEDED; + i_ptr++; + } - /* read block type (3 bits) and block length (24 bits) */ - READ_BITS(lzx->block_type, 3); - READ_BITS(i, 16); READ_BITS(j, 8); - lzx->block_remaining = lzx->block_length = (i << 8) | j; - /*D(("new block t%d len %u", lzx->block_type, lzx->block_length))*/ + /* read block type (3 bits) and block length (24 bits) */ + READ_BITS(lzx->block_type, 3); + READ_BITS(i, 16); READ_BITS(j, 8); + lzx->block_remaining = lzx->block_length = (i << 8) | j; + /*D(("new block t%d len %u", lzx->block_type, lzx->block_length))*/ - /* read individual block headers */ - switch (lzx->block_type) { - case LZX_BLOCKTYPE_ALIGNED: - /* read lengths of and build aligned huffman decoding tree */ - for (i = 0; i < 8; i++) { READ_BITS(j, 3); lzx->ALIGNED_len[i] = j; } - BUILD_TABLE(ALIGNED); - /* no break -- rest of aligned header is same as verbatim */ - case LZX_BLOCKTYPE_VERBATIM: - /* read lengths of and build main huffman decoding tree */ - READ_LENGTHS(MAINTREE, 0, 256); - READ_LENGTHS(MAINTREE, 256, LZX_NUM_CHARS + (lzx->posn_slots << 3)); - BUILD_TABLE(MAINTREE); - /* if the 
literal 0xE8 is anywhere in the block... */ - if (lzx->MAINTREE_len[0xE8] != 0) lzx->intel_started = 1; - /* read lengths of and build lengths huffman decoding tree */ - READ_LENGTHS(LENGTH, 0, LZX_NUM_SECONDARY_LENGTHS); - BUILD_TABLE(LENGTH); - break; + /* read individual block headers */ + switch (lzx->block_type) { + case LZX_BLOCKTYPE_ALIGNED: + /* read lengths of and build aligned huffman decoding tree */ + for (i = 0; i < 8; i++) { READ_BITS(j, 3); lzx->ALIGNED_len[i] = j; } + BUILD_TABLE(ALIGNED); + /* rest of aligned header is same as verbatim */ /*@fallthrough@*/ + case LZX_BLOCKTYPE_VERBATIM: + /* read lengths of and build main huffman decoding tree */ + READ_LENGTHS(MAINTREE, 0, 256); + READ_LENGTHS(MAINTREE, 256, LZX_NUM_CHARS + lzx->num_offsets); + BUILD_TABLE(MAINTREE); + /* if the literal 0xE8 is anywhere in the block... */ + if (lzx->MAINTREE_len[0xE8] != 0) lzx->intel_started = 1; + /* read lengths of and build lengths huffman decoding tree */ + READ_LENGTHS(LENGTH, 0, LZX_NUM_SECONDARY_LENGTHS); + BUILD_TABLE_MAYBE_EMPTY(LENGTH); + break; - case LZX_BLOCKTYPE_UNCOMPRESSED: - /* because we can't assume otherwise */ - lzx->intel_started = 1; + case LZX_BLOCKTYPE_UNCOMPRESSED: + /* because we can't assume otherwise */ + lzx->intel_started = 1; - /* read 1-16 (not 0-15) bits to align to bytes */ - ENSURE_BITS(16); - if (bits_left > 16) i_ptr -= 2; - bits_left = 0; bit_buffer = 0; + /* read 1-16 (not 0-15) bits to align to bytes */ + if (bits_left == 0) ENSURE_BITS(16); + bits_left = 0; bit_buffer = 0; - /* read 12 bytes of stored R0 / R1 / R2 values */ - for (rundest = &buf[0], i = 0; i < 12; i++) { - if (i_ptr == i_end) { - if (lzxd_read_input(lzx)) return lzx->error; - i_ptr = lzx->i_ptr; - i_end = lzx->i_end; - } - *rundest++ = *i_ptr++; - } - R0 = buf[0] | (buf[1] << 8) | (buf[2] << 16) | (buf[3] << 24); - R1 = buf[4] | (buf[5] << 8) | (buf[6] << 16) | (buf[7] << 24); - R2 = buf[8] | (buf[9] << 8) | (buf[10] << 16) | (buf[11] << 24); - break; + 
/* read 12 bytes of stored R0 / R1 / R2 values */ + for (rundest = &buf[0], i = 0; i < 12; i++) { + READ_IF_NEEDED; + *rundest++ = *i_ptr++; + } + R0 = buf[0] | (buf[1] << 8) | (buf[2] << 16) | (buf[3] << 24); + R1 = buf[4] | (buf[5] << 8) | (buf[6] << 16) | (buf[7] << 24); + R2 = buf[8] | (buf[9] << 8) | (buf[10] << 16) | (buf[11] << 24); + break; - default: - D(("bad block type")) - return lzx->error = MSPACK_ERR_DECRUNCH; - } + default: + D(("bad block type")) + return lzx->error = MSPACK_ERR_DECRUNCH; + } } /* decode more of the block: @@ -613,202 +547,270 @@ int lzxd_decompress(struct lzxd_stream *lzx, off_t out_bytes) { /* decode at least this_run bytes */ switch (lzx->block_type) { case LZX_BLOCKTYPE_VERBATIM: - while (this_run > 0) { - READ_HUFFSYM(MAINTREE, main_element); - if (main_element < LZX_NUM_CHARS) { - /* literal: 0 to LZX_NUM_CHARS-1 */ - window[window_posn++] = main_element; - this_run--; - } - else { - /* match: LZX_NUM_CHARS + ((slot<<3) | length_header (3 bits)) */ - main_element -= LZX_NUM_CHARS; + while (this_run > 0) { + READ_HUFFSYM(MAINTREE, main_element); + if (main_element < LZX_NUM_CHARS) { + /* literal: 0 to LZX_NUM_CHARS-1 */ + window[window_posn++] = main_element; + this_run--; + } + else { + /* match: LZX_NUM_CHARS + ((slot<<3) | length_header (3 bits)) */ + main_element -= LZX_NUM_CHARS; - /* get match length */ - match_length = main_element & LZX_NUM_PRIMARY_LENGTHS; - if (match_length == LZX_NUM_PRIMARY_LENGTHS) { - READ_HUFFSYM(LENGTH, length_footer); - match_length += length_footer; - } - match_length += LZX_MIN_MATCH; - - /* get match offset */ - switch ((match_offset = (main_element >> 3))) { - case 0: match_offset = R0; break; - case 1: match_offset = R1; R1=R0; R0 = match_offset; break; - case 2: match_offset = R2; R2=R0; R0 = match_offset; break; - case 3: match_offset = 1; R2=R1; R1=R0; R0 = match_offset; break; - default: - extra = extra_bits[match_offset]; - READ_BITS(verbatim_bits, extra); - match_offset = 
position_base[match_offset] - 2 + verbatim_bits; - R2 = R1; R1 = R0; R0 = match_offset; - } + /* get match length */ + match_length = main_element & LZX_NUM_PRIMARY_LENGTHS; + if (match_length == LZX_NUM_PRIMARY_LENGTHS) { + if (lzx->LENGTH_empty) { + D(("LENGTH symbol needed but tree is empty")) + return lzx->error = MSPACK_ERR_DECRUNCH; + } + READ_HUFFSYM(LENGTH, length_footer); + match_length += length_footer; + } + match_length += LZX_MIN_MATCH; - if ((window_posn + match_length) > lzx->window_size) { - D(("match ran over window wrap")) - return lzx->error = MSPACK_ERR_DECRUNCH; - } - - /* copy match */ - rundest = &window[window_posn]; - i = match_length; - /* does match offset wrap the window? */ - if (match_offset > window_posn) { - /* j = length from match offset to end of window */ - j = match_offset - window_posn; - if (j > (int) lzx->window_size) { - D(("match offset beyond window boundaries")) - return lzx->error = MSPACK_ERR_DECRUNCH; - } - runsrc = &window[lzx->window_size - j]; - if (j < i) { - /* if match goes over the window edge, do two copy runs */ - i -= j; while (j-- > 0) *rundest++ = *runsrc++; - runsrc = window; - } - while (i-- > 0) *rundest++ = *runsrc++; - } - else { - runsrc = rundest - match_offset; - while (i-- > 0) *rundest++ = *runsrc++; - } + /* get match offset */ + switch ((match_offset = (main_element >> 3))) { + case 0: match_offset = R0; break; + case 1: match_offset = R1; R1=R0; R0 = match_offset; break; + case 2: match_offset = R2; R2=R0; R0 = match_offset; break; + case 3: match_offset = 1; R2=R1; R1=R0; R0 = match_offset; break; + default: + extra = (match_offset >= 36) ? 
17 : extra_bits[match_offset]; + READ_BITS(verbatim_bits, extra); + match_offset = position_base[match_offset] - 2 + verbatim_bits; + R2 = R1; R1 = R0; R0 = match_offset; + } - this_run -= match_length; - window_posn += match_length; - } - } /* while (this_run > 0) */ - break; + /* LZX DELTA uses max match length to signal even longer match */ + if (match_length == LZX_MAX_MATCH && lzx->is_delta) { + int extra_len = 0; + ENSURE_BITS(3); /* 4 entry huffman tree */ + if (PEEK_BITS(1) == 0) { + REMOVE_BITS(1); /* '0' -> 8 extra length bits */ + READ_BITS(extra_len, 8); + } + else if (PEEK_BITS(2) == 2) { + REMOVE_BITS(2); /* '10' -> 10 extra length bits + 0x100 */ + READ_BITS(extra_len, 10); + extra_len += 0x100; + } + else if (PEEK_BITS(3) == 6) { + REMOVE_BITS(3); /* '110' -> 12 extra length bits + 0x500 */ + READ_BITS(extra_len, 12); + extra_len += 0x500; + } + else { + REMOVE_BITS(3); /* '111' -> 15 extra length bits */ + READ_BITS(extra_len, 15); + } + match_length += extra_len; + } + + if ((window_posn + match_length) > lzx->window_size) { + D(("match ran over window wrap")) + return lzx->error = MSPACK_ERR_DECRUNCH; + } + + /* copy match */ + rundest = &window[window_posn]; + i = match_length; + /* does match offset wrap the window? 
*/ + if (match_offset > window_posn) { + if ((off_t)match_offset > lzx->offset && + (match_offset - window_posn) > lzx->ref_data_size) + { + D(("match offset beyond LZX stream")) + return lzx->error = MSPACK_ERR_DECRUNCH; + } + /* j = length from match offset to end of window */ + j = match_offset - window_posn; + if (j > (int) lzx->window_size) { + D(("match offset beyond window boundaries")) + return lzx->error = MSPACK_ERR_DECRUNCH; + } + runsrc = &window[lzx->window_size - j]; + if (j < i) { + /* if match goes over the window edge, do two copy runs */ + i -= j; while (j-- > 0) *rundest++ = *runsrc++; + runsrc = window; + } + while (i-- > 0) *rundest++ = *runsrc++; + } + else { + runsrc = rundest - match_offset; + while (i-- > 0) *rundest++ = *runsrc++; + } + + this_run -= match_length; + window_posn += match_length; + } + } /* while (this_run > 0) */ + break; case LZX_BLOCKTYPE_ALIGNED: - while (this_run > 0) { - READ_HUFFSYM(MAINTREE, main_element); - if (main_element < LZX_NUM_CHARS) { - /* literal: 0 to LZX_NUM_CHARS-1 */ - window[window_posn++] = main_element; - this_run--; - } - else { - /* match: LZX_NUM_CHARS + ((slot<<3) | length_header (3 bits)) */ - main_element -= LZX_NUM_CHARS; + while (this_run > 0) { + READ_HUFFSYM(MAINTREE, main_element); + if (main_element < LZX_NUM_CHARS) { + /* literal: 0 to LZX_NUM_CHARS-1 */ + window[window_posn++] = main_element; + this_run--; + } + else { + /* match: LZX_NUM_CHARS + ((slot<<3) | length_header (3 bits)) */ + main_element -= LZX_NUM_CHARS; - /* get match length */ - match_length = main_element & LZX_NUM_PRIMARY_LENGTHS; - if (match_length == LZX_NUM_PRIMARY_LENGTHS) { - READ_HUFFSYM(LENGTH, length_footer); - match_length += length_footer; - } - match_length += LZX_MIN_MATCH; + /* get match length */ + match_length = main_element & LZX_NUM_PRIMARY_LENGTHS; + if (match_length == LZX_NUM_PRIMARY_LENGTHS) { + if (lzx->LENGTH_empty) { + D(("LENGTH symbol needed but tree is empty")) + return lzx->error = 
MSPACK_ERR_DECRUNCH; + } + READ_HUFFSYM(LENGTH, length_footer); + match_length += length_footer; + } + match_length += LZX_MIN_MATCH; - /* get match offset */ - switch ((match_offset = (main_element >> 3))) { - case 0: match_offset = R0; break; - case 1: match_offset = R1; R1 = R0; R0 = match_offset; break; - case 2: match_offset = R2; R2 = R0; R0 = match_offset; break; - default: - extra = extra_bits[match_offset]; - match_offset = position_base[match_offset] - 2; - if (extra > 3) { - /* verbatim and aligned bits */ - extra -= 3; - READ_BITS(verbatim_bits, extra); - match_offset += (verbatim_bits << 3); - READ_HUFFSYM(ALIGNED, aligned_bits); - match_offset += aligned_bits; - } - else if (extra == 3) { - /* aligned bits only */ - READ_HUFFSYM(ALIGNED, aligned_bits); - match_offset += aligned_bits; - } - else if (extra > 0) { /* extra==1, extra==2 */ - /* verbatim bits only */ - READ_BITS(verbatim_bits, extra); - match_offset += verbatim_bits; - } - else /* extra == 0 */ { - /* ??? not defined in LZX specification! */ - match_offset = 1; - } - /* update repeated offset LRU queue */ - R2 = R1; R1 = R0; R0 = match_offset; - } + /* get match offset */ + switch ((match_offset = (main_element >> 3))) { + case 0: match_offset = R0; break; + case 1: match_offset = R1; R1 = R0; R0 = match_offset; break; + case 2: match_offset = R2; R2 = R0; R0 = match_offset; break; + default: + extra = (match_offset >= 36) ? 
17 : extra_bits[match_offset]; + match_offset = position_base[match_offset] - 2; + if (extra > 3) { + /* verbatim and aligned bits */ + extra -= 3; + READ_BITS(verbatim_bits, extra); + match_offset += (verbatim_bits << 3); + READ_HUFFSYM(ALIGNED, aligned_bits); + match_offset += aligned_bits; + } + else if (extra == 3) { + /* aligned bits only */ + READ_HUFFSYM(ALIGNED, aligned_bits); + match_offset += aligned_bits; + } + else if (extra > 0) { /* extra==1, extra==2 */ + /* verbatim bits only */ + READ_BITS(verbatim_bits, extra); + match_offset += verbatim_bits; + } + else /* extra == 0 */ { + /* ??? not defined in LZX specification! */ + match_offset = 1; + } + /* update repeated offset LRU queue */ + R2 = R1; R1 = R0; R0 = match_offset; + } - if ((window_posn + match_length) > lzx->window_size) { - D(("match ran over window wrap")) - return lzx->error = MSPACK_ERR_DECRUNCH; - } + /* LZX DELTA uses max match length to signal even longer match */ + if (match_length == LZX_MAX_MATCH && lzx->is_delta) { + int extra_len = 0; + ENSURE_BITS(3); /* 4 entry huffman tree */ + if (PEEK_BITS(1) == 0) { + REMOVE_BITS(1); /* '0' -> 8 extra length bits */ + READ_BITS(extra_len, 8); + } + else if (PEEK_BITS(2) == 2) { + REMOVE_BITS(2); /* '10' -> 10 extra length bits + 0x100 */ + READ_BITS(extra_len, 10); + extra_len += 0x100; + } + else if (PEEK_BITS(3) == 6) { + REMOVE_BITS(3); /* '110' -> 12 extra length bits + 0x500 */ + READ_BITS(extra_len, 12); + extra_len += 0x500; + } + else { + REMOVE_BITS(3); /* '111' -> 15 extra length bits */ + READ_BITS(extra_len, 15); + } + match_length += extra_len; + } - /* copy match */ - rundest = &window[window_posn]; - i = match_length; - /* does match offset wrap the window? 
*/ - if (match_offset > window_posn) { - /* j = length from match offset to end of window */ - j = match_offset - window_posn; - if (j > (int) lzx->window_size) { - D(("match offset beyond window boundaries")) - return lzx->error = MSPACK_ERR_DECRUNCH; - } - runsrc = &window[lzx->window_size - j]; - if (j < i) { - /* if match goes over the window edge, do two copy runs */ - i -= j; while (j-- > 0) *rundest++ = *runsrc++; - runsrc = window; - } - while (i-- > 0) *rundest++ = *runsrc++; - } - else { - runsrc = rundest - match_offset; - while (i-- > 0) *rundest++ = *runsrc++; - } + if ((window_posn + match_length) > lzx->window_size) { + D(("match ran over window wrap")) + return lzx->error = MSPACK_ERR_DECRUNCH; + } - this_run -= match_length; - window_posn += match_length; - } - } /* while (this_run > 0) */ - break; + /* copy match */ + rundest = &window[window_posn]; + i = match_length; + /* does match offset wrap the window? */ + if (match_offset > window_posn) { + if ((off_t)match_offset > lzx->offset && + (match_offset - window_posn) > lzx->ref_data_size) + { + D(("match offset beyond LZX stream")) + return lzx->error = MSPACK_ERR_DECRUNCH; + } + /* j = length from match offset to end of window */ + j = match_offset - window_posn; + if (j > (int) lzx->window_size) { + D(("match offset beyond window boundaries")) + return lzx->error = MSPACK_ERR_DECRUNCH; + } + runsrc = &window[lzx->window_size - j]; + if (j < i) { + /* if match goes over the window edge, do two copy runs */ + i -= j; while (j-- > 0) *rundest++ = *runsrc++; + runsrc = window; + } + while (i-- > 0) *rundest++ = *runsrc++; + } + else { + runsrc = rundest - match_offset; + while (i-- > 0) *rundest++ = *runsrc++; + } + + this_run -= match_length; + window_posn += match_length; + } + } /* while (this_run > 0) */ + break; case LZX_BLOCKTYPE_UNCOMPRESSED: - /* as this_run is limited not to wrap a frame, this also means it - * won't wrap the window (as the window is a multiple of 32k) */ - rundest = 
&window[window_posn]; - window_posn += this_run; - while (this_run > 0) { - if ((i = (int)(i_end - i_ptr))) { - if (i > this_run) i = this_run; - lzx->sys->copy(i_ptr, rundest, (size_t) i); - rundest += i; - i_ptr += i; - this_run -= i; - } - else { - if (lzxd_read_input(lzx)) return lzx->error; - i_ptr = lzx->i_ptr; - i_end = lzx->i_end; - } - } - break; + /* as this_run is limited not to wrap a frame, this also means it + * won't wrap the window (as the window is a multiple of 32k) */ + rundest = &window[window_posn]; + window_posn += this_run; + while (this_run > 0) { + if ((i = (int)(i_end - i_ptr)) == 0) { + READ_IF_NEEDED; + } + else { + if (i > this_run) i = this_run; + lzx->sys->copy(i_ptr, rundest, (size_t) i); + rundest += i; + i_ptr += i; + this_run -= i; + } + } + break; default: - return lzx->error = MSPACK_ERR_DECRUNCH; /* might as well */ + return lzx->error = MSPACK_ERR_DECRUNCH; /* might as well */ } /* did the final match overrun our desired this_run length? */ if (this_run < 0) { - if ((unsigned int)(-this_run) > lzx->block_remaining) { - D(("overrun went past end of block by %d (%d remaining)", - -this_run, lzx->block_remaining )) - return lzx->error = MSPACK_ERR_DECRUNCH; - } - lzx->block_remaining -= -this_run; + if ((unsigned int)(-this_run) > lzx->block_remaining) { + D(("overrun went past end of block by %d (%d remaining)", + -this_run, lzx->block_remaining )) + return lzx->error = MSPACK_ERR_DECRUNCH; + } + lzx->block_remaining -= -this_run; } } /* while (bytes_todo > 0) */ /* streams don't extend over frame boundaries */ if ((window_posn - lzx->frame_posn) != frame_size) { D(("decode beyond output frame limits! 
%d != %d", - window_posn - lzx->frame_posn, frame_size)) + window_posn - lzx->frame_posn, frame_size)) return lzx->error = MSPACK_ERR_DECRUNCH; } @@ -818,13 +820,14 @@ int lzxd_decompress(struct lzxd_stream *lzx, off_t out_bytes) { /* check that we've used all of the previous frame first */ if (lzx->o_ptr != lzx->o_end) { - D(("%d avail bytes, new %d frame", lzx->o_end-lzx->o_ptr, frame_size)) + D(("%ld avail bytes, new %d frame", + (long)(lzx->o_end - lzx->o_ptr), frame_size)) return lzx->error = MSPACK_ERR_DECRUNCH; } /* does this intel block _really_ need decoding? */ if (lzx->intel_started && lzx->intel_filesize && - (lzx->frame <= 32768) && (frame_size > 10)) + (lzx->frame <= 32768) && (frame_size > 10)) { unsigned char *data = &lzx->e8_buf[0]; unsigned char *dataend = &lzx->e8_buf[frame_size - 10]; @@ -837,17 +840,17 @@ int lzxd_decompress(struct lzxd_stream *lzx, off_t out_bytes) { lzx->sys->copy(&lzx->window[lzx->frame_posn], data, frame_size); while (data < dataend) { - if (*data++ != 0xE8) { curpos++; continue; } - abs_off = data[0] | (data[1]<<8) | (data[2]<<16) | (data[3]<<24); - if ((abs_off >= -curpos) && (abs_off < filesize)) { - rel_off = (abs_off >= 0) ? abs_off - curpos : abs_off + filesize; - data[0] = (unsigned char) rel_off; - data[1] = (unsigned char) (rel_off >> 8); - data[2] = (unsigned char) (rel_off >> 16); - data[3] = (unsigned char) (rel_off >> 24); - } - data += 4; - curpos += 5; + if (*data++ != 0xE8) { curpos++; continue; } + abs_off = data[0] | (data[1]<<8) | (data[2]<<16) | (data[3]<<24); + if ((abs_off >= -curpos) && (abs_off < filesize)) { + rel_off = (abs_off >= 0) ? 
abs_off - curpos : abs_off + filesize; + data[0] = (unsigned char) rel_off; + data[1] = (unsigned char) (rel_off >> 8); + data[2] = (unsigned char) (rel_off >> 16); + data[3] = (unsigned char) (rel_off >> 24); + } + data += 4; + curpos += 5; } lzx->intel_curpos += frame_size; } diff --git a/third_party/mspack/mspack.h b/third_party/mspack/mspack.h index 0d2584dee..f9161f983 100644 --- a/third_party/mspack/mspack.h +++ b/third_party/mspack/mspack.h @@ -1,5 +1,5 @@ /* libmspack -- a library for working with Microsoft compression formats. - * (C) 2003-2004 Stuart Caie + * (C) 2003-2016 Stuart Caie * * libmspack is free software; you can redistribute it and/or modify it under * the terms of the GNU Lesser General Public License (LGPL) version 2.1 @@ -21,6 +21,79 @@ * libmspack is a library which provides compressors and decompressors, * archivers and dearchivers for Microsoft compression formats. * + * \section formats Formats supported + * + * The following file formats are supported: + * - SZDD files, which use LZSS compression + * - KWAJ files, which use LZSS, LZSS+Huffman or deflate compression + * - .HLP (MS Help) files, which use LZSS compression + * - .CAB (MS Cabinet) files, which use deflate, LZX or Quantum compression + * - .CHM (HTML Help) files, which use LZX compression + * - .LIT (MS EBook) files, which use LZX compression and DES encryption + * - .LZX (Exchange Offline Addressbook) files, which use LZX compression + * + * To determine the capabilities of the library, and the binary + * compatibility version of any particular compressor or decompressor, use + * the mspack_version() function. The UNIX library interface version is + * defined as the highest-versioned library component. + * + * \section starting Getting started + * + * The macro MSPACK_SYS_SELFTEST() should be used to ensure the library can + * be used. In particular, it checks if the caller is using 32-bit file I/O + * when the library is compiled for 64-bit file I/O and vice versa. 
+ * + * If compiled normally, the library includes basic file I/O and memory + * management functionality using the standard C library. This can be + * customised and replaced entirely by creating a mspack_system structure. + * + * A compressor or decompressor for the required format must be + * instantiated before it can be used. Each construction function takes + * one parameter, which is either a pointer to a custom mspack_system + * structure, or NULL to use the default. The instantiation returned, if + * not NULL, contains function pointers (methods) to work with the given + * file format. + * + * For compression: + * - mspack_create_cab_compressor() creates a mscab_compressor + * - mspack_create_chm_compressor() creates a mschm_compressor + * - mspack_create_lit_compressor() creates a mslit_compressor + * - mspack_create_hlp_compressor() creates a mshlp_compressor + * - mspack_create_szdd_compressor() creates a msszdd_compressor + * - mspack_create_kwaj_compressor() creates a mskwaj_compressor + * - mspack_create_oab_compressor() creates a msoab_compressor + * + * For decompression: + * - mspack_create_cab_decompressor() creates a mscab_decompressor + * - mspack_create_chm_decompressor() creates a mschm_decompressor + * - mspack_create_lit_decompressor() creates a mslit_decompressor + * - mspack_create_hlp_decompressor() creates a mshlp_decompressor + * - mspack_create_szdd_decompressor() creates a msszdd_decompressor + * - mspack_create_kwaj_decompressor() creates a mskwaj_decompressor + * - mspack_create_oab_decompressor() creates a msoab_decompressor + * + * Once finished working with a format, each kind of + * compressor/decompressor has its own specific destructor: + * - mspack_destroy_cab_compressor() + * - mspack_destroy_cab_decompressor() + * - mspack_destroy_chm_compressor() + * - mspack_destroy_chm_decompressor() + * - mspack_destroy_lit_compressor() + * - mspack_destroy_lit_decompressor() + * - mspack_destroy_hlp_compressor() + * - 
mspack_destroy_hlp_decompressor() + * - mspack_destroy_szdd_compressor() + * - mspack_destroy_szdd_decompressor() + * - mspack_destroy_kwaj_compressor() + * - mspack_destroy_kwaj_decompressor() + * - mspack_destroy_oab_compressor() + * - mspack_destroy_oab_decompressor() + * + * Destroying a compressor or decompressor does not destroy any objects, + * structures or handles that have been created using that compressor or + * decompressor. Ensure that everything created or opened is destroyed or + * closed before compressor/decompressor is itself destroyed. + * * \section errors Error codes * * All compressors and decompressors use the same set of error codes. Most @@ -45,6 +118,41 @@ * - #MSPACK_ERR_CHECKSUM indicates that a data checksum has failed. * - #MSPACK_ERR_CRUNCH indicates an error occured during compression. * - #MSPACK_ERR_DECRUNCH indicates an error occured during decompression. + * + * \section threading Multi-threading + * + * libmspack methods are reentrant and multithreading-safe when each + * thread has its own compressor or decompressor. + + * You should not call multiple methods simultaneously on a single + * compressor or decompressor instance. + * + * If this may happen, you can either use one compressor or + * decompressor per thread, or you can use your preferred lock, + * semaphore or mutex library to ensure no more than one method on a + * compressor/decompressor is called simultaneously. libmspack will + * not do this locking for you. 
+ * + * Example of incorrect behaviour: + * - thread 1 calls mspack_create_cab_decompressor() + * - thread 1 calls open() + * - thread 1 calls extract() for one file + * - thread 2 simultaneously calls extract() for another file + * + * Correct behaviour: + * - thread 1 calls mspack_create_cab_decompressor() + * - thread 2 calls mspack_create_cab_decompressor() + * - thread 1 calls its own open() / extract() + * - thread 2 simultaneously calls its own open() / extract() + * + * Also correct behaviour: + * - thread 1 calls mspack_create_cab_decompressor() + * - thread 1 locks a mutex for with the decompressor before + * calling any methods on it, and unlocks the mutex after each + * method returns. + * - thread 1 can share the results of open() with thread 2, and both + * can call extract(), provided they both guard against simultaneous + * use of extract(), and any other methods, with the mutex */ #ifndef LIB_MSPACK_H @@ -57,6 +165,102 @@ extern "C" { #include #include +/** + * System self-test function, to ensure both library and calling program + * can use one another. + * + * A result of MSPACK_ERR_OK means the library and caller are + * compatible. Any other result indicates that the library and caller are + * not compatible and should not be used. In particular, a value of + * MSPACK_ERR_SEEK means the library and caller use different off_t + * datatypes. + * + * It should be used like so: + * + * @code + * int selftest_result; + * MSPACK_SYS_SELFTEST(selftest_result); + * if (selftest_result != MSPACK_ERR_OK) { + * fprintf(stderr, "incompatible with this build of libmspack\n"); + * exit(0); + * } + * @endcode + * + * @param result an int variable to store the result of the self-test + */ +#define MSPACK_SYS_SELFTEST(result) do { \ + (result) = mspack_sys_selftest_internal(sizeof(off_t)); \ +} while (0) + +/** Part of the MSPACK_SYS_SELFTEST() macro, must not be used directly. 
*/ +extern int mspack_sys_selftest_internal(int); + +/** + * Enquire about the binary compatibility version of a specific interface in + * the library. Currently, the following interfaces are defined: + * + * - #MSPACK_VER_LIBRARY: the overall library + * - #MSPACK_VER_SYSTEM: the mspack_system interface + * - #MSPACK_VER_MSCABD: the mscab_decompressor interface + * - #MSPACK_VER_MSCABC: the mscab_compressor interface + * - #MSPACK_VER_MSCHMD: the mschm_decompressor interface + * - #MSPACK_VER_MSCHMC: the mschm_compressor interface + * - #MSPACK_VER_MSLITD: the mslit_decompressor interface + * - #MSPACK_VER_MSLITC: the mslit_compressor interface + * - #MSPACK_VER_MSHLPD: the mshlp_decompressor interface + * - #MSPACK_VER_MSHLPC: the mshlp_compressor interface + * - #MSPACK_VER_MSSZDDD: the msszdd_decompressor interface + * - #MSPACK_VER_MSSZDDC: the msszdd_compressor interface + * - #MSPACK_VER_MSKWAJD: the mskwaj_decompressor interface + * - #MSPACK_VER_MSKWAJC: the mskwaj_compressor interface + * - #MSPACK_VER_MSOABD: the msoab_decompressor interface + * - #MSPACK_VER_MSOABC: the msoab_compressor interface + * + * The result of the function should be interpreted as follows: + * - -1: this interface is completely unknown to the library + * - 0: this interface is known, but non-functioning + * - 1: this interface has all basic functionality + * - 2, 3, ...: this interface has additional functionality, clearly marked + * in the documentation as "version 2", "version 3" and so on. 
+ * + * @param entity the interface to request current version of + * @return the version of the requested interface + */ +extern int mspack_version(int entity); + +/** Pass to mspack_version() to get the overall library version */ +#define MSPACK_VER_LIBRARY (0) +/** Pass to mspack_version() to get the mspack_system version */ +#define MSPACK_VER_SYSTEM (1) +/** Pass to mspack_version() to get the mscab_decompressor version */ +#define MSPACK_VER_MSCABD (2) +/** Pass to mspack_version() to get the mscab_compressor version */ +#define MSPACK_VER_MSCABC (3) +/** Pass to mspack_version() to get the mschm_decompressor version */ +#define MSPACK_VER_MSCHMD (4) +/** Pass to mspack_version() to get the mschm_compressor version */ +#define MSPACK_VER_MSCHMC (5) +/** Pass to mspack_version() to get the mslit_decompressor version */ +#define MSPACK_VER_MSLITD (6) +/** Pass to mspack_version() to get the mslit_compressor version */ +#define MSPACK_VER_MSLITC (7) +/** Pass to mspack_version() to get the mshlp_decompressor version */ +#define MSPACK_VER_MSHLPD (8) +/** Pass to mspack_version() to get the mshlp_compressor version */ +#define MSPACK_VER_MSHLPC (9) +/** Pass to mspack_version() to get the msszdd_decompressor version */ +#define MSPACK_VER_MSSZDDD (10) +/** Pass to mspack_version() to get the msszdd_compressor version */ +#define MSPACK_VER_MSSZDDC (11) +/** Pass to mspack_version() to get the mskwaj_decompressor version */ +#define MSPACK_VER_MSKWAJD (12) +/** Pass to mspack_version() to get the mskwaj_compressor version */ +#define MSPACK_VER_MSKWAJC (13) +/** Pass to mspack_version() to get the msoab_decompressor version */ +#define MSPACK_VER_MSOABD (14) +/** Pass to mspack_version() to get the msoab_compressor version */ +#define MSPACK_VER_MSOABC (15) + /* --- file I/O abstraction ------------------------------------------------ */ /** @@ -82,7 +286,7 @@ struct mspack_system { /** * Opens a file for reading, writing, appending or updating. 
* - * @param this a self-referential pointer to the mspack_system + * @param self a self-referential pointer to the mspack_system * structure whose open() method is being called. If * this pointer is required by close(), read(), write(), * seek() or tell(), it should be stored in the result @@ -99,12 +303,13 @@ struct mspack_system { * @return a pointer to a mspack_file structure. This structure officially * contains no members, its true contents are up to the * mspack_system implementor. It should contain whatever is needed - * for other mspack_system methods to operate. + * for other mspack_system methods to operate. Returning the NULL + * pointer indicates an error condition. * @see close(), read(), write(), seek(), tell(), message() */ - struct mspack_file * (*open)(struct mspack_system *sys, - char *filename, - int mode); + struct mspack_file * (*open)(struct mspack_system *self, + const char *filename, + int mode); /** * Closes a previously opened file. If any memory was allocated for this @@ -123,12 +328,14 @@ struct mspack_system { * @param bytes the number of bytes to read from the file. * @return the number of bytes successfully read (this can be less than * the number requested), zero to mark the end of file, or less - * than zero to indicate an error. + * than zero to indicate an error. The library does not "retry" + * reads and assumes short reads are due to EOF, so you should + * avoid returning short reads because of transient errors. * @see open(), write() */ int (*read)(struct mspack_file *file, - void *buffer, - int bytes); + void *buffer, + int bytes); /** * Writes a given number of bytes to an open file. @@ -144,8 +351,8 @@ struct mspack_system { * @see open(), read() */ int (*write)(struct mspack_file *file, - void *buffer, - int bytes); + void *buffer, + int bytes); /** * Seeks to a specific file offset within an open file. 
@@ -171,8 +378,8 @@ struct mspack_system { * @see open(), tell() */ int (*seek)(struct mspack_file *file, - off_t offset, - int mode); + off_t offset, + int mode); /** * Returns the current file position (in bytes) of the given file. @@ -198,26 +405,26 @@ struct mspack_system { * @see open() */ void (*message)(struct mspack_file *file, - char *format, - ...); + const char *format, + ...); /** * Allocates memory. * - * @param sys a self-referential pointer to the mspack_system + * @param self a self-referential pointer to the mspack_system * structure whose alloc() method is being called. * @param bytes the number of bytes to allocate * @result a pointer to the requested number of bytes, or NULL if * not enough memory is available * @see free() */ - void * (*alloc)(struct mspack_system *sys, - size_t bytes); + void * (*alloc)(struct mspack_system *self, + size_t bytes); /** * Frees memory. * - * @param ptr the memory to be freed. + * @param ptr the memory to be freed. NULL is accepted and ignored. * @see alloc() */ void (*free)(void *ptr); @@ -235,8 +442,8 @@ struct mspack_system { * @param bytes the size of the memory region, in bytes */ void (*copy)(void *src, - void *dest, - size_t bytes); + void *dest, + size_t bytes); /** * A null pointer to mark the end of mspack_system. It must equal NULL. @@ -299,8 +506,1857 @@ struct mspack_file { /** Error code: error during decompression */ #define MSPACK_ERR_DECRUNCH (11) -#ifdef __cplusplus +/* --- functions available in library -------------------------------------- */ + +/** Creates a new CAB compressor. + * @param sys a custom mspack_system structure, or NULL to use the default + * @return a #mscab_compressor or NULL + */ +extern struct mscab_compressor * + mspack_create_cab_compressor(struct mspack_system *sys); + +/** Creates a new CAB decompressor. 
+ * @param sys a custom mspack_system structure, or NULL to use the default + * @return a #mscab_decompressor or NULL + */ +extern struct mscab_decompressor * + mspack_create_cab_decompressor(struct mspack_system *sys); + +/** Destroys an existing CAB compressor. + * @param self the #mscab_compressor to destroy + */ +extern void mspack_destroy_cab_compressor(struct mscab_compressor *self); + +/** Destroys an existing CAB decompressor. + * @param self the #mscab_decompressor to destroy + */ +extern void mspack_destroy_cab_decompressor(struct mscab_decompressor *self); + + +/** Creates a new CHM compressor. + * @param sys a custom mspack_system structure, or NULL to use the default + * @return a #mschm_compressor or NULL + */ +extern struct mschm_compressor * + mspack_create_chm_compressor(struct mspack_system *sys); + +/** Creates a new CHM decompressor. + * @param sys a custom mspack_system structure, or NULL to use the default + * @return a #mschm_decompressor or NULL + */ +extern struct mschm_decompressor * + mspack_create_chm_decompressor(struct mspack_system *sys); + +/** Destroys an existing CHM compressor. + * @param self the #mschm_compressor to destroy + */ +extern void mspack_destroy_chm_compressor(struct mschm_compressor *self); + +/** Destroys an existing CHM decompressor. + * @param self the #mschm_decompressor to destroy + */ +extern void mspack_destroy_chm_decompressor(struct mschm_decompressor *self); + + +/** Creates a new LIT compressor. + * @param sys a custom mspack_system structure, or NULL to use the default + * @return a #mslit_compressor or NULL + */ +extern struct mslit_compressor * + mspack_create_lit_compressor(struct mspack_system *sys); + +/** Creates a new LIT decompressor. + * @param sys a custom mspack_system structure, or NULL to use the default + * @return a #mslit_decompressor or NULL + */ +extern struct mslit_decompressor * + mspack_create_lit_decompressor(struct mspack_system *sys); + +/** Destroys an existing LIT compressor. 
+ * @param self the #mslit_compressor to destroy + */ +extern void mspack_destroy_lit_compressor(struct mslit_compressor *self); + +/** Destroys an existing LIT decompressor. + * @param self the #mslit_decompressor to destroy + */ +extern void mspack_destroy_lit_decompressor(struct mslit_decompressor *self); + + +/** Creates a new HLP compressor. + * @param sys a custom mspack_system structure, or NULL to use the default + * @return a #mshlp_compressor or NULL + */ +extern struct mshlp_compressor * + mspack_create_hlp_compressor(struct mspack_system *sys); + +/** Creates a new HLP decompressor. + * @param sys a custom mspack_system structure, or NULL to use the default + * @return a #mshlp_decompressor or NULL + */ +extern struct mshlp_decompressor * + mspack_create_hlp_decompressor(struct mspack_system *sys); + +/** Destroys an existing hlp compressor. + * @param self the #mshlp_compressor to destroy + */ +extern void mspack_destroy_hlp_compressor(struct mshlp_compressor *self); + +/** Destroys an existing hlp decompressor. + * @param self the #mshlp_decompressor to destroy + */ +extern void mspack_destroy_hlp_decompressor(struct mshlp_decompressor *self); + + +/** Creates a new SZDD compressor. + * @param sys a custom mspack_system structure, or NULL to use the default + * @return a #msszdd_compressor or NULL + */ +extern struct msszdd_compressor * + mspack_create_szdd_compressor(struct mspack_system *sys); + +/** Creates a new SZDD decompressor. + * @param sys a custom mspack_system structure, or NULL to use the default + * @return a #msszdd_decompressor or NULL + */ +extern struct msszdd_decompressor * + mspack_create_szdd_decompressor(struct mspack_system *sys); + +/** Destroys an existing SZDD compressor. + * @param self the #msszdd_compressor to destroy + */ +extern void mspack_destroy_szdd_compressor(struct msszdd_compressor *self); + +/** Destroys an existing SZDD decompressor. 
+ * @param self the #msszdd_decompressor to destroy + */ +extern void mspack_destroy_szdd_decompressor(struct msszdd_decompressor *self); + + +/** Creates a new KWAJ compressor. + * @param sys a custom mspack_system structure, or NULL to use the default + * @return a #mskwaj_compressor or NULL + */ +extern struct mskwaj_compressor * + mspack_create_kwaj_compressor(struct mspack_system *sys); + +/** Creates a new KWAJ decompressor. + * @param sys a custom mspack_system structure, or NULL to use the default + * @return a #mskwaj_decompressor or NULL + */ +extern struct mskwaj_decompressor * + mspack_create_kwaj_decompressor(struct mspack_system *sys); + +/** Destroys an existing KWAJ compressor. + * @param self the #mskwaj_compressor to destroy + */ +extern void mspack_destroy_kwaj_compressor(struct mskwaj_compressor *self); + +/** Destroys an existing KWAJ decompressor. + * @param self the #mskwaj_decompressor to destroy + */ +extern void mspack_destroy_kwaj_decompressor(struct mskwaj_decompressor *self); + + +/** Creates a new OAB compressor. + * @param sys a custom mspack_system structure, or NULL to use the default + * @return a #msoab_compressor or NULL + */ +extern struct msoab_compressor * + mspack_create_oab_compressor(struct mspack_system *sys); + +/** Creates a new OAB decompressor. + * @param sys a custom mspack_system structure, or NULL to use the default + * @return a #msoab_decompressor or NULL + */ +extern struct msoab_decompressor * + mspack_create_oab_decompressor(struct mspack_system *sys); + +/** Destroys an existing OAB compressor. + * @param self the #msoab_compressor to destroy + */ +extern void mspack_destroy_oab_compressor(struct msoab_compressor *self); + +/** Destroys an existing OAB decompressor. 
+ * @param self the #msoab_decompressor to destroy + */ +extern void mspack_destroy_oab_decompressor(struct msoab_decompressor *self); + + +/* --- support for .CAB (MS Cabinet) file format --------------------------- */ + +/** + * A structure which represents a single cabinet file. + * + * All fields are READ ONLY. + * + * If this cabinet is part of a merged cabinet set, the #files and #folders + * fields are common to all cabinets in the set, and will be identical. + * + * @see mscab_decompressor::open(), mscab_decompressor::close(), + * mscab_decompressor::search() + */ +struct mscabd_cabinet { + /** + * The next cabinet in a chained list, if this cabinet was opened with + * mscab_decompressor::search(). May be NULL to mark the end of the + * list. + */ + struct mscabd_cabinet *next; + + /** + * The filename of the cabinet. More correctly, the filename of the + * physical file that the cabinet resides in. This is given by the + * library user and may be in any format. + */ + const char *filename; + + /** The file offset of cabinet within the physical file it resides in. */ + off_t base_offset; + + /** The length of the cabinet file in bytes. */ + unsigned int length; + + /** The previous cabinet in a cabinet set, or NULL. */ + struct mscabd_cabinet *prevcab; + + /** The next cabinet in a cabinet set, or NULL. */ + struct mscabd_cabinet *nextcab; + + /** The filename of the previous cabinet in a cabinet set, or NULL. */ + char *prevname; + + /** The filename of the next cabinet in a cabinet set, or NULL. */ + char *nextname; + + /** The name of the disk containing the previous cabinet in a cabinet + * set, or NULL. + */ + char *previnfo; + + /** The name of the disk containing the next cabinet in a cabinet set, + * or NULL. + */ + char *nextinfo; + + /** A list of all files in the cabinet or cabinet set. */ + struct mscabd_file *files; + + /** A list of all folders in the cabinet or cabinet set. 
*/ + struct mscabd_folder *folders; + + /** + * The set ID of the cabinet. All cabinets in the same set should have + * the same set ID. + */ + unsigned short set_id; + + /** + * The index number of the cabinet within the set. Numbering should + * start from 0 for the first cabinet in the set, and increment by 1 for + * each following cabinet. + */ + unsigned short set_index; + + /** + * The number of bytes reserved in the header area of the cabinet. + * + * If this is non-zero and flags has MSCAB_HDR_RESV set, this data can + * be read by the calling application. It is of the given length, + * located at offset (base_offset + MSCAB_HDR_RESV_OFFSET) in the + * cabinet file. + * + * @see flags + */ + unsigned short header_resv; + + /** + * Header flags. + * + * - MSCAB_HDR_PREVCAB indicates the cabinet is part of a cabinet set, and + * has a predecessor cabinet. + * - MSCAB_HDR_NEXTCAB indicates the cabinet is part of a cabinet set, and + * has a successor cabinet. + * - MSCAB_HDR_RESV indicates the cabinet has reserved header space. + * + * @see prevname, previnfo, nextname, nextinfo, header_resv + */ + int flags; }; + +/** Offset from start of cabinet to the reserved header data (if present). */ +#define MSCAB_HDR_RESV_OFFSET (0x28) + +/** Cabinet header flag: cabinet has a predecessor */ +#define MSCAB_HDR_PREVCAB (0x01) +/** Cabinet header flag: cabinet has a successor */ +#define MSCAB_HDR_NEXTCAB (0x02) +/** Cabinet header flag: cabinet has reserved header space */ +#define MSCAB_HDR_RESV (0x04) + +/** + * A structure which represents a single folder in a cabinet or cabinet set. + * + * All fields are READ ONLY. + * + * A folder is a single compressed stream of data. When uncompressed, it + * holds the data of one or more files. A folder may be split across more + * than one cabinet. + */ +struct mscabd_folder { + /** + * A pointer to the next folder in this cabinet or cabinet set, or NULL + * if this is the final folder. 
+ */ + struct mscabd_folder *next; + + /** + * The compression format used by this folder. + * + * The macro MSCABD_COMP_METHOD() should be used on this field to get + * the algorithm used. The macro MSCABD_COMP_LEVEL() should be used to get + * the "compression level". + * + * @see MSCABD_COMP_METHOD(), MSCABD_COMP_LEVEL() + */ + int comp_type; + + /** + * The total number of data blocks used by this folder. This includes + * data blocks present in other files, if this folder spans more than + * one cabinet. + */ + unsigned int num_blocks; +}; + +/** + * Returns the compression method used by a folder. + * + * @param comp_type a mscabd_folder::comp_type value + * @return one of #MSCAB_COMP_NONE, #MSCAB_COMP_MSZIP, #MSCAB_COMP_QUANTUM + * or #MSCAB_COMP_LZX + */ +#define MSCABD_COMP_METHOD(comp_type) ((comp_type) & 0x0F) +/** + * Returns the compression level used by a folder. + * + * @param comp_type a mscabd_folder::comp_type value + * @return the compression level. This is only defined by LZX and Quantum + * compression + */ +#define MSCABD_COMP_LEVEL(comp_type) (((comp_type) >> 8) & 0x1F) + +/** Compression mode: no compression. */ +#define MSCAB_COMP_NONE (0) +/** Compression mode: MSZIP (deflate) compression. */ +#define MSCAB_COMP_MSZIP (1) +/** Compression mode: Quantum compression */ +#define MSCAB_COMP_QUANTUM (2) +/** Compression mode: LZX compression */ +#define MSCAB_COMP_LZX (3) + +/** + * A structure which represents a single file in a cabinet or cabinet set. + * + * All fields are READ ONLY. + */ +struct mscabd_file { + /** + * The next file in the cabinet or cabinet set, or NULL if this is the + * final file. + */ + struct mscabd_file *next; + + /** + * The filename of the file. + * + * A null terminated string of up to 255 bytes in length, it may be in + * either ISO-8859-1 or UTF8 format, depending on the file attributes. + * + * @see attribs + */ + char *filename; + + /** The uncompressed length of the file, in bytes. 
*/ + unsigned int length; + + /** + * File attributes. + * + * The following attributes are defined: + * - #MSCAB_ATTRIB_RDONLY indicates the file is write protected. + * - #MSCAB_ATTRIB_HIDDEN indicates the file is hidden. + * - #MSCAB_ATTRIB_SYSTEM indicates the file is a operating system file. + * - #MSCAB_ATTRIB_ARCH indicates the file is "archived". + * - #MSCAB_ATTRIB_EXEC indicates the file is an executable program. + * - #MSCAB_ATTRIB_UTF_NAME indicates the filename is in UTF8 format rather + * than ISO-8859-1. + */ + int attribs; + + /** File's last modified time, hour field. */ + char time_h; + /** File's last modified time, minute field. */ + char time_m; + /** File's last modified time, second field. */ + char time_s; + + /** File's last modified date, day field. */ + char date_d; + /** File's last modified date, month field. */ + char date_m; + /** File's last modified date, year field. */ + int date_y; + + /** A pointer to the folder that contains this file. */ + struct mscabd_folder *folder; + + /** The uncompressed offset of this file in its folder. */ + unsigned int offset; +}; + +/** mscabd_file::attribs attribute: file is read-only. */ +#define MSCAB_ATTRIB_RDONLY (0x01) +/** mscabd_file::attribs attribute: file is hidden. */ +#define MSCAB_ATTRIB_HIDDEN (0x02) +/** mscabd_file::attribs attribute: file is an operating system file. */ +#define MSCAB_ATTRIB_SYSTEM (0x04) +/** mscabd_file::attribs attribute: file is "archived". */ +#define MSCAB_ATTRIB_ARCH (0x20) +/** mscabd_file::attribs attribute: file is an executable program. */ +#define MSCAB_ATTRIB_EXEC (0x40) +/** mscabd_file::attribs attribute: filename is UTF8, not ISO-8859-1. */ +#define MSCAB_ATTRIB_UTF_NAME (0x80) + +/** mscab_decompressor::set_param() parameter: search buffer size. */ +#define MSCABD_PARAM_SEARCHBUF (0) +/** mscab_decompressor::set_param() parameter: repair MS-ZIP streams? 
*/ +#define MSCABD_PARAM_FIXMSZIP (1) +/** mscab_decompressor::set_param() parameter: size of decompression buffer */ +#define MSCABD_PARAM_DECOMPBUF (2) +/** mscab_decompressor::set_param() parameter: salvage data from bad cabinets? + * If enabled, open() will skip file with bad folder indices or filenames + * rather than reject the whole cabinet, and extract() will limit rather than + * reject files with invalid offsets and lengths, and bad data block checksums + * will be ignored. Available only in CAB decoder version 2 and above. + */ +#define MSCABD_PARAM_SALVAGE (3) + +/** TODO */ +struct mscab_compressor { + int dummy; +}; + +/** + * A decompressor for .CAB (Microsoft Cabinet) files + * + * All fields are READ ONLY. + * + * @see mspack_create_cab_decompressor(), mspack_destroy_cab_decompressor() + */ +struct mscab_decompressor { + /** + * Opens a cabinet file and reads its contents. + * + * If the file opened is a valid cabinet file, all headers will be read + * and a mscabd_cabinet structure will be returned, with a full list of + * folders and files. + * + * In the case of an error occuring, NULL is returned and the error code + * is available from last_error(). + * + * The filename pointer should be considered "in use" until close() is + * called on the cabinet. + * + * @param self a self-referential pointer to the mscab_decompressor + * instance being called + * @param filename the filename of the cabinet file. This is passed + * directly to mspack_system::open(). + * @return a pointer to a mscabd_cabinet structure, or NULL on failure + * @see close(), search(), last_error() + */ + struct mscabd_cabinet * (*open) (struct mscab_decompressor *self, + const char *filename); + + /** + * Closes a previously opened cabinet or cabinet set. + * + * This closes a cabinet, all cabinets associated with it via the + * mscabd_cabinet::next, mscabd_cabinet::prevcab and + * mscabd_cabinet::nextcab pointers, and all folders and files. 
All + * memory used by these entities is freed. + * + * The cabinet pointer is now invalid and cannot be used again. All + * mscabd_folder and mscabd_file pointers from that cabinet or cabinet + * set are also now invalid, and cannot be used again. + * + * If the cabinet pointer given was created using search(), it MUST be + * the cabinet pointer returned by search() and not one of the later + * cabinet pointers further along the mscabd_cabinet::next chain. + + * If extra cabinets have been added using append() or prepend(), these + * will all be freed, even if the cabinet pointer given is not the first + * cabinet in the set. Do NOT close() more than one cabinet in the set. + * + * The mscabd_cabinet::filename is not freed by the library, as it is + * not allocated by the library. The caller should free this itself if + * necessary, before it is lost forever. + * + * @param self a self-referential pointer to the mscab_decompressor + * instance being called + * @param cab the cabinet to close + * @see open(), search(), append(), prepend() + */ + void (*close)(struct mscab_decompressor *self, + struct mscabd_cabinet *cab); + + /** + * Searches a regular file for embedded cabinets. + * + * This opens a normal file with the given filename and will search the + * entire file for embedded cabinet files + * + * If any cabinets are found, the equivalent of open() is called on each + * potential cabinet file at the offset it was found. All successfully + * open()ed cabinets are kept in a list. + * + * The first cabinet found will be returned directly as the result of + * this method. Any further cabinets found will be chained in a list + * using the mscabd_cabinet::next field. + * + * In the case of an error occuring anywhere other than the simulated + * open(), NULL is returned and the error code is available from + * last_error(). + * + * If no error occurs, but no cabinets can be found in the file, NULL is + * returned and last_error() returns MSPACK_ERR_OK. 
+ * + * The filename pointer should be considered in use until close() is + * called on the cabinet. + * + * close() should only be called on the result of search(), not on any + * subsequent cabinets in the mscabd_cabinet::next chain. + * + * @param self a self-referential pointer to the mscab_decompressor + * instance being called + * @param filename the filename of the file to search for cabinets. This + * is passed directly to mspack_system::open(). + * @return a pointer to a mscabd_cabinet structure, or NULL + * @see close(), open(), last_error() + */ + struct mscabd_cabinet * (*search) (struct mscab_decompressor *self, + const char *filename); + + /** + * Appends one mscabd_cabinet to another, forming or extending a cabinet + * set. + * + * This will attempt to append one cabinet to another such that + * (cab->nextcab == nextcab) && (nextcab->prevcab == cab) and + * any folders split between the two cabinets are merged. + * + * The cabinets MUST be part of a cabinet set -- a cabinet set is a + * cabinet that spans more than one physical cabinet file on disk -- and + * must be appropriately matched. + * + * It can be determined if a cabinet has further parts to load by + * examining the mscabd_cabinet::flags field: + * + * - if (flags & MSCAB_HDR_PREVCAB) is non-zero, there is a + * predecessor cabinet to open() and prepend(). Its MS-DOS + * case-insensitive filename is mscabd_cabinet::prevname + * - if (flags & MSCAB_HDR_NEXTCAB) is non-zero, there is a + * successor cabinet to open() and append(). Its MS-DOS case-insensitive + * filename is mscabd_cabinet::nextname + * + * If the cabinets do not match, an error code will be returned. Neither + * cabinet has been altered, and both should be closed seperately. + * + * Files and folders in a cabinet set are a single entity. All cabinets + * in a set use the same file list, which is updated as cabinets in the + * set are added. 
All pointers to mscabd_folder and mscabd_file + * structures in either cabinet must be discarded and re-obtained after + * merging. + * + * @param self a self-referential pointer to the mscab_decompressor + * instance being called + * @param cab the cabinet which will be appended to, + * predecessor of nextcab + * @param nextcab the cabinet which will be appended, + * successor of cab + * @return an error code, or MSPACK_ERR_OK if successful + * @see prepend(), open(), close() + */ + int (*append) (struct mscab_decompressor *self, + struct mscabd_cabinet *cab, + struct mscabd_cabinet *nextcab); + + /** + * Prepends one mscabd_cabinet to another, forming or extending a + * cabinet set. + * + * This will attempt to prepend one cabinet to another, such that + * (cab->prevcab == prevcab) && (prevcab->nextcab == cab). In + * all other respects, it is identical to append(). See append() for the + * full documentation. + * + * @param self a self-referential pointer to the mscab_decompressor + * instance being called + * @param cab the cabinet which will be prepended to, + * successor of prevcab + * @param prevcab the cabinet which will be prepended, + * predecessor of cab + * @return an error code, or MSPACK_ERR_OK if successful + * @see append(), open(), close() + */ + int (*prepend) (struct mscab_decompressor *self, + struct mscabd_cabinet *cab, + struct mscabd_cabinet *prevcab); + + /** + * Extracts a file from a cabinet or cabinet set. + * + * This extracts a compressed file in a cabinet and writes it to the given + * filename. + * + * The MS-DOS filename of the file, mscabd_file::filename, is NOT USED + * by extract(). 
The caller must examine this MS-DOS filename, copy and + * change it as necessary, create directories as necessary, and provide + * the correct filename as a parameter, which will be passed unchanged + * to the decompressor's mspack_system::open() + * + * If the file belongs to a split folder in a multi-part cabinet set, + * and not enough parts of the cabinet set have been loaded and appended + * or prepended, an error will be returned immediately. + * + * @param self a self-referential pointer to the mscab_decompressor + * instance being called + * @param file the file to be decompressed + * @param filename the filename of the file being written to + * @return an error code, or MSPACK_ERR_OK if successful + */ + int (*extract)(struct mscab_decompressor *self, + struct mscabd_file *file, + const char *filename); + + /** + * Sets a CAB decompression engine parameter. + * + * The following parameters are defined: + * - #MSCABD_PARAM_SEARCHBUF: How many bytes should be allocated as a + * buffer when using search()? The minimum value is 4. The default + * value is 32768. + * - #MSCABD_PARAM_FIXMSZIP: If non-zero, extract() will ignore bad + * checksums and recover from decompression errors in MS-ZIP + * compressed folders. The default value is 0 (don't recover). + * - #MSCABD_PARAM_DECOMPBUF: How many bytes should be used as an input + * bit buffer by decompressors? The minimum value is 4. The default + * value is 4096. + * + * @param self a self-referential pointer to the mscab_decompressor + * instance being called + * @param param the parameter to set + * @param value the value to set the parameter to + * @return MSPACK_ERR_OK if all is OK, or MSPACK_ERR_ARGS if there + * is a problem with either parameter or value. + * @see search(), extract() + */ + int (*set_param)(struct mscab_decompressor *self, + int param, + int value); + + /** + * Returns the error code set by the most recently called method. 
+ * + * This is useful for open() and search(), which do not return an error + * code directly. + * + * @param self a self-referential pointer to the mscab_decompressor + * instance being called + * @return the most recent error code + * @see open(), search() + */ + int (*last_error)(struct mscab_decompressor *self); +}; + +/* --- support for .CHM (HTMLHelp) file format ----------------------------- */ + +/** + * A structure which represents a file to be placed in a CHM helpfile. + * + * A contiguous array of these structures should be passed to + * mschm_compressor::generate(). The array list is terminated with an + * entry whose mschmc_file::section field is set to #MSCHMC_ENDLIST, the + * other fields in this entry are ignored. + */ +struct mschmc_file { + /** One of #MSCHMC_ENDLIST, #MSCHMC_UNCOMP or #MSCHMC_MSCOMP. */ + int section; + + /** The filename of the source file that will be added to the CHM. This + * is passed directly to mspack_system::open(). */ + const char *filename; + + /** The full path and filename of the file within the CHM helpfile, a + * UTF-1 encoded null-terminated string. */ + char *chm_filename; + + /** The length of the file, in bytes. This will be adhered to strictly + * and a read error will be issued if this many bytes cannot be read + * from the real file at CHM generation time. */ + off_t length; +}; + +/** + * A structure which represents a section of a CHM helpfile. + * + * All fields are READ ONLY. + * + * Not used directly, but used as a generic base type for + * mschmd_sec_uncompressed and mschmd_sec_mscompressed. + */ +struct mschmd_section { + /** A pointer to the CHM helpfile that contains this section. */ + struct mschmd_header *chm; + + /** + * The section ID. Either 0 for the uncompressed section + * mschmd_sec_uncompressed, or 1 for the LZX compressed section + * mschmd_sec_mscompressed. No other section IDs are known. 
+ */ + unsigned int id; +}; + +/** + * A structure which represents the uncompressed section of a CHM helpfile. + * + * All fields are READ ONLY. + */ +struct mschmd_sec_uncompressed { + /** Generic section data. */ + struct mschmd_section base; + + /** The file offset of where this section begins in the CHM helpfile. */ + off_t offset; +}; + +/** + * A structure which represents the LZX compressed section of a CHM helpfile. + * + * All fields are READ ONLY. + */ +struct mschmd_sec_mscompressed { + /** Generic section data. */ + struct mschmd_section base; + + /** A pointer to the meta-file which represents all LZX compressed data. */ + struct mschmd_file *content; + + /** A pointer to the file which contains the LZX control data. */ + struct mschmd_file *control; + + /** A pointer to the file which contains the LZX reset table. */ + struct mschmd_file *rtable; + + /** A pointer to the file which contains the LZX span information. + * Available only in CHM decoder version 2 and above. + */ + struct mschmd_file *spaninfo; +}; + +/** + * A structure which represents a CHM helpfile. + * + * All fields are READ ONLY. + */ +struct mschmd_header { + /** The version of the CHM file format used in this file. */ + unsigned int version; + + /** + * The "timestamp" of the CHM helpfile. + * + * It is the lower 32 bits of a 64-bit value representing the number of + * centiseconds since 1601-01-01 00:00:00 UTC, plus 42. It is not useful + * as a timestamp, but it is useful as a semi-unique ID. + */ + unsigned int timestamp; + + /** + * The default Language and Country ID (LCID) of the user who ran the + * HTMLHelp Compiler. This is not the language of the CHM file itself. + */ + unsigned int language; + + /** + * The filename of the CHM helpfile. This is given by the library user + * and may be in any format. + */ + const char *filename; + + /** The length of the CHM helpfile, in bytes. */ + off_t length; + + /** A list of all non-system files in the CHM helpfile. 
*/ + struct mschmd_file *files; + + /** + * A list of all system files in the CHM helpfile. + * + * System files are files which begin with "::". They are meta-files + * generated by the CHM creation process. + */ + struct mschmd_file *sysfiles; + + /** The section 0 (uncompressed) data in this CHM helpfile. */ + struct mschmd_sec_uncompressed sec0; + + /** The section 1 (MSCompressed) data in this CHM helpfile. */ + struct mschmd_sec_mscompressed sec1; + + /** The file offset of the first PMGL/PMGI directory chunk. */ + off_t dir_offset; + + /** The number of PMGL/PMGI directory chunks in this CHM helpfile. */ + unsigned int num_chunks; + + /** The size of each PMGL/PMGI chunk, in bytes. */ + unsigned int chunk_size; + + /** The "density" of the quick-reference section in PMGL/PMGI chunks. */ + unsigned int density; + + /** The depth of the index tree. + * + * - if 1, there are no PMGI chunks, only PMGL chunks. + * - if 2, there is 1 PMGI chunk. All chunk indices point to PMGL chunks. + * - if 3, the root PMGI chunk points to secondary PMGI chunks, which in + * turn point to PMGL chunks. + * - and so on... + */ + unsigned int depth; + + /** + * The number of the root PMGI chunk. + * + * If there is no index in the CHM helpfile, this will be 0xFFFFFFFF. + */ + unsigned int index_root; + + /** + * The number of the first PMGL chunk. Usually zero. + * Available only in CHM decoder version 2 and above. + */ + unsigned int first_pmgl; + + /** + * The number of the last PMGL chunk. Usually num_chunks-1. + * Available only in CHM decoder version 2 and above. + */ + unsigned int last_pmgl; + + /** + * A cache of loaded chunks, filled in by mschm_decoder::fast_find(). + * Available only in CHM decoder version 2 and above. + */ + unsigned char **chunk_cache; +}; + +/** + * A structure which represents a file stored in a CHM helpfile. + * + * All fields are READ ONLY. 
+ */ +struct mschmd_file { + /** + * A pointer to the next file in the list, or NULL if this is the final + * file. + */ + struct mschmd_file *next; + + /** + * A pointer to the section that this file is located in. Indirectly, + * it also points to the CHM helpfile the file is located in. + */ + struct mschmd_section *section; + + /** The offset within the section data that this file is located at. */ + off_t offset; + + /** The length of this file, in bytes */ + off_t length; + + /** The filename of this file -- a null terminated string in UTF-8. */ + char *filename; +}; + +/** mschmc_file::section value: end of CHM file list */ +#define MSCHMC_ENDLIST (0) +/** mschmc_file::section value: this file is in the Uncompressed section */ +#define MSCHMC_UNCOMP (1) +/** mschmc_file::section value: this file is in the MSCompressed section */ +#define MSCHMC_MSCOMP (2) + +/** mschm_compressor::set_param() parameter: "timestamp" header */ +#define MSCHMC_PARAM_TIMESTAMP (0) +/** mschm_compressor::set_param() parameter: "language" header */ +#define MSCHMC_PARAM_LANGUAGE (1) +/** mschm_compressor::set_param() parameter: LZX window size */ +#define MSCHMC_PARAM_LZXWINDOW (2) +/** mschm_compressor::set_param() parameter: intra-chunk quickref density */ +#define MSCHMC_PARAM_DENSITY (3) +/** mschm_compressor::set_param() parameter: whether to create indices */ +#define MSCHMC_PARAM_INDEX (4) + +/** + * A compressor for .CHM (Microsoft HTMLHelp) files. + * + * All fields are READ ONLY. + * + * @see mspack_create_chm_compressor(), mspack_destroy_chm_compressor() + */ +struct mschm_compressor { + /** + * Generates a CHM help file. + * + * The help file will contain up to two sections, an Uncompressed + * section and potentially an MSCompressed (LZX compressed) + * section. + * + * While the contents listing of a CHM file is always in lexical order, + * the file list passed in will be taken as the correct order for files + * within the sections. 
It is in your interest to place similar files + * together for better compression. + * + * There are two modes of generation, to use a temporary file or not to + * use one. See use_temporary_file() for the behaviour of generate() in + * these two different modes. + * + * @param self a self-referential pointer to the mschm_compressor + * instance being called + * @param file_list an array of mschmc_file structures, terminated + * with an entry whose mschmc_file::section field is + * #MSCHMC_ENDLIST. The order of the list is + * preserved within each section. The length of any + * mschmc_file::chm_filename string cannot exceed + * roughly 4096 bytes. Each source file must be able + * to supply as many bytes as given in the + * mschmc_file::length field. + * @param output_file the file to write the generated CHM helpfile to. + * This is passed directly to mspack_system::open() + * @return an error code, or MSPACK_ERR_OK if successful + * @see use_temporary_file() set_param() + */ + int (*generate)(struct mschm_compressor *self, + struct mschmc_file file_list[], + const char *output_file); + + /** + * Specifies whether a temporary file is used during CHM generation. + * + * The CHM file format includes data about the compressed section (such + * as its overall size) that is stored in the output CHM file prior to + * the compressed section itself. This unavoidably requires that the + * compressed section has to be generated, before these details can be + * set. There are several ways this can be handled. Firstly, the + * compressed section could be generated entirely in memory before + * writing any of the output CHM file. This approach is not used in + * libmspack, as the compressed section can exceed the addressable + * memory space on most architectures. 
+ * + * libmspack has two options, either to write these unknowable sections + * with blank data, generate the compressed section, then re-open the + * output file for update once the compressed section has been + * completed, or to write the compressed section to a temporary file, + * then write the entire output file at once, performing a simple + * file-to-file copy for the compressed section. + * + * The simple solution of buffering the entire compressed section in + * memory can still be used, if desired. As the temporary file's + * filename is passed directly to mspack_system::open(), it is possible + * for a custom mspack_system implementation to hold this file in memory, + * without writing to a disk. + * + * If a temporary file is set, generate() performs the following + * sequence of events: the temporary file is opened for writing, the + * compression algorithm writes to the temporary file, the temporary + * file is closed. Then the output file is opened for writing and the + * temporary file is re-opened for reading. The output file is written + * and the temporary file is read from. Both files are then closed. The + * temporary file itself is not deleted. If that is desired, the + * temporary file should be deleted after the completion of generate(), + * if it exists. + * + * If a temporary file is set not to be used, generate() performs the + * following sequence of events: the output file is opened for writing, + * then it is written and closed. The output file is then re-opened for + * update, the appropriate sections are seek()ed to and re-written, then + * the output file is closed. + * + * @param self a self-referential pointer to the + * mschm_compressor instance being called + * @param use_temp_file non-zero if the temporary file should be used, + * zero if the temporary file should not be used. + * @param temp_file a file to temporarily write compressed data to, + * before opening it for reading and copying the + * contents to the output file. 
This is passed + * directly to mspack_system::open(). + * @return an error code, or MSPACK_ERR_OK if successful + * @see generate() + */ + int (*use_temporary_file)(struct mschm_compressor *self, + int use_temp_file, + const char *temp_file); + /** + * Sets a CHM compression engine parameter. + * + * The following parameters are defined: + + * - #MSCHMC_PARAM_TIMESTAMP: Sets the "timestamp" of the CHM file + * generated. This is not a timestamp, see mschmd_header::timestamp + * for a description. If this timestamp is 0, generate() will use its + * own algorithm for making a unique ID, based on the lengths and + * names of files in the CHM itself. Defaults to 0, any value between + * 0 and (2^32)-1 is valid. + * - #MSCHMC_PARAM_LANGUAGE: Sets the "language" of the CHM file + * generated. This is not the language used in the CHM file, but the + * language setting of the user who ran the HTMLHelp compiler. It + * defaults to 0x0409. The valid range is between 0x0000 and 0x7F7F. + * - #MSCHMC_PARAM_LZXWINDOW: Sets the size of the LZX history window, + * which is also the interval at which the compressed data stream can be + * randomly accessed. The value is not a size in bytes, but a power of + * two. The default value is 16 (which makes the window 2^16 bytes, or + * 64 kilobytes), the valid range is from 15 (32 kilobytes) to 21 (2 + * megabytes). + * - #MSCHMC_PARAM_DENSITY: Sets the "density" of quick reference + * entries stored at the end of directory listing chunk. Each chunk is + * 4096 bytes in size, and contains as many file entries as there is + * room for. At the other end of the chunk, a list of "quick reference" + * pointers is included. The offset of every 'N'th file entry is given a + * quick reference, where N = (2^density) + 1. The default density is + * 2. The smallest density is 0 (N=2), the maximum is 10 (N=1025). 
As + * each file entry requires at least 5 bytes, the maximum number of + * entries in a single chunk is roughly 800, so the maximum value 10 + * can be used to indicate there are no quickrefs at all. + * - #MSCHMC_PARAM_INDEX: Sets whether or not to include quick lookup + * index chunk(s), in addition to normal directory listing chunks. A + * value of zero means no index chunks will be created, a non-zero value + * means index chunks will be created. The default is zero, "don't + * create an index". + * + * @param self a self-referential pointer to the mschm_compressor + * instance being called + * @param param the parameter to set + * @param value the value to set the parameter to + * @return MSPACK_ERR_OK if all is OK, or MSPACK_ERR_ARGS if there + * is a problem with either parameter or value. + * @see generate() + */ + int (*set_param)(struct mschm_compressor *self, + int param, + unsigned int value); + + /** + * Returns the error code set by the most recently called method. + * + * @param self a self-referential pointer to the mschm_compressor + * instance being called + * @return the most recent error code + * @see set_param(), generate() + */ + int (*last_error)(struct mschm_compressor *self); +}; + +/** + * A decompressor for .CHM (Microsoft HTMLHelp) files + * + * All fields are READ ONLY. + * + * @see mspack_create_chm_decompressor(), mspack_destroy_chm_decompressor() + */ +struct mschm_decompressor { + /** + * Opens a CHM helpfile and reads its contents. + * + * If the file opened is a valid CHM helpfile, all headers will be read + * and a mschmd_header structure will be returned, with a full list of + * files. + * + * In the case of an error occuring, NULL is returned and the error code + * is available from last_error(). + * + * The filename pointer should be considered "in use" until close() is + * called on the CHM helpfile. 
+ * + * @param self a self-referential pointer to the mschm_decompressor + * instance being called + * @param filename the filename of the CHM helpfile. This is passed + * directly to mspack_system::open(). + * @return a pointer to a mschmd_header structure, or NULL on failure + * @see close() + */ + struct mschmd_header *(*open)(struct mschm_decompressor *self, + const char *filename); + + /** + * Closes a previously opened CHM helpfile. + * + * This closes a CHM helpfile, frees the mschmd_header and all + * mschmd_file structures associated with it (if any). This works on + * both helpfiles opened with open() and helpfiles opened with + * fast_open(). + * + * The CHM header pointer is now invalid and cannot be used again. All + * mschmd_file pointers referencing that CHM are also now invalid, and + * cannot be used again. + * + * @param self a self-referential pointer to the mschm_decompressor + * instance being called + * @param chm the CHM helpfile to close + * @see open(), fast_open() + */ + void (*close)(struct mschm_decompressor *self, + struct mschmd_header *chm); + + /** + * Extracts a file from a CHM helpfile. + * + * This extracts a file from a CHM helpfile and writes it to the given + * filename. The filename of the file, mscabd_file::filename, is not + * used by extract(), but can be used by the caller as a guide for + * constructing an appropriate filename. + * + * This method works both with files found in the mschmd_header::files + * and mschmd_header::sysfiles list and mschmd_file structures generated + * on the fly by fast_find(). 
+ * + * @param self a self-referential pointer to the mschm_decompressor + * instance being called + * @param file the file to be decompressed + * @param filename the filename of the file being written to + * @return an error code, or MSPACK_ERR_OK if successful + */ + int (*extract)(struct mschm_decompressor *self, + struct mschmd_file *file, + const char *filename); + + /** + * Returns the error code set by the most recently called method. + * + * This is useful for open() and fast_open(), which do not return an + * error code directly. + * + * @param self a self-referential pointer to the mschm_decompressor + * instance being called + * @return the most recent error code + * @see open(), extract() + */ + int (*last_error)(struct mschm_decompressor *self); + + /** + * Opens a CHM helpfile quickly. + * + * If the file opened is a valid CHM helpfile, only essential headers + * will be read. A mschmd_header structure will be still be returned, as + * with open(), but the mschmd_header::files field will be NULL. No + * files details will be automatically read. The fast_find() method + * must be used to obtain file details. + * + * In the case of an error occuring, NULL is returned and the error code + * is available from last_error(). + * + * The filename pointer should be considered "in use" until close() is + * called on the CHM helpfile. + * + * @param self a self-referential pointer to the mschm_decompressor + * instance being called + * @param filename the filename of the CHM helpfile. This is passed + * directly to mspack_system::open(). + * @return a pointer to a mschmd_header structure, or NULL on failure + * @see open(), close(), fast_find(), extract() + */ + struct mschmd_header *(*fast_open)(struct mschm_decompressor *self, + const char *filename); + + /** + * Finds file details quickly. 
+ * + * Instead of reading all CHM helpfile headers and building a list of + * files, fast_open() and fast_find() are intended for finding file + * details only when they are needed. The CHM file format includes an + * on-disk file index to allow this. + * + * Given a case-sensitive filename, fast_find() will search the on-disk + * index for that file. + * + * If the file was found, the caller-provided mschmd_file structure will + * be filled out like so: + * - section: the correct value for the found file + * - offset: the correct value for the found file + * - length: the correct value for the found file + * - all other structure elements: NULL or 0 + * + * If the file was not found, MSPACK_ERR_OK will still be returned as the + * result, but the caller-provided structure will be filled out like so: + * - section: NULL + * - offset: 0 + * - length: 0 + * - all other structure elements: NULL or 0 + * + * This method is intended to be used in conjunction with CHM helpfiles + * opened with fast_open(), but it also works with helpfiles opened + * using the regular open(). 
+ * + * @param self a self-referential pointer to the mschm_decompressor + * instance being called + * @param chm the CHM helpfile to search for the file + * @param filename the filename of the file to search for + * @param f_ptr a pointer to a caller-provded mschmd_file structure + * @param f_size sizeof(struct mschmd_file) + * @return an error code, or MSPACK_ERR_OK if successful + * @see open(), close(), fast_find(), extract() + */ + int (*fast_find)(struct mschm_decompressor *self, + struct mschmd_header *chm, + const char *filename, + struct mschmd_file *f_ptr, + int f_size); +}; + +/* --- support for .LIT (EBook) file format -------------------------------- */ + +/** TODO */ +struct mslit_compressor { + int dummy; +}; + +/** TODO */ +struct mslit_decompressor { + int dummy; +}; + + +/* --- support for .HLP (MS Help) file format ------------------------------ */ + +/** TODO */ +struct mshlp_compressor { + int dummy; +}; + +/** TODO */ +struct mshlp_decompressor { + int dummy; +}; + + +/* --- support for SZDD file format ---------------------------------------- */ + +/** msszdd_compressor::set_param() parameter: the missing character */ +#define MSSZDDC_PARAM_MISSINGCHAR (0) + +/** msszddd_header::format value - a regular SZDD file */ +#define MSSZDD_FMT_NORMAL (0) + +/** msszddd_header::format value - a special QBasic SZDD file */ +#define MSSZDD_FMT_QBASIC (1) + +/** + * A structure which represents an SZDD compressed file. + * + * All fields are READ ONLY. + */ +struct msszddd_header { + /** The file format; either #MSSZDD_FMT_NORMAL or #MSSZDD_FMT_QBASIC */ + int format; + + /** The amount of data in the SZDD file once uncompressed. */ + off_t length; + + /** + * The last character in the filename, traditionally replaced with an + * underscore to show the file is compressed. The null character is used + * to show that this character has not been stored (e.g. because the + * filename is not known). 
Generally, only characters that may appear in + * an MS-DOS filename (except ".") are valid. + */ + char missing_char; +}; + +/** + * A compressor for the SZDD file format. + * + * All fields are READ ONLY. + * + * @see mspack_create_szdd_compressor(), mspack_destroy_szdd_compressor() + */ +struct msszdd_compressor { + /** + * Reads an input file and creates a compressed output file in the + * SZDD compressed file format. The SZDD compression format is quick + * but gives poor compression. It is possible for the compressed output + * file to be larger than the input file. + * + * Conventionally, SZDD compressed files have the final character in + * their filename replaced with an underscore, to show they are + * compressed. The missing character is stored in the compressed file + * itself. This is due to the restricted filename conventions of MS-DOS, + * most operating systems, such as UNIX, simply append another file + * extension to the existing filename. As mspack does not deal with + * filenames, this is left up to you. If you wish to set the missing + * character stored in the file header, use set_param() with the + * #MSSZDDC_PARAM_MISSINGCHAR parameter. + * + * "Stream" compression (where the length of the input data is not + * known) is not possible. The length of the input data is stored in the + * header of the SZDD file and must therefore be known before any data + * is compressed. Due to technical limitations of the file format, the + * maximum size of uncompressed file that will be accepted is 2147483647 + * bytes. + * + * @param self a self-referential pointer to the msszdd_compressor + * instance being called + * @param input the name of the file to compressed. This is passed + * passed directly to mspack_system::open() + * @param output the name of the file to write compressed data to. + * This is passed directly to mspack_system::open(). 
+ * @param length the length of the uncompressed file, or -1 to indicate + * that this should be determined automatically by using + * mspack_system::seek() on the input file. + * @return an error code, or MSPACK_ERR_OK if successful + * @see set_param() + */ + int (*compress)(struct msszdd_compressor *self, + const char *input, + const char *output, + off_t length); + + /** + * Sets an SZDD compression engine parameter. + * + * The following parameters are defined: + + * - #MSSZDDC_PARAM_CHARACTER: the "missing character", the last character + * in the uncompressed file's filename, which is traditionally replaced + * with an underscore to show the file is compressed. Traditionally, + * this can only be a character that is a valid part of an MS-DOS, + * filename, but libmspack permits any character between 0x00 and 0xFF + * to be stored. 0x00 is the default, and it represents "no character + * stored". + * + * @param self a self-referential pointer to the msszdd_compressor + * instance being called + * @param param the parameter to set + * @param value the value to set the parameter to + * @return MSPACK_ERR_OK if all is OK, or MSPACK_ERR_ARGS if there + * is a problem with either parameter or value. + * @see compress() + */ + int (*set_param)(struct msszdd_compressor *self, + int param, + unsigned int value); + + /** + * Returns the error code set by the most recently called method. + * + * @param self a self-referential pointer to the msszdd_compressor + * instance being called + * @return the most recent error code + * @see compress() + */ + int (*last_error)(struct mschm_decompressor *self); +}; + +/** + * A decompressor for SZDD compressed files. + * + * All fields are READ ONLY. + * + * @see mspack_create_szdd_decompressor(), mspack_destroy_szdd_decompressor() + */ +struct msszdd_decompressor { + /** + * Opens a SZDD file and reads the header. 
+ * + * If the file opened is a valid SZDD file, all headers will be read and + * a msszddd_header structure will be returned. + * + * In the case of an error occuring, NULL is returned and the error code + * is available from last_error(). + * + * The filename pointer should be considered "in use" until close() is + * called on the SZDD file. + * + * @param self a self-referential pointer to the msszdd_decompressor + * instance being called + * @param filename the filename of the SZDD compressed file. This is + * passed directly to mspack_system::open(). + * @return a pointer to a msszddd_header structure, or NULL on failure + * @see close() + */ + struct msszddd_header *(*open)(struct msszdd_decompressor *self, + const char *filename); + + /** + * Closes a previously opened SZDD file. + * + * This closes a SZDD file and frees the msszddd_header associated with + * it. + * + * The SZDD header pointer is now invalid and cannot be used again. + * + * @param self a self-referential pointer to the msszdd_decompressor + * instance being called + * @param szdd the SZDD file to close + * @see open() + */ + void (*close)(struct msszdd_decompressor *self, + struct msszddd_header *szdd); + + /** + * Extracts the compressed data from a SZDD file. + * + * This decompresses the compressed SZDD data stream and writes it to + * an output file. + * + * @param self a self-referential pointer to the msszdd_decompressor + * instance being called + * @param szdd the SZDD file to extract data from + * @param filename the filename to write the decompressed data to. This + * is passed directly to mspack_system::open(). + * @return an error code, or MSPACK_ERR_OK if successful + */ + int (*extract)(struct msszdd_decompressor *self, + struct msszddd_header *szdd, + const char *filename); + + /** + * Decompresses an SZDD file to an output file in one step. 
+ * + * This opens an SZDD file as input, reads the header, then decompresses + * the compressed data immediately to an output file, finally closing + * both the input and output file. It is more convenient to use than + * open() then extract() then close(), if you do not need to know the + * SZDD output size or missing character. + * + * @param self a self-referential pointer to the msszdd_decompressor + * instance being called + * @param input the filename of the input SZDD file. This is passed + * directly to mspack_system::open(). + * @param output the filename to write the decompressed data to. This + * is passed directly to mspack_system::open(). + * @return an error code, or MSPACK_ERR_OK if successful + */ + int (*decompress)(struct msszdd_decompressor *self, + const char *input, + const char *output); + + /** + * Returns the error code set by the most recently called method. + * + * This is useful for open() which does not return an + * error code directly. + * + * @param self a self-referential pointer to the msszdd_decompressor + * instance being called + * @return the most recent error code + * @see open(), extract(), decompress() + */ + int (*last_error)(struct msszdd_decompressor *self); +}; + +/* --- support for KWAJ file format ---------------------------------------- */ + +/** mskwaj_compressor::set_param() parameter: compression type */ +#define MSKWAJC_PARAM_COMP_TYPE (0) + +/** mskwaj_compressor::set_param() parameter: include the length of the + * uncompressed file in the header? + */ +#define MSKWAJC_PARAM_INCLUDE_LENGTH (1) + +/** KWAJ compression type: no compression. */ +#define MSKWAJ_COMP_NONE (0) +/** KWAJ compression type: no compression, 0xFF XOR "encryption". 
*/ +#define MSKWAJ_COMP_XOR (1) +/** KWAJ compression type: LZSS (same method as SZDD) */ +#define MSKWAJ_COMP_SZDD (2) +/** KWAJ compression type: LZ+Huffman compression */ +#define MSKWAJ_COMP_LZH (3) +/** KWAJ compression type: MSZIP */ +#define MSKWAJ_COMP_MSZIP (4) + +/** KWAJ optional header flag: decompressed file length is included */ +#define MSKWAJ_HDR_HASLENGTH (0x01) + +/** KWAJ optional header flag: unknown 2-byte structure is included */ +#define MSKWAJ_HDR_HASUNKNOWN1 (0x02) + +/** KWAJ optional header flag: unknown multi-sized structure is included */ +#define MSKWAJ_HDR_HASUNKNOWN2 (0x04) + +/** KWAJ optional header flag: file name (no extension) is included */ +#define MSKWAJ_HDR_HASFILENAME (0x08) + +/** KWAJ optional header flag: file extension is included */ +#define MSKWAJ_HDR_HASFILEEXT (0x10) + +/** KWAJ optional header flag: extra text is included */ +#define MSKWAJ_HDR_HASEXTRATEXT (0x20) + +/** + * A structure which represents an KWAJ compressed file. + * + * All fields are READ ONLY. + */ +struct mskwajd_header { + /** The compression type; should be one of #MSKWAJ_COMP_NONE, + * #MSKWAJ_COMP_XOR, #MSKWAJ_COMP_SZDD or #MSKWAJ_COMP_LZH + */ + unsigned short comp_type; + + /** The offset in the file where the compressed data stream begins */ + off_t data_offset; + + /** Flags indicating which optional headers were included. */ + int headers; + + /** The amount of uncompressed data in the file, or 0 if not present. */ + off_t length; + + /** output filename, or NULL if not present */ + char *filename; + + /** extra uncompressed data (usually text) in the header. + * This data can contain nulls so use extra_length to get the size. + */ + char *extra; + + /** length of extra uncompressed data in the header */ + unsigned short extra_length; +}; + +/** + * A compressor for the KWAJ file format. + * + * All fields are READ ONLY. 
+ * + * @see mspack_create_kwaj_compressor(), mspack_destroy_kwaj_compressor() + */ +struct mskwaj_compressor { + /** + * Reads an input file and creates a compressed output file in the + * KWAJ compressed file format. The KWAJ compression format is quick + * but gives poor compression. It is possible for the compressed output + * file to be larger than the input file. + * + * @param self a self-referential pointer to the mskwaj_compressor + * instance being called + * @param input the name of the file to compressed. This is passed + * passed directly to mspack_system::open() + * @param output the name of the file to write compressed data to. + * This is passed directly to mspack_system::open(). + * @param length the length of the uncompressed file, or -1 to indicate + * that this should be determined automatically by using + * mspack_system::seek() on the input file. + * @return an error code, or MSPACK_ERR_OK if successful + * @see set_param() + */ + int (*compress)(struct mskwaj_compressor *self, + const char *input, + const char *output, + off_t length); + + /** + * Sets an KWAJ compression engine parameter. + * + * The following parameters are defined: + * + * - #MSKWAJC_PARAM_COMP_TYPE: the compression method to use. Must + * be one of #MSKWAJC_COMP_NONE, #MSKWAJC_COMP_XOR, #MSKWAJ_COMP_SZDD + * or #MSKWAJ_COMP_LZH. The default is #MSKWAJ_COMP_LZH. + * + * - #MSKWAJC_PARAM_INCLUDE_LENGTH: a boolean; should the compressed + * output file should include the uncompressed length of the input + * file in the header? This adds 4 bytes to the size of the output + * file. A value of zero says "no", non-zero says "yes". The default + * is "no". + * + * @param self a self-referential pointer to the mskwaj_compressor + * instance being called + * @param param the parameter to set + * @param value the value to set the parameter to + * @return MSPACK_ERR_OK if all is OK, or MSPACK_ERR_ARGS if there + * is a problem with either parameter or value. 
@see compress()
+ * + * @param self a self-referential pointer to the mskwaj_compressor + * instance being called + * @return the most recent error code + * @see compress() + */ + int (*last_error)(struct mschm_decompressor *self); +}; + +/** + * A decompressor for KWAJ compressed files. + * + * All fields are READ ONLY. + * + * @see mspack_create_kwaj_decompressor(), mspack_destroy_kwaj_decompressor() + */ +struct mskwaj_decompressor { + /** + * Opens a KWAJ file and reads the header. + * + * If the file opened is a valid KWAJ file, all headers will be read and + * a mskwajd_header structure will be returned. + * + * In the case of an error occuring, NULL is returned and the error code + * is available from last_error(). + * + * The filename pointer should be considered "in use" until close() is + * called on the KWAJ file. + * + * @param self a self-referential pointer to the mskwaj_decompressor + * instance being called + * @param filename the filename of the KWAJ compressed file. This is + * passed directly to mspack_system::open(). + * @return a pointer to a mskwajd_header structure, or NULL on failure + * @see close() + */ + struct mskwajd_header *(*open)(struct mskwaj_decompressor *self, + const char *filename); + + /** + * Closes a previously opened KWAJ file. + * + * This closes a KWAJ file and frees the mskwajd_header associated + * with it. The KWAJ header pointer is now invalid and cannot be + * used again. + * + * @param self a self-referential pointer to the mskwaj_decompressor + * instance being called + * @param kwaj the KWAJ file to close + * @see open() + */ + void (*close)(struct mskwaj_decompressor *self, + struct mskwajd_header *kwaj); + + /** + * Extracts the compressed data from a KWAJ file. + * + * This decompresses the compressed KWAJ data stream and writes it to + * an output file. 
+ * + * @param self a self-referential pointer to the mskwaj_decompressor + * instance being called + * @param kwaj the KWAJ file to extract data from + * @param filename the filename to write the decompressed data to. This + * is passed directly to mspack_system::open(). + * @return an error code, or MSPACK_ERR_OK if successful + */ + int (*extract)(struct mskwaj_decompressor *self, + struct mskwajd_header *kwaj, + const char *filename); + + /** + * Decompresses an KWAJ file to an output file in one step. + * + * This opens an KWAJ file as input, reads the header, then decompresses + * the compressed data immediately to an output file, finally closing + * both the input and output file. It is more convenient to use than + * open() then extract() then close(), if you do not need to know the + * KWAJ output size or output filename. + * + * @param self a self-referential pointer to the mskwaj_decompressor + * instance being called + * @param input the filename of the input KWAJ file. This is passed + * directly to mspack_system::open(). + * @param output the filename to write the decompressed data to. This + * is passed directly to mspack_system::open(). + * @return an error code, or MSPACK_ERR_OK if successful + */ + int (*decompress)(struct mskwaj_decompressor *self, + const char *input, + const char *output); + + /** + * Returns the error code set by the most recently called method. + * + * This is useful for open() which does not return an + * error code directly. + * + * @param self a self-referential pointer to the mskwaj_decompressor + * instance being called + * @return the most recent error code + * @see open(), search() + */ + int (*last_error)(struct mskwaj_decompressor *self); +}; + +/* --- support for .LZX (Offline Address Book) file format ----------------- */ + +/** + * A compressor for the Offline Address Book (OAB) format. + * + * All fields are READ ONLY. 
+ * + * @see mspack_create_oab_compressor(), mspack_destroy_oab_compressor() + */ +struct msoab_compressor { + /** + * Compress a full OAB file. + * + * The input file will be read and the compressed contents written to the + * output file. + * + * @param self a self-referential pointer to the msoab_decompressor + * instance being called + * @param input the filename of the input file. This is passed + * directly to mspack_system::open(). + * @param output the filename of the output file. This is passed + * directly to mspack_system::open(). + * @return an error code, or MSPACK_ERR_OK if successful + */ + int (*compress) (struct msoab_compressor *self, + const char *input, + const char *output); + + /** + * Generate a compressed incremental OAB patch file. + * + * The two uncompressed files "input" and "base" will be read, and an + * incremental patch to generate "input" from "base" will be written to + * the output file. + * + * @param self a self-referential pointer to the msoab_compressor + * instance being called + * @param input the filename of the input file containing the new + * version of its contents. This is passed directly + * to mspack_system::open(). + * @param base the filename of the original base file containing + * the old version of its contents, against which the + * incremental patch shall generated. This is passed + * directly to mspack_system::open(). + * @param output the filename of the output file. This is passed + * directly to mspack_system::open(). + * @return an error code, or MSPACK_ERR_OK if successful + */ + int (*compress_incremental) (struct msoab_compressor *self, + const char *input, + const char *base, + const char *output); +}; + +/** + * A decompressor for .LZX (Offline Address Book) files + * + * All fields are READ ONLY. + * + * @see mspack_create_oab_decompressor(), mspack_destroy_oab_decompressor() + */ +struct msoab_decompressor { + /** + * Decompresses a full Offline Address Book file. 
+ * + * If the input file is a valid compressed Offline Address Book file, + * it will be read and the decompressed contents will be written to + * the output file. + * + * @param self a self-referential pointer to the msoab_decompressor + * instance being called + * @param input the filename of the input file. This is passed + * directly to mspack_system::open(). + * @param output the filename of the output file. This is passed + * directly to mspack_system::open(). + * @return an error code, or MSPACK_ERR_OK if successful + */ + int (*decompress) (struct msoab_decompressor *self, + const char *input, + const char *output); + + /** + * Decompresses an Offline Address Book with an incremental patch file. + * + * This requires both a full UNCOMPRESSED Offline Address Book file to + * act as the "base", and a compressed incremental patch file as input. + * If the input file is valid, it will be decompressed with reference to + * the base file, and the decompressed contents will be written to the + * output file. + * + * There is no way to tell what the right base file is for the given + * incremental patch, but if you get it wrong, this will usually result + * in incorrect data being decompressed, which will then fail a checksum + * test. + * + * @param self a self-referential pointer to the msoab_decompressor + * instance being called + * @param input the filename of the input file. This is passed + * directly to mspack_system::open(). + * @param base the filename of the base file to which the + * incremental patch shall be applied. This is passed + * directly to mspack_system::open(). + * @param output the filename of the output file. This is passed + * directly to mspack_system::open(). 
+ * @return an error code, or MSPACK_ERR_OK if successful + */ + int (*decompress_incremental) (struct msoab_decompressor *self, + const char *input, + const char *base, + const char *output); +}; + +#ifdef __cplusplus +} #endif #endif diff --git a/third_party/mspack/readbits.h b/third_party/mspack/readbits.h new file mode 100644 index 000000000..9b237a369 --- /dev/null +++ b/third_party/mspack/readbits.h @@ -0,0 +1,207 @@ +/* This file is part of libmspack. + * (C) 2003-2010 Stuart Caie. + * + * libmspack is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License (LGPL) version 2.1 + * + * For further details, see the file COPYING.LIB distributed with libmspack + */ + +#ifndef MSPACK_READBITS_H +#define MSPACK_READBITS_H 1 + +/* this header defines macros that read data streams by + * the individual bits + * + * INIT_BITS initialises bitstream state in state structure + * STORE_BITS stores bitstream state in state structure + * RESTORE_BITS restores bitstream state from state structure + * ENSURE_BITS(n) ensure there are at least N bits in the bit buffer + * READ_BITS(var,n) takes N bits from the buffer and puts them in var + * PEEK_BITS(n) extracts without removing N bits from the bit buffer + * REMOVE_BITS(n) removes N bits from the bit buffer + * + * READ_BITS simply calls ENSURE_BITS, PEEK_BITS and REMOVE_BITS, + * which means it's limited to reading the number of bits you can + * ensure at any one time. It also fails if asked to read zero bits. + * If you need to read zero bits, or more bits than can be ensured in + * one go, use READ_MANY_BITS instead. 
+ * + * These macros have variable names baked into them, so to use them + * you have to define some macros: + * - BITS_TYPE: the type name of your state structure + * - BITS_VAR: the variable that points to your state structure + * - define BITS_ORDER_MSB if bits are read from the MSB, or + * define BITS_ORDER_LSB if bits are read from the LSB + * - READ_BYTES: some code that reads more data into the bit buffer, + * it should use READ_IF_NEEDED (calls read_input if the byte buffer + * is empty), then INJECT_BITS(data,n) to put data from the byte + * buffer into the bit buffer. + * + * You also need to define some variables and structure members: + * - unsigned char *i_ptr; // current position in the byte buffer + * - unsigned char *i_end; // end of the byte buffer + * - unsigned int bit_buffer; // the bit buffer itself + * - unsigned int bits_left; // number of bits remaining + * + * If you use read_input() and READ_IF_NEEDED, they also expect these + * structure members: + * - struct mspack_system *sys; // to access sys->read() + * - unsigned int error; // to record/return read errors + * - unsigned char input_end; // to mark reaching the EOF + * - unsigned char *inbuf; // the input byte buffer + * - unsigned int inbuf_size; // the size of the input byte buffer + * + * Your READ_BYTES implementation should read data from *i_ptr and + * put them in the bit buffer. READ_IF_NEEDED will call read_input() + * if i_ptr reaches i_end, and will fill up inbuf and set i_ptr to + * the start of inbuf and i_end to the end of inbuf. + * + * If you're reading in MSB order, the routines work by using the area + * beyond the MSB and the LSB of the bit buffer as a free source of + * zeroes when shifting. This avoids having to mask any bits. So we + * have to know the bit width of the bit buffer variable. We use + * and CHAR_BIT to find the size of the bit buffer in bits. + * + * If you are reading in LSB order, bits need to be masked. 
Normally + * this is done by computing the mask: N bits are masked by the value + * (1< +#endif +#ifndef CHAR_BIT +# define CHAR_BIT (8) +#endif +#define BITBUF_WIDTH (sizeof(bit_buffer) * CHAR_BIT) + +#define INIT_BITS do { \ + BITS_VAR->i_ptr = &BITS_VAR->inbuf[0]; \ + BITS_VAR->i_end = &BITS_VAR->inbuf[0]; \ + BITS_VAR->bit_buffer = 0; \ + BITS_VAR->bits_left = 0; \ + BITS_VAR->input_end = 0; \ +} while (0) + +#define STORE_BITS do { \ + BITS_VAR->i_ptr = i_ptr; \ + BITS_VAR->i_end = i_end; \ + BITS_VAR->bit_buffer = bit_buffer; \ + BITS_VAR->bits_left = bits_left; \ +} while (0) + +#define RESTORE_BITS do { \ + i_ptr = BITS_VAR->i_ptr; \ + i_end = BITS_VAR->i_end; \ + bit_buffer = BITS_VAR->bit_buffer; \ + bits_left = BITS_VAR->bits_left; \ +} while (0) + +#define ENSURE_BITS(nbits) do { \ + while (bits_left < (nbits)) READ_BYTES; \ +} while (0) + +#define READ_BITS(val, nbits) do { \ + ENSURE_BITS(nbits); \ + (val) = PEEK_BITS(nbits); \ + REMOVE_BITS(nbits); \ +} while (0) + +#define READ_MANY_BITS(val, bits) do { \ + unsigned char needed = (bits), bitrun; \ + (val) = 0; \ + while (needed > 0) { \ + if (bits_left <= (BITBUF_WIDTH - 16)) READ_BYTES; \ + bitrun = (bits_left < needed) ? 
bits_left : needed; \ + (val) = ((val) << bitrun) | PEEK_BITS(bitrun); \ + REMOVE_BITS(bitrun); \ + needed -= bitrun; \ + } \ +} while (0) + +#ifdef BITS_ORDER_MSB +# define PEEK_BITS(nbits) (bit_buffer >> (BITBUF_WIDTH - (nbits))) +# define REMOVE_BITS(nbits) ((bit_buffer <<= (nbits)), (bits_left -= (nbits))) +# define INJECT_BITS(bitdata,nbits) ((bit_buffer |= \ + (bitdata) << (BITBUF_WIDTH - (nbits) - bits_left)), (bits_left += (nbits))) +#else /* BITS_ORDER_LSB */ +# define PEEK_BITS(nbits) (bit_buffer & ((1 << (nbits))-1)) +# define REMOVE_BITS(nbits) ((bit_buffer >>= (nbits)), (bits_left -= (nbits))) +# define INJECT_BITS(bitdata,nbits) ((bit_buffer |= \ + (bitdata) << bits_left), (bits_left += (nbits))) +#endif + +#ifdef BITS_LSB_TABLE +/* lsb_bit_mask[n] = (1 << n) - 1 */ +static const unsigned short lsb_bit_mask[17] = { + 0x0000, 0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, 0x007f, 0x00ff, + 0x01ff, 0x03ff, 0x07ff, 0x0fff, 0x1fff, 0x3fff, 0x7fff, 0xffff +}; +# define PEEK_BITS_T(nbits) (bit_buffer & lsb_bit_mask[(nbits)]) +# define READ_BITS_T(val, nbits) do { \ + ENSURE_BITS(nbits); \ + (val) = PEEK_BITS_T(nbits); \ + REMOVE_BITS(nbits); \ +} while (0) +#endif + +#ifndef BITS_NO_READ_INPUT +# define READ_IF_NEEDED do { \ + if (i_ptr >= i_end) { \ + if (read_input(BITS_VAR)) \ + return BITS_VAR->error; \ + i_ptr = BITS_VAR->i_ptr; \ + i_end = BITS_VAR->i_end; \ + } \ +} while (0) + +static int read_input(BITS_TYPE *p) { + int read = p->sys->read(p->input, &p->inbuf[0], (int)p->inbuf_size); + if (read < 0) return p->error = MSPACK_ERR_READ; + + /* we might overrun the input stream by asking for bits we don't use, + * so fake 2 more bytes at the end of input */ + if (read == 0) { + if (p->input_end) { + D(("out of input bytes")) + return p->error = MSPACK_ERR_READ; + } + else { + read = 2; + p->inbuf[0] = p->inbuf[1] = 0; + p->input_end = 1; + } + } + + /* update i_ptr and i_end */ + p->i_ptr = &p->inbuf[0]; + p->i_end = &p->inbuf[read]; + return 
MSPACK_ERR_OK; +} +#endif +#endif diff --git a/third_party/mspack/readhuff.h b/third_party/mspack/readhuff.h new file mode 100644 index 000000000..4d9422578 --- /dev/null +++ b/third_party/mspack/readhuff.h @@ -0,0 +1,172 @@ +/* This file is part of libmspack. + * (C) 2003-2014 Stuart Caie. + * + * libmspack is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License (LGPL) version 2.1 + * + * For further details, see the file COPYING.LIB distributed with libmspack + */ + +#ifndef MSPACK_READHUFF_H +#define MSPACK_READHUFF_H 1 + +/* This implements a fast Huffman tree decoding system. */ + +#if !(defined(BITS_ORDER_MSB) || defined(BITS_ORDER_LSB)) +# error "readhuff.h is used in conjunction with readbits.h, include that first" +#endif +#if !(defined(TABLEBITS) && defined(MAXSYMBOLS)) +# error "define TABLEBITS(tbl) and MAXSYMBOLS(tbl) before using readhuff.h" +#endif +#if !(defined(HUFF_TABLE) && defined(HUFF_LEN)) +# error "define HUFF_TABLE(tbl) and HUFF_LEN(tbl) before using readhuff.h" +#endif +#ifndef HUFF_ERROR +# error "define HUFF_ERROR before using readhuff.h" +#endif +#ifndef HUFF_MAXBITS +# define HUFF_MAXBITS 16 +#endif + +/* Decodes the next huffman symbol from the input bitstream into var. + * Do not use this macro on a table unless build_decode_table() succeeded. 
+ */ +#define READ_HUFFSYM(tbl, var) do { \ + ENSURE_BITS(HUFF_MAXBITS); \ + sym = HUFF_TABLE(tbl, PEEK_BITS(TABLEBITS(tbl))); \ + if (sym >= MAXSYMBOLS(tbl)) HUFF_TRAVERSE(tbl); \ + (var) = sym; \ + i = HUFF_LEN(tbl, sym); \ + REMOVE_BITS(i); \ +} while (0) + +#ifdef BITS_ORDER_LSB +# define HUFF_TRAVERSE(tbl) do { \ + i = TABLEBITS(tbl) - 1; \ + do { \ + if (i++ > HUFF_MAXBITS) HUFF_ERROR; \ + sym = HUFF_TABLE(tbl, \ + (sym << 1) | ((bit_buffer >> i) & 1)); \ + } while (sym >= MAXSYMBOLS(tbl)); \ +} while (0) +#else +#define HUFF_TRAVERSE(tbl) do { \ + i = 1 << (BITBUF_WIDTH - TABLEBITS(tbl)); \ + do { \ + if ((i >>= 1) == 0) HUFF_ERROR; \ + sym = HUFF_TABLE(tbl, \ + (sym << 1) | ((bit_buffer & i) ? 1 : 0)); \ + } while (sym >= MAXSYMBOLS(tbl)); \ +} while (0) +#endif + +/* make_decode_table(nsyms, nbits, length[], table[]) + * + * This function was originally coded by David Tritscher. + * It builds a fast huffman decoding table from + * a canonical huffman code lengths table. + * + * nsyms = total number of symbols in this huffman tree. + * nbits = any symbols with a code length of nbits or less can be decoded + * in one lookup of the table. + * length = A table to get code lengths from [0 to nsyms-1] + * table = The table to fill up with decoded symbols and pointers. 
Should be ((1<<nbits) + (nsyms*2)) in length.
+ *
+ * Returns 0 for OK or 1 for error
+ */
+static int make_decode_table(unsigned int nsyms, unsigned int nbits,
+                             unsigned char *length, unsigned short *table)
+{
+    register unsigned short sym, next_symbol;
+    register unsigned int leaf, fill;
+#ifdef BITS_ORDER_LSB
+    register unsigned int reverse;
+#endif
+    register char bit_num;
+    register unsigned int pos         = 0; /* the current position in the decode table */
+    register unsigned int table_mask  = 1 << nbits;
+    register unsigned int bit_mask    = table_mask >> 1; /* don't do 0 length codes */
+ * codes now start at bit nbits+16 and end at (nbits+16-codelength) */ + pos <<= 16; + table_mask <<= 16; + bit_mask = 1 << 15; + + for (bit_num = nbits+1; bit_num <= HUFF_MAXBITS; bit_num++) { + for (sym = 0; sym < nsyms; sym++) { + if (length[sym] != bit_num) continue; + if (pos >= table_mask) return 1; /* table overflow */ + +#ifdef BITS_ORDER_MSB + leaf = pos >> 16; +#else + /* leaf = the first nbits of the code, reversed */ + reverse = pos >> 16; leaf = 0; fill = nbits; + do {leaf <<= 1; leaf |= reverse & 1; reverse >>= 1;} while (--fill); +#endif + for (fill = 0; fill < (bit_num - nbits); fill++) { + /* if this path hasn't been taken yet, 'allocate' two entries */ + if (table[leaf] == 0xFFFF) { + table[(next_symbol << 1) ] = 0xFFFF; + table[(next_symbol << 1) + 1 ] = 0xFFFF; + table[leaf] = next_symbol++; + } + + /* follow the path and select either left or right for next bit */ + leaf = table[leaf] << 1; + if ((pos >> (15-fill)) & 1) leaf++; + } + table[leaf] = sym; + pos += bit_mask; + } + bit_mask >>= 1; + } + + /* full table? */ + return (pos == table_mask) ? 0 : 1; +} +#endif diff --git a/third_party/mspack/system.c b/third_party/mspack/system.c new file mode 100644 index 000000000..16aa8806d --- /dev/null +++ b/third_party/mspack/system.c @@ -0,0 +1,242 @@ +/* This file is part of libmspack. + * (C) 2003-2004 Stuart Caie. 
 *
 * libmspack is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License (LGPL) version 2.1
 *
 * For further details, see the file COPYING.LIB distributed with libmspack
 */

/* NOTE(review): the two #include directives below lost their header names
 * during extraction; upstream libmspack system.c has <config.h> under the
 * HAVE_CONFIG_H guard and <system.h> below it. TODO: restore before
 * compiling. */
#ifdef HAVE_CONFIG_H
# include
#endif

#include

/* message text used when a >2GB offset is encountered but the library was
 * built without large-file support (see LARGEFILE_SUPPORT in system.h) */
#if !LARGEFILE_SUPPORT
const char *largefile_msg = "library not compiled to support large files.";
#endif


/* mspack_version(): report the interface version of one library component.
 * Returns 2 or 1 for implemented components, 0 for components that are
 * declared but not yet implemented, and -1 for an unknown entity code. */
int mspack_version(int entity) {
  switch (entity) {
  /* CHM decoder version 1 -> 2 changes:
   * - added mschmd_sec_mscompressed::spaninfo
   * - added mschmd_header::first_pmgl
   * - added mschmd_header::last_pmgl
   * - added mschmd_header::chunk_cache;
   */
  case MSPACK_VER_MSCHMD:
  /* CAB decoder version 1 -> 2 changes:
   * - added MSCABD_PARAM_SALVAGE
   */
  case MSPACK_VER_MSCABD:
    return 2;
  case MSPACK_VER_LIBRARY:
  case MSPACK_VER_SYSTEM:
  case MSPACK_VER_MSSZDDD:
  case MSPACK_VER_MSKWAJD:
  case MSPACK_VER_MSOABD:
    return 1;
  case MSPACK_VER_MSCABC:
  case MSPACK_VER_MSCHMC:
  case MSPACK_VER_MSLITD:
  case MSPACK_VER_MSLITC:
  case MSPACK_VER_MSHLPD:
  case MSPACK_VER_MSHLPC:
  case MSPACK_VER_MSSZDDC:
  case MSPACK_VER_MSKWAJC:
  case MSPACK_VER_MSOABC:
    return 0;
  }
  /* unknown entity code */
  return -1;
}

/* self-test helper: confirms the caller and the compiled library agree on
 * sizeof(off_t); a mismatch is reported as MSPACK_ERR_SEEK. */
int mspack_sys_selftest_internal(int offt_size) {
  return (sizeof(off_t) == offt_size) ?
    MSPACK_ERR_OK : MSPACK_ERR_SEEK;
}

/* validates a system structure: every callback pointer must be non-NULL and
 * the reserved null_ptr member must be NULL (guards against callers passing
 * a struct laid out for a different library version) */
int mspack_valid_system(struct mspack_system *sys) {
  return (sys != NULL) && (sys->open != NULL) && (sys->close != NULL) &&
    (sys->read != NULL) && (sys->write != NULL) && (sys->seek != NULL) &&
    (sys->tell != NULL) && (sys->message != NULL) && (sys->alloc != NULL) &&
    (sys->free != NULL) && (sys->copy != NULL) && (sys->null_ptr == NULL);
}

/* returns the length of a file opened for reading, using only the seek/tell
 * callbacks (seek to end, tell, seek back). The file position is restored
 * on success; on a failed seek the position is unspecified. Returns
 * MSPACK_ERR_OK, MSPACK_ERR_OPEN (bad arguments) or MSPACK_ERR_SEEK. */
int mspack_sys_filelen(struct mspack_system *system,
                       struct mspack_file *file, off_t *length)
{
  off_t current;

  if (!system || !file || !length) return MSPACK_ERR_OPEN;

  /* get current offset */
  current = system->tell(file);

  /* seek to end of file */
  if (system->seek(file, (off_t) 0, MSPACK_SYS_SEEK_END)) {
    return MSPACK_ERR_SEEK;
  }

  /* get offset of end of file */
  *length = system->tell(file);

  /* seek back to original offset */
  if (system->seek(file, current, MSPACK_SYS_SEEK_START)) {
    return MSPACK_ERR_SEEK;
  }

  return MSPACK_ERR_OK;
}



/* definition of mspack_default_system -- if the library is compiled with
 * MSPACK_NO_DEFAULT_SYSTEM, no default system will be provided. Otherwise,
 * an appropriate default system (e.g.
 the standard C library, or some native
 * API calls)
 */

#ifdef MSPACK_NO_DEFAULT_SYSTEM
struct mspack_system *mspack_default_system = NULL;
#else

/* implementation of mspack_default_system for standard C library */

/* NOTE(review): the four #include directives below lost their header names
 * during extraction; upstream libmspack system.c includes <stdio.h>,
 * <stdlib.h>, <string.h> and <stdarg.h> here. TODO: restore before
 * compiling. */
#include
#include
#include
#include

/* private per-file state behind the opaque struct mspack_file handle:
 * the stdio stream plus the filename (kept only for error messages;
 * the pointer is borrowed from the caller, not copied) */
struct mspack_file_p {
  FILE *fh;
  const char *name;
};

/* open callback: map the MSPACK_SYS_OPEN_* mode onto an fopen() mode
 * string, allocate the wrapper and open the stream. Returns NULL on bad
 * mode, allocation failure or fopen failure (wrapper is freed on the
 * latter, so nothing leaks). */
static struct mspack_file *msp_open(struct mspack_system *self,
                                    const char *filename, int mode)
{
  struct mspack_file_p *fh;
  const char *fmode;

  switch (mode) {
  case MSPACK_SYS_OPEN_READ:   fmode = "rb";  break;
  case MSPACK_SYS_OPEN_WRITE:  fmode = "wb";  break;
  case MSPACK_SYS_OPEN_UPDATE: fmode = "r+b"; break;
  case MSPACK_SYS_OPEN_APPEND: fmode = "ab";  break;
  default: return NULL;
  }

  if ((fh = (struct mspack_file_p *) malloc(sizeof(struct mspack_file_p)))) {
    fh->name = filename;
    if ((fh->fh = fopen(filename, fmode))) return (struct mspack_file *) fh;
    free(fh);
  }
  return NULL;
}

/* close callback: close the stream and free the wrapper. NULL is ignored.
 * NOTE(review): fclose()'s return value is discarded, so a failed flush of
 * buffered writes goes unreported. */
static void msp_close(struct mspack_file *file) {
  struct mspack_file_p *self = (struct mspack_file_p *) file;
  if (self) {
    fclose(self->fh);
    free(self);
  }
}

/* read callback: returns bytes read (may be short at EOF), or -1 on a
 * stream error or bad arguments */
static int msp_read(struct mspack_file *file, void *buffer, int bytes) {
  struct mspack_file_p *self = (struct mspack_file_p *) file;
  if (self && buffer && bytes >= 0) {
    size_t count = fread(buffer, 1, (size_t) bytes, self->fh);
    if (!ferror(self->fh)) return (int) count;
  }
  return -1;
}

/* write callback: returns bytes written, or -1 on a stream error or bad
 * arguments */
static int msp_write(struct mspack_file *file, void *buffer, int bytes) {
  struct mspack_file_p *self = (struct mspack_file_p *) file;
  if (self && buffer && bytes >= 0) {
    size_t count = fwrite(buffer, 1, (size_t) bytes, self->fh);
    if (!ferror(self->fh)) return (int) count;
  }
  return -1;
}

/* seek callback: translate MSPACK_SYS_SEEK_* into the stdio whence value
 * (reusing the mode variable), then delegate to fseeko/fseek. Returns 0 on
 * success, non-zero on failure. */
static int msp_seek(struct mspack_file *file, off_t offset, int mode) {
  struct mspack_file_p *self = (struct mspack_file_p *) file;
  if (self) {
    switch (mode) {
    case MSPACK_SYS_SEEK_START: mode = SEEK_SET; break;
    case MSPACK_SYS_SEEK_CUR:   mode = SEEK_CUR; break;
+ case MSPACK_SYS_SEEK_END: mode = SEEK_END; break; + default: return -1; + } +#if HAVE_FSEEKO + return fseeko(self->fh, offset, mode); +#else + return fseek(self->fh, offset, mode); +#endif + } + return -1; +} + +static off_t msp_tell(struct mspack_file *file) { + struct mspack_file_p *self = (struct mspack_file_p *) file; +#if HAVE_FSEEKO + return (self) ? (off_t) ftello(self->fh) : 0; +#else + return (self) ? (off_t) ftell(self->fh) : 0; +#endif +} + +static void msp_msg(struct mspack_file *file, const char *format, ...) { + va_list ap; + if (file) fprintf(stderr, "%s: ", ((struct mspack_file_p *) file)->name); + va_start(ap, format); + vfprintf(stderr, format, ap); + va_end(ap); + fputc((int) '\n', stderr); + fflush(stderr); +} + +static void *msp_alloc(struct mspack_system *self, size_t bytes) { +#if DEBUG + /* make uninitialised data obvious */ + char *buf = malloc(bytes + 8); + if (buf) memset(buf, 0xDC, bytes); + *((size_t *)buf) = bytes; + return &buf[8]; +#else + return malloc(bytes); +#endif +} + +static void msp_free(void *buffer) { +#if DEBUG + char *buf = buffer; + size_t bytes; + if (buf) { + buf -= 8; + bytes = *((size_t *)buf); + /* make freed data obvious */ + memset(buf, 0xED, bytes); + free(buf); + } +#else + free(buffer); +#endif +} + +static void msp_copy(void *src, void *dest, size_t bytes) { + memcpy(dest, src, bytes); +} + +static struct mspack_system msp_system = { + &msp_open, &msp_close, &msp_read, &msp_write, &msp_seek, + &msp_tell, &msp_msg, &msp_alloc, &msp_free, &msp_copy, NULL +}; + +struct mspack_system *mspack_default_system = &msp_system; + +#endif diff --git a/third_party/mspack/system.h b/third_party/mspack/system.h new file mode 100644 index 000000000..826e89f3e --- /dev/null +++ b/third_party/mspack/system.h @@ -0,0 +1,113 @@ +/* This file is part of libmspack. + * (C) 2003-2018 Stuart Caie. 
 *
 * libmspack is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License (LGPL) version 2.1
 *
 * For further details, see the file COPYING.LIB distributed with libmspack
 */

#ifndef MSPACK_SYSTEM_H
#define MSPACK_SYSTEM_H 1

#ifdef __cplusplus
extern "C" {
#endif

/* ensure config.h is read before mspack.h */
/* NOTE(review): the #include directives in this header lost their header
 * names during extraction; upstream libmspack system.h has <config.h>,
 * <mspack.h>, <string.h> and (under DEBUG) <stdio.h> in these positions.
 * TODO: restore before compiling. */
#ifdef HAVE_CONFIG_H
# include
#endif

#include

/* assume exists */
#include

/* fix for problem with GCC 4 and glibc (thanks to Ville Skytta)
 * http://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=150429
 */
#ifdef read
# undef read
#endif

/* Old GCCs don't have __func__, but __FUNCTION__:
 * http://gcc.gnu.org/onlinedocs/gcc/Function-Names.html
 */
#if __STDC_VERSION__ < 199901L
# if __GNUC__ >= 2
#  define __func__ __FUNCTION__
# else
#  define __func__ ""
# endif
#endif

/* D(x): debug trace macro. Usage D(("fmt", args)) -- the doubled parens
 * pass a complete printf argument list. Compiles to nothing unless DEBUG. */
#if DEBUG
# include
# define D(x) do { printf("%s:%d (%s) ",__FILE__, __LINE__, __func__); \
                   printf x ; fputc('\n', stdout); fflush(stdout);} while (0);
#else
# define D(x)
#endif

/* CAB supports searching through files over 4GB in size, and the CHM file
 * format actively uses 64-bit offsets. These can only be fully supported
 * if the system the code runs on supports large files. If not, the library
 * will work as normal using only 32-bit arithmetic, but if an offset
 * greater than 2GB is detected, an error message indicating the library
 * can't support the file should be printed.
 */
/* NOTE(review): the two bare #include directives below lost their header
 * names during extraction; upstream libmspack system.h has <inttypes.h>
 * under HAVE_INTTYPES_H, and the second is presumably <sys/types.h> (for
 * off_t) -- verify against upstream before compiling. */
#if HAVE_INTTYPES_H
# include
#else
/* fallback printf format macros when <inttypes.h> is unavailable */
# define PRId64 "lld"
# define PRIu64 "llu"
# define PRId32 "ld"
# define PRIu32 "lu"
#endif

#include
/* detect large-file support from any of the common feature macros; LD/LU
 * are the printf formats decoders use for off_t values */
#if ((defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS >= 64) || \
     (defined(FILESIZEBITS) && FILESIZEBITS >= 64) || \
     defined(_LARGEFILE_SOURCE) || defined(_LARGEFILE64_SOURCE) || \
     SIZEOF_OFF_T >= 8)
# define LARGEFILE_SUPPORT 1
# define LD PRId64
# define LU PRIu64
#else
/* no large-file support: decoders print largefile_msg (system.c) when a
 * >2GB offset is seen */
extern const char *largefile_msg;
# define LD PRId32
# define LU PRIu32
#endif

/* endian-neutral reading of little-endian data */
#define __egi32(a,n) ( ((((unsigned char *) a)[n+3]) << 24) | \
                       ((((unsigned char *) a)[n+2]) << 16) | \
                       ((((unsigned char *) a)[n+1]) <<  8) | \
                       ((((unsigned char *) a)[n+0])))
#define EndGetI64(a) ((((unsigned long long int) __egi32(a,4)) << 32) | \
                      ((unsigned int) __egi32(a,0)))
#define EndGetI32(a) __egi32(a,0)
#define EndGetI16(a) ((((a)[1])<<8)|((a)[0]))

/* endian-neutral reading of big-endian data */
#define EndGetM32(a) (((((unsigned char *) a)[0]) << 24) | \
                      ((((unsigned char *) a)[1]) << 16) | \
                      ((((unsigned char *) a)[2]) <<  8) | \
                      ((((unsigned char *) a)[3])))
#define EndGetM16(a) ((((a)[0])<<8)|((a)[1]))

extern struct mspack_system *mspack_default_system;

/* returns the length of a file opened for reading */
extern int mspack_sys_filelen(struct mspack_system *system,
                              struct mspack_file *file, off_t *length);

/* validates a system structure */
extern int mspack_valid_system(struct mspack_system *sys);

#ifdef __cplusplus
}
#endif

#endif