Migration pull request for 9.2

- Mattias's patch to support concurrent bounce buffers for PCI devices - David's memory leak fix in dirty_memory_extend() - Fabiano's CI fix to disable vmstate-static-checker test in compat tests - Denis's patch that adds one more trace point for cpu throttle changes - Yichen's multifd qatzip compressor support -----BEGIN PGP SIGNATURE----- iIgEABYKADAWIQS5GE3CDMRX2s990ak7X8zN86vXBgUCZt9VlhIccGV0ZXJ4QHJl ZGhhdC5jb20ACgkQO1/MzfOr1wZ+4QD+NPzprFD7RF2DPHT5bdo6NTWFSZxW4dyD oFp2vhYDEAYA/A5TTfOh3QpYBOaP2PxztZIZSLgs1bokhp+sLM3/PVsK =6JYP -----END PGP SIGNATURE----- Merge tag 'migration-20240909-pull-request' of https://gitlab.com/peterx/qemu into staging Migration pull request for 9.2 - Mattias's patch to support concurrent bounce buffers for PCI devices - David's memory leak fix in dirty_memory_extend() - Fabiano's CI fix to disable vmstate-static-checker test in compat tests - Denis's patch that adds one more trace point for cpu throttle changes - Yichen's multifd qatzip compressor support # -----BEGIN PGP SIGNATURE----- # # iIgEABYKADAWIQS5GE3CDMRX2s990ak7X8zN86vXBgUCZt9VlhIccGV0ZXJ4QHJl # ZGhhdC5jb20ACgkQO1/MzfOr1wZ+4QD+NPzprFD7RF2DPHT5bdo6NTWFSZxW4dyD # oFp2vhYDEAYA/A5TTfOh3QpYBOaP2PxztZIZSLgs1bokhp+sLM3/PVsK # =6JYP # -----END PGP SIGNATURE----- # gpg: Signature made Mon 09 Sep 2024 21:07:50 BST # gpg: using EDDSA key B9184DC20CC457DACF7DD1A93B5FCCCDF3ABD706 # gpg: issuer "peterx@redhat.com" # gpg: Good signature from "Peter Xu <xzpeter@gmail.com>" [marginal] # gpg: aka "Peter Xu <peterx@redhat.com>" [marginal] # gpg: WARNING: This key is not certified with sufficiently trusted signatures! # gpg: It is not certain that the signature belongs to the owner. # Primary key fingerprint: B918 4DC2 0CC4 57DA CF7D D1A9 3B5F CCCD F3AB D706 * tag 'migration-20240909-pull-request' of https://gitlab.com/peterx/qemu: system: improve migration debug tests/migration: Add integration test for 'qatzip' compression method migration: Introduce 'qatzip' compression method migration: Add migration parameters for QATzip meson: Introduce 'qatzip' feature to the build system docs/migration: add qatzip compression feature ci: migration: Don't run python tests in the compat job softmmu/physmem: fix memory leak in dirty_memory_extend() softmmu: Support concurrent bounce buffers Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2024-09-10 11:19:22 +01:00 · 2024-09-10 11:19:22 +01:00 · a66f28df65
parent 7bbadc60b5 89bccecdda
commit a66f28df65
23 changed files with 766 additions and 64 deletions
--- a/.gitlab-ci.d/buildtest.yml
+++ b/.gitlab-ci.d/buildtest.yml
@ -212,6 +212,14 @@ build-previous-qemu:
    # testing an old QEMU against new features/tests that it is not
    # compatible with.
    - cd build-previous
+    # Don't allow python-based tests to run. The
+    # vmstate-checker-script test has a race that causes it to fail
+    # sometimes. It cannot be fixed it because this job runs the test
+    # from the old QEMU version. The test will be removed on master,
+    # but this job will only see the change in the next release.
+    #
+    # TODO: remove this line after 9.2 release
+    - unset PYTHON
    # old to new
    - QTEST_QEMU_BINARY_SRC=./qemu-system-${TARGET}
          QTEST_QEMU_BINARY=../build/qemu-system-${TARGET} ./tests/qtest/migration-test
--- a/docs/devel/migration/features.rst
+++ b/docs/devel/migration/features.rst
@ -14,3 +14,4 @@ Migration has plenty of features to support different use cases.
   CPR
   qpl-compression
   uadk-compression
+   qatzip-compression
--- a/docs/devel/migration/qatzip-compression.rst
+++ b/docs/devel/migration/qatzip-compression.rst
@ -0,0 +1,165 @@
+==================
+QATzip Compression
+==================
+In scenarios with limited network bandwidth, the ``QATzip`` solution can help
+users save a lot of host CPU resources by accelerating compression and
+decompression through the Intel QuickAssist Technology(``QAT``) hardware.
+
+
+The following test was conducted using 8 multifd channels and 10Gbps network
+bandwidth. The results show that, compared to zstd, ``QATzip`` significantly
+saves CPU resources on the sender and reduces migration time. Compared to the
+uncompressed solution, ``QATzip`` greatly improves the dirty page processing
+capability, indicated by the Pages per Second metric, and also reduces the
+total migration time.
+
+::
+
+   VM Configuration: 16 vCPU and 64G memory
+   VM Workload: all vCPUs are idle and 54G memory is filled with Silesia data.
+   QAT Devices: 4
+   |-----------|--------|---------|----------|----------|------|------|
+   |8 Channels |Total   |down     |throughput|pages per | send | recv |
+   |           |time(ms)|time(ms) |(mbps)    |second    | cpu %| cpu% |
+   |-----------|--------|---------|----------|----------|------|------|
+   |qatzip     |   16630|       28|     10467|   2940235|   160|   360|
+   |-----------|--------|---------|----------|----------|------|------|
+   |zstd       |   20165|       24|      8579|   2391465|   810|   340|
+   |-----------|--------|---------|----------|----------|------|------|
+   |none       |   46063|       40|     10848|    330240|    45|    85|
+   |-----------|--------|---------|----------|----------|------|------|
+
+
+QATzip Compression Framework
+============================
+
+``QATzip`` is a user space library which builds on top of the Intel QuickAssist
+Technology to provide extended accelerated compression and decompression
+services.
+
+For more ``QATzip`` introduction, please refer to `QATzip Introduction
+<https://github.com/intel/QATzip?tab=readme-ov-file#introductionl>`_
+
+::
+
+  +----------------+
+  | MultiFd Thread |
+  +-------+--------+
+          |
+          | compress/decompress
+  +-------+--------+
+  | QATzip library |
+  +-------+--------+
+          |
+  +-------+--------+
+  |  QAT library   |
+  +-------+--------+
+          |         user space
+  --------+---------------------
+          |         kernel space
+   +------+-------+
+   |  QAT  Driver |
+   +------+-------+
+          |
+   +------+-------+
+   | QAT Devices  |
+   +--------------+
+
+
+QATzip Installation
+-------------------
+
+The ``QATzip`` installation package has been integrated into some Linux
+distributions and can be installed directly. For example, the Ubuntu Server
+24.04 LTS system can be installed using below command
+
+.. code-block:: shell
+
+   #apt search qatzip
+   libqatzip-dev/noble 1.2.0-0ubuntu3 amd64
+     Intel QuickAssist user space library development files
+
+   libqatzip3/noble 1.2.0-0ubuntu3 amd64
+     Intel QuickAssist user space library
+
+   qatzip/noble,now 1.2.0-0ubuntu3 amd64 [installed]
+     Compression user-space tool for Intel QuickAssist Technology
+
+   #sudo apt install libqatzip-dev libqatzip3 qatzip
+
+If your system does not support the ``QATzip`` installation package, you can
+use the source code to build and install, please refer to `QATzip source code installation
+<https://github.com/intel/QATzip?tab=readme-ov-file#build-intel-quickassist-technology-driver>`_
+
+QAT Hardware Deployment
+-----------------------
+
+``QAT`` supports physical functions(PFs) and virtual functions(VFs) for
+deployment, and users can configure ``QAT`` resources for migration according
+to actual needs. For more details about ``QAT`` deployment, please refer to
+`Intel QuickAssist Technology Documentation
+<https://intel.github.io/quickassist/index.html>`_
+
+For more ``QAT`` hardware introduction, please refer to `intel-quick-assist-technology-overview
+<https://www.intel.com/content/www/us/en/architecture-and-technology/intel-quick-assist-technology-overview.html>`_
+
+How To Use QATzip Compression
+=============================
+
+1 - Install ``QATzip`` library
+
+2 - Build ``QEMU`` with ``--enable-qatzip`` parameter
+
+  E.g. configure --target-list=x86_64-softmmu --enable-kvm ``--enable-qatzip``
+
+3 - Set ``migrate_set_parameter multifd-compression qatzip``
+
+4 - Set ``migrate_set_parameter multifd-qatzip-level comp_level``, the default
+comp_level value is 1, and it supports levels from 1 to 9
+
+QAT Memory Requirements
+=======================
+
+The user needs to reserve system memory for the QAT memory management to
+allocate DMA memory. The size of the reserved system memory depends on the
+number of devices used for migration and the number of multifd channels.
+
+Because memory usage depends on QAT configuration, please refer to `QAT Memory
+Driver Queries
+<https://intel.github.io/quickassist/PG/infrastructure_debugability.html?highlight=memory>`_
+for memory usage calculation.
+
+.. list-table:: An example of a PF used for migration
+  :header-rows: 1
+
+  * - Number of channels
+    - Sender memory usage
+    - Receiver memory usage
+  * - 2
+    - 10M
+    - 10M
+  * - 4
+    - 12M
+    - 14M
+  * - 8
+    - 16M
+    - 20M
+
+How To Choose Between QATzip and QPL
+====================================
+Starting from 4th Gen Intel Xeon Scalable processors, codenamed Sapphire Rapids
+processor(``SPR``), multiple built-in accelerators are supported including
+``QAT`` and ``IAA``.  The former can accelerate ``QATzip`` and the latter is
+used to accelerate ``QPL``.
+
+Here are some suggestions:
+
+1 - If the live migration scenario is limited by network bandwidth and ``QAT``
+hardware resources exceed ``IAA``, use the ``QATzip`` method, which can save a
+lot of host CPU resources for compression.
+
+2 - If the system cannot support shared virtual memory (SVM) technology, use
+the ``QATzip`` method because ``QPL`` performance is not good without SVM
+support.
+
+3 - For other scenarios, use the ``QPL`` method first.
--- a/hw/core/qdev-properties-system.c
+++ b/hw/core/qdev-properties-system.c
@ -659,7 +659,7 @@ const PropertyInfo qdev_prop_fdc_drive_type = {
 const PropertyInfo qdev_prop_multifd_compression = {
    .name = "MultiFDCompression",
    .description = "multifd_compression values, "
-                   "none/zlib/zstd/qpl/uadk",
+                   "none/zlib/zstd/qpl/uadk/qatzip",
    .enum_table = &MultiFDCompression_lookup,
    .get = qdev_propinfo_get_enum,
    .set = qdev_propinfo_set_enum,
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@ -85,6 +85,8 @@ static Property pci_props[] = {
                    QEMU_PCIE_ERR_UNC_MASK_BITNR, true),
    DEFINE_PROP_BIT("x-pcie-ari-nextfn-1", PCIDevice, cap_present,
                    QEMU_PCIE_ARI_NEXTFN_1_BITNR, false),
+    DEFINE_PROP_SIZE32("x-max-bounce-buffer-size", PCIDevice,
+                     max_bounce_buffer_size, DEFAULT_MAX_BOUNCE_BUFFER_SIZE),
    DEFINE_PROP_END_OF_LIST()
 };

@ -1204,6 +1206,8 @@ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev,
                       "bus master container", UINT64_MAX);
    address_space_init(&pci_dev->bus_master_as,
                       &pci_dev->bus_master_container_region, pci_dev->name);
+    pci_dev->bus_master_as.max_bounce_buffer_size =
+        pci_dev->max_bounce_buffer_size;

    if (phase_check(PHASE_MACHINE_READY)) {
        pci_init_bus_master(pci_dev);
@ -2633,6 +2637,10 @@ static void pci_device_class_init(ObjectClass *klass, void *data)
    k->unrealize = pci_qdev_unrealize;
    k->bus_type = TYPE_PCI_BUS;
    device_class_set_props(k, pci_props);
+    object_class_property_set_description(
+        klass, "x-max-bounce-buffer-size",
+        "Maximum buffer size allocated for bounce buffers used for mapped "
+        "access to indirect DMA memory");
 }

 static void pci_device_class_base_init(ObjectClass *klass, void *data)
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@ -1084,13 +1084,7 @@ typedef struct AddressSpaceMapClient {
    QLIST_ENTRY(AddressSpaceMapClient) link;
 } AddressSpaceMapClient;

-typedef struct {
-    MemoryRegion *mr;
-    void *buffer;
-    hwaddr addr;
-    hwaddr len;
-    bool in_use;
-} BounceBuffer;
+#define DEFAULT_MAX_BOUNCE_BUFFER_SIZE (4096)

 /**
 * struct AddressSpace: describes a mapping of addresses to #MemoryRegion objects
@ -1110,8 +1104,10 @@ struct AddressSpace {
    QTAILQ_HEAD(, MemoryListener) listeners;
    QTAILQ_ENTRY(AddressSpace) address_spaces_link;

-    /* Bounce buffer to use for this address space. */
-    BounceBuffer bounce;
+    /* Maximum DMA bounce buffer size used for indirect memory map requests */
+    size_t max_bounce_buffer_size;
+    /* Total size of bounce buffers currently allocated, atomically accessed */
+    size_t bounce_buffer_size;
    /* List of callbacks to invoke when buffers free up */
    QemuMutex map_client_list_lock;
    QLIST_HEAD(, AddressSpaceMapClient) map_client_list;
--- a/include/exec/ramlist.h
+++ b/include/exec/ramlist.h
@ -50,6 +50,7 @@ typedef struct RAMList {
    /* RCU-enabled, writes protected by the ramlist lock. */
    QLIST_HEAD(, RAMBlock) blocks;
    DirtyMemoryBlocks *dirty_memory[DIRTY_MEMORY_NUM];
+    unsigned int num_dirty_blocks;
    uint32_t version;
    QLIST_HEAD(, RAMBlockNotifier) ramblock_notifiers;
 } RAMList;
--- a/include/hw/pci/pci_device.h
+++ b/include/hw/pci/pci_device.h
@ -167,6 +167,9 @@ struct PCIDevice {
    /* ID of standby device in net_failover pair */
    char *failover_pair_id;
    uint32_t acpi_index;
+
+    /* Maximum DMA bounce buffer size used for indirect memory map requests */
+    uint32_t max_bounce_buffer_size;
 };

 static inline int pci_intx(PCIDevice *pci_dev)
--- a/meson.build
+++ b/meson.build
@ -1262,6 +1262,14 @@ if not get_option('uadk').auto() or have_system
     uadk = declare_dependency(dependencies: [libwd, libwd_comp])
  endif
 endif
+
+qatzip = not_found
+if not get_option('qatzip').auto() or have_system
+  qatzip = dependency('qatzip', version: '>=1.1.2',
+                      required: get_option('qatzip'),
+                      method: 'pkg-config')
+endif
+
 virgl = not_found

 have_vhost_user_gpu = have_tools and host_os == 'linux' and pixman.found()
@ -2412,6 +2420,7 @@ config_host_data.set('CONFIG_STATX_MNT_ID', has_statx_mnt_id)
 config_host_data.set('CONFIG_ZSTD', zstd.found())
 config_host_data.set('CONFIG_QPL', qpl.found())
 config_host_data.set('CONFIG_UADK', uadk.found())
+config_host_data.set('CONFIG_QATZIP', qatzip.found())
 config_host_data.set('CONFIG_FUSE', fuse.found())
 config_host_data.set('CONFIG_FUSE_LSEEK', fuse_lseek.found())
 config_host_data.set('CONFIG_SPICE_PROTOCOL', spice_protocol.found())
@ -4535,6 +4544,7 @@ summary_info += {'lzfse support':     liblzfse}
 summary_info += {'zstd support':      zstd}
 summary_info += {'Query Processing Library support': qpl}
 summary_info += {'UADK Library support': uadk}
+summary_info += {'qatzip support':    qatzip}
 summary_info += {'NUMA host support': numa}
 summary_info += {'capstone':          capstone}
 summary_info += {'libpmem support':   libpmem}
--- a/meson_options.txt
+++ b/meson_options.txt
@ -261,6 +261,8 @@ option('qpl', type : 'feature', value : 'auto',
       description: 'Query Processing Library support')
 option('uadk', type : 'feature', value : 'auto',
       description: 'UADK Library support')
+option('qatzip', type: 'feature', value: 'auto',
+       description: 'QATzip compression support')
 option('fuse', type: 'feature', value: 'auto',
       description: 'FUSE block device export')
 option('fuse_lseek', type : 'feature', value : 'auto',
--- a/migration/meson.build
+++ b/migration/meson.build
@ -42,6 +42,7 @@ system_ss.add(when: rdma, if_true: files('rdma.c'))
 system_ss.add(when: zstd, if_true: files('multifd-zstd.c'))
 system_ss.add(when: qpl, if_true: files('multifd-qpl.c'))
 system_ss.add(when: uadk, if_true: files('multifd-uadk.c'))
+system_ss.add(when: qatzip, if_true: files('multifd-qatzip.c'))

 specific_ss.add(when: 'CONFIG_SYSTEM_ONLY',
                if_true: files('ram.c',
--- a/migration/migration-hmp-cmds.c
+++ b/migration/migration-hmp-cmds.c
@ -576,6 +576,10 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict)
        p->has_multifd_zlib_level = true;
        visit_type_uint8(v, param, &p->multifd_zlib_level, &err);
        break;
+    case MIGRATION_PARAMETER_MULTIFD_QATZIP_LEVEL:
+        p->has_multifd_qatzip_level = true;
+        visit_type_uint8(v, param, &p->multifd_qatzip_level, &err);
+        break;
    case MIGRATION_PARAMETER_MULTIFD_ZSTD_LEVEL:
        p->has_multifd_zstd_level = true;
        visit_type_uint8(v, param, &p->multifd_zstd_level, &err);
--- a/migration/multifd-qatzip.c
+++ b/migration/multifd-qatzip.c
@ -0,0 +1,394 @@
+/*
+ * Multifd QATzip compression implementation
+ *
+ * Copyright (c) Bytedance
+ *
+ * Authors:
+ *  Bryan Zhang <bryan.zhang@bytedance.com>
+ *  Hao Xiang <hao.xiang@bytedance.com>
+ *  Yichen Wang <yichen.wang@bytedance.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "exec/ramblock.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qapi/qapi-types-migration.h"
+#include "options.h"
+#include "multifd.h"
+#include <qatzip.h>
+
+typedef struct {
+    /*
+     * Unique session for use with QATzip API
+     */
+    QzSession_T sess;
+
+    /*
+     * For compression: Buffer for pages to compress
+     * For decompression: Buffer for data to decompress
+     */
+    uint8_t *in_buf;
+    uint32_t in_len;
+
+    /*
+     * For compression: Output buffer of compressed data
+     * For decompression: Output buffer of decompressed data
+     */
+    uint8_t *out_buf;
+    uint32_t out_len;
+} QatzipData;
+
+/**
+ * qatzip_send_setup: Set up QATzip session and private buffers.
+ *
+ * @param p    Multifd channel params
+ * @param errp Pointer to error, which will be set in case of error
+ * @return     0 on success, -1 on error (and *errp will be set)
+ */
+static int qatzip_send_setup(MultiFDSendParams *p, Error **errp)
+{
+    QatzipData *q;
+    QzSessionParamsDeflate_T params;
+    const char *err_msg;
+    int ret;
+
+    q = g_new0(QatzipData, 1);
+    p->compress_data = q;
+    /* We need one extra place for the packet header */
+    p->iov = g_new0(struct iovec, 2);
+
+    /*
+     * Initialize QAT device with software fallback by default. This allows
+     * QATzip to use CPU path when QAT hardware reaches maximum throughput.
+     */
+    ret = qzInit(&q->sess, true);
+    if (ret != QZ_OK && ret != QZ_DUPLICATE) {
+        err_msg = "qzInit failed";
+        goto err;
+    }
+
+    ret = qzGetDefaultsDeflate(&params);
+    if (ret != QZ_OK) {
+        err_msg = "qzGetDefaultsDeflate failed";
+        goto err;
+    }
+
+    /* Make sure to use configured QATzip compression level. */
+    params.common_params.comp_lvl = migrate_multifd_qatzip_level();
+    ret = qzSetupSessionDeflate(&q->sess, &params);
+    if (ret != QZ_OK && ret != QZ_DUPLICATE) {
+        err_msg = "qzSetupSessionDeflate failed";
+        goto err;
+    }
+
+    if (MULTIFD_PACKET_SIZE > UINT32_MAX) {
+        err_msg = "packet size too large for QAT";
+        goto err;
+    }
+
+    q->in_len = MULTIFD_PACKET_SIZE;
+    /*
+     * PINNED_MEM is an enum from qatzip headers, which means to use
+     * kzalloc_node() to allocate memory for QAT DMA purposes. When QAT device
+     * is not available or software fallback is used, the malloc flag needs to
+     * be set as COMMON_MEM.
+     */
+    q->in_buf = qzMalloc(q->in_len, 0, PINNED_MEM);
+    if (!q->in_buf) {
+        q->in_buf = qzMalloc(q->in_len, 0, COMMON_MEM);
+        if (!q->in_buf) {
+            err_msg = "qzMalloc failed";
+            goto err;
+        }
+    }
+
+    q->out_len = qzMaxCompressedLength(MULTIFD_PACKET_SIZE, &q->sess);
+    q->out_buf = qzMalloc(q->out_len, 0, PINNED_MEM);
+    if (!q->out_buf) {
+        q->out_buf = qzMalloc(q->out_len, 0, COMMON_MEM);
+        if (!q->out_buf) {
+            err_msg = "qzMalloc failed";
+            goto err;
+        }
+    }
+
+    return 0;
+
+err:
+    error_setg(errp, "multifd %u: [sender] %s", p->id, err_msg);
+    return -1;
+}
+
+/**
+ * qatzip_send_cleanup: Tear down QATzip session and release private buffers.
+ *
+ * @param p    Multifd channel params
+ * @param errp Pointer to error, which will be set in case of error
+ * @return     None
+ */
+static void qatzip_send_cleanup(MultiFDSendParams *p, Error **errp)
+{
+    QatzipData *q = p->compress_data;
+
+    if (q) {
+        if (q->in_buf) {
+            qzFree(q->in_buf);
+        }
+        if (q->out_buf) {
+            qzFree(q->out_buf);
+        }
+        (void)qzTeardownSession(&q->sess);
+        (void)qzClose(&q->sess);
+        g_free(q);
+    }
+
+    g_free(p->iov);
+    p->iov = NULL;
+    p->compress_data = NULL;
+}
+
+/**
+ * qatzip_send_prepare: Compress pages and update IO channel info.
+ *
+ * @param p    Multifd channel params
+ * @param errp Pointer to error, which will be set in case of error
+ * @return     0 on success, -1 on error (and *errp will be set)
+ */
+static int qatzip_send_prepare(MultiFDSendParams *p, Error **errp)
+{
+    MultiFDPages_t *pages = p->pages;
+    QatzipData *q = p->compress_data;
+    int ret;
+    unsigned int in_len, out_len;
+
+    if (!multifd_send_prepare_common(p)) {
+        goto out;
+    }
+
+    /*
+     * Unlike other multifd compression implementations, we use a non-streaming
+     * API and place all the data into one buffer, rather than sending each
+     * page to the compression API at a time. Based on initial benchmarks, the
+     * non-streaming API outperforms the streaming API. Plus, the logic in QEMU
+     * is friendly to using the non-streaming API anyway. If either of these
+     * statements becomes no longer true, we can revisit adding a streaming
+     * implementation.
+     */
+    for (int i = 0; i < pages->normal_num; i++) {
+        memcpy(q->in_buf + (i * p->page_size),
+               pages->block->host + pages->offset[i],
+               p->page_size);
+    }
+
+    in_len = pages->normal_num * p->page_size;
+    if (in_len > q->in_len) {
+        error_setg(errp, "multifd %u: unexpectedly large input", p->id);
+        return -1;
+    }
+    out_len = q->out_len;
+
+    ret = qzCompress(&q->sess, q->in_buf, &in_len, q->out_buf, &out_len, 1);
+    if (ret != QZ_OK) {
+        error_setg(errp, "multifd %u: QATzip returned %d instead of QZ_OK",
+                   p->id, ret);
+        return -1;
+    }
+    if (in_len != pages->normal_num * p->page_size) {
+        error_setg(errp, "multifd %u: QATzip failed to compress all input",
+                   p->id);
+        return -1;
+    }
+
+    p->iov[p->iovs_num].iov_base = q->out_buf;
+    p->iov[p->iovs_num].iov_len = out_len;
+    p->iovs_num++;
+    p->next_packet_size = out_len;
+
+out:
+    p->flags |= MULTIFD_FLAG_QATZIP;
+    multifd_send_fill_packet(p);
+    return 0;
+}
+
+/**
+ * qatzip_recv_setup: Set up QATzip session and allocate private buffers.
+ *
+ * @param p    Multifd channel params
+ * @param errp Pointer to error, which will be set in case of error
+ * @return     0 on success, -1 on error (and *errp will be set)
+ */
+static int qatzip_recv_setup(MultiFDRecvParams *p, Error **errp)
+{
+    QatzipData *q;
+    QzSessionParamsDeflate_T params;
+    const char *err_msg;
+    int ret;
+
+    q = g_new0(QatzipData, 1);
+    p->compress_data = q;
+
+    /*
+     * Initialize QAT device with software fallback by default. This allows
+     * QATzip to use CPU path when QAT hardware reaches maximum throughput.
+     */
+    ret = qzInit(&q->sess, true);
+    if (ret != QZ_OK && ret != QZ_DUPLICATE) {
+        err_msg = "qzInit failed";
+        goto err;
+    }
+
+    ret = qzGetDefaultsDeflate(&params);
+    if (ret != QZ_OK) {
+        err_msg = "qzGetDefaultsDeflate failed";
+        goto err;
+    }
+
+    ret = qzSetupSessionDeflate(&q->sess, &params);
+    if (ret != QZ_OK && ret != QZ_DUPLICATE) {
+        err_msg = "qzSetupSessionDeflate failed";
+        goto err;
+    }
+
+    /*
+     * Reserve extra spaces for the incoming packets. Current implementation
+     * doesn't send uncompressed pages in case the compression gets too big.
+     */
+    q->in_len = MULTIFD_PACKET_SIZE * 2;
+    /*
+     * PINNED_MEM is an enum from qatzip headers, which means to use
+     * kzalloc_node() to allocate memory for QAT DMA purposes. When QAT device
+     * is not available or software fallback is used, the malloc flag needs to
+     * be set as COMMON_MEM.
+     */
+    q->in_buf = qzMalloc(q->in_len, 0, PINNED_MEM);
+    if (!q->in_buf) {
+        q->in_buf = qzMalloc(q->in_len, 0, COMMON_MEM);
+        if (!q->in_buf) {
+            err_msg = "qzMalloc failed";
+            goto err;
+        }
+    }
+
+    q->out_len = MULTIFD_PACKET_SIZE;
+    q->out_buf = qzMalloc(q->out_len, 0, PINNED_MEM);
+    if (!q->out_buf) {
+        q->out_buf = qzMalloc(q->out_len, 0, COMMON_MEM);
+        if (!q->out_buf) {
+            err_msg = "qzMalloc failed";
+            goto err;
+        }
+    }
+
+    return 0;
+
+err:
+    error_setg(errp, "multifd %u: [receiver] %s", p->id, err_msg);
+    return -1;
+}
+
+/**
+ * qatzip_recv_cleanup: Tear down QATzip session and release private buffers.
+ *
+ * @param p    Multifd channel params
+ * @return     None
+ */
+static void qatzip_recv_cleanup(MultiFDRecvParams *p)
+{
+    QatzipData *q = p->compress_data;
+
+    if (q) {
+        if (q->in_buf) {
+            qzFree(q->in_buf);
+        }
+        if (q->out_buf) {
+            qzFree(q->out_buf);
+        }
+        (void)qzTeardownSession(&q->sess);
+        (void)qzClose(&q->sess);
+        g_free(q);
+    }
+    p->compress_data = NULL;
+}
+
+
+/**
+ * qatzip_recv: Decompress pages and copy them to the appropriate
+ * locations.
+ *
+ * @param p    Multifd channel params
+ * @param errp Pointer to error, which will be set in case of error
+ * @return     0 on success, -1 on error (and *errp will be set)
+ */
+static int qatzip_recv(MultiFDRecvParams *p, Error **errp)
+{
+    QatzipData *q = p->compress_data;
+    int ret;
+    unsigned int in_len, out_len;
+    uint32_t in_size = p->next_packet_size;
+    uint32_t expected_size = p->normal_num * p->page_size;
+    uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK;
+
+    if (in_size > q->in_len) {
+        error_setg(errp, "multifd %u: received unexpectedly large packet",
+                   p->id);
+        return -1;
+    }
+
+    if (flags != MULTIFD_FLAG_QATZIP) {
+        error_setg(errp, "multifd %u: flags received %x flags expected %x",
+                   p->id, flags, MULTIFD_FLAG_QATZIP);
+        return -1;
+    }
+
+    multifd_recv_zero_page_process(p);
+    if (!p->normal_num) {
+        assert(in_size == 0);
+        return 0;
+    }
+
+    ret = qio_channel_read_all(p->c, (void *)q->in_buf, in_size, errp);
+    if (ret != 0) {
+        return ret;
+    }
+
+    in_len = in_size;
+    out_len = q->out_len;
+    ret = qzDecompress(&q->sess, q->in_buf, &in_len, q->out_buf, &out_len);
+    if (ret != QZ_OK) {
+        error_setg(errp, "multifd %u: qzDecompress failed", p->id);
+        return -1;
+    }
+    if (out_len != expected_size) {
+        error_setg(errp, "multifd %u: packet size received %u size expected %u",
+                   p->id, out_len, expected_size);
+        return -1;
+    }
+
+    /* Copy each page to its appropriate location. */
+    for (int i = 0; i < p->normal_num; i++) {
+        memcpy(p->host + p->normal[i],
+               q->out_buf + p->page_size * i,
+               p->page_size);
+    }
+    return 0;
+}
+
+static MultiFDMethods multifd_qatzip_ops = {
+    .send_setup = qatzip_send_setup,
+    .send_cleanup = qatzip_send_cleanup,
+    .send_prepare = qatzip_send_prepare,
+    .recv_setup = qatzip_recv_setup,
+    .recv_cleanup = qatzip_recv_cleanup,
+    .recv = qatzip_recv
+};
+
+static void multifd_qatzip_register(void)
+{
+    multifd_register_ops(MULTIFD_COMPRESSION_QATZIP, &multifd_qatzip_ops);
+}
+
+migration_init(multifd_qatzip_register);
--- a/migration/multifd.h
+++ b/migration/multifd.h
@ -36,14 +36,15 @@ MultiFDRecvData *multifd_get_recv_data(void);
 /* Multifd Compression flags */
 #define MULTIFD_FLAG_SYNC (1 << 0)

-/* We reserve 4 bits for compression methods */
-#define MULTIFD_FLAG_COMPRESSION_MASK (0xf << 1)
+/* We reserve 5 bits for compression methods */
+#define MULTIFD_FLAG_COMPRESSION_MASK (0x1f << 1)
 /* we need to be compatible. Before compression value was 0 */
 #define MULTIFD_FLAG_NOCOMP (0 << 1)
 #define MULTIFD_FLAG_ZLIB (1 << 1)
 #define MULTIFD_FLAG_ZSTD (2 << 1)
 #define MULTIFD_FLAG_QPL (4 << 1)
 #define MULTIFD_FLAG_UADK (8 << 1)
+#define MULTIFD_FLAG_QATZIP (16 << 1)

 /* This value needs to be a multiple of qemu_target_page_size() */
 #define MULTIFD_PACKET_SIZE (512 * 1024)
--- a/migration/options.c
+++ b/migration/options.c
@ -55,6 +55,13 @@
 #define DEFAULT_MIGRATE_MULTIFD_COMPRESSION MULTIFD_COMPRESSION_NONE
 /* 0: means nocompress, 1: best speed, ... 9: best compress ratio */
 #define DEFAULT_MIGRATE_MULTIFD_ZLIB_LEVEL 1
+/*
+ * 1: best speed, ... 9: best compress ratio
+ * There is some nuance here. Refer to QATzip documentation to understand
+ * the mapping of QATzip levels to standard deflate levels.
+ */
+#define DEFAULT_MIGRATE_MULTIFD_QATZIP_LEVEL 1
+
 /* 0: means nocompress, 1: best speed, ... 20: best compress ratio */
 #define DEFAULT_MIGRATE_MULTIFD_ZSTD_LEVEL 1

@ -123,6 +130,9 @@ Property migration_properties[] = {
    DEFINE_PROP_UINT8("multifd-zlib-level", MigrationState,
                      parameters.multifd_zlib_level,
                      DEFAULT_MIGRATE_MULTIFD_ZLIB_LEVEL),
+    DEFINE_PROP_UINT8("multifd-qatzip-level", MigrationState,
+                      parameters.multifd_qatzip_level,
+                      DEFAULT_MIGRATE_MULTIFD_QATZIP_LEVEL),
    DEFINE_PROP_UINT8("multifd-zstd-level", MigrationState,
                      parameters.multifd_zstd_level,
                      DEFAULT_MIGRATE_MULTIFD_ZSTD_LEVEL),
@ -787,6 +797,13 @@ int migrate_multifd_zlib_level(void)
    return s->parameters.multifd_zlib_level;
 }

+int migrate_multifd_qatzip_level(void)
+{
+    MigrationState *s = migrate_get_current();
+
+    return s->parameters.multifd_qatzip_level;
+}
+
 int migrate_multifd_zstd_level(void)
 {
    MigrationState *s = migrate_get_current();
@ -892,6 +909,8 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp)
    params->multifd_compression = s->parameters.multifd_compression;
    params->has_multifd_zlib_level = true;
    params->multifd_zlib_level = s->parameters.multifd_zlib_level;
+    params->has_multifd_qatzip_level = true;
+    params->multifd_qatzip_level = s->parameters.multifd_qatzip_level;
    params->has_multifd_zstd_level = true;
    params->multifd_zstd_level = s->parameters.multifd_zstd_level;
    params->has_xbzrle_cache_size = true;
@ -946,6 +965,7 @@ void migrate_params_init(MigrationParameters *params)
    params->has_multifd_channels = true;
    params->has_multifd_compression = true;
    params->has_multifd_zlib_level = true;
+    params->has_multifd_qatzip_level = true;
    params->has_multifd_zstd_level = true;
    params->has_xbzrle_cache_size = true;
    params->has_max_postcopy_bandwidth = true;
@ -1038,6 +1058,14 @@ bool migrate_params_check(MigrationParameters *params, Error **errp)
        return false;
    }

+    if (params->has_multifd_qatzip_level &&
+        ((params->multifd_qatzip_level > 9) ||
+        (params->multifd_qatzip_level < 1))) {
+        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_qatzip_level",
+                   "a value between 1 and 9");
+        return false;
+    }
+
    if (params->has_multifd_zstd_level &&
        (params->multifd_zstd_level > 20)) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_zstd_level",
@ -1195,6 +1223,9 @@ static void migrate_params_test_apply(MigrateSetParameters *params,
    if (params->has_multifd_compression) {
        dest->multifd_compression = params->multifd_compression;
    }
+    if (params->has_multifd_qatzip_level) {
+        dest->multifd_qatzip_level = params->multifd_qatzip_level;
+    }
    if (params->has_multifd_zlib_level) {
        dest->multifd_zlib_level = params->multifd_zlib_level;
    }
@ -1315,6 +1346,9 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp)
    if (params->has_multifd_compression) {
        s->parameters.multifd_compression = params->multifd_compression;
    }
+    if (params->has_multifd_qatzip_level) {
+        s->parameters.multifd_qatzip_level = params->multifd_qatzip_level;
+    }
    if (params->has_multifd_zlib_level) {
        s->parameters.multifd_zlib_level = params->multifd_zlib_level;
    }
--- a/migration/options.h
+++ b/migration/options.h
@ -78,6 +78,7 @@ uint64_t migrate_max_postcopy_bandwidth(void);
 int migrate_multifd_channels(void);
 MultiFDCompression migrate_multifd_compression(void);
 int migrate_multifd_zlib_level(void);
+int migrate_multifd_qatzip_level(void);
 int migrate_multifd_zstd_level(void);
 uint8_t migrate_throttle_trigger_threshold(void);
 const char *migrate_tls_authz(void);
--- a/qapi/migration.json
+++ b/qapi/migration.json
@ -561,6 +561,8 @@
 #
 # @zstd: use zstd compression method.
 #
+# @qatzip: use qatzip compression method.  (Since 9.2)
+#
 # @qpl: use qpl compression method.  Query Processing Library(qpl) is
 #     based on the deflate compression algorithm and use the Intel
 #     In-Memory Analytics Accelerator(IAA) accelerated compression and
@ -573,6 +575,7 @@
 { 'enum': 'MultiFDCompression',
  'data': [ 'none', 'zlib',
            { 'name': 'zstd', 'if': 'CONFIG_ZSTD' },
+            { 'name': 'qatzip', 'if': 'CONFIG_QATZIP'},
            { 'name': 'qpl', 'if': 'CONFIG_QPL' },
            { 'name': 'uadk', 'if': 'CONFIG_UADK' } ] }

@ -792,6 +795,11 @@
 #     speed, and 9 means best compression ratio which will consume
 #     more CPU.  Defaults to 1.  (Since 5.0)
 #
+# @multifd-qatzip-level: Set the compression level to be used in live
+#     migration. The level is an integer between 1 and 9, where 1 means
+#     the best compression speed, and 9 means the best compression
+#     ratio which will consume more CPU. Defaults to 1.  (Since 9.2)
+#
 # @multifd-zstd-level: Set the compression level to be used in live
 #     migration, the compression level is an integer between 0 and 20,
 #     where 0 means no compression, 1 means the best compression
@ -852,6 +860,7 @@
           'xbzrle-cache-size', 'max-postcopy-bandwidth',
           'max-cpu-throttle', 'multifd-compression',
           'multifd-zlib-level', 'multifd-zstd-level',
+           'multifd-qatzip-level',
           'block-bitmap-mapping',
           { 'name': 'x-vcpu-dirty-limit-period', 'features': ['unstable'] },
           'vcpu-dirty-limit',
@ -967,6 +976,11 @@
 #     speed, and 9 means best compression ratio which will consume
 #     more CPU.  Defaults to 1.  (Since 5.0)
 #
+# @multifd-qatzip-level: Set the compression level to be used in live
+#     migration. The level is an integer between 1 and 9, where 1 means
+#     the best compression speed, and 9 means the best compression
+#     ratio which will consume more CPU. Defaults to 1.  (Since 9.2)
+#
 # @multifd-zstd-level: Set the compression level to be used in live
 #     migration, the compression level is an integer between 0 and 20,
 #     where 0 means no compression, 1 means the best compression
@ -1040,6 +1054,7 @@
            '*max-cpu-throttle': 'uint8',
            '*multifd-compression': 'MultiFDCompression',
            '*multifd-zlib-level': 'uint8',
+            '*multifd-qatzip-level': 'uint8',
            '*multifd-zstd-level': 'uint8',
            '*block-bitmap-mapping': [ 'BitmapMigrationNodeAlias' ],
            '*x-vcpu-dirty-limit-period': { 'type': 'uint64',
@ -1171,6 +1186,11 @@
 #     speed, and 9 means best compression ratio which will consume
 #     more CPU.  Defaults to 1.  (Since 5.0)
 #
+# @multifd-qatzip-level: Set the compression level to be used in live
+#     migration. The level is an integer between 1 and 9, where 1 means
+#     the best compression speed, and 9 means the best compression
+#     ratio which will consume more CPU. Defaults to 1.  (Since 9.2)
+#
 # @multifd-zstd-level: Set the compression level to be used in live
 #     migration, the compression level is an integer between 0 and 20,
 #     where 0 means no compression, 1 means the best compression
@ -1241,6 +1261,7 @@
            '*max-cpu-throttle': 'uint8',
            '*multifd-compression': 'MultiFDCompression',
            '*multifd-zlib-level': 'uint8',
+            '*multifd-qatzip-level': 'uint8',
            '*multifd-zstd-level': 'uint8',
            '*block-bitmap-mapping': [ 'BitmapMigrationNodeAlias' ],
            '*x-vcpu-dirty-limit-period': { 'type': 'uint64',
--- a/scripts/meson-buildoptions.sh
+++ b/scripts/meson-buildoptions.sh
@ -163,6 +163,7 @@ meson_options_help() {
  printf "%s\n" '  pixman          pixman support'
  printf "%s\n" '  plugins         TCG plugins via shared library loading'
  printf "%s\n" '  png             PNG support with libpng'
+  printf "%s\n" '  qatzip          QATzip compression support'
  printf "%s\n" '  qcow1           qcow1 image format support'
  printf "%s\n" '  qed             qed image format support'
  printf "%s\n" '  qga-vss         build QGA VSS support (broken with MinGW)'
@ -427,6 +428,8 @@ _meson_option_parse() {
    --enable-png) printf "%s" -Dpng=enabled ;;
    --disable-png) printf "%s" -Dpng=disabled ;;
    --prefix=*) quote_sh "-Dprefix=$2" ;;
+    --enable-qatzip) printf "%s" -Dqatzip=enabled ;;
+    --disable-qatzip) printf "%s" -Dqatzip=disabled ;;
    --enable-qcow1) printf "%s" -Dqcow1=enabled ;;
    --disable-qcow1) printf "%s" -Dqcow1=disabled ;;
    --enable-qed) printf "%s" -Dqed=enabled ;;
--- a/system/cpu-throttle.c
+++ b/system/cpu-throttle.c
@ -28,6 +28,7 @@
 #include "qemu/main-loop.h"
 #include "sysemu/cpus.h"
 #include "sysemu/cpu-throttle.h"
+#include "trace.h"

 /* vcpu throttling controls */
 static QEMUTimer *throttle_timer;
@ -95,6 +96,8 @@ void cpu_throttle_set(int new_throttle_pct)
     */
    bool throttle_active = cpu_throttle_active();

+    trace_cpu_throttle_set(new_throttle_pct);
+
    /* Ensure throttle percentage is within valid range */
    new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
    new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
--- a/system/memory.c
+++ b/system/memory.c
@ -3148,7 +3148,8 @@ void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name)
    as->ioeventfds = NULL;
    QTAILQ_INIT(&as->listeners);
    QTAILQ_INSERT_TAIL(&address_spaces, as, address_spaces_link);
-    as->bounce.in_use = false;
+    as->max_bounce_buffer_size = DEFAULT_MAX_BOUNCE_BUFFER_SIZE;
+    as->bounce_buffer_size = 0;
    qemu_mutex_init(&as->map_client_list_lock);
    QLIST_INIT(&as->map_client_list);
    as->name = g_strdup(name ? name : "anonymous");
@ -3158,7 +3159,7 @@ void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name)

 static void do_address_space_destroy(AddressSpace *as)
 {
-    assert(!qatomic_read(&as->bounce.in_use));
+    assert(qatomic_read(&as->bounce_buffer_size) == 0);
    assert(QLIST_EMPTY(&as->map_client_list));
    qemu_mutex_destroy(&as->map_client_list_lock);

--- a/system/physmem.c
+++ b/system/physmem.c
@ -1534,18 +1534,6 @@ static ram_addr_t find_ram_offset(ram_addr_t size)
    return offset;
 }

-static unsigned long last_ram_page(void)
-{
-    RAMBlock *block;
-    ram_addr_t last = 0;
-
-    RCU_READ_LOCK_GUARD();
-    RAMBLOCK_FOREACH(block) {
-        last = MAX(last, block->offset + block->max_length);
-    }
-    return last >> TARGET_PAGE_BITS;
-}
-
 static void qemu_ram_setup_dump(void *addr, ram_addr_t size)
 {
    int ret;
@ -1799,13 +1787,11 @@ void qemu_ram_msync(RAMBlock *block, ram_addr_t start, ram_addr_t length)
 }

 /* Called with ram_list.mutex held */
-static void dirty_memory_extend(ram_addr_t old_ram_size,
-                                ram_addr_t new_ram_size)
+static void dirty_memory_extend(ram_addr_t new_ram_size)
 {
-    ram_addr_t old_num_blocks = DIV_ROUND_UP(old_ram_size,
-                                             DIRTY_MEMORY_BLOCK_SIZE);
-    ram_addr_t new_num_blocks = DIV_ROUND_UP(new_ram_size,
-                                             DIRTY_MEMORY_BLOCK_SIZE);
+    unsigned int old_num_blocks = ram_list.num_dirty_blocks;
+    unsigned int new_num_blocks = DIV_ROUND_UP(new_ram_size,
+                                               DIRTY_MEMORY_BLOCK_SIZE);
    int i;

    /* Only need to extend if block count increased */
@ -1837,6 +1823,8 @@ static void dirty_memory_extend(ram_addr_t old_ram_size,
            g_free_rcu(old_blocks, rcu);
        }
    }
+
+    ram_list.num_dirty_blocks = new_num_blocks;
 }

 static void ram_block_add(RAMBlock *new_block, Error **errp)
@ -1846,11 +1834,9 @@ static void ram_block_add(RAMBlock *new_block, Error **errp)
    RAMBlock *block;
    RAMBlock *last_block = NULL;
    bool free_on_error = false;
-    ram_addr_t old_ram_size, new_ram_size;
+    ram_addr_t ram_size;
    Error *err = NULL;

-    old_ram_size = last_ram_page();
-
    qemu_mutex_lock_ramlist();
    new_block->offset = find_ram_offset(new_block->max_length);

@ -1901,11 +1887,8 @@ static void ram_block_add(RAMBlock *new_block, Error **errp)
        }
    }

-    new_ram_size = MAX(old_ram_size,
-              (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS);
-    if (new_ram_size > old_ram_size) {
-        dirty_memory_extend(old_ram_size, new_ram_size);
-    }
+    ram_size = (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS;
+    dirty_memory_extend(ram_size);
    /* Keep the list sorted from biggest to smallest block.  Unlike QTAILQ,
     * QLIST (which has an RCU-friendly variant) does not have insertion at
     * tail, so save the last element in last_block.
@ -3095,6 +3078,20 @@ void cpu_flush_icache_range(hwaddr start, hwaddr len)
                                     NULL, len, FLUSH_CACHE);
 }

+/*
+ * A magic value stored in the first 8 bytes of the bounce buffer struct. Used
+ * to detect illegal pointers passed to address_space_unmap.
+ */
+#define BOUNCE_BUFFER_MAGIC 0xb4017ceb4ffe12ed
+
+typedef struct {
+    uint64_t magic;
+    MemoryRegion *mr;
+    hwaddr addr;
+    size_t len;
+    uint8_t buffer[];
+} BounceBuffer;
+
 static void
 address_space_unregister_map_client_do(AddressSpaceMapClient *client)
 {
@ -3120,9 +3117,9 @@ void address_space_register_map_client(AddressSpace *as, QEMUBH *bh)
    QEMU_LOCK_GUARD(&as->map_client_list_lock);
    client->bh = bh;
    QLIST_INSERT_HEAD(&as->map_client_list, client, link);
-    /* Write map_client_list before reading in_use.  */
+    /* Write map_client_list before reading bounce_buffer_size. */
    smp_mb();
-    if (!qatomic_read(&as->bounce.in_use)) {
+    if (qatomic_read(&as->bounce_buffer_size) < as->max_bounce_buffer_size) {
        address_space_notify_map_clients_locked(as);
    }
 }
@ -3251,28 +3248,40 @@ void *address_space_map(AddressSpace *as,
    mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs);

    if (!memory_access_is_direct(mr, is_write)) {
-        if (qatomic_xchg(&as->bounce.in_use, true)) {
+        size_t used = qatomic_read(&as->bounce_buffer_size);
+        for (;;) {
+            hwaddr alloc = MIN(as->max_bounce_buffer_size - used, l);
+            size_t new_size = used + alloc;
+            size_t actual =
+                qatomic_cmpxchg(&as->bounce_buffer_size, used, new_size);
+            if (actual == used) {
+                l = alloc;
+                break;
+            }
+            used = actual;
+        }
+
+        if (l == 0) {
            *plen = 0;
            return NULL;
        }
-        /* Avoid unbounded allocations */
-        l = MIN(l, TARGET_PAGE_SIZE);
-        as->bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l);
-        as->bounce.addr = addr;
-        as->bounce.len = l;

+        BounceBuffer *bounce = g_malloc0(l + sizeof(BounceBuffer));
+        bounce->magic = BOUNCE_BUFFER_MAGIC;
        memory_region_ref(mr);
-        as->bounce.mr = mr;
+        bounce->mr = mr;
+        bounce->addr = addr;
+        bounce->len = l;
+
        if (!is_write) {
            flatview_read(fv, addr, MEMTXATTRS_UNSPECIFIED,
-                          as->bounce.buffer, l);
+                          bounce->buffer, l);
        }

        *plen = l;
-        return as->bounce.buffer;
+        return bounce->buffer;
    }

-
    memory_region_ref(mr);
    *plen = flatview_extend_translation(fv, addr, len, mr, xlat,
                                        l, is_write, attrs);
@ -3287,12 +3296,11 @@ void *address_space_map(AddressSpace *as,
 void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
                         bool is_write, hwaddr access_len)
 {
-    if (buffer != as->bounce.buffer) {
-        MemoryRegion *mr;
-        ram_addr_t addr1;
+    MemoryRegion *mr;
+    ram_addr_t addr1;

-        mr = memory_region_from_host(buffer, &addr1);
-        assert(mr != NULL);
+    mr = memory_region_from_host(buffer, &addr1);
+    if (mr != NULL) {
        if (is_write) {
            invalidate_and_set_dirty(mr, addr1, access_len);
        }
@ -3302,15 +3310,22 @@ void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
        memory_region_unref(mr);
        return;
    }
+
+
+    BounceBuffer *bounce = container_of(buffer, BounceBuffer, buffer);
+    assert(bounce->magic == BOUNCE_BUFFER_MAGIC);
+
    if (is_write) {
-        address_space_write(as, as->bounce.addr, MEMTXATTRS_UNSPECIFIED,
-                            as->bounce.buffer, access_len);
+        address_space_write(as, bounce->addr, MEMTXATTRS_UNSPECIFIED,
+                            bounce->buffer, access_len);
    }
-    qemu_vfree(as->bounce.buffer);
-    as->bounce.buffer = NULL;
-    memory_region_unref(as->bounce.mr);
-    /* Clear in_use before reading map_client_list.  */
-    qatomic_set_mb(&as->bounce.in_use, false);
+
+    qatomic_sub(&as->bounce_buffer_size, bounce->len);
+    bounce->magic = ~BOUNCE_BUFFER_MAGIC;
+    memory_region_unref(bounce->mr);
+    g_free(bounce);
+    /* Write bounce_buffer_size before reading map_client_list. */
+    smp_mb();
    address_space_notify_map_clients(as);
 }

--- a/system/trace-events
+++ b/system/trace-events
@ -44,3 +44,6 @@ dirtylimit_state_finalize(void)
 dirtylimit_throttle_pct(int cpu_index, uint64_t pct, int64_t time_us) "CPU[%d] throttle percent: %" PRIu64 ", throttle adjust time %"PRIi64 " us"
 dirtylimit_set_vcpu(int cpu_index, uint64_t quota) "CPU[%d] set dirty page rate limit %"PRIu64
 dirtylimit_vcpu_execute(int cpu_index, int64_t sleep_time_us) "CPU[%d] sleep %"PRIi64 " us"
+
+# cpu-throttle.c
+cpu_throttle_set(int new_throttle_pct)  "set guest CPU throttled by %d%%"
--- a/tests/qtest/migration-test.c
+++ b/tests/qtest/migration-test.c
@ -2920,6 +2920,18 @@ test_migrate_precopy_tcp_multifd_zstd_start(QTestState *from,
 }
 #endif /* CONFIG_ZSTD */

+#ifdef CONFIG_QATZIP
+static void *
+test_migrate_precopy_tcp_multifd_qatzip_start(QTestState *from,
+                                              QTestState *to)
+{
+    migrate_set_parameter_int(from, "multifd-qatzip-level", 2);
+    migrate_set_parameter_int(to, "multifd-qatzip-level", 2);
+
+    return test_migrate_precopy_tcp_multifd_start_common(from, to, "qatzip");
+}
+#endif
+
 #ifdef CONFIG_QPL
 static void *
 test_migrate_precopy_tcp_multifd_qpl_start(QTestState *from,
@ -3017,6 +3029,17 @@ static void test_multifd_tcp_zstd(void)
 }
 #endif

+#ifdef CONFIG_QATZIP
+static void test_multifd_tcp_qatzip(void)
+{
+    MigrateCommon args = {
+        .listen_uri = "defer",
+        .start_hook = test_migrate_precopy_tcp_multifd_qatzip_start,
+    };
+    test_precopy_common(&args);
+}
+#endif
+
 #ifdef CONFIG_QPL
 static void test_multifd_tcp_qpl(void)
 {
@ -3922,6 +3945,10 @@ int main(int argc, char **argv)
    migration_test_add("/migration/multifd/tcp/plain/zstd",
                       test_multifd_tcp_zstd);
 #endif
+#ifdef CONFIG_QATZIP
+    migration_test_add("/migration/multifd/tcp/plain/qatzip",
+                test_multifd_tcp_qatzip);
+#endif
 #ifdef CONFIG_QPL
    migration_test_add("/migration/multifd/tcp/plain/qpl",
                       test_multifd_tcp_qpl);