mirror of https://github.com/xemu-project/xemu.git
Merge remote-tracking branch 'quintela/migration.next' into staging
# By Michael R. Hines (9) and others # Via Juan Quintela * quintela/migration.next: rdma: introduce capability x-rdma-pin-all rdma: new QEMUFileOps hooks rdma: introduce qemu_ram_foreach_block() rdma: export qemu_fflush() rdma: introduce qemu_file_mode_is_not_valid() rdma: export throughput w/ MigrationStats QMP rdma: export yield_until_fd_readable() rdma: introduce qemu_update_position() rdma: add documentation migration: do not overwrite zero pages Revert "migration: do not sent zero pages in bulk stage" arch_init/ram_load: add error message for block length mismatch Message-id: 1372329455-5995-1-git-send-email-quintela@redhat.com Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
This commit is contained in:
commit
c394ace828
22
arch_init.c
22
arch_init.c
|
@ -457,15 +457,10 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
|
||||||
bytes_sent = -1;
|
bytes_sent = -1;
|
||||||
if (is_zero_page(p)) {
|
if (is_zero_page(p)) {
|
||||||
acct_info.dup_pages++;
|
acct_info.dup_pages++;
|
||||||
if (!ram_bulk_stage) {
|
|
||||||
bytes_sent = save_block_hdr(f, block, offset, cont,
|
bytes_sent = save_block_hdr(f, block, offset, cont,
|
||||||
RAM_SAVE_FLAG_COMPRESS);
|
RAM_SAVE_FLAG_COMPRESS);
|
||||||
qemu_put_byte(f, 0);
|
qemu_put_byte(f, 0);
|
||||||
bytes_sent++;
|
bytes_sent++;
|
||||||
} else {
|
|
||||||
acct_info.skipped_pages++;
|
|
||||||
bytes_sent = 0;
|
|
||||||
}
|
|
||||||
} else if (!ram_bulk_stage && migrate_use_xbzrle()) {
|
} else if (!ram_bulk_stage && migrate_use_xbzrle()) {
|
||||||
current_addr = block->offset + offset;
|
current_addr = block->offset + offset;
|
||||||
bytes_sent = save_xbzrle_page(f, p, current_addr, block,
|
bytes_sent = save_xbzrle_page(f, p, current_addr, block,
|
||||||
|
@ -498,6 +493,18 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
|
||||||
|
|
||||||
static uint64_t bytes_transferred;
|
static uint64_t bytes_transferred;
|
||||||
|
|
||||||
|
void acct_update_position(QEMUFile *f, size_t size, bool zero)
|
||||||
|
{
|
||||||
|
uint64_t pages = size / TARGET_PAGE_SIZE;
|
||||||
|
if (zero) {
|
||||||
|
acct_info.dup_pages += pages;
|
||||||
|
} else {
|
||||||
|
acct_info.norm_pages += pages;
|
||||||
|
bytes_transferred += size;
|
||||||
|
qemu_update_position(f, size);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static ram_addr_t ram_save_remaining(void)
|
static ram_addr_t ram_save_remaining(void)
|
||||||
{
|
{
|
||||||
return migration_dirty_pages;
|
return migration_dirty_pages;
|
||||||
|
@ -808,6 +815,9 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
|
||||||
QTAILQ_FOREACH(block, &ram_list.blocks, next) {
|
QTAILQ_FOREACH(block, &ram_list.blocks, next) {
|
||||||
if (!strncmp(id, block->idstr, sizeof(id))) {
|
if (!strncmp(id, block->idstr, sizeof(id))) {
|
||||||
if (block->length != length) {
|
if (block->length != length) {
|
||||||
|
fprintf(stderr, "Length mismatch: %s: %ld "
|
||||||
|
"in != " RAM_ADDR_FMT "\n", id, length,
|
||||||
|
block->length);
|
||||||
ret = -EINVAL;
|
ret = -EINVAL;
|
||||||
goto done;
|
goto done;
|
||||||
}
|
}
|
||||||
|
@ -837,6 +847,7 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
|
||||||
}
|
}
|
||||||
|
|
||||||
ch = qemu_get_byte(f);
|
ch = qemu_get_byte(f);
|
||||||
|
if (ch != 0 || !is_zero_page(host)) {
|
||||||
memset(host, ch, TARGET_PAGE_SIZE);
|
memset(host, ch, TARGET_PAGE_SIZE);
|
||||||
#ifndef _WIN32
|
#ifndef _WIN32
|
||||||
if (ch == 0 &&
|
if (ch == 0 &&
|
||||||
|
@ -845,6 +856,7 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
|
||||||
qemu_madvise(host, TARGET_PAGE_SIZE, QEMU_MADV_DONTNEED);
|
qemu_madvise(host, TARGET_PAGE_SIZE, QEMU_MADV_DONTNEED);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
}
|
||||||
} else if (flags & RAM_SAVE_FLAG_PAGE) {
|
} else if (flags & RAM_SAVE_FLAG_PAGE) {
|
||||||
void *host;
|
void *host;
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,415 @@
|
||||||
|
(RDMA: Remote Direct Memory Access)
|
||||||
|
RDMA Live Migration Specification, Version # 1
|
||||||
|
==============================================
|
||||||
|
Wiki: http://wiki.qemu.org/Features/RDMALiveMigration
|
||||||
|
Github: git@github.com:hinesmr/qemu.git, 'rdma' branch
|
||||||
|
|
||||||
|
Copyright (C) 2013 Michael R. Hines <mrhines@us.ibm.com>
|
||||||
|
|
||||||
|
An *exhaustive* paper (2010) shows additional performance details
|
||||||
|
linked on the QEMU wiki above.
|
||||||
|
|
||||||
|
Contents:
|
||||||
|
=========
|
||||||
|
* Introduction
|
||||||
|
* Before running
|
||||||
|
* Running
|
||||||
|
* Performance
|
||||||
|
* RDMA Migration Protocol Description
|
||||||
|
* Versioning and Capabilities
|
||||||
|
* QEMUFileRDMA Interface
|
||||||
|
* Migration of pc.ram
|
||||||
|
* Error handling
|
||||||
|
* TODO
|
||||||
|
|
||||||
|
Introduction:
|
||||||
|
=============
|
||||||
|
|
||||||
|
RDMA helps make your migration more deterministic under heavy load because
|
||||||
|
of the significantly lower latency and higher throughput over TCP/IP. This is
|
||||||
|
because the RDMA I/O architecture reduces the number of interrupts and
|
||||||
|
data copies by bypassing the host networking stack. In particular, a TCP-based
|
||||||
|
migration, under certain types of memory-bound workloads, may take a more
|
||||||
|
unpredictable amount of time to complete the migration if the amount of
|
||||||
|
memory tracked during each live migration iteration round cannot keep pace
|
||||||
|
with the rate of dirty memory produced by the workload.
|
||||||
|
|
||||||
|
RDMA currently comes in two flavors: both Ethernet based (RoCE, or RDMA
|
||||||
|
over Converged Ethernet) as well as Infiniband-based. This implementation of
|
||||||
|
migration using RDMA is capable of using both technologies because of
|
||||||
|
the use of the OpenFabrics OFED software stack that abstracts out the
|
||||||
|
programming model irrespective of the underlying hardware.
|
||||||
|
|
||||||
|
Refer to openfabrics.org or your respective RDMA hardware vendor for
|
||||||
|
an understanding of how to verify that you have the OFED software stack
|
||||||
|
installed in your environment. You should be able to successfully link
|
||||||
|
against the "librdmacm" and "libibverbs" libraries and development headers
|
||||||
|
for a working build of QEMU to run successfully using RDMA Migration.
|
||||||
|
|
||||||
|
BEFORE RUNNING:
|
||||||
|
===============
|
||||||
|
|
||||||
|
Use of RDMA during migration requires pinning and registering memory
|
||||||
|
with the hardware. This means that memory must be physically resident
|
||||||
|
before the hardware can transmit that memory to another machine.
|
||||||
|
If this is not acceptable for your application or product, then the use
|
||||||
|
of RDMA migration may in fact be harmful to co-located VMs or other
|
||||||
|
software on the machine if there is not sufficient memory available to
|
||||||
|
relocate the entire footprint of the virtual machine. If so, then the
|
||||||
|
use of RDMA is discouraged and it is recommended to use standard TCP migration.
|
||||||
|
|
||||||
|
Experimental: Next, decide if you want dynamic page registration.
|
||||||
|
For example, if you have an 8GB RAM virtual machine, but only 1GB
|
||||||
|
is in active use, then enabling this feature will cause all 8GB to
|
||||||
|
be pinned and resident in memory. This feature mostly affects the
|
||||||
|
bulk-phase round of the migration and can be enabled for extremely
|
||||||
|
high-performance RDMA hardware using the following command:
|
||||||
|
|
||||||
|
QEMU Monitor Command:
|
||||||
|
$ migrate_set_capability x-rdma-pin-all on # disabled by default
|
||||||
|
|
||||||
|
Performing this action will cause all 8GB to be pinned, so if that's
|
||||||
|
not what you want, then please ignore this step altogether.
|
||||||
|
|
||||||
|
On the other hand, this will also significantly speed up the bulk round
|
||||||
|
of the migration, which can greatly reduce the "total" time of your migration.
|
||||||
|
Example performance of this using an idle VM in the previous example
|
||||||
|
can be found in the "Performance" section.
|
||||||
|
|
||||||
|
Note: for very large virtual machines (hundreds of GBs), pinning
|
||||||
|
*all* of the memory of your virtual machine in the kernel is very expensive
|
||||||
|
and may extend the initial bulk iteration time by many seconds,
|
||||||
|
thus extending the total migration time. However, this will not
|
||||||
|
affect the determinism or predictability of your migration; you will
|
||||||
|
still gain from the benefits of advanced pinning with RDMA.
|
||||||
|
|
||||||
|
RUNNING:
|
||||||
|
========
|
||||||
|
|
||||||
|
First, set the migration speed to match your hardware's capabilities:
|
||||||
|
|
||||||
|
QEMU Monitor Command:
|
||||||
|
$ migrate_set_speed 40g # or whatever is the MAX of your RDMA device
|
||||||
|
|
||||||
|
Next, on the destination machine, add the following to the QEMU command line:
|
||||||
|
|
||||||
|
qemu ..... -incoming x-rdma:host:port
|
||||||
|
|
||||||
|
Finally, perform the actual migration on the source machine:
|
||||||
|
|
||||||
|
QEMU Monitor Command:
|
||||||
|
$ migrate -d x-rdma:host:port
|
||||||
|
|
||||||
|
PERFORMANCE
|
||||||
|
===========
|
||||||
|
|
||||||
|
Here is a brief summary of total migration time and downtime using RDMA:
|
||||||
|
Using a 40gbps infiniband link performing a worst-case stress test,
|
||||||
|
using an 8GB RAM virtual machine:
|
||||||
|
|
||||||
|
Using the following command:
|
||||||
|
$ apt-get install stress
|
||||||
|
$ stress --vm-bytes 7500M --vm 1 --vm-keep
|
||||||
|
|
||||||
|
1. Migration throughput: 26 gigabits/second.
|
||||||
|
2. Downtime (stop time) varies between 15 and 100 milliseconds.
|
||||||
|
|
||||||
|
EFFECTS of memory registration on bulk phase round:
|
||||||
|
|
||||||
|
For example, in the same 8GB RAM example with all 8GB of memory in
|
||||||
|
active use and the VM itself is completely idle using the same 40 gbps
|
||||||
|
infiniband link:
|
||||||
|
|
||||||
|
1. x-rdma-pin-all disabled total time: approximately 7.5 seconds @ 9.5 Gbps
|
||||||
|
2. x-rdma-pin-all enabled total time: approximately 4 seconds @ 26 Gbps
|
||||||
|
|
||||||
|
These numbers would of course scale up to whatever size virtual machine
|
||||||
|
you have to migrate using RDMA.
|
||||||
|
|
||||||
|
Enabling this feature does *not* have any measurable effect on
|
||||||
|
migration *downtime*. This is because, without this feature, all of the
|
||||||
|
memory will have already been registered in advance during
|
||||||
|
the bulk round and does not need to be re-registered during the successive
|
||||||
|
iteration rounds.
|
||||||
|
|
||||||
|
RDMA Protocol Description:
|
||||||
|
==========================
|
||||||
|
|
||||||
|
Migration with RDMA is separated into two parts:
|
||||||
|
|
||||||
|
1. The transmission of the pages using RDMA
|
||||||
|
2. Everything else (a control channel is introduced)
|
||||||
|
|
||||||
|
"Everything else" is transmitted using a formal
|
||||||
|
protocol now, consisting of infiniband SEND messages.
|
||||||
|
|
||||||
|
An infiniband SEND message is the standard ibverbs
|
||||||
|
message used by applications of infiniband hardware.
|
||||||
|
The only difference between a SEND message and an RDMA
|
||||||
|
message is that SEND messages cause notifications
|
||||||
|
to be posted to the completion queue (CQ) on the
|
||||||
|
infiniband receiver side, whereas RDMA messages (used
|
||||||
|
for pc.ram) do not (to behave like an actual DMA).
|
||||||
|
|
||||||
|
Messages in infiniband require two things:
|
||||||
|
|
||||||
|
1. registration of the memory that will be transmitted
|
||||||
|
2. (SEND only) work requests to be posted on both
|
||||||
|
sides of the network before the actual transmission
|
||||||
|
can occur.
|
||||||
|
|
||||||
|
RDMA messages are much easier to deal with. Once the memory
|
||||||
|
on the receiver side is registered and pinned, we're
|
||||||
|
basically done. All that is required is for the sender
|
||||||
|
side to start dumping bytes onto the link.
|
||||||
|
|
||||||
|
(Memory is not released from pinning until the migration
|
||||||
|
completes, given that RDMA migrations are very fast.)
|
||||||
|
|
||||||
|
SEND messages require more coordination because the
|
||||||
|
receiver must have reserved space (using a receive
|
||||||
|
work request) on the receive queue (RQ) before QEMUFileRDMA
|
||||||
|
can start using them to carry all the bytes as
|
||||||
|
a control transport for migration of device state.
|
||||||
|
|
||||||
|
To begin the migration, the initial connection setup is
|
||||||
|
as follows (migration-rdma.c):
|
||||||
|
|
||||||
|
1. Receiver and Sender are started (command line or libvirt):
|
||||||
|
2. Both sides post two RQ work requests
|
||||||
|
3. Receiver does listen()
|
||||||
|
4. Sender does connect()
|
||||||
|
5. Receiver accept()
|
||||||
|
6. Check versioning and capabilities (described later)
|
||||||
|
|
||||||
|
At this point, we define a control channel on top of SEND messages
|
||||||
|
which is described by a formal protocol. Each SEND message has a
|
||||||
|
header portion and a data portion (but together are transmitted
|
||||||
|
as a single SEND message).
|
||||||
|
|
||||||
|
Header:
|
||||||
|
* Length (of the data portion, uint32, network byte order)
|
||||||
|
* Type (what command to perform, uint32, network byte order)
|
||||||
|
* Repeat (Number of commands in data portion, same type only)
|
||||||
|
|
||||||
|
The 'Repeat' field is here to support future multiple page registrations
|
||||||
|
in a single message without any need to change the protocol itself
|
||||||
|
so that the protocol is compatible against multiple versions of QEMU.
|
||||||
|
Version #1 requires that all server implementations of the protocol must
|
||||||
|
check this field and register all requests found in the array of commands located
|
||||||
|
in the data portion and return an equal number of results in the response.
|
||||||
|
The maximum number of repeats is hard-coded to 4096. This is a conservative
|
||||||
|
limit based on the maximum size of a SEND message along with empirical
|
||||||
|
observations on the maximum future benefit of simultaneous page registrations.
|
||||||
|
|
||||||
|
The 'type' field has 10 different command values:
|
||||||
|
1. Unused
|
||||||
|
2. Error (sent to the source during bad things)
|
||||||
|
3. Ready (control-channel is available)
|
||||||
|
4. QEMU File (for sending non-live device state)
|
||||||
|
5. RAM Blocks request (used right after connection setup)
|
||||||
|
6. RAM Blocks result (used right after connection setup)
|
||||||
|
7. Compress page (zap zero page and skip registration)
|
||||||
|
8. Register request (dynamic chunk registration)
|
||||||
|
9. Register result ('rkey' to be used by sender)
|
||||||
|
10. Register finished (registration for current iteration finished)
|
||||||
|
|
||||||
|
A single control message, as hinted above, can contain within the data
|
||||||
|
portion an array of many commands of the same type. If there is more than
|
||||||
|
one command, then the 'repeat' field will be greater than 1.
|
||||||
|
|
||||||
|
After connection setup, message 5 & 6 are used to exchange ram block
|
||||||
|
information and optionally pin all the memory if requested by the user.
|
||||||
|
|
||||||
|
After ram block exchange is completed, we have two protocol-level
|
||||||
|
functions, responsible for communicating control-channel commands
|
||||||
|
using the above list of values:
|
||||||
|
|
||||||
|
Logically:
|
||||||
|
|
||||||
|
qemu_rdma_exchange_recv(header, expected command type)
|
||||||
|
|
||||||
|
1. We transmit a READY command to let the sender know that
|
||||||
|
we are *ready* to receive some data bytes on the control channel.
|
||||||
|
2. Before attempting to receive the expected command, we post another
|
||||||
|
RQ work request to replace the one we just used up.
|
||||||
|
3. Block on a CQ event channel and wait for the SEND to arrive.
|
||||||
|
4. When the send arrives, librdmacm will unblock us.
|
||||||
|
5. Verify that the command-type and version received matches the one we expected.
|
||||||
|
|
||||||
|
qemu_rdma_exchange_send(header, data, optional response header & data):
|
||||||
|
|
||||||
|
1. Block on the CQ event channel waiting for a READY command
|
||||||
|
from the receiver to tell us that the receiver
|
||||||
|
is *ready* for us to transmit some new bytes.
|
||||||
|
2. Optionally: if we are expecting a response from the command
|
||||||
|
(that we have not yet transmitted), let's post an RQ
|
||||||
|
work request to receive that data a few moments later.
|
||||||
|
3. When the READY arrives, librdmacm will
|
||||||
|
unblock us and we immediately post a RQ work request
|
||||||
|
to replace the one we just used up.
|
||||||
|
4. Now, we can actually post the work request to SEND
|
||||||
|
the requested command type of the header we were asked for.
|
||||||
|
5. Optionally, if we are expecting a response (as before),
|
||||||
|
we block again and wait for that response using the additional
|
||||||
|
work request we previously posted. (This is used to carry
|
||||||
|
'Register result' commands #9 back to the sender which
|
||||||
|
hold the rkey needed to perform RDMA. Note that the virtual address
|
||||||
|
corresponding to this rkey was already exchanged at the beginning
|
||||||
|
of the connection (described below).)
|
||||||
|
|
||||||
|
All of the remaining command types (not including 'ready')
|
||||||
|
described above all use the aforementioned two functions to do the hard work:
|
||||||
|
|
||||||
|
1. After connection setup, RAMBlock information is exchanged using
|
||||||
|
this protocol before the actual migration begins. This information includes
|
||||||
|
a description of each RAMBlock on the server side as well as the virtual addresses
|
||||||
|
and lengths of each RAMBlock. This is used by the client to determine the
|
||||||
|
start and stop locations of chunks and how to register them dynamically
|
||||||
|
before performing the RDMA operations.
|
||||||
|
2. During runtime, once a 'chunk' becomes full of pages ready to
|
||||||
|
be sent with RDMA, the registration commands are used to ask the
|
||||||
|
other side to register the memory for this chunk and respond
|
||||||
|
with the result (rkey) of the registration.
|
||||||
|
3. Also, the QEMUFile interfaces also call these functions (described below)
|
||||||
|
when transmitting non-live state, such as devices or to send
|
||||||
|
its own protocol information during the migration process.
|
||||||
|
4. Finally, zero pages are only checked if a page has not yet been registered
|
||||||
|
using chunk registration (or not checked at all and unconditionally
|
||||||
|
written if chunk registration is disabled). This is accomplished using
|
||||||
|
the "Compress" command listed above. If the page *has* been registered
|
||||||
|
then we check the entire chunk for zero. Only if the entire chunk is
|
||||||
|
zero, then we send a compress command to zap the page on the other side.
|
||||||
|
|
||||||
|
Versioning and Capabilities
|
||||||
|
===========================
|
||||||
|
Current version of the protocol is version #1.
|
||||||
|
|
||||||
|
The same version applies to both protocol traffic and capabilities
|
||||||
|
negotiation. (i.e. There is only one version number that is referred to
|
||||||
|
by all communication).
|
||||||
|
|
||||||
|
librdmacm provides the user with a 'private data' area to be exchanged
|
||||||
|
at connection-setup time before any infiniband traffic is generated.
|
||||||
|
|
||||||
|
Header:
|
||||||
|
* Version (protocol version validated before send/recv occurs), uint32, network byte order
|
||||||
|
* Flags (bitwise OR of each capability), uint32, network byte order
|
||||||
|
|
||||||
|
There is no data portion of this header right now, so there is
|
||||||
|
no length field. The maximum size of the 'private data' section
|
||||||
|
is only 192 bytes per the Infiniband specification, so it's not
|
||||||
|
very useful for data anyway. This structure needs to remain small.
|
||||||
|
|
||||||
|
This private data area is a convenient place to check for protocol
|
||||||
|
versioning because the user does not need to register memory to
|
||||||
|
transmit a few bytes of version information.
|
||||||
|
|
||||||
|
This is also a convenient place to negotiate capabilities
|
||||||
|
(like dynamic page registration).
|
||||||
|
|
||||||
|
If the version is invalid, we throw an error.
|
||||||
|
|
||||||
|
If the version is new, we only negotiate the capabilities that the
|
||||||
|
requested version is able to perform and ignore the rest.
|
||||||
|
|
||||||
|
Currently there is only *one* capability in Version #1: dynamic page registration
|
||||||
|
|
||||||
|
Finally: Negotiation happens with the Flags field: If the primary-VM
|
||||||
|
sets a flag, but the destination does not support this capability, it
|
||||||
|
will return a zero-bit for that flag and the primary-VM will understand
|
||||||
|
that as not being an available capability and will thus disable that
|
||||||
|
capability on the primary-VM side.
|
||||||
|
|
||||||
|
QEMUFileRDMA Interface:
|
||||||
|
=======================
|
||||||
|
|
||||||
|
QEMUFileRDMA introduces a couple of new functions:
|
||||||
|
|
||||||
|
1. qemu_rdma_get_buffer() (QEMUFileOps rdma_read_ops)
|
||||||
|
2. qemu_rdma_put_buffer() (QEMUFileOps rdma_write_ops)
|
||||||
|
|
||||||
|
These two functions are very short and simply use the protocol
|
||||||
|
described above to deliver bytes without changing the upper-level
|
||||||
|
users of QEMUFile that depend on a bytestream abstraction.
|
||||||
|
|
||||||
|
Finally, how do we hand off the actual bytes to get_buffer()?
|
||||||
|
|
||||||
|
Again, because we're trying to "fake" a bytestream abstraction
|
||||||
|
using an analogy not unlike individual UDP frames, we have
|
||||||
|
to hold on to the bytes received from control-channel's SEND
|
||||||
|
messages in memory.
|
||||||
|
|
||||||
|
Each time we receive a complete "QEMU File" control-channel
|
||||||
|
message, the bytes from SEND are copied into a small local holding area.
|
||||||
|
|
||||||
|
Then, we return the number of bytes requested by get_buffer()
|
||||||
|
and leave the remaining bytes in the holding area until get_buffer()
|
||||||
|
comes around for another pass.
|
||||||
|
|
||||||
|
If the buffer is empty, then we follow the same steps
|
||||||
|
listed above and issue another "QEMU File" protocol command,
|
||||||
|
asking for a new SEND message to re-fill the buffer.
|
||||||
|
|
||||||
|
Migration of pc.ram:
|
||||||
|
====================
|
||||||
|
|
||||||
|
At the beginning of the migration, (migration-rdma.c),
|
||||||
|
the sender and the receiver populate the list of RAMBlocks
|
||||||
|
to be registered with each other into a structure.
|
||||||
|
Then, using the aforementioned protocol, they exchange a
|
||||||
|
description of these blocks with each other, to be used later
|
||||||
|
during the iteration of main memory. This description includes
|
||||||
|
a list of all the RAMBlocks, their offsets and lengths, virtual
|
||||||
|
addresses and possibly includes pre-registered RDMA keys in case dynamic
|
||||||
|
page registration was disabled on the server-side, otherwise not.
|
||||||
|
|
||||||
|
Main memory is not migrated with the aforementioned protocol,
|
||||||
|
but is instead migrated with normal RDMA Write operations.
|
||||||
|
|
||||||
|
Pages are migrated in "chunks" (hard-coded to 1 Megabyte right now).
|
||||||
|
Chunk size is not dynamic, but it could be in a future implementation.
|
||||||
|
There's nothing to indicate that this is useful right now.
|
||||||
|
|
||||||
|
When a chunk is full (or a flush() occurs), the memory backed by
|
||||||
|
the chunk is registered with librdmacm and is pinned in memory on
|
||||||
|
both sides using the aforementioned protocol.
|
||||||
|
After pinning, an RDMA Write is generated and transmitted
|
||||||
|
for the entire chunk.
|
||||||
|
|
||||||
|
Chunks are also transmitted in batches: This means that we
|
||||||
|
do not request that the hardware signal the completion queue
|
||||||
|
for the completion of *every* chunk. The current batch size
|
||||||
|
is about 64 chunks (corresponding to 64 MB of memory).
|
||||||
|
Only the last chunk in a batch must be signaled.
|
||||||
|
This helps keep everything as asynchronous as possible
|
||||||
|
and helps keep the hardware busy performing RDMA operations.
|
||||||
|
|
||||||
|
Error-handling:
|
||||||
|
===============
|
||||||
|
|
||||||
|
Infiniband has what is called a "Reliable, Connected"
|
||||||
|
link (one of 4 choices). This is the mode that
|
||||||
|
we use for RDMA migration.
|
||||||
|
|
||||||
|
If a *single* message fails,
|
||||||
|
the decision is to abort the migration entirely and
|
||||||
|
clean up all the RDMA descriptors and unregister all
|
||||||
|
the memory.
|
||||||
|
|
||||||
|
After cleanup, the Virtual Machine is returned to normal
|
||||||
|
operation the same way that would happen if the TCP
|
||||||
|
socket is broken during a non-RDMA based migration.
|
||||||
|
|
||||||
|
TODO:
|
||||||
|
=====
|
||||||
|
1. 'migrate x-rdma:host:port' and '-incoming x-rdma' options will be
|
||||||
|
renamed to 'rdma' after the experimental phase of this work has
|
||||||
|
completed upstream.
|
||||||
|
2. Currently, 'ulimit -l' mlock() limits as well as cgroups swap limits
|
||||||
|
are not compatible with infiniband memory pinning and will result in
|
||||||
|
an aborted migration (but with the source VM left unaffected).
|
||||||
|
3. Use of the recent /proc/<pid>/pagemap would likely speed up
|
||||||
|
the use of KSM and ballooning while using RDMA.
|
||||||
|
4. Also, some form of balloon-device usage tracking would also
|
||||||
|
help alleviate some issues.
|
9
exec.c
9
exec.c
|
@ -2630,3 +2630,12 @@ bool cpu_physical_memory_is_io(hwaddr phys_addr)
|
||||||
memory_region_is_romd(mr));
|
memory_region_is_romd(mr));
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
void qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
|
||||||
|
{
|
||||||
|
RAMBlock *block;
|
||||||
|
|
||||||
|
QTAILQ_FOREACH(block, &ram_list.blocks, next) {
|
||||||
|
func(block->host, block->offset, block->length, opaque);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
2
hmp.c
2
hmp.c
|
@ -169,6 +169,8 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict)
|
||||||
if (info->has_ram) {
|
if (info->has_ram) {
|
||||||
monitor_printf(mon, "transferred ram: %" PRIu64 " kbytes\n",
|
monitor_printf(mon, "transferred ram: %" PRIu64 " kbytes\n",
|
||||||
info->ram->transferred >> 10);
|
info->ram->transferred >> 10);
|
||||||
|
monitor_printf(mon, "throughput: %0.2f mbps\n",
|
||||||
|
info->ram->mbps);
|
||||||
monitor_printf(mon, "remaining ram: %" PRIu64 " kbytes\n",
|
monitor_printf(mon, "remaining ram: %" PRIu64 " kbytes\n",
|
||||||
info->ram->remaining >> 10);
|
info->ram->remaining >> 10);
|
||||||
monitor_printf(mon, "total ram: %" PRIu64 " kbytes\n",
|
monitor_printf(mon, "total ram: %" PRIu64 " kbytes\n",
|
||||||
|
|
|
@ -209,4 +209,10 @@ void qemu_co_rwlock_unlock(CoRwlock *lock);
|
||||||
*/
|
*/
|
||||||
void coroutine_fn co_sleep_ns(QEMUClock *clock, int64_t ns);
|
void coroutine_fn co_sleep_ns(QEMUClock *clock, int64_t ns);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Yield until a file descriptor becomes readable
|
||||||
|
*
|
||||||
|
* Note that this function clobbers the handlers for the file descriptor.
|
||||||
|
*/
|
||||||
|
void coroutine_fn yield_until_fd_readable(int fd);
|
||||||
#endif /* QEMU_COROUTINE_H */
|
#endif /* QEMU_COROUTINE_H */
|
||||||
|
|
|
@ -113,6 +113,11 @@ void cpu_physical_memory_write_rom(hwaddr addr,
|
||||||
extern struct MemoryRegion io_mem_rom;
|
extern struct MemoryRegion io_mem_rom;
|
||||||
extern struct MemoryRegion io_mem_notdirty;
|
extern struct MemoryRegion io_mem_notdirty;
|
||||||
|
|
||||||
|
typedef void (RAMBlockIterFunc)(void *host_addr,
|
||||||
|
ram_addr_t offset, ram_addr_t length, void *opaque);
|
||||||
|
|
||||||
|
void qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#endif /* !CPU_COMMON_H */
|
#endif /* !CPU_COMMON_H */
|
||||||
|
|
|
@ -21,6 +21,7 @@
|
||||||
#include "qapi/error.h"
|
#include "qapi/error.h"
|
||||||
#include "migration/vmstate.h"
|
#include "migration/vmstate.h"
|
||||||
#include "qapi-types.h"
|
#include "qapi-types.h"
|
||||||
|
#include "exec/cpu-common.h"
|
||||||
|
|
||||||
struct MigrationParams {
|
struct MigrationParams {
|
||||||
bool blk;
|
bool blk;
|
||||||
|
@ -40,6 +41,7 @@ struct MigrationState
|
||||||
|
|
||||||
int state;
|
int state;
|
||||||
MigrationParams params;
|
MigrationParams params;
|
||||||
|
double mbps;
|
||||||
int64_t total_time;
|
int64_t total_time;
|
||||||
int64_t downtime;
|
int64_t downtime;
|
||||||
int64_t expected_downtime;
|
int64_t expected_downtime;
|
||||||
|
@ -92,6 +94,8 @@ uint64_t ram_bytes_remaining(void);
|
||||||
uint64_t ram_bytes_transferred(void);
|
uint64_t ram_bytes_transferred(void);
|
||||||
uint64_t ram_bytes_total(void);
|
uint64_t ram_bytes_total(void);
|
||||||
|
|
||||||
|
void acct_update_position(QEMUFile *f, size_t size, bool zero);
|
||||||
|
|
||||||
extern SaveVMHandlers savevm_ram_handlers;
|
extern SaveVMHandlers savevm_ram_handlers;
|
||||||
|
|
||||||
uint64_t dup_mig_bytes_transferred(void);
|
uint64_t dup_mig_bytes_transferred(void);
|
||||||
|
@ -119,6 +123,8 @@ void migrate_add_blocker(Error *reason);
|
||||||
*/
|
*/
|
||||||
void migrate_del_blocker(Error *reason);
|
void migrate_del_blocker(Error *reason);
|
||||||
|
|
||||||
|
bool migrate_rdma_pin_all(void);
|
||||||
|
|
||||||
int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen,
|
int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen,
|
||||||
uint8_t *dst, int dlen);
|
uint8_t *dst, int dlen);
|
||||||
int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen);
|
int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen);
|
||||||
|
@ -127,4 +133,23 @@ int migrate_use_xbzrle(void);
|
||||||
int64_t migrate_xbzrle_cache_size(void);
|
int64_t migrate_xbzrle_cache_size(void);
|
||||||
|
|
||||||
int64_t xbzrle_cache_resize(int64_t new_size);
|
int64_t xbzrle_cache_resize(int64_t new_size);
|
||||||
|
|
||||||
|
void ram_control_before_iterate(QEMUFile *f, uint64_t flags);
|
||||||
|
void ram_control_after_iterate(QEMUFile *f, uint64_t flags);
|
||||||
|
void ram_control_load_hook(QEMUFile *f, uint64_t flags);
|
||||||
|
|
||||||
|
/* Whenever this is found in the data stream, the flags
|
||||||
|
* will be passed to ram_control_load_hook in the incoming-migration
|
||||||
|
* side. This lets before_ram_iterate/after_ram_iterate add
|
||||||
|
* transport-specific sections to the RAM migration data.
|
||||||
|
*/
|
||||||
|
#define RAM_SAVE_FLAG_HOOK 0x80
|
||||||
|
|
||||||
|
#define RAM_SAVE_CONTROL_NOT_SUPP -1000
|
||||||
|
#define RAM_SAVE_CONTROL_DELAYED -2000
|
||||||
|
|
||||||
|
size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset,
|
||||||
|
ram_addr_t offset, size_t size,
|
||||||
|
int *bytes_sent);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -23,6 +23,7 @@
|
||||||
*/
|
*/
|
||||||
#ifndef QEMU_FILE_H
|
#ifndef QEMU_FILE_H
|
||||||
#define QEMU_FILE_H 1
|
#define QEMU_FILE_H 1
|
||||||
|
#include "exec/cpu-common.h"
|
||||||
|
|
||||||
/* This function writes a chunk of data to a file at the given position.
|
/* This function writes a chunk of data to a file at the given position.
|
||||||
* The pos argument can be ignored if the file is only being used for
|
* The pos argument can be ignored if the file is only being used for
|
||||||
|
@ -57,12 +58,40 @@ typedef int (QEMUFileGetFD)(void *opaque);
|
||||||
typedef ssize_t (QEMUFileWritevBufferFunc)(void *opaque, struct iovec *iov,
|
typedef ssize_t (QEMUFileWritevBufferFunc)(void *opaque, struct iovec *iov,
|
||||||
int iovcnt, int64_t pos);
|
int iovcnt, int64_t pos);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This function provides hooks around different
|
||||||
|
* stages of RAM migration.
|
||||||
|
*/
|
||||||
|
typedef int (QEMURamHookFunc)(QEMUFile *f, void *opaque, uint64_t flags);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Constants used by ram_control_* hooks
|
||||||
|
*/
|
||||||
|
#define RAM_CONTROL_SETUP 0
|
||||||
|
#define RAM_CONTROL_ROUND 1
|
||||||
|
#define RAM_CONTROL_HOOK 2
|
||||||
|
#define RAM_CONTROL_FINISH 3
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This function allows override of where the RAM page
|
||||||
|
* is saved (such as RDMA, for example.)
|
||||||
|
*/
|
||||||
|
typedef size_t (QEMURamSaveFunc)(QEMUFile *f, void *opaque,
|
||||||
|
ram_addr_t block_offset,
|
||||||
|
ram_addr_t offset,
|
||||||
|
size_t size,
|
||||||
|
int *bytes_sent);
|
||||||
|
|
||||||
typedef struct QEMUFileOps {
|
typedef struct QEMUFileOps {
|
||||||
QEMUFilePutBufferFunc *put_buffer;
|
QEMUFilePutBufferFunc *put_buffer;
|
||||||
QEMUFileGetBufferFunc *get_buffer;
|
QEMUFileGetBufferFunc *get_buffer;
|
||||||
QEMUFileCloseFunc *close;
|
QEMUFileCloseFunc *close;
|
||||||
QEMUFileGetFD *get_fd;
|
QEMUFileGetFD *get_fd;
|
||||||
QEMUFileWritevBufferFunc *writev_buffer;
|
QEMUFileWritevBufferFunc *writev_buffer;
|
||||||
|
QEMURamHookFunc *before_ram_iterate;
|
||||||
|
QEMURamHookFunc *after_ram_iterate;
|
||||||
|
QEMURamHookFunc *hook_ram_load;
|
||||||
|
QEMURamSaveFunc *save_page;
|
||||||
} QEMUFileOps;
|
} QEMUFileOps;
|
||||||
|
|
||||||
QEMUFile *qemu_fopen_ops(void *opaque, const QEMUFileOps *ops);
|
QEMUFile *qemu_fopen_ops(void *opaque, const QEMUFileOps *ops);
|
||||||
|
@ -80,6 +109,7 @@ void qemu_put_byte(QEMUFile *f, int v);
|
||||||
* The buffer should be available till it is sent asynchronously.
|
* The buffer should be available till it is sent asynchronously.
|
||||||
*/
|
*/
|
||||||
void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, int size);
|
void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, int size);
|
||||||
|
bool qemu_file_mode_is_not_valid(const char *mode);
|
||||||
|
|
||||||
static inline void qemu_put_ubyte(QEMUFile *f, unsigned int v)
|
static inline void qemu_put_ubyte(QEMUFile *f, unsigned int v)
|
||||||
{
|
{
|
||||||
|
@ -93,6 +123,7 @@ void qemu_put_be32(QEMUFile *f, unsigned int v);
|
||||||
void qemu_put_be64(QEMUFile *f, uint64_t v);
|
void qemu_put_be64(QEMUFile *f, uint64_t v);
|
||||||
int qemu_get_buffer(QEMUFile *f, uint8_t *buf, int size);
|
int qemu_get_buffer(QEMUFile *f, uint8_t *buf, int size);
|
||||||
int qemu_get_byte(QEMUFile *f);
|
int qemu_get_byte(QEMUFile *f);
|
||||||
|
void qemu_update_position(QEMUFile *f, size_t size);
|
||||||
|
|
||||||
static inline unsigned int qemu_get_ubyte(QEMUFile *f)
|
static inline unsigned int qemu_get_ubyte(QEMUFile *f)
|
||||||
{
|
{
|
||||||
|
@ -110,6 +141,7 @@ void qemu_file_reset_rate_limit(QEMUFile *f);
|
||||||
void qemu_file_set_rate_limit(QEMUFile *f, int64_t new_rate);
|
void qemu_file_set_rate_limit(QEMUFile *f, int64_t new_rate);
|
||||||
int64_t qemu_file_get_rate_limit(QEMUFile *f);
|
int64_t qemu_file_get_rate_limit(QEMUFile *f);
|
||||||
int qemu_file_get_error(QEMUFile *f);
|
int qemu_file_get_error(QEMUFile *f);
|
||||||
|
void qemu_fflush(QEMUFile *f);
|
||||||
|
|
||||||
static inline void qemu_put_be64s(QEMUFile *f, const uint64_t *pv)
|
static inline void qemu_put_be64s(QEMUFile *f, const uint64_t *pv)
|
||||||
{
|
{
|
||||||
|
|
15
migration.c
15
migration.c
|
@ -66,6 +66,7 @@ MigrationState *migrate_get_current(void)
|
||||||
.state = MIG_STATE_SETUP,
|
.state = MIG_STATE_SETUP,
|
||||||
.bandwidth_limit = MAX_THROTTLE,
|
.bandwidth_limit = MAX_THROTTLE,
|
||||||
.xbzrle_cache_size = DEFAULT_MIGRATE_CACHE_SIZE,
|
.xbzrle_cache_size = DEFAULT_MIGRATE_CACHE_SIZE,
|
||||||
|
.mbps = -1,
|
||||||
};
|
};
|
||||||
|
|
||||||
return &current_migration;
|
return &current_migration;
|
||||||
|
@ -201,6 +202,7 @@ MigrationInfo *qmp_query_migrate(Error **errp)
|
||||||
info->ram->normal = norm_mig_pages_transferred();
|
info->ram->normal = norm_mig_pages_transferred();
|
||||||
info->ram->normal_bytes = norm_mig_bytes_transferred();
|
info->ram->normal_bytes = norm_mig_bytes_transferred();
|
||||||
info->ram->dirty_pages_rate = s->dirty_pages_rate;
|
info->ram->dirty_pages_rate = s->dirty_pages_rate;
|
||||||
|
info->ram->mbps = s->mbps;
|
||||||
|
|
||||||
if (blk_mig_active()) {
|
if (blk_mig_active()) {
|
||||||
info->has_disk = true;
|
info->has_disk = true;
|
||||||
|
@ -230,6 +232,7 @@ MigrationInfo *qmp_query_migrate(Error **errp)
|
||||||
info->ram->skipped = skipped_mig_pages_transferred();
|
info->ram->skipped = skipped_mig_pages_transferred();
|
||||||
info->ram->normal = norm_mig_pages_transferred();
|
info->ram->normal = norm_mig_pages_transferred();
|
||||||
info->ram->normal_bytes = norm_mig_bytes_transferred();
|
info->ram->normal_bytes = norm_mig_bytes_transferred();
|
||||||
|
info->ram->mbps = s->mbps;
|
||||||
break;
|
break;
|
||||||
case MIG_STATE_ERROR:
|
case MIG_STATE_ERROR:
|
||||||
info->has_status = true;
|
info->has_status = true;
|
||||||
|
@ -473,6 +476,15 @@ void qmp_migrate_set_downtime(double value, Error **errp)
|
||||||
max_downtime = (uint64_t)value;
|
max_downtime = (uint64_t)value;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool migrate_rdma_pin_all(void)
|
||||||
|
{
|
||||||
|
MigrationState *s;
|
||||||
|
|
||||||
|
s = migrate_get_current();
|
||||||
|
|
||||||
|
return s->enabled_capabilities[MIGRATION_CAPABILITY_X_RDMA_PIN_ALL];
|
||||||
|
}
|
||||||
|
|
||||||
int migrate_use_xbzrle(void)
|
int migrate_use_xbzrle(void)
|
||||||
{
|
{
|
||||||
MigrationState *s;
|
MigrationState *s;
|
||||||
|
@ -543,6 +555,9 @@ static void *migration_thread(void *opaque)
|
||||||
double bandwidth = transferred_bytes / time_spent;
|
double bandwidth = transferred_bytes / time_spent;
|
||||||
max_size = bandwidth * migrate_max_downtime() / 1000000;
|
max_size = bandwidth * migrate_max_downtime() / 1000000;
|
||||||
|
|
||||||
|
s->mbps = time_spent ? (((double) transferred_bytes * 8.0) /
|
||||||
|
((double) time_spent / 1000.0)) / 1000.0 / 1000.0 : -1;
|
||||||
|
|
||||||
DPRINTF("transferred %" PRIu64 " time_spent %" PRIu64
|
DPRINTF("transferred %" PRIu64 " time_spent %" PRIu64
|
||||||
" bandwidth %g max_size %" PRId64 "\n",
|
" bandwidth %g max_size %" PRId64 "\n",
|
||||||
transferred_bytes, time_spent, bandwidth, max_size);
|
transferred_bytes, time_spent, bandwidth, max_size);
|
||||||
|
|
|
@ -513,12 +513,15 @@
|
||||||
# @dirty-pages-rate: number of pages dirtied by second by the
|
# @dirty-pages-rate: number of pages dirtied by second by the
|
||||||
# guest (since 1.3)
|
# guest (since 1.3)
|
||||||
#
|
#
|
||||||
|
# @mbps: throughput in megabits/sec. (since 1.6)
|
||||||
|
#
|
||||||
# Since: 0.14.0
|
# Since: 0.14.0
|
||||||
##
|
##
|
||||||
{ 'type': 'MigrationStats',
|
{ 'type': 'MigrationStats',
|
||||||
'data': {'transferred': 'int', 'remaining': 'int', 'total': 'int' ,
|
'data': {'transferred': 'int', 'remaining': 'int', 'total': 'int' ,
|
||||||
'duplicate': 'int', 'skipped': 'int', 'normal': 'int',
|
'duplicate': 'int', 'skipped': 'int', 'normal': 'int',
|
||||||
'normal-bytes': 'int', 'dirty-pages-rate' : 'int' } }
|
'normal-bytes': 'int', 'dirty-pages-rate' : 'int',
|
||||||
|
'mbps' : 'number' } }
|
||||||
|
|
||||||
##
|
##
|
||||||
# @XBZRLECacheStats
|
# @XBZRLECacheStats
|
||||||
|
@ -605,10 +608,15 @@
|
||||||
# This feature allows us to minimize migration traffic for certain work
|
# This feature allows us to minimize migration traffic for certain work
|
||||||
# loads, by sending compressed difference of the pages
|
# loads, by sending compressed difference of the pages
|
||||||
#
|
#
|
||||||
|
# @x-rdma-pin-all: Controls whether or not the entire VM memory footprint is
|
||||||
|
# mlock()'d on demand or all at once. Refer to docs/rdma.txt for usage.
|
||||||
|
# Disabled by default. Experimental: may (or may not) be renamed after
|
||||||
|
# further testing is complete. (since 1.6)
|
||||||
|
#
|
||||||
# Since: 1.2
|
# Since: 1.2
|
||||||
##
|
##
|
||||||
{ 'enum': 'MigrationCapability',
|
{ 'enum': 'MigrationCapability',
|
||||||
'data': ['xbzrle'] }
|
'data': ['xbzrle', 'x-rdma-pin-all'] }
|
||||||
|
|
||||||
##
|
##
|
||||||
# @MigrationCapabilityStatus
|
# @MigrationCapabilityStatus
|
||||||
|
|
|
@ -63,3 +63,26 @@ qemu_co_send_recv(int sockfd, void *buf, size_t bytes, bool do_send)
|
||||||
struct iovec iov = { .iov_base = buf, .iov_len = bytes };
|
struct iovec iov = { .iov_base = buf, .iov_len = bytes };
|
||||||
return qemu_co_sendv_recvv(sockfd, &iov, 1, 0, bytes, do_send);
|
return qemu_co_sendv_recvv(sockfd, &iov, 1, 0, bytes, do_send);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
Coroutine *co;
|
||||||
|
int fd;
|
||||||
|
} FDYieldUntilData;
|
||||||
|
|
||||||
|
static void fd_coroutine_enter(void *opaque)
|
||||||
|
{
|
||||||
|
FDYieldUntilData *data = opaque;
|
||||||
|
qemu_set_fd_handler(data->fd, NULL, NULL, NULL);
|
||||||
|
qemu_coroutine_enter(data->co, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
void coroutine_fn yield_until_fd_readable(int fd)
|
||||||
|
{
|
||||||
|
FDYieldUntilData data;
|
||||||
|
|
||||||
|
assert(qemu_in_coroutine());
|
||||||
|
data.co = qemu_coroutine_self();
|
||||||
|
data.fd = fd;
|
||||||
|
qemu_set_fd_handler(fd, fd_coroutine_enter, NULL, &data);
|
||||||
|
qemu_coroutine_yield();
|
||||||
|
}
|
||||||
|
|
114
savevm.c
114
savevm.c
|
@ -149,34 +149,6 @@ typedef struct QEMUFileSocket
|
||||||
QEMUFile *file;
|
QEMUFile *file;
|
||||||
} QEMUFileSocket;
|
} QEMUFileSocket;
|
||||||
|
|
||||||
typedef struct {
|
|
||||||
Coroutine *co;
|
|
||||||
int fd;
|
|
||||||
} FDYieldUntilData;
|
|
||||||
|
|
||||||
static void fd_coroutine_enter(void *opaque)
|
|
||||||
{
|
|
||||||
FDYieldUntilData *data = opaque;
|
|
||||||
qemu_set_fd_handler(data->fd, NULL, NULL, NULL);
|
|
||||||
qemu_coroutine_enter(data->co, NULL);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Yield until a file descriptor becomes readable
|
|
||||||
*
|
|
||||||
* Note that this function clobbers the handlers for the file descriptor.
|
|
||||||
*/
|
|
||||||
static void coroutine_fn yield_until_fd_readable(int fd)
|
|
||||||
{
|
|
||||||
FDYieldUntilData data;
|
|
||||||
|
|
||||||
assert(qemu_in_coroutine());
|
|
||||||
data.co = qemu_coroutine_self();
|
|
||||||
data.fd = fd;
|
|
||||||
qemu_set_fd_handler(fd, fd_coroutine_enter, NULL, &data);
|
|
||||||
qemu_coroutine_yield();
|
|
||||||
}
|
|
||||||
|
|
||||||
static ssize_t socket_writev_buffer(void *opaque, struct iovec *iov, int iovcnt,
|
static ssize_t socket_writev_buffer(void *opaque, struct iovec *iov, int iovcnt,
|
||||||
int64_t pos)
|
int64_t pos)
|
||||||
{
|
{
|
||||||
|
@ -477,14 +449,23 @@ static const QEMUFileOps socket_write_ops = {
|
||||||
.close = socket_close
|
.close = socket_close
|
||||||
};
|
};
|
||||||
|
|
||||||
QEMUFile *qemu_fopen_socket(int fd, const char *mode)
|
bool qemu_file_mode_is_not_valid(const char *mode)
|
||||||
{
|
{
|
||||||
QEMUFileSocket *s;
|
|
||||||
|
|
||||||
if (mode == NULL ||
|
if (mode == NULL ||
|
||||||
(mode[0] != 'r' && mode[0] != 'w') ||
|
(mode[0] != 'r' && mode[0] != 'w') ||
|
||||||
mode[1] != 'b' || mode[2] != 0) {
|
mode[1] != 'b' || mode[2] != 0) {
|
||||||
fprintf(stderr, "qemu_fopen: Argument validity check failed\n");
|
fprintf(stderr, "qemu_fopen: Argument validity check failed\n");
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
QEMUFile *qemu_fopen_socket(int fd, const char *mode)
|
||||||
|
{
|
||||||
|
QEMUFileSocket *s;
|
||||||
|
|
||||||
|
if (qemu_file_mode_is_not_valid(mode)) {
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -503,10 +484,7 @@ QEMUFile *qemu_fopen(const char *filename, const char *mode)
|
||||||
{
|
{
|
||||||
QEMUFileStdio *s;
|
QEMUFileStdio *s;
|
||||||
|
|
||||||
if (mode == NULL ||
|
if (qemu_file_mode_is_not_valid(mode)) {
|
||||||
(mode[0] != 'r' && mode[0] != 'w') ||
|
|
||||||
mode[1] != 'b' || mode[2] != 0) {
|
|
||||||
fprintf(stderr, "qemu_fopen: Argument validity check failed\n");
|
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -611,7 +589,7 @@ static inline bool qemu_file_is_writable(QEMUFile *f)
|
||||||
* If there is writev_buffer QEMUFileOps it uses it otherwise uses
|
* If there is writev_buffer QEMUFileOps it uses it otherwise uses
|
||||||
* put_buffer ops.
|
* put_buffer ops.
|
||||||
*/
|
*/
|
||||||
static void qemu_fflush(QEMUFile *f)
|
void qemu_fflush(QEMUFile *f)
|
||||||
{
|
{
|
||||||
ssize_t ret = 0;
|
ssize_t ret = 0;
|
||||||
|
|
||||||
|
@ -638,6 +616,65 @@ static void qemu_fflush(QEMUFile *f)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void ram_control_before_iterate(QEMUFile *f, uint64_t flags)
|
||||||
|
{
|
||||||
|
int ret = 0;
|
||||||
|
|
||||||
|
if (f->ops->before_ram_iterate) {
|
||||||
|
ret = f->ops->before_ram_iterate(f, f->opaque, flags);
|
||||||
|
if (ret < 0) {
|
||||||
|
qemu_file_set_error(f, ret);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void ram_control_after_iterate(QEMUFile *f, uint64_t flags)
|
||||||
|
{
|
||||||
|
int ret = 0;
|
||||||
|
|
||||||
|
if (f->ops->after_ram_iterate) {
|
||||||
|
ret = f->ops->after_ram_iterate(f, f->opaque, flags);
|
||||||
|
if (ret < 0) {
|
||||||
|
qemu_file_set_error(f, ret);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void ram_control_load_hook(QEMUFile *f, uint64_t flags)
|
||||||
|
{
|
||||||
|
int ret = 0;
|
||||||
|
|
||||||
|
if (f->ops->hook_ram_load) {
|
||||||
|
ret = f->ops->hook_ram_load(f, f->opaque, flags);
|
||||||
|
if (ret < 0) {
|
||||||
|
qemu_file_set_error(f, ret);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
qemu_file_set_error(f, ret);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset,
|
||||||
|
ram_addr_t offset, size_t size, int *bytes_sent)
|
||||||
|
{
|
||||||
|
if (f->ops->save_page) {
|
||||||
|
int ret = f->ops->save_page(f, f->opaque, block_offset,
|
||||||
|
offset, size, bytes_sent);
|
||||||
|
|
||||||
|
if (ret != RAM_SAVE_CONTROL_DELAYED) {
|
||||||
|
if (*bytes_sent > 0) {
|
||||||
|
qemu_update_position(f, *bytes_sent);
|
||||||
|
} else if (ret < 0) {
|
||||||
|
qemu_file_set_error(f, ret);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
return RAM_SAVE_CONTROL_NOT_SUPP;
|
||||||
|
}
|
||||||
|
|
||||||
static void qemu_fill_buffer(QEMUFile *f)
|
static void qemu_fill_buffer(QEMUFile *f)
|
||||||
{
|
{
|
||||||
int len;
|
int len;
|
||||||
|
@ -671,6 +708,11 @@ int qemu_get_fd(QEMUFile *f)
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void qemu_update_position(QEMUFile *f, size_t size)
|
||||||
|
{
|
||||||
|
f->pos += size;
|
||||||
|
}
|
||||||
|
|
||||||
/** Closes the file
|
/** Closes the file
|
||||||
*
|
*
|
||||||
* Returns negative error value if any error happened on previous operations or
|
* Returns negative error value if any error happened on previous operations or
|
||||||
|
|
Loading…
Reference in New Issue