Implement an fd pool to get real AIO with posix-aio

This patch implements a simple fd pool to allow many AIO requests with
posix-aio.  The result is significantly improved performance (identical to that
reported for linux-aio) for both cache=on and cache=off.

The fundamental problem with posix-aio is that it limits itself to one thread
per file descriptor.  I don't know why this is, but this patch provides a simple
mechanism to work around this (duplicating the file descriptor).

This isn't a great solution, but it seems like a reasonable intermediate step
between posix-aio and a custom thread-pool to replace it.

Ryan Harper will be posting some performance analysis he did comparing posix-aio
with fd pooling against linux-aio.  The sizes of the posix-aio thread pool and
the fd pool were largely determined by him based on this analysis.

Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>



git-svn-id: svn://svn.savannah.nongnu.org/qemu/trunk@5323 c046a42c-6fe2-441c-8c8c-71466251a162
This commit is contained in:
aliguori 2008-09-26 15:59:29 +00:00
parent 997306fc22
commit 5353872545
1 changed files with 65 additions and 3 deletions

View File

@ -84,10 +84,16 @@
reopen it to see if the disk has been changed */ reopen it to see if the disk has been changed */
#define FD_OPEN_TIMEOUT 1000 #define FD_OPEN_TIMEOUT 1000
/* posix-aio doesn't allow multiple outstanding requests to a single file
 * descriptor.  We implement a pool of dup()'d file descriptors to work
 * around this. */
#define RAW_FD_POOL_SIZE 64
typedef struct BDRVRawState { typedef struct BDRVRawState {
int fd; int fd;
int type; int type;
unsigned int lseek_err_cnt; unsigned int lseek_err_cnt;
int fd_pool[RAW_FD_POOL_SIZE];
#if defined(__linux__) #if defined(__linux__)
/* linux floppy specific */ /* linux floppy specific */
int fd_open_flags; int fd_open_flags;
@ -109,6 +115,7 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags)
{ {
BDRVRawState *s = bs->opaque; BDRVRawState *s = bs->opaque;
int fd, open_flags, ret; int fd, open_flags, ret;
int i;
posix_aio_init(); posix_aio_init();
@ -138,6 +145,8 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags)
return ret; return ret;
} }
s->fd = fd; s->fd = fd;
for (i = 0; i < RAW_FD_POOL_SIZE; i++)
s->fd_pool[i] = -1;
#if defined(O_DIRECT) #if defined(O_DIRECT)
s->aligned_buf = NULL; s->aligned_buf = NULL;
if (flags & BDRV_O_DIRECT) { if (flags & BDRV_O_DIRECT) {
@ -436,6 +445,7 @@ static int raw_pwrite(BlockDriverState *bs, int64_t offset,
typedef struct RawAIOCB { typedef struct RawAIOCB {
BlockDriverAIOCB common; BlockDriverAIOCB common;
int fd;
struct aiocb aiocb; struct aiocb aiocb;
struct RawAIOCB *next; struct RawAIOCB *next;
int ret; int ret;
@ -447,6 +457,38 @@ typedef struct PosixAioState
RawAIOCB *first_aio; RawAIOCB *first_aio;
} PosixAioState; } PosixAioState;
/*
 * Pick a file descriptor for a new AIO request.
 *
 * Scans the pool for the first free slot (marked -1) and fills it with a
 * dup() of s->fd.  Returns the duplicated descriptor on success.  If every
 * slot is taken, or dup() fails, returns s->fd itself so the request can
 * still be issued on the main descriptor.
 */
static int raw_fd_pool_get(BDRVRawState *s)
{
    int i;

    for (i = 0; i < RAW_FD_POOL_SIZE; i++) {
        /* already in use */
        if (s->fd_pool[i] != -1)
            continue;

        /* try to dup file descriptor */
        s->fd_pool[i] = dup(s->fd);
        if (s->fd_pool[i] != -1)
            return s->fd_pool[i];

        /* dup() failed and left the slot at -1; repeating the very same
         * call for each remaining free slot would only fail again */
        break;
    }

    /* pool is full or dup() failed, so just use the main descriptor */
    return s->fd;
}
/*
 * Release the descriptor recorded in acb->fd.
 *
 * If acb->fd is held in a slot of the owning BDRVRawState's pool, it is
 * closed and the slot is marked free (-1).  If it is not in the pool (for
 * example because the request used the main descriptor), nothing is closed.
 */
static void raw_fd_pool_put(RawAIOCB *acb)
{
    BDRVRawState *s = acb->common.bs->opaque;
    int i;

    /* -1 is the free-slot marker: a negative fd would "match" every empty
     * slot and issue a pointless close(-1) for each of them */
    if (acb->fd < 0)
        return;

    for (i = 0; i < RAW_FD_POOL_SIZE; i++) {
        if (s->fd_pool[i] == acb->fd) {
            close(s->fd_pool[i]);
            s->fd_pool[i] = -1;
            /* open descriptors are distinct, so no other slot can match */
            break;
        }
    }
}
static void posix_aio_read(void *opaque) static void posix_aio_read(void *opaque)
{ {
PosixAioState *s = opaque; PosixAioState *s = opaque;
@ -487,6 +529,7 @@ static void posix_aio_read(void *opaque)
if (ret == ECANCELED) { if (ret == ECANCELED) {
/* remove the request */ /* remove the request */
*pacb = acb->next; *pacb = acb->next;
raw_fd_pool_put(acb);
qemu_aio_release(acb); qemu_aio_release(acb);
} else if (ret != EINPROGRESS) { } else if (ret != EINPROGRESS) {
/* end of aio */ /* end of aio */
@ -503,6 +546,7 @@ static void posix_aio_read(void *opaque)
*pacb = acb->next; *pacb = acb->next;
/* call the callback */ /* call the callback */
acb->common.cb(acb->common.opaque, ret); acb->common.cb(acb->common.opaque, ret);
raw_fd_pool_put(acb);
qemu_aio_release(acb); qemu_aio_release(acb);
break; break;
} else { } else {
@ -577,7 +621,8 @@ static RawAIOCB *raw_aio_setup(BlockDriverState *bs,
acb = qemu_aio_get(bs, cb, opaque); acb = qemu_aio_get(bs, cb, opaque);
if (!acb) if (!acb)
return NULL; return NULL;
acb->aiocb.aio_fildes = s->fd; acb->fd = raw_fd_pool_get(s);
acb->aiocb.aio_fildes = acb->fd;
acb->aiocb.aio_sigevent.sigev_signo = SIGUSR2; acb->aiocb.aio_sigevent.sigev_signo = SIGUSR2;
acb->aiocb.aio_sigevent.sigev_notify = SIGEV_SIGNAL; acb->aiocb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
acb->aiocb.aio_buf = buf; acb->aiocb.aio_buf = buf;
@ -684,6 +729,7 @@ static void raw_aio_cancel(BlockDriverAIOCB *blockacb)
break; break;
} else if (*pacb == acb) { } else if (*pacb == acb) {
*pacb = acb->next; *pacb = acb->next;
raw_fd_pool_put(acb);
qemu_aio_release(acb); qemu_aio_release(acb);
break; break;
} }
@ -697,6 +743,18 @@ static int posix_aio_init(void)
} }
#endif /* CONFIG_AIO */ #endif /* CONFIG_AIO */
/*
 * Close every descriptor currently held in the pool and mark all of its
 * slots free (-1).  Slots that are already free are left alone.
 */
static void raw_close_fd_pool(BDRVRawState *s)
{
    int *slot = s->fd_pool;
    int *const end = s->fd_pool + RAW_FD_POOL_SIZE;

    for (; slot != end; slot++) {
        if (*slot == -1)
            continue;   /* free slot, nothing to close */

        close(*slot);
        *slot = -1;
    }
}
static void raw_close(BlockDriverState *bs) static void raw_close(BlockDriverState *bs)
{ {
BDRVRawState *s = bs->opaque; BDRVRawState *s = bs->opaque;
@ -708,6 +766,7 @@ static void raw_close(BlockDriverState *bs)
qemu_free(s->aligned_buf); qemu_free(s->aligned_buf);
#endif #endif
} }
raw_close_fd_pool(s);
} }
static int raw_truncate(BlockDriverState *bs, int64_t offset) static int raw_truncate(BlockDriverState *bs, int64_t offset)
@ -898,7 +957,7 @@ kern_return_t GetBSDPath( io_iterator_t mediaIterator, char *bsdPath, CFIndex ma
static int hdev_open(BlockDriverState *bs, const char *filename, int flags) static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
{ {
BDRVRawState *s = bs->opaque; BDRVRawState *s = bs->opaque;
int fd, open_flags, ret; int fd, open_flags, ret, i;
posix_aio_init(); posix_aio_init();
@ -963,6 +1022,8 @@ static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
return ret; return ret;
} }
s->fd = fd; s->fd = fd;
for (i = 0; i < RAW_FD_POOL_SIZE; i++)
s->fd_pool[i] = -1;
#if defined(__linux__) #if defined(__linux__)
/* close fd so that we can reopen it as needed */ /* close fd so that we can reopen it as needed */
if (s->type == FTYPE_FD) { if (s->type == FTYPE_FD) {
@ -975,7 +1036,6 @@ static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
} }
#if defined(__linux__) #if defined(__linux__)
/* Note: we do not have a reliable method to detect if the floppy is /* Note: we do not have a reliable method to detect if the floppy is
present. The current method is to try to open the floppy at every present. The current method is to try to open the floppy at every
I/O and to keep it opened during a few hundreds of ms. */ I/O and to keep it opened during a few hundreds of ms. */
@ -991,6 +1051,7 @@ static int fd_open(BlockDriverState *bs)
(qemu_get_clock(rt_clock) - s->fd_open_time) >= FD_OPEN_TIMEOUT) { (qemu_get_clock(rt_clock) - s->fd_open_time) >= FD_OPEN_TIMEOUT) {
close(s->fd); close(s->fd);
s->fd = -1; s->fd = -1;
raw_close_fd_pool(s);
#ifdef DEBUG_FLOPPY #ifdef DEBUG_FLOPPY
printf("Floppy closed\n"); printf("Floppy closed\n");
#endif #endif
@ -1091,6 +1152,7 @@ static int raw_eject(BlockDriverState *bs, int eject_flag)
if (s->fd >= 0) { if (s->fd >= 0) {
close(s->fd); close(s->fd);
s->fd = -1; s->fd = -1;
raw_close_fd_pool(s);
} }
fd = open(bs->filename, s->fd_open_flags | O_NONBLOCK); fd = open(bs->filename, s->fd_open_flags | O_NONBLOCK);
if (fd >= 0) { if (fd >= 0) {