RDMA queue

* Add support for RDMA MAD
  * Various fixes for the pvrdma backend
 -----BEGIN PGP SIGNATURE-----
 
 iQEcBAABAgAGBQJcHgWkAAoJEDbUwPDPL+RtuQcIAJk6BYbi/d5EG9rE3WmM3kOp
 1oh49tA3ahHPApSwc7J69M+j1MMELCUUzU/HUsd1DTn+uR219s5KO7O11f5pgRko
 KX+4kdWdRTumu2s51bR3yz3Alq1KjhtX8lSGchSCB/aV6o16Tt03HJcZegyeWtw1
 BgKkuuFz7lKzXw6tW3Q/F1GzYNRjHAizx5q6c2PI2lxpQ39jiFD0WQa5TnPGaW5E
 dQ0og+aIUKNDQxcX48PeW0Rv1aRRS8GNmkO8L7dYh1x4gZkFLd9SNQu55T+WtULW
 drwb887ROKv5OTnJ8l9c4BQ/eAYBegFcbGcXJl+Zr2ueGk+Rup5IVWOq/hhjOuw=
 =Wcc9
 -----END PGP SIGNATURE-----

Merge remote-tracking branch 'remotes/marcel/tags/rdma-pull-request' into staging

RDMA queue
 * Add support for RDMA MAD
 * Various fixes for the pvrdma backend

# gpg: Signature made Sat 22 Dec 2018 09:36:36 GMT
# gpg:                using RSA key 36D4C0F0CF2FE46D
# gpg: Good signature from "Marcel Apfelbaum <marcel.apfelbaum@zoho.com>"
# gpg:                 aka "Marcel Apfelbaum <marcel@redhat.com>"
# gpg:                 aka "Marcel Apfelbaum <marcel.apfelbaum@gmail.com>"
# gpg: WARNING: This key is not certified with sufficiently trusted signatures!
# gpg:          It is not certain that the signature belongs to the owner.
# Primary key fingerprint: B1C6 3A57 F92E 08F2 640F  31F5 36D4 C0F0 CF2F E46D

* remotes/marcel/tags/rdma-pull-request: (31 commits)
  pvrdma: check return value from pvrdma_idx_ring_has_ routines
  rdma: remove unused VENDOR_ERR_NO_SGE macro
  pvrdma: release ring object in case of an error
  pvrdma: check number of pages when creating rings
  pvrdma: add uar_read routine
  rdma: check num_sge does not exceed MAX_SGE
  pvrdma: release device resources in case of an error
  docs: Update pvrdma device documentation
  hw/rdma: Do not call rdma_backend_del_gid on an empty gid
  hw/rdma: Do not use bitmap_zero_extend to free bitmap
  hw/pvrdma: Clean device's resource when system is shutdown
  vl: Introduce shutdown_notifiers
  hw/rdma: Remove unneeded code that handles more that one port
  hw/pvrdma: Fill error code in command's response
  hw/pvrdma: Fill all CQE fields
  hw/pvrdma: Make device state depend on Ethernet function state
  hw/rdma: Initialize node_guid from vmxnet3 mac address
  hw/pvrdma: Make sure PCI function 0 is vmxnet3
  vmxnet3: Move some definitions to header file
  hw/pvrdma: Add support to allow guest to configure GID table
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
This commit is contained in:
Peter Maydell 2018-12-22 11:25:31 +00:00
commit 9b2e891ec5
25 changed files with 2080 additions and 416 deletions

View File

@ -2412,6 +2412,8 @@ S: Maintained
F: hw/rdma/*
F: hw/rdma/vmw/*
F: docs/pvrdma.txt
F: contrib/rdmacm-mux/*
F: qapi/rdma.json
Build and test automation
-------------------------

View File

@ -362,6 +362,7 @@ dummy := $(call unnest-vars,, \
elf2dmp-obj-y \
ivshmem-client-obj-y \
ivshmem-server-obj-y \
rdmacm-mux-obj-y \
libvhost-user-obj-y \
vhost-user-scsi-obj-y \
vhost-user-blk-obj-y \
@ -579,6 +580,8 @@ vhost-user-scsi$(EXESUF): $(vhost-user-scsi-obj-y) libvhost-user.a
$(call LINK, $^)
vhost-user-blk$(EXESUF): $(vhost-user-blk-obj-y) libvhost-user.a
$(call LINK, $^)
rdmacm-mux$(EXESUF): $(rdmacm-mux-obj-y) $(COMMON_LDADDS)
$(call LINK, $^)
module_block.h: $(SRC_PATH)/scripts/modules/module_block.py config-host.mak
$(call quiet-command,$(PYTHON) $< $@ \

View File

@ -1,5 +1,6 @@
QAPI_MODULES = block-core block char common crypto introspect job migration
QAPI_MODULES += misc net rocker run-state sockets tpm trace transaction ui
QAPI_MODULES += misc net rdma rocker run-state sockets tpm trace transaction
QAPI_MODULES += ui
#######################################################################
# Common libraries for tools and emulators
@ -133,6 +134,7 @@ vhost-user-scsi.o-cflags := $(LIBISCSI_CFLAGS)
vhost-user-scsi.o-libs := $(LIBISCSI_LIBS)
vhost-user-scsi-obj-y = contrib/vhost-user-scsi/
vhost-user-blk-obj-y = contrib/vhost-user-blk/
rdmacm-mux-obj-y = contrib/rdmacm-mux/
######################################################################
trace-events-subdirs =

View File

@ -0,0 +1,4 @@
ifdef CONFIG_PVRDMA
CFLAGS += -libumad -Wno-format-truncation
rdmacm-mux-obj-y = main.o
endif

798
contrib/rdmacm-mux/main.c Normal file
View File

@ -0,0 +1,798 @@
/*
* QEMU paravirtual RDMA - rdmacm-mux implementation
*
* Copyright (C) 2018 Oracle
* Copyright (C) 2018 Red Hat Inc
*
* Authors:
* Yuval Shaia <yuval.shaia@oracle.com>
* Marcel Apfelbaum <marcel@redhat.com>
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*
*/
#include "qemu/osdep.h"
#include "sys/poll.h"
#include "sys/ioctl.h"
#include "pthread.h"
#include "syslog.h"
#include "infiniband/verbs.h"
#include "infiniband/umad.h"
#include "infiniband/umad_types.h"
#include "infiniband/umad_sa.h"
#include "infiniband/umad_cm.h"
#include "rdmacm-mux.h"
#define SCALE_US 1000
#define COMMID_TTL 2 /* How many SCALE_US a context of MAD session is saved */
#define SLEEP_SECS 5 /* This is used both in poll() and thread */
#define SERVER_LISTEN_BACKLOG 10
#define MAX_CLIENTS 4096
#define MAD_RMPP_VERSION 0
#define MAD_METHOD_MASK0 0x8
#define IB_USER_MAD_LONGS_PER_METHOD_MASK (128 / (8 * sizeof(long)))
#define CM_REQ_DGID_POS 80
#define CM_SIDR_REQ_DGID_POS 44
/* The below can be overridden by command line parameters */
#define UNIX_SOCKET_PATH "/var/run/rdmacm-mux"
#define RDMA_PORT_NUM 1
/* Command-line configuration of the mux service (see parse_args()) */
typedef struct RdmaCmServerArgs {
char unix_socket_path[PATH_MAX];
char rdma_dev_name[NAME_MAX];
int rdma_port_num;
} RdmaCmServerArgs;
/* Value stored in the commid2fd hash table: which client socket owns a
 * given RDMA-CM comm_id, and how long to keep the mapping alive. */
typedef struct CommId2FdEntry {
int fd;
int ttl; /* Initialized to 2, decrement each timeout, entry delete when 0 */
__be64 gid_ifid;
} CommId2FdEntry;
/* State of the single UMAD agent this process registers with the kernel */
typedef struct RdmaCmUMadAgent {
int port_id;
int agent_id;
GHashTable *gid2fd; /* Used to find fd of a given gid */
GHashTable *commid2fd; /* Used to find fd on of a given comm_id */
} RdmaCmUMadAgent;
/* Whole-process state: the poll set of client sockets (slot 0 is the
 * listener), the UMAD agent and the rwlock guarding the hash tables. */
typedef struct RdmaCmServer {
bool run;
RdmaCMServerArgs args;
struct pollfd fds[MAX_CLIENTS];
int nfds;
RdmaCmUMadAgent umad_agent;
pthread_t umad_recv_thread;
pthread_rwlock_t lock;
} RdmaCMServer;
/* Single global instance; the process serves exactly one RDMA device/port */
static RdmaCMServer server = {0};
/* Print command-line help for the tool to stdout. */
static void usage(const char *progname)
{
printf("Usage: %s [OPTION]...\n"
"Start a RDMA-CM multiplexer\n"
"\n"
"\t-h Show this help\n"
"\t-d rdma-device-name Name of RDMA device to register with\n"
"\t-s unix-socket-path Path to unix socket to listen on (default %s)\n"
"\t-p rdma-device-port Port number of RDMA device to register with (default %d)\n",
progname, UNIX_SOCKET_PATH, RDMA_PORT_NUM);
}
/* Short hint printed on stderr when the command line is invalid. */
static void help(const char *progname)
{
fprintf(stderr, "Try '%s -h' for more information.\n", progname);
}
/*
 * Parse command-line arguments into the global server.args.
 *
 * Options:
 *   -h  print usage and exit(0)
 *   -d  RDMA device name (mandatory)
 *   -s  unix socket path prefix (default UNIX_SOCKET_PATH)
 *   -p  RDMA device port number (default RDMA_PORT_NUM)
 *
 * Exits the process on -h, on an unknown option, or when -d is missing.
 * The final socket path is "<prefix>-<dev>-<port>" so several mux
 * instances for different devices/ports can coexist.
 */
static void parse_args(int argc, char *argv[])
{
    int c;
    char unix_socket_path[PATH_MAX];

    strcpy(server.args.rdma_dev_name, "");
    strcpy(unix_socket_path, UNIX_SOCKET_PATH);
    server.args.rdma_port_num = RDMA_PORT_NUM;

    while ((c = getopt(argc, argv, "hs:d:p:")) != -1) {
        switch (c) {
        case 'h':
            usage(argv[0]);
            exit(0);

        case 'd':
            /* Bug fix: strncpy(dst, src, NAME_MAX - 1) does not guarantee
             * NUL termination when optarg is long; snprintf always does. */
            snprintf(server.args.rdma_dev_name, NAME_MAX, "%s", optarg);
            break;

        case 's':
            /* Bug fix: strncpy with the full PATH_MAX could leave the
             * buffer unterminated. This is temporary, the final name is
             * built below. */
            snprintf(unix_socket_path, PATH_MAX, "%s", optarg);
            break;

        case 'p':
            server.args.rdma_port_num = atoi(optarg);
            break;

        default:
            help(argv[0]);
            exit(1);
        }
    }

    if (!strcmp(server.args.rdma_dev_name, "")) {
        fprintf(stderr, "Missing RDMA device name\n");
        help(argv[0]);
        exit(1);
    }

    /* Build unique unix-socket file name */
    snprintf(server.args.unix_socket_path, PATH_MAX, "%s-%s-%d",
             unix_socket_path, server.args.rdma_dev_name,
             server.args.rdma_port_num);

    syslog(LOG_INFO, "unix_socket_path=%s", server.args.unix_socket_path);
    syslog(LOG_INFO, "rdma-device-name=%s", server.args.rdma_dev_name);
    syslog(LOG_INFO, "rdma-device-port=%d", server.args.rdma_port_num);
}
static void hash_tbl_alloc(void)
{
server.umad_agent.gid2fd = g_hash_table_new_full(g_int64_hash,
g_int64_equal,
g_free, g_free);
server.umad_agent.commid2fd = g_hash_table_new_full(g_int_hash,
g_int_equal,
g_free, g_free);
}
/*
 * Destroy both lookup tables; keys and values are released by the destroy
 * notifiers registered in hash_tbl_alloc().
 * Improvement: reset the pointers to NULL so a repeated call (e.g. from a
 * second cleanup path) is a harmless no-op instead of a double destroy.
 */
static void hash_tbl_free(void)
{
    if (server.umad_agent.commid2fd) {
        g_hash_table_destroy(server.umad_agent.commid2fd);
        server.umad_agent.commid2fd = NULL;
    }
    if (server.umad_agent.gid2fd) {
        g_hash_table_destroy(server.umad_agent.gid2fd);
        server.umad_agent.gid2fd = NULL;
    }
}
/*
 * Look up the socket fd registered for a 64-bit GID interface-id.
 * Caller must hold server.lock (any mode).
 *
 * NOTE: on a miss the function retries after OR-ing the IPv4-mapped bits
 * into *gid_ifid, i.e. it MODIFIES the caller's gid in place.
 * Returns the fd, or 0 when no entry exists.
 */
static int _hash_tbl_search_fd_by_ifid(__be64 *gid_ifid)
{
int *fd;
fd = g_hash_table_lookup(server.umad_agent.gid2fd, gid_ifid);
if (!fd) {
/* Let's try IPv4 */
*gid_ifid |= 0x00000000ffff0000;
fd = g_hash_table_lookup(server.umad_agent.gid2fd, gid_ifid);
}
return fd ? *fd : 0;
}
/*
 * Thread-safe lookup of the socket fd registered for a GID interface-id.
 * On success *fd holds the socket and 0 is returned; -ENOENT otherwise.
 * (*gid_ifid may be rewritten to its IPv4-mapped form by the helper.)
 */
static int hash_tbl_search_fd_by_ifid(int *fd, __be64 *gid_ifid)
{
    pthread_rwlock_rdlock(&server.lock);
    *fd = _hash_tbl_search_fd_by_ifid(gid_ifid);
    pthread_rwlock_unlock(&server.lock);

    /* Bug fix: the original tested the pointer 'fd' (a caller-supplied
     * address, never NULL) instead of the looked-up value '*fd', so the
     * not-found case could never be reported. */
    if (!*fd) {
        /* Cast keeps the %llx format well-defined regardless of how the
         * platform defines uint64_t. */
        syslog(LOG_WARNING, "Can't find matching for ifid 0x%llx\n",
               (unsigned long long)*gid_ifid);
        return -ENOENT;
    }

    return 0;
}
/*
 * Thread-safe lookup of the socket fd (and the GID it registered with)
 * recorded for the given RDMA-CM comm_id.
 * Returns 0 on success, -ENOENT when no entry exists.
 */
static int hash_tbl_search_fd_by_comm_id(uint32_t comm_id, int *fd,
                                         __be64 *gid_idid)
{
    CommId2FdEntry *entry;

    pthread_rwlock_rdlock(&server.lock);
    entry = g_hash_table_lookup(server.umad_agent.commid2fd, &comm_id);
    pthread_rwlock_unlock(&server.lock);

    if (!entry) {
        syslog(LOG_WARNING, "Can't find matching for comm_id 0x%x\n", comm_id);
        return -ENOENT;
    }

    *fd = entry->fd;
    *gid_idid = entry->gid_ifid;

    return 0;
}
/*
 * Register a client socket as the owner of a GID interface-id.
 * Returns EEXIST when this fd already registered the gid, EACCES when a
 * DIFFERENT fd owns it, OK otherwise. Keys/values are heap copies owned
 * by the hash table (freed by the g_free destroy notifiers).
 */
static RdmaCmMuxErrCode add_fd_ifid_pair(int fd, __be64 gid_ifid)
{
int fd1;
pthread_rwlock_wrlock(&server.lock);
/* NOTE: the helper may rewrite gid_ifid to its IPv4-mapped form; the
 * rewritten value is also what gets inserted below. */
fd1 = _hash_tbl_search_fd_by_ifid(&gid_ifid);
if (fd1) { /* record already exist - an error */
pthread_rwlock_unlock(&server.lock);
return fd == fd1 ? RDMACM_MUX_ERR_CODE_EEXIST :
RDMACM_MUX_ERR_CODE_EACCES;
}
/* Insert heap copies so the entry outlives this stack frame */
g_hash_table_insert(server.umad_agent.gid2fd, g_memdup(&gid_ifid,
sizeof(gid_ifid)), g_memdup(&fd, sizeof(fd)));
pthread_rwlock_unlock(&server.lock);
syslog(LOG_INFO, "0x%lx registered on socket %d",
be64toh((uint64_t)gid_ifid), fd);
return RDMACM_MUX_ERR_CODE_OK;
}
/*
 * Unregister the socket fd bound to the given GID interface-id.
 * Returns RDMACM_MUX_ERR_CODE_ENOTFOUND when no such registration exists.
 */
static RdmaCmMuxErrCode delete_fd_ifid_pair(int fd, __be64 gid_ifid)
{
    int fd1;

    pthread_rwlock_wrlock(&server.lock);

    /* May rewrite gid_ifid to its IPv4-mapped form; removal below then
     * targets the same (rewritten) key that was found. */
    fd1 = _hash_tbl_search_fd_by_ifid(&gid_ifid);
    if (!fd1) { /* record not exist - an error */
        pthread_rwlock_unlock(&server.lock);
        return RDMACM_MUX_ERR_CODE_ENOTFOUND;
    }

    /* Bug fix: g_hash_table_remove() only needs a lookup key and does not
     * take ownership of it; the original passed a g_memdup()ed copy that
     * was never freed, leaking it on every unregister. */
    g_hash_table_remove(server.umad_agent.gid2fd, &gid_ifid);
    pthread_rwlock_unlock(&server.lock);

    syslog(LOG_INFO, "0x%lx unregistered on socket %d",
           be64toh((uint64_t)gid_ifid), fd);

    return RDMACM_MUX_ERR_CODE_OK;
}
/*
 * Remember which client socket originated a CM exchange so that later
 * messages carrying only the comm_id (REP/REJ/DREQ/...) can be routed
 * back to it. The entry starts with COMMID_TTL and is aged out by
 * remove_old_comm_ids(). Key and value are heap copies owned by the table.
 */
static void hash_tbl_save_fd_comm_id_pair(int fd, uint32_t comm_id,
uint64_t gid_ifid)
{
CommId2FdEntry fde = {fd, COMMID_TTL, gid_ifid};
pthread_rwlock_wrlock(&server.lock);
g_hash_table_insert(server.umad_agent.commid2fd,
g_memdup(&comm_id, sizeof(comm_id)),
g_memdup(&fde, sizeof(fde)));
pthread_rwlock_unlock(&server.lock);
}
/*
 * GHRFunc for g_hash_table_foreach_remove(): age a comm_id entry by one
 * tick and request its removal once its TTL has already reached zero.
 */
static gboolean remove_old_comm_ids(gpointer key, gpointer value,
                                    gpointer user_data)
{
    CommId2FdEntry *entry = value;
    gboolean expired = (entry->ttl == 0);

    entry->ttl--;

    return expired;
}
/*
 * GHRFunc for g_hash_table_foreach_remove(): drop every gid2fd entry
 * whose fd matches *user_data (used when a client socket closes).
 */
static gboolean remove_entry_from_gid2fd(gpointer key, gpointer value,
gpointer user_data)
{
if (*(int *)value == *(int *)user_data) {
syslog(LOG_INFO, "0x%lx unregistered on socket %d",
be64toh(*(uint64_t *)key), *(int *)value);
return true;
}
return false;
}
/*
 * Remove every GID registration owned by the given socket fd.
 * Called when a client disconnects, before its pollfd slot is recycled.
 */
static void hash_tbl_remove_fd_ifid_pair(int fd)
{
pthread_rwlock_wrlock(&server.lock);
g_hash_table_foreach_remove(server.umad_agent.gid2fd,
remove_entry_from_gid2fd, (gpointer)&fd);
pthread_rwlock_unlock(&server.lock);
}
/*
 * Resolve which client socket an incoming MAD belongs to.
 *
 * For connection-establishing attributes (REQ / SIDR_REQ) the destination
 * GID is embedded in the MAD payload and is used for the lookup; for the
 * rest of the CM handshake only the comm_id is available, so the mapping
 * stored by hash_tbl_save_fd_comm_id_pair() is consulted instead.
 *
 * On success *fd and *gid_ifid are filled and 0 is returned; -ENOENT when
 * no mapping exists, -EINVAL for an unsupported attribute.
 */
static int get_fd(const char *mad, int *fd, __be64 *gid_ifid)
{
struct umad_hdr *hdr = (struct umad_hdr *)mad;
char *data = (char *)hdr + sizeof(*hdr);
int32_t comm_id = 0;
uint16_t attr_id = be16toh(hdr->attr_id);
int rc = 0;
switch (attr_id) {
case UMAD_CM_ATTR_REQ:
/* DGID lives at a fixed offset inside the CM REQ payload */
memcpy(gid_ifid, data + CM_REQ_DGID_POS, sizeof(*gid_ifid));
rc = hash_tbl_search_fd_by_ifid(fd, gid_ifid);
break;
case UMAD_CM_ATTR_SIDR_REQ:
memcpy(gid_ifid, data + CM_SIDR_REQ_DGID_POS, sizeof(*gid_ifid));
rc = hash_tbl_search_fd_by_ifid(fd, gid_ifid);
break;
case UMAD_CM_ATTR_REP:
/* Fall through */
case UMAD_CM_ATTR_REJ:
/* Fall through */
case UMAD_CM_ATTR_DREQ:
/* Fall through */
case UMAD_CM_ATTR_DREP:
/* Fall through */
case UMAD_CM_ATTR_RTU:
/* These attributes carry the local comm_id first; skip it so 'data'
 * points at the remote comm_id, same position as in SIDR_REP. */
data += sizeof(comm_id);
/* Fall through */
case UMAD_CM_ATTR_SIDR_REP:
memcpy(&comm_id, data, sizeof(comm_id));
if (comm_id) {
rc = hash_tbl_search_fd_by_comm_id(comm_id, fd, gid_ifid);
}
break;
default:
rc = -EINVAL;
syslog(LOG_WARNING, "Unsupported attr_id 0x%x\n", attr_id);
}
/* NOTE(review): *fd is logged even on the error paths above, where it
 * still holds the caller's previous value — confirm intent. */
syslog(LOG_DEBUG, "mad_to_vm: %d 0x%x 0x%x\n", *fd, attr_id, comm_id);
return rc;
}
/*
 * Receiver thread: blocks in umad_recv() and forwards each incoming MAD
 * to the VM whose socket is resolved by get_fd(). The recv timeout also
 * serves as the periodic tick that ages out stale comm_id entries.
 *
 * NOTE(review): commid2fd is iterated here without taking server.lock,
 * while other paths insert under the write lock — confirm this is safe.
 */
static void *umad_recv_thread_func(void *args)
{
    int rc;
    RdmaCmMuxMsg msg = {0};
    int fd = -2;

    msg.hdr.msg_type = RDMACM_MUX_MSG_TYPE_REQ;
    msg.hdr.op_code = RDMACM_MUX_OP_CODE_MAD;

    while (server.run) {
        do {
            msg.umad_len = sizeof(msg.umad.mad);
            rc = umad_recv(server.umad_agent.port_id, &msg.umad, &msg.umad_len,
                           SLEEP_SECS * SCALE_US);
            if ((rc == -EIO) || (rc == -EINVAL)) {
                syslog(LOG_CRIT, "Fatal error while trying to read MAD");
            }

            if (rc == -ETIMEDOUT) {
                g_hash_table_foreach_remove(server.umad_agent.commid2fd,
                                            remove_old_comm_ids, NULL);
            }
        } while (rc && server.run);

        if (server.run) {
            rc = get_fd(msg.umad.mad, &fd, &msg.hdr.sgid.global.interface_id);
            if (rc) {
                continue;
            }
            /* Bug fix: the original discarded send()'s return value, so a
             * short write or a dead socket went completely unnoticed. */
            if (send(fd, &msg, sizeof(msg), 0) != sizeof(msg)) {
                syslog(LOG_ERR, "Fail to forward MAD to socket %d\n", fd);
            }
        }
    }

    return NULL;
}
static int read_and_process(int fd)
{
int rc;
RdmaCmMuxMsg msg = {0};
struct umad_hdr *hdr;
uint32_t *comm_id = 0;
uint16_t attr_id;
rc = recv(fd, &msg, sizeof(msg), 0);
syslog(LOG_DEBUG, "Socket %d, recv %d\n", fd, rc);
if (rc < 0 && errno != EWOULDBLOCK) {
syslog(LOG_ERR, "Fail to read from socket %d\n", fd);
return -EIO;
}
if (!rc) {
syslog(LOG_ERR, "Fail to read from socket %d\n", fd);
return -EPIPE;
}
if (msg.hdr.msg_type != RDMACM_MUX_MSG_TYPE_REQ) {
syslog(LOG_WARNING, "Got non-request message (%d) from socket %d\n",
msg.hdr.msg_type, fd);
return -EPERM;
}
switch (msg.hdr.op_code) {
case RDMACM_MUX_OP_CODE_REG:
rc = add_fd_ifid_pair(fd, msg.hdr.sgid.global.interface_id);
break;
case RDMACM_MUX_OP_CODE_UNREG:
rc = delete_fd_ifid_pair(fd, msg.hdr.sgid.global.interface_id);
break;
case RDMACM_MUX_OP_CODE_MAD:
/* If this is REQ or REP then store the pair comm_id,fd to be later
* used for other messages where gid is unknown */
hdr = (struct umad_hdr *)msg.umad.mad;
attr_id = be16toh(hdr->attr_id);
if ((attr_id == UMAD_CM_ATTR_REQ) || (attr_id == UMAD_CM_ATTR_DREQ) ||
(attr_id == UMAD_CM_ATTR_SIDR_REQ) ||
(attr_id == UMAD_CM_ATTR_REP) || (attr_id == UMAD_CM_ATTR_DREP)) {
comm_id = (uint32_t *)(msg.umad.mad + sizeof(*hdr));
hash_tbl_save_fd_comm_id_pair(fd, *comm_id,
msg.hdr.sgid.global.interface_id);
}
syslog(LOG_DEBUG, "vm_to_mad: %d 0x%x 0x%x\n", fd, attr_id,
comm_id ? *comm_id : 0);
rc = umad_send(server.umad_agent.port_id, server.umad_agent.agent_id,
&msg.umad, msg.umad_len, 1, 0);
if (rc) {
syslog(LOG_ERR,
"Fail to send MAD message (0x%x) from socket %d, err=%d",
attr_id, fd, rc);
}
break;
default:
syslog(LOG_ERR, "Got invalid op_code (%d) from socket %d",
msg.hdr.msg_type, fd);
rc = RDMACM_MUX_ERR_CODE_EINVAL;
}
msg.hdr.msg_type = RDMACM_MUX_MSG_TYPE_RESP;
msg.hdr.err_code = rc;
rc = send(fd, &msg, sizeof(msg), 0);
return rc == sizeof(msg) ? 0 : -EPIPE;
}
/*
 * Drain the (non-blocking) listening socket: accept every pending client
 * and append it to the poll set with POLLIN armed.
 * Returns 0 on success, -EIO on accept() failure or when the client
 * table is full. (Also removes a stray double semicolon in the original
 * declaration line.)
 */
static int accept_all(void)
{
    int fd, rc = 0;

    pthread_rwlock_wrlock(&server.lock);

    do {
        if ((server.nfds + 1) > MAX_CLIENTS) {
            syslog(LOG_WARNING, "Too many clients (%d)", server.nfds);
            rc = -EIO;
            goto out;
        }

        fd = accept(server.fds[0].fd, NULL, NULL);
        if (fd < 0) {
            if (errno != EWOULDBLOCK) {
                syslog(LOG_WARNING, "accept() failed");
                rc = -EIO;
                goto out;
            }
            /* EWOULDBLOCK: no more pending connections */
            break;
        }

        syslog(LOG_INFO, "Client connected on socket %d\n", fd);
        server.fds[server.nfds].fd = fd;
        server.fds[server.nfds].events = POLLIN;
        server.nfds++;
    } while (fd != -1);

out:
    pthread_rwlock_unlock(&server.lock);
    return rc;
}
/*
 * Compact server.fds[] by dropping entries whose fd was zeroed by
 * close_fd(). Slot 0 (the listening socket) is never touched.
 *
 * Bug fix: the original shifted the array in place inside a forward
 * scan without revisiting the shifted-in element, so two ADJACENT closed
 * sockets left the second one in the table. Rewritten as a single-pass
 * stable compaction, which cannot miss entries.
 */
static void compress_fds(void)
{
    int i;
    int kept = 1; /* slot 0 is the listener and is always kept */

    pthread_rwlock_wrlock(&server.lock);

    for (i = 1; i < server.nfds; i++) {
        if (server.fds[i].fd) {
            server.fds[kept++] = server.fds[i];
        }
    }

    server.nfds = kept;

    pthread_rwlock_unlock(&server.lock);
}
/*
 * Close the client socket at poll-set index idx, purge its GID
 * registrations, and mark the slot free by zeroing the fd (the slot is
 * reclaimed later by compress_fds()).
 */
static void close_fd(int idx)
{
close(server.fds[idx].fd);
/* NOTE(review): the fd is logged after close(); value is still valid as
 * a number, but the descriptor itself is already gone. */
syslog(LOG_INFO, "Socket %d closed\n", server.fds[idx].fd);
hash_tbl_remove_fd_ifid_pair(server.fds[idx].fd);
server.fds[idx].fd = 0;
}
/*
 * Main poll loop. Slot 0 of server.fds is the listener: POLLIN there
 * means new clients to accept; POLLIN on any other slot means a client
 * request to process. Any other revents on a client closes it; closed
 * slots are compacted once per loop iteration.
 */
static void run(void)
{
int rc, nfds, i;
bool compress = false;
syslog(LOG_INFO, "Service started");
while (server.run) {
rc = poll(server.fds, server.nfds, SLEEP_SECS * SCALE_US);
if (rc < 0) {
/* EINTR is routine (signals); anything else is worth a warning */
if (errno != EINTR) {
syslog(LOG_WARNING, "poll() failed");
}
continue;
}
if (rc == 0) {
/* timeout, nothing ready */
continue;
}
/* Snapshot nfds: accept_all() may grow it mid-iteration */
nfds = server.nfds;
for (i = 0; i < nfds; i++) {
syslog(LOG_DEBUG, "pollfd[%d]: revents 0x%x, events 0x%x\n", i,
server.fds[i].revents, server.fds[i].events);
if (server.fds[i].revents == 0) {
continue;
}
if (server.fds[i].revents != POLLIN) {
if (i == 0) {
/* Never close the listener; just report the oddity */
syslog(LOG_NOTICE, "Unexpected poll() event (0x%x)\n",
server.fds[i].revents);
} else {
close_fd(i);
compress = true;
}
continue;
}
if (i == 0) {
rc = accept_all();
if (rc) {
continue;
}
} else {
rc = read_and_process(server.fds[i].fd);
if (rc) {
close_fd(i);
compress = true;
}
}
}
/* Reclaim slots of any sockets closed during this iteration */
if (compress) {
compress = false;
compress_fds();
}
}
}
/*
 * Close every socket in the poll set (clients and, last, the listener at
 * slot 0) and remove the unix socket file. No-op when the listener was
 * never created.
 */
static void fini_listener(void)
{
int i;
if (server.fds[0].fd <= 0) {
return;
}
for (i = server.nfds - 1; i >= 0; i--) {
if (server.fds[i].fd) {
close(server.fds[i].fd);
}
}
unlink(server.args.unix_socket_path);
}
/*
 * Tear down the UMAD side: unregister the agent, close the port and free
 * the routing hash tables. Safe to call when init_umad() partially (or
 * never) succeeded, since each step is guarded.
 */
static void fini_umad(void)
{
if (server.umad_agent.agent_id) {
umad_unregister(server.umad_agent.port_id, server.umad_agent.agent_id);
}
if (server.umad_agent.port_id) {
umad_close_port(server.umad_agent.port_id);
}
hash_tbl_free();
}
/*
 * Full service teardown: join the receiver thread (it exits once
 * server.run is false), then release UMAD and listener resources and the
 * rwlock. Called both on normal exit and from the SIGINT handler.
 */
static void fini(void)
{
if (server.umad_recv_thread) {
pthread_join(server.umad_recv_thread, NULL);
server.umad_recv_thread = 0;
}
fini_umad();
fini_listener();
pthread_rwlock_destroy(&server.lock);
syslog(LOG_INFO, "Service going down");
}
/*
 * Create, configure and bind the non-blocking UNIX listening socket at
 * server.args.unix_socket_path and install it as server.fds[0].
 * Returns 0 on success or a negative errno value; on failure the socket
 * (if created) is closed.
 */
static int init_listener(void)
{
    /* Bug fix: 'sun' was left uninitialized, so bind() was handed stack
     * garbage in the padding beyond the path's terminator. */
    struct sockaddr_un sun = {0};
    int rc, on = 1;

    server.fds[0].fd = socket(AF_UNIX, SOCK_STREAM, 0);
    if (server.fds[0].fd < 0) {
        syslog(LOG_ALERT, "socket() failed");
        return -EIO;
    }

    rc = setsockopt(server.fds[0].fd, SOL_SOCKET, SO_REUSEADDR, (char *)&on,
                    sizeof(on));
    if (rc < 0) {
        syslog(LOG_ALERT, "setsockopt() failed");
        rc = -EIO;
        goto err;
    }

    /* Make accept() on the listener non-blocking */
    rc = ioctl(server.fds[0].fd, FIONBIO, (char *)&on);
    if (rc < 0) {
        syslog(LOG_ALERT, "ioctl() failed");
        rc = -EIO;
        goto err;
    }

    if (strlen(server.args.unix_socket_path) >= sizeof(sun.sun_path)) {
        /* Bug fix: sizeof yields size_t; %ld was an undefined-behavior
         * format mismatch — %zu is the correct specifier. */
        syslog(LOG_ALERT,
               "Invalid unix_socket_path, size must be less than %zu\n",
               sizeof(sun.sun_path));
        rc = -EINVAL;
        goto err;
    }

    sun.sun_family = AF_UNIX;
    rc = snprintf(sun.sun_path, sizeof(sun.sun_path), "%s",
                  server.args.unix_socket_path);
    if (rc < 0 || rc >= sizeof(sun.sun_path)) {
        syslog(LOG_ALERT, "Could not copy unix socket path\n");
        rc = -EINVAL;
        goto err;
    }

    rc = bind(server.fds[0].fd, (struct sockaddr *)&sun, sizeof(sun));
    if (rc < 0) {
        syslog(LOG_ALERT, "bind() failed");
        rc = -EIO;
        goto err;
    }

    rc = listen(server.fds[0].fd, SERVER_LISTEN_BACKLOG);
    if (rc < 0) {
        syslog(LOG_ALERT, "listen() failed");
        rc = -EIO;
        goto err;
    }

    server.fds[0].events = POLLIN;
    server.nfds = 1;
    server.run = true;

    return 0;

err:
    close(server.fds[0].fd);
    return rc;
}
/*
 * Open the RDMA device's UMAD port and register as a CM-class agent so
 * this process receives RDMA-CM MADs. Also allocates the routing hash
 * tables. Returns 0 on success, -EIO on any libibumad failure.
 */
static int init_umad(void)
{
long method_mask[IB_USER_MAD_LONGS_PER_METHOD_MASK];
server.umad_agent.port_id = umad_open_port(server.args.rdma_dev_name,
server.args.rdma_port_num);
if (server.umad_agent.port_id < 0) {
syslog(LOG_WARNING, "umad_open_port() failed");
return -EIO;
}
/* Only the methods selected by MAD_METHOD_MASK0 are delivered to us */
memset(&method_mask, 0, sizeof(method_mask));
method_mask[0] = MAD_METHOD_MASK0;
server.umad_agent.agent_id = umad_register(server.umad_agent.port_id,
UMAD_CLASS_CM,
UMAD_SA_CLASS_VERSION,
MAD_RMPP_VERSION, method_mask);
if (server.umad_agent.agent_id < 0) {
syslog(LOG_WARNING, "umad_register() failed");
/* NOTE(review): the port opened above is not closed on this path;
 * cleanup relies on fini_umad() being called by the caller chain. */
return -EIO;
}
hash_tbl_alloc();
return 0;
}
/*
 * SIGINT handler: refuse the first Ctrl-C while clients are connected
 * (logs a warning instead); a second SIGINT forces shutdown regardless.
 *
 * NOTE(review): syslog(), fini() (pthread_join etc.) and exit() are not
 * async-signal-safe; calling them from a handler is technically UB —
 * consider a self-pipe / flag-only handler. Left as-is here.
 */
static void signal_handler(int sig, siginfo_t *siginfo, void *context)
{
static bool warned;
/* Prevent stop if clients are connected */
if (server.nfds != 1) {
if (!warned) {
syslog(LOG_WARNING,
"Can't stop while active client exist, resend SIGINT to overid");
warned = true;
return;
}
}
if (sig == SIGINT) {
server.run = false;
fini();
}
exit(0);
}
/*
 * Bring the whole service up: listener socket, UMAD agent, hash-table
 * lock, receiver thread and SIGINT handler — in that order. Returns 0 on
 * success or the first failing step's error code; partial state is
 * cleaned up by fini() in main()'s error path.
 */
static int init(void)
{
int rc;
struct sigaction sig = {0};
rc = init_listener();
if (rc) {
return rc;
}
rc = init_umad();
if (rc) {
return rc;
}
/* Lock must exist before the receiver thread starts using the tables */
pthread_rwlock_init(&server.lock, 0);
rc = pthread_create(&server.umad_recv_thread, NULL, umad_recv_thread_func,
NULL);
if (rc) {
syslog(LOG_ERR, "Fail to create UMAD receiver thread (%d)\n", rc);
return rc;
}
sig.sa_sigaction = &signal_handler;
sig.sa_flags = SA_SIGINFO;
rc = sigaction(SIGINT, &sig, NULL);
if (rc < 0) {
syslog(LOG_ERR, "Fail to install SIGINT handler (%d)\n", errno);
return rc;
}
return 0;
}
/*
 * Entry point: parse arguments, bring the service up, run the poll loop,
 * then tear everything down (fini() also runs after a failed init, where
 * it safely skips whatever was never created).
 */
int main(int argc, char *argv[])
{
    int rc;

    memset(&server, 0, sizeof(server));

    parse_args(argc, argv);

    rc = init();
    if (rc) {
        syslog(LOG_ERR, "Fail to initialize server (%d)\n", rc);
        rc = -EAGAIN;
    } else {
        run();
    }

    fini();

    return rc;
}

View File

@ -0,0 +1,61 @@
/*
* QEMU paravirtual RDMA - rdmacm-mux declarations
*
* Copyright (C) 2018 Oracle
* Copyright (C) 2018 Red Hat Inc
*
* Authors:
* Yuval Shaia <yuval.shaia@oracle.com>
* Marcel Apfelbaum <marcel@redhat.com>
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*
*/
#ifndef RDMACM_MUX_H
#define RDMACM_MUX_H
#include "linux/if.h"
#include "infiniband/verbs.h"
#include "infiniband/umad.h"
#include "rdma/rdma_user_cm.h"
/* Direction of a message on the mux's unix socket */
typedef enum RdmaCmMuxMsgType {
RDMACM_MUX_MSG_TYPE_REQ = 0,
RDMACM_MUX_MSG_TYPE_RESP = 1,
} RdmaCmMuxMsgType;
/* Operation requested by a client (QEMU pvrdma device) */
typedef enum RdmaCmMuxOpCode {
RDMACM_MUX_OP_CODE_REG = 0,
RDMACM_MUX_OP_CODE_UNREG = 1,
RDMACM_MUX_OP_CODE_MAD = 2,
} RdmaCmMuxOpCode;
/* Result carried back in the response header's err_code field */
typedef enum RdmaCmMuxErrCode {
RDMACM_MUX_ERR_CODE_OK = 0,
RDMACM_MUX_ERR_CODE_EINVAL = 1,
RDMACM_MUX_ERR_CODE_EEXIST = 2,
RDMACM_MUX_ERR_CODE_EACCES = 3,
RDMACM_MUX_ERR_CODE_ENOTFOUND = 4,
} RdmaCmMuxErrCode;
/* Wire header preceding every message in both directions.
 * NOTE(review): the typedef name (RdmaCmUHdr) does not match the struct
 * tag (RdmaCmMuxHdr) — confirm whether this asymmetry is intentional. */
typedef struct RdmaCmMuxHdr {
RdmaCmMuxMsgType msg_type;
RdmaCmMuxOpCode op_code;
union ibv_gid sgid;
RdmaCmMuxErrCode err_code;
} RdmaCmUHdr;
/* A user-space MAD: kernel ib_user_mad header plus the MAD payload */
typedef struct RdmaCmUMad {
struct ib_user_mad hdr;
char mad[RDMA_MAX_PRIVATE_DATA];
} RdmaCmUMad;
/* Complete wire message: header, payload length, then the MAD itself */
typedef struct RdmaCmMuxMsg {
RdmaCmUHdr hdr;
int umad_len;
RdmaCmUMad umad;
} RdmaCmMuxMsg;
#endif

View File

@ -9,8 +9,9 @@ It works with its Linux Kernel driver AS IS, no need for any special guest
modifications.
While it complies with the VMware device, it can also communicate with bare
metal RDMA-enabled machines and does not require an RDMA HCA in the host, it
can work with Soft-RoCE (rxe).
metal RDMA-enabled machines as peers.
It does not require an RDMA HCA in the host, it can work with Soft-RoCE (rxe).
It does not require the whole guest RAM to be pinned allowing memory
over-commit and, even if not implemented yet, migration support will be
@ -78,29 +79,116 @@ the required RDMA libraries.
3. Usage
========
3.1 VM Memory settings
======================
Currently the device is working only with memory backed RAM
and it must be mark as "shared":
-m 1G \
-object memory-backend-ram,id=mb1,size=1G,share \
-numa node,memdev=mb1 \
The pvrdma device is composed of two functions:
- Function 0 is a vmxnet Ethernet Device which is redundant in Guest
but is required to pass the ibdevice GID using its MAC.
Examples:
For an rxe backend using eth0 interface it will use its mac:
-device vmxnet3,addr=<slot>.0,multifunction=on,mac=<eth0 MAC>
For an SRIOV VF, we take the Ethernet Interface exposed by it:
-device vmxnet3,multifunction=on,mac=<RoCE eth MAC>
- Function 1 is the actual device:
-device pvrdma,addr=<slot>.1,backend-dev=<ibdevice>,backend-gid-idx=<gid>,backend-port=<port>
where the ibdevice can be rxe or RDMA VF (e.g. mlx5_4)
Note: Pay special attention that the GID at backend-gid-idx matches vmxnet's MAC.
The rules of conversion are part of the RoCE spec, but since manual conversion
is not required, spotting problems is not hard:
Example: GID: fe80:0000:0000:0000:7efe:90ff:fecb:743a
MAC: 7c:fe:90:cb:74:3a
Note the difference between the first byte of the MAC and the GID.
3.2 MAD Multiplexer
===================
MAD Multiplexer is a service that exposes MAD-like interface for VMs in
order to overcome the limitation where only single entity can register with
MAD layer to send and receive RDMA-CM MAD packets.
To build rdmacm-mux run
# make rdmacm-mux
The application accepts 3 command line arguments and exposes a UNIX socket
to pass control and data to it.
-d rdma-device-name Name of RDMA device to register with
-s unix-socket-path Path to unix socket to listen (default /var/run/rdmacm-mux)
-p rdma-device-port Port number of RDMA device to register with (default 1)
The final UNIX socket file name is a concatenation of the 3 arguments so
for example for device mlx5_0 on port 2 this /var/run/rdmacm-mux-mlx5_0-2
will be created.
pvrdma requires this service.
Please refer to contrib/rdmacm-mux for more details.
3.3 Service exposed by libvirt daemon
=====================================
The control over the RDMA device's GID table is done by updating the
device's Ethernet function addresses.
Usually the first GID entry is determined by the MAC address, the second by
the first IPv6 address and the third by the IPv4 address. Other entries can
be added by adding more IP addresses. The reverse also holds:
whenever an address is removed, the corresponding GID entry is removed.
The process is done by the network and RDMA stacks. Whenever an address is
added the ib_core driver is notified and calls the device driver add_gid
function which in turn update the device.
To support this in pvrdma device the device hooks into the create_bind and
destroy_bind HW commands triggered by pvrdma driver in guest.
Whenever a change is made to the pvrdma port's GID table, a special QMP
message is sent to be processed by libvirt to update the address of the
backend Ethernet device.
pvrdma requires that libvirt service will be up.
3.4 PCI devices settings
========================
RoCE device exposes two functions - an Ethernet and RDMA.
To support it, pvrdma device is composed of two PCI functions, an Ethernet
device of type vmxnet3 on PCI slot 0 and a PVRDMA device on PCI slot 1. The
Ethernet function can be used for other Ethernet purposes such as IP.
3.5 Device parameters
=====================
- netdev: Specifies the Ethernet device function name on the host for
example enp175s0f0. For Soft-RoCE device (rxe) this would be the Ethernet
device used to create it.
- ibdev: The IB device name on host for example rxe0, mlx5_0 etc.
- mad-chardev: The name of the MAD multiplexer char device.
- ibport: In case of multi-port device (such as Mellanox's HCA) this
specify the port to use. If not set 1 will be used.
- dev-caps-max-mr-size: The maximum size of MR.
- dev-caps-max-qp: Maximum number of QPs.
- dev-caps-max-sge: Maximum number of SGE elements in WR.
- dev-caps-max-cq: Maximum number of CQs.
- dev-caps-max-mr: Maximum number of MRs.
- dev-caps-max-pd: Maximum number of PDs.
- dev-caps-max-ah: Maximum number of AHs.
Notes:
- The first 3 parameters are mandatory settings, the rest have their
defaults.
- The last 8 parameters (the ones prefixed by dev-caps) define the upper
limits, but the final values are adjusted by the backend device limitations.
- netdev can be extracted from ibdev's sysfs
(/sys/class/infiniband/<ibdev>/device/net/)
3.6 Example
===========
Define bridge device with vmxnet3 network backend:
<interface type='bridge'>
<mac address='56:b4:44:e9:62:dc'/>
<source bridge='bridge1'/>
<model type='vmxnet3'/>
<address type='pci' domain='0x0000' bus='0x00' slot='0x10' function='0x0' multifunction='on'/>
</interface>
Define pvrdma device:
<qemu:commandline>
<qemu:arg value='-object'/>
<qemu:arg value='memory-backend-ram,id=mb1,size=1G,share'/>
<qemu:arg value='-numa'/>
<qemu:arg value='node,memdev=mb1'/>
<qemu:arg value='-chardev'/>
<qemu:arg value='socket,path=/var/run/rdmacm-mux-rxe0-1,id=mads'/>
<qemu:arg value='-device'/>
<qemu:arg value='pvrdma,addr=10.1,ibdev=rxe0,netdev=bridge0,mad-chardev=mads'/>
</qemu:commandline>

View File

@ -18,7 +18,6 @@
#include "qemu/osdep.h"
#include "hw/hw.h"
#include "hw/pci/pci.h"
#include "net/net.h"
#include "net/tap.h"
#include "net/checksum.h"
#include "sysemu/sysemu.h"
@ -29,6 +28,7 @@
#include "migration/register.h"
#include "vmxnet3.h"
#include "vmxnet3_defs.h"
#include "vmxnet_debug.h"
#include "vmware_utils.h"
#include "net_tx_pkt.h"
@ -131,23 +131,11 @@ typedef struct VMXNET3Class {
DeviceRealize parent_dc_realize;
} VMXNET3Class;
#define TYPE_VMXNET3 "vmxnet3"
#define VMXNET3(obj) OBJECT_CHECK(VMXNET3State, (obj), TYPE_VMXNET3)
#define VMXNET3_DEVICE_CLASS(klass) \
OBJECT_CLASS_CHECK(VMXNET3Class, (klass), TYPE_VMXNET3)
#define VMXNET3_DEVICE_GET_CLASS(obj) \
OBJECT_GET_CLASS(VMXNET3Class, (obj), TYPE_VMXNET3)
/* Cyclic ring abstraction */
typedef struct {
hwaddr pa;
uint32_t size;
uint32_t cell_size;
uint32_t next;
uint8_t gen;
} Vmxnet3Ring;
static inline void vmxnet3_ring_init(PCIDevice *d,
Vmxnet3Ring *ring,
hwaddr pa,
@ -245,108 +233,6 @@ vmxnet3_dump_rx_descr(struct Vmxnet3_RxDesc *descr)
descr->rsvd, descr->dtype, descr->ext1, descr->btype);
}
/* Device state and helper functions */
#define VMXNET3_RX_RINGS_PER_QUEUE (2)
typedef struct {
Vmxnet3Ring tx_ring;
Vmxnet3Ring comp_ring;
uint8_t intr_idx;
hwaddr tx_stats_pa;
struct UPT1_TxStats txq_stats;
} Vmxnet3TxqDescr;
typedef struct {
Vmxnet3Ring rx_ring[VMXNET3_RX_RINGS_PER_QUEUE];
Vmxnet3Ring comp_ring;
uint8_t intr_idx;
hwaddr rx_stats_pa;
struct UPT1_RxStats rxq_stats;
} Vmxnet3RxqDescr;
typedef struct {
bool is_masked;
bool is_pending;
bool is_asserted;
} Vmxnet3IntState;
typedef struct {
PCIDevice parent_obj;
NICState *nic;
NICConf conf;
MemoryRegion bar0;
MemoryRegion bar1;
MemoryRegion msix_bar;
Vmxnet3RxqDescr rxq_descr[VMXNET3_DEVICE_MAX_RX_QUEUES];
Vmxnet3TxqDescr txq_descr[VMXNET3_DEVICE_MAX_TX_QUEUES];
/* Whether MSI-X support was installed successfully */
bool msix_used;
hwaddr drv_shmem;
hwaddr temp_shared_guest_driver_memory;
uint8_t txq_num;
/* This boolean tells whether RX packet being indicated has to */
/* be split into head and body chunks from different RX rings */
bool rx_packets_compound;
bool rx_vlan_stripping;
bool lro_supported;
uint8_t rxq_num;
/* Network MTU */
uint32_t mtu;
/* Maximum number of fragments for indicated TX packets */
uint32_t max_tx_frags;
/* Maximum number of fragments for indicated RX packets */
uint16_t max_rx_frags;
/* Index for events interrupt */
uint8_t event_int_idx;
/* Whether automatic interrupts masking enabled */
bool auto_int_masking;
bool peer_has_vhdr;
/* TX packets to QEMU interface */
struct NetTxPkt *tx_pkt;
uint32_t offload_mode;
uint32_t cso_or_gso_size;
uint16_t tci;
bool needs_vlan;
struct NetRxPkt *rx_pkt;
bool tx_sop;
bool skip_current_tx_pkt;
uint32_t device_active;
uint32_t last_command;
uint32_t link_status_and_speed;
Vmxnet3IntState interrupt_states[VMXNET3_MAX_INTRS];
uint32_t temp_mac; /* To store the low part first */
MACAddr perm_mac;
uint32_t vlan_table[VMXNET3_VFT_SIZE];
uint32_t rx_mode;
MACAddr *mcast_list;
uint32_t mcast_list_len;
uint32_t mcast_list_buff_size; /* needed for live migration. */
/* Compatibility flags for migration */
uint32_t compat_flags;
} VMXNET3State;
/* Interrupt management */
/*

133
hw/net/vmxnet3_defs.h Normal file
View File

@ -0,0 +1,133 @@
/*
* QEMU VMWARE VMXNET3 paravirtual NIC
*
* Copyright (c) 2012 Ravello Systems LTD (http://ravellosystems.com)
*
* Developed by Daynix Computing LTD (http://www.daynix.com)
*
* Authors:
* Dmitry Fleytman <dmitry@daynix.com>
* Tamir Shomer <tamirs@daynix.com>
* Yan Vugenfirer <yan@daynix.com>
*
* This work is licensed under the terms of the GNU GPL, version 2.
* See the COPYING file in the top-level directory.
*
*/
#include "net/net.h"
#include "hw/net/vmxnet3.h"
#define TYPE_VMXNET3 "vmxnet3"
#define VMXNET3(obj) OBJECT_CHECK(VMXNET3State, (obj), TYPE_VMXNET3)
/* Device state and helper functions */
#define VMXNET3_RX_RINGS_PER_QUEUE (2)
/* Cyclic ring abstraction */
typedef struct {
hwaddr pa;
uint32_t size;
uint32_t cell_size;
uint32_t next;
uint8_t gen;
} Vmxnet3Ring;
/* Per-TX-queue state: descriptor ring, completion ring and statistics. */
typedef struct {
    Vmxnet3Ring tx_ring;            /* TX descriptor ring */
    Vmxnet3Ring comp_ring;          /* TX completion ring */
    uint8_t intr_idx;               /* interrupt index for this queue */
    hwaddr tx_stats_pa;             /* guest-physical address of TX stats block */
    struct UPT1_TxStats txq_stats;  /* accumulated TX statistics */
} Vmxnet3TxqDescr;
/* Per-RX-queue state: two RX rings (head/body), completion ring, stats. */
typedef struct {
    Vmxnet3Ring rx_ring[VMXNET3_RX_RINGS_PER_QUEUE]; /* RX descriptor rings */
    Vmxnet3Ring comp_ring;          /* RX completion ring */
    uint8_t intr_idx;               /* interrupt index for this queue */
    hwaddr rx_stats_pa;             /* guest-physical address of RX stats block */
    struct UPT1_RxStats rxq_stats;  /* accumulated RX statistics */
} Vmxnet3RxqDescr;
/* Software-tracked state of a single interrupt line. */
typedef struct {
    bool is_masked;     /* guest has masked this interrupt */
    bool is_pending;    /* an event is waiting to be delivered */
    bool is_asserted;   /* the line is currently asserted */
} Vmxnet3IntState;
/* Full device-instance state of the emulated vmxnet3 NIC. */
typedef struct {
    PCIDevice parent_obj;
    NICState *nic;
    NICConf conf;
    MemoryRegion bar0;
    MemoryRegion bar1;
    MemoryRegion msix_bar;

    Vmxnet3RxqDescr rxq_descr[VMXNET3_DEVICE_MAX_RX_QUEUES];
    Vmxnet3TxqDescr txq_descr[VMXNET3_DEVICE_MAX_TX_QUEUES];

    /* Whether MSI-X support was installed successfully */
    bool msix_used;
    hwaddr drv_shmem;                       /* driver shared memory region */
    hwaddr temp_shared_guest_driver_memory;

    uint8_t txq_num;

    /* This boolean tells whether RX packet being indicated has to */
    /* be split into head and body chunks from different RX rings */
    bool rx_packets_compound;

    bool rx_vlan_stripping;
    bool lro_supported;
    uint8_t rxq_num;

    /* Network MTU */
    uint32_t mtu;

    /* Maximum number of fragments for indicated TX packets */
    uint32_t max_tx_frags;

    /* Maximum number of fragments for indicated RX packets */
    uint16_t max_rx_frags;

    /* Index for events interrupt */
    uint8_t event_int_idx;

    /* Whether automatic interrupts masking enabled */
    bool auto_int_masking;

    bool peer_has_vhdr;

    /* TX packets to QEMU interface */
    struct NetTxPkt *tx_pkt;
    uint32_t offload_mode;
    uint32_t cso_or_gso_size;
    uint16_t tci;
    bool needs_vlan;

    struct NetRxPkt *rx_pkt;

    bool tx_sop;                /* true when the next TX desc starts a packet */
    bool skip_current_tx_pkt;

    uint32_t device_active;
    uint32_t last_command;

    uint32_t link_status_and_speed;

    Vmxnet3IntState interrupt_states[VMXNET3_MAX_INTRS];

    uint32_t temp_mac;          /* To store the low part first */

    MACAddr perm_mac;
    uint32_t vlan_table[VMXNET3_VFT_SIZE];
    uint32_t rx_mode;
    MACAddr *mcast_list;
    uint32_t mcast_list_len;
    uint32_t mcast_list_buff_size; /* needed for live migration. */

    /* Compatibility flags for migration */
    uint32_t compat_flags;
} VMXNET3State;

View File

@ -15,10 +15,18 @@
#include "qemu/osdep.h"
#include "qemu/error-report.h"
#include "sysemu/sysemu.h"
#include "qapi/error.h"
#include "qapi/qmp/qlist.h"
#include "qapi/qmp/qnum.h"
#include "qapi/qapi-events-rdma.h"
#include <infiniband/verbs.h>
#include <infiniband/umad_types.h>
#include <infiniband/umad.h>
#include <rdma/rdma_user_cm.h>
#include "contrib/rdmacm-mux/rdmacm-mux.h"
#include "trace.h"
#include "rdma_utils.h"
#include "rdma_rm.h"
@ -29,27 +37,46 @@
#define VENDOR_ERR_TOO_MANY_SGES 0x202
#define VENDOR_ERR_NOMEM 0x203
#define VENDOR_ERR_QP0 0x204
#define VENDOR_ERR_NO_SGE 0x205
#define VENDOR_ERR_INV_NUM_SGE 0x205
#define VENDOR_ERR_MAD_SEND 0x206
#define VENDOR_ERR_INVLKEY 0x207
#define VENDOR_ERR_MR_SMALL 0x208
#define VENDOR_ERR_INV_MAD_BUFF 0x209
#define THR_NAME_LEN 16
#define THR_POLL_TO 5000
#define MAD_HDR_SIZE sizeof(struct ibv_grh)
/*
 * Per-work-request context kept alive while a WQE is outstanding in the
 * backend; looked up again by its cqe_ctx id when the completion arrives.
 */
typedef struct BackendCtx {
    uint64_t req_id;    /* request id (not referenced in the visible code) */
    void *up_ctx;       /* opaque caller context returned on completion */
    bool is_tx_req;     /* true for send WQEs, false for recv (debug only) */
    struct ibv_sge sge; /* Used to save MAD recv buffer */
} BackendCtx;
static void (*comp_handler)(int status, unsigned int vendor_err, void *ctx);
/* User-MAD wire layout: kernel umad header followed by the MAD payload. */
struct backend_umad {
    struct ib_user_mad hdr;
    char mad[RDMA_MAX_PRIVATE_DATA];
};
static void dummy_comp_handler(int status, unsigned int vendor_err, void *ctx)
static void (*comp_handler)(void *ctx, struct ibv_wc *wc);
static void dummy_comp_handler(void *ctx, struct ibv_wc *wc)
{
pr_err("No completion handler is registered\n");
}
static inline void complete_work(enum ibv_wc_status status, uint32_t vendor_err,
void *ctx)
{
struct ibv_wc wc = {0};
wc.status = status;
wc.vendor_err = vendor_err;
comp_handler(ctx, &wc);
}
static void poll_cq(RdmaDeviceResources *rdma_dev_res, struct ibv_cq *ibcq)
{
int i, ne;
@ -74,7 +101,7 @@ static void poll_cq(RdmaDeviceResources *rdma_dev_res, struct ibv_cq *ibcq)
}
pr_dbg("Processing %s CQE\n", bctx->is_tx_req ? "send" : "recv");
comp_handler(wc[i].status, wc[i].vendor_err, bctx->up_ctx);
comp_handler(bctx->up_ctx, &wc[i]);
rdma_rm_dealloc_cqe_ctx(rdma_dev_res, wc[i].wr_id);
g_free(bctx);
@ -146,6 +173,77 @@ static void *comp_handler_thread(void *arg)
return NULL;
}
/* Stop accepting unsolicited messages from the rdmacm-mux chardev. */
static inline void disable_rdmacm_mux_async(RdmaBackendDev *backend_dev)
{
    atomic_set(&backend_dev->rdmacm_mux.can_receive, 0);
}
/*
 * Re-enable unsolicited mux messages. The value doubles as the byte count
 * reported by the chardev's can_receive callback, hence sizeof(RdmaCmMuxMsg).
 */
static inline void enable_rdmacm_mux_async(RdmaBackendDev *backend_dev)
{
    atomic_set(&backend_dev->rdmacm_mux.can_receive, sizeof(RdmaCmMuxMsg));
}
/* Nonzero while async mux messages may be delivered (see enable/disable). */
static inline int rdmacm_mux_can_process_async(RdmaBackendDev *backend_dev)
{
    return atomic_read(&backend_dev->rdmacm_mux.can_receive);
}
/*
 * Read and validate the synchronous response to a request previously sent
 * to the rdmacm-mux service.
 *
 * Returns 0 when a complete RDMACM_MUX_MSG_TYPE_RESP message carrying
 * RDMACM_MUX_ERR_CODE_OK was received; -EIO on short read, wrong message
 * type or a mux-reported error.
 */
static int check_mux_op_status(CharBackend *mad_chr_be)
{
    RdmaCmMuxMsg msg = {0};
    int ret;

    pr_dbg("Reading response\n");
    ret = qemu_chr_fe_read_all(mad_chr_be, (uint8_t *)&msg, sizeof(msg));
    if (ret != sizeof(msg)) {
        /* Bug fix: sizeof() is size_t, so %ld was a format mismatch (UB on
         * LLP64/32-bit hosts); %zu is the correct conversion. */
        pr_dbg("Invalid message size %d, expecting %zu\n", ret, sizeof(msg));
        return -EIO;
    }

    pr_dbg("msg_type=%d\n", msg.hdr.msg_type);
    pr_dbg("op_code=%d\n", msg.hdr.op_code);
    pr_dbg("err_code=%d\n", msg.hdr.err_code);

    if (msg.hdr.msg_type != RDMACM_MUX_MSG_TYPE_RESP) {
        pr_dbg("Invalid message type %d\n", msg.hdr.msg_type);
        return -EIO;
    }

    if (msg.hdr.err_code != RDMACM_MUX_ERR_CODE_OK) {
        pr_dbg("Operation failed in mux, error code %d\n", msg.hdr.err_code);
        return -EIO;
    }

    return 0;
}
/*
 * Issue a synchronous request to the rdmacm-mux service: async delivery is
 * suspended, the message is written, the response is validated, and async
 * delivery is re-enabled.
 *
 * Returns 0 on success, -EIO on a short write, or the error from
 * check_mux_op_status() when the mux rejects the request.
 */
static int exec_rdmacm_mux_req(RdmaBackendDev *backend_dev, RdmaCmMuxMsg *msg)
{
    int rc = 0;

    pr_dbg("Executing request %d\n", msg->hdr.op_code);

    msg->hdr.msg_type = RDMACM_MUX_MSG_TYPE_REQ;
    /* Block unsolicited messages so the reply is not consumed elsewhere */
    disable_rdmacm_mux_async(backend_dev);
    rc = qemu_chr_fe_write(backend_dev->rdmacm_mux.chr_be,
                           (const uint8_t *)msg, sizeof(*msg));
    if (rc != sizeof(*msg)) {
        enable_rdmacm_mux_async(backend_dev);
        pr_dbg("Fail to send request to rdmacm_mux (rc=%d)\n", rc);
        return -EIO;
    }

    rc = check_mux_op_status(backend_dev->rdmacm_mux.chr_be);
    if (rc) {
        pr_dbg("Fail to execute rdmacm_mux request %d (rc=%d)\n",
               msg->hdr.op_code, rc);
    }

    enable_rdmacm_mux_async(backend_dev);

    /* Bug fix: this used to 'return 0' unconditionally, so mux failures
     * detected just above were silently reported as success to callers. */
    return rc;
}
static void stop_backend_thread(RdmaBackendThread *thread)
{
thread->run = false;
@ -168,8 +266,8 @@ static void start_comp_thread(RdmaBackendDev *backend_dev)
comp_handler_thread, backend_dev, QEMU_THREAD_DETACHED);
}
void rdma_backend_register_comp_handler(void (*handler)(int status,
unsigned int vendor_err, void *ctx))
/* Install the callback invoked for every completion (polled or synthesized). */
void rdma_backend_register_comp_handler(void (*handler)(void *ctx,
                                                        struct ibv_wc *wc))
{
    comp_handler = handler;
}
@ -286,11 +384,73 @@ static int build_host_sge_array(RdmaDeviceResources *rdma_dev_res,
return 0;
}
/*
 * Forward a guest QP1 (GSI) send WQE to the rdmacm-mux service as a MAD.
 *
 * The guest is expected to post MAD sends as exactly two SGEs (header and
 * payload). Both are DMA-mapped, concatenated into the fixed-size mux
 * message, and unmapped again *before* the blocking mux request is issued.
 *
 * Returns 0 on success; -EINVAL for an unexpected SGE count; -ENOMEM when a
 * mapping fails or the MAD does not fit the mux buffer; -EIO when the mux
 * rejects the request.
 */
static int mad_send(RdmaBackendDev *backend_dev, uint8_t sgid_idx,
                    union ibv_gid *sgid, struct ibv_sge *sge, uint32_t num_sge)
{
    RdmaCmMuxMsg msg = {0};
    char *hdr, *data;
    int ret;

    pr_dbg("num_sge=%d\n", num_sge);

    /* MAD sends must arrive as exactly two SGEs: MAD header + payload */
    if (num_sge != 2) {
        return -EINVAL;
    }

    msg.hdr.op_code = RDMACM_MUX_OP_CODE_MAD;
    memcpy(msg.hdr.sgid.raw, sgid->raw, sizeof(msg.hdr.sgid));

    msg.umad_len = sge[0].length + sge[1].length;
    pr_dbg("umad_len=%d\n", msg.umad_len);

    /* Concatenated MAD must fit the fixed-size mux payload buffer */
    if (msg.umad_len > sizeof(msg.umad.mad)) {
        return -ENOMEM;
    }

    /* Address the MAD to QP1 (GSI) with a GRH present */
    msg.umad.hdr.addr.qpn = htobe32(1);
    msg.umad.hdr.addr.grh_present = 1;
    pr_dbg("sgid_idx=%d\n", sgid_idx);
    pr_dbg("sgid=0x%llx\n", sgid->global.interface_id);
    msg.umad.hdr.addr.gid_index = sgid_idx;
    memcpy(msg.umad.hdr.addr.gid, sgid->raw, sizeof(msg.umad.hdr.addr.gid));
    msg.umad.hdr.addr.hop_limit = 0xFF;

    hdr = rdma_pci_dma_map(backend_dev->dev, sge[0].addr, sge[0].length);
    if (!hdr) {
        pr_dbg("Fail to map to sge[0]\n");
        return -ENOMEM;
    }
    data = rdma_pci_dma_map(backend_dev->dev, sge[1].addr, sge[1].length);
    if (!data) {
        pr_dbg("Fail to map to sge[1]\n");
        /* First mapping must be released on this error path */
        rdma_pci_dma_unmap(backend_dev->dev, hdr, sge[0].length);
        return -ENOMEM;
    }

    pr_dbg_buf("mad_hdr", hdr, sge[0].length);
    pr_dbg_buf("mad_data", data, sge[1].length);

    /* Copy out of guest memory, then unmap before the synchronous mux call */
    memcpy(&msg.umad.mad[0], hdr, sge[0].length);
    memcpy(&msg.umad.mad[sge[0].length], data, sge[1].length);

    rdma_pci_dma_unmap(backend_dev->dev, data, sge[1].length);
    rdma_pci_dma_unmap(backend_dev->dev, hdr, sge[0].length);

    ret = exec_rdmacm_mux_req(backend_dev, &msg);
    if (ret) {
        pr_dbg("Fail to send MAD to rdma_umadmux (%d)\n", ret);
        return -EIO;
    }

    return 0;
}
void rdma_backend_post_send(RdmaBackendDev *backend_dev,
RdmaBackendQP *qp, uint8_t qp_type,
struct ibv_sge *sge, uint32_t num_sge,
union ibv_gid *dgid, uint32_t dqpn,
uint32_t dqkey, void *ctx)
uint8_t sgid_idx, union ibv_gid *sgid,
union ibv_gid *dgid, uint32_t dqpn, uint32_t dqkey,
void *ctx)
{
BackendCtx *bctx;
struct ibv_sge new_sge[MAX_SGE];
@ -301,19 +461,23 @@ void rdma_backend_post_send(RdmaBackendDev *backend_dev,
if (!qp->ibqp) { /* This field does not get initialized for QP0 and QP1 */
if (qp_type == IBV_QPT_SMI) {
pr_dbg("QP0 unsupported\n");
comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx);
complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx);
} else if (qp_type == IBV_QPT_GSI) {
pr_dbg("QP1\n");
comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx);
rc = mad_send(backend_dev, sgid_idx, sgid, sge, num_sge);
if (rc) {
complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx);
} else {
complete_work(IBV_WC_SUCCESS, 0, ctx);
}
}
pr_dbg("qp->ibqp is NULL for qp_type %d!!!\n", qp_type);
return;
}
pr_dbg("num_sge=%d\n", num_sge);
if (!num_sge) {
pr_dbg("num_sge=0\n");
comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_NO_SGE, ctx);
if (!num_sge || num_sge > MAX_SGE) {
pr_dbg("invalid num_sge=%d\n", num_sge);
complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_INV_NUM_SGE, ctx);
return;
}
@ -324,20 +488,23 @@ void rdma_backend_post_send(RdmaBackendDev *backend_dev,
rc = rdma_rm_alloc_cqe_ctx(backend_dev->rdma_dev_res, &bctx_id, bctx);
if (unlikely(rc)) {
pr_dbg("Failed to allocate cqe_ctx\n");
comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_NOMEM, ctx);
complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_NOMEM, ctx);
goto out_free_bctx;
}
rc = build_host_sge_array(backend_dev->rdma_dev_res, new_sge, sge, num_sge);
if (rc) {
pr_dbg("Error: Failed to build host SGE array\n");
comp_handler(IBV_WC_GENERAL_ERR, rc, ctx);
complete_work(IBV_WC_GENERAL_ERR, rc, ctx);
goto out_dealloc_cqe_ctx;
}
if (qp_type == IBV_QPT_UD) {
wr.wr.ud.ah = create_ah(backend_dev, qp->ibpd,
backend_dev->backend_gid_idx, dgid);
wr.wr.ud.ah = create_ah(backend_dev, qp->ibpd, sgid_idx, dgid);
if (!wr.wr.ud.ah) {
complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
goto out_dealloc_cqe_ctx;
}
wr.wr.ud.remote_qpn = dqpn;
wr.wr.ud.remote_qkey = dqkey;
}
@ -353,7 +520,7 @@ void rdma_backend_post_send(RdmaBackendDev *backend_dev,
if (rc) {
pr_dbg("Fail (%d, %d) to post send WQE to qpn %d\n", rc, errno,
qp->ibqp->qp_num);
comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
goto out_dealloc_cqe_ctx;
}
@ -366,6 +533,48 @@ out_free_bctx:
g_free(bctx);
}
/*
 * Park a guest QP1 receive buffer until a MAD arrives from the mux.
 *
 * The single SGE is stashed in a BackendCtx (tracked by a cqe_ctx id) and
 * the id is appended to recv_mads_list; process_incoming_mad_req() pops it
 * when a MAD shows up.
 *
 * Returns 0 on success, or a VENDOR_ERR_* code which the caller converts
 * into an error completion.
 */
static unsigned int save_mad_recv_buffer(RdmaBackendDev *backend_dev,
                                         struct ibv_sge *sge, uint32_t num_sge,
                                         void *ctx)
{
    BackendCtx *bctx;
    int rc;
    uint32_t bctx_id;

    /* MAD receive buffers are posted as exactly one SGE */
    if (num_sge != 1) {
        pr_dbg("Invalid num_sge (%d), expecting 1\n", num_sge);
        return VENDOR_ERR_INV_NUM_SGE;
    }

    /* Buffer must hold a full GRH plus the largest MAD payload */
    if (sge[0].length < RDMA_MAX_PRIVATE_DATA + sizeof(struct ibv_grh)) {
        pr_dbg("Too small buffer for MAD\n");
        return VENDOR_ERR_INV_MAD_BUFF;
    }

    pr_dbg("addr=0x%" PRIx64"\n", sge[0].addr);
    pr_dbg("length=%d\n", sge[0].length);
    pr_dbg("lkey=%d\n", sge[0].lkey);

    bctx = g_malloc0(sizeof(*bctx));

    rc = rdma_rm_alloc_cqe_ctx(backend_dev->rdma_dev_res, &bctx_id, bctx);
    if (unlikely(rc)) {
        /* bctx is not registered anywhere on failure, free it here */
        g_free(bctx);
        pr_dbg("Fail to allocate cqe_ctx\n");
        return VENDOR_ERR_NOMEM;
    }

    pr_dbg("bctx_id %d, bctx %p, ctx %p\n", bctx_id, bctx, ctx);
    bctx->up_ctx = ctx;
    bctx->sge = *sge;

    qemu_mutex_lock(&backend_dev->recv_mads_list.lock);
    qlist_append_int(backend_dev->recv_mads_list.list, bctx_id);
    qemu_mutex_unlock(&backend_dev->recv_mads_list.lock);

    return 0;
}
void rdma_backend_post_recv(RdmaBackendDev *backend_dev,
RdmaDeviceResources *rdma_dev_res,
RdmaBackendQP *qp, uint8_t qp_type,
@ -380,19 +589,22 @@ void rdma_backend_post_recv(RdmaBackendDev *backend_dev,
if (!qp->ibqp) { /* This field does not get initialized for QP0 and QP1 */
if (qp_type == IBV_QPT_SMI) {
pr_dbg("QP0 unsupported\n");
comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx);
complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx);
}
if (qp_type == IBV_QPT_GSI) {
pr_dbg("QP1\n");
comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx);
rc = save_mad_recv_buffer(backend_dev, sge, num_sge, ctx);
if (rc) {
complete_work(IBV_WC_GENERAL_ERR, rc, ctx);
}
}
return;
}
pr_dbg("num_sge=%d\n", num_sge);
if (!num_sge) {
pr_dbg("num_sge=0\n");
comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_NO_SGE, ctx);
if (!num_sge || num_sge > MAX_SGE) {
pr_dbg("invalid num_sge=%d\n", num_sge);
complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_INV_NUM_SGE, ctx);
return;
}
@ -403,14 +615,14 @@ void rdma_backend_post_recv(RdmaBackendDev *backend_dev,
rc = rdma_rm_alloc_cqe_ctx(rdma_dev_res, &bctx_id, bctx);
if (unlikely(rc)) {
pr_dbg("Failed to allocate cqe_ctx\n");
comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_NOMEM, ctx);
complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_NOMEM, ctx);
goto out_free_bctx;
}
rc = build_host_sge_array(rdma_dev_res, new_sge, sge, num_sge);
if (rc) {
pr_dbg("Error: Failed to build host SGE array\n");
comp_handler(IBV_WC_GENERAL_ERR, rc, ctx);
complete_work(IBV_WC_GENERAL_ERR, rc, ctx);
goto out_dealloc_cqe_ctx;
}
@ -422,7 +634,7 @@ void rdma_backend_post_recv(RdmaBackendDev *backend_dev,
if (rc) {
pr_dbg("Fail (%d, %d) to post recv WQE to qpn %d\n", rc, errno,
qp->ibqp->qp_num);
comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
goto out_dealloc_cqe_ctx;
}
@ -513,7 +725,6 @@ int rdma_backend_create_qp(RdmaBackendQP *qp, uint8_t qp_type,
switch (qp_type) {
case IBV_QPT_GSI:
pr_dbg("QP1 unsupported\n");
return 0;
case IBV_QPT_RC:
@ -594,9 +805,9 @@ int rdma_backend_qp_state_init(RdmaBackendDev *backend_dev, RdmaBackendQP *qp,
}
int rdma_backend_qp_state_rtr(RdmaBackendDev *backend_dev, RdmaBackendQP *qp,
uint8_t qp_type, union ibv_gid *dgid,
uint32_t dqpn, uint32_t rq_psn, uint32_t qkey,
bool use_qkey)
uint8_t qp_type, uint8_t sgid_idx,
union ibv_gid *dgid, uint32_t dqpn,
uint32_t rq_psn, uint32_t qkey, bool use_qkey)
{
struct ibv_qp_attr attr = {0};
union ibv_gid ibv_gid = {
@ -608,13 +819,15 @@ int rdma_backend_qp_state_rtr(RdmaBackendDev *backend_dev, RdmaBackendQP *qp,
attr.qp_state = IBV_QPS_RTR;
attr_mask = IBV_QP_STATE;
qp->sgid_idx = sgid_idx;
switch (qp_type) {
case IBV_QPT_RC:
pr_dbg("dgid=0x%" PRIx64 ",%" PRIx64 "\n",
be64_to_cpu(ibv_gid.global.subnet_prefix),
be64_to_cpu(ibv_gid.global.interface_id));
pr_dbg("dqpn=0x%x\n", dqpn);
pr_dbg("sgid_idx=%d\n", backend_dev->backend_gid_idx);
pr_dbg("sgid_idx=%d\n", qp->sgid_idx);
pr_dbg("sport_num=%d\n", backend_dev->port_num);
pr_dbg("rq_psn=0x%x\n", rq_psn);
@ -626,7 +839,7 @@ int rdma_backend_qp_state_rtr(RdmaBackendDev *backend_dev, RdmaBackendQP *qp,
attr.ah_attr.is_global = 1;
attr.ah_attr.grh.hop_limit = 1;
attr.ah_attr.grh.dgid = ibv_gid;
attr.ah_attr.grh.sgid_index = backend_dev->backend_gid_idx;
attr.ah_attr.grh.sgid_index = qp->sgid_idx;
attr.rq_psn = rq_psn;
attr_mask |= IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN |
@ -635,8 +848,8 @@ int rdma_backend_qp_state_rtr(RdmaBackendDev *backend_dev, RdmaBackendQP *qp,
break;
case IBV_QPT_UD:
pr_dbg("qkey=0x%x\n", qkey);
if (use_qkey) {
pr_dbg("qkey=0x%x\n", qkey);
attr.qkey = qkey;
attr_mask |= IBV_QP_QKEY;
}
@ -744,23 +957,224 @@ static int init_device_caps(RdmaBackendDev *backend_dev,
return 0;
}
/*
 * Fill in the GRH that is prepended to a received MAD before it is copied
 * into the guest buffer: dgid is taken from my_gid, sgid from sgid.
 */
static inline void build_mad_hdr(struct ibv_grh *grh, union ibv_gid *sgid,
                                 union ibv_gid *my_gid, int paylen)
{
    grh->paylen = htons(paylen);    /* GRH payload length is big-endian */
    grh->sgid = *sgid;
    grh->dgid = *my_gid;

    pr_dbg("paylen=%d (net=0x%x)\n", paylen, grh->paylen);
    pr_dbg("dgid=0x%llx\n", my_gid->global.interface_id);
    pr_dbg("sgid=0x%llx\n", sgid->global.interface_id);
}
/*
 * Deliver a MAD received from the rdmacm-mux to the oldest parked guest
 * receive buffer (see save_mad_recv_buffer). The guest sees a GRH followed
 * by the MAD payload. If no buffer is available the MAD is dropped.
 */
static void process_incoming_mad_req(RdmaBackendDev *backend_dev,
                                     RdmaCmMuxMsg *msg)
{
    QObject *o_ctx_id;
    unsigned long cqe_ctx_id;
    BackendCtx *bctx;
    char *mad;

    pr_dbg("umad_len=%d\n", msg->umad_len);

#ifdef PVRDMA_DEBUG
    struct umad_hdr *hdr = (struct umad_hdr *)&msg->umad.mad;
    pr_dbg("bv %x cls %x cv %x mtd %x st %d tid %" PRIx64 " at %x atm %x\n",
           hdr->base_version, hdr->mgmt_class, hdr->class_version,
           hdr->method, hdr->status, be64toh(hdr->tid),
           hdr->attr_id, hdr->attr_mod);
#endif

    qemu_mutex_lock(&backend_dev->recv_mads_list.lock);
    o_ctx_id = qlist_pop(backend_dev->recv_mads_list.list);
    qemu_mutex_unlock(&backend_dev->recv_mads_list.lock);
    if (!o_ctx_id) {
        /*
         * Bug fix: this used to sleep(THR_POLL_TO) - i.e. 5000 *seconds* -
         * inside the chardev read handler, stalling the main loop. The MAD
         * is dropped either way, so just report and return.
         */
        pr_dbg("No more free MADs buffers, dropping MAD\n");
        return;
    }

    cqe_ctx_id = qnum_get_uint(qobject_to(QNum, o_ctx_id));
    bctx = rdma_rm_get_cqe_ctx(backend_dev->rdma_dev_res, cqe_ctx_id);
    if (unlikely(!bctx)) {
        pr_dbg("Error: Fail to find ctx for %ld\n", cqe_ctx_id);
        return;
    }

    pr_dbg("id %ld, bctx %p, ctx %p\n", cqe_ctx_id, bctx, bctx->up_ctx);

    mad = rdma_pci_dma_map(backend_dev->dev, bctx->sge.addr,
                           bctx->sge.length);
    if (!mad || bctx->sge.length < msg->umad_len + MAD_HDR_SIZE) {
        /*
         * Bug fix: when the buffer was mapped but too small, the mapping
         * was never released; unmap before completing with an error.
         */
        if (mad) {
            rdma_pci_dma_unmap(backend_dev->dev, mad, bctx->sge.length);
        }
        complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_INV_MAD_BUFF,
                      bctx->up_ctx);
    } else {
        struct ibv_wc wc = {0};
        pr_dbg_buf("mad", msg->umad.mad, msg->umad_len);
        memset(mad, 0, bctx->sge.length);
        build_mad_hdr((struct ibv_grh *)mad,
                      (union ibv_gid *)&msg->umad.hdr.addr.gid, &msg->hdr.sgid,
                      msg->umad_len);
        memcpy(&mad[MAD_HDR_SIZE], msg->umad.mad, msg->umad_len);
        rdma_pci_dma_unmap(backend_dev->dev, mad, bctx->sge.length);

        wc.byte_len = msg->umad_len;
        wc.status = IBV_WC_SUCCESS;
        wc.wc_flags = IBV_WC_GRH;
        comp_handler(bctx->up_ctx, &wc);
    }

    g_free(bctx);
    rdma_rm_dealloc_cqe_ctx(backend_dev->rdma_dev_res, cqe_ctx_id);
}
/* Chardev can_receive callback: bytes we accept while async reads are on. */
static inline int rdmacm_mux_can_receive(void *opaque)
{
    RdmaBackendDev *dev = opaque;

    return rdmacm_mux_can_process_async(dev);
}
/*
 * Chardev read handler for unsolicited rdmacm-mux messages; dispatches
 * incoming MAD requests to parked guest receive buffers.
 */
static void rdmacm_mux_read(void *opaque, const uint8_t *buf, int size)
{
    RdmaBackendDev *backend_dev = (RdmaBackendDev *)opaque;
    RdmaCmMuxMsg *msg = (RdmaCmMuxMsg *)buf;

    pr_dbg("Got %d bytes\n", size);
    pr_dbg("msg_type=%d\n", msg->hdr.msg_type);
    pr_dbg("op_code=%d\n", msg->hdr.op_code);

    /*
     * Bug fix: the filter used '&&', which only rejected messages wrong in
     * *both* fields, so e.g. a REQ with a non-MAD op_code slipped through to
     * process_incoming_mad_req(). Only messages that are both a request AND
     * a MAD may be processed.
     */
    if (msg->hdr.msg_type != RDMACM_MUX_MSG_TYPE_REQ ||
        msg->hdr.op_code != RDMACM_MUX_OP_CODE_MAD) {
        pr_dbg("Error: Not a MAD request, skipping\n");
        return;
    }
    process_incoming_mad_req(backend_dev, msg);
}
/*
 * Wire up MAD support: attach the rdmacm-mux chardev, initialize the list
 * of pending MAD receive buffers and register the async read handlers.
 * Returns 0 on success, -EIO when no chardev is connected.
 */
static int mad_init(RdmaBackendDev *backend_dev, CharBackend *mad_chr_be)
{
    int ret;

    backend_dev->rdmacm_mux.chr_be = mad_chr_be;

    /* Nonzero means a chardev backend is actually connected */
    ret = qemu_chr_fe_backend_connected(backend_dev->rdmacm_mux.chr_be);
    if (!ret) {
        pr_dbg("Missing chardev for MAD multiplexer\n");
        return -EIO;
    }

    qemu_mutex_init(&backend_dev->recv_mads_list.lock);
    backend_dev->recv_mads_list.list = qlist_new();

    enable_rdmacm_mux_async(backend_dev);

    /* Register for unsolicited (incoming MAD) messages from the mux */
    qemu_chr_fe_set_handlers(backend_dev->rdmacm_mux.chr_be,
                             rdmacm_mux_can_receive, rdmacm_mux_read, NULL,
                             NULL, backend_dev, NULL, true);

    return 0;
}
/*
 * Tear down MAD support: stop async reads first so no handler fires while
 * the chardev is disconnected and the list/lock are destroyed.
 */
static void mad_fini(RdmaBackendDev *backend_dev)
{
    pr_dbg("Stopping MAD\n");
    disable_rdmacm_mux_async(backend_dev);
    qemu_chr_fe_disconnect(backend_dev->rdmacm_mux.chr_be);
    qlist_destroy_obj(QOBJECT(backend_dev->recv_mads_list.list));
    qemu_mutex_destroy(&backend_dev->recv_mads_list.lock);
}
/*
 * Resolve a GID to its index in the backend device's GID table by linearly
 * scanning with ibv_query_gid() until a match or a query failure.
 *
 * Returns the zero-based index on success.
 * NOTE(review): on query failure this returns ibv_query_gid()'s result
 * as-is; if that can ever be positive, callers that treat any value >= 0
 * as a valid index would be misled - confirm against the verbs docs.
 */
int rdma_backend_get_gid_index(RdmaBackendDev *backend_dev,
                               union ibv_gid *gid)
{
    union ibv_gid sgid;
    int ret;
    int i = 0;

    pr_dbg("0x%llx, 0x%llx\n",
           (long long unsigned int)be64_to_cpu(gid->global.subnet_prefix),
           (long long unsigned int)be64_to_cpu(gid->global.interface_id));

    do {
        /* i is incremented before the test, so the match index is i - 1 */
        ret = ibv_query_gid(backend_dev->context, backend_dev->port_num, i,
                            &sgid);
        i++;
    } while (!ret && (memcmp(&sgid, gid, sizeof(*gid))));

    pr_dbg("gid_index=%d\n", i - 1);

    return ret ? ret : i - 1;
}
/*
 * Register a GID with the rdmacm-mux service and emit the QAPI
 * RDMA_GID_STATUS_CHANGED event on success.
 * Returns 0 on success, -EIO when the mux request fails.
 */
int rdma_backend_add_gid(RdmaBackendDev *backend_dev, const char *ifname,
                         union ibv_gid *gid)
{
    RdmaCmMuxMsg msg = {0};
    int rc;

    pr_dbg("0x%llx, 0x%llx\n",
           (long long unsigned int)be64_to_cpu(gid->global.subnet_prefix),
           (long long unsigned int)be64_to_cpu(gid->global.interface_id));

    msg.hdr.op_code = RDMACM_MUX_OP_CODE_REG;
    memcpy(msg.hdr.sgid.raw, gid->raw, sizeof(msg.hdr.sgid));

    rc = exec_rdmacm_mux_req(backend_dev, &msg);
    if (rc) {
        pr_dbg("Fail to register GID to rdma_umadmux (%d)\n", rc);
        return -EIO;
    }

    qapi_event_send_rdma_gid_status_changed(ifname, true,
                                            gid->global.subnet_prefix,
                                            gid->global.interface_id);

    return 0;
}
/*
 * Unregister a GID from the rdmacm-mux service and emit the QAPI
 * RDMA_GID_STATUS_CHANGED event on success.
 * Returns 0 on success, -EIO when the mux request fails.
 */
int rdma_backend_del_gid(RdmaBackendDev *backend_dev, const char *ifname,
                         union ibv_gid *gid)
{
    RdmaCmMuxMsg msg = {0};
    int rc;

    pr_dbg("0x%llx, 0x%llx\n",
           (long long unsigned int)be64_to_cpu(gid->global.subnet_prefix),
           (long long unsigned int)be64_to_cpu(gid->global.interface_id));

    msg.hdr.op_code = RDMACM_MUX_OP_CODE_UNREG;
    memcpy(msg.hdr.sgid.raw, gid->raw, sizeof(msg.hdr.sgid));

    rc = exec_rdmacm_mux_req(backend_dev, &msg);
    if (rc) {
        pr_dbg("Fail to unregister GID from rdma_umadmux (%d)\n", rc);
        return -EIO;
    }

    qapi_event_send_rdma_gid_status_changed(ifname, false,
                                            gid->global.subnet_prefix,
                                            gid->global.interface_id);

    return 0;
}
int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev,
RdmaDeviceResources *rdma_dev_res,
const char *backend_device_name, uint8_t port_num,
uint8_t backend_gid_idx, struct ibv_device_attr *dev_attr,
struct ibv_device_attr *dev_attr, CharBackend *mad_chr_be,
Error **errp)
{
int i;
int ret = 0;
int num_ibv_devices;
struct ibv_device **dev_list;
struct ibv_port_attr port_attr;
memset(backend_dev, 0, sizeof(*backend_dev));
backend_dev->dev = pdev;
backend_dev->backend_gid_idx = backend_gid_idx;
backend_dev->port_num = port_num;
backend_dev->rdma_dev_res = rdma_dev_res;
@ -797,9 +1211,9 @@ int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev,
backend_dev->ib_dev = *dev_list;
}
pr_dbg("Using backend device %s, port %d, gid_idx %d\n",
ibv_get_device_name(backend_dev->ib_dev),
backend_dev->port_num, backend_dev->backend_gid_idx);
pr_dbg("Using backend device %s, port %d\n",
ibv_get_device_name(backend_dev->ib_dev), backend_dev->port_num);
pr_dbg("uverb device %s\n", backend_dev->ib_dev->dev_name);
backend_dev->context = ibv_open_device(backend_dev->ib_dev);
if (!backend_dev->context) {
@ -816,20 +1230,6 @@ int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev,
}
pr_dbg("dev->backend_dev.channel=%p\n", backend_dev->channel);
ret = ibv_query_port(backend_dev->context, backend_dev->port_num,
&port_attr);
if (ret) {
error_setg(errp, "Error %d from ibv_query_port", ret);
ret = -EIO;
goto out_destroy_comm_channel;
}
if (backend_dev->backend_gid_idx >= port_attr.gid_tbl_len) {
error_setg(errp, "Invalid backend_gid_idx, should be less than %d",
port_attr.gid_tbl_len);
goto out_destroy_comm_channel;
}
ret = init_device_caps(backend_dev, dev_attr);
if (ret) {
error_setg(errp, "Failed to initialize device capabilities");
@ -837,18 +1237,13 @@ int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev,
goto out_destroy_comm_channel;
}
ret = ibv_query_gid(backend_dev->context, backend_dev->port_num,
backend_dev->backend_gid_idx, &backend_dev->gid);
ret = mad_init(backend_dev, mad_chr_be);
if (ret) {
error_setg(errp, "Failed to query gid %d",
backend_dev->backend_gid_idx);
error_setg(errp, "Fail to initialize mad");
ret = -EIO;
goto out_destroy_comm_channel;
}
pr_dbg("subnet_prefix=0x%" PRIx64 "\n",
be64_to_cpu(backend_dev->gid.global.subnet_prefix));
pr_dbg("interface_id=0x%" PRIx64 "\n",
be64_to_cpu(backend_dev->gid.global.interface_id));
backend_dev->comp_thread.run = false;
backend_dev->comp_thread.is_running = false;
@ -886,6 +1281,7 @@ void rdma_backend_stop(RdmaBackendDev *backend_dev)
void rdma_backend_fini(RdmaBackendDev *backend_dev)
{
rdma_backend_stop(backend_dev);
mad_fini(backend_dev);
g_hash_table_destroy(ah_hash);
ibv_destroy_comp_channel(backend_dev->channel);
ibv_close_device(backend_dev->context);

View File

@ -17,6 +17,8 @@
#define RDMA_BACKEND_H
#include "qapi/error.h"
#include "chardev/char-fe.h"
#include "rdma_rm_defs.h"
#include "rdma_backend_defs.h"
@ -26,14 +28,9 @@ enum ibv_special_qp_type {
IBV_QPT_GSI = 1,
};
static inline union ibv_gid *rdma_backend_gid(RdmaBackendDev *dev)
{
return &dev->gid;
}
static inline uint32_t rdma_backend_qpn(const RdmaBackendQP *qp)
{
return qp->ibqp ? qp->ibqp->qp_num : 0;
return qp->ibqp ? qp->ibqp->qp_num : 1;
}
static inline uint32_t rdma_backend_mr_lkey(const RdmaBackendMR *mr)
@ -49,13 +46,19 @@ static inline uint32_t rdma_backend_mr_rkey(const RdmaBackendMR *mr)
int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev,
RdmaDeviceResources *rdma_dev_res,
const char *backend_device_name, uint8_t port_num,
uint8_t backend_gid_idx, struct ibv_device_attr *dev_attr,
struct ibv_device_attr *dev_attr, CharBackend *mad_chr_be,
Error **errp);
void rdma_backend_fini(RdmaBackendDev *backend_dev);
int rdma_backend_add_gid(RdmaBackendDev *backend_dev, const char *ifname,
union ibv_gid *gid);
int rdma_backend_del_gid(RdmaBackendDev *backend_dev, const char *ifname,
union ibv_gid *gid);
int rdma_backend_get_gid_index(RdmaBackendDev *backend_dev,
union ibv_gid *gid);
void rdma_backend_start(RdmaBackendDev *backend_dev);
void rdma_backend_stop(RdmaBackendDev *backend_dev);
void rdma_backend_register_comp_handler(void (*handler)(int status,
unsigned int vendor_err, void *ctx));
void rdma_backend_register_comp_handler(void (*handler)(void *ctx,
struct ibv_wc *wc));
void rdma_backend_unregister_comp_handler(void);
int rdma_backend_query_port(RdmaBackendDev *backend_dev,
@ -80,9 +83,9 @@ int rdma_backend_create_qp(RdmaBackendQP *qp, uint8_t qp_type,
int rdma_backend_qp_state_init(RdmaBackendDev *backend_dev, RdmaBackendQP *qp,
uint8_t qp_type, uint32_t qkey);
int rdma_backend_qp_state_rtr(RdmaBackendDev *backend_dev, RdmaBackendQP *qp,
uint8_t qp_type, union ibv_gid *dgid,
uint32_t dqpn, uint32_t rq_psn, uint32_t qkey,
bool use_qkey);
uint8_t qp_type, uint8_t sgid_idx,
union ibv_gid *dgid, uint32_t dqpn,
uint32_t rq_psn, uint32_t qkey, bool use_qkey);
int rdma_backend_qp_state_rts(RdmaBackendQP *qp, uint8_t qp_type,
uint32_t sq_psn, uint32_t qkey, bool use_qkey);
int rdma_backend_query_qp(RdmaBackendQP *qp, struct ibv_qp_attr *attr,
@ -92,6 +95,7 @@ void rdma_backend_destroy_qp(RdmaBackendQP *qp);
void rdma_backend_post_send(RdmaBackendDev *backend_dev,
RdmaBackendQP *qp, uint8_t qp_type,
struct ibv_sge *sge, uint32_t num_sge,
uint8_t sgid_idx, union ibv_gid *sgid,
union ibv_gid *dgid, uint32_t dqpn, uint32_t dqkey,
void *ctx);
void rdma_backend_post_recv(RdmaBackendDev *backend_dev,

View File

@ -16,8 +16,10 @@
#ifndef RDMA_BACKEND_DEFS_H
#define RDMA_BACKEND_DEFS_H
#include <infiniband/verbs.h>
#include "qemu/thread.h"
#include "chardev/char-fe.h"
#include <infiniband/verbs.h>
#include "contrib/rdmacm-mux/rdmacm-mux.h"
typedef struct RdmaDeviceResources RdmaDeviceResources;
@ -28,17 +30,27 @@ typedef struct RdmaBackendThread {
bool is_running; /* Set by the thread to report its status */
} RdmaBackendThread;
/* FIFO of parked MAD receive buffers (cqe_ctx ids), guarded by lock. */
typedef struct RecvMadList {
    QemuMutex lock;
    QList *list;
} RecvMadList;
/* Connection to the external rdmacm-mux service. */
typedef struct RdmaCmMux {
    CharBackend *chr_be;    /* chardev link to the mux */
    int can_receive;        /* accessed atomically; nonzero = async reads on */
} RdmaCmMux;
typedef struct RdmaBackendDev {
struct ibv_device_attr dev_attr;
RdmaBackendThread comp_thread;
union ibv_gid gid;
PCIDevice *dev;
RdmaDeviceResources *rdma_dev_res;
struct ibv_device *ib_dev;
struct ibv_context *context;
struct ibv_comp_channel *channel;
uint8_t port_num;
uint8_t backend_gid_idx;
RecvMadList recv_mads_list;
RdmaCmMux rdmacm_mux;
} RdmaBackendDev;
typedef struct RdmaBackendPD {
@ -58,6 +70,7 @@ typedef struct RdmaBackendCQ {
/* Backend queue pair state. */
typedef struct RdmaBackendQP {
    struct ibv_pd *ibpd;
    struct ibv_qp *ibqp;    /* stays NULL for QP0/QP1 (no real verbs QP) */
    uint8_t sgid_idx;       /* backend source GID index, set at RTR */
} RdmaBackendQP;
#endif

View File

@ -43,7 +43,7 @@ static inline void res_tbl_free(RdmaRmResTbl *tbl)
{
qemu_mutex_destroy(&tbl->lock);
g_free(tbl->tbl);
bitmap_zero_extend(tbl->bitmap, tbl->tbl_sz, 0);
g_free(tbl->bitmap);
}
static inline void *res_tbl_get(RdmaRmResTbl *tbl, uint32_t handle)
@ -263,7 +263,7 @@ int rdma_rm_alloc_cq(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev,
}
cq->opaque = opaque;
cq->notify = false;
cq->notify = CNT_CLEAR;
rc = rdma_backend_create_cq(backend_dev, &cq->backend_cq, cqe);
if (rc) {
@ -291,7 +291,10 @@ void rdma_rm_req_notify_cq(RdmaDeviceResources *dev_res, uint32_t cq_handle,
return;
}
cq->notify = notify;
if (cq->notify != CNT_SET) {
cq->notify = notify ? CNT_ARM : CNT_CLEAR;
}
pr_dbg("notify=%d\n", cq->notify);
}
@ -349,6 +352,11 @@ int rdma_rm_alloc_qp(RdmaDeviceResources *dev_res, uint32_t pd_handle,
return -EINVAL;
}
if (qp_type == IBV_QPT_GSI) {
scq->notify = CNT_SET;
rcq->notify = CNT_SET;
}
qp = res_tbl_alloc(&dev_res->qp_tbl, &rm_qpn);
if (!qp) {
return -ENOMEM;
@ -383,7 +391,7 @@ out_dealloc_qp:
}
int rdma_rm_modify_qp(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev,
uint32_t qp_handle, uint32_t attr_mask,
uint32_t qp_handle, uint32_t attr_mask, uint8_t sgid_idx,
union ibv_gid *dgid, uint32_t dqpn,
enum ibv_qp_state qp_state, uint32_t qkey,
uint32_t rq_psn, uint32_t sq_psn)
@ -392,6 +400,7 @@ int rdma_rm_modify_qp(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev,
int ret;
pr_dbg("qpn=0x%x\n", qp_handle);
pr_dbg("qkey=0x%x\n", qkey);
qp = rdma_rm_get_qp(dev_res, qp_handle);
if (!qp) {
@ -422,9 +431,19 @@ int rdma_rm_modify_qp(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev,
}
if (qp->qp_state == IBV_QPS_RTR) {
/* Get backend gid index */
pr_dbg("Guest sgid_idx=%d\n", sgid_idx);
sgid_idx = rdma_rm_get_backend_gid_index(dev_res, backend_dev,
sgid_idx);
if (sgid_idx <= 0) { /* TODO check also less than bk.max_sgid */
pr_dbg("Fail to get bk sgid_idx for sgid_idx %d\n", sgid_idx);
return -EIO;
}
ret = rdma_backend_qp_state_rtr(backend_dev, &qp->backend_qp,
qp->qp_type, dgid, dqpn, rq_psn,
qkey, attr_mask & IBV_QP_QKEY);
qp->qp_type, sgid_idx, dgid, dqpn,
rq_psn, qkey,
attr_mask & IBV_QP_QKEY);
if (ret) {
return -EIO;
}
@ -515,11 +534,93 @@ void rdma_rm_dealloc_cqe_ctx(RdmaDeviceResources *dev_res, uint32_t cqe_ctx_id)
res_tbl_dealloc(&dev_res->cqe_ctx_tbl, cqe_ctx_id);
}
/*
 * Install a GID into the device's per-port GID table after registering it
 * with the backend. Returns 0 on success, -EINVAL on backend failure.
 */
int rdma_rm_add_gid(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev,
                    const char *ifname, union ibv_gid *gid, int gid_idx)
{
    if (rdma_backend_add_gid(backend_dev, ifname, gid)) {
        pr_dbg("Fail to add gid\n");
        return -EINVAL;
    }

    memcpy(&dev_res->port.gid_tbl[gid_idx].gid, gid, sizeof(*gid));

    return 0;
}
/*
 * Remove a GID from the device GID table and unregister it from the
 * backend. Empty slots (interface_id == 0) are skipped silently so the
 * backend is never asked to delete a GID that was never installed.
 * Returns 0 on success or -EINVAL on backend failure.
 */
int rdma_rm_del_gid(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev,
                    const char *ifname, int gid_idx)
{
    int rc;

    /* Nothing to do for an empty slot */
    if (!dev_res->port.gid_tbl[gid_idx].gid.global.interface_id) {
        return 0;
    }

    rc = rdma_backend_del_gid(backend_dev, ifname,
                              &dev_res->port.gid_tbl[gid_idx].gid);
    if (rc) {
        pr_dbg("Fail to delete gid\n");
        return -EINVAL;
    }

    /* Clear the slot and invalidate the cached backend index */
    memset(dev_res->port.gid_tbl[gid_idx].gid.raw, 0,
           sizeof(dev_res->port.gid_tbl[gid_idx].gid));
    dev_res->port.gid_tbl[gid_idx].backend_gid_index = -1;

    return 0;
}
/*
 * Translate a guest sgid index into the backend device's GID table index,
 * resolving lazily and caching the result in the port's gid_tbl.
 * Returns the backend index (>= 0), or a negative value on bad input or
 * an unresolved GID.
 */
int rdma_rm_get_backend_gid_index(RdmaDeviceResources *dev_res,
                                  RdmaBackendDev *backend_dev, int sgid_idx)
{
    /*
     * Bug fix: gid_tbl holds MAX_PORT_GIDS entries (see init_ports), so the
     * old '> MAX_PORT_GIDS' check accepted sgid_idx == MAX_PORT_GIDS and
     * indexed one past the end of the array.
     */
    if (unlikely(sgid_idx < 0 || sgid_idx >= MAX_PORT_GIDS)) {
        pr_dbg("Got invalid sgid_idx %d\n", sgid_idx);
        return -EINVAL;
    }

    if (unlikely(dev_res->port.gid_tbl[sgid_idx].backend_gid_index == -1)) {
        /* First use of this slot: resolve via the backend and cache it */
        dev_res->port.gid_tbl[sgid_idx].backend_gid_index =
            rdma_backend_get_gid_index(backend_dev,
                                       &dev_res->port.gid_tbl[sgid_idx].gid);
    }

    pr_dbg("backend_gid_index=%d\n",
           dev_res->port.gid_tbl[sgid_idx].backend_gid_index);

    return dev_res->port.gid_tbl[sgid_idx].backend_gid_index;
}
/* GDestroyNotify for the QP hash table: drops the GBytes key reference. */
static void destroy_qp_hash_key(gpointer data)
{
    GBytes *key = data;

    g_bytes_unref(key);
}
/* Reset the port: state DOWN and every GID slot's backend index unset. */
static void init_ports(RdmaDeviceResources *dev_res)
{
    int gid_idx;

    memset(&dev_res->port, 0, sizeof(dev_res->port));

    dev_res->port.state = IBV_PORT_DOWN;

    /* -1 marks "backend index not resolved yet" for every slot */
    for (gid_idx = 0; gid_idx < MAX_PORT_GIDS; gid_idx++) {
        dev_res->port.gid_tbl[gid_idx].backend_gid_index = -1;
    }
}
/* Bring the port down and unregister every GID still installed on it. */
static void fini_ports(RdmaDeviceResources *dev_res,
                       RdmaBackendDev *backend_dev, const char *ifname)
{
    int gid_idx;

    dev_res->port.state = IBV_PORT_DOWN;

    /* rdma_rm_del_gid skips empty slots, so the full range is safe */
    for (gid_idx = 0; gid_idx < MAX_PORT_GIDS; gid_idx++) {
        rdma_rm_del_gid(dev_res, backend_dev, ifname, gid_idx);
    }
}
int rdma_rm_init(RdmaDeviceResources *dev_res, struct ibv_device_attr *dev_attr,
Error **errp)
{
@ -537,11 +638,16 @@ int rdma_rm_init(RdmaDeviceResources *dev_res, struct ibv_device_attr *dev_attr,
dev_attr->max_qp_wr, sizeof(void *));
res_tbl_init("UC", &dev_res->uc_tbl, MAX_UCS, sizeof(RdmaRmUC));
init_ports(dev_res);
return 0;
}
void rdma_rm_fini(RdmaDeviceResources *dev_res)
void rdma_rm_fini(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev,
const char *ifname)
{
fini_ports(dev_res, backend_dev, ifname);
res_tbl_free(&dev_res->uc_tbl);
res_tbl_free(&dev_res->cqe_ctx_tbl);
res_tbl_free(&dev_res->qp_tbl);

View File

@ -22,7 +22,8 @@
int rdma_rm_init(RdmaDeviceResources *dev_res, struct ibv_device_attr *dev_attr,
Error **errp);
void rdma_rm_fini(RdmaDeviceResources *dev_res);
void rdma_rm_fini(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev,
const char *ifname);
int rdma_rm_alloc_pd(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev,
uint32_t *pd_handle, uint32_t ctx_handle);
@ -55,7 +56,7 @@ int rdma_rm_alloc_qp(RdmaDeviceResources *dev_res, uint32_t pd_handle,
uint32_t recv_cq_handle, void *opaque, uint32_t *qpn);
RdmaRmQP *rdma_rm_get_qp(RdmaDeviceResources *dev_res, uint32_t qpn);
int rdma_rm_modify_qp(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev,
uint32_t qp_handle, uint32_t attr_mask,
uint32_t qp_handle, uint32_t attr_mask, uint8_t sgid_idx,
union ibv_gid *dgid, uint32_t dqpn,
enum ibv_qp_state qp_state, uint32_t qkey,
uint32_t rq_psn, uint32_t sq_psn);
@ -69,4 +70,16 @@ int rdma_rm_alloc_cqe_ctx(RdmaDeviceResources *dev_res, uint32_t *cqe_ctx_id,
void *rdma_rm_get_cqe_ctx(RdmaDeviceResources *dev_res, uint32_t cqe_ctx_id);
void rdma_rm_dealloc_cqe_ctx(RdmaDeviceResources *dev_res, uint32_t cqe_ctx_id);
int rdma_rm_add_gid(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev,
const char *ifname, union ibv_gid *gid, int gid_idx);
int rdma_rm_del_gid(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev,
const char *ifname, int gid_idx);
int rdma_rm_get_backend_gid_index(RdmaDeviceResources *dev_res,
RdmaBackendDev *backend_dev, int sgid_idx);
static inline union ibv_gid *rdma_rm_get_gid(RdmaDeviceResources *dev_res,
int sgid_idx)
{
return &dev_res->port.gid_tbl[sgid_idx].gid;
}
#endif

View File

@ -18,8 +18,8 @@
#include "rdma_backend_defs.h"
#define MAX_PORTS 1
#define MAX_PORT_GIDS 1
#define MAX_PORTS 1 /* Do not change - we support only one port */
#define MAX_PORT_GIDS 255
#define MAX_GIDS MAX_PORT_GIDS
#define MAX_PORT_PKEYS 1
#define MAX_PKEYS MAX_PORT_PKEYS
@ -49,10 +49,16 @@ typedef struct RdmaRmPD {
uint32_t ctx_handle;
} RdmaRmPD;
typedef enum CQNotificationType {
CNT_CLEAR,
CNT_ARM,
CNT_SET,
} CQNotificationType;
typedef struct RdmaRmCQ {
RdmaBackendCQ backend_cq;
void *opaque;
bool notify;
CQNotificationType notify;
} RdmaRmCQ;
/* MR (DMA region) */
@ -80,13 +86,18 @@ typedef struct RdmaRmQP {
enum ibv_qp_state qp_state;
} RdmaRmQP;
typedef struct RdmaRmGid {
union ibv_gid gid;
int backend_gid_index;
} RdmaRmGid;
typedef struct RdmaRmPort {
union ibv_gid gid_tbl[MAX_PORT_GIDS];
RdmaRmGid gid_tbl[MAX_PORT_GIDS];
enum ibv_port_state state;
} RdmaRmPort;
typedef struct RdmaDeviceResources {
RdmaRmPort ports[MAX_PORTS];
RdmaRmPort port;
RdmaRmResTbl pd_tbl;
RdmaRmResTbl mr_tbl;
RdmaRmResTbl uc_tbl;

View File

@ -19,6 +19,7 @@
#include "hw/pci/pci.h"
#include "sysemu/dma.h"
#include "stdio.h"
#define pr_info(fmt, ...) \
fprintf(stdout, "%s: %-20s (%3d): " fmt, "rdma", __func__, __LINE__,\
@ -39,12 +40,36 @@ extern unsigned long pr_dbg_cnt;
#define pr_dbg(fmt, ...) \
fprintf(stdout, "%lx %ld: %-20s (%3d): " fmt, pthread_self(), pr_dbg_cnt++, \
__func__, __LINE__, ## __VA_ARGS__)
/*
 * Debug helper: hex-dump @len bytes of @buf with a title line via pr_dbg.
 * Wrapped in do { } while (0) so the macro behaves as a single statement
 * (the bare { } form broke `if (x) pr_dbg_buf(...); else ...`).
 */
#define pr_dbg_buf(title, buf, len) \
do { \
    int i; \
    char *b = g_malloc0(len * 3 + 1); \
    char b1[4]; \
    for (i = 0; i < len; i++) { \
        sprintf(b1, "%.2X ", buf[i] & 0x000000FF); \
        strcat(b, b1); \
    } \
    pr_dbg("%s (%d): %s\n", title, len, b); \
    g_free(b); \
} while (0)
#else
#define init_pr_dbg(void)
#define pr_dbg(fmt, ...)
#define pr_dbg_buf(title, buf, len)
#endif
void *rdma_pci_dma_map(PCIDevice *dev, dma_addr_t addr, dma_addr_t plen);
void rdma_pci_dma_unmap(PCIDevice *dev, void *buffer, dma_addr_t len);
/*
 * Derive a modified EUI-64 interface identifier from a 48-bit MAC address:
 * insert 0xFFFE between the OUI and the NIC-specific bytes, then flip the
 * universal/local bit of the first octet (RFC 4291, Appendix A).
 *
 * @eui:  output buffer, 8 bytes
 * @addr: input MAC address, 6 bytes
 */
static inline void addrconf_addr_eui48(uint8_t *eui, const char *addr)
{
    eui[0] = addr[0] ^ 2;   /* toggle the universal/local (U/L) bit */
    eui[1] = addr[1];
    eui[2] = addr[2];
    eui[3] = 0xFF;          /* fixed 0xFFFE filler in the middle */
    eui[4] = 0xFE;
    eui[5] = addr[3];
    eui[6] = addr[4];
    eui[7] = addr[5];
}
#endif

View File

@ -17,8 +17,11 @@
#define PVRDMA_PVRDMA_H
#include "qemu/units.h"
#include "qemu/notify.h"
#include "hw/pci/pci.h"
#include "hw/pci/msix.h"
#include "chardev/char-fe.h"
#include "hw/net/vmxnet3_defs.h"
#include "../rdma_backend_defs.h"
#include "../rdma_rm_defs.h"
@ -51,7 +54,7 @@
#define PVRDMA_FW_VERSION 14
/* Some defaults */
#define PVRDMA_PKEY 0x7FFF
#define PVRDMA_PKEY 0xFFFF
typedef struct DSRInfo {
dma_addr_t dma;
@ -78,11 +81,14 @@ typedef struct PVRDMADev {
int interrupt_mask;
struct ibv_device_attr dev_attr;
uint64_t node_guid;
char *backend_eth_device_name;
char *backend_device_name;
uint8_t backend_gid_idx;
uint8_t backend_port_num;
RdmaBackendDev backend_dev;
RdmaDeviceResources rdma_dev_res;
CharBackend mad_chr;
VMXNET3State *func0;
Notifier shutdown_notifier;
} PVRDMADev;
#define PVRDMA_DEV(dev) OBJECT_CHECK(PVRDMADev, (dev), PVRDMA_HW_NAME)

View File

@ -128,6 +128,9 @@ static int query_port(PVRDMADev *dev, union pvrdma_cmd_req *req,
struct pvrdma_port_attr attrs = {0};
pr_dbg("port=%d\n", cmd->port_num);
if (cmd->port_num > MAX_PORTS) {
return -EINVAL;
}
if (rdma_backend_query_port(&dev->backend_dev,
(struct ibv_port_attr *)&attrs)) {
@ -135,11 +138,9 @@ static int query_port(PVRDMADev *dev, union pvrdma_cmd_req *req,
}
memset(resp, 0, sizeof(*resp));
resp->hdr.response = cmd->hdr.response;
resp->hdr.ack = PVRDMA_CMD_QUERY_PORT_RESP;
resp->hdr.err = 0;
resp->attrs.state = attrs.state;
resp->attrs.state = dev->func0->device_active ? attrs.state :
PVRDMA_PORT_DOWN;
resp->attrs.max_mtu = attrs.max_mtu;
resp->attrs.active_mtu = attrs.active_mtu;
resp->attrs.phys_state = attrs.phys_state;
@ -159,12 +160,16 @@ static int query_pkey(PVRDMADev *dev, union pvrdma_cmd_req *req,
struct pvrdma_cmd_query_pkey_resp *resp = &rsp->query_pkey_resp;
pr_dbg("port=%d\n", cmd->port_num);
if (cmd->port_num > MAX_PORTS) {
return -EINVAL;
}
pr_dbg("index=%d\n", cmd->index);
if (cmd->index > MAX_PKEYS) {
return -EINVAL;
}
memset(resp, 0, sizeof(*resp));
resp->hdr.response = cmd->hdr.response;
resp->hdr.ack = PVRDMA_CMD_QUERY_PKEY_RESP;
resp->hdr.err = 0;
resp->pkey = PVRDMA_PKEY;
pr_dbg("pkey=0x%x\n", resp->pkey);
@ -177,17 +182,15 @@ static int create_pd(PVRDMADev *dev, union pvrdma_cmd_req *req,
{
struct pvrdma_cmd_create_pd *cmd = &req->create_pd;
struct pvrdma_cmd_create_pd_resp *resp = &rsp->create_pd_resp;
int rc;
pr_dbg("context=0x%x\n", cmd->ctx_handle ? cmd->ctx_handle : 0);
memset(resp, 0, sizeof(*resp));
resp->hdr.response = cmd->hdr.response;
resp->hdr.ack = PVRDMA_CMD_CREATE_PD_RESP;
resp->hdr.err = rdma_rm_alloc_pd(&dev->rdma_dev_res, &dev->backend_dev,
&resp->pd_handle, cmd->ctx_handle);
rc = rdma_rm_alloc_pd(&dev->rdma_dev_res, &dev->backend_dev,
&resp->pd_handle, cmd->ctx_handle);
pr_dbg("ret=%d\n", resp->hdr.err);
return resp->hdr.err;
return rc;
}
static int destroy_pd(PVRDMADev *dev, union pvrdma_cmd_req *req,
@ -209,10 +212,9 @@ static int create_mr(PVRDMADev *dev, union pvrdma_cmd_req *req,
struct pvrdma_cmd_create_mr_resp *resp = &rsp->create_mr_resp;
PCIDevice *pci_dev = PCI_DEVICE(dev);
void *host_virt = NULL;
int rc = 0;
memset(resp, 0, sizeof(*resp));
resp->hdr.response = cmd->hdr.response;
resp->hdr.ack = PVRDMA_CMD_CREATE_MR_RESP;
pr_dbg("pd_handle=%d\n", cmd->pd_handle);
pr_dbg("access_flags=0x%x\n", cmd->access_flags);
@ -223,22 +225,18 @@ static int create_mr(PVRDMADev *dev, union pvrdma_cmd_req *req,
cmd->length);
if (!host_virt) {
pr_dbg("Failed to map to pdir\n");
resp->hdr.err = -EINVAL;
goto out;
return -EINVAL;
}
}
resp->hdr.err = rdma_rm_alloc_mr(&dev->rdma_dev_res, cmd->pd_handle,
cmd->start, cmd->length, host_virt,
cmd->access_flags, &resp->mr_handle,
&resp->lkey, &resp->rkey);
if (host_virt && !resp->hdr.err) {
rc = rdma_rm_alloc_mr(&dev->rdma_dev_res, cmd->pd_handle, cmd->start,
cmd->length, host_virt, cmd->access_flags,
&resp->mr_handle, &resp->lkey, &resp->rkey);
if (rc && host_virt) {
munmap(host_virt, cmd->length);
}
out:
pr_dbg("ret=%d\n", resp->hdr.err);
return resp->hdr.err;
return rc;
}
static int destroy_mr(PVRDMADev *dev, union pvrdma_cmd_req *req,
@ -261,6 +259,11 @@ static int create_cq_ring(PCIDevice *pci_dev , PvrdmaRing **ring,
int rc = -EINVAL;
char ring_name[MAX_RING_NAME_SZ];
if (!nchunks || nchunks > PVRDMA_MAX_FAST_REG_PAGES) {
pr_dbg("invalid nchunks: %d\n", nchunks);
return rc;
}
pr_dbg("pdir_dma=0x%llx\n", (long long unsigned int)pdir_dma);
dir = rdma_pci_dma_map(pci_dev, pdir_dma, TARGET_PAGE_SIZE);
if (!dir) {
@ -310,34 +313,43 @@ out:
return rc;
}
/*
 * Release a CQ ring: free the ring itself, unmap the DMA-mapped
 * ring-state page and free the PvrdmaRing object.
 */
static void destroy_cq_ring(PvrdmaRing *ring)
{
pvrdma_ring_free(ring);
/* ring_state was in slot 1, not 0 so need to jump back */
rdma_pci_dma_unmap(ring->dev, --ring->ring_state, TARGET_PAGE_SIZE);
g_free(ring);
}
static int create_cq(PVRDMADev *dev, union pvrdma_cmd_req *req,
union pvrdma_cmd_resp *rsp)
{
struct pvrdma_cmd_create_cq *cmd = &req->create_cq;
struct pvrdma_cmd_create_cq_resp *resp = &rsp->create_cq_resp;
PvrdmaRing *ring = NULL;
int rc;
memset(resp, 0, sizeof(*resp));
resp->hdr.response = cmd->hdr.response;
resp->hdr.ack = PVRDMA_CMD_CREATE_CQ_RESP;
resp->cqe = cmd->cqe;
resp->hdr.err = create_cq_ring(PCI_DEVICE(dev), &ring, cmd->pdir_dma,
cmd->nchunks, cmd->cqe);
if (resp->hdr.err) {
goto out;
rc = create_cq_ring(PCI_DEVICE(dev), &ring, cmd->pdir_dma, cmd->nchunks,
cmd->cqe);
if (rc) {
return rc;
}
pr_dbg("ring=%p\n", ring);
resp->hdr.err = rdma_rm_alloc_cq(&dev->rdma_dev_res, &dev->backend_dev,
cmd->cqe, &resp->cq_handle, ring);
rc = rdma_rm_alloc_cq(&dev->rdma_dev_res, &dev->backend_dev, cmd->cqe,
&resp->cq_handle, ring);
if (rc) {
destroy_cq_ring(ring);
}
resp->cqe = cmd->cqe;
out:
pr_dbg("ret=%d\n", resp->hdr.err);
return resp->hdr.err;
return rc;
}
static int destroy_cq(PVRDMADev *dev, union pvrdma_cmd_req *req,
@ -356,10 +368,7 @@ static int destroy_cq(PVRDMADev *dev, union pvrdma_cmd_req *req,
}
ring = (PvrdmaRing *)cq->opaque;
pvrdma_ring_free(ring);
/* ring_state was in slot 1, not 0 so need to jump back */
rdma_pci_dma_unmap(PCI_DEVICE(dev), --ring->ring_state, TARGET_PAGE_SIZE);
g_free(ring);
destroy_cq_ring(ring);
rdma_rm_dealloc_cq(&dev->rdma_dev_res, cmd->cq_handle);
@ -377,6 +386,12 @@ static int create_qp_rings(PCIDevice *pci_dev, uint64_t pdir_dma,
char ring_name[MAX_RING_NAME_SZ];
uint32_t wqe_sz;
if (!spages || spages > PVRDMA_MAX_FAST_REG_PAGES
|| !rpages || rpages > PVRDMA_MAX_FAST_REG_PAGES) {
pr_dbg("invalid pages: %d, %d\n", spages, rpages);
return rc;
}
pr_dbg("pdir_dma=0x%llx\n", (long long unsigned int)pdir_dma);
dir = rdma_pci_dma_map(pci_dev, pdir_dma, TARGET_PAGE_SIZE);
if (!dir) {
@ -451,36 +466,49 @@ out:
return rc;
}
/*
 * Release a QP's ring pair: ring[0] is the send ring ("sring"), ring[1]
 * the receive ring ("rring"). Both are freed, then the shared DMA-mapped
 * ring-state page is unmapped and the array itself released.
 */
static void destroy_qp_rings(PvrdmaRing *ring)
{
pr_dbg("sring=%p\n", &ring[0]);
pvrdma_ring_free(&ring[0]);
pr_dbg("rring=%p\n", &ring[1]);
pvrdma_ring_free(&ring[1]);
rdma_pci_dma_unmap(ring->dev, ring->ring_state, TARGET_PAGE_SIZE);
g_free(ring);
}
static int create_qp(PVRDMADev *dev, union pvrdma_cmd_req *req,
union pvrdma_cmd_resp *rsp)
{
struct pvrdma_cmd_create_qp *cmd = &req->create_qp;
struct pvrdma_cmd_create_qp_resp *resp = &rsp->create_qp_resp;
PvrdmaRing *rings = NULL;
int rc;
memset(resp, 0, sizeof(*resp));
resp->hdr.response = cmd->hdr.response;
resp->hdr.ack = PVRDMA_CMD_CREATE_QP_RESP;
pr_dbg("total_chunks=%d\n", cmd->total_chunks);
pr_dbg("send_chunks=%d\n", cmd->send_chunks);
resp->hdr.err = create_qp_rings(PCI_DEVICE(dev), cmd->pdir_dma, &rings,
cmd->max_send_wr, cmd->max_send_sge,
cmd->send_chunks, cmd->max_recv_wr,
cmd->max_recv_sge, cmd->total_chunks -
cmd->send_chunks - 1);
if (resp->hdr.err) {
goto out;
rc = create_qp_rings(PCI_DEVICE(dev), cmd->pdir_dma, &rings,
cmd->max_send_wr, cmd->max_send_sge, cmd->send_chunks,
cmd->max_recv_wr, cmd->max_recv_sge,
cmd->total_chunks - cmd->send_chunks - 1);
if (rc) {
return rc;
}
pr_dbg("rings=%p\n", rings);
resp->hdr.err = rdma_rm_alloc_qp(&dev->rdma_dev_res, cmd->pd_handle,
cmd->qp_type, cmd->max_send_wr,
cmd->max_send_sge, cmd->send_cq_handle,
cmd->max_recv_wr, cmd->max_recv_sge,
cmd->recv_cq_handle, rings, &resp->qpn);
rc = rdma_rm_alloc_qp(&dev->rdma_dev_res, cmd->pd_handle, cmd->qp_type,
cmd->max_send_wr, cmd->max_send_sge,
cmd->send_cq_handle, cmd->max_recv_wr,
cmd->max_recv_sge, cmd->recv_cq_handle, rings,
&resp->qpn);
if (rc) {
destroy_qp_rings(rings);
return rc;
}
resp->max_send_wr = cmd->max_send_wr;
resp->max_recv_wr = cmd->max_recv_wr;
@ -488,32 +516,31 @@ static int create_qp(PVRDMADev *dev, union pvrdma_cmd_req *req,
resp->max_recv_sge = cmd->max_recv_sge;
resp->max_inline_data = cmd->max_inline_data;
out:
pr_dbg("ret=%d\n", resp->hdr.err);
return resp->hdr.err;
return 0;
}
static int modify_qp(PVRDMADev *dev, union pvrdma_cmd_req *req,
union pvrdma_cmd_resp *rsp)
{
struct pvrdma_cmd_modify_qp *cmd = &req->modify_qp;
int rc;
pr_dbg("qp_handle=%d\n", cmd->qp_handle);
memset(rsp, 0, sizeof(*rsp));
rsp->hdr.response = cmd->hdr.response;
rsp->hdr.ack = PVRDMA_CMD_MODIFY_QP_RESP;
rsp->hdr.err = rdma_rm_modify_qp(&dev->rdma_dev_res, &dev->backend_dev,
cmd->qp_handle, cmd->attr_mask,
(union ibv_gid *)&cmd->attrs.ah_attr.grh.dgid,
cmd->attrs.dest_qp_num,
(enum ibv_qp_state)cmd->attrs.qp_state,
cmd->attrs.qkey, cmd->attrs.rq_psn,
cmd->attrs.sq_psn);
/* No need to verify sgid_index since it is u8 */
pr_dbg("ret=%d\n", rsp->hdr.err);
return rsp->hdr.err;
rc = rdma_rm_modify_qp(&dev->rdma_dev_res, &dev->backend_dev,
cmd->qp_handle, cmd->attr_mask,
cmd->attrs.ah_attr.grh.sgid_index,
(union ibv_gid *)&cmd->attrs.ah_attr.grh.dgid,
cmd->attrs.dest_qp_num,
(enum ibv_qp_state)cmd->attrs.qp_state,
cmd->attrs.qkey, cmd->attrs.rq_psn,
cmd->attrs.sq_psn);
return rc;
}
static int query_qp(PVRDMADev *dev, union pvrdma_cmd_req *req,
@ -522,21 +549,18 @@ static int query_qp(PVRDMADev *dev, union pvrdma_cmd_req *req,
struct pvrdma_cmd_query_qp *cmd = &req->query_qp;
struct pvrdma_cmd_query_qp_resp *resp = &rsp->query_qp_resp;
struct ibv_qp_init_attr init_attr;
int rc;
pr_dbg("qp_handle=%d\n", cmd->qp_handle);
pr_dbg("attr_mask=0x%x\n", cmd->attr_mask);
memset(rsp, 0, sizeof(*rsp));
rsp->hdr.response = cmd->hdr.response;
rsp->hdr.ack = PVRDMA_CMD_QUERY_QP_RESP;
rsp->hdr.err = rdma_rm_query_qp(&dev->rdma_dev_res, &dev->backend_dev,
cmd->qp_handle,
(struct ibv_qp_attr *)&resp->attrs,
cmd->attr_mask, &init_attr);
rc = rdma_rm_query_qp(&dev->rdma_dev_res, &dev->backend_dev, cmd->qp_handle,
(struct ibv_qp_attr *)&resp->attrs, cmd->attr_mask,
&init_attr);
pr_dbg("ret=%d\n", rsp->hdr.err);
return rsp->hdr.err;
return rc;
}
static int destroy_qp(PVRDMADev *dev, union pvrdma_cmd_req *req,
@ -555,13 +579,7 @@ static int destroy_qp(PVRDMADev *dev, union pvrdma_cmd_req *req,
rdma_rm_dealloc_qp(&dev->rdma_dev_res, cmd->qp_handle);
ring = (PvrdmaRing *)qp->opaque;
pr_dbg("sring=%p\n", &ring[0]);
pvrdma_ring_free(&ring[0]);
pr_dbg("rring=%p\n", &ring[1]);
pvrdma_ring_free(&ring[1]);
rdma_pci_dma_unmap(PCI_DEVICE(dev), ring->ring_state, TARGET_PAGE_SIZE);
g_free(ring);
destroy_qp_rings(ring);
return 0;
}
@ -570,10 +588,8 @@ static int create_bind(PVRDMADev *dev, union pvrdma_cmd_req *req,
union pvrdma_cmd_resp *rsp)
{
struct pvrdma_cmd_create_bind *cmd = &req->create_bind;
#ifdef PVRDMA_DEBUG
__be64 *subnet = (__be64 *)&cmd->new_gid[0];
__be64 *if_id = (__be64 *)&cmd->new_gid[8];
#endif
int rc;
union ibv_gid *gid = (union ibv_gid *)&cmd->new_gid;
pr_dbg("index=%d\n", cmd->index);
@ -582,26 +598,20 @@ static int create_bind(PVRDMADev *dev, union pvrdma_cmd_req *req,
}
pr_dbg("gid[%d]=0x%llx,0x%llx\n", cmd->index,
(long long unsigned int)be64_to_cpu(*subnet),
(long long unsigned int)be64_to_cpu(*if_id));
(long long unsigned int)be64_to_cpu(gid->global.subnet_prefix),
(long long unsigned int)be64_to_cpu(gid->global.interface_id));
/* Driver forces to one port only */
memcpy(dev->rdma_dev_res.ports[0].gid_tbl[cmd->index].raw, &cmd->new_gid,
sizeof(cmd->new_gid));
rc = rdma_rm_add_gid(&dev->rdma_dev_res, &dev->backend_dev,
dev->backend_eth_device_name, gid, cmd->index);
/* TODO: Since drivers stores node_guid at load_dsr phase then this
* assignment is not relevant, i need to figure out a way how to
* retrieve MAC of our netdev */
dev->node_guid = dev->rdma_dev_res.ports[0].gid_tbl[0].global.interface_id;
pr_dbg("dev->node_guid=0x%llx\n",
(long long unsigned int)be64_to_cpu(dev->node_guid));
return 0;
return rc;
}
static int destroy_bind(PVRDMADev *dev, union pvrdma_cmd_req *req,
union pvrdma_cmd_resp *rsp)
{
int rc;
struct pvrdma_cmd_destroy_bind *cmd = &req->destroy_bind;
pr_dbg("index=%d\n", cmd->index);
@ -610,10 +620,10 @@ static int destroy_bind(PVRDMADev *dev, union pvrdma_cmd_req *req,
return -EINVAL;
}
memset(dev->rdma_dev_res.ports[0].gid_tbl[cmd->index].raw, 0,
sizeof(dev->rdma_dev_res.ports[0].gid_tbl[cmd->index].raw));
rc = rdma_rm_del_gid(&dev->rdma_dev_res, &dev->backend_dev,
dev->backend_eth_device_name, cmd->index);
return 0;
return rc;
}
static int create_uc(PVRDMADev *dev, union pvrdma_cmd_req *req,
@ -621,18 +631,14 @@ static int create_uc(PVRDMADev *dev, union pvrdma_cmd_req *req,
{
struct pvrdma_cmd_create_uc *cmd = &req->create_uc;
struct pvrdma_cmd_create_uc_resp *resp = &rsp->create_uc_resp;
int rc;
pr_dbg("pfn=%d\n", cmd->pfn);
memset(resp, 0, sizeof(*resp));
resp->hdr.response = cmd->hdr.response;
resp->hdr.ack = PVRDMA_CMD_CREATE_UC_RESP;
resp->hdr.err = rdma_rm_alloc_uc(&dev->rdma_dev_res, cmd->pfn,
&resp->ctx_handle);
rc = rdma_rm_alloc_uc(&dev->rdma_dev_res, cmd->pfn, &resp->ctx_handle);
pr_dbg("ret=%d\n", resp->hdr.err);
return 0;
return rc;
}
static int destroy_uc(PVRDMADev *dev, union pvrdma_cmd_req *req,
@ -646,30 +652,32 @@ static int destroy_uc(PVRDMADev *dev, union pvrdma_cmd_req *req,
return 0;
}
struct cmd_handler {
uint32_t cmd;
uint32_t ack;
int (*exec)(PVRDMADev *dev, union pvrdma_cmd_req *req,
union pvrdma_cmd_resp *rsp);
};
static struct cmd_handler cmd_handlers[] = {
{PVRDMA_CMD_QUERY_PORT, query_port},
{PVRDMA_CMD_QUERY_PKEY, query_pkey},
{PVRDMA_CMD_CREATE_PD, create_pd},
{PVRDMA_CMD_DESTROY_PD, destroy_pd},
{PVRDMA_CMD_CREATE_MR, create_mr},
{PVRDMA_CMD_DESTROY_MR, destroy_mr},
{PVRDMA_CMD_CREATE_CQ, create_cq},
{PVRDMA_CMD_RESIZE_CQ, NULL},
{PVRDMA_CMD_DESTROY_CQ, destroy_cq},
{PVRDMA_CMD_CREATE_QP, create_qp},
{PVRDMA_CMD_MODIFY_QP, modify_qp},
{PVRDMA_CMD_QUERY_QP, query_qp},
{PVRDMA_CMD_DESTROY_QP, destroy_qp},
{PVRDMA_CMD_CREATE_UC, create_uc},
{PVRDMA_CMD_DESTROY_UC, destroy_uc},
{PVRDMA_CMD_CREATE_BIND, create_bind},
{PVRDMA_CMD_DESTROY_BIND, destroy_bind},
{PVRDMA_CMD_QUERY_PORT, PVRDMA_CMD_QUERY_PORT_RESP, query_port},
{PVRDMA_CMD_QUERY_PKEY, PVRDMA_CMD_QUERY_PKEY_RESP, query_pkey},
{PVRDMA_CMD_CREATE_PD, PVRDMA_CMD_CREATE_PD_RESP, create_pd},
{PVRDMA_CMD_DESTROY_PD, PVRDMA_CMD_DESTROY_PD_RESP_NOOP, destroy_pd},
{PVRDMA_CMD_CREATE_MR, PVRDMA_CMD_CREATE_MR_RESP, create_mr},
{PVRDMA_CMD_DESTROY_MR, PVRDMA_CMD_DESTROY_MR_RESP_NOOP, destroy_mr},
{PVRDMA_CMD_CREATE_CQ, PVRDMA_CMD_CREATE_CQ_RESP, create_cq},
{PVRDMA_CMD_RESIZE_CQ, PVRDMA_CMD_RESIZE_CQ_RESP, NULL},
{PVRDMA_CMD_DESTROY_CQ, PVRDMA_CMD_DESTROY_CQ_RESP_NOOP, destroy_cq},
{PVRDMA_CMD_CREATE_QP, PVRDMA_CMD_CREATE_QP_RESP, create_qp},
{PVRDMA_CMD_MODIFY_QP, PVRDMA_CMD_MODIFY_QP_RESP, modify_qp},
{PVRDMA_CMD_QUERY_QP, PVRDMA_CMD_QUERY_QP_RESP, query_qp},
{PVRDMA_CMD_DESTROY_QP, PVRDMA_CMD_DESTROY_QP_RESP, destroy_qp},
{PVRDMA_CMD_CREATE_UC, PVRDMA_CMD_CREATE_UC_RESP, create_uc},
{PVRDMA_CMD_DESTROY_UC, PVRDMA_CMD_DESTROY_UC_RESP_NOOP, destroy_uc},
{PVRDMA_CMD_CREATE_BIND, PVRDMA_CMD_CREATE_BIND_RESP_NOOP, create_bind},
{PVRDMA_CMD_DESTROY_BIND, PVRDMA_CMD_DESTROY_BIND_RESP_NOOP, destroy_bind},
};
int execute_command(PVRDMADev *dev)
@ -692,7 +700,12 @@ int execute_command(PVRDMADev *dev)
}
err = cmd_handlers[dsr_info->req->hdr.cmd].exec(dev, dsr_info->req,
dsr_info->rsp);
dsr_info->rsp);
dsr_info->rsp->hdr.response = dsr_info->req->hdr.response;
dsr_info->rsp->hdr.ack = cmd_handlers[dsr_info->req->hdr.cmd].ack;
dsr_info->rsp->hdr.err = err < 0 ? -err : 0;
pr_dbg("rsp->hdr.err=%d\n", dsr_info->rsp->hdr.err);
out:
set_reg_val(dev, PVRDMA_REG_ERR, err);
post_interrupt(dev, INTR_VEC_CMD_RING);

View File

@ -73,23 +73,16 @@ out:
void *pvrdma_ring_next_elem_read(PvrdmaRing *ring)
{
int e;
unsigned int idx = 0, offset;
/*
pr_dbg("%s: t=%d, h=%d\n", ring->name, ring->ring_state->prod_tail,
ring->ring_state->cons_head);
*/
if (!pvrdma_idx_ring_has_data(ring->ring_state, ring->max_elems, &idx)) {
e = pvrdma_idx_ring_has_data(ring->ring_state, ring->max_elems, &idx);
if (e <= 0) {
pr_dbg("No more data in ring\n");
return NULL;
}
offset = idx * ring->elem_sz;
/*
pr_dbg("idx=%d\n", idx);
pr_dbg("offset=%d\n", offset);
*/
return ring->pages[offset / TARGET_PAGE_SIZE] + (offset % TARGET_PAGE_SIZE);
}
@ -105,20 +98,20 @@ void pvrdma_ring_read_inc(PvrdmaRing *ring)
void *pvrdma_ring_next_elem_write(PvrdmaRing *ring)
{
unsigned int idx, offset, tail;
int idx;
unsigned int offset, tail;
/*
pr_dbg("%s: t=%d, h=%d\n", ring->name, ring->ring_state->prod_tail,
ring->ring_state->cons_head);
*/
if (!pvrdma_idx_ring_has_space(ring->ring_state, ring->max_elems, &tail)) {
idx = pvrdma_idx_ring_has_space(ring->ring_state, ring->max_elems, &tail);
if (idx <= 0) {
pr_dbg("CQ is full\n");
return NULL;
}
idx = pvrdma_idx(&ring->ring_state->prod_tail, ring->max_elems);
/* TODO: tail == idx */
if (idx < 0 || tail != idx) {
pr_dbg("invalid idx\n");
return NULL;
}
offset = idx * ring->elem_sz;
return ring->pages[offset / TARGET_PAGE_SIZE] + (offset % TARGET_PAGE_SIZE);

View File

@ -24,6 +24,7 @@
#include "hw/qdev-properties.h"
#include "cpu.h"
#include "trace.h"
#include "sysemu/sysemu.h"
#include "../rdma_rm.h"
#include "../rdma_backend.h"
@ -36,9 +37,9 @@
#include "pvrdma_qp_ops.h"
static Property pvrdma_dev_properties[] = {
DEFINE_PROP_STRING("backend-dev", PVRDMADev, backend_device_name),
DEFINE_PROP_UINT8("backend-port", PVRDMADev, backend_port_num, 1),
DEFINE_PROP_UINT8("backend-gid-idx", PVRDMADev, backend_gid_idx, 0),
DEFINE_PROP_STRING("netdev", PVRDMADev, backend_eth_device_name),
DEFINE_PROP_STRING("ibdev", PVRDMADev, backend_device_name),
DEFINE_PROP_UINT8("ibport", PVRDMADev, backend_port_num, 1),
DEFINE_PROP_UINT64("dev-caps-max-mr-size", PVRDMADev, dev_attr.max_mr_size,
MAX_MR_SIZE),
DEFINE_PROP_INT32("dev-caps-max-qp", PVRDMADev, dev_attr.max_qp, MAX_QP),
@ -51,6 +52,7 @@ static Property pvrdma_dev_properties[] = {
DEFINE_PROP_INT32("dev-caps-max-qp-init-rd-atom", PVRDMADev,
dev_attr.max_qp_init_rd_atom, MAX_QP_INIT_RD_ATOM),
DEFINE_PROP_INT32("dev-caps-max-ah", PVRDMADev, dev_attr.max_ah, MAX_AH),
DEFINE_PROP_CHR("mad-chardev", PVRDMADev, mad_chr),
DEFINE_PROP_END_OF_LIST(),
};
@ -263,7 +265,7 @@ static void init_dsr_dev_caps(PVRDMADev *dev)
dsr->caps.sys_image_guid = 0;
pr_dbg("sys_image_guid=%" PRIx64 "\n", dsr->caps.sys_image_guid);
dsr->caps.node_guid = cpu_to_be64(dev->node_guid);
dsr->caps.node_guid = dev->node_guid;
pr_dbg("node_guid=%" PRIx64 "\n", be64_to_cpu(dsr->caps.node_guid));
dsr->caps.phys_port_cnt = MAX_PORTS;
@ -275,17 +277,6 @@ static void init_dsr_dev_caps(PVRDMADev *dev)
pr_dbg("Initialized\n");
}
static void init_ports(PVRDMADev *dev, Error **errp)
{
int i;
memset(dev->rdma_dev_res.ports, 0, sizeof(dev->rdma_dev_res.ports));
for (i = 0; i < MAX_PORTS; i++) {
dev->rdma_dev_res.ports[i].state = IBV_PORT_DOWN;
}
}
static void uninit_msix(PCIDevice *pdev, int used_vectors)
{
PVRDMADev *dev = PVRDMA_DEV(pdev);
@ -334,7 +325,8 @@ static void pvrdma_fini(PCIDevice *pdev)
pvrdma_qp_ops_fini();
rdma_rm_fini(&dev->rdma_dev_res);
rdma_rm_fini(&dev->rdma_dev_res, &dev->backend_dev,
dev->backend_eth_device_name);
rdma_backend_fini(&dev->backend_dev);
@ -343,6 +335,9 @@ static void pvrdma_fini(PCIDevice *pdev)
if (msix_enabled(pdev)) {
uninit_msix(pdev, RDMA_MAX_INTRS);
}
pr_dbg("Device %s %x.%x is down\n", pdev->name, PCI_SLOT(pdev->devfn),
PCI_FUNC(pdev->devfn));
}
static void pvrdma_stop(PVRDMADev *dev)
@ -368,13 +363,11 @@ static int unquiesce_device(PVRDMADev *dev)
return 0;
}
static int reset_device(PVRDMADev *dev)
static void reset_device(PVRDMADev *dev)
{
pvrdma_stop(dev);
pr_dbg("Device reset complete\n");
return 0;
}
static uint64_t regs_read(void *opaque, hwaddr addr, unsigned size)
@ -455,6 +448,11 @@ static const MemoryRegionOps regs_ops = {
},
};
/*
 * MMIO read handler for the UAR region: always returns all-ones.
 * The UAR is write-only in this device model, so reads carry no state;
 * supplying a handler avoids a NULL .read in the MemoryRegionOps.
 */
static uint64_t uar_read(void *opaque, hwaddr addr, unsigned size)
{
return 0xffffffff;
}
static void uar_write(void *opaque, hwaddr addr, uint64_t val, unsigned size)
{
PVRDMADev *dev = opaque;
@ -496,6 +494,7 @@ static void uar_write(void *opaque, hwaddr addr, uint64_t val, unsigned size)
}
static const MemoryRegionOps uar_ops = {
.read = uar_read,
.write = uar_write,
.endianness = DEVICE_LITTLE_ENDIAN,
.impl = {
@ -570,12 +569,21 @@ static int pvrdma_check_ram_shared(Object *obj, void *opaque)
return 0;
}
/*
 * Shutdown-notifier callback: recover the owning PVRDMADev from the
 * embedded Notifier and tear the device down so backend resources are
 * released when the VM shuts down.
 */
static void pvrdma_shutdown_notifier(Notifier *n, void *opaque)
{
PVRDMADev *dev = container_of(n, PVRDMADev, shutdown_notifier);
PCIDevice *pci_dev = PCI_DEVICE(dev);
pvrdma_fini(pci_dev);
}
static void pvrdma_realize(PCIDevice *pdev, Error **errp)
{
int rc;
int rc = 0;
PVRDMADev *dev = PVRDMA_DEV(pdev);
Object *memdev_root;
bool ram_shared = false;
PCIDevice *func0;
init_pr_dbg();
@ -587,6 +595,20 @@ static void pvrdma_realize(PCIDevice *pdev, Error **errp)
return;
}
func0 = pci_get_function_0(pdev);
/* Break if not vmxnet3 device in slot 0 */
if (strcmp(object_get_typename(&func0->qdev.parent_obj), TYPE_VMXNET3)) {
pr_dbg("func0 type is %s\n",
object_get_typename(&func0->qdev.parent_obj));
error_setg(errp, "Device on %x.0 must be %s", PCI_SLOT(pdev->devfn),
TYPE_VMXNET3);
return;
}
dev->func0 = VMXNET3(func0);
addrconf_addr_eui48((unsigned char *)&dev->node_guid,
(const char *)&dev->func0->conf.macaddr.a);
memdev_root = object_resolve_path("/objects", NULL);
if (memdev_root) {
object_child_foreach(memdev_root, pvrdma_check_ram_shared, &ram_shared);
@ -613,7 +635,7 @@ static void pvrdma_realize(PCIDevice *pdev, Error **errp)
rc = rdma_backend_init(&dev->backend_dev, pdev, &dev->rdma_dev_res,
dev->backend_device_name, dev->backend_port_num,
dev->backend_gid_idx, &dev->dev_attr, errp);
&dev->dev_attr, &dev->mad_chr, errp);
if (rc) {
goto out;
}
@ -623,15 +645,17 @@ static void pvrdma_realize(PCIDevice *pdev, Error **errp)
goto out;
}
init_ports(dev, errp);
rc = pvrdma_qp_ops_init();
if (rc) {
goto out;
}
dev->shutdown_notifier.notify = pvrdma_shutdown_notifier;
qemu_register_shutdown_notifier(&dev->shutdown_notifier);
out:
if (rc) {
pvrdma_fini(pdev);
error_append_hint(errp, "Device fail to load\n");
}
}

View File

@ -47,7 +47,7 @@ typedef struct PvrdmaRqWqe {
* 3. Interrupt host
*/
static int pvrdma_post_cqe(PVRDMADev *dev, uint32_t cq_handle,
struct pvrdma_cqe *cqe)
struct pvrdma_cqe *cqe, struct ibv_wc *wc)
{
struct pvrdma_cqe *cqe1;
struct pvrdma_cqne *cqne;
@ -66,6 +66,7 @@ static int pvrdma_post_cqe(PVRDMADev *dev, uint32_t cq_handle,
pr_dbg("Writing CQE\n");
cqe1 = pvrdma_ring_next_elem_write(ring);
if (unlikely(!cqe1)) {
pr_dbg("No CQEs in ring\n");
return -EINVAL;
}
@ -73,8 +74,20 @@ static int pvrdma_post_cqe(PVRDMADev *dev, uint32_t cq_handle,
cqe1->wr_id = cqe->wr_id;
cqe1->qp = cqe->qp;
cqe1->opcode = cqe->opcode;
cqe1->status = cqe->status;
cqe1->vendor_err = cqe->vendor_err;
cqe1->status = wc->status;
cqe1->byte_len = wc->byte_len;
cqe1->src_qp = wc->src_qp;
cqe1->wc_flags = wc->wc_flags;
cqe1->vendor_err = wc->vendor_err;
pr_dbg("wr_id=%" PRIx64 "\n", cqe1->wr_id);
pr_dbg("qp=0x%lx\n", cqe1->qp);
pr_dbg("opcode=%d\n", cqe1->opcode);
pr_dbg("status=%d\n", cqe1->status);
pr_dbg("byte_len=%d\n", cqe1->byte_len);
pr_dbg("src_qp=%d\n", cqe1->src_qp);
pr_dbg("wc_flags=%d\n", cqe1->wc_flags);
pr_dbg("vendor_err=%d\n", cqe1->vendor_err);
pvrdma_ring_write_inc(ring);
@ -89,26 +102,22 @@ static int pvrdma_post_cqe(PVRDMADev *dev, uint32_t cq_handle,
pvrdma_ring_write_inc(&dev->dsr_info.cq);
pr_dbg("cq->notify=%d\n", cq->notify);
if (cq->notify) {
cq->notify = false;
if (cq->notify != CNT_CLEAR) {
if (cq->notify == CNT_ARM) {
cq->notify = CNT_CLEAR;
}
post_interrupt(dev, INTR_VEC_CMD_COMPLETION_Q);
}
return 0;
}
static void pvrdma_qp_ops_comp_handler(int status, unsigned int vendor_err,
void *ctx)
static void pvrdma_qp_ops_comp_handler(void *ctx, struct ibv_wc *wc)
{
CompHandlerCtx *comp_ctx = (CompHandlerCtx *)ctx;
pr_dbg("cq_handle=%d\n", comp_ctx->cq_handle);
pr_dbg("wr_id=%" PRIx64 "\n", comp_ctx->cqe.wr_id);
pr_dbg("status=%d\n", status);
pr_dbg("vendor_err=0x%x\n", vendor_err);
comp_ctx->cqe.status = status;
comp_ctx->cqe.vendor_err = vendor_err;
pvrdma_post_cqe(comp_ctx->dev, comp_ctx->cq_handle, &comp_ctx->cqe);
pvrdma_post_cqe(comp_ctx->dev, comp_ctx->cq_handle, &comp_ctx->cqe, wc);
g_free(ctx);
}
@ -129,6 +138,8 @@ int pvrdma_qp_send(PVRDMADev *dev, uint32_t qp_handle)
RdmaRmQP *qp;
PvrdmaSqWqe *wqe;
PvrdmaRing *ring;
int sgid_idx;
union ibv_gid *sgid;
pr_dbg("qp_handle=0x%x\n", qp_handle);
@ -152,10 +163,28 @@ int pvrdma_qp_send(PVRDMADev *dev, uint32_t qp_handle)
comp_ctx->cq_handle = qp->send_cq_handle;
comp_ctx->cqe.wr_id = wqe->hdr.wr_id;
comp_ctx->cqe.qp = qp_handle;
comp_ctx->cqe.opcode = wqe->hdr.opcode;
comp_ctx->cqe.opcode = IBV_WC_SEND;
sgid = rdma_rm_get_gid(&dev->rdma_dev_res, wqe->hdr.wr.ud.av.gid_index);
if (!sgid) {
pr_dbg("Fail to get gid for idx %d\n", wqe->hdr.wr.ud.av.gid_index);
return -EIO;
}
pr_dbg("sgid_id=%d, sgid=0x%llx\n", wqe->hdr.wr.ud.av.gid_index,
sgid->global.interface_id);
sgid_idx = rdma_rm_get_backend_gid_index(&dev->rdma_dev_res,
&dev->backend_dev,
wqe->hdr.wr.ud.av.gid_index);
if (sgid_idx <= 0) {
pr_dbg("Fail to get bk sgid_idx for sgid_idx %d\n",
wqe->hdr.wr.ud.av.gid_index);
return -EIO;
}
rdma_backend_post_send(&dev->backend_dev, &qp->backend_qp, qp->qp_type,
(struct ibv_sge *)&wqe->sge[0], wqe->hdr.num_sge,
sgid_idx, sgid,
(union ibv_gid *)wqe->hdr.wr.ud.av.dgid,
wqe->hdr.wr.ud.remote_qpn,
wqe->hdr.wr.ud.remote_qkey, comp_ctx);
@ -194,8 +223,9 @@ int pvrdma_qp_recv(PVRDMADev *dev, uint32_t qp_handle)
comp_ctx = g_malloc(sizeof(CompHandlerCtx));
comp_ctx->dev = dev;
comp_ctx->cq_handle = qp->recv_cq_handle;
comp_ctx->cqe.qp = qp_handle;
comp_ctx->cqe.wr_id = wqe->hdr.wr_id;
comp_ctx->cqe.qp = qp_handle;
comp_ctx->cqe.opcode = IBV_WC_RECV;
rdma_backend_post_recv(&dev->backend_dev, &dev->rdma_dev_res,
&qp->backend_qp, qp->qp_type,

View File

@ -62,6 +62,7 @@ void qemu_register_wakeup_support(void);
void qemu_system_shutdown_request(ShutdownCause reason);
void qemu_system_powerdown_request(void);
void qemu_register_powerdown_notifier(Notifier *notifier);
void qemu_register_shutdown_notifier(Notifier *notifier);
void qemu_system_debug_request(void);
void qemu_system_vmstop_request(RunState reason);
void qemu_system_vmstop_request_prepare(void);

View File

@ -86,6 +86,7 @@
{ 'include': 'char.json' }
{ 'include': 'job.json' }
{ 'include': 'net.json' }
{ 'include': 'rdma.json' }
{ 'include': 'rocker.json' }
{ 'include': 'tpm.json' }
{ 'include': 'ui.json' }

38
qapi/rdma.json Normal file
View File

@ -0,0 +1,38 @@
# -*- Mode: Python -*-
#
##
# = RDMA device
##
##
# @RDMA_GID_STATUS_CHANGED:
#
# Emitted when guest driver adds/deletes GID to/from device
#
# @netdev: RoCE Network Device name
#
# @gid-status: Add or delete indication
#
# @subnet-prefix: Subnet Prefix
#
# @interface-id : Interface ID
#
# Since: 4.0
#
# Example:
#
# <- {"timestamp": {"seconds": 1541579657, "microseconds": 986760},
# "event": "RDMA_GID_STATUS_CHANGED",
# "data":
# {"netdev": "bridge0",
# "interface-id": 15880512517475447892,
# "gid-status": true,
# "subnet-prefix": 33022}}
#
##
{ 'event': 'RDMA_GID_STATUS_CHANGED',
'data': { 'netdev' : 'str',
'gid-status' : 'bool',
'subnet-prefix' : 'uint64',
'interface-id' : 'uint64' } }

15
vl.c
View File

@ -1577,6 +1577,8 @@ static NotifierList suspend_notifiers =
NOTIFIER_LIST_INITIALIZER(suspend_notifiers);
static NotifierList wakeup_notifiers =
NOTIFIER_LIST_INITIALIZER(wakeup_notifiers);
static NotifierList shutdown_notifiers =
NOTIFIER_LIST_INITIALIZER(shutdown_notifiers);
static uint32_t wakeup_reason_mask = ~(1 << QEMU_WAKEUP_REASON_NONE);
ShutdownCause qemu_shutdown_requested_get(void)
@ -1828,6 +1830,12 @@ static void qemu_system_powerdown(void)
notifier_list_notify(&powerdown_notifiers, NULL);
}
/*
 * Perform system shutdown notification: emit the QMP SHUTDOWN event and
 * invoke all registered shutdown notifiers, passing a pointer to the
 * ShutdownCause as the notifier data.
 */
static void qemu_system_shutdown(ShutdownCause cause)
{
qapi_event_send_shutdown(shutdown_caused_by_guest(cause), cause);
notifier_list_notify(&shutdown_notifiers, &cause);
}
void qemu_system_powerdown_request(void)
{
trace_qemu_system_powerdown_request();
@ -1840,6 +1848,11 @@ void qemu_register_powerdown_notifier(Notifier *notifier)
notifier_list_add(&powerdown_notifiers, notifier);
}
/*
 * Register @notifier to run at system shutdown; it receives a pointer to
 * the ShutdownCause as its data argument (see qemu_system_shutdown).
 */
void qemu_register_shutdown_notifier(Notifier *notifier)
{
notifier_list_add(&shutdown_notifiers, notifier);
}
void qemu_system_debug_request(void)
{
debug_requested = 1;
@ -1867,7 +1880,7 @@ static bool main_loop_should_exit(void)
request = qemu_shutdown_requested();
if (request) {
qemu_kill_report();
qapi_event_send_shutdown(shutdown_caused_by_guest(request), request);
qemu_system_shutdown(request);
if (no_shutdown) {
vm_stop(RUN_STATE_SHUTDOWN);
} else {