From: Anthony L. <ali...@us...> - 2008-05-04 20:24:07
|
While it has served us well, it is long overdue that we eliminate the virtio-net tap hack. It turns out that zero-copy has very little impact on performance. The tap hack was gaining such a significant performance boost not because of zero-copy, but because it avoided dropping packets on receive which is apparently a significant problem with the tap implementation in QEMU. Patches 3 and 4 in this series address the packet dropping issue and the net result is a 25% boost in RX performance even in the absence of zero-copy. Also worth mentioning, is that this makes merging virtio into upstream QEMU significantly easier. Since v1, we're just rebasing on the new io thread patch set. This series depends on my IO thread series. Signed-off-by: Anthony Liguori <ali...@us...> diff --git a/qemu/hw/pc.h b/qemu/hw/pc.h index 57d2123..f5157bd 100644 --- a/qemu/hw/pc.h +++ b/qemu/hw/pc.h @@ -154,7 +154,6 @@ void isa_ne2000_init(int base, qemu_irq irq, NICInfo *nd); /* virtio-net.c */ PCIDevice *virtio_net_init(PCIBus *bus, NICInfo *nd, int devfn); -void virtio_net_poll(void); /* virtio-blk.h */ void *virtio_blk_init(PCIBus *bus, uint16_t vendor, uint16_t device, diff --git a/qemu/hw/virtio-net.c b/qemu/hw/virtio-net.c index f727b14..8d26832 100644 --- a/qemu/hw/virtio-net.c +++ b/qemu/hw/virtio-net.c @@ -13,7 +13,6 @@ #include "virtio.h" #include "net.h" -#include "pc.h" #include "qemu-timer.h" /* from Linux's virtio_net.h */ @@ -62,15 +61,10 @@ typedef struct VirtIONet VirtQueue *tx_vq; VLANClientState *vc; int can_receive; - int tap_fd; - struct VirtIONet *next; - int do_notify; QEMUTimer *tx_timer; int tx_timer_active; } VirtIONet; -static VirtIONet *VirtIONetHead = NULL; - static VirtIONet *to_virtio_net(VirtIODevice *vdev) { return (VirtIONet *)vdev; @@ -105,7 +99,6 @@ static int virtio_net_can_receive(void *opaque) return (n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK) && n->can_receive; } -/* -net user receive function */ static void virtio_net_receive(void *opaque, const uint8_t *buf, int size) { VirtIONet *n = opaque; @@ -144,87 +137,6 @@ static void virtio_net_receive(void *opaque, const uint8_t *buf, int size) virtio_notify(&n->vdev, n->rx_vq); } -/* -net tap receive handler */ -void virtio_net_poll(void) -{ - VirtIONet *vnet; - int len; - fd_set rfds; - struct timeval tv; - int max_fd = -1; - VirtQueueElement elem; - struct virtio_net_hdr *hdr; - int did_notify; - - FD_ZERO(&rfds); - tv.tv_sec = 0; - tv.tv_usec = 0; - - while (1) { - - // Prepare the set of device to select from - for (vnet = VirtIONetHead; vnet; vnet = vnet->next) { - - if (vnet->tap_fd == -1) - continue; - - vnet->do_notify = 0; - //first check if the driver is ok - if (!virtio_net_can_receive(vnet)) - continue; - - /* FIXME: the drivers really need to set their status better */ - if (vnet->rx_vq->vring.avail == NULL) { - vnet->can_receive = 0; - continue; - } - - FD_SET(vnet->tap_fd, &rfds); - if (max_fd < vnet->tap_fd) max_fd = vnet->tap_fd; - } - - if (select(max_fd + 1, &rfds, NULL, NULL, &tv) <= 0) - break; - - // Now check who has data pending in the tap - for (vnet = VirtIONetHead; vnet; vnet = vnet->next) { - - if (!FD_ISSET(vnet->tap_fd, &rfds)) - continue; - - if (virtqueue_pop(vnet->rx_vq, &elem) == 0) { - vnet->can_receive = 0; - continue; - } - - hdr = (void *)elem.in_sg[0].iov_base; - hdr->flags = 0; - hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE; -again: - len = readv(vnet->tap_fd, &elem.in_sg[1], elem.in_num - 1); - if (len == -1) { - if (errno == EINTR || errno == EAGAIN) - goto again; - else - fprintf(stderr, "reading network error %d", len); - } - virtqueue_push(vnet->rx_vq, &elem, sizeof(*hdr) + len); - vnet->do_notify = 1; - } - - /* signal other side */ - did_notify = 0; - for (vnet = VirtIONetHead; vnet; vnet = vnet->next) - if (vnet->do_notify) { - virtio_notify(&vnet->vdev, vnet->rx_vq); - did_notify++; - } - if (!did_notify) - break; - } - -} - /* TX */ static void virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq) { @@ -303,12 +215,6 @@ PCIDevice *virtio_net_init(PCIBus *bus, NICInfo *nd, int devfn) memcpy(n->mac, nd->macaddr, 6); n->vc = qemu_new_vlan_client(nd->vlan, virtio_net_receive, virtio_net_can_receive, n); - n->tap_fd = hack_around_tap(n->vc->vlan->first_client); - if (n->tap_fd != -1) { - n->next = VirtIONetHead; - //push the device on top of the list - VirtIONetHead = n; - } n->tx_timer = qemu_new_timer(vm_clock, virtio_net_tx_timer, n); n->tx_timer_active = 0; diff --git a/qemu/vl.c b/qemu/vl.c index bcf893f..b8ce485 100644 --- a/qemu/vl.c +++ b/qemu/vl.c @@ -3966,15 +3966,8 @@ typedef struct TAPState { VLANClientState *vc; int fd; char down_script[1024]; - int no_poll; } TAPState; -static int tap_read_poll(void *opaque) -{ - TAPState *s = opaque; - return (!s->no_poll); -} - static void tap_receive(void *opaque, const uint8_t *buf, int size) { TAPState *s = opaque; @@ -4008,22 +4001,6 @@ static void tap_send(void *opaque) } } -int hack_around_tap(void *opaque) -{ - VLANClientState *vc = opaque; - TAPState *ts = vc->opaque; - - if (vc->fd_read != tap_receive) - return -1; - - if (ts) { - ts->no_poll = 1; - return ts->fd; - } - - return -1; -} - /* fd support */ static TAPState *net_tap_fd_init(VLANState *vlan, int fd) @@ -4034,10 +4011,8 @@ static TAPState *net_tap_fd_init(VLANState *vlan, int fd) if (!s) return NULL; s->fd = fd; - s->no_poll = 0; - enable_sigio_timer(fd); s->vc = qemu_new_vlan_client(vlan, tap_receive, NULL, s); - qemu_set_fd_handler2(s->fd, tap_read_poll, tap_send, NULL, s); + qemu_set_fd_handler2(s->fd, NULL, tap_send, NULL, s); snprintf(s->vc->info_str, sizeof(s->vc->info_str), "tap: fd=%d", fd); return s; } @@ -7972,10 +7947,7 @@ void main_loop_wait(int timeout) slirp_select_poll(&rfds, &wfds, &xfds); } #endif - virtio_net_poll(); - qemu_aio_poll(); - if (vm_running) { qemu_run_timers(&active_timers[QEMU_TIMER_VIRTUAL], qemu_get_clock(vm_clock)); |
From: Anthony L. <ali...@us...> - 2008-05-04 20:21:23
|
Normally, tap always reads packets and simply lets the client drop them if it cannot receive them. For virtio-net, this results in massive packet loss and about an 80% performance loss in TCP throughput. This patch modifies qemu_send_packet() to only deliver a packet to a VLAN client if it doesn't have a fd_can_read method or the fd_can_read method indicates that it can receive packets. We also return a status of whether any clients were able to receive the packet. If no clients were able to receive a packet, we buffer the packet until a client indicates that it can receive packets again. This patch also modifies the tap code to only read from the tap fd if at least one client on the VLAN is able to receive a packet. Finally, this patch changes the tap code to drain all possible packets from the tap device when the tap fd is readable. Signed-off-by: Anthony Liguori <ali...@us...> diff --git a/qemu/net.h b/qemu/net.h index 13daa27..dfdf9af 100644 --- a/qemu/net.h +++ b/qemu/net.h @@ -29,7 +29,7 @@ VLANClientState *qemu_new_vlan_client(VLANState *vlan, IOCanRWHandler *fd_can_read, void *opaque); int qemu_can_send_packet(VLANClientState *vc); -void qemu_send_packet(VLANClientState *vc, const uint8_t *buf, int size); +int qemu_send_packet(VLANClientState *vc, const uint8_t *buf, int size); void qemu_handler_true(void *opaque); void do_info_network(void); diff --git a/qemu/vl.c b/qemu/vl.c index c51d704..f4abea3 100644 --- a/qemu/vl.c +++ b/qemu/vl.c @@ -3760,10 +3760,11 @@ int qemu_can_send_packet(VLANClientState *vc1) return 0; } -void qemu_send_packet(VLANClientState *vc1, const uint8_t *buf, int size) +int qemu_send_packet(VLANClientState *vc1, const uint8_t *buf, int size) { VLANState *vlan = vc1->vlan; VLANClientState *vc; + int ret = -EAGAIN; #if 0 printf("vlan %d send:\n", vlan->id); @@ -3771,9 +3772,14 @@ void qemu_send_packet(VLANClientState *vc1, const uint8_t *buf, int size) #endif for(vc = vlan->first_client; vc != NULL; vc = vc->next) { if (vc != vc1) { - vc->fd_read(vc->opaque, buf, size); + if (!vc->fd_can_read || vc->fd_can_read(vc->opaque)) { + vc->fd_read(vc->opaque, buf, size); + ret = 0; + } } } + + return ret; } #if defined(CONFIG_SLIRP) @@ -3976,6 +3982,8 @@ typedef struct TAPState { VLANClientState *vc; int fd; char down_script[1024]; + char buf[4096]; + int size; } TAPState; static void tap_receive(void *opaque, const uint8_t *buf, int size) @@ -3991,24 +3999,70 @@ static void tap_receive(void *opaque, const uint8_t *buf, int size) } } +static int tap_can_send(void *opaque) +{ + TAPState *s = opaque; + VLANClientState *vc; + int can_receive = 0; + + /* Check to see if any of our clients can receive a packet */ + for (vc = s->vc->vlan->first_client; vc; vc = vc->next) { + /* Skip ourselves */ + if (vc == s->vc) + continue; + + if (!vc->fd_can_read) { + /* no fd_can_read handler, they always can receive */ + can_receive = 1; + } else + can_receive = vc->fd_can_read(vc->opaque); + + /* Once someone can receive, we try to send a packet */ + if (can_receive) + break; + } + + return can_receive; +} + static void tap_send(void *opaque) { TAPState *s = opaque; - uint8_t buf[4096]; - int size; + /* First try to send any buffered packet */ + if (s->size > 0) { + int err; + + /* If noone can receive the packet, buffer it */ + err = qemu_send_packet(s->vc, s->buf, s->size); + if (err == -EAGAIN) + return; + } + + /* Read packets until we hit EAGAIN */ + do { #ifdef __sun__ - struct strbuf sbuf; - int f = 0; - sbuf.maxlen = sizeof(buf); - sbuf.buf = buf; - size = getmsg(s->fd, NULL, &sbuf, &f) >=0 ? sbuf.len : -1; + struct strbuf sbuf; + int f = 0; + sbuf.maxlen = sizeof(s->buf); + sbuf.buf = s->buf; + s->size = getmsg(s->fd, NULL, &sbuf, &f) >=0 ? sbuf.len : -1; #else - size = read(s->fd, buf, sizeof(buf)); + s->size = read(s->fd, s->buf, sizeof(s->buf)); #endif - if (size > 0) { - qemu_send_packet(s->vc, buf, size); - } + + if (s->size == -1 && errno == EINTR) + continue; + + if (s->size > 0) { + int err; + + /* If noone can receive the packet, buffer it */ + err = qemu_send_packet(s->vc, s->buf, s->size); + if (err == -EAGAIN) + break; + } + } while (s->size > 0); } /* fd support */ @@ -4022,7 +4076,7 @@ static TAPState *net_tap_fd_init(VLANState *vlan, int fd) return NULL; s->fd = fd; s->vc = qemu_new_vlan_client(vlan, tap_receive, NULL, s); - qemu_set_fd_handler2(s->fd, NULL, tap_send, NULL, s); + qemu_set_fd_handler2(s->fd, tap_can_send, tap_send, NULL, s); snprintf(s->vc->info_str, sizeof(s->vc->info_str), "tap: fd=%d", fd); return s; } |
From: Dor L. <dor...@qu...> - 2008-05-04 22:16:37
|
On Sun, 2008-05-04 at 15:21 -0500, Anthony Liguori wrote: > Normally, tap always reads packets and simply lets the client drop them if it > cannot receive them. For virtio-net, this results in massive packet loss and > about an 80% performance loss in TCP throughput. > > This patch modifies qemu_send_packet() to only deliver a packet to a VLAN > client if it doesn't have a fd_can_read method or the fd_can_read method > indicates that it can receive packets. We also return a status of whether > any clients were able to receive the packet. > > If no clients were able to receive a packet, we buffer the packet until a > client indicates that it can receive packets again. > > This patch also modifies the tap code to only read from the tap fd if at least > one client on the VLAN is able to receive a packet. > > Finally, this patch changes the tap code to drain all possible packets from > the tap device when the tap fd is readable. > > Signed-off-by: Anthony Liguori <ali...@us...> Patchset looks good and reduces some nasty hacks. It probably also improves other devices like e1000 et al. Cheers, Dor |
From: Anthony L. <an...@co...> - 2008-05-04 22:53:23
|
Dor Laor wrote: > On Sun, 2008-05-04 at 15:21 -0500, Anthony Liguori wrote: > > Patchset looks good and reduces some nasty hacks. It probably also > improves other devices like e1000 et al. > Yeah, we just need to make sure they have proper can_receive handlers. I took a quick look at the e1000 but it wasn't entirely obvious to me how RX packets get queued. AFAICT, the e1000 can_receive handler only reports false when the device isn't initialized so it's likely it's dropping packets. Regards, Anthony Liguori > Cheers, > Dor > > > ------------------------------------------------------------------------- > This SF.net email is sponsored by the 2008 JavaOne(SM) Conference > Don't miss this year's exciting event. There's still time to save $100. > Use priority code J8TL2D2. > http://ad.doubleclick.net/clk;198757673;13503038;p?http://java.sun.com/javaone > _______________________________________________ > kvm-devel mailing list > kvm...@li... > https://lists.sourceforge.net/lists/listinfo/kvm-devel > |
From: Anthony L. <ali...@us...> - 2008-05-04 20:22:22
|
In the final patch of this series, we rely on a VLAN client's fd_can_read method to avoid dropping packets. Unfortunately, virtio's fd_can_read method is not very accurate at the moment. This patch addresses this. It also generates a notification to the IO thread when more RX packets become available. If we say we can't receive a packet because no RX buffers are available, this may result in the tap file descriptor not being select()'d. Without notifying the IO thread, we may have to wait until the select() times out before we can receive a packet (even if there is one pending). This particular change makes RX performance very consistent. Signed-off-by: Anthony Liguori <ali...@us...> diff --git a/qemu/hw/virtio-net.c b/qemu/hw/virtio-net.c index 8d26832..5538979 100644 --- a/qemu/hw/virtio-net.c +++ b/qemu/hw/virtio-net.c @@ -14,6 +14,7 @@ #include "virtio.h" #include "net.h" #include "qemu-timer.h" +#include "qemu-kvm.h" /* from Linux's virtio_net.h */ @@ -60,11 +61,14 @@ typedef struct VirtIONet VirtQueue *rx_vq; VirtQueue *tx_vq; VLANClientState *vc; - int can_receive; QEMUTimer *tx_timer; int tx_timer_active; } VirtIONet; +/* TODO + * - we could suppress RX interrupt if we were so inclined. + */ + static VirtIONet *to_virtio_net(VirtIODevice *vdev) { return (VirtIONet *)vdev; @@ -88,15 +92,24 @@ static uint32_t virtio_net_get_features(VirtIODevice *vdev) static void virtio_net_handle_rx(VirtIODevice *vdev, VirtQueue *vq) { - VirtIONet *n = to_virtio_net(vdev); - n->can_receive = 1; + /* We now have RX buffers, signal to the IO thread to break out of the + select to re-poll the tap file descriptor */ + if (kvm_enabled()) + qemu_kvm_notify_work(); } static int virtio_net_can_receive(void *opaque) { VirtIONet *n = opaque; - return (n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK) && n->can_receive; + if (n->rx_vq->vring.avail == NULL || + !(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK)) + return 0; + + if (n->rx_vq->vring.avail->idx == n->rx_vq->last_avail_idx) + return 0; + + return 1; } static void virtio_net_receive(void *opaque, const uint8_t *buf, int size) @@ -106,15 +119,8 @@ static void virtio_net_receive(void *opaque, const uint8_t *buf, int size) struct virtio_net_hdr *hdr; int offset, i; - /* FIXME: the drivers really need to set their status better */ - if (n->rx_vq->vring.avail == NULL) { - n->can_receive = 0; - return; - } - if (virtqueue_pop(n->rx_vq, &elem) == 0) { - /* wait until the guest adds some rx bufs */ - n->can_receive = 0; + fprintf(stderr, "virtio_net: this should not happen\n"); return; } @@ -209,9 +215,8 @@ PCIDevice *virtio_net_init(PCIBus *bus, NICInfo *nd, int devfn) n->vdev.update_config = virtio_net_update_config; n->vdev.get_features = virtio_net_get_features; - n->rx_vq = virtio_add_queue(&n->vdev, 512, virtio_net_handle_rx); + n->rx_vq = virtio_add_queue(&n->vdev, 128, virtio_net_handle_rx); n->tx_vq = virtio_add_queue(&n->vdev, 128, virtio_net_handle_tx); - n->can_receive = 0; memcpy(n->mac, nd->macaddr, 6); n->vc = qemu_new_vlan_client(nd->vlan, virtio_net_receive, virtio_net_can_receive, n); |