From: Anthony L. <ali...@us...> - 2008-05-04 02:47:45
|
QEMU is rather aggressive about exhausting the wait period when selecting. This is fine when the wait period is low and when there is significant delays in-between selects as it improves IO throughput. With the IO thread, there is a very small delay between selects and our wait period for select is very large. This patch changes main_loop_wait to only select once before doing the various other things in the main loop. This generally improves responsiveness of things like SDL but also improves individual file descriptor throughput quite dramatically. This patch is relies on my io-thread-timerfd.patch. Signed-off-by: Anthony Liguori <ali...@us...> diff --git a/qemu/qemu-kvm.c b/qemu/qemu-kvm.c index 0c7f49f..31c7ca7 100644 --- a/qemu/qemu-kvm.c +++ b/qemu/qemu-kvm.c @@ -401,24 +401,6 @@ void qemu_kvm_notify_work(void) pthread_kill(io_thread, SIGUSR1); } -static int received_signal; - -/* QEMU relies on periodically breaking out of select via EINTR to poll for IO - and timer signals. Since we're now using a file descriptor to handle - signals, select() won't be interrupted by a signal. We need to forcefully - break the select() loop when a signal is received hence - kvm_check_received_signal(). */ - -int kvm_check_received_signal(void) -{ - if (received_signal) { - received_signal = 0; - return 1; - } - - return 0; -} - #if defined(SYS_signalfd) #if !defined(HAVE_signalfd) #include <linux/signalfd.h> @@ -466,8 +448,6 @@ static void sigfd_handler(void *opaque) if (info.ssi_signo == SIGUSR2) pthread_cond_signal(&qemu_aio_cond); } - - received_signal = 1; } static int setup_signal_handlers(int nr_signals, ...) @@ -576,8 +556,6 @@ static void sigfd_handler(void *opaque) if (signo == SIGUSR2) pthread_cond_signal(&qemu_aio_cond); } - - received_signal = 1; } static int setup_signal_handlers(int nr_signals, ...) diff --git a/qemu/qemu-kvm.h b/qemu/qemu-kvm.h index bcab82c..5109c64 100644 --- a/qemu/qemu-kvm.h +++ b/qemu/qemu-kvm.h @@ -112,13 +112,4 @@ static inline void kvm_sleep_end(void) kvm_mutex_lock(); } -int kvm_check_received_signal(void); - -static inline int kvm_received_signal(void) -{ - if (kvm_enabled()) - return kvm_check_received_signal(); - return 0; -} - #endif diff --git a/qemu/vl.c b/qemu/vl.c index 1192759..bcf893f 100644 --- a/qemu/vl.c +++ b/qemu/vl.c @@ -7936,23 +7936,18 @@ void main_loop_wait(int timeout) slirp_select_fill(&nfds, &rfds, &wfds, &xfds); } #endif - moreio: ret = qemu_select(nfds + 1, &rfds, &wfds, &xfds, &tv); if (ret > 0) { IOHandlerRecord **pioh; - int more = 0; for(ioh = first_io_handler; ioh != NULL; ioh = ioh->next) { if (!ioh->deleted && ioh->fd_read && FD_ISSET(ioh->fd, &rfds)) { ioh->fd_read(ioh->opaque); - if (!ioh->fd_read_poll || ioh->fd_read_poll(ioh->opaque)) - more = 1; - else + if (!(ioh->fd_read_poll && ioh->fd_read_poll(ioh->opaque))) FD_CLR(ioh->fd, &rfds); } if (!ioh->deleted && ioh->fd_write && FD_ISSET(ioh->fd, &wfds)) { ioh->fd_write(ioh->opaque); - more = 1; } } @@ -7966,8 +7961,6 @@ void main_loop_wait(int timeout) } else pioh = &ioh->next; } - if (more && !kvm_received_signal()) - goto moreio; } #if defined(CONFIG_SLIRP) if (slirp_inited) { |
From: Anthony L. <ali...@us...> - 2008-05-04 02:47:45
|
While it has served us well, it is long overdue that we eliminate the virtio-net tap hack. It turns out that zero-copy has very little impact on performance. The tap hack was gaining such a significant performance boost not because of zero-copy, but because it avoided dropping packets on receive which is apparently a significant problem with the tap implementation in QEMU. Patches 3 and 4 in this series address the packet dropping issue and the net result is a 25% boost in RX performance even in the absence of zero-copy. Also worth mentioning, is that this makes merging virtio into upstream QEMU significantly easier. Signed-off-by: Anthony Liguori <ali...@us...> diff --git a/qemu/hw/pc.h b/qemu/hw/pc.h index 57d2123..f5157bd 100644 --- a/qemu/hw/pc.h +++ b/qemu/hw/pc.h @@ -154,7 +154,6 @@ void isa_ne2000_init(int base, qemu_irq irq, NICInfo *nd); /* virtio-net.c */ PCIDevice *virtio_net_init(PCIBus *bus, NICInfo *nd, int devfn); -void virtio_net_poll(void); /* virtio-blk.h */ void *virtio_blk_init(PCIBus *bus, uint16_t vendor, uint16_t device, diff --git a/qemu/hw/virtio-net.c b/qemu/hw/virtio-net.c index f727b14..8d26832 100644 --- a/qemu/hw/virtio-net.c +++ b/qemu/hw/virtio-net.c @@ -13,7 +13,6 @@ #include "virtio.h" #include "net.h" -#include "pc.h" #include "qemu-timer.h" /* from Linux's virtio_net.h */ @@ -62,15 +61,10 @@ typedef struct VirtIONet VirtQueue *tx_vq; VLANClientState *vc; int can_receive; - int tap_fd; - struct VirtIONet *next; - int do_notify; QEMUTimer *tx_timer; int tx_timer_active; } VirtIONet; -static VirtIONet *VirtIONetHead = NULL; - static VirtIONet *to_virtio_net(VirtIODevice *vdev) { return (VirtIONet *)vdev; @@ -105,7 +99,6 @@ static int virtio_net_can_receive(void *opaque) return (n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK) && n->can_receive; } -/* -net user receive function */ static void virtio_net_receive(void *opaque, const uint8_t *buf, int size) { VirtIONet *n = opaque; @@ -144,87 +137,6 @@ static void virtio_net_receive(void *opaque, const uint8_t *buf, int size) virtio_notify(&n->vdev, n->rx_vq); } -/* -net tap receive handler */ -void virtio_net_poll(void) -{ - VirtIONet *vnet; - int len; - fd_set rfds; - struct timeval tv; - int max_fd = -1; - VirtQueueElement elem; - struct virtio_net_hdr *hdr; - int did_notify; - - FD_ZERO(&rfds); - tv.tv_sec = 0; - tv.tv_usec = 0; - - while (1) { - - // Prepare the set of device to select from - for (vnet = VirtIONetHead; vnet; vnet = vnet->next) { - - if (vnet->tap_fd == -1) - continue; - - vnet->do_notify = 0; - //first check if the driver is ok - if (!virtio_net_can_receive(vnet)) - continue; - - /* FIXME: the drivers really need to set their status better */ - if (vnet->rx_vq->vring.avail == NULL) { - vnet->can_receive = 0; - continue; - } - - FD_SET(vnet->tap_fd, &rfds); - if (max_fd < vnet->tap_fd) max_fd = vnet->tap_fd; - } - - if (select(max_fd + 1, &rfds, NULL, NULL, &tv) <= 0) - break; - - // Now check who has data pending in the tap - for (vnet = VirtIONetHead; vnet; vnet = vnet->next) { - - if (!FD_ISSET(vnet->tap_fd, &rfds)) - continue; - - if (virtqueue_pop(vnet->rx_vq, &elem) == 0) { - vnet->can_receive = 0; - continue; - } - - hdr = (void *)elem.in_sg[0].iov_base; - hdr->flags = 0; - hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE; -again: - len = readv(vnet->tap_fd, &elem.in_sg[1], elem.in_num - 1); - if (len == -1) { - if (errno == EINTR || errno == EAGAIN) - goto again; - else - fprintf(stderr, "reading network error %d", len); - } - virtqueue_push(vnet->rx_vq, &elem, sizeof(*hdr) + len); - vnet->do_notify = 1; - } - - /* signal other side */ - did_notify = 0; - for (vnet = VirtIONetHead; vnet; vnet = vnet->next) - if (vnet->do_notify) { - virtio_notify(&vnet->vdev, vnet->rx_vq); - did_notify++; - } - if (!did_notify) - break; - } - -} - /* TX */ static void virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq) { @@ -303,12 +215,6 @@ PCIDevice *virtio_net_init(PCIBus *bus, NICInfo *nd, int devfn) memcpy(n->mac, nd->macaddr, 6); n->vc = qemu_new_vlan_client(nd->vlan, virtio_net_receive, virtio_net_can_receive, n); - n->tap_fd = hack_around_tap(n->vc->vlan->first_client); - if (n->tap_fd != -1) { - n->next = VirtIONetHead; - //push the device on top of the list - VirtIONetHead = n; - } n->tx_timer = qemu_new_timer(vm_clock, virtio_net_tx_timer, n); n->tx_timer_active = 0; diff --git a/qemu/vl.c b/qemu/vl.c index bcf893f..b8ce485 100644 --- a/qemu/vl.c +++ b/qemu/vl.c @@ -3966,15 +3966,8 @@ typedef struct TAPState { VLANClientState *vc; int fd; char down_script[1024]; - int no_poll; } TAPState; -static int tap_read_poll(void *opaque) -{ - TAPState *s = opaque; - return (!s->no_poll); -} - static void tap_receive(void *opaque, const uint8_t *buf, int size) { TAPState *s = opaque; @@ -4008,22 +4001,6 @@ static void tap_send(void *opaque) } } -int hack_around_tap(void *opaque) -{ - VLANClientState *vc = opaque; - TAPState *ts = vc->opaque; - - if (vc->fd_read != tap_receive) - return -1; - - if (ts) { - ts->no_poll = 1; - return ts->fd; - } - - return -1; -} - /* fd support */ static TAPState *net_tap_fd_init(VLANState *vlan, int fd) @@ -4034,10 +4011,8 @@ static TAPState *net_tap_fd_init(VLANState *vlan, int fd) if (!s) return NULL; s->fd = fd; - s->no_poll = 0; - enable_sigio_timer(fd); s->vc = qemu_new_vlan_client(vlan, tap_receive, NULL, s); - qemu_set_fd_handler2(s->fd, tap_read_poll, tap_send, NULL, s); + qemu_set_fd_handler2(s->fd, NULL, tap_send, NULL, s); snprintf(s->vc->info_str, sizeof(s->vc->info_str), "tap: fd=%d", fd); return s; } @@ -7972,10 +7947,7 @@ void main_loop_wait(int timeout) slirp_select_poll(&rfds, &wfds, &xfds); } #endif - virtio_net_poll(); - qemu_aio_poll(); - if (vm_running) { qemu_run_timers(&active_timers[QEMU_TIMER_VIRTUAL], qemu_get_clock(vm_clock)); |
From: Anthony L. <ali...@us...> - 2008-05-04 02:47:53
|
Normally, tap always reads packets and simply lets the client drop them if it cannot receive them. For virtio-net, this results in massive packet loss and about an 80% performance loss in TCP throughput. This patch modifies qemu_send_packet() to only deliver a packet to a VLAN client if it doesn't have a fd_can_read method or the fd_can_read method indicates that it can receive packets. We also return a status of whether any clients were able to receive the packet. If no clients were able to receive a packet, we buffer the packet until a client indicates that it can receive packets again. This patch also modifies the tap code to only read from the tap fd if at least one client on the VLAN is able to receive a packet. Finally, this patch changes the tap code to drain all possible packets from the tap device when the tap fd is readable. Signed-off-by: Anthony Liguori <ali...@us...> diff --git a/qemu/net.h b/qemu/net.h index 13daa27..dfdf9af 100644 --- a/qemu/net.h +++ b/qemu/net.h @@ -29,7 +29,7 @@ VLANClientState *qemu_new_vlan_client(VLANState *vlan, IOCanRWHandler *fd_can_read, void *opaque); int qemu_can_send_packet(VLANClientState *vc); -void qemu_send_packet(VLANClientState *vc, const uint8_t *buf, int size); +int qemu_send_packet(VLANClientState *vc, const uint8_t *buf, int size); void qemu_handler_true(void *opaque); void do_info_network(void); diff --git a/qemu/vl.c b/qemu/vl.c index b8ce485..74c34b6 100644 --- a/qemu/vl.c +++ b/qemu/vl.c @@ -3750,10 +3750,11 @@ int qemu_can_send_packet(VLANClientState *vc1) return 0; } -void qemu_send_packet(VLANClientState *vc1, const uint8_t *buf, int size) +int qemu_send_packet(VLANClientState *vc1, const uint8_t *buf, int size) { VLANState *vlan = vc1->vlan; VLANClientState *vc; + int ret = -EAGAIN; #if 0 printf("vlan %d send:\n", vlan->id); @@ -3761,9 +3762,14 @@ void qemu_send_packet(VLANClientState *vc1, const uint8_t *buf, int size) #endif for(vc = vlan->first_client; vc != NULL; vc = vc->next) { if (vc != vc1) { - vc->fd_read(vc->opaque, buf, size); + if (!vc->fd_can_read || vc->fd_can_read(vc->opaque)) { + vc->fd_read(vc->opaque, buf, size); + ret = 0; + } } } + + return ret; } #if defined(CONFIG_SLIRP) @@ -3966,6 +3972,8 @@ typedef struct TAPState { VLANClientState *vc; int fd; char down_script[1024]; + char buf[4096]; + int size; } TAPState; static void tap_receive(void *opaque, const uint8_t *buf, int size) @@ -3981,24 +3989,70 @@ static void tap_receive(void *opaque, const uint8_t *buf, int size) } } +static int tap_can_send(void *opaque) +{ + TAPState *s = opaque; + VLANClientState *vc; + int can_receive = 0; + + /* Check to see if any of our clients can receive a packet */ + for (vc = s->vc->vlan->first_client; vc; vc = vc->next) { + /* Skip ourselves */ + if (vc == s->vc) + continue; + + if (!vc->fd_can_read) { + /* no fd_can_read handler, they always can receive */ + can_receive = 1; + } else + can_receive = vc->fd_can_read(vc->opaque); + + /* Once someone can receive, we try to send a packet */ + if (can_receive) + break; + } + + return can_receive; +} + static void tap_send(void *opaque) { TAPState *s = opaque; - uint8_t buf[4096]; - int size; + /* First try to send any buffered packet */ + if (s->size > 0) { + int err; + + /* If noone can receive the packet, buffer it */ + err = qemu_send_packet(s->vc, s->buf, s->size); + if (err == -EAGAIN) + return; + } + + /* Read packets until we hit EAGAIN */ + do { #ifdef __sun__ - struct strbuf sbuf; - int f = 0; - sbuf.maxlen = sizeof(buf); - sbuf.buf = buf; - size = getmsg(s->fd, NULL, &sbuf, &f) >=0 ? sbuf.len : -1; + struct strbuf sbuf; + int f = 0; + sbuf.maxlen = sizeof(s->buf); + sbuf.buf = s->buf; + s->size = getmsg(s->fd, NULL, &sbuf, &f) >=0 ? sbuf.len : -1; #else - size = read(s->fd, buf, sizeof(buf)); + s->size = read(s->fd, s->buf, sizeof(s->buf)); #endif - if (size > 0) { - qemu_send_packet(s->vc, buf, size); - } + + if (s->size == -1 && errno == EINTR) + continue; + + if (s->size > 0) { + int err; + + /* If noone can receive the packet, buffer it */ + err = qemu_send_packet(s->vc, s->buf, s->size); + if (err == -EAGAIN) + break; + } + } while (s->size > 0); } /* fd support */ @@ -4012,7 +4066,7 @@ static TAPState *net_tap_fd_init(VLANState *vlan, int fd) return NULL; s->fd = fd; s->vc = qemu_new_vlan_client(vlan, tap_receive, NULL, s); - qemu_set_fd_handler2(s->fd, NULL, tap_send, NULL, s); + qemu_set_fd_handler2(s->fd, tap_can_send, tap_send, NULL, s); snprintf(s->vc->info_str, sizeof(s->vc->info_str), "tap: fd=%d", fd); return s; } |
From: Anthony L. <ali...@us...> - 2008-05-04 02:48:09
|
In the final patch of this series, we rely on a VLAN client's fd_can_read method to avoid dropping packets. Unfortunately, virtio's fd_can_read method is not very accurate at the moment. This patch addresses this. It also generates a notification to the IO thread when more RX packets become available. If we say we can't receive a packet because no RX buffers are available, this may result in the tap file descriptor not being select()'d. Without notifying the IO thread, we may have to wait until the select() times out before we can receive a packet (even if there is one pending). This particular change makes RX performance very consistent. Signed-off-by: Anthony Liguori <ali...@us...> diff --git a/qemu/hw/virtio-net.c b/qemu/hw/virtio-net.c index 8d26832..5538979 100644 --- a/qemu/hw/virtio-net.c +++ b/qemu/hw/virtio-net.c @@ -14,6 +14,7 @@ #include "virtio.h" #include "net.h" #include "qemu-timer.h" +#include "qemu-kvm.h" /* from Linux's virtio_net.h */ @@ -60,11 +61,14 @@ typedef struct VirtIONet VirtQueue *rx_vq; VirtQueue *tx_vq; VLANClientState *vc; - int can_receive; QEMUTimer *tx_timer; int tx_timer_active; } VirtIONet; +/* TODO + * - we could suppress RX interrupt if we were so inclined. + */ + static VirtIONet *to_virtio_net(VirtIODevice *vdev) { return (VirtIONet *)vdev; @@ -88,15 +92,24 @@ static uint32_t virtio_net_get_features(VirtIODevice *vdev) static void virtio_net_handle_rx(VirtIODevice *vdev, VirtQueue *vq) { - VirtIONet *n = to_virtio_net(vdev); - n->can_receive = 1; + /* We now have RX buffers, signal to the IO thread to break out of the + select to re-poll the tap file descriptor */ + if (kvm_enabled()) + qemu_kvm_notify_work(); } static int virtio_net_can_receive(void *opaque) { VirtIONet *n = opaque; - return (n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK) && n->can_receive; + if (n->rx_vq->vring.avail == NULL || + !(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK)) + return 0; + + if (n->rx_vq->vring.avail->idx == n->rx_vq->last_avail_idx) + return 0; + + return 1; } static void virtio_net_receive(void *opaque, const uint8_t *buf, int size) @@ -106,15 +119,8 @@ static void virtio_net_receive(void *opaque, const uint8_t *buf, int size) struct virtio_net_hdr *hdr; int offset, i; - /* FIXME: the drivers really need to set their status better */ - if (n->rx_vq->vring.avail == NULL) { - n->can_receive = 0; - return; - } - if (virtqueue_pop(n->rx_vq, &elem) == 0) { - /* wait until the guest adds some rx bufs */ - n->can_receive = 0; + fprintf(stderr, "virtio_net: this should not happen\n"); return; } @@ -209,9 +215,8 @@ PCIDevice *virtio_net_init(PCIBus *bus, NICInfo *nd, int devfn) n->vdev.update_config = virtio_net_update_config; n->vdev.get_features = virtio_net_get_features; - n->rx_vq = virtio_add_queue(&n->vdev, 512, virtio_net_handle_rx); + n->rx_vq = virtio_add_queue(&n->vdev, 128, virtio_net_handle_rx); n->tx_vq = virtio_add_queue(&n->vdev, 128, virtio_net_handle_tx); - n->can_receive = 0; memcpy(n->mac, nd->macaddr, 6); n->vc = qemu_new_vlan_client(nd->vlan, virtio_net_receive, virtio_net_can_receive, n); |
From: Anthony L. <an...@co...> - 2008-05-04 04:03:45
Attachments:
net-tap-zero-copy.patch
|
Anthony Liguori wrote: > While it has served us well, it is long overdue that we eliminate the > virtio-net tap hack. It turns out that zero-copy has very little impact on > performance. The tap hack was gaining such a significant performance boost > not because of zero-copy, but because it avoided dropping packets on receive > which is apparently a significant problem with the tap implementation in QEMU. > FWIW, attached is a pretty straight forward zero-copy patch. What's interesting is that I see no change in throughput using this patch. The CPU is pegged at 100% during the iperf run. Since we're still using small MTUs, this isn't surprising. Copying a 1500 byte packet that we have to bring into the cache anyway doesn't seem significant. I think zero-copy will be more important with GSO though. Regards, Anthony Liguori > Patches 3 and 4 in this series address the packet dropping issue and the net > result is a 25% boost in RX performance even in the absence of zero-copy. > > Also worth mentioning, is that this makes merging virtio into upstream QEMU > significantly easier. > > Signed-off-by: Anthony Liguori <ali...@us...> > |
From: Avi K. <av...@qu...> - 2008-05-04 08:13:26
|
Anthony Liguori wrote: > Anthony Liguori wrote: >> While it has served us well, it is long overdue that we eliminate the >> virtio-net tap hack. It turns out that zero-copy has very little >> impact on >> performance. The tap hack was gaining such a significant performance >> boost >> not because of zero-copy, but because it avoided dropping packets on >> receive >> which is apparently a significant problem with the tap implementation >> in QEMU. >> > > FWIW, attached is a pretty straight forward zero-copy patch. What's > interesting is that I see no change in throughput using this patch. > The CPU is pegged at 100% during the iperf run. Since we're still > using small MTUs, this isn't surprising. Copying a 1500 byte packet > that we have to bring into the cache anyway doesn't seem significant. > I think zero-copy will be more important with GSO though. Zero copy is important when the guest is zero copy, and when we are not doing any extra copying on the host. This doesn't fit the way we benchmark. I expect zero copy to show improvements on things like apachebench (with a file size > 50K) with an external client. The improvements will also show up on SMP, where the likelihood of the copy happening on the wrong cpu increase. -- error compiling committee.c: too many arguments to function |
From: Avi K. <av...@qu...> - 2008-05-04 13:03:21
|
Anthony Liguori wrote: > QEMU is rather aggressive about exhausting the wait period when selecting. > This is fine when the wait period is low and when there is significant delays > in-between selects as it improves IO throughput. > > With the IO thread, there is a very small delay between selects and our wait > period for select is very large. This patch changes main_loop_wait to only > select once before doing the various other things in the main loop. This > generally improves responsiveness of things like SDL but also improves > individual file descriptor throughput quite dramatically. > > This patch is relies on my io-thread-timerfd.patch. > (did you mean signalfd?) Patchset looks good; but as it depends on previous patches I can't apply it yet. -- error compiling committee.c: too many arguments to function |