From: Anthony L. <ali...@us...> - 2008-05-07 18:10:04
We're pretty sloppy in virtio right now about phys_ram_base assumptions.  This
patch is an incremental step between what we have today and a full blown DMA
API.  I backported the DMA API but the performance impact was not acceptable
to me.  There's only a slight performance impact with this particular patch.

Since we're no longer assuming guest physical memory is contiguous, we need
a more complex way to validate the memory regions than just checking if it's
within ram_size.

Signed-off-by: Anthony Liguori <ali...@us...>

diff --git a/qemu/hw/virtio.c b/qemu/hw/virtio.c
index 6a50001..a4c9d10 100644
--- a/qemu/hw/virtio.c
+++ b/qemu/hw/virtio.c
@@ -58,6 +58,48 @@
 /* virt queue functions */
 
+static void *virtio_map_gpa(target_phys_addr_t addr, size_t size)
+{
+    ram_addr_t off;
+    target_phys_addr_t addr1;
+
+    off = cpu_get_physical_page_desc(addr);
+    if ((off & ~TARGET_PAGE_MASK) != IO_MEM_RAM) {
+        fprintf(stderr, "virtio DMA to IO ram\n");
+        exit(1);
+    }
+
+    off = (off & TARGET_PAGE_MASK) | (addr & ~TARGET_PAGE_MASK);
+
+    for (addr1 = addr + TARGET_PAGE_SIZE;
+         addr1 < TARGET_PAGE_ALIGN(addr + size);
+         addr1 += TARGET_PAGE_SIZE) {
+        ram_addr_t off1;
+
+        off1 = cpu_get_physical_page_desc(addr1);
+        if ((off1 & ~TARGET_PAGE_MASK) != IO_MEM_RAM) {
+            fprintf(stderr, "virtio DMA to IO ram\n");
+            exit(1);
+        }
+
+        off1 = (off1 & TARGET_PAGE_MASK) | (addr1 & ~TARGET_PAGE_MASK);
+
+        if (off1 != (off + (addr1 - addr))) {
+            fprintf(stderr, "discontigous virtio memory\n");
+            exit(1);
+        }
+    }
+
+    return phys_ram_base + off;
+}
+
+static size_t virtqueue_size(int num)
+{
+    return TARGET_PAGE_ALIGN((sizeof(VRingDesc) * num) +
+                             (sizeof(VRingAvail) + sizeof(uint16_t) * num)) +
+           (sizeof(VRingUsed) + sizeof(VRingUsedElem) * num);
+}
+
 static void virtqueue_init(VirtQueue *vq, void *p)
 {
     vq->vring.desc = p;
@@ -127,9 +169,6 @@ int virtqueue_pop(VirtQueue *vq, VirtQueueElement *elem)
     do {
         struct iovec *sg;
 
-        if ((vq->vring.desc[i].addr + vq->vring.desc[i].len) > ram_size)
-            errx(1, "Guest sent invalid pointer");
-
         if (vq->vring.desc[i].flags & VRING_DESC_F_WRITE)
             sg = &elem->in_sg[elem->in_num++];
         else
@@ -137,7 +176,9 @@ int virtqueue_pop(VirtQueue *vq, VirtQueueElement *elem)
 
         /* Grab the first descriptor, and check it's OK. */
         sg->iov_len = vq->vring.desc[i].len;
-        sg->iov_base = phys_ram_base + vq->vring.desc[i].addr;
+        sg->iov_base = virtio_map_gpa(vq->vring.desc[i].addr, sg->iov_len);
+        if (sg->iov_base == NULL)
+            errx(1, "Invalid mapping\n");
 
         /* If we've got too many, that implies a descriptor loop. */
         if ((elem->in_num + elem->out_num) > vq->vring.num)
@@ -198,9 +239,10 @@ static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
         vdev->vq[vdev->queue_sel].pfn = val;
         if (pa == 0) {
             virtio_reset(vdev);
-        } else if (pa < (ram_size - TARGET_PAGE_SIZE)) {
-            virtqueue_init(&vdev->vq[vdev->queue_sel], phys_ram_base + pa);
-            /* FIXME if pa == 0, deal with device tear down */
+        } else {
+            size_t size = virtqueue_size(vdev->vq[vdev->queue_sel].vring.num);
+            virtqueue_init(&vdev->vq[vdev->queue_sel],
+                           virtio_map_gpa(pa, size));
         }
         break;
     case VIRTIO_PCI_QUEUE_SEL:
From: Anthony L. <ali...@us...> - 2008-05-07 18:10:06
While it has served us well, it is long overdue that we eliminate the
virtio-net tap hack.  It turns out that zero-copy has very little impact on
performance.  The tap hack was gaining such a significant performance boost
not because of zero-copy, but because it avoided dropping packets on receive,
which is apparently a significant problem with the tap implementation in QEMU.

Signed-off-by: Anthony Liguori <ali...@us...>

diff --git a/qemu/hw/pc.h b/qemu/hw/pc.h
index 73e3918..c284bf1 100644
--- a/qemu/hw/pc.h
+++ b/qemu/hw/pc.h
@@ -155,7 +155,6 @@ void isa_ne2000_init(int base, qemu_irq irq, NICInfo *nd);
 
 /* virtio-net.c */
 PCIDevice *virtio_net_init(PCIBus *bus, NICInfo *nd, int devfn);
-void virtio_net_poll(void);
 
 /* virtio-blk.h */
 void *virtio_blk_init(PCIBus *bus, uint16_t vendor, uint16_t device,
diff --git a/qemu/hw/virtio-net.c b/qemu/hw/virtio-net.c
index e2db49c..a26d04e 100644
--- a/qemu/hw/virtio-net.c
+++ b/qemu/hw/virtio-net.c
@@ -13,7 +13,6 @@
 
 #include "virtio.h"
 #include "net.h"
-#include "pc.h"
 #include "qemu-timer.h"
 
 /* from Linux's virtio_net.h */
@@ -62,15 +61,10 @@ typedef struct VirtIONet
     VirtQueue *tx_vq;
     VLANClientState *vc;
     int can_receive;
-    int tap_fd;
-    struct VirtIONet *next;
-    int do_notify;
     QEMUTimer *tx_timer;
     int tx_timer_active;
 } VirtIONet;
 
-static VirtIONet *VirtIONetHead = NULL;
-
 static VirtIONet *to_virtio_net(VirtIODevice *vdev)
 {
     return (VirtIONet *)vdev;
@@ -105,7 +99,6 @@ static int virtio_net_can_receive(void *opaque)
     return (n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK) && n->can_receive;
 }
 
-/* -net user receive function */
 static void virtio_net_receive(void *opaque, const uint8_t *buf, int size)
 {
     VirtIONet *n = opaque;
@@ -149,92 +142,6 @@ static void virtio_net_receive(void *opaque, const uint8_t *buf, int size)
     virtio_notify(&n->vdev, n->rx_vq);
 }
 
-/* -net tap receive handler */
-void virtio_net_poll(void)
-{
-    VirtIONet *vnet;
-    int len;
-    fd_set rfds;
-    struct timeval tv;
-    int max_fd = -1;
-    VirtQueueElement elem;
-    struct virtio_net_hdr *hdr;
-    int did_notify;
-
-    FD_ZERO(&rfds);
-    tv.tv_sec = 0;
-    tv.tv_usec = 0;
-
-    while (1) {
-
-        // Prepare the set of device to select from
-        for (vnet = VirtIONetHead; vnet; vnet = vnet->next) {
-
-            if (vnet->tap_fd == -1)
-                continue;
-
-            vnet->do_notify = 0;
-            //first check if the driver is ok
-            if (!virtio_net_can_receive(vnet))
-                continue;
-
-            /* FIXME: the drivers really need to set their status better */
-            if (vnet->rx_vq->vring.avail == NULL) {
-                vnet->can_receive = 0;
-                continue;
-            }
-
-            FD_SET(vnet->tap_fd, &rfds);
-            if (max_fd < vnet->tap_fd) max_fd = vnet->tap_fd;
-        }
-
-        if (select(max_fd + 1, &rfds, NULL, NULL, &tv) <= 0)
-            break;
-
-        // Now check who has data pending in the tap
-        for (vnet = VirtIONetHead; vnet; vnet = vnet->next) {
-
-            if (!FD_ISSET(vnet->tap_fd, &rfds))
-                continue;
-
-            if (virtqueue_pop(vnet->rx_vq, &elem) == 0) {
-                vnet->can_receive = 0;
-                continue;
-            }
-
-            if (elem.in_num < 1 || elem.in_sg[0].iov_len != sizeof(*hdr)) {
-                fprintf(stderr, "virtio-net header not in first element\n");
-                exit(1);
-            }
-
-            hdr = (void *)elem.in_sg[0].iov_base;
-            hdr->flags = 0;
-            hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;
-again:
-            len = readv(vnet->tap_fd, &elem.in_sg[1], elem.in_num - 1);
-            if (len == -1) {
-                if (errno == EINTR || errno == EAGAIN)
-                    goto again;
-                else
-                    fprintf(stderr, "reading network error %d", len);
-            }
-            virtqueue_push(vnet->rx_vq, &elem, sizeof(*hdr) + len);
-            vnet->do_notify = 1;
-        }
-
-        /* signal other side */
-        did_notify = 0;
-        for (vnet = VirtIONetHead; vnet; vnet = vnet->next)
-            if (vnet->do_notify) {
-                virtio_notify(&vnet->vdev, vnet->rx_vq);
-                did_notify++;
-            }
-        if (!did_notify)
-            break;
-    }
-
-}
-
 /* TX */
 static void virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq)
 {
@@ -313,12 +220,6 @@ PCIDevice *virtio_net_init(PCIBus *bus, NICInfo *nd, int devfn)
     memcpy(n->mac, nd->macaddr, 6);
     n->vc = qemu_new_vlan_client(nd->vlan, virtio_net_receive,
                                  virtio_net_can_receive, n);
-    n->tap_fd = hack_around_tap(n->vc->vlan->first_client);
-    if (n->tap_fd != -1) {
-        n->next = VirtIONetHead;
-        //push the device on top of the list
-        VirtIONetHead = n;
-    }
 
     n->tx_timer = qemu_new_timer(rt_clock, virtio_net_tx_timer, n);
     n->tx_timer_active = 0;
diff --git a/qemu/vl.c b/qemu/vl.c
index a1aa270..03051da 100644
--- a/qemu/vl.c
+++ b/qemu/vl.c
@@ -3970,15 +3970,8 @@ typedef struct TAPState {
     VLANClientState *vc;
     int fd;
     char down_script[1024];
-    int no_poll;
 } TAPState;
 
-static int tap_read_poll(void *opaque)
-{
-    TAPState *s = opaque;
-    return (!s->no_poll);
-}
-
 static void tap_receive(void *opaque, const uint8_t *buf, int size)
 {
     TAPState *s = opaque;
@@ -4012,22 +4005,6 @@ static void tap_send(void *opaque)
     }
 }
 
-int hack_around_tap(void *opaque)
-{
-    VLANClientState *vc = opaque;
-    TAPState *ts = vc->opaque;
-
-    if (vc->fd_read != tap_receive)
-        return -1;
-
-    if (ts) {
-        ts->no_poll = 1;
-        return ts->fd;
-    }
-
-    return -1;
-}
-
 /* fd support */
 
 static TAPState *net_tap_fd_init(VLANState *vlan, int fd)
@@ -4038,10 +4015,8 @@ static TAPState *net_tap_fd_init(VLANState *vlan, int fd)
     if (!s)
         return NULL;
     s->fd = fd;
-    s->no_poll = 0;
-    enable_sigio_timer(fd);
     s->vc = qemu_new_vlan_client(vlan, tap_receive, NULL, s);
-    qemu_set_fd_handler2(s->fd, tap_read_poll, tap_send, NULL, s);
+    qemu_set_fd_handler2(s->fd, NULL, tap_send, NULL, s);
     snprintf(s->vc->info_str, sizeof(s->vc->info_str), "tap: fd=%d", fd);
     return s;
 }
@@ -7417,10 +7392,7 @@ void main_loop_wait(int timeout)
         slirp_select_poll(&rfds, &wfds, &xfds);
     }
 #endif
-    virtio_net_poll();
-
     qemu_aio_poll();
-
     if (vm_running) {
         qemu_run_timers(&active_timers[QEMU_TIMER_VIRTUAL],
                         qemu_get_clock(vm_clock));
From: Anthony L. <ali...@us...> - 2008-05-07 18:10:21
We should check that the first element is the size we expect instead of just
casting blindly.

Signed-off-by: Anthony Liguori <ali...@us...>

diff --git a/qemu/hw/virtio-blk.c b/qemu/hw/virtio-blk.c
index 3af36db..048285a 100644
--- a/qemu/hw/virtio-blk.c
+++ b/qemu/hw/virtio-blk.c
@@ -56,8 +56,6 @@ struct virtio_blk_outhdr
     uint32_t ioprio;
     /* Sector (ie. 512 byte offset) */
     uint64_t sector;
-    /* Where to put reply. */
-    uint64_t id;
 };
 
 #define VIRTIO_BLK_S_OK        0
@@ -94,6 +92,17 @@ static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
         off_t off;
         int i;
 
+        if (elem.out_num < 1 || elem.in_num < 1) {
+            fprintf(stderr, "virtio-blk missing headers\n");
+            exit(1);
+        }
+
+        if (elem.out_sg[0].iov_len != sizeof(*out) ||
+            elem.in_sg[elem.in_num - 1].iov_len != sizeof(*in)) {
+            fprintf(stderr, "virtio-blk header not in correct element\n");
+            exit(1);
+        }
+
         out = (void *)elem.out_sg[0].iov_base;
         in = (void *)elem.in_sg[elem.in_num - 1].iov_base;
         off = out->sector;
diff --git a/qemu/hw/virtio-net.c b/qemu/hw/virtio-net.c
index f727b14..5ac5089 100644
--- a/qemu/hw/virtio-net.c
+++ b/qemu/hw/virtio-net.c
@@ -125,6 +125,11 @@ static void virtio_net_receive(void *opaque, const uint8_t *buf, int size)
         return;
     }
 
+    if (elem.in_num < 1 || elem.in_sg[0].iov_len != sizeof(*hdr)) {
+        fprintf(stderr, "virtio-net header not in first element\n");
+        exit(1);
+    }
+
     hdr = (void *)elem.in_sg[0].iov_base;
     hdr->flags = 0;
     hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;
@@ -197,6 +202,11 @@ void virtio_net_poll(void)
             continue;
         }
 
+        if (elem.in_num < 1 || elem.in_sg[0].iov_len != sizeof(*hdr)) {
+            fprintf(stderr, "virtio-net header not in first element\n");
+            exit(1);
+        }
+
         hdr = (void *)elem.in_sg[0].iov_base;
         hdr->flags = 0;
         hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;
From: Anthony L. <ali...@us...> - 2008-05-07 18:10:21
Normally, tap always reads packets and simply lets the client drop them if it
cannot receive them.  For virtio-net, this results in massive packet loss and
about an 80% performance loss in TCP throughput.

This patch modifies qemu_send_packet() to only deliver a packet to a VLAN
client if it doesn't have a fd_can_read method or the fd_can_read method
indicates that it can receive packets.  We also return a status of whether
any clients were able to receive the packet.

If no clients were able to receive a packet, we buffer the packet until a
client indicates that it can receive packets again.

This patch also modifies the tap code to only read from the tap fd if at
least one client on the VLAN is able to receive a packet.

Finally, this patch changes the tap code to drain all possible packets from
the tap device when the tap fd is readable.

Signed-off-by: Anthony Liguori <ali...@us...>

diff --git a/qemu/net.h b/qemu/net.h
index 13daa27..dfdf9af 100644
--- a/qemu/net.h
+++ b/qemu/net.h
@@ -29,7 +29,7 @@ VLANClientState *qemu_new_vlan_client(VLANState *vlan,
                                       IOCanRWHandler *fd_can_read,
                                       void *opaque);
 int qemu_can_send_packet(VLANClientState *vc);
-void qemu_send_packet(VLANClientState *vc, const uint8_t *buf, int size);
+int qemu_send_packet(VLANClientState *vc, const uint8_t *buf, int size);
 void qemu_handler_true(void *opaque);
 
 void do_info_network(void);
diff --git a/qemu/vl.c b/qemu/vl.c
index 03051da..28f8d1d 100644
--- a/qemu/vl.c
+++ b/qemu/vl.c
@@ -3754,10 +3754,11 @@ int qemu_can_send_packet(VLANClientState *vc1)
     return 0;
 }
 
-void qemu_send_packet(VLANClientState *vc1, const uint8_t *buf, int size)
+int qemu_send_packet(VLANClientState *vc1, const uint8_t *buf, int size)
 {
     VLANState *vlan = vc1->vlan;
     VLANClientState *vc;
+    int ret = -EAGAIN;
 
 #if 0
     printf("vlan %d send:\n", vlan->id);
@@ -3765,9 +3766,14 @@ void qemu_send_packet(VLANClientState *vc1, const uint8_t *buf, int size)
 #endif
     for(vc = vlan->first_client; vc != NULL; vc = vc->next) {
         if (vc != vc1) {
-            vc->fd_read(vc->opaque, buf, size);
+            if (!vc->fd_can_read || vc->fd_can_read(vc->opaque)) {
+                vc->fd_read(vc->opaque, buf, size);
+                ret = 0;
+            }
         }
     }
+
+    return ret;
 }
 
 #if defined(CONFIG_SLIRP)
@@ -3970,6 +3976,8 @@ typedef struct TAPState {
     VLANClientState *vc;
     int fd;
     char down_script[1024];
+    char buf[4096];
+    int size;
 } TAPState;
 
 static void tap_receive(void *opaque, const uint8_t *buf, int size)
@@ -3985,24 +3993,70 @@ static void tap_receive(void *opaque, const uint8_t *buf, int size)
     }
 }
 
+static int tap_can_send(void *opaque)
+{
+    TAPState *s = opaque;
+    VLANClientState *vc;
+    int can_receive = 0;
+
+    /* Check to see if any of our clients can receive a packet */
+    for (vc = s->vc->vlan->first_client; vc; vc = vc->next) {
+        /* Skip ourselves */
+        if (vc == s->vc)
+            continue;
+
+        if (!vc->fd_can_read) {
+            /* no fd_can_read handler, they always can receive */
+            can_receive = 1;
+        } else
+            can_receive = vc->fd_can_read(vc->opaque);
+
+        /* Once someone can receive, we try to send a packet */
+        if (can_receive)
+            break;
+    }
+
+    return can_receive;
+}
+
 static void tap_send(void *opaque)
 {
     TAPState *s = opaque;
-    uint8_t buf[4096];
-    int size;
+    /* First try to send any buffered packet */
+    if (s->size > 0) {
+        int err;
+
+        /* If noone can receive the packet, buffer it */
+        err = qemu_send_packet(s->vc, s->buf, s->size);
+        if (err == -EAGAIN)
+            return;
+    }
+
+    /* Read packets until we hit EAGAIN */
+    do {
 #ifdef __sun__
-    struct strbuf sbuf;
-    int f = 0;
-    sbuf.maxlen = sizeof(buf);
-    sbuf.buf = buf;
-    size = getmsg(s->fd, NULL, &sbuf, &f) >=0 ? sbuf.len : -1;
+        struct strbuf sbuf;
+        int f = 0;
+        sbuf.maxlen = sizeof(s->buf);
+        sbuf.buf = s->buf;
+        s->size = getmsg(s->fd, NULL, &sbuf, &f) >=0 ? sbuf.len : -1;
 #else
-    size = read(s->fd, buf, sizeof(buf));
+        s->size = read(s->fd, s->buf, sizeof(s->buf));
 #endif
-    if (size > 0) {
-        qemu_send_packet(s->vc, buf, size);
-    }
+
+        if (s->size == -1 && errno == EINTR)
+            continue;
+
+        if (s->size > 0) {
+            int err;
+
+            /* If noone can receive the packet, buffer it */
+            err = qemu_send_packet(s->vc, s->buf, s->size);
+            if (err == -EAGAIN)
+                break;
+        }
+    } while (s->size > 0);
 }
 
 /* fd support */
@@ -4016,7 +4070,7 @@ static TAPState *net_tap_fd_init(VLANState *vlan, int fd)
         return NULL;
     s->fd = fd;
     s->vc = qemu_new_vlan_client(vlan, tap_receive, NULL, s);
-    qemu_set_fd_handler2(s->fd, NULL, tap_send, NULL, s);
+    qemu_set_fd_handler2(s->fd, tap_can_send, tap_send, NULL, s);
     snprintf(s->vc->info_str, sizeof(s->vc->info_str), "tap: fd=%d", fd);
     return s;
 }
From: Avi K. <av...@qu...> - 2008-05-11 14:34:08
Anthony Liguori wrote:
> Normally, tap always reads packets and simply lets the client drop them if it
> cannot receive them.  For virtio-net, this results in massive packet loss and
> about an 80% performance loss in TCP throughput.
>
> This patch modifies qemu_send_packet() to only deliver a packet to a VLAN
> client if it doesn't have a fd_can_read method or the fd_can_read method
> indicates that it can receive packets.  We also return a status of whether
> any clients were able to receive the packet.
>
> If no clients were able to receive a packet, we buffer the packet until a
> client indicates that it can receive packets again.
>
> This patch also modifies the tap code to only read from the tap fd if at least
> one client on the VLAN is able to receive a packet.
>
> Finally, this patch changes the tap code to drain all possible packets from
> the tap device when the tap fd is readable.
>
>  #if defined(CONFIG_SLIRP)
> @@ -3970,6 +3976,8 @@ typedef struct TAPState {
>      VLANClientState *vc;
>      int fd;
>      char down_script[1024];
> +    char buf[4096];
> +    int size;
>  } TAPState;
>

This breaks large MTUs.

How about the other way round: when the vlan consumer detects it can no
longer receive packets, it tells that to the vlan.  When all vlan consumers
can no longer receive, tell the producer to stop producing.  For the tap
producer, this is simply removing its fd from the read poll list.  When a
vlan consumer becomes ready to receive again, it tells the vlan, which tells
the producers, which then install their fds back again.

This is a bit difficult since virtio and tap are both consumers and
producers, but could be made to work, I think.

-- 
error compiling committee.c: too many arguments to function
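A minimal sketch of the consumer-driven flow control described above, under
the assumption of invented names: qemu_vlan_set_receivable(), pause_producer
and resume_producer do not exist in QEMU, and the struct fields shown are not
the real VLANClientState layout.

    /* Sketch only: consumer-driven flow control for a vlan.  All names and
     * fields here are hypothetical, not part of the QEMU tree this thread
     * patches. */
    typedef struct VLANClientState VLANClientState;
    typedef struct VLANState VLANState;

    struct VLANClientState {
        VLANClientState *next;
        VLANState *vlan;
        int can_receive;                  /* consumer: do I have room? */
        void (*pause_producer)(void *);   /* producer: stop polling your fd */
        void (*resume_producer)(void *);  /* producer: start polling again */
        void *opaque;
    };

    struct VLANState {
        VLANClientState *first_client;
    };

    /* A consumer (e.g. virtio-net) calls this whenever its rx state changes. */
    static void qemu_vlan_set_receivable(VLANClientState *vc, int can_receive)
    {
        VLANClientState *c;
        int any_receiver = 0;

        vc->can_receive = can_receive;

        /* Can anyone on this vlan still take a packet? */
        for (c = vc->vlan->first_client; c; c = c->next)
            any_receiver |= c->can_receive;

        /* Tell every producer to stop or restart reading from its source;
         * for tap this amounts to removing or re-adding its fd to the read
         * poll list. */
        for (c = vc->vlan->first_client; c; c = c->next) {
            if (any_receiver && c->resume_producer)
                c->resume_producer(c->opaque);
            else if (!any_receiver && c->pause_producer)
                c->pause_producer(c->opaque);
        }
    }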
From: Avi K. <av...@qu...> - 2008-05-11 18:52:06
Anthony Liguori wrote:
>> How about the other way round: when the vlan consumer detects it can
>> no longer receive packets, it tells that to the vlan.  When all vlan
>> consumers can no longer receive, tell the producer to stop
>> producing.  For the tap producer, this is simply removing its fd from
>> the read poll list.  When a vlan consumer becomes ready to receive
>> again, it tells the vlan, which tells the producers, which then
>> install their fds back again.
>
> Yeah, that's a nice idea.  I'll think about it.  I don't know if it's
> really worth doing as an intermediate step though.  What I'd really
> like to do is have a vlan interface where consumers published all of
> their receive buffers.  Then there's no need for notifications of
> receive-ability.

That's definitely better, and is also more multiqueue nic / vringfd friendly.

I still think interrupt-on-halfway-mark is needed much more urgently.  It
deals with concurrency much better:

rx:
  host interrupts guest on halfway mark
  guest starts processing packets
  host keeps delivering

tx:
  guest kicks host on halfway mark
  host starts processing packets
  second vcpu on guest keeps on queueing

It's also much better with multiqueue NICs, where there's no socket buffer to
hold the packets while we're out of descriptors.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.
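For illustration, the halfway-mark test itself is tiny.  The structure below
is a made-up stand-in for the real vring bookkeeping, not actual QEMU or
virtio code; a sketch under those assumptions only.

    /* Sketch: only notify the other side once half the ring is outstanding.
     * Field names are invented for illustration. */
    struct ring_progress {
        uint16_t size;          /* entries in the ring */
        uint16_t produced_idx;  /* how far we have produced */
        uint16_t notified_idx;  /* where we last notified the other side */
    };

    static int past_halfway_mark(const struct ring_progress *r)
    {
        uint16_t outstanding = r->produced_idx - r->notified_idx;

        /* A timer (not shown) would still bound latency when traffic is
         * light, so a lone packet is not stuck waiting for the halfway
         * mark. */
        return outstanding >= r->size / 2;
    }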
From: Anthony L. <an...@co...> - 2008-05-11 20:44:41
Avi Kivity wrote:
> Anthony Liguori wrote:
>
>>> How about the other way round: when the vlan consumer detects it can
>>> no longer receive packets, it tells that to the vlan.  When all vlan
>>> consumers can no longer receive, tell the producer to stop
>>> producing.  For the tap producer, this is simply removing its fd from
>>> the read poll list.  When a vlan consumer becomes ready to receive
>>> again, it tells the vlan, which tells the producers, which then
>>> install their fds back again.
>>>
>> Yeah, that's a nice idea.  I'll think about it.  I don't know if it's
>> really worth doing as an intermediate step though.  What I'd really
>> like to do is have a vlan interface where consumers published all of
>> their receive buffers.  Then there's no need for notifications of
>> receive-ability.
>>
>
> That's definitely better, and is also more multiqueue nic / vringfd
> friendly.
>
> I still think interrupt-on-halfway-mark is needed much more urgently.
> It deals with concurrency much better:
>

We already sort of do this.  In try_fill_recv() in virtio-net.c, we try to
allocate as many skbs as we can to fill the rx queue.  We keep track of how
many we've been able to allocate.  Whenever we process an RX packet, we check
to see if the current number of RX packets is less than half the maximum
number of rx packets we've been able to allocate.

In the common case of small queues, this should have exactly the behavior you
describe.  We don't currently suppress the RX notification even though we
really could.  The can_receive changes are really the key to this.  Unless we
are suppressing tap fd select()'ing, we can always suppress the RX
notification.  That's been sitting on my TODO for a bit.

> rx:
>   host interrupts guest on halfway mark
>   guest starts processing packets
>   host keeps delivering
>
> tx:
>   guest kicks host on halfway mark
>   host starts processing packets
>   second vcpu on guest keeps on queueing
>

I'm not convinced it's at all practical to suppress notifications in the
front-end.  We simply don't know whether we'll get more packets, which means
we have to do TX mitigation within the front-end.  We've been there; it's not
as nice as doing it in the back-end.

What we really ought to do in the back-end though, is start processing the TX
packets as soon as we begin to do TX mitigation.  This would address the ping
latency issue while also increasing throughput (hopefully).  One thing I've
wanted to try is to register a bottom-half or a polling function so that the
IO thread was always trying to process TX packets while the TX timer is
active.

Regards,

Anthony Liguori

> It's also much better with multiqueue NICs, where there's no socket
> buffer to hold the packets while we're out of descriptors.
>
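The bottom-half idea in the last paragraph could look roughly like the
following, reusing QEMU's qemu_bh_new()/qemu_bh_schedule() and the VirtIONet
fields visible in the patches; the tx_bh field is an assumption, and this is
a sketch rather than what was actually implemented.

    /* Sketch: while TX mitigation is active, keep a bottom half scheduled so
     * the IO thread keeps draining the TX queue between timer ticks. */
    static void virtio_net_tx_bh(void *opaque)
    {
        VirtIONet *n = opaque;

        virtio_net_flush_tx(n, n->tx_vq);

        /* Re-arm ourselves for as long as the mitigation timer is running. */
        if (n->tx_timer_active)
            qemu_bh_schedule(n->tx_bh);   /* n->tx_bh: hypothetical new field */
    }

    /* In virtio_net_init() one would then add something like:
     *     n->tx_bh = qemu_bh_new(virtio_net_tx_bh, n);
     * and schedule it at the point where the TX timer is armed. */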
From: Avi K. <av...@qu...> - 2008-05-12 06:59:54
Anthony Liguori wrote:
> Avi Kivity wrote:
>> Anthony Liguori wrote:
>>
>>>> How about the other way round: when the vlan consumer detects it
>>>> can no longer receive packets, it tells that to the vlan.  When all
>>>> vlan consumers can no longer receive, tell the producer to stop
>>>> producing.  For the tap producer, this is simply removing its fd
>>>> from the read poll list.  When a vlan consumer becomes ready to
>>>> receive again, it tells the vlan, which tells the producers, which
>>>> then install their fds back again.
>>>>
>>> Yeah, that's a nice idea.  I'll think about it.  I don't know if
>>> it's really worth doing as an intermediate step though.  What I'd
>>> really like to do is have a vlan interface where consumers published
>>> all of their receive buffers.  Then there's no need for
>>> notifications of receive-ability.
>>>
>>
>> That's definitely better, and is also more multiqueue nic / vringfd
>> friendly.
>>
>> I still think interrupt-on-halfway-mark is needed much more
>> urgently.  It deals with concurrency much better:
>>
>
> We already sort of do this.  In try_fill_recv() in virtio-net.c, we
> try to allocate as many skbs as we can to fill the rx queue.  We keep
> track of how many we've been able to allocate.  Whenever we process an
> RX packet, we check to see if the current number of RX packets is less
> than half the maximum number of rx packets we've been able to allocate.
>

Then why are we dropping packets?  We shouldn't run out of rx descriptors if
this works.

> In the common case of small queues, this should have exactly the
> behavior you describe.  We don't currently suppress the RX
> notification even though we really could.  The can_receive changes are
> really the key to this.  Unless we are suppressing tap fd
> select()'ing, we can always suppress the RX notification.  That's been
> sitting on my TODO for a bit.
>

Sorry, totally confused.  How does can_receive come into this?

>> rx:
>>   host interrupts guest on halfway mark
>>   guest starts processing packets
>>   host keeps delivering
>>
>> tx:
>>   guest kicks host on halfway mark
>>   host starts processing packets
>>   second vcpu on guest keeps on queueing
>>
>
> I'm not convinced it's at all practical to suppress notifications in
> the front-end.  We simply don't know whether we'll get more packets
> which means we have to do TX mitigation within the front-end.  We've
> been there, it's not as nice as doing it in the back-end.
>
> What we really ought to do in the back-end though, is start processing
> the TX packets as soon as we begin to do TX mitigation.  This would
> address the ping latency issue while also increasing throughput
> (hopefully).  One thing I've wanted to try is to register a
> bottom-half or a polling function so that the IO thread was always
> trying to process TX packets while the TX timer is active.

Setting the timer clearly should be the host's job.  But suppression needs to
be coordinated.  One fairly generic mechanism is for one side to tell the
other at which ring index it wants the notification.

There are four cases to consider:

- tx guest->host

  Normally the host sets the notify index at current+1, to get immediate
  notifications.  If it starts seeing many packets, it can gradually increase
  this to current+size/2, setting a timer to limit latency.  If the timer
  fires, drop the window size.

- tx host->guest

  Returning descriptors is not an urgent task if the ring is large enough.
  Can be set to size/2 plus a longish timer.

- rx guest->host

  Similar to above, set to size/2; don't even need a timer.

- rx host->guest

  Similar to tx guest->host: start with a small window to get minimal
  latency; if we see many packets within a short time we can increase the
  window and set a timer.

We should aim for the timer to fire only rarely.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.
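The "tell the other side at which ring index it wants the notification"
mechanism boils down to a wrap-around interval test.  The sketch below uses
invented names (notify_idx); it is essentially the idea that later became
virtio's event-index feature, not code from this patch series.

    /* Sketch: each side publishes the index at which it wants to be woken;
     * the producer only notifies when its progress crosses that index. */
    static int need_notify(uint16_t old_idx, uint16_t new_idx,
                           uint16_t notify_idx)
    {
        /* True iff notify_idx lies in the half-open interval
         * (old_idx, new_idx], modulo 16-bit wrap-around. */
        return (uint16_t)(new_idx - notify_idx - 1) <
               (uint16_t)(new_idx - old_idx);
    }

    /* Producer side, after advancing from old_idx to new_idx:
     *     if (need_notify(old_idx, new_idx, peer_notify_idx))
     *         kick_or_interrupt_the_peer();
     * Each side then moves its own notify_idx between current+1 (minimal
     * latency) and current+size/2 (batching), with a timer as the latency
     * backstop, as described in the four cases above. */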
From: Anthony L. <ali...@us...> - 2008-05-07 18:10:39
In the final patch of this series, we rely on a VLAN client's fd_can_read
method to avoid dropping packets.  Unfortunately, virtio's fd_can_read method
is not very accurate at the moment.  This patch addresses that.

It also generates a notification to the IO thread when more RX packets become
available.  If we say we can't receive a packet because no RX buffers are
available, this may result in the tap file descriptor not being select()'d.
Without notifying the IO thread, we may have to wait until the select() times
out before we can receive a packet (even if there is one pending).

This particular change makes RX performance very consistent.

Signed-off-by: Anthony Liguori <ali...@us...>

diff --git a/qemu/hw/virtio-net.c b/qemu/hw/virtio-net.c
index 8d26832..5538979 100644
--- a/qemu/hw/virtio-net.c
+++ b/qemu/hw/virtio-net.c
@@ -14,6 +14,7 @@
 #include "virtio.h"
 #include "net.h"
 #include "qemu-timer.h"
+#include "qemu-kvm.h"
 
 /* from Linux's virtio_net.h */
 
@@ -60,11 +61,14 @@ typedef struct VirtIONet
     VirtQueue *rx_vq;
     VirtQueue *tx_vq;
     VLANClientState *vc;
-    int can_receive;
     QEMUTimer *tx_timer;
     int tx_timer_active;
 } VirtIONet;
 
+/* TODO
+ * - we could suppress RX interrupt if we were so inclined.
+ */
+
 static VirtIONet *to_virtio_net(VirtIODevice *vdev)
 {
     return (VirtIONet *)vdev;
@@ -88,15 +92,24 @@ static uint32_t virtio_net_get_features(VirtIODevice *vdev)
 
 static void virtio_net_handle_rx(VirtIODevice *vdev, VirtQueue *vq)
 {
-    VirtIONet *n = to_virtio_net(vdev);
-    n->can_receive = 1;
+    /* We now have RX buffers, signal to the IO thread to break out of the
+       select to re-poll the tap file descriptor */
+    if (kvm_enabled())
+        qemu_kvm_notify_work();
 }
 
 static int virtio_net_can_receive(void *opaque)
 {
     VirtIONet *n = opaque;
 
-    return (n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK) && n->can_receive;
+    if (n->rx_vq->vring.avail == NULL ||
+        !(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK))
+        return 0;
+
+    if (n->rx_vq->vring.avail->idx == n->rx_vq->last_avail_idx)
+        return 0;
+
+    return 1;
 }
 
 static void virtio_net_receive(void *opaque, const uint8_t *buf, int size)
@@ -106,15 +119,8 @@ static void virtio_net_receive(void *opaque, const uint8_t *buf, int size)
     struct virtio_net_hdr *hdr;
     int offset, i;
 
-    /* FIXME: the drivers really need to set their status better */
-    if (n->rx_vq->vring.avail == NULL) {
-        n->can_receive = 0;
-        return;
-    }
-
     if (virtqueue_pop(n->rx_vq, &elem) == 0) {
-        /* wait until the guest adds some rx bufs */
-        n->can_receive = 0;
+        fprintf(stderr, "virtio_net: this should not happen\n");
         return;
     }
 
@@ -209,9 +215,8 @@ PCIDevice *virtio_net_init(PCIBus *bus, NICInfo *nd, int devfn)
     n->vdev.update_config = virtio_net_update_config;
     n->vdev.get_features = virtio_net_get_features;
-    n->rx_vq = virtio_add_queue(&n->vdev, 512, virtio_net_handle_rx);
+    n->rx_vq = virtio_add_queue(&n->vdev, 128, virtio_net_handle_rx);
     n->tx_vq = virtio_add_queue(&n->vdev, 128, virtio_net_handle_tx);
-    n->can_receive = 0;
 
     memcpy(n->mac, nd->macaddr, 6);
     n->vc = qemu_new_vlan_client(nd->vlan, virtio_net_receive,
                                  virtio_net_can_receive, n);
From: Avi K. <av...@qu...> - 2008-05-09 15:24:31
Anthony Liguori wrote:
> We're pretty sloppy in virtio right now about phys_ram_base assumptions.  This
> patch is an incremental step between what we have today and a full blown DMA
> API.  I backported the DMA API but the performance impact was not acceptable
> to me.  There's only a slight performance impact with this particular patch.
>
> Since we're no longer assuming guest physical memory is contiguous, we need
> a more complex way to validate the memory regions than just checking if it's
> within ram_size.
>

Applied patches 1-2.  Since patch 4 is under contention on qemu-devel, and 3
and 5 depend on it, I'd like to get the can_receive semantic change accepted
first.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.
From: Anthony L. <ali...@us...> - 2008-05-09 18:37:14
Avi Kivity wrote:
> Anthony Liguori wrote:
>> We're pretty sloppy in virtio right now about phys_ram_base assumptions.
>> This patch is an incremental step between what we have today and a full
>> blown DMA API.  I backported the DMA API but the performance impact was
>> not acceptable to me.  There's only a slight performance impact with this
>> particular patch.
>>
>> Since we're no longer assuming guest physical memory is contiguous, we
>> need a more complex way to validate the memory regions than just checking
>> if it's within ram_size.
>>
>
> Applied patches 1-2.  Since patch 4 is under contention on qemu-devel,
> and 3 and 5 depend on it, I'd like to get the can_receive semantic
> change accepted first.

I'll send it upstream, but I think it's much less of a divergence than the
current virtio_net_poll hacks.

Regards,

Anthony Liguori
From: Anthony L. <ali...@us...> - 2008-05-11 18:33:07
Avi Kivity wrote:
> Anthony Liguori wrote:
>> Normally, tap always reads packets and simply lets the client drop them if it
>> cannot receive them.  For virtio-net, this results in massive packet loss and
>> about an 80% performance loss in TCP throughput.
>>
>> This patch modifies qemu_send_packet() to only deliver a packet to a VLAN
>> client if it doesn't have a fd_can_read method or the fd_can_read method
>> indicates that it can receive packets.  We also return a status of whether
>> any clients were able to receive the packet.
>>
>> If no clients were able to receive a packet, we buffer the packet until a
>> client indicates that it can receive packets again.
>>
>> This patch also modifies the tap code to only read from the tap fd if at least
>> one client on the VLAN is able to receive a packet.
>>
>> Finally, this patch changes the tap code to drain all possible packets from
>> the tap device when the tap fd is readable.
>>
>>  #if defined(CONFIG_SLIRP)
>> @@ -3970,6 +3976,8 @@ typedef struct TAPState {
>>      VLANClientState *vc;
>>      int fd;
>>      char down_script[1024];
>> +    char buf[4096];
>> +    int size;
>>  } TAPState;
>>
>
> This breaks large MTUs.

They've always been broken for tap.

> How about the other way round: when the vlan consumer detects it can
> no longer receive packets, it tells that to the vlan.  When all vlan
> consumers can no longer receive, tell the producer to stop producing.
> For the tap producer, this is simply removing its fd from the read
> poll list.  When a vlan consumer becomes ready to receive again, it
> tells the vlan, which tells the producers, which then install their
> fds back again.

Yeah, that's a nice idea.  I'll think about it.  I don't know if it's really
worth doing as an intermediate step though.  What I'd really like to do is
have a vlan interface where consumers published all of their receive buffers.
Then there's no need for notifications of receive-ability.

Regards,

Anthony Liguori

> This is a bit difficult since virtio and tap are both consumers and
> producers, but could be made to work, I think.
>
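A very rough sketch of what a "consumers publish their receive buffers" vlan
interface might look like; this is purely hypothetical, and nothing like it
existed in QEMU at the time.

    /* Sketch: instead of fd_can_read/fd_read, a consumer posts iovecs it is
     * willing to receive into, and the producer fills them directly.  The
     * type and both function names below are invented for illustration. */
    typedef struct VLANRxBuffer {
        struct iovec *iov;
        int iovcnt;
        void (*complete)(void *opaque, size_t len);  /* packet delivered */
        void *opaque;
    } VLANRxBuffer;

    /* Consumer posts a buffer; producer takes the next available one. */
    int qemu_vlan_post_rx_buffer(VLANClientState *vc, VLANRxBuffer *buf);
    VLANRxBuffer *qemu_vlan_get_rx_buffer(VLANState *vlan);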