From: Anthony L. <ali...@us...> - 2007-11-08 02:51:08
This still needs quite a lot of work but I wanted to post it for reference. Regards, Anthony Liguori diff --git a/qemu/Makefile.target b/qemu/Makefile.target index 65f449e..3032337 100644 --- a/qemu/Makefile.target +++ b/qemu/Makefile.target @@ -448,6 +448,8 @@ VL_OBJS += rtl8139.o # PCI Hypercall VL_OBJS+= hypercall.o +VL_OBJS += virtio.o + ifeq ($(TARGET_BASE_ARCH), i386) # Hardware support VL_OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o $(AUDIODRV) diff --git a/qemu/hw/pc.c b/qemu/hw/pc.c index 8aae814..9b17938 100644 --- a/qemu/hw/pc.c +++ b/qemu/hw/pc.c @@ -943,6 +943,11 @@ static void pc_init1(ram_addr_t ram_size, int vga_ram_size, int boot_device, #ifdef USE_HYPERCALL pci_hypercall_init(pci_bus); #endif + + if (1) { + virtio_init_pci(pci_bus, "virtio", 0x5002, 0x2258); + } + if (pci_enabled) { pci_piix3_ide_init(pci_bus, bs_table, piix3_devfn + 1, i8259); } else { diff --git a/qemu/hw/virtio.c b/qemu/hw/virtio.c new file mode 100644 index 0000000..94efe5a --- /dev/null +++ b/qemu/hw/virtio.c @@ -0,0 +1,486 @@ +#include "vl.h" +#include <linux/virtio_pci.h> +#include <err.h> + +#define DPRINTF(fmt, ...) do { printf(fmt, ## __VA_ARGS__); } while (0) + +#define wmb() asm volatile("sfence" ::: "memory") + +/* This marks a buffer as continuing via the next field. */ +#define VRING_DESC_F_NEXT 1 +/* This marks a buffer as write-only (otherwise read-only). */ +#define VRING_DESC_F_WRITE 2 + +/* This means don't notify other side when buffer added. */ +#define VRING_USED_F_NO_NOTIFY 1 +/* This means don't interrupt guest when buffer consumed. */ +#define VRING_AVAIL_F_NO_INTERRUPT 1 + +typedef struct VRingDesc +{ + uint64_t addr; + uint32_t len; + uint16_t flags; + uint16_t next; +} VRingDesc; + +typedef struct VRingAvail +{ + uint16_t flags; + uint16_t idx; + uint16_t ring[0]; +} VRingAvail; + +typedef struct VRingUsedElem +{ + uint32_t id; + uint32_t len; +} VRingUsedElem; + +typedef struct VRingUsed +{ + uint16_t flags; + uint16_t idx; + VRingUsedElem ring[0]; +} VRingUsed; + +typedef struct VRing +{ + unsigned int num; + VRingDesc *desc; + VRingAvail *avail; + VRingUsed *used; +} VRing; + +struct VirtQueue +{ + VRing vring; + uint32_t pfn; + uint16_t last_avail_idx; + void (*handle_output)(void *opaque, VirtIODevice *vdev, VirtQueue *vq); + void *opaque; +}; + +#define VIRTIO_PCI_QUEUE_MAX 16 + +struct VirtIODevice +{ + PCIDevice pci_dev; + const char *name; + int irq; + uint32_t addr; + uint16_t vendor; + uint16_t device; + uint8_t status; + uint8_t isr; + uint16_t queue_sel; + uint32_t features; + uint32_t (*get_config)(void *opaque, unsigned offset); + void (*set_config)(void *opaque, unsigned offset, uint32_t val); + uint32_t (*get_features)(void *opaque); + void (*set_features)(void *opaque, uint32_t val); + void *opaque; + VirtQueue vq[VIRTIO_PCI_QUEUE_MAX]; +}; + +static void vring_init(VirtQueue *vq, void *p) +{ + vq->vring.desc = p; + vq->vring.avail = p + vq->vring.num * sizeof(VRingDesc); + vq->vring.used = p + (vq->vring.num + 1) * (sizeof(VRingDesc) + sizeof(uint16_t)); +} + +static VirtIODevice *to_virtio_device(PCIDevice *pci_dev) +{ + return (VirtIODevice *)pci_dev; +} + +static unsigned next_desc(VirtQueue *vq, unsigned int i) +{ + unsigned int next; + + /* If this descriptor says it doesn't chain, we're done. */ + if (!(vq->vring.desc[i].flags & VRING_DESC_F_NEXT)) + return vq->vring.num; + + /* Check they're not leading us off end of descriptors. */ + next = vq->vring.desc[i].next; + /* Make sure compiler knows to grab that: we don't want it changing! 
*/ + wmb(); + + if (next >= vq->vring.num) + errx(1, "Desc next is %u", next); + + return next; +} + +static void *check_pointer(unsigned long addr, unsigned int size) +{ + if ((addr + size) > ram_size) + errx(1, "bad guest"); + return phys_ram_base + addr; +} + +static unsigned get_vq_desc(VirtQueue *vq, + struct iovec iov[], + unsigned int *out_num, unsigned int *in_num) +{ + unsigned int i, head; + + /* Check it isn't doing very strange things with descriptor numbers. */ + if ((uint16_t)(vq->vring.avail->idx - vq->last_avail_idx) > vq->vring.num) + errx(1, "Guest moved used index from %u to %u", + vq->last_avail_idx, vq->vring.avail->idx); + + /* If there's nothing new since last we looked, return invalid. */ + if (vq->vring.avail->idx == vq->last_avail_idx) + return vq->vring.num; + + /* Grab the next descriptor number they're advertising, and increment + * the index we've seen. */ + head = vq->vring.avail->ring[vq->last_avail_idx++ % vq->vring.num]; + + /* If their number is silly, that's a fatal mistake. */ + if (head >= vq->vring.num) + errx(1, "Guest says index %u is available", head); + + /* When we start there are none of either input nor output. */ + *out_num = *in_num = 0; + + i = head; + do { + /* Grab the first descriptor, and check it's OK. */ + iov[*out_num + *in_num].iov_len = vq->vring.desc[i].len; + iov[*out_num + *in_num].iov_base + = check_pointer(vq->vring.desc[i].addr, + vq->vring.desc[i].len); + /* If this is an input descriptor, increment that count. */ + if (vq->vring.desc[i].flags & VRING_DESC_F_WRITE) + (*in_num)++; + else { + /* If it's an output descriptor, they're all supposed + * to come before any input descriptors. */ + if (*in_num) + errx(1, "Descriptor has out after in"); + (*out_num)++; + } + + /* If we've got too many, that implies a descriptor loop. 
*/ + if (*out_num + *in_num > vq->vring.num) + errx(1, "Looped descriptor"); + } while ((i = next_desc(vq, i)) != vq->vring.num); + + return head; +} + +static void virtio_ring_kick(VirtIODevice *vdev, VirtQueue *vq) +{ + if (vq->vring.desc) + vq->handle_output(vq->opaque, vdev, vq); +} + +static void virtio_update_irq(VirtIODevice *vdev) +{ + qemu_set_irq(vdev->pci_dev.irq[0], vdev->isr & 1); +} + +static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val) +{ + VirtIODevice *vdev = to_virtio_device(opaque); + ram_addr_t pa; + + addr -= vdev->addr; + + switch (addr) { + case VIRTIO_PCI_GUEST_FEATURES: + if (vdev->set_features) + vdev->set_features(vdev->opaque, val); + vdev->features = val; + break; + case VIRTIO_PCI_QUEUE_PFN: + pa = (ram_addr_t)val << TARGET_PAGE_BITS; + vdev->vq[vdev->queue_sel].pfn = val; + if (pa < (ram_size - TARGET_PAGE_SIZE)) + vring_init(&vdev->vq[vdev->queue_sel], phys_ram_base + pa); + break; + case VIRTIO_PCI_QUEUE_SEL: + if (val < VIRTIO_PCI_QUEUE_MAX) + vdev->queue_sel = val; + break; + case VIRTIO_PCI_QUEUE_NOTIFY: + if (val < VIRTIO_PCI_QUEUE_MAX) + virtio_ring_kick(vdev, &vdev->vq[val]); + break; + case VIRTIO_PCI_STATUS: + vdev->status = val & 0xFF; + break; + default: + if (addr >= VIRTIO_PCI_CONFIG && vdev->set_config) + vdev->set_config(vdev->opaque, addr - VIRTIO_PCI_CONFIG, val); + break; + } +} + +static uint32_t virtio_ioport_read(void *opaque, uint32_t addr) +{ + VirtIODevice *vdev = to_virtio_device(opaque); + uint32_t ret = 0xFFFFFFFF; + + addr -= vdev->addr; + + switch (addr) { + case VIRTIO_PCI_HOST_FEATURES: + ret = vdev->get_features(vdev->opaque); + break; + case VIRTIO_PCI_GUEST_FEATURES: + ret = vdev->features; + break; + case VIRTIO_PCI_QUEUE_PFN: + ret = vdev->vq[vdev->queue_sel].pfn; + break; + case VIRTIO_PCI_QUEUE_NUM: + ret = vdev->vq[vdev->queue_sel].vring.num; + break; + case VIRTIO_PCI_QUEUE_SEL: + ret = vdev->queue_sel; + break; + case VIRTIO_PCI_STATUS: + ret = vdev->status; + break; + case VIRTIO_PCI_ISR: + /* reading from the ISR also clears it. */ + ret = vdev->isr; + vdev->isr = 0; + virtio_update_irq(vdev); + break; + default: + if (addr >= VIRTIO_PCI_CONFIG) + ret = vdev->get_config(vdev->opaque, addr - VIRTIO_PCI_CONFIG); + break; + } + + return ret; +} + +static void virtio_map(PCIDevice *pci_dev, int region_num, + uint32_t addr, uint32_t size, int type) +{ + VirtIODevice *vdev = to_virtio_device(pci_dev); + + vdev->addr = addr; + register_ioport_write(addr, size, 1, virtio_ioport_write, vdev); + register_ioport_write(addr, size, 2, virtio_ioport_write, vdev); + register_ioport_write(addr, size, 4, virtio_ioport_write, vdev); + register_ioport_read(addr, size, 1, virtio_ioport_read, vdev); + register_ioport_read(addr, size, 2, virtio_ioport_read, vdev); + register_ioport_read(addr, size, 4, virtio_ioport_read, vdev); +} + +/* Convert an iovec element to the given type. + * + * This is a fairly ugly trick: we need to know the size of the type and + * alignment requirement to check the pointer is kosher. It's also nice to + * have the name of the type in case we report failure. + * + * Typing those three things all the time is cumbersome and error prone, so we + * have a macro which sets them all up and passes to the real function. 
*/ +#define convert(iov, type) \ + ((type *)_convert((iov), sizeof(type), __alignof__(type), #type)) + +static void *_convert(struct iovec *iov, size_t size, size_t align, + const char *name) +{ + if (iov->iov_len != size) + errx(1, "Bad iovec size %zu for %s", iov->iov_len, name); + if ((unsigned long)iov->iov_base % align != 0) + errx(1, "Bad alignment %p for %s", iov->iov_base, name); + return iov->iov_base; +} + +static void add_used(VirtQueue *vq, unsigned int head, int len) +{ + VRingUsedElem *used; + + /* Get a pointer to the next entry in the used ring. */ + used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num]; + used->id = head; + used->len = len; + /* Make sure buffer is written before we update index. */ + wmb(); + vq->vring.used->idx++; +} + +void virtio_add_queue(VirtIODevice *vdev, unsigned int num, + void (*handle_output)(void *, VirtIODevice *, VirtQueue *), + void *opaque) +{ + int i; + + for (i = 0; i < VIRTIO_PCI_QUEUE_MAX; i++) { + if (vdev->vq[i].vring.num == 0) + break; + } + + /* FIXME bug_on i == VIRTIO_PCI_QUEUE_MAX */ + + vdev->vq[i].vring.num = num; + vdev->vq[i].handle_output = handle_output; + vdev->vq[i].opaque = opaque; +} + +#include <linux/virtio_blk.h> +#include <stdbool.h> + +#define BLK_MAX_QUEUE_SIZE 127 + +static bool virtio_blk_handle_request(BlockDriverState *bs, + VirtIODevice *vdev, VirtQueue *vq) +{ + struct iovec iov[vq->vring.num]; + unsigned int head, out_num, in_num, wlen; + struct virtio_blk_inhdr *in; + struct virtio_blk_outhdr *out; + + head = get_vq_desc(vq, iov, &out_num, &in_num); + /* FIXME: if get_vq_desc returns an err, we don't need to know vq */ + if (head == vq->vring.num) { + return false; + } + + if (out_num == 0 || in_num == 0) { + return false; + } + + out = convert(&iov[0], struct virtio_blk_outhdr); + in = convert(&iov[out_num+in_num-1], struct virtio_blk_inhdr); + off_t off = out->sector; + + if (out->type & VIRTIO_BLK_T_SCSI_CMD) { + in->status = VIRTIO_BLK_S_UNSUPP; + wlen = sizeof(in); + } else if (out->type & VIRTIO_BLK_T_OUT) { + int i; + + wlen = sizeof(in); + for (i = 0; i < out_num - 1; i++) { + bdrv_write(bs, off, iov[i + 1].iov_base, + iov[i + 1].iov_len / 512); + off += iov[i + 1].iov_len / 512; + } + in->status = VIRTIO_BLK_S_OK; + } else { + int i; + + wlen = sizeof(in); + for (i = 0; i < in_num - 1; i++) { + bdrv_read(bs, off, iov[i + 1].iov_base, + iov[i + 1].iov_len / 512); + off += iov[i + 1].iov_len / 512; + wlen += iov[i + 1].iov_len; + } + in->status = VIRTIO_BLK_S_OK; + } + + add_used(vq, head, wlen); + /* FIXME: the blk code shouldn't have to know implementation details of the + virtio layer so this should be abstracted into a ->kick() */ + vdev->isr = 1; + virtio_update_irq(vdev); + + return true; +} + +static void virtio_blk_handle_output(void *opaque, VirtIODevice *vdev, + VirtQueue *vq) +{ + BlockDriverState *bs = opaque; + + while (virtio_blk_handle_request(bs, vdev, vq)); +} + +static uint32_t virtio_blk_get_config(void *opaque, uint32_t addr) +{ + BlockDriverState *bs = opaque; + int64_t capacity; + + switch (addr) { + case VIRTIO_CONFIG_BLK_F_CAPACITY: + bdrv_get_geometry(bs, &capacity); + return (uint32_t)capacity; + case VIRTIO_CONFIG_BLK_F_CAPACITY + 4: + bdrv_get_geometry(bs, &capacity); + return (uint32_t)(capacity >> 32); + case VIRTIO_CONFIG_BLK_F_SIZE_MAX: + return 8192; + case VIRTIO_CONFIG_BLK_F_SEG_MAX: + return BLK_MAX_QUEUE_SIZE - 2; + } + + return 0xFFFFFFFF; +} + +static uint32_t virtio_blk_get_features(void *opaque) +{ + return (1 << VIRTIO_BLK_F_SIZE_MAX) | + (1 << 
VIRTIO_BLK_F_SEG_MAX); +} + +static void virtio_blk_init(VirtIODevice *vdev, BlockDriverState *bs) +{ + vdev->get_config = virtio_blk_get_config; + vdev->get_features = virtio_blk_get_features; + vdev->opaque = bs; + + virtio_add_queue(vdev, BLK_MAX_QUEUE_SIZE, virtio_blk_handle_output, bs); +} + +VirtIODevice *virtio_init_pci(PCIBus *bus, const char *name, + uint16_t vendorid, uint16_t deviceid) +{ + PCIDevice *pci_dev; + VirtIODevice *vdev; + uint8_t *config; + uint16_t virtio_devid = 0x02; + + pci_dev = pci_register_device(bus, name, sizeof(VirtIODevice), + -1, NULL, NULL); + vdev = to_virtio_device(pci_dev); + + vdev->status = 0; + vdev->isr = 0; + vdev->queue_sel = 0; + memset(vdev->vq, 0, sizeof(vdev->vq)); + + config = pci_dev->config; + config[0x00] = vendorid & 0xFF; + config[0x01] = (vendorid >> 8) & 0xFF; + config[0x02] = deviceid & 0xFF; + config[0x03] = (deviceid >> 8) & 0xFF; + + config[0x09] = 0x00; + config[0x0a] = 0x00; + config[0x0b] = 0x05; + config[0x0e] = 0x00; + + config[0x2c] = vendorid & 0xFF; + config[0x2d] = (vendorid >> 8) & 0xFF; + config[0x2e] = virtio_devid & 0xFF; + config[0x2f] = (virtio_devid >> 8) & 0xFF; + + config[0x3d] = 1; + + vdev->irq = 16; + vdev->name = name; + + pci_register_io_region(pci_dev, 0, 128, + PCI_ADDRESS_SPACE_IO, virtio_map); + + if (1) { + BlockDriverState *bs = bdrv_new("vda"); + if (bdrv_open(bs, "/home/anthony/images/linux.img", BDRV_O_SNAPSHOT)) + exit(1); + virtio_blk_init(vdev, bs); + } + + return vdev; +} diff --git a/qemu/vl.h b/qemu/vl.h index 01aeabc..985035e 100644 --- a/qemu/vl.h +++ b/qemu/vl.h @@ -1392,6 +1392,19 @@ typedef struct ADBDevice ADBDevice; void pci_hypercall_init(PCIBus *bus); void vmchannel_init(CharDriverState *hd, uint32_t deviceid, uint32_t index); +/* virtio.c */ + +typedef struct VirtQueue VirtQueue; +typedef struct VirtIODevice VirtIODevice; + +VirtIODevice *virtio_init_pci(PCIBus *bus, const char *name, + uint16_t vendorid, uint16_t deviceid); +void virtio_set_id(VirtIODevice *vdev, uint16_t vendor, uint16_t device); +void virtio_add_queue(VirtIODevice *vdev, unsigned int num, + void (*handle_output)(void *, VirtIODevice *, VirtQueue *), + void *opaque); +void virtio_add_config(VirtIODevice *vdev, int type, void *data, size_t size); + /* buf = NULL means polling */ typedef int ADBDeviceRequest(ADBDevice *d, uint8_t *buf_out, const uint8_t *buf, int len); |
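For reference, the guest side of the notification path this patch expects would look roughly like the sketch below. It assumes the register offsets from the draft <linux/virtio_pci.h> that the patch includes; the numeric fallbacks and the example_* names are illustrative assumptions, not part of the patch.

#include <linux/types.h>
#include <asm/io.h>

/* Offsets come from the draft <linux/virtio_pci.h>; the fallback values
 * below follow the layout that later became the legacy virtio-PCI ABI and
 * may not match this early header exactly. */
#ifndef VIRTIO_PCI_QUEUE_SEL
#define VIRTIO_PCI_QUEUE_SEL    14
#endif
#ifndef VIRTIO_PCI_QUEUE_NOTIFY
#define VIRTIO_PCI_QUEUE_NOTIFY 16
#endif

/* Select a virtqueue so that subsequent QUEUE_PFN/QUEUE_NUM accesses refer
 * to it; this lands in the VIRTIO_PCI_QUEUE_SEL case of
 * virtio_ioport_write() above. */
static void example_select_vq(unsigned long ioaddr, u16 index)
{
    outw(index, ioaddr + VIRTIO_PCI_QUEUE_SEL);
}

/* Kick the host: writing the queue index to QUEUE_NOTIFY traps to QEMU,
 * which dispatches to virtio_ring_kick() for that queue. */
static void example_kick_vq(unsigned long ioaddr, u16 index)
{
    outw(index, ioaddr + VIRTIO_PCI_QUEUE_NOTIFY);
}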
From: Avi K. <av...@qu...> - 2007-11-08 06:24:30
Anthony Liguori wrote: > + case VIRTIO_PCI_QUEUE_NOTIFY: > + if (val < VIRTIO_PCI_QUEUE_MAX) > + virtio_ring_kick(vdev, &vdev->vq[val]); > + break; > I see you're not using hypercalls for this, presumably for compatibility with -no-kvm. Well I think I have a solution: advertise vmcall, vmmcall, pio to some port, and int $some_vector as hypercall feature bits in cpuid (for kvm, kvm, qemu, and kvm-lite respectively). Early setup code could patch the instruction as appropriate (I hear code patching is now taught in second grade). (kvm could advertise all four, or maybe just the first two) -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. |
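A rough sketch of how a guest could consume such CPUID bits, choosing one notification method at init time rather than patching call sites. The leaf number, bit assignments and names below are invented for illustration; they are not an existing kvm or qemu interface.

#include <linux/types.h>
#include <asm/processor.h>   /* cpuid() */

/* Hypothetical feature leaf and bits, made up for this example. */
#define EXAMPLE_HV_FEATURES_LEAF 0x40000001
#define EXAMPLE_KICK_VMCALL  (1 << 0)   /* Intel vmcall */
#define EXAMPLE_KICK_VMMCALL (1 << 1)   /* AMD vmmcall */
#define EXAMPLE_KICK_PIO     (1 << 2)   /* write to a nominated port */
#define EXAMPLE_KICK_INT     (1 << 3)   /* int $some_vector (kvm-lite) */

enum kick_method { KICK_VMCALL, KICK_VMMCALL, KICK_PIO, KICK_INT };

/* Early setup: query the leaf once and remember which method to use (or
 * patch the kick call site accordingly, as suggested above). */
static enum kick_method example_pick_kick_method(void)
{
    u32 eax, ebx, ecx, edx;

    cpuid(EXAMPLE_HV_FEATURES_LEAF, &eax, &ebx, &ecx, &edx);
    if (eax & EXAMPLE_KICK_VMCALL)
        return KICK_VMCALL;
    if (eax & EXAMPLE_KICK_VMMCALL)
        return KICK_VMMCALL;
    if (eax & EXAMPLE_KICK_PIO)
        return KICK_PIO;
    return KICK_INT;
}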
From: Anthony L. <ali...@us...> - 2007-11-08 13:57:31
Avi Kivity wrote: > Anthony Liguori wrote: > >> + case VIRTIO_PCI_QUEUE_NOTIFY: >> + if (val < VIRTIO_PCI_QUEUE_MAX) >> + virtio_ring_kick(vdev, &vdev->vq[val]); >> + break; >> >> > > I see you're not using hypercalls for this, presumably for compatibility > with -no-kvm. More than just that. By sticking to PIO, we are compatible with just about any VMM. For instance, we get Xen support for free. If we used hypercalls, even if we agreed on a way to determine which number to use and how to make those calls, it would still be difficult to implement in something like Xen. > Well I think I have a solution: advertise vmcall, > vmmcall, pio to some port, and int $some_vector as hypercall feature > bits in cpuid (for kvm, kvm, qemu, and kvm-lite respectively). Early > setup code could patch the instruction as appropriate (I hear code > patching is now taught in second grade). > That ties our device to our particular hypercall implementation. If we were going to do this, I'd prefer to advertise it in the device I think. I really would need to look at the performance of vmcall and an edge-triggered interrupt, though. It would have to be pretty compelling to warrant the additional complexity I think. Regards, Anthony Liguori > (kvm could advertise all four, or maybe just the first two) > >
From: Avi K. <av...@qu...> - 2007-11-08 14:03:46
Anthony Liguori wrote: > Avi Kivity wrote: >> Anthony Liguori wrote: >> >>> + case VIRTIO_PCI_QUEUE_NOTIFY: >>> + if (val < VIRTIO_PCI_QUEUE_MAX) >>> + virtio_ring_kick(vdev, &vdev->vq[val]); >>> + break; >>> >> >> I see you're not using hypercalls for this, presumably for compatibility >> with -no-kvm. > > More than just that. By stick to PIO, we are compatible with just > about any VMM. For instance, we get Xen support for free. If we used > hypercalls, even if we agreed on a way to determine which number to > use and how to make those calls, it would still be difficult to > implement in something like Xen. > But pio through the config space basically means you're committed to handling it in qemu. We want a more flexible mechanism. Detecting how to make hypercalls can be left to paravirt_ops. (for Xen you'd use an event channel; and for kvm the virtio kick hypercall). >> Well I think I have a solution: advertise vmcall, >> vmmcall, pio to some port, and int $some_vector as hypercall feature >> bits in cpuid (for kvm, kvm, qemu, and kvm-lite respectively). Early >> setup code could patch the instruction as appropriate (I hear code >> patching is now taught in second grade). >> > > That ties our device to our particular hypercall implementation. If > we were going to do this, I'd prefer to advertise it in the device I > think. I really would need to look at the performance though of > vmcall and an edge triggered interrupt. It would have to be pretty > compelling to warrant the additional complexity I think. vmcall costs will go down, and we don't want to use different mechanisms for high bandwidth and low bandwidth devices. -- error compiling committee.c: too many arguments to function |
From: Anthony L. <ali...@us...> - 2007-11-08 15:09:28
Avi Kivity wrote: > Anthony Liguori wrote: >> Avi Kivity wrote: >>> Anthony Liguori wrote: >>> >>>> + case VIRTIO_PCI_QUEUE_NOTIFY: >>>> + if (val < VIRTIO_PCI_QUEUE_MAX) >>>> + virtio_ring_kick(vdev, &vdev->vq[val]); >>>> + break; >>>> >>> >>> I see you're not using hypercalls for this, presumably for >>> compatibility >>> with -no-kvm. >> >> More than just that. By stick to PIO, we are compatible with just >> about any VMM. For instance, we get Xen support for free. If we >> used hypercalls, even if we agreed on a way to determine which number >> to use and how to make those calls, it would still be difficult to >> implement in something like Xen. >> > > But pio through the config space basically means you're committed to > handling it in qemu. We want a more flexible mechanism. There's no reason that the PIO operations couldn't be handled in the kernel. You'll already need some level of cooperation in userspace unless you plan on implementing the PCI bus in kernel space too. It's easy enough in the pci_map function in QEMU to just notify the kernel that it should listen on a particular PIO range. > Detecting how to make hypercalls can be left to paravirt_ops. > > (for Xen you'd use an event channel; and for kvm the virtio kick > hypercall). > >>> Well I think I have a solution: advertise vmcall, >>> vmmcall, pio to some port, and int $some_vector as hypercall feature >>> bits in cpuid (for kvm, kvm, qemu, and kvm-lite respectively). Early >>> setup code could patch the instruction as appropriate (I hear code >>> patching is now taught in second grade). >>> >> >> That ties our device to our particular hypercall implementation. If >> we were going to do this, I'd prefer to advertise it in the device I >> think. I really would need to look at the performance though of >> vmcall and an edge triggered interrupt. It would have to be pretty >> compelling to warrant the additional complexity I think. > > vmcall costs will go down, and we don't want to use different > mechanisms for high bandwidth and low bandwidth devices. vmcalls will certainly get faster but I doubt that the cost difference between vmcall and pio will ever be greater than a few hundred cycles. The only performance sensitive operation here would be the kick and I don't think a few hundred cycles in the kick path is ever going to be that significant for overall performance. So why introduce the extra complexity? Regards, Anthony Liguori > > |
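As a sketch only, the registration described above could be issued from the BAR map callback. The ioctl number, payload structure and kvm_fd handle are hypothetical (no such kvm ioctl exists at this point), and the surrounding types are assumed from the patch's vl.h environment; only the idea of forwarding the assigned PIO range to the kernel is the point.

/* Hypothetical request: "route guest accesses to this PIO range to the
 * in-kernel backend identified by cookie". */
struct example_pio_range {
    uint16_t start;
    uint16_t size;
    uint64_t cookie;                     /* identifies the virtio device */
};

#define EXAMPLE_KVM_SET_PIO_HANDLER 0    /* made-up ioctl number */

static void example_virtio_map(PCIDevice *pci_dev, int region_num,
                               uint32_t addr, uint32_t size, int type)
{
    VirtIODevice *vdev = to_virtio_device(pci_dev);
    struct example_pio_range range = {
        .start  = addr,
        .size   = size,
        .cookie = (uintptr_t)vdev,
    };

    vdev->addr = addr;
    /* register_ioport_read()/register_ioport_write() as in the patch, so
     * userspace still handles the slow-path accesses ... */

    /* ... and additionally ask the kernel to intercept the range, so a
     * QUEUE_NOTIFY write never has to bounce through qemu (kvm_fd stands
     * in for however the kvm file descriptor is reached from here). */
    if (ioctl(kvm_fd, EXAMPLE_KVM_SET_PIO_HANDLER, &range) < 0)
        fprintf(stderr, "virtio: no in-kernel PIO handler, staying in qemu\n");
}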
From: Avi K. <av...@qu...> - 2007-11-08 15:20:06
Anthony Liguori wrote: > Avi Kivity wrote: > >> Anthony Liguori wrote: >> >>> Avi Kivity wrote: >>> >>>> Anthony Liguori wrote: >>>> >>>> >>>>> + case VIRTIO_PCI_QUEUE_NOTIFY: >>>>> + if (val < VIRTIO_PCI_QUEUE_MAX) >>>>> + virtio_ring_kick(vdev, &vdev->vq[val]); >>>>> + break; >>>>> >>>>> >>>> I see you're not using hypercalls for this, presumably for >>>> compatibility >>>> with -no-kvm. >>>> >>> More than just that. By stick to PIO, we are compatible with just >>> about any VMM. For instance, we get Xen support for free. If we >>> used hypercalls, even if we agreed on a way to determine which number >>> to use and how to make those calls, it would still be difficult to >>> implement in something like Xen. >>> >>> >> But pio through the config space basically means you're committed to >> handling it in qemu. We want a more flexible mechanism. >> > > There's no reason that the PIO operations couldn't be handled in the > kernel. You'll already need some level of cooperation in userspace > unless you plan on implementing the PCI bus in kernel space too. It's > easy enough in the pci_map function in QEMU to just notify the kernel > that it should listen on a particular PIO range. > > This is a config space write, right? If so, the range is the regular 0xcf8-0xcff and it has to be very specially handled. >> Detecting how to make hypercalls can be left to paravirt_ops. >> >> (for Xen you'd use an event channel; and for kvm the virtio kick >> hypercall). >> >> >>>> Well I think I have a solution: advertise vmcall, >>>> vmmcall, pio to some port, and int $some_vector as hypercall feature >>>> bits in cpuid (for kvm, kvm, qemu, and kvm-lite respectively). Early >>>> setup code could patch the instruction as appropriate (I hear code >>>> patching is now taught in second grade). >>>> >>>> >>> That ties our device to our particular hypercall implementation. If >>> we were going to do this, I'd prefer to advertise it in the device I >>> think. I really would need to look at the performance though of >>> vmcall and an edge triggered interrupt. It would have to be pretty >>> compelling to warrant the additional complexity I think. >>> >> vmcall costs will go down, and we don't want to use different >> mechanisms for high bandwidth and low bandwidth devices. >> > > vmcalls will certainly get faster but I doubt that the cost difference > between vmcall and pio will ever be greater than a few hundred cycles. > The only performance sensitive operation here would be the kick and I > don't think a few hundred cycles in the kick path is ever going to be > that significant for overall performance. > > Why do you think the difference will be a few hundred cycles? And if you have a large number of devices, searching the list becomes expensive too. > So why introduce the extra complexity? > Overall I think it reduces complexity if we have in-kernel devices. Anyway we can add additional signalling methods later. -- error compiling committee.c: too many arguments to function
From: Anthony L. <ali...@us...> - 2007-11-08 16:23:46
Avi Kivity wrote: >> There's no reason that the PIO operations couldn't be handled in the >> kernel. You'll already need some level of cooperation in userspace >> unless you plan on implementing the PCI bus in kernel space too. >> It's easy enough in the pci_map function in QEMU to just notify the >> kernel that it should listen on a particular PIO range. >> >> > > This is a config space write, right? If so, the range is the regular > 0xcf8-0xcff and it has to be very specially handled. This is a per-device IO slot and as best as I can tell, the PCI device advertises the size of the region and the OS then identifies a range of PIO space to use and tells the PCI device about it. So we would just need to implement a generic userspace virtio PCI device in QEMU that did an ioctl to the kernel when this happened to tell the kernel what region to listen on for a particular device. >> vmcalls will certainly get faster but I doubt that the cost >> difference between vmcall and pio will ever be greater than a few >> hundred cycles. The only performance sensitive operation here would >> be the kick and I don't think a few hundred cycles in the kick path >> is ever going to be that significant for overall performance. >> >> > > Why do you think the different will be a few hundred cycles? The only difference in hardware between a PIO exit and a vmcall is that you don't have to write out an exit reason in the VMC[SB]. So the performance difference between PIO/vmcall shouldn't be that great (and if it were, the difference would probably be obvious today). That's different from, say, a PF exit because with a PF, you also have to attempt to resolve it by walking the guest page table before determining that you do in fact need to exit. > And if you have a large number of devices, searching the list > becomes expensive too. The PIO address space is relatively small. You could do a radix tree or even a direct array lookup if you are concerned about performance. >> So why introduce the extra complexity? >> > > Overall I think it reduces comlexity if we have in-kernel devices. > Anyway we can add additional signalling methods later. In-kernel virtio backends add quite a lot of complexity. Just the mechanism to set up the device is complicated enough. I suspect that it'll be necessary down the road for performance but I certainly don't think it's a simplification. Regards, Anthony Liguori
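For the lookup itself, a direct array over the 64K port space is about as simple as it gets. A sketch, with made-up names, of what an in-kernel fast path could do on a PIO write exit:

#include <stdint.h>

#define PIO_PORTS 65536

struct example_pio_handler {
    void (*write)(void *opaque, uint16_t port, uint32_t val, int size);
    void *opaque;
};

/* One slot per port: 64K pointers is half a megabyte per VM on 64-bit,
 * which buys a constant-time lookup with no list walking. */
static struct example_pio_handler *example_pio_table[PIO_PORTS];

/* Called from the PIO exit path. */
static int example_handle_pio_write(uint16_t port, uint32_t val, int size)
{
    struct example_pio_handler *h = example_pio_table[port];

    if (!h)
        return 0;    /* not claimed by the kernel: punt to userspace */
    h->write(h->opaque, port, val, size);
    return 1;        /* handled without leaving the kernel */
}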
From: Avi K. <av...@qu...> - 2007-11-11 09:24:32
Anthony Liguori wrote: > Avi Kivity wrote: >>> There's no reason that the PIO operations couldn't be handled in the >>> kernel. You'll already need some level of cooperation in userspace >>> unless you plan on implementing the PCI bus in kernel space too. >>> It's easy enough in the pci_map function in QEMU to just notify the >>> kernel that it should listen on a particular PIO range. >>> >>> >> >> This is a config space write, right? If so, the range is the regular >> 0xcf8-0xcff and it has to be very specially handled. > > This is a per-device IO slot and as best as I can tell, the PCI device > advertises the size of the region and the OS then identifies a range > of PIO space to use and tells the PCI device about it. So we would > just need to implement a generic userspace virtio PCI device in QEMU > that did an ioctl to the kernel when this happened to tell the kernel > what region to listen on for a particular device. > I'll just go and read the patches more carefully before making any more stupid remarks about the code. >>> vmcalls will certainly get faster but I doubt that the cost >>> difference between vmcall and pio will ever be greater than a few >>> hundred cycles. The only performance sensitive operation here would >>> be the kick and I don't think a few hundred cycles in the kick path >>> is ever going to be that significant for overall performance. >>> >>> >> >> Why do you think the different will be a few hundred cycles? > > The only difference in hardware between a PIO exit and a vmcall is > that you don't have write out an exit reason in the VMC[SB]. So the > performance difference between PIO/vmcall shouldn't be that great (and > if it were, the difference would probably be obvious today). That's > different from, say, a PF exit because with a PF, you also have to > attempt to resolve it by walking the guest page table before > determining that you do in fact need to exit. > You have to look at the pio bitmaps with pio. Point taken though. > >>> So why introduce the extra complexity? >>> >> >> Overall I think it reduces comlexity if we have in-kernel devices. >> Anyway we can add additional signalling methods later. > > In-kernel virtio backends add quite a lot of complexity. Just the > mechanism to setup the device is complicated enough. I suspect that > it'll be necessary down the road for performance but I certainly don't > think it's a simplification. I didn't mean that in-kernel devices simplify things (they don't), but that using hypercalls is simpler for in-kernel than pio. -- error compiling committee.c: too many arguments to function |
From: Avi K. <av...@qu...> - 2007-11-08 15:32:01
Anthony Liguori wrote: >>>> >>>>> + case VIRTIO_PCI_QUEUE_NOTIFY: >>>>> + if (val < VIRTIO_PCI_QUEUE_MAX) >>>>> + virtio_ring_kick(vdev, &vdev->vq[val]); >>>>> + break; >>>>> >>>> >>>> I see you're not using hypercalls for this, presumably for >>>> compatibility >>>> with -no-kvm. >>> >>> More than just that. By stick to PIO, we are compatible with just >>> about any VMM. For instance, we get Xen support for free. If we >>> used hypercalls, even if we agreed on a way to determine which >>> number to use and how to make those calls, it would still be >>> difficult to implement in something like Xen. >>> >> >> But pio through the config space basically means you're committed to >> handling it in qemu. We want a more flexible mechanism. > > There's no reason that the PIO operations couldn't be handled in the > kernel. You'll already need some level of cooperation in userspace > unless you plan on implementing the PCI bus in kernel space too. It's > easy enough in the pci_map function in QEMU to just notify the kernel > that it should listen on a particular PIO range. With my new understanding of what this is all about, I suggest each virtqueue having an ID filled in by the host. This ID is globally unique, and is used as an argument for kick. It would map into a Xen domain id + event channel number, a number to be written into a pio port for kvm-lite or non-hypercall kvm, the argument for a kick hypercall on kvm, or whatever. This is independent of virtio-pci, which is good. -- error compiling committee.c: too many arguments to function |
From: Anthony L. <ali...@us...> - 2007-11-08 20:37:20
Avi Kivity wrote: > Anthony Liguori wrote: >>>>> >>>>>> + case VIRTIO_PCI_QUEUE_NOTIFY: >>>>>> + if (val < VIRTIO_PCI_QUEUE_MAX) >>>>>> + virtio_ring_kick(vdev, &vdev->vq[val]); >>>>>> + break; >>>>>> >>>>> >>>>> I see you're not using hypercalls for this, presumably for >>>>> compatibility >>>>> with -no-kvm. >>>> >>>> More than just that. By stick to PIO, we are compatible with just >>>> about any VMM. For instance, we get Xen support for free. If we >>>> used hypercalls, even if we agreed on a way to determine which >>>> number to use and how to make those calls, it would still be >>>> difficult to implement in something like Xen. >>>> >>> >>> But pio through the config space basically means you're committed to >>> handling it in qemu. We want a more flexible mechanism. >> >> There's no reason that the PIO operations couldn't be handled in the >> kernel. You'll already need some level of cooperation in userspace >> unless you plan on implementing the PCI bus in kernel space too. >> It's easy enough in the pci_map function in QEMU to just notify the >> kernel that it should listen on a particular PIO range. > > With my new understanding of what this is all about, I suggest each > virtqueue having an ID filled in by the host. This ID is globally > unique, and is used as an argument for kick. It would map into a Xen > domain id + event channel number, a number to be written into a pio > port for kvm-lite or non-hypercall kvm, the argument for a kick > hypercall on kvm, or whatever. Yeah, right now, I maintain a virtqueue "selector" within virtio-pci and use that for notification. This index is also exposed in the config->find_vq() within virtio. Changing that to an opaque ID would require introducing another mechanism to enumerate the virtqueues since you couldn't just start from 0 and keep going until you hit an invalid virtqueue. I'm not sure I'm convinced that you couldn't just hide this "id" notion in the virtio-xen implementation if you needed to. Regards, Anthony Liguori > This is independent of virtio-pci, which is good. > |
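The enumeration Anthony refers to works because a queue the device never registered reads back a size of 0 through VIRTIO_PCI_QUEUE_NUM in the QEMU code above. A guest-side probe loop might look like the following sketch (register names from the draft header, ioaddr and the outw/inw helpers assumed as in the earlier sketch):

/* Select queue `index`, then read its ring size; 0 means "no such queue",
 * which is the invalid marker the 0..N enumeration depends on. */
static unsigned int example_vq_size(unsigned long ioaddr, u16 index)
{
    outw(index, ioaddr + VIRTIO_PCI_QUEUE_SEL);
    return inw(ioaddr + VIRTIO_PCI_QUEUE_NUM);
}

static void example_enumerate_vqs(unsigned long ioaddr)
{
    u16 i;

    for (i = 0; ; i++) {
        unsigned int num = example_vq_size(ioaddr, i);
        if (num == 0)
            break;   /* first hole ends the enumeration */
        /* allocate a ring with `num` entries, hand its PFN to the
         * device, hook up a handler for this queue, ... */
    }
}

An opaque per-queue ID, by contrast, would need a separate discovery mechanism (a queue count register, a list in config space, or similar), which is the extra machinery Anthony mentions.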
From: Dor L. <dor...@gm...> - 2007-11-09 00:13:47
Anthony Liguori wrote: > Avi Kivity wrote: > >>> There's no reason that the PIO operations couldn't be handled in the >>> kernel. You'll already need some level of cooperation in userspace >>> unless you plan on implementing the PCI bus in kernel space too. >>> It's easy enough in the pci_map function in QEMU to just notify the >>> kernel that it should listen on a particular PIO range. >>> >>> >>> >> This is a config space write, right? If so, the range is the regular >> 0xcf8-0xcff and it has to be very specially handled. >> > > This is a per-device IO slot and as best as I can tell, the PCI device > advertises the size of the region and the OS then identifies a range of > PIO space to use and tells the PCI device about it. So we would just > need to implement a generic userspace virtio PCI device in QEMU that did > an ioctl to the kernel when this happened to tell the kernel what region > to listen on for a particular device. > > >>> vmcalls will certainly get faster but I doubt that the cost >>> difference between vmcall and pio will ever be greater than a few >>> hundred cycles. The only performance sensitive operation here would >>> be the kick and I don't think a few hundred cycles in the kick path >>> is ever going to be that significant for overall performance. >>> >>> >>> >> Why do you think the different will be a few hundred cycles? >> > > The only difference in hardware between a PIO exit and a vmcall is that > you don't have write out an exit reason in the VMC[SB]. So the > performance difference between PIO/vmcall shouldn't be that great (and > if it were, the difference would probably be obvious today). That's > different from, say, a PF exit because with a PF, you also have to > attempt to resolve it by walking the guest page table before determining > that you do in fact need to exit. > > >> And if you have a large number of devices, searching the list >> becomes expensive too. >> > > The PIO address space is relatively small. You could do a radix tree or > even a direct array lookup if you are concerned about performance. > > >>> So why introduce the extra complexity? >>> >>> >> Overall I think it reduces comlexity if we have in-kernel devices. >> Anyway we can add additional signalling methods later. >> > > In-kernel virtio backends add quite a lot of complexity. Just the > mechanism to setup the device is complicated enough. I suspect that > it'll be necessary down the road for performance but I certainly don't > think it's a simplification. > > I believe that the network interface will quickly go to the kernel, since copying takes most of the cpu time and qemu does not support scatter-gather dma at the moment. Nevertheless, using pio seems good enough; Anthony's suggestion of notifying the kernel using ioctls is logical. If we run into trouble further on, we can add a hypercall capability and, where one exists, use hypercalls instead of pio. > Regards, > > Anthony Liguori
From: Dor L. <dor...@gm...> - 2007-11-09 00:25:56
Anthony Liguori wrote: > This still needs quite a lot of work but I wanted to post it for reference. > > Regards, > > Anthony Liguori > > diff --git a/qemu/Makefile.target b/qemu/Makefile.target > ... Why change Rusty's coding standard? It will be harder to track changes. > +typedef struct VRingDesc > +{ > + uint64_t addr; > + uint32_t len; > + uint16_t flags; > + uint16_t next; > +} VRingDesc; > + > +typedef struct VRingAvail > +{ > + uint16_t flags; > + uint16_t idx; > + uint16_t ring[0]; > +} VRingAvail; > + > +typedef struct VRingUsedElem > +{ > + uint32_t id; > + uint32_t len; > +} VRingUsedElem; > + > +typedef struct VRingUsed > +{ > + uint16_t flags; > + uint16_t idx; > + VRingUsedElem ring[0]; > +} VRingUsed; > + > +typedef struct VRing > +{ > + unsigned int num; > + VRingDesc *desc; > + VRingAvail *avail; > + VRingUsed *used; > +} VRing; > + .... > +static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val) > +{ > + VirtIODevice *vdev = to_virtio_device(opaque); > + ram_addr_t pa; > + > + addr -= vdev->addr; > + > + switch (addr) { > + case VIRTIO_PCI_GUEST_FEATURES: > + if (vdev->set_features) > + vdev->set_features(vdev->opaque, val); > + vdev->features = val; > + break; > + case VIRTIO_PCI_QUEUE_PFN: > + pa = (ram_addr_t)val << TARGET_PAGE_BITS; > + vdev->vq[vdev->queue_sel].pfn = val; > Some validity checks are missing; you assume you have a valid queue_sel. > + if (pa < (ram_size - TARGET_PAGE_SIZE)) > + vring_init(&vdev->vq[vdev->queue_sel], phys_ram_base + pa); > + break; > + case VIRTIO_PCI_QUEUE_SEL: > + if (val < VIRTIO_PCI_QUEUE_MAX) > + vdev->queue_sel = val; > + break; > + case VIRTIO_PCI_QUEUE_NOTIFY: > + if (val < VIRTIO_PCI_QUEUE_MAX) > + virtio_ring_kick(vdev, &vdev->vq[val]); > + break; > + case VIRTIO_PCI_STATUS: > + vdev->status = val & 0xFF; > We should keep another internal status that tracks the initialization of all the above fields (pfn, queue_sel, ...); the device will become active once all of them have been initialized by the guest. > + break; > + default: > + if (addr >= VIRTIO_PCI_CONFIG && vdev->set_config) > + vdev->set_config(vdev->opaque, addr - VIRTIO_PCI_CONFIG, val); > + break; > + } > +} > + > What about having block/net/9p.. in separate files? It will grow over time. > +#include <linux/virtio_blk.h> > +#include <stdbool.h> > + > +#define BLK_MAX_QUEUE_SIZE 127 > + > +static bool virtio_blk_handle_request(BlockDriverState *bs, > + VirtIODevice *vdev, VirtQueue *vq) > +{ > + struct iovec iov[vq->vring.num]; > + unsigned int head, out_num, in_num, wlen; > + struct virtio_blk_inhdr *in; > + struct virtio_blk_outhdr *out; > + Great job, Dor.
From: Anthony L. <ali...@us...> - 2007-11-09 01:38:24
Dor Laor wrote: > Anthony Liguori wrote: >> This still needs quite a lot of work but I wanted to post it for >> reference. >> >> Regards, >> >> Anthony Liguori >> >> diff --git a/qemu/Makefile.target b/qemu/Makefile.target >> > ... > Why change Rusty's codding standard? It will be harder to track changes. Because Linux kernel coding standards aren't QEMU coding standards. Besides, this is supposed to be an ABI so it shouldn't be changing all that much :-) I posted the QEMU bits as soon as I got it working. I still have a lot to do in it however I have addressed some of the things you brought up already. >> + case VIRTIO_PCI_QUEUE_PFN: >> + pa = (ram_addr_t)val << TARGET_PAGE_BITS; >> + vdev->vq[vdev->queue_sel].pfn = val; >> > Some validity checks are missing, you assume you have the queue_sel. Yes, the code is carefully written so that queue_sel is always valid. >> + if (pa < (ram_size - TARGET_PAGE_SIZE)) >> + vring_init(&vdev->vq[vdev->queue_sel], phys_ram_base + pa); >> + break; >> + case VIRTIO_PCI_QUEUE_SEL: >> + if (val < VIRTIO_PCI_QUEUE_MAX) >> + vdev->queue_sel = val; >> + break; >> + case VIRTIO_PCI_QUEUE_NOTIFY: >> + if (val < VIRTIO_PCI_QUEUE_MAX) >> + virtio_ring_kick(vdev, &vdev->vq[val]); >> + break; >> + case VIRTIO_PCI_STATUS: >> + vdev->status = val & 0xFF; >> > we should keep another internal status and it will track the > initialization of all the above fields ( > pfn, queue_sel,..) the device will be active once all of them were > initialized by the guest Hrm, I don't follow. The only thing that has to be written to by the guest is the PFN which also has the effect of activating the queue. >> + break; >> + default: >> + if (addr >= VIRTIO_PCI_CONFIG && vdev->set_config) >> + vdev->set_config(vdev->opaque, addr - VIRTIO_PCI_CONFIG, val); >> + break; >> + } >> +} >> + >> > > What about having block/net/9p.. in separate files? It will grow over > time. Yup, already have that in my own queue. The latest version queue for the kernel side is at http://hg.codemonkey.ws/virtio-pci (based on the master branch of Rusty's virtio tree). The latest queue for the QEMU side is at http://hg.codemonkey.ws/qemu-virtio. I have a functioning block and 9p transport. I'll continue cleaning up tomorrow and will hopefully post another set of patches early next week. Unfortunately, I uncovered a bug in the in-kernel APIC code today so you need to run with -no-kvm-irqchip if you want to use multiple virtio devices at once :-/ Regards, Anthony Liguori >> +#include <linux/virtio_blk.h> >> +#include <stdbool.h> >> + >> +#define BLK_MAX_QUEUE_SIZE 127 >> + >> +static bool virtio_blk_handle_request(BlockDriverState *bs, >> + VirtIODevice *vdev, VirtQueue *vq) >> +{ >> + struct iovec iov[vq->vring.num]; >> + unsigned int head, out_num, in_num, wlen; >> + struct virtio_blk_inhdr *in; >> + struct virtio_blk_outhdr *out; >> + > Great job, Dor. |
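Concretely, the activation described above is a single register write once the guest has allocated the ring. A guest-side sketch, with the same assumed offsets as the earlier sketches and with vring_size() standing in for however much memory the ring layout needs:

#include <linux/gfp.h>
#include <asm/page.h>
#include <asm/io.h>

/* Allocate the ring and hand its page frame number to the device.  In the
 * QEMU code above, this QUEUE_PFN write is what triggers vring_init(), so
 * it doubles as the activation step; there is no separate enable register
 * in this draft. */
static void *example_activate_vq(unsigned long ioaddr, u16 index,
                                 unsigned int num)
{
    void *ring = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
                                          get_order(vring_size(num)));
    if (!ring)
        return NULL;

    outw(index, ioaddr + VIRTIO_PCI_QUEUE_SEL);
    outl(virt_to_phys(ring) >> PAGE_SHIFT, ioaddr + VIRTIO_PCI_QUEUE_PFN);
    return ring;
}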
From: Christian B. <bor...@de...> - 2007-11-20 08:39:26
Am Freitag, 9. November 2007 schrieb Dor Laor: > I believe that the network interface will quickly go to the kernel since > copy takes most of the > cpu time and qemu does not support scatter gather dma at the moment. > Nevertheless using pio seems good enough, Anthony's suggestion of > notifying the kernel using ioctls > is logical. If we'll run into troubles further on we can add a hypercall > capability and if exist use hypercalls > instead of pios. Sorry for being late in this thread. We (s390) will need a hypercall as we do not have port I/O. I think it should be possible to default to hypercall on s390 and use pio everywhere else. Christian |
From: Avi K. <av...@qu...> - 2007-11-20 10:00:03
Christian Borntraeger wrote: > Am Freitag, 9. November 2007 schrieb Dor Laor: > >> I believe that the network interface will quickly go to the kernel since >> copy takes most of the >> cpu time and qemu does not support scatter gather dma at the moment. >> Nevertheless using pio seems good enough, Anthony's suggestion of >> notifying the kernel using ioctls >> is logical. If we'll run into troubles further on we can add a hypercall >> capability and if exist use hypercalls >> instead of pios. >> > > Sorry for being late in this thread. > We (s390) will need a hypercall as we do not have port I/O. I think it should be > possible to default to hypercall on s390 and use pio everywhere else. > Or be generic: advertise the methods available according to host (kvm/x86, qemu/x86, kvm/s390) and let the guest pick. -- error compiling committee.c: too many arguments to function |
From: Arnd B. <ar...@ar...> - 2007-11-20 10:17:32
On Tuesday 20 November 2007, Avi Kivity wrote: > > > > Sorry for being late in this thread. > > We (s390) will need a hypercall as we do not have port I/O. I think it should be > > possible to default to hypercall on s390 and use pio everywhere else. > > > > Or be generic: advertise the methods available according to host > (kvm/x86, qemu/x86, kvm/s390) and let the guest pick. Not sure if I'm following the reasoning here. Shouldn't the method be inherent to the virtio bus driver? When you use a PCI based virtio bus, the natural choice would be PIO in some way, but you could also have a different virtio implementation on PCI that uses hcalls. This choice is completely up to virtio-pci. On s390, you have a different virtio backend altogether, so you always use DIAG or hcall instead of whatever virtio-pci does. The virtio-blk and other high-level drivers don't need to care about what transport the bus uses in the first place. Arnd <><
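That split shows up in the driver-facing interface: the block or net driver only ever invokes an operation on the queue, and each transport supplies its own kick. A sketch in the spirit of the virtio ops structures (names and fields here are illustrative, not the exact upstream definitions):

/* What a transport hands to virtio-blk and friends. */
struct example_virtqueue {
    unsigned int  index;     /* virtio-pci queue selector */
    unsigned long ioaddr;    /* virtio-pci I/O BAR base   */
    unsigned long token;     /* s390 queue token          */
    int           evtchn;    /* Xen event channel         */
    /* ring pointers, ops pointer, driver data, ... */
};

struct example_virtqueue_ops {
    int  (*add_buf)(struct example_virtqueue *vq, void *data);
    void (*kick)(struct example_virtqueue *vq);
};

/* virtio-pci transport: kick by writing the queue index to the PIO BAR. */
static void example_pci_kick(struct example_virtqueue *vq)
{
    outw(vq->index, vq->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY);
}

/* s390 transport (no port I/O): kick via a hypercall/DIAGNOSE, with the
 * queue named by a token; example_hypercall() and the call number are
 * assumed helpers, not an existing interface. */
#define EXAMPLE_KICK_HCALL_NR 1
extern void example_hypercall(unsigned long nr, unsigned long arg);

static void example_s390_kick(struct example_virtqueue *vq)
{
    example_hypercall(EXAMPLE_KICK_HCALL_NR, vq->token);
}

/* Xen-style transport: kick by signalling an event channel. */
static void example_xen_kick(struct example_virtqueue *vq)
{
    notify_remote_via_evtchn(vq->evtchn);
}

virtio-blk simply calls the queue's kick operation (in whatever form the ops end up taking) and never sees the difference.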
From: Carsten O. <co...@de...> - 2007-11-20 11:05:30
Arnd Bergmann wrote: > Not sure if I'm following the reasoning here. Shouldn't the method be > inherent to the virtio bus driver? > > When you use a PCI based virtio bus, the natural choice would be PIO > in some way, but you could also have a different virtio implementation > on PCI that uses hcalls. This choice is completely up to virtio-pci. > > On s390, you have a different virtio backend altogether, so you always > use DIAG or hcall instead of whatever virtio-pci does. > > The virtio-blk and other high-level drivers don't need to care about > what transport the bus uses in the first place. If you look at HPA's virtio PCI bus in lguest, it uses PCI device organization but no other PCI features. That's where we're heading, because we don't want a different virtio backend. |