You can subscribe to this list here.
2006 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
|
Aug
|
Sep
|
Oct
(33) |
Nov
(325) |
Dec
(320) |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2007 |
Jan
(484) |
Feb
(438) |
Mar
(407) |
Apr
(713) |
May
(831) |
Jun
(806) |
Jul
(1023) |
Aug
(1184) |
Sep
(1118) |
Oct
(1461) |
Nov
(1224) |
Dec
(1042) |
2008 |
Jan
(1449) |
Feb
(1110) |
Mar
(1428) |
Apr
(1643) |
May
(682) |
Jun
|
Jul
|
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
From: Dietmar M. <di...@pr...> - 2008-04-16 13:29:31
|
Hi all, I am glad to announce the first beta release of 'Proxmox Virtual Environment' - an open source virtualization platform for the enterprise. The main features are: - All code is GPL - OpenVZ and KVM support - bare metal installer (debian etch 64) - Backup/restore with vzdump/LVM2 - web based management - integrated virtual appliance download (including certified appliances) - configuration cluster You can find more information at http://pve.proxmox.com We encourage anyone interested to download and test. The CD image is available at: http://pve.proxmox.com/wiki/Downloads Let us know what you think! Best regards, Dietmar -------------------------------------------------- Dietmar Maurer Proxmox Server Solutions GmbH CTO di...@pr... http://www.proxmox.com -------------------------------------------------- |
From: <be...@il...> - 2008-04-16 13:26:53
|
From: Ben-Ami Yassour <be...@il...> Signed-off-by: Ben-Ami Yassour <be...@il...> Signed-off-by: Muli Ben-Yehuda <mu...@il...> --- libkvm/libkvm.c | 24 ++++++++---- qemu/hw/pci-passthrough.c | 89 +++++++++++---------------------------------- qemu/hw/pci-passthrough.h | 2 + 3 files changed, 40 insertions(+), 75 deletions(-) diff --git a/libkvm/libkvm.c b/libkvm/libkvm.c index de91328..8c02af9 100644 --- a/libkvm/libkvm.c +++ b/libkvm/libkvm.c @@ -400,7 +400,7 @@ void *kvm_create_userspace_phys_mem(kvm_context_t kvm, unsigned long phys_start, { int r; int prot = PROT_READ; - void *ptr; + void *ptr = NULL; struct kvm_userspace_memory_region memory = { .memory_size = len, .guest_phys_addr = phys_start, @@ -410,16 +410,24 @@ void *kvm_create_userspace_phys_mem(kvm_context_t kvm, unsigned long phys_start, if (writable) prot |= PROT_WRITE; - ptr = mmap(NULL, len, prot, MAP_ANONYMOUS | MAP_SHARED, -1, 0); - if (ptr == MAP_FAILED) { - fprintf(stderr, "create_userspace_phys_mem: %s", strerror(errno)); - return 0; - } + if (len > 0) { + ptr = mmap(NULL, len, prot, MAP_ANONYMOUS | MAP_SHARED, -1, 0); + if (ptr == MAP_FAILED) { + fprintf(stderr, "create_userspace_phys_mem: %s", + strerror(errno)); + return 0; + } - memset(ptr, 0, len); + memset(ptr, 0, len); + } memory.userspace_addr = (unsigned long)ptr; - memory.slot = get_free_slot(kvm); + + if (len > 0) + memory.slot = get_free_slot(kvm); + else + memory.slot = get_slot(phys_start); + r = ioctl(kvm->vm_fd, KVM_SET_USER_MEMORY_REGION, &memory); if (r == -1) { fprintf(stderr, "create_userspace_phys_mem: %s", strerror(errno)); diff --git a/qemu/hw/pci-passthrough.c b/qemu/hw/pci-passthrough.c index 7ffcc7b..a5894d9 100644 --- a/qemu/hw/pci-passthrough.c +++ b/qemu/hw/pci-passthrough.c @@ -25,18 +25,6 @@ typedef __u64 resource_size_t; extern kvm_context_t kvm_context; extern FILE *logfile; -CPUReadMemoryFunc *pt_mmio_read_cb[3] = { - pt_mmio_readb, - pt_mmio_readw, - pt_mmio_readl -}; - -CPUWriteMemoryFunc *pt_mmio_write_cb[3] = 
{ - pt_mmio_writeb, - pt_mmio_writew, - pt_mmio_writel -}; - //#define PT_DEBUG #ifdef PT_DEBUG @@ -45,47 +33,6 @@ CPUWriteMemoryFunc *pt_mmio_write_cb[3] = { #define DEBUG(fmt, args...) #endif -#define pt_mmio_write(suffix, type) \ -void pt_mmio_write##suffix(void *opaque, target_phys_addr_t e_phys, \ - uint32_t value) \ -{ \ - pt_region_t *r_access = (pt_region_t *)opaque; \ - void *r_virt = (u8 *)r_access->r_virtbase + \ - (e_phys - r_access->e_physbase); \ - if (r_access->debug & PT_DEBUG_MMIO) { \ - fprintf(logfile, "pt_mmio_write" #suffix \ - ": e_physbase=%p e_phys=%p r_virt=%p value=%08x\n", \ - (void *)r_access->e_physbase, (void *)e_phys, \ - r_virt, value); \ - } \ - *(type *)r_virt = (type)value; \ -} - -pt_mmio_write(b, u8) -pt_mmio_write(w, u16) -pt_mmio_write(l, u32) - -#define pt_mmio_read(suffix, type) \ -uint32_t pt_mmio_read##suffix(void *opaque, target_phys_addr_t e_phys) \ -{ \ - pt_region_t *r_access = (pt_region_t *)opaque; \ - void *r_virt = (u8 *)r_access->r_virtbase + \ - (e_phys - r_access->e_physbase); \ - uint32_t value = (u32) (*(type *) r_virt); \ - if (r_access->debug & PT_DEBUG_MMIO) { \ - fprintf(logfile, \ - "pt_mmio_read" #suffix ": e_physbase=%p " \ - "e_phys=%p r_virt=%p value=%08x\n", \ - (void *)r_access->e_physbase, \ - (void *)e_phys, r_virt, value); \ - } \ - return value; \ -} - -pt_mmio_read(b, u8) -pt_mmio_read(w, u16) -pt_mmio_read(l, u32) - #define pt_ioport_write(suffix) \ void pt_ioport_write##suffix(void *opaque, uint32_t addr, uint32_t value) \ { \ @@ -127,22 +74,33 @@ pt_ioport_read(b) pt_ioport_read(w) pt_ioport_read(l) -static void pt_iomem_map(PCIDevice * d, int region_num, - uint32_t e_phys, uint32_t e_size, int type) +void pt_iomem_map(PCIDevice * pci_dev, int region_num, uint32_t e_phys, + uint32_t e_size, int type) { - pt_dev_t *r_dev = (pt_dev_t *) d; - - r_dev->v_addrs[region_num].e_physbase = e_phys; + pt_dev_t *r_dev = (pt_dev_t *) pci_dev; + pt_region_t *region = &r_dev->v_addrs[region_num]; + int 
first_map = (region->e_size == 0); + int ret = 0; DEBUG("e_phys=%08x r_virt=%p type=%d len=%08x region_num=%d \n", e_phys, r_dev->v_addrs[region_num].r_virtbase, type, e_size, region_num); - cpu_register_physical_memory(e_phys, - r_dev->dev.io_regions[region_num].size, - r_dev->v_addrs[region_num].memory_index); -} + region->e_physbase = e_phys; + region->e_size = e_size; + + if (!first_map) + kvm_destroy_phys_mem(kvm_context, e_phys, e_size); + if (e_size > 0) + ret = kvm_register_userspace_phys_mem(kvm_context, + e_phys, + region->r_virtbase, + e_size, + 0); + if (ret != 0) + fprintf(logfile, "Error: create new mapping failed\n"); +} static void pt_ioport_map(PCIDevice * pci_dev, int region_num, uint32_t addr, uint32_t size, int type) @@ -265,6 +223,8 @@ static int pt_register_regions(pci_region_t * io_regions, (uint32_t) (cur_region->base_addr)); return (-1); } + pci_dev->v_addrs[i].r_size = cur_region->size; + pci_dev->v_addrs[i].e_size = 0; /* add offset */ pci_dev->v_addrs[i].r_virtbase += @@ -274,11 +234,6 @@ static int pt_register_regions(pci_region_t * io_regions, cur_region->size, t, pt_iomem_map); - pci_dev->v_addrs[i].memory_index = - cpu_register_io_memory(0, pt_mmio_read_cb, - pt_mmio_write_cb, - (void *) &(pci_dev->v_addrs[i])); - continue; } /* handle port io regions */ diff --git a/qemu/hw/pci-passthrough.h b/qemu/hw/pci-passthrough.h index 012014a..49db1d2 100644 --- a/qemu/hw/pci-passthrough.h +++ b/qemu/hw/pci-passthrough.h @@ -54,6 +54,8 @@ typedef struct pt_region_s { uint32_t memory_index; void *r_virtbase; /* mmapped access address */ int num; /* our index within v_addrs[] */ + uint32_t e_size; /* emulated size of region in bytes */ + uint32_t r_size; /* real size of region in bytes */ uint32_t debug; } pt_region_t; -- 1.5.4.5 |
From: <be...@il...> - 2008-04-16 13:26:34
|
From: Ben-Ami Yassour <be...@il...> Signed-off-by: Ben-Ami Yassour <be...@il...> Signed-off-by: Muli Ben-Yehuda <mu...@il...> --- libkvm/libkvm.c | 24 ++++++++---- qemu/hw/pci-passthrough.c | 89 +++++++++++---------------------------------- qemu/hw/pci-passthrough.h | 2 + 3 files changed, 40 insertions(+), 75 deletions(-) diff --git a/libkvm/libkvm.c b/libkvm/libkvm.c index de91328..8c02af9 100644 --- a/libkvm/libkvm.c +++ b/libkvm/libkvm.c @@ -400,7 +400,7 @@ void *kvm_create_userspace_phys_mem(kvm_context_t kvm, unsigned long phys_start, { int r; int prot = PROT_READ; - void *ptr; + void *ptr = NULL; struct kvm_userspace_memory_region memory = { .memory_size = len, .guest_phys_addr = phys_start, @@ -410,16 +410,24 @@ void *kvm_create_userspace_phys_mem(kvm_context_t kvm, unsigned long phys_start, if (writable) prot |= PROT_WRITE; - ptr = mmap(NULL, len, prot, MAP_ANONYMOUS | MAP_SHARED, -1, 0); - if (ptr == MAP_FAILED) { - fprintf(stderr, "create_userspace_phys_mem: %s", strerror(errno)); - return 0; - } + if (len > 0) { + ptr = mmap(NULL, len, prot, MAP_ANONYMOUS | MAP_SHARED, -1, 0); + if (ptr == MAP_FAILED) { + fprintf(stderr, "create_userspace_phys_mem: %s", + strerror(errno)); + return 0; + } - memset(ptr, 0, len); + memset(ptr, 0, len); + } memory.userspace_addr = (unsigned long)ptr; - memory.slot = get_free_slot(kvm); + + if (len > 0) + memory.slot = get_free_slot(kvm); + else + memory.slot = get_slot(phys_start); + r = ioctl(kvm->vm_fd, KVM_SET_USER_MEMORY_REGION, &memory); if (r == -1) { fprintf(stderr, "create_userspace_phys_mem: %s", strerror(errno)); diff --git a/qemu/hw/pci-passthrough.c b/qemu/hw/pci-passthrough.c index 7ffcc7b..a5894d9 100644 --- a/qemu/hw/pci-passthrough.c +++ b/qemu/hw/pci-passthrough.c @@ -25,18 +25,6 @@ typedef __u64 resource_size_t; extern kvm_context_t kvm_context; extern FILE *logfile; -CPUReadMemoryFunc *pt_mmio_read_cb[3] = { - pt_mmio_readb, - pt_mmio_readw, - pt_mmio_readl -}; - -CPUWriteMemoryFunc *pt_mmio_write_cb[3] = 
{ - pt_mmio_writeb, - pt_mmio_writew, - pt_mmio_writel -}; - //#define PT_DEBUG #ifdef PT_DEBUG @@ -45,47 +33,6 @@ CPUWriteMemoryFunc *pt_mmio_write_cb[3] = { #define DEBUG(fmt, args...) #endif -#define pt_mmio_write(suffix, type) \ -void pt_mmio_write##suffix(void *opaque, target_phys_addr_t e_phys, \ - uint32_t value) \ -{ \ - pt_region_t *r_access = (pt_region_t *)opaque; \ - void *r_virt = (u8 *)r_access->r_virtbase + \ - (e_phys - r_access->e_physbase); \ - if (r_access->debug & PT_DEBUG_MMIO) { \ - fprintf(logfile, "pt_mmio_write" #suffix \ - ": e_physbase=%p e_phys=%p r_virt=%p value=%08x\n", \ - (void *)r_access->e_physbase, (void *)e_phys, \ - r_virt, value); \ - } \ - *(type *)r_virt = (type)value; \ -} - -pt_mmio_write(b, u8) -pt_mmio_write(w, u16) -pt_mmio_write(l, u32) - -#define pt_mmio_read(suffix, type) \ -uint32_t pt_mmio_read##suffix(void *opaque, target_phys_addr_t e_phys) \ -{ \ - pt_region_t *r_access = (pt_region_t *)opaque; \ - void *r_virt = (u8 *)r_access->r_virtbase + \ - (e_phys - r_access->e_physbase); \ - uint32_t value = (u32) (*(type *) r_virt); \ - if (r_access->debug & PT_DEBUG_MMIO) { \ - fprintf(logfile, \ - "pt_mmio_read" #suffix ": e_physbase=%p " \ - "e_phys=%p r_virt=%p value=%08x\n", \ - (void *)r_access->e_physbase, \ - (void *)e_phys, r_virt, value); \ - } \ - return value; \ -} - -pt_mmio_read(b, u8) -pt_mmio_read(w, u16) -pt_mmio_read(l, u32) - #define pt_ioport_write(suffix) \ void pt_ioport_write##suffix(void *opaque, uint32_t addr, uint32_t value) \ { \ @@ -127,22 +74,33 @@ pt_ioport_read(b) pt_ioport_read(w) pt_ioport_read(l) -static void pt_iomem_map(PCIDevice * d, int region_num, - uint32_t e_phys, uint32_t e_size, int type) +void pt_iomem_map(PCIDevice * pci_dev, int region_num, uint32_t e_phys, + uint32_t e_size, int type) { - pt_dev_t *r_dev = (pt_dev_t *) d; - - r_dev->v_addrs[region_num].e_physbase = e_phys; + pt_dev_t *r_dev = (pt_dev_t *) pci_dev; + pt_region_t *region = &r_dev->v_addrs[region_num]; + int 
first_map = (region->e_size == 0); + int ret = 0; DEBUG("e_phys=%08x r_virt=%p type=%d len=%08x region_num=%d \n", e_phys, r_dev->v_addrs[region_num].r_virtbase, type, e_size, region_num); - cpu_register_physical_memory(e_phys, - r_dev->dev.io_regions[region_num].size, - r_dev->v_addrs[region_num].memory_index); -} + region->e_physbase = e_phys; + region->e_size = e_size; + + if (!first_map) + kvm_destroy_phys_mem(kvm_context, e_phys, e_size); + if (e_size > 0) + ret = kvm_register_userspace_phys_mem(kvm_context, + e_phys, + region->r_virtbase, + e_size, + 0); + if (ret != 0) + fprintf(logfile, "Error: create new mapping failed\n"); +} static void pt_ioport_map(PCIDevice * pci_dev, int region_num, uint32_t addr, uint32_t size, int type) @@ -265,6 +223,8 @@ static int pt_register_regions(pci_region_t * io_regions, (uint32_t) (cur_region->base_addr)); return (-1); } + pci_dev->v_addrs[i].r_size = cur_region->size; + pci_dev->v_addrs[i].e_size = 0; /* add offset */ pci_dev->v_addrs[i].r_virtbase += @@ -274,11 +234,6 @@ static int pt_register_regions(pci_region_t * io_regions, cur_region->size, t, pt_iomem_map); - pci_dev->v_addrs[i].memory_index = - cpu_register_io_memory(0, pt_mmio_read_cb, - pt_mmio_write_cb, - (void *) &(pci_dev->v_addrs[i])); - continue; } /* handle port io regions */ diff --git a/qemu/hw/pci-passthrough.h b/qemu/hw/pci-passthrough.h index 012014a..49db1d2 100644 --- a/qemu/hw/pci-passthrough.h +++ b/qemu/hw/pci-passthrough.h @@ -54,6 +54,8 @@ typedef struct pt_region_s { uint32_t memory_index; void *r_virtbase; /* mmapped access address */ int num; /* our index within v_addrs[] */ + uint32_t e_size; /* emulated size of region in bytes */ + uint32_t r_size; /* real size of region in bytes */ uint32_t debug; } pt_region_t; -- 1.5.4.5 |
From: <be...@il...> - 2008-04-16 13:26:18
|
This patch for PCI passthrough devices enables a guest to access a device's memory mapped I/O regions directly, without requiring the host to trap and emulate every MMIO access. Updated from last version: we create a memory slot for each MMIO region of the guest's devices, and then use the /sys/bus/pci/.../resource# mapping to find the hfn for that MMIO region. The kernel part and the userspace part of this patchset apply to Amit's pv-dma tree. Tested on a Lenovo M57p with an e1000 NIC assigned directly to an FC8 guest. Comments are appreciated. |
From: <be...@il...> - 2008-04-16 13:26:13
|
From: Ben-Ami Yassour <be...@il...> Signed-off-by: Ben-Ami Yassour <be...@il...> Signed-off-by: Muli Ben-Yehuda <mu...@il...> --- arch/x86/kvm/mmu.c | 59 +++++++++++++++++++++++++++++-------------- arch/x86/kvm/paging_tmpl.h | 19 +++++++++---- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 17 +++++++++++- 4 files changed, 69 insertions(+), 28 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 078a7f1..c89029d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -112,6 +112,8 @@ static int dbg = 1; #define PT_FIRST_AVAIL_BITS_SHIFT 9 #define PT64_SECOND_AVAIL_BITS_SHIFT 52 +#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) + #define VALID_PAGE(x) ((x) != INVALID_PAGE) #define PT64_LEVEL_BITS 9 @@ -237,6 +239,9 @@ static int is_dirty_pte(unsigned long pte) static int is_rmap_pte(u64 pte) { + if (pte & PT_SHADOW_IO_MARK) + return false; + return is_shadow_present_pte(pte); } @@ -1034,7 +1039,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, unsigned pt_access, unsigned pte_access, int user_fault, int write_fault, int dirty, int *ptwrite, int largepage, gfn_t gfn, - pfn_t pfn, bool speculative) + pfn_t pfn, bool speculative, + int direct_mmio) { u64 spte; int was_rmapped = 0; @@ -1114,6 +1120,9 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, } } + if (direct_mmio) + spte |= PT_SHADOW_IO_MARK; + unshadowed: if (pte_access & ACC_WRITE_MASK) @@ -1129,16 +1138,19 @@ unshadowed: ++vcpu->kvm->stat.lpages; page_header_update_slot(vcpu->kvm, shadow_pte, gfn); - if (!was_rmapped) { - rmap_add(vcpu, shadow_pte, gfn, largepage); - if (!is_rmap_pte(*shadow_pte)) - kvm_release_pfn_clean(pfn); - } else { - if (was_writeble) - kvm_release_pfn_dirty(pfn); - else - kvm_release_pfn_clean(pfn); + if (!direct_mmio) { + if (!was_rmapped) { + rmap_add(vcpu, shadow_pte, gfn, largepage); + if (!is_rmap_pte(*shadow_pte)) + kvm_release_pfn_clean(pfn); + } else { + if (was_writeble) + 
kvm_release_pfn_dirty(pfn); + else + kvm_release_pfn_clean(pfn); + } } + if (!ptwrite || !*ptwrite) vcpu->arch.last_pte_updated = shadow_pte; } @@ -1149,7 +1161,7 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, int largepage, gfn_t gfn, pfn_t pfn, - int level) + int level, int direct_mmio) { hpa_t table_addr = vcpu->arch.mmu.root_hpa; int pt_write = 0; @@ -1163,13 +1175,15 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, if (level == 1) { mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, - 0, write, 1, &pt_write, 0, gfn, pfn, false); + 0, write, 1, &pt_write, 0, gfn, pfn, + false, direct_mmio); return pt_write; } if (largepage && level == 2) { mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, - 0, write, 1, &pt_write, 1, gfn, pfn, false); + 0, write, 1, &pt_write, 1, gfn, pfn, + false, direct_mmio); return pt_write; } @@ -1200,6 +1214,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) int r; int largepage = 0; pfn_t pfn; + int direct_mmio = 0; down_read(&current->mm->mmap_sem); if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { @@ -1207,10 +1222,10 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) largepage = 1; } - pfn = gfn_to_pfn(vcpu->kvm, gfn); + pfn = gfn_to_pfn(vcpu->kvm, gfn, &direct_mmio); up_read(&current->mm->mmap_sem); - /* mmio */ + /* handle emulated mmio */ if (is_error_pfn(pfn)) { kvm_release_pfn_clean(pfn); return 1; @@ -1219,7 +1234,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); r = __direct_map(vcpu, v, write, largepage, gfn, pfn, - PT32E_ROOT_LEVEL); + PT32E_ROOT_LEVEL, direct_mmio); spin_unlock(&vcpu->kvm->mmu_lock); @@ -1355,6 +1370,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, int r; int largepage = 0; gfn_t gfn = gpa >> PAGE_SHIFT; + int direct_mmio = 0; 
ASSERT(vcpu); ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); @@ -1368,7 +1384,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, gfn &= ~(KVM_PAGES_PER_HPAGE-1); largepage = 1; } - pfn = gfn_to_pfn(vcpu->kvm, gfn); + pfn = gfn_to_pfn(vcpu->kvm, gfn, &direct_mmio); up_read(&current->mm->mmap_sem); if (is_error_pfn(pfn)) { kvm_release_pfn_clean(pfn); @@ -1377,7 +1393,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, - largepage, gfn, pfn, TDP_ROOT_LEVEL); + largepage, gfn, pfn, TDP_ROOT_LEVEL, + direct_mmio); spin_unlock(&vcpu->kvm->mmu_lock); return r; @@ -1643,6 +1660,7 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, int r; u64 gpte = 0; pfn_t pfn; + int direct_mmio = 0; vcpu->arch.update_pte.largepage = 0; @@ -1678,9 +1696,12 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, gfn &= ~(KVM_PAGES_PER_HPAGE-1); vcpu->arch.update_pte.largepage = 1; } - pfn = gfn_to_pfn(vcpu->kvm, gfn); + pfn = gfn_to_pfn(vcpu->kvm, gfn, &direct_mmio); up_read(&current->mm->mmap_sem); + if (direct_mmio) + return; + if (is_error_pfn(pfn)) { kvm_release_pfn_clean(pfn); return; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 156fe10..e85d8ae 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -264,9 +264,10 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, if (is_error_pfn(pfn)) return; kvm_get_pfn(pfn); + mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte), - pfn, true); + pfn, true, false); } /* @@ -275,7 +276,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, struct guest_walker *walker, int user_fault, int write_fault, int largepage, - int *ptwrite, 
pfn_t pfn) + int *ptwrite, pfn_t pfn, int direct_mmio) { hpa_t shadow_addr; int level; @@ -349,11 +350,15 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access, user_fault, write_fault, walker->ptes[walker->level-1] & PT_DIRTY_MASK, - ptwrite, largepage, walker->gfn, pfn, false); + ptwrite, largepage, walker->gfn, pfn, false, + direct_mmio); return shadow_ent; } +static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr); + + /* * Page fault handler. There are several causes for a page fault: * - there is no shadow pte for the guest pte @@ -380,6 +385,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, int r; pfn_t pfn; int largepage = 0; + int direct_mmio = 0; pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); kvm_mmu_audit(vcpu, "pre page fault"); @@ -413,10 +419,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, largepage = 1; } } - pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); + pfn = gfn_to_pfn(vcpu->kvm, walker.gfn, &direct_mmio); up_read(&current->mm->mmap_sem); - /* mmio */ + /* handle emulated mmio */ if (is_error_pfn(pfn)) { pgprintk("gfn %x is mmio\n", walker.gfn); kvm_release_pfn_clean(pfn); @@ -426,7 +432,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, - largepage, &write_pt, pfn); + largepage, &write_pt, pfn, + direct_mmio); pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, shadow_pte, *shadow_pte, write_pt); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 578c363..0910cc1 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -173,7 +173,7 @@ void kvm_release_page_dirty(struct page *page); void kvm_set_page_dirty(struct page *page); void kvm_set_page_accessed(struct page *page); -pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); 
+pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn, int *direct_mmio); void kvm_release_pfn_dirty(pfn_t); void kvm_release_pfn_clean(pfn_t pfn); void kvm_set_pfn_dirty(pfn_t pfn); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 6a52c08..07b95f7 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -526,20 +526,33 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) /* * Requires current->mm->mmap_sem to be held */ -pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) +pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn, int *direct_mmio) { struct page *page[1]; unsigned long addr; int npages; + struct vm_area_struct *vma; might_sleep(); + if (direct_mmio) + *direct_mmio = 0; + addr = gfn_to_hva(kvm, gfn); if (kvm_is_error_hva(addr)) { get_page(bad_page); return page_to_pfn(bad_page); } + /* handle mmio */ + vma = find_vma(current->mm, addr); + if (vma->vm_flags & VM_IO) { + if (direct_mmio) + *direct_mmio = 1; + + return ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + } + npages = get_user_pages(current, current->mm, addr, 1, 1, 1, page, NULL); @@ -555,7 +568,7 @@ EXPORT_SYMBOL_GPL(gfn_to_pfn); struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) { - return pfn_to_page(gfn_to_pfn(kvm, gfn)); + return pfn_to_page(gfn_to_pfn(kvm, gfn, NULL)); } EXPORT_SYMBOL_GPL(gfn_to_page); -- 1.5.4.5 |
From: Izik E. <iz...@qu...> - 2008-04-16 13:01:35
|
Avi Kivity wrote: > Enrico Weigelt wrote: >> Hi folks, >> >> I'm using dozens of VE's / jails to separate applications >> (even complete webapps w/ their own httpd) for easier maintenance >> and better security. But this tends to consume a lot of memory, >> since code sharing (.so's) cannot take effect here (each jail/VE >> has its own complete tree). >> Now I wonder if it might be possible to let the kernel automatically >> find out equal pages and map them together. >> A little comparison showed that at least 50% of the code could be >> shared (maybe more with some tuning, and maybe even data). So it's >> (IMHO) really worth it. >> >> Would this be possible ? What had to be done for this ? >> > > Izik (copied) is working on this for kvm. Results so far are very > encouraging, but currently merged pages are not swappable. > We have a module that was posted with kernel patches on the mm list; you can actually run it and play with it. There were some bugs that we fixed in it, but I am still not sending it, because I changed quite a lot of things there in order to make swapping possible for the pages that are shared. -- woof. |
From: Avi K. <av...@qu...> - 2008-04-16 11:41:31
|
Enrico Weigelt wrote: > Hi folks, > > I'm using dozens of VE's / jails to separate applications > (even complete webapps w/ their own httpd) for easier maintenance > and better security. But this tends to consume a lot of memory, > since code sharing (.so's) cannot take effect here (each jail/VE > has its own complete tree). > > Now I wonder if it might be possible to let the kernel automatically > find out equal pages and map them together. > > A little comparison showed that at least 50% of the code could be > shared (maybe more with some tuning, and maybe even data). So it's > (IMHO) really worth it. > > Would this be possible ? What had to be done for this ? > Izik (copied) is working on this for kvm. Results so far are very encouraging, but currently merged pages are not swappable. -- error compiling committee.c: too many arguments to function |
From: Avi K. <av...@qu...> - 2008-04-16 11:24:09
|
Anders wrote: > Avi Kivity wrote: > > >> if select() doesn't enable signals (like you can do with pselect) you >> may sit for a long time in select() until the timer expires. >> > > Hm. Does the guest timer affect host userspace in all configurations? The > original trigger for my SIGIO patch was that opening a VNC connection > would take up to a second, until the qemu RTC timer fired. > > Right, if the guest is using the lapic or pit timers, there is no periodic signal to userspace. > >> Consider a 100Hz Linux guest running 'ls -lR' out of a cold cache: >> instead of 1-2 ms disk latencies you'll see 10 ms latencies, killing >> performance by a factor of 5. >> > > I guess this works out even worse for a dyntick guest? > > > Yes. We can't depend on guest activity for correctness. -- error compiling committee.c: too many arguments to function |
From: Anders <ma...@fl...> - 2008-04-16 10:26:05
|
Avi Kivity wrote: > if select() doesn't enable signals (like you can do with pselect) you > may sit for a long time in select() until the timer expires. Hm. Does the guest timer affect host userspace in all configurations? The original trigger for my SIGIO patch was that opening a VNC connection would take up to a second, until the qemu RTC timer fired. > Consider a 100Hz Linux guest running 'ls -lR' out of a cold cache: > instead of 1-2 ms disk latencies you'll see 10 ms latencies, killing > performance by a factor of 5. I guess this works out even worse for a dyntick guest? > I still think (1) should be merged, preferably to qemu upstream. I will give it another try. They are not very receptive, though, and I am not that confident in what the patch is actually doing :-). You guys are helping me a lot in that regard, thanks. Cheers, Anders. |
From: Yunfeng Z. <yun...@in...> - 2008-04-16 09:12:42
|
Hi All, This is today's KVM test result against kvm.git 8d3a833dc9d42f0967e57717f89c518375d6a417 and kvm-userspace.git bae043c2ddf35ed1965f062131394afa75e45b17. Three Old Issues: ================================================ 1. Booting four guests likely fails https://sourceforge.net/tracker/?func=detail&atid=893831&aid=1919354&group_id=180599 2. booting smp windows guests has 30% chance of hang https://sourceforge.net/tracker/?func=detail&atid=893831&aid=1910923&group_id=180599 3. Cannot boot guests with hugetlbfs https://sourceforge.net/tracker/?func=detail&atid=893831&aid=1941302&group_id=180599 Test environment ================================================ Platform Woodcrest CPU 4 Memory size 8G' Details ================================================ IA32-pae: 1. boot guest with 256M memory PASS 2. boot two windows xp guest PASS 3. boot 4 same guest in parallel PASS 4. boot linux and windows guest in parallel PASS 5. boot guest with 1500M memory PASS 6. boot windows 2003 with ACPI enabled PASS 7. boot Windows xp with ACPI enabled PASS 8. boot Windows 2000 without ACPI PASS 9. kernel build on SMP linux guest PASS 10. LTP on SMP linux guest PASS 11. boot base kernel linux PASS 12. save/restore 32-bit HVM guests PASS 13. live migration 32-bit HVM guests PASS 14. boot SMP Windows xp with ACPI enabled PASS 15. boot SMP Windows 2003 with ACPI enabled PASS 16. boot SMP Windows 2000 with ACPI enabled PASS ================================================ IA32e: 1. boot four 32-bit guest in parallel PASS 2. boot four 64-bit guest in parallel PASS 3. boot 4G 64-bit guest PASS 4. boot 4G pae guest PASS 5. boot 32-bit linux and 32 bit windows guest in parallel PASS 6. boot 32-bit guest with 1500M memory PASS 7. boot 64-bit guest with 1500M memory PASS 8. boot 32-bit guest with 256M memory PASS 9. boot 64-bit guest with 256M memory PASS 10. boot two 32-bit windows xp in parallel PASS 11. boot four 32-bit different guest in para PASS 12. 
save/restore 64-bit linux guests PASS 13. save/restore 32-bit linux guests PASS 14. boot 32-bit SMP windows 2003 with ACPI enabled PASS 15. boot 32-bit SMP Windows 2000 with ACPI enabled PASS 16. boot 32-bit SMP Windows xp with ACPI enabled PASS 17. boot 32-bit Windows 2000 without ACPI PASS 18. boot 64-bit Windows xp with ACPI enabled PASS 19. boot 32-bit Windows xp without ACPI PASS 20. boot 64-bit UP vista PASS 21. boot 64-bit SMP vista PASS 22. kernel build in 32-bit linux guest OS PASS 23. kernel build in 64-bit linux guest OS PASS 24. LTP on SMP 32-bit linux guest OS PASS 25. LTP on SMP 64-bit linux guest OS PASS 26. boot 64-bit guests with ACPI enabled PASS 27. boot 32-bit x-server PASS 28. boot 64-bit SMP windows XP with ACPI enabled PASS 29. boot 64-bit SMP windows 2003 with ACPI enabled PASS 30. live migration 64bit linux guests PASS 31. live migration 32bit linux guests PASS 32. reboot 32bit windows xp guest PASS 33. reboot 32bit windows xp guest PASS Report Summary on IA32-pae Summary Test Report of Last Session ===================================================================== Total Pass Fail NoResult Crash ===================================================================== control_panel 7 7 0 0 0 Restart 2 2 0 0 0 gtest 15 15 0 0 0 ===================================================================== control_panel 7 7 0 0 0 :KVM_LM_PAE_gPAE 1 1 0 0 0 :KVM_four_sguest_PAE_gPA 1 1 0 0 0 :KVM_256M_guest_PAE_gPAE 1 1 0 0 0 :KVM_linux_win_PAE_gPAE 1 1 0 0 0 :KVM_1500M_guest_PAE_gPA 1 1 0 0 0 :KVM_SR_PAE_gPAE 1 1 0 0 0 :KVM_two_winxp_PAE_gPAE 1 1 0 0 0 Restart 2 2 0 0 0 :GuestPAE_PAE_gPAE 1 1 0 0 0 :BootTo32pae_PAE_gPAE 1 1 0 0 0 gtest 15 15 0 0 0 :ltp_nightly_PAE_gPAE 1 1 0 0 0 :boot_up_acpi_PAE_gPAE 1 1 0 0 0 :reboot_xp_PAE_gPAE 1 1 0 0 0 :boot_up_vista_PAE_gPAE 1 1 0 0 0 :boot_up_acpi_xp_PAE_gPA 1 1 0 0 0 :boot_up_acpi_win2k3_PAE 1 1 0 0 0 :boot_base_kernel_PAE_gP 1 1 0 0 0 :boot_smp_acpi_win2k3_PA 1 1 0 0 0 :boot_smp_acpi_win2k_PAE 1 1 0 0 0 
:boot_up_acpi_win2k_PAE_ 1 1 0 0 0 :boot_smp_acpi_xp_PAE_gP 1 1 0 0 0 :boot_up_noacpi_win2k_PA 1 1 0 0 0 :boot_smp_vista_PAE_gPAE 1 1 0 0 0 :bootx_PAE_gPAE 1 1 0 0 0 :kb_nightly_PAE_gPAE 1 1 0 0 0 ===================================================================== Total 24 24 0 0 0 Report Summary on IA32e Summary Test Report of Last Session ===================================================================== Total Pass Fail NoResult Crash ===================================================================== control_panel 15 15 0 0 0 Restart 3 3 0 0 0 gtest 25 25 0 0 0 ===================================================================== control_panel 15 15 0 0 0 :KVM_LM_64_g64 1 1 0 0 0 :KVM_four_sguest_64_gPAE 1 1 0 0 0 :KVM_4G_guest_64_g64 1 1 0 0 0 :KVM_four_sguest_64_g64 1 1 0 0 0 :KVM_linux_win_64_gPAE 1 1 0 0 0 :KVM_1500M_guest_64_gPAE 1 1 0 0 0 :KVM_SR_64_g64 1 1 0 0 0 :KVM_LM_64_gPAE 1 1 0 0 0 :KVM_256M_guest_64_g64 1 1 0 0 0 :KVM_1500M_guest_64_g64 1 1 0 0 0 :KVM_4G_guest_64_gPAE 1 1 0 0 0 :KVM_SR_64_gPAE 1 1 0 0 0 :KVM_256M_guest_64_gPAE 1 1 0 0 0 :KVM_two_winxp_64_gPAE 1 1 0 0 0 :KVM_four_dguest_64_gPAE 1 1 0 0 0 Restart 3 3 0 0 0 :GuestPAE_64_gPAE 1 1 0 0 0 :BootTo64_64_gPAE 1 1 0 0 0 :Guest64_64_gPAE 1 1 0 0 0 gtest 25 25 0 0 0 :boot_up_acpi_64_gPAE 1 1 0 0 0 :boot_up_noacpi_xp_64_gP 1 1 0 0 0 :boot_smp_acpi_xp_64_g64 1 1 0 0 0 :boot_base_kernel_64_gPA 1 1 0 0 0 :boot_smp_acpi_win2k3_64 1 1 0 0 0 :boot_smp_acpi_win2k_64_ 1 1 0 0 0 :boot_base_kernel_64_g64 1 1 0 0 0 :bootx_64_gPAE 1 1 0 0 0 :kb_nightly_64_gPAE 1 1 0 0 0 :ltp_nightly_64_g64 1 1 0 0 0 :boot_up_acpi_64_g64 1 1 0 0 0 :boot_up_noacpi_win2k_64 1 1 0 0 0 :boot_smp_acpi_xp_64_gPA 1 1 0 0 0 :boot_smp_vista_64_gPAE 1 1 0 0 0 :boot_up_acpi_win2k3_64_ 1 1 0 0 0 :reboot_xp_64_gPAE 1 1 0 0 0 :bootx_64_g64 1 1 0 0 0 :boot_up_vista_64_g64 1 1 0 0 0 :boot_smp_vista_64_g64 1 1 0 0 0 :boot_up_acpi_xp_64_g64 1 1 0 0 0 :boot_up_vista_64_gPAE 1 1 0 0 0 :ltp_nightly_64_gPAE 1 1 0 0 0 
:boot_smp_acpi_win2k3_64 1 1 0 0 0 :boot_up_noacpi_win2k3_6 1 1 0 0 0 :kb_nightly_64_g64 1 1 0 0 0 ===================================================================== Total 43 43 0 0 0 Thanks Yunfeng |
From: Avi K. <av...@qu...> - 2008-04-16 08:46:40
|
David S. Ahern wrote: > I have been looking at RHEL3 based guests lately, and to say the least the > performance is horrible. Rather than write a long tome on what I've done and > observed, I'd like to find out if anyone has some insights or known problem > areas running 2.4 guests. The short of it is that % system time spikes from time > to time (e.g., on exec of a new process such as running /bin/true). > > I do not see the problem running RHEL3 on ESX, and an equivalent VM running > RHEL4 runs fine. That suggests that the 2.4 kernel is doing something in a way > that is not handled efficiently by kvm. > > Can someone shed some light on it? > It's not something that I test regularly. If you're running a 32-bit kernel, I'd suspect kmap(), or perhaps false positives from the fork detector. kvmtrace will probably give enough info to tell exactly what's going on; 'kvmstat -1' while the badness is happening may also help. -- error compiling committee.c: too many arguments to function |
From: Avi K. <av...@qu...> - 2008-04-16 08:37:30
|
Anthony Liguori wrote: >> >> What about aio completions? The only race-free way to handle both >> posix aio completion and fd readiness is signals AFAIK. > > We poll aio completion after the select don't we? Worst case scenario > we miss a signal and wait to poll after the next select event. That's > going to occur very often because of the timer. if select() doesn't enable signals (like you can do with pselect) you may sit for a long time in select() until the timer expires. Consider a 100Hz Linux guest running 'ls -lR' out of a cold cache: instead of 1-2 ms disk latencies you'll see 10 ms latencies, killing performance by a factor of 5. I see the following possible solutions: 1. Apply Anders' patch and keep I/O completions signal based. 2. Use signalfd() to convert aio completions to fd readiness, emulating signalfd() using a thread which does sigwait()+write() (to a pipe) on older hosts 3. Use a separate thread for aio completions 4. Use pselect(), live with the race on older hosts (it was introduced in 2.6.16, which we barely support anyway), live with the signal delivery inefficiency. When I started writing this email I was in favor of (1), but now with the new signalfd emulation I'm leaning towards (2). I still think (1) should be merged, preferably to qemu upstream. -- error compiling committee.c: too many arguments to function |
From: Elliott <thu...@AU...> - 2008-04-16 08:15:40
|
Discover new ways to grow your man device http://www.bugehaej.com/ |
From: Elmar H. <el...@ha...> - 2008-04-16 07:59:28
|
> That's it. If anyone can attest to this procedures and report, it would > be greatly appreciated. I did try to do so But I do get an "disk read error". kvm is invoked via qemu-system-x86_64 \ -m 768 \ -drive file=Vista.img,if=scsi,bus=0,index=0,media=disk,boot=on \ -net nic,model=e1000,macaddr=52:54:00:12:34:$ifnum \ -net tap,ifname=$iface,script=no \ -k de \ -usb \ -usbdevice tablet \ -monitor stdio What's going wrong? Elmar |
From: Nguyen A. Q. <aq...@gm...> - 2008-04-16 07:07:43
|
Hi Anthony, I found a bug in the last code: send_command() failed to copy back the result into extboot_cmd structure. This patch fixes it. I successfully tested this version with guest Win2K (fully updated, scsi boot) and Linux 2.6.25-rc8 (virtio). Let me know if you can boot Windows with this version. Thanks, Quynh --- This code is an attempt to rewrite the current extboot option rom in C. The new code now minimizes the assembly code, so that the assembly code is very small and simple: boot.S's only job is to interface with C code, which does all the dirty job. "signrom" is modified to adapt with the new result binary image. The result option rom has the same size as the original one: 1.5KB, while the actual code size is around the same: 1.2KB (gcc can optimize really well) To install this option rom, do the following steps as root: make make save <--- backup the original option rom to /usr/share/qemu/extboot.bin.org make install <--- overwrite the new option rom to /usr/share/qemu/extboot.bin |
From: Liu, E. E <eri...@in...> - 2008-04-16 06:45:39
|
Hollis Blanchard wrote: > On Tuesday 15 April 2008 22:13:28 Liu, Eric E wrote: >> Hollis Blanchard wrote: >>> On Wednesday 09 April 2008 05:01:36 Liu, Eric E wrote: >>>> +/* This structure represents a single trace buffer record. */ >>>> +struct kvm_trace_rec { + __u32 event:28; >>>> + __u32 extra_u32:3; >>>> + __u32 cycle_in:1; >>>> + __u32 pid; >>>> + __u32 vcpu_id; >>>> + union { >>>> + struct { >>>> + __u32 cycle_lo, cycle_hi; >>>> + __u32 extra_u32[KVM_TRC_EXTRA_MAX]; >>>> + } cycle; + struct { >>>> + __u32 extra_u32[KVM_TRC_EXTRA_MAX]; >>>> + } nocycle; + } u; >>>> +}; >>> >>> Do we really need bitfields here? They are notoriously non-portable. >>> >>> Practically speaking, this will prevent me from copying a trace file >>> from my big-endian target to my little-endian workstation for >>> analysis, at least without some ugly hacking in the userland tool. >> Here the main consideration using bitfields is to save storage space >> for > each record, but as you said it is non-portable for your mentioned > case, so should we need to adjust the struct like this? >> __u32 event; >> __16 extra_u32; >> __16 cycle_in; > > If space really is a worry, you could still combine the fields, and > just use masks to extract the data later. No matter what, > byteswapping is required in the userland tool. I suspect this isn't > there already, but it will be easier to add without the bitfields. > > Hmm, while we're on the subject, I'm not sure what the best way to > automatically byteswap will be. It probably isn't worth it to convert > all trace data to a standard ordering (which would add overhead to > tracing), but I suppose there is no metadata in the trace log? A > command line switch might be inconvenient but inevitable. A tricky approach is that we insert metadata into the trace file before reading the trace log, so that the analysis tool can look at the metadata to check whether we need to convert byte order? |
From: Felix L. <fel...@gm...> - 2008-04-16 06:09:41
|
Jun, what happens when you boot into Windows' safe mode? It usually displays the drivers as it loads them. Would be interesting to see where it hangs. Also try this: * Boot with the IDE disk which works and the dummy SCSI disk attached as per Alberto's instructions * After you confirmed that the SCSI controller and disk are in device manager: delete the currently in use IDE controller from device manager * Reboot into the SCSI setup Good luck, Felix |
From: Hollis B. <ho...@us...> - 2008-04-16 05:35:19
|
On Tuesday 15 April 2008 22:13:28 Liu, Eric E wrote: > Hollis Blanchard wrote: > > On Wednesday 09 April 2008 05:01:36 Liu, Eric E wrote: > >> +/* This structure represents a single trace buffer record. */ > >> +struct kvm_trace_rec { + __u32 event:28; > >> + __u32 extra_u32:3; > >> + __u32 cycle_in:1; > >> + __u32 pid; > >> + __u32 vcpu_id; > >> + union { > >> + struct { > >> + __u32 cycle_lo, cycle_hi; > >> + __u32 extra_u32[KVM_TRC_EXTRA_MAX]; > >> + } cycle; + struct { > >> + __u32 extra_u32[KVM_TRC_EXTRA_MAX]; > >> + } nocycle; + } u; > >> +}; > > > > Do we really need bitfields here? They are notoriously non-portable. > > > > Practically speaking, this will prevent me from copying a trace file > > from my big-endian target to my little-endian workstation for > > analysis, at least without some ugly hacking in the userland tool. > Here the main consideration using bitfields is to save storage space for each record, but as you said it is non-portable for your mentioned case, so should we need to adjust the struct like this? > __u32 event; > __16 extra_u32; > __16 cycle_in; If space really is a worry, you could still combine the fields, and just use masks to extract the data later. No matter what, byteswapping is required in the userland tool. I suspect this isn't there already, but it will be easier to add without the bitfields. Hmm, while we're on the subject, I'm not sure what the best way to automatically byteswap will be. It probably isn't worth it to convert all trace data to a standard ordering (which would add overhead to tracing), but I suppose there is no metadata in the trace log? A command line switch might be inconvenient but inevitable. -- Hollis Blanchard IBM Linux Technology Center |
From: Леокадия <mi...@sc...> - 2008-04-16 05:30:10
|
Продажа кино на DVD-дисках!!! Новинки, классика и сборники Добро пожаловать в наш магазин www.dvdservice.ru |
From: Liu, E. E <eri...@in...> - 2008-04-16 03:13:31
|
Hollis Blanchard wrote: > On Wednesday 09 April 2008 05:01:36 Liu, Eric E wrote: >> +/* This structure represents a single trace buffer record. */ >> +struct kvm_trace_rec { + __u32 event:28; >> + __u32 extra_u32:3; >> + __u32 cycle_in:1; >> + __u32 pid; >> + __u32 vcpu_id; >> + union { >> + struct { >> + __u32 cycle_lo, cycle_hi; >> + __u32 extra_u32[KVM_TRC_EXTRA_MAX]; >> + } cycle; + struct { >> + __u32 extra_u32[KVM_TRC_EXTRA_MAX]; >> + } nocycle; + } u; >> +}; > > Do we really need bitfields here? They are notoriously non-portable. > > Practically speaking, this will prevent me from copying a trace file > from my big-endian target to my little-endian workstation for > analysis, at least without some ugly hacking in the userland tool. Here the main consideration using bitfields is to save storage space for each record, but as you said it is non-portable for your mentioned case, so should we need to adjust the struct like this? __u32 event; __16 extra_u32; __16 cycle_in; |
From: Jun K. <jun...@gm...> - 2008-04-16 01:28:22
|
On Wed, Apr 16, 2008 at 1:29 AM, Alberto Treviño <al...@by...> wrote: > On Tuesday 15 April 2008 12:57:45 am Jun Koi wrote: > > Looks like a problem, however. In his instruction, part 3: > > >3. Shut down the VM. This time, don't include the new temporary > > > image from step 1 and define your disk(s) as SCSI disks: > > > > > > qemu-system-x86_64 -m 256 \ > > > -drive file=hda.img,if=scsi,bus=0,index=0,media=disk,boot=off > > > ... > > > > I think above should have "boot=on", rather than "boot=off". > > Otherwise, you cannot boot from scsi disk, right? > > You are right! That's what happens when you copy and paste code. :-) > > > > I tried the above instructions with WinXP, but WinXP cannot boot > > successfully: it stops somewhere in the middle, and hang there. > > I tried it on a vanilla install of WinXP SP2, no updates. My WinXP is fully updated. > In step #2, did the Device Manager show the SCSI disk? Yes, it does. If you have any progress on this, please let us know. Many thanks, Jun |
From: David S. A. <da...@ci...> - 2008-04-16 00:15:35
|
I have been looking at RHEL3 based guests lately, and to say the least the performance is horrible. Rather than write a long tome on what I've done and observed, I'd like to find out if anyone has some insights or known problem areas running 2.4 guests. The short of it is that % system time spikes from time to time (e.g., on exec of a new process such as running /bin/true). I do not see the problem running RHEL3 on ESX, and an equivalent VM running RHEL4 runs fine. That suggests that the 2.4 kernel is doing something in a way that is not handled efficiently by kvm. Can someone shed some light on it? thanks, david |
From: Anthony L. <ali...@us...> - 2008-04-15 22:13:05
|
This patch introduces a DMA API and plumbs support through the DMA layer. We use a mostly opaque structure, IOVector to represent a scatter/gather list of physical memory. Associated with each IOVector is a read/write function and an opaque pointer. This allows arbitrary transformation/mapping of the data while providing an easy mechanism to short-cut the zero-copy case in the block/net backends. Signed-off-by: Anthony Liguori <ali...@us...> diff --git a/Makefile b/Makefile index adb50a8..a8df278 100644 --- a/Makefile +++ b/Makefile @@ -34,7 +34,7 @@ recurse-all: $(patsubst %,subdir-%, $(TARGET_DIRS)) ####################################################################### # BLOCK_OBJS is code used by both qemu system emulation and qemu-img -BLOCK_OBJS=cutils.o +BLOCK_OBJS=cutils.o iovector.o BLOCK_OBJS+=block-cow.o block-qcow.o aes.o block-vmdk.o block-cloop.o BLOCK_OBJS+=block-dmg.o block-bochs.o block-vpc.o block-vvfat.o BLOCK_OBJS+=block-qcow2.o block-parallels.o diff --git a/block.c b/block.c index 0730954..eb610e0 100644 --- a/block.c +++ b/block.c @@ -570,6 +570,55 @@ int bdrv_write(BlockDriverState *bs, int64_t sector_num, } } +int bdrv_readv(BlockDriverState *bs, int64_t sector_num, + IOVector *iovec) +{ + char *buffer; + size_t size; + int ret; + + /* it's possible that we'll see a OOM condition here if the transfer size + * is sufficiently large. + */ + size = iovector_size(iovec); + buffer = qemu_malloc(size); + if (buffer == NULL) + return -ENOMEM; + + ret = bdrv_read(bs, sector_num, buffer, size / 512); + + if (ret >= 0) + memcpy_to_iovector(iovec, 0, size, buffer); + + qemu_free(buffer); + + return ret; +} + +int bdrv_writev(BlockDriverState *bs, int64_t sector_num, + const IOVector *iovec) +{ + char *buffer; + size_t size; + int ret; + + /* it's possible that we'll see a OOM condition here if the transfer size + * is sufficiently large. 
+ */ + size = iovector_size(iovec); + buffer = qemu_malloc(size); + if (buffer == NULL) + return -ENOMEM; + + memcpy_from_iovector(buffer, 0, size, iovec); + + ret = bdrv_write(bs, sector_num, buffer, size / 512); + + qemu_free(buffer); + + return ret; +} + static int bdrv_pread_em(BlockDriverState *bs, int64_t offset, uint8_t *buf, int count1) { diff --git a/block.h b/block.h index b730505..9d30db2 100644 --- a/block.h +++ b/block.h @@ -1,6 +1,8 @@ #ifndef BLOCK_H #define BLOCK_H +#include "iovector.h" + /* block.c */ typedef struct BlockDriver BlockDriver; @@ -67,6 +69,9 @@ int bdrv_read(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, int nb_sectors); int bdrv_write(BlockDriverState *bs, int64_t sector_num, const uint8_t *buf, int nb_sectors); +int bdrv_readv(BlockDriverState *bs, int64_t sector_num, IOVector *iovec); +int bdrv_writev(BlockDriverState *bs, int64_t sector_num, + const IOVector *iovec); int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int count); int bdrv_pwrite(BlockDriverState *bs, int64_t offset, diff --git a/hw/pci.c b/hw/pci.c index bc55989..3282478 100644 --- a/hw/pci.c +++ b/hw/pci.c @@ -145,6 +145,18 @@ int pci_device_load(PCIDevice *s, QEMUFile *f) return 0; } +void pci_device_dma_write(PCIDevice *s, target_phys_addr_t addr, + const void *buffer, size_t len) +{ + cpu_physical_memory_write(addr, buffer, len); +} + +void pci_device_dma_read(PCIDevice *s, target_phys_addr_t addr, + void *buffer, size_t len) +{ + cpu_physical_memory_read(addr, buffer, len); +} + /* -1 for devfn means auto assign */ PCIDevice *pci_register_device(PCIBus *bus, const char *name, int instance_size, int devfn, diff --git a/hw/pci.h b/hw/pci.h index e870987..c885cc5 100644 --- a/hw/pci.h +++ b/hw/pci.h @@ -81,6 +81,12 @@ void pci_default_write_config(PCIDevice *d, void pci_device_save(PCIDevice *s, QEMUFile *f); int pci_device_load(PCIDevice *s, QEMUFile *f); +void pci_device_dma_write(PCIDevice *s, target_phys_addr_t addr, + const void 
*buffer, size_t len); + +void pci_device_dma_read(PCIDevice *s, target_phys_addr_t addr, + void *buffer, size_t len); + typedef void (*pci_set_irq_fn)(qemu_irq *pic, int irq_num, int level); typedef int (*pci_map_irq_fn)(PCIDevice *pci_dev, int irq_num); PCIBus *pci_register_bus(pci_set_irq_fn set_irq, pci_map_irq_fn map_irq, diff --git a/iovector.c b/iovector.c new file mode 100644 index 0000000..056a86e --- /dev/null +++ b/iovector.c @@ -0,0 +1,144 @@ +/* + * IO Vectors + * + * Copyright IBM, Corp. 2008 + * + * Authors: + * Anthony Liguori <ali...@us...> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include "qemu-common.h" +#include "iovector.h" + +static size_t iovector_rw(void *buffer, size_t size, IOVector *iov, int read) +{ + uint8_t *ptr = buffer; + size_t offset = 0; + int i; + + for (i = 0; i < iov->num; i++) { + size_t len; + + len = MIN(iov->sg[i].len, size - offset); + + if (read) + iov->read(iov->opaque, iov->sg[i].base, ptr + offset, len); + else + iov->write(iov->opaque, iov->sg[i].base, ptr + offset, len); + + offset += len; + } + + return offset; +} + +size_t memcpy_from_iovector(void *buffer, size_t offset, size_t size, + const IOVector *iov) +{ + IOVector *sg; + size_t len; + + if (offset) + sg = iovector_trim(iov, offset, size); + else + sg = (IOVector *)iov; + + len = iovector_rw(buffer, size, sg, 1); + + if (offset) + iovector_free(sg); + + return len; +} + +size_t memcpy_to_iovector(IOVector *iovec, size_t offset, size_t size, + const void *buffer) +{ + IOVector *sg; + size_t len; + + if (offset) + sg = iovector_trim(iovec, offset, size); + else + sg = iovec; + + len = iovector_rw((void *)buffer, size, sg, 0); + + if (offset) + iovector_free(sg); + + return len; +} + +IOVector *iovector_new(int num, DMAReadHandler *read, DMAWriteHandler *write, + void *opaque) +{ + IOVector *ret; + + ret = qemu_malloc(sizeof(IOVector) + sizeof(IOVectorElement) * num); 
+ if (ret == NULL) + return NULL; + + ret->num = num; + ret->read = read; + ret->write = write; + ret->opaque = opaque; + + return ret; +} + +void iovector_free(IOVector *iov) +{ + qemu_free(iov); +} + +IOVector *iovector_trim(const IOVector *iov, size_t offset, size_t size) +{ + IOVector *ret; + size_t off, total_size; + int i; + + ret = iovector_new(iov->num, iov->read, iov->write, iov->opaque); + if (ret == NULL) + return NULL; + + total_size = 0; + ret->num = 0; + + off = 0; + for (i = 0; i < iov->num; i++) { + if (off >= offset || offset < (off + iov->sg[i].len)) { + size_t fudge = 0; + if (off < offset) + fudge = offset - off; + + ret->sg[ret->num].base = iov->sg[i].base + fudge; + ret->sg[ret->num].len = MIN(iov->sg[i].len - fudge, + size - total_size); + total_size += ret->sg[ret->num].len; + ret->num++; + + if (total_size == size) + break; + } + + off += iov->sg[i].len; + } + + return ret; +} + +size_t iovector_size(const IOVector *iov) +{ + size_t size = 0; + int i; + + for (i = 0; i < iov->num; i++) + size += iov->sg[i].len; + + return size; +} diff --git a/iovector.h b/iovector.h new file mode 100644 index 0000000..f40f0a0 --- /dev/null +++ b/iovector.h @@ -0,0 +1,63 @@ +/* + * IO Vectors + * + * Copyright IBM, Corp. 2008 + * + * Authors: + * Anthony Liguori <ali...@us...> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#ifndef _QEMU_IOVECTOR_H +#define _QEMU_IOVECTOR_H + +typedef struct IOVectorElement IOVectorElement; + +typedef void (DMAReadHandler)(void *opaque, uint64_t addr, + void *data, size_t len); + +typedef void (DMAWriteHandler)(void *opaque, uint64_t addr, + const void *data, size_t len); + +typedef struct IOVector +{ + DMAWriteHandler *write; + DMAReadHandler *read; + void *opaque; + + int num; + struct IOVectorElement { + uint64_t base; + size_t len; + } sg[0]; +} IOVector; + +/* Copy from an IOVector to a flat buffer. 
Be careful to pass in a fully + * translated IOVector here. */ +size_t memcpy_from_iovector(void *buffer, size_t offset, size_t size, + const IOVector *iov); + +/* Copy to an IOVector from a flat buffer. Be careful to pass in a fully + * translated IOVector here. */ +size_t memcpy_to_iovector(IOVector *iovec, size_t offset, size_t size, + const void *buffer); + +/* Return a new IOVector that's a subset of the passed in IOVector. It should + * be freed with iovector_free when you are done with it. */ +IOVector *iovector_trim(const IOVector *iov, size_t offset, size_t size); + +/* Returns the size of an IOVector in bytes */ +size_t iovector_size(const IOVector *iov); + +/* Returns a new IOVector with num elements. iov->num will be set to num on + * return */ +IOVector *iovector_new(int num, DMAReadHandler *read, DMAWriteHandler *write, + void *opaque); + +/* Frees an IOVector */ +void iovector_free(IOVector *iov); + +#endif diff --git a/net.h b/net.h index 2dfff8d..0b3a155 100644 --- a/net.h +++ b/net.h @@ -1,6 +1,8 @@ #ifndef QEMU_NET_H #define QEMU_NET_H +#include "iovector.h" + /* VLANs support */ typedef struct VLANClientState VLANClientState; @@ -30,6 +32,7 @@ VLANClientState *qemu_new_vlan_client(VLANState *vlan, void *opaque); int qemu_can_send_packet(VLANClientState *vc); void qemu_send_packet(VLANClientState *vc, const uint8_t *buf, int size); +void qemu_sendv_packet(VLANClientState *vc, const IOVector *iovec); void qemu_handler_true(void *opaque); void do_info_network(void); diff --git a/vl.c b/vl.c index 318eb35..821c05d 100644 --- a/vl.c +++ b/vl.c @@ -3731,6 +3731,22 @@ void qemu_send_packet(VLANClientState *vc1, const uint8_t *buf, int size) } } +void qemu_sendv_packet(VLANClientState *vc, const IOVector *iovec) +{ + size_t size; + uint8_t *data; + + size = iovector_size(iovec); + data = qemu_malloc(size); + if (data == NULL) + return; + + memcpy_from_iovector(data, 0, size, iovec); + qemu_send_packet(vc, data, size); + + qemu_free(data); +} + #if 
defined(CONFIG_SLIRP) /* slirp network adapter */ |
From: Anthony L. <ali...@us...> - 2008-04-15 22:11:36
|
This patch implements the virtio balloon driver backend. A user can interact with the balloon driver using a newly introduced monitor command 'balloon'. Ballooning is used to request the guest to stop using a certain portion of its memory. The guest notifies the host of this memory so the host can immediately reallocate it. Ballooning is implemented within QEMU via the madvise() system call. This is for Linux hosts only ATM but it should be easy enough to add the right code for other hosts. If you balloon down sufficiently, you can see the resident memory of the QEMU instance decrease when using this driver. Signed-off-by: Anthony Liguori <ali...@us...> diff --git a/Makefile.target b/Makefile.target index 4d695c7..dead372 100644 --- a/Makefile.target +++ b/Makefile.target @@ -535,7 +535,7 @@ OBJS += rtl8139.o OBJS += e1000.o # virtio devices -OBJS += virtio.o virtio-net.o virtio-blk.o +OBJS += virtio.o virtio-net.o virtio-blk.o virtio-balloon.o ifeq ($(TARGET_BASE_ARCH), i386) # Hardware support diff --git a/balloon.h b/balloon.h new file mode 100644 index 0000000..60b4a5d --- /dev/null +++ b/balloon.h @@ -0,0 +1,27 @@ +/* + * Balloon + * + * Copyright IBM, Corp. 2008 + * + * Authors: + * Anthony Liguori <ali...@us...> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. 
+ * + */ + +#ifndef _QEMU_BALLOON_H +#define _QEMU_BALLOON_H + +#include "cpu-defs.h" + +typedef ram_addr_t (QEMUBalloonEvent)(void *opaque, ram_addr_t target); + +void qemu_add_balloon_handler(QEMUBalloonEvent *func, void *opaque); + +void qemu_balloon(ram_addr_t target); + +ram_addr_t qemu_balloon_status(void); + +#endif diff --git a/hw/pc.c b/hw/pc.c index 2da9413..8d3401a 100644 --- a/hw/pc.c +++ b/hw/pc.c @@ -1023,6 +1023,8 @@ static void pc_init1(int ram_size, int vga_ram_size, } } + if (pci_enabled) + virtio_balloon_init(pci_bus); } static void pc_init_pci(int ram_size, int vga_ram_size, diff --git a/hw/pc.h b/hw/pc.h index c828cda..67583f7 100644 --- a/hw/pc.h +++ b/hw/pc.h @@ -146,4 +146,7 @@ void isa_ne2000_init(int base, qemu_irq irq, NICInfo *nd); /* virtio-blk.c */ void *virtio_blk_init(PCIBus *bus, BlockDriverState *bs); +/* virtio-balloon.h */ +void *virtio_balloon_init(PCIBus *bus); + #endif diff --git a/hw/virtio-balloon.c b/hw/virtio-balloon.c new file mode 100644 index 0000000..d97f4b2 --- /dev/null +++ b/hw/virtio-balloon.c @@ -0,0 +1,134 @@ +/* + * Virtio Block Device + * + * Copyright IBM, Corp. 2008 + * + * Authors: + * Anthony Liguori <ali...@us...> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include "qemu-common.h" +#include "virtio.h" +#include "pc.h" +#include "sysemu.h" +#include "cpu.h" +#include "balloon.h" +#include "virtio-balloon.h" + +#if defined(__linux__) +#include <sys/mman.h> +#endif + +typedef struct VirtIOBalloon +{ + VirtIODevice vdev; + VirtQueue *ivq, *dvq; + uint32_t num_pages; + uint32_t actual; +} VirtIOBalloon; + +static VirtIOBalloon *to_virtio_balloon(VirtIODevice *vdev) +{ + return (VirtIOBalloon *)vdev; +} + +static void balloon_page(void *addr, int deflate) +{ +#if defined(__linux__) + madvise(addr, TARGET_PAGE_SIZE, deflate ? 
MADV_WILLNEED : MADV_DONTNEED); +#endif +} + +static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq) +{ + VirtIOBalloon *s = to_virtio_balloon(vdev); + VirtQueueElement *elem; + + while ((elem = virtqueue_pop(vq)) != NULL) { + size_t offset = 0; + uint32_t pfn; + + while (memcpy_from_iovector(&pfn, offset, 4, elem->out) == 4) { + ram_addr_t pa; + ram_addr_t addr; + + pa = (ram_addr_t)ldl_p(&pfn) << TARGET_PAGE_BITS; + offset += 4; + + addr = cpu_get_physical_page_desc(pa); + if ((addr & ~TARGET_PAGE_MASK) != IO_MEM_RAM) + continue; + + balloon_page(phys_ram_base + addr, !!(vq == s->dvq)); + } + + virtqueue_push(vq, elem, offset); + virtio_notify(vdev, vq); + } +} + +static void virtio_balloon_get_config(VirtIODevice *vdev, uint8_t *config_data) +{ + VirtIOBalloon *dev = to_virtio_balloon(vdev); + struct virtio_balloon_config config; + + config.num_pages = cpu_to_le32(dev->num_pages); + config.actual = cpu_to_le32(dev->actual); + + memcpy(config_data, &config, 8); +} + +static void virtio_balloon_set_config(VirtIODevice *vdev, + const uint8_t *config_data) +{ + VirtIOBalloon *dev = to_virtio_balloon(vdev); + struct virtio_balloon_config config; + memcpy(&config, config_data, 8); + dev->actual = config.actual; +} + +static uint32_t virtio_balloon_get_features(VirtIODevice *vdev) +{ + return 0; +} + +static ram_addr_t virtio_balloon_to_target(void *opaque, ram_addr_t target) +{ + VirtIOBalloon *dev = opaque; + + if (target > ram_size) + target = ram_size; + + if (target) { + dev->num_pages = (ram_size - target) >> TARGET_PAGE_BITS; + virtio_notify_config(&dev->vdev); + } + + return ram_size - (dev->actual << TARGET_PAGE_BITS); +} + +void *virtio_balloon_init(PCIBus *bus) +{ + VirtIOBalloon *s; + + s = (VirtIOBalloon *)virtio_init_pci(bus, "virtio-balloon", + 6900, 0x1002, + 0, VIRTIO_ID_BALLOON, + 0x05, 0x00, 0x00, + 8, sizeof(VirtIOBalloon)); + + s->vdev.get_config = virtio_balloon_get_config; + s->vdev.set_config = virtio_balloon_set_config; + 
s->vdev.get_features = virtio_balloon_get_features; + + s->ivq = virtio_add_queue(&s->vdev, 128, virtio_balloon_handle_output); + s->dvq = virtio_add_queue(&s->vdev, 128, virtio_balloon_handle_output); + + qemu_add_balloon_handler(virtio_balloon_to_target, s); + + return &s->vdev; +} diff --git a/hw/virtio-balloon.h b/hw/virtio-balloon.h new file mode 100644 index 0000000..27d6985 --- /dev/null +++ b/hw/virtio-balloon.h @@ -0,0 +1,34 @@ +/* + * Virtio Support + * + * Copyright IBM, Corp. 2007-2008 + * + * Authors: + * Anthony Liguori <ali...@us...> + * Rusty Russell <ru...@ru...> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#ifndef _QEMU_VIRTIO_BALLOON_H +#define _QEMU_VIRTIO_BALLOON_H + +/* from Linux's linux/virtio_balloon.h */ + +/* The ID for virtio_balloon */ +#define VIRTIO_ID_BALLOON 5 + +/* The feature bitmap for virtio balloon */ +#define VIRTIO_BALLOON_F_MUST_TELL_HOST 0 /* Tell before reclaiming pages */ + +struct virtio_balloon_config +{ + /* Number of pages host wants Guest to give up. */ + uint32_t num_pages; + /* Number of pages we've actually got in balloon. 
*/ + uint32_t actual; +}; + +#endif diff --git a/monitor.c b/monitor.c index 025025b..7f4c096 100644 --- a/monitor.c +++ b/monitor.c @@ -34,6 +34,7 @@ #include "block.h" #include "audio/audio.h" #include "disas.h" +#include "balloon.h" #include <dirent.h> #ifdef CONFIG_PROFILER @@ -1257,6 +1258,23 @@ static void do_wav_capture (const char *path, } #endif +static void do_balloon(int value) +{ + ram_addr_t target = value; + qemu_balloon(target << 20); +} + +static void do_info_balloon(void) +{ + ram_addr_t actual; + + actual = qemu_balloon_status(); + if (actual == 0) + term_printf("Ballooning not activated in VM\n"); + else + term_printf("balloon: actual=%d\n", (int)(actual >> 20)); +} + static term_cmd_t term_cmds[] = { { "help|?", "s?", do_help, "[cmd]", "show the help" }, @@ -1328,6 +1346,8 @@ static term_cmd_t term_cmds[] = { "capture index", "stop capture" }, { "memsave", "lis", do_memory_save, "addr size file", "save to disk virtual memory dump starting at 'addr' of size 'size'", }, + { "balloon", "i", do_balloon, + "target", "request VM to change it's memory allocation (in MB)" }, { NULL, NULL, }, }; @@ -1388,6 +1408,8 @@ static term_cmd_t info_cmds[] = { { "slirp", "", do_info_slirp, "", "show SLIRP statistics", }, #endif + { "balloon", "", do_info_balloon, + "", "show balloon information" }, { NULL, NULL, }, }; diff --git a/vl.c b/vl.c index 4c11be6..eca3377 100644 --- a/vl.c +++ b/vl.c @@ -37,6 +37,7 @@ #include "qemu-char.h" #include "block.h" #include "audio/audio.h" +#include "balloon.h" #include <unistd.h> #include <fcntl.h> @@ -482,6 +483,31 @@ void hw_error(const char *fmt, ...) 
va_end(ap); abort(); } + +/***************/ +/* ballooning */ + +static QEMUBalloonEvent *qemu_balloon_event; +void *qemu_balloon_event_opaque; + +void qemu_add_balloon_handler(QEMUBalloonEvent *func, void *opaque) +{ + qemu_balloon_event = func; + qemu_balloon_event_opaque = opaque; +} + +void qemu_balloon(ram_addr_t target) +{ + if (qemu_balloon_event) + qemu_balloon_event(qemu_balloon_event_opaque, target); +} + +ram_addr_t qemu_balloon_status(void) +{ + if (qemu_balloon_event) + return qemu_balloon_event(qemu_balloon_event_opaque, 0); + return 0; +} /***********************************************************/ /* keyboard/mouse */ |
From: Anthony L. <ali...@us...> - 2008-04-15 22:11:36
|
This patch implements the virtio block driver backend. Signed-off-by: Anthony Liguori <ali...@us...> diff --git a/Makefile.target b/Makefile.target index ea632fa..4d695c7 100644 --- a/Makefile.target +++ b/Makefile.target @@ -535,7 +535,7 @@ OBJS += rtl8139.o OBJS += e1000.o # virtio devices -OBJS += virtio.o virtio-net.o +OBJS += virtio.o virtio-net.o virtio-blk.o ifeq ($(TARGET_BASE_ARCH), i386) # Hardware support diff --git a/hw/pc.c b/hw/pc.c index 4fec2d4..2da9413 100644 --- a/hw/pc.c +++ b/hw/pc.c @@ -1011,6 +1011,18 @@ static void pc_init1(int ram_size, int vga_ram_size, } } } + + /* Add virtio block devices */ + if (pci_enabled) { + int index; + int unit_id = 0; + + while ((index = drive_get_index(IF_VIRTIO, 0, unit_id)) != -1) { + virtio_blk_init(pci_bus, drives_table[index].bdrv); + unit_id++; + } + } + } static void pc_init_pci(int ram_size, int vga_ram_size, diff --git a/hw/pc.h b/hw/pc.h index 9f83050..c828cda 100644 --- a/hw/pc.h +++ b/hw/pc.h @@ -143,4 +143,7 @@ void pci_piix4_ide_init(PCIBus *bus, BlockDriverState **hd_table, int devfn, void isa_ne2000_init(int base, qemu_irq irq, NICInfo *nd); +/* virtio-blk.c */ +void *virtio_blk_init(PCIBus *bus, BlockDriverState *bs); + #endif diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c new file mode 100644 index 0000000..534e819 --- /dev/null +++ b/hw/virtio-blk.c @@ -0,0 +1,112 @@ +/* + * Virtio Block Device + * + * Copyright IBM, Corp. 2007 + * + * Authors: + * Anthony Liguori <ali...@us...> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. 
+ * + */ + +#include "virtio.h" +#include "block.h" +#include "block_int.h" +#include "pc.h" +#include "virtio-blk.h" + +typedef struct VirtIOBlock +{ + VirtIODevice vdev; + BlockDriverState *bs; +} VirtIOBlock; + +static VirtIOBlock *to_virtio_blk(VirtIODevice *vdev) +{ + return (VirtIOBlock *)vdev; +} + +static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq) +{ + VirtIOBlock *s = to_virtio_blk(vdev); + VirtQueueElement *elem; + + while ((elem = virtqueue_pop(vq)) != 0) { + struct virtio_blk_outhdr out; + struct virtio_blk_inhdr in; + unsigned int wlen; + size_t in_size, out_size; + + out_size = iovector_size(elem->out); + in_size = iovector_size(elem->in); + + memcpy_from_iovector(&out, 0, sizeof(out), elem->out); + + if (out.type & VIRTIO_BLK_T_SCSI_CMD) { + wlen = sizeof(in); + in.status = VIRTIO_BLK_S_UNSUPP; + } else if (out.type & VIRTIO_BLK_T_OUT) { + IOVector *sg; + + sg = iovector_trim(elem->out, sizeof(out), + out_size - sizeof(out)); + bdrv_writev(s->bs, out.sector, sg); + iovector_free(sg); + + wlen = sizeof(in); + in.status = VIRTIO_BLK_S_OK; + } else { + IOVector *sg; + + sg = iovector_trim(elem->in, 0, in_size - sizeof(in)); + bdrv_readv(s->bs, out.sector, sg); + iovector_free(sg); + + wlen = in_size; + in.status = VIRTIO_BLK_S_OK; + } + + memcpy_to_iovector(elem->in, in_size - sizeof(in), + sizeof(in), &in); + + virtqueue_push(vq, elem, wlen); + virtio_notify(vdev, vq); + } +} + +static void virtio_blk_get_config(VirtIODevice *vdev, uint8_t *config) +{ + VirtIOBlock *s = to_virtio_blk(vdev); + struct virtio_blk_config blkcfg; + int64_t capacity; + + bdrv_get_geometry(s->bs, &capacity); + blkcfg.capacity = cpu_to_le64(capacity); + blkcfg.seg_max = cpu_to_le32(128 - 2); + memcpy(config, &blkcfg, sizeof(blkcfg)); +} + +static uint32_t virtio_blk_get_features(VirtIODevice *vdev) +{ + return (1 << VIRTIO_BLK_F_SEG_MAX); +} + +void *virtio_blk_init(PCIBus *bus, BlockDriverState *bs) +{ + VirtIOBlock *s; + + s = (VirtIOBlock 
*)virtio_init_pci(bus, "virtio-blk", 6900, 0x1001, + 0, VIRTIO_ID_BLOCK, + 0x01, 0x80, 0x00, + 16, sizeof(VirtIOBlock)); + + s->vdev.get_config = virtio_blk_get_config; + s->vdev.get_features = virtio_blk_get_features; + s->bs = bs; + + virtio_add_queue(&s->vdev, 128, virtio_blk_handle_output); + + return s; +} diff --git a/hw/virtio-blk.h b/hw/virtio-blk.h new file mode 100644 index 0000000..290ff5b --- /dev/null +++ b/hw/virtio-blk.h @@ -0,0 +1,66 @@ +/* + * Virtio Support + * + * Copyright IBM, Corp. 2007-2008 + * + * Authors: + * Anthony Liguori <ali...@us...> + * Rusty Russell <ru...@ru...> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#ifndef _QEMU_VIRTIO_BLK_H +#define _QEMU_VIRTIO_BLK_H + +/* from Linux's linux/virtio_blk.h */ + +/* The ID for virtio_block */ +#define VIRTIO_ID_BLOCK 2 + +/* Feature bits */ +#define VIRTIO_BLK_F_BARRIER 0 /* Does host support barriers? */ +#define VIRTIO_BLK_F_SIZE_MAX 1 /* Indicates maximum segment size */ +#define VIRTIO_BLK_F_SEG_MAX 2 /* Indicates maximum # of segments */ + +struct virtio_blk_config +{ + uint64_t capacity; + uint32_t size_max; + uint32_t seg_max; +}; + +/* These two define direction. */ +#define VIRTIO_BLK_T_IN 0 +#define VIRTIO_BLK_T_OUT 1 + +/* This bit says it's a scsi command, not an actual read or write. */ +#define VIRTIO_BLK_T_SCSI_CMD 2 + +/* Barrier before this op. */ +#define VIRTIO_BLK_T_BARRIER 0x80000000 + +/* This is the first element of the read scatter-gather list. */ +struct virtio_blk_outhdr +{ + /* VIRTIO_BLK_T* */ + uint32_t type; + /* io priority. */ + uint32_t ioprio; + /* Sector (ie. 
512 byte offset) */ + uint64_t sector; +}; + +#define VIRTIO_BLK_S_OK 0 +#define VIRTIO_BLK_S_IOERR 1 +#define VIRTIO_BLK_S_UNSUPP 2 + +/* This is the first element of the write scatter-gather list */ +struct virtio_blk_inhdr +{ + unsigned char status; +}; + +#endif diff --git a/sysemu.h b/sysemu.h index 0f18e04..0078190 100644 --- a/sysemu.h +++ b/sysemu.h @@ -119,7 +119,7 @@ extern unsigned int nb_prom_envs; #endif typedef enum { - IF_IDE, IF_SCSI, IF_FLOPPY, IF_PFLASH, IF_MTD, IF_SD + IF_IDE, IF_SCSI, IF_FLOPPY, IF_PFLASH, IF_MTD, IF_SD, IF_VIRTIO } BlockInterfaceType; typedef struct DriveInfo { diff --git a/vl.c b/vl.c index 821c05d..4c11be6 100644 --- a/vl.c +++ b/vl.c @@ -5050,6 +5050,9 @@ static int drive_init(struct drive_opt *arg, int snapshot, } else if (!strcmp(buf, "sd")) { type = IF_SD; max_devs = 0; + } else if (!strcmp(buf, "virtio")) { + type = IF_VIRTIO; + max_devs = 0; } else { fprintf(stderr, "qemu: '%s' unsupported bus type '%s'\n", str, buf); return -1; @@ -5241,6 +5244,7 @@ static int drive_init(struct drive_opt *arg, int snapshot, break; case IF_PFLASH: case IF_MTD: + case IF_VIRTIO: break; } if (!file[0]) |