You can subscribe to this list here.
2006 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
|
Aug
|
Sep
|
Oct
(33) |
Nov
(325) |
Dec
(320) |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2007 |
Jan
(484) |
Feb
(438) |
Mar
(407) |
Apr
(713) |
May
(831) |
Jun
(806) |
Jul
(1023) |
Aug
(1184) |
Sep
(1118) |
Oct
(1461) |
Nov
(1224) |
Dec
(1042) |
2008 |
Jan
(1449) |
Feb
(1110) |
Mar
(1428) |
Apr
(1643) |
May
(682) |
Jun
|
Jul
|
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
From: Marcelo T. <mto...@re...> - 2008-04-17 20:32:50
|
Introduce a QEMUDevice type to allow global knowledge of present devices. At the moment it's only used for locking purposes, but not limited to that. Index: kvm-userspace.io/qemu/qemu-common.h =================================================================== --- kvm-userspace.io.orig/qemu/qemu-common.h +++ kvm-userspace.io/qemu/qemu-common.h @@ -27,6 +27,8 @@ #define ENOMEDIUM ENODEV #endif +#include "qemu-device.h" + #ifdef _WIN32 #define WIN32_LEAN_AND_MEAN #include <windows.h> Index: kvm-userspace.io/qemu/vl.c =================================================================== --- kvm-userspace.io.orig/qemu/vl.c +++ kvm-userspace.io/qemu/vl.c @@ -32,6 +32,7 @@ #include "net.h" #include "console.h" #include "sysemu.h" +#include "qemu-common.h" #include "gdbstub.h" #include "qemu-timer.h" #include "qemu-char.h" @@ -270,6 +271,13 @@ void decorate_application_name(char *app } } +int qemu_register_device(QEMUDevice *qemu_device) +{ + qemu_mutex_init(&qemu_device->lock); + + return 0; +} + /***********************************************************/ /* x86 ISA bus support */ Index: kvm-userspace.io/qemu/hw/hw.h =================================================================== --- kvm-userspace.io.orig/qemu/hw/hw.h +++ kvm-userspace.io/qemu/hw/hw.h @@ -99,6 +99,8 @@ typedef void QEMUResetHandler(void *opaq void qemu_register_reset(QEMUResetHandler *func, void *opaque); +int qemu_register_device(QEMUDevice *qemu_device); + /* These should really be in isa.h, but are here to make pc.h happy. 
*/ typedef void (IOPortWriteFunc)(void *opaque, uint32_t address, uint32_t data); typedef uint32_t (IOPortReadFunc)(void *opaque, uint32_t address); Index: kvm-userspace.io/qemu/cpu-all.h =================================================================== --- kvm-userspace.io.orig/qemu/cpu-all.h +++ kvm-userspace.io/qemu/cpu-all.h @@ -17,6 +17,7 @@ * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include "qemu-device.h" #ifndef CPU_ALL_H #define CPU_ALL_H Index: kvm-userspace.io/qemu/qemu-device.h =================================================================== --- /dev/null +++ kvm-userspace.io/qemu/qemu-device.h @@ -0,0 +1,85 @@ +#ifndef QEMU_DEVICE_H +#define QEMU_DEVICE_H +#include <unistd.h> +#if defined (_POSIX_THREADS) +#include <stdlib.h> +#include <pthread.h> +#define DEBUG_PTHREADS +#ifndef DEBUG_PTHREADS +#define qemu_mutex_t pthread_mutex_t +#define qemu_mutex_lock(mutex) pthread_mutex_lock(mutex) +#define qemu_mutex_unlock(mutex) pthread_mutex_unlock(mutex) +#define assert_is_locked(mutex) do { } while (0) +static inline void qemu_mutex_init(qemu_mutex_t *mutex) +{ + pthread_mutex_init(mutex, NULL); +} +#else +#include <execinfo.h> + +struct qemu_mutex_t { + pthread_mutex_t mutex; + pthread_t owner; +}; + +typedef struct qemu_mutex_t qemu_mutex_t; + +static void print_backtrace(void) +{ + void *buffer[4096]; + int len; + + len = backtrace(buffer, sizeof(buffer)); + backtrace_symbols_fd(buffer, len, 2); +} + +static inline void assert_is_locked(qemu_mutex_t *mutex) +{ + if (mutex->owner != pthread_self()) { + printf("assert failure, not locked!\n"); + print_backtrace(); + exit(0); + } +} + +static inline void qemu_mutex_lock(qemu_mutex_t *mutex) +{ + if (!mutex) { + print_backtrace(); + exit(0); + } + if (mutex->owner == pthread_self()) { + printf("attempting to acquire lock twice!\n"); + print_backtrace(); + exit(0); + } + 
pthread_mutex_lock(&mutex->mutex); + mutex->owner = pthread_self(); +} + +static inline void qemu_mutex_unlock(qemu_mutex_t *mutex) +{ + if (mutex->owner != pthread_self()) { + printf("thread %lx attempting to release lock acquired by %lx\n", + (unsigned long)pthread_self(), (unsigned long)mutex->owner); + print_backtrace(); + exit(0); + } + mutex->owner = 0; + pthread_mutex_unlock(&mutex->mutex); +} + +static inline void qemu_mutex_init(qemu_mutex_t *mutex) +{ + pthread_mutex_init(&mutex->mutex, NULL); + mutex->owner = 0; +} +#endif /* DEBUG_PTHREADS */ +#endif /* _POSIX_THREADS */ + +struct QEMUDevice { + qemu_mutex_t lock; +}; + +typedef struct QEMUDevice QEMUDevice; +#endif /* QEMU_DEVICE_H */ -- |
From: Marcelo T. <mto...@re...> - 2008-04-17 20:32:48
|
cpu_single_env is a global variable, so there is an assumption that only one vcpu can execute QEMU at the same time. Provide a get_cpu_env() wrapper allowing KVM to use its thread local storage vcpu info. It simplifies IO thread handling. Index: kvm-userspace.io/qemu/block-raw-posix.c =================================================================== --- kvm-userspace.io.orig/qemu/block-raw-posix.c +++ kvm-userspace.io/qemu/block-raw-posix.c @@ -247,7 +247,7 @@ static int aio_initialized = 0; static void aio_signal_handler(int signum) { #ifndef QEMU_IMG - CPUState *env = cpu_single_env; + CPUState *env = get_cpu_env(); if (env) { /* stop the currently executing cpu because a timer occured */ cpu_interrupt(env, CPU_INTERRUPT_EXIT); Index: kvm-userspace.io/qemu/cpu-all.h =================================================================== --- kvm-userspace.io.orig/qemu/cpu-all.h +++ kvm-userspace.io/qemu/cpu-all.h @@ -741,7 +741,8 @@ void cpu_abort(CPUState *env, const char __attribute__ ((__format__ (__printf__, 2, 3))) __attribute__ ((__noreturn__)); extern CPUState *first_cpu; -extern CPUState *cpu_single_env; +CPUState *get_cpu_env(void); +void set_cpu_env(CPUState *env); extern int code_copy_enabled; #define CPU_INTERRUPT_EXIT 0x01 /* wants exit from main loop */ Index: kvm-userspace.io/qemu/cpu-exec.c =================================================================== --- kvm-userspace.io.orig/qemu/cpu-exec.c +++ kvm-userspace.io/qemu/cpu-exec.c @@ -306,7 +306,7 @@ int cpu_exec(CPUState *env1) if (cpu_halted(env1) == EXCP_HALTED) return EXCP_HALTED; - cpu_single_env = env1; + set_cpu_env(env1); /* first we save global registers */ #define SAVE_HOST_REGS 1 @@ -743,7 +743,7 @@ int cpu_exec(CPUState *env1) #include "hostregs_helper.h" /* fail safe : never use cpu_single_env outside cpu_exec() */ - cpu_single_env = NULL; + set_cpu_env(NULL); return ret; } Index: kvm-userspace.io/qemu/exec-all.h =================================================================== --- 
kvm-userspace.io.orig/qemu/exec-all.h +++ kvm-userspace.io/qemu/exec-all.h @@ -472,7 +472,6 @@ void tlb_fill(target_ulong addr, int is_ #define ACCESS_TYPE (NB_MMU_MODES + 1) #define MEMSUFFIX _code -#define env cpu_single_env #define DATA_SIZE 1 #include "softmmu_header.h" Index: kvm-userspace.io/qemu/exec.c =================================================================== --- kvm-userspace.io.orig/qemu/exec.c +++ kvm-userspace.io/qemu/exec.c @@ -108,9 +108,20 @@ static int in_migration; static ram_addr_t phys_ram_alloc_offset = 0; CPUState *first_cpu; -/* current CPU in the current thread. It is only valid inside - cpu_exec() */ -CPUState *cpu_single_env; +/* env holder for single threaded SMP emulation */ +static CPUState *curr_cpu_env; + +/* get the environment of the current executing CPU */ +__attribute__((weak)) CPUState *get_cpu_env(void) +{ + return curr_cpu_env; +} + +__attribute__((weak)) void set_cpu_env(CPUState *env) +{ + curr_cpu_env = env; +} + typedef struct PageDesc { /* list of TBs intersecting this ram page */ @@ -686,7 +697,7 @@ void tb_invalidate_phys_page_range(targe int is_cpu_write_access) { int n, current_tb_modified, current_tb_not_found, current_flags; - CPUState *env = cpu_single_env; + CPUState *env = get_cpu_env(); PageDesc *p; TranslationBlock *tb, *tb_next, *current_tb, *saved_tb; target_ulong tb_start, tb_end; @@ -832,7 +843,7 @@ static void tb_invalidate_phys_page(targ PageDesc *p; TranslationBlock *tb, *current_tb; #ifdef TARGET_HAS_PRECISE_SMC - CPUState *env = cpu_single_env; + CPUState *env = get_cpu_env(); #endif addr &= TARGET_PAGE_MASK; @@ -1244,17 +1255,19 @@ void cpu_interrupt(CPUState *env, int ma TranslationBlock *tb; static int interrupt_lock; - env->interrupt_request |= mask; - if (kvm_enabled() && !qemu_kvm_irqchip_in_kernel()) - kvm_update_interrupt_request(env); - - /* if the cpu is currently executing code, we must unlink it and - all the potentially executing TB */ - tb = env->current_tb; - if (tb && 
!testandset(&interrupt_lock)) { - env->current_tb = NULL; - tb_reset_jump_recursive(tb); - interrupt_lock = 0; + if (env) { + env->interrupt_request |= mask; + if (kvm_enabled() && !qemu_kvm_irqchip_in_kernel()) + kvm_update_interrupt_request(env); + + /* if the cpu is currently executing code, we must unlink it and + all the potentially executing TB */ + tb = env->current_tb; + if (tb && !testandset(&interrupt_lock)) { + env->current_tb = NULL; + tb_reset_jump_recursive(tb); + interrupt_lock = 0; + } } } @@ -1834,7 +1847,7 @@ int page_unprotect(target_ulong addr, un addr, vp->phys_addr, vp->prot); #endif if (mprotect((void *)addr, TARGET_PAGE_SIZE, vp->prot) < 0) - cpu_abort(cpu_single_env, "error mprotect addr=0x%lx prot=%d\n", + cpu_abort(get_cpu_env(), "error mprotect addr=0x%lx prot=%d\n", (unsigned long)addr, vp->prot); /* set the dirty bit */ phys_ram_dirty[vp->phys_addr >> TARGET_PAGE_BITS] = 0xff; @@ -2191,6 +2204,8 @@ static void notdirty_mem_writeb(void *op int dirty_flags; ram_addr = addr - (unsigned long)phys_ram_base; dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS]; + CPUState *env = get_cpu_env(); + if (!(dirty_flags & CODE_DIRTY_FLAG)) { #if !defined(CONFIG_USER_ONLY) tb_invalidate_phys_page_fast(ram_addr, 1); @@ -2199,16 +2214,16 @@ static void notdirty_mem_writeb(void *op } stb_p((uint8_t *)(long)addr, val); #ifdef USE_KQEMU - if (cpu_single_env->kqemu_enabled && + if (env->kqemu_enabled && (dirty_flags & KQEMU_MODIFY_PAGE_MASK) != KQEMU_MODIFY_PAGE_MASK) - kqemu_modify_page(cpu_single_env, ram_addr); + kqemu_modify_page(env, ram_addr); #endif dirty_flags |= (0xff & ~CODE_DIRTY_FLAG); phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS] = dirty_flags; /* we remove the notdirty callback only if the code has been flushed */ if (dirty_flags == 0xff) - tlb_set_dirty(cpu_single_env, addr, cpu_single_env->mem_write_vaddr); + tlb_set_dirty(env, addr, env->mem_write_vaddr); } static void notdirty_mem_writew(void *opaque, target_phys_addr_t addr, 
uint32_t val) @@ -2217,6 +2232,7 @@ static void notdirty_mem_writew(void *op int dirty_flags; ram_addr = addr - (unsigned long)phys_ram_base; dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS]; + CPUState *env = get_cpu_env(); if (!(dirty_flags & CODE_DIRTY_FLAG)) { #if !defined(CONFIG_USER_ONLY) tb_invalidate_phys_page_fast(ram_addr, 2); @@ -2225,16 +2241,16 @@ static void notdirty_mem_writew(void *op } stw_p((uint8_t *)(long)addr, val); #ifdef USE_KQEMU - if (cpu_single_env->kqemu_enabled && + if (env->kqemu_enabled && (dirty_flags & KQEMU_MODIFY_PAGE_MASK) != KQEMU_MODIFY_PAGE_MASK) - kqemu_modify_page(cpu_single_env, ram_addr); + kqemu_modify_page(env, ram_addr); #endif dirty_flags |= (0xff & ~CODE_DIRTY_FLAG); phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS] = dirty_flags; /* we remove the notdirty callback only if the code has been flushed */ if (dirty_flags == 0xff) - tlb_set_dirty(cpu_single_env, addr, cpu_single_env->mem_write_vaddr); + tlb_set_dirty(env, addr, env->mem_write_vaddr); } static void notdirty_mem_writel(void *opaque, target_phys_addr_t addr, uint32_t val) @@ -2243,6 +2259,7 @@ static void notdirty_mem_writel(void *op int dirty_flags; ram_addr = addr - (unsigned long)phys_ram_base; dirty_flags = phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS]; + CPUState *env = get_cpu_env(); if (!(dirty_flags & CODE_DIRTY_FLAG)) { #if !defined(CONFIG_USER_ONLY) tb_invalidate_phys_page_fast(ram_addr, 4); @@ -2251,16 +2268,16 @@ static void notdirty_mem_writel(void *op } stl_p((uint8_t *)(long)addr, val); #ifdef USE_KQEMU - if (cpu_single_env->kqemu_enabled && + if (env->kqemu_enabled && (dirty_flags & KQEMU_MODIFY_PAGE_MASK) != KQEMU_MODIFY_PAGE_MASK) - kqemu_modify_page(cpu_single_env, ram_addr); + kqemu_modify_page(env, ram_addr); #endif dirty_flags |= (0xff & ~CODE_DIRTY_FLAG); phys_ram_dirty[ram_addr >> TARGET_PAGE_BITS] = dirty_flags; /* we remove the notdirty callback only if the code has been flushed */ if (dirty_flags == 0xff) - 
tlb_set_dirty(cpu_single_env, addr, cpu_single_env->mem_write_vaddr); + tlb_set_dirty(env, addr, env->mem_write_vaddr); } static CPUReadMemoryFunc *error_mem_read[3] = { @@ -2299,7 +2316,7 @@ static uint32_t watch_mem_readl(void *op address in case of a RAM location. */ static target_ulong check_watchpoint(target_phys_addr_t addr) { - CPUState *env = cpu_single_env; + CPUState *env = get_cpu_env(); target_ulong watch; target_ulong retaddr; int i; @@ -2310,8 +2327,8 @@ static target_ulong check_watchpoint(tar if (((env->mem_write_vaddr ^ watch) & TARGET_PAGE_MASK) == 0) { retaddr = addr - env->watchpoint[i].addend; if (((addr ^ watch) & ~TARGET_PAGE_MASK) == 0) { - cpu_single_env->watchpoint_hit = i + 1; - cpu_interrupt(cpu_single_env, CPU_INTERRUPT_DEBUG); + env->watchpoint_hit = i + 1; + cpu_interrupt(env, CPU_INTERRUPT_DEBUG); break; } } @@ -3100,7 +3117,6 @@ void dump_exec_info(FILE *f, #define MMUSUFFIX _cmmu #define GETPC() NULL -#define env cpu_single_env #define SOFTMMU_CODE_ACCESS #define SHIFT 0 Index: kvm-userspace.io/qemu/hw/apic.c =================================================================== --- kvm-userspace.io.orig/qemu/hw/apic.c +++ kvm-userspace.io/qemu/hw/apic.c @@ -592,7 +592,7 @@ static uint32_t apic_mem_readl(void *opa uint32_t val; int index; - env = cpu_single_env; + env = get_cpu_env(); if (!env) return 0; s = env->apic_state; @@ -671,7 +671,7 @@ static void apic_mem_writel(void *opaque APICState *s; int index; - env = cpu_single_env; + env = get_cpu_env(); if (!env) return; s = env->apic_state; Index: kvm-userspace.io/qemu/hw/dma.c =================================================================== --- kvm-userspace.io.orig/qemu/hw/dma.c +++ kvm-userspace.io/qemu/hw/dma.c @@ -428,9 +428,7 @@ int DMA_write_memory (int nchan, void *b /* request the emulator to transfer a new DMA memory block ASAP */ void DMA_schedule(int nchan) { - CPUState *env = cpu_single_env; - if (env) - cpu_interrupt(env, CPU_INTERRUPT_EXIT); + 
cpu_interrupt(get_cpu_env(), CPU_INTERRUPT_EXIT); } static void dma_reset(void *opaque) Index: kvm-userspace.io/qemu/hw/vmmouse.c =================================================================== --- kvm-userspace.io.orig/qemu/hw/vmmouse.c +++ kvm-userspace.io/qemu/hw/vmmouse.c @@ -167,7 +167,7 @@ static void vmmouse_data(VMMouseState *s static void vmmouse_get_data(uint32_t *data) { - CPUState *env = cpu_single_env; + CPUState *env = get_cpu_env(); data[0] = env->regs[R_EAX]; data[1] = env->regs[R_EBX]; data[2] = env->regs[R_ECX]; data[3] = env->regs[R_EDX]; @@ -179,7 +179,7 @@ static void vmmouse_get_data(uint32_t *d static void vmmouse_set_data(const uint32_t *data) { - CPUState *env = cpu_single_env; + CPUState *env = get_cpu_env(); DPRINTF("set_data = {%x, %x, %x, %x, %x, %x}\n", data[0], data[1], data[2], data[3], data[4], data[5]); Index: kvm-userspace.io/qemu/hw/vmport.c =================================================================== --- kvm-userspace.io.orig/qemu/hw/vmport.c +++ kvm-userspace.io/qemu/hw/vmport.c @@ -54,7 +54,7 @@ void vmport_register(unsigned char comma static uint32_t vmport_ioport_read(void *opaque, uint32_t addr) { VMPortState *s = opaque; - CPUState *env = cpu_single_env; + CPUState *env = get_cpu_env(); unsigned char command; uint32_t eax; uint32_t ret; @@ -85,14 +85,14 @@ static uint32_t vmport_ioport_read(void static uint32_t vmport_cmd_get_version(void *opaque, uint32_t addr) { - CPUState *env = cpu_single_env; + CPUState *env = get_cpu_env(); env->regs[R_EBX] = VMPORT_MAGIC; return 6; } static uint32_t vmport_cmd_ram_size(void *opaque, uint32_t addr) { - CPUState *env = cpu_single_env; + CPUState *env = get_cpu_env(); env->regs[R_EBX] = 0x1177; return ram_size; } Index: kvm-userspace.io/qemu/qemu-kvm-x86.c =================================================================== --- kvm-userspace.io.orig/qemu/qemu-kvm-x86.c +++ kvm-userspace.io/qemu/qemu-kvm-x86.c @@ -584,7 +584,7 @@ int kvm_arch_qemu_init_env(CPUState *cen int 
kvm_arch_halt(void *opaque, int vcpu) { - CPUState *env = cpu_single_env; + CPUState *env = get_cpu_env(); if (!((env->interrupt_request & CPU_INTERRUPT_HARD) && (env->eflags & IF_MASK))) { @@ -596,7 +596,7 @@ int kvm_arch_halt(void *opaque, int vcpu void kvm_arch_pre_kvm_run(void *opaque, int vcpu) { - CPUState *env = cpu_single_env; + CPUState *env = get_cpu_env(); if (!kvm_irqchip_in_kernel(kvm_context)) kvm_set_cr8(kvm_context, vcpu, cpu_get_apic_tpr(env)); @@ -604,8 +604,7 @@ void kvm_arch_pre_kvm_run(void *opaque, void kvm_arch_post_kvm_run(void *opaque, int vcpu) { - CPUState *env = qemu_kvm_cpu_env(vcpu); - cpu_single_env = env; + CPUState *env = get_cpu_env(); env->eflags = kvm_get_interrupt_flag(kvm_context, vcpu) ? env->eflags | IF_MASK : env->eflags & ~IF_MASK; @@ -626,7 +625,7 @@ int kvm_arch_has_work(CPUState *env) int kvm_arch_try_push_interrupts(void *opaque) { - CPUState *env = cpu_single_env; + CPUState *env = get_cpu_env(); int r, irq; if (env->ready_for_interrupt_injection && @@ -657,6 +656,6 @@ void kvm_arch_update_regs_for_sipi(CPUSt int handle_tpr_access(void *opaque, int vcpu, uint64_t rip, int is_write) { - kvm_tpr_access_report(cpu_single_env, rip, is_write); + kvm_tpr_access_report(get_cpu_env(), rip, is_write); return 0; } Index: kvm-userspace.io/qemu/qemu-kvm.c =================================================================== --- kvm-userspace.io.orig/qemu/qemu-kvm.c +++ kvm-userspace.io/qemu/qemu-kvm.c @@ -66,6 +66,40 @@ CPUState *qemu_kvm_cpu_env(int index) return vcpu_info[index].env; } +CPUState *kvm_get_cpu_env(void) +{ + CPUState *env = NULL; + + if (vcpu) + env = vcpu->env; + + return env; +} + +void kvm_set_cpu_env(CPUState *env) +{ + if (vcpu) + vcpu->env = env; +} + +static CPUState *curr_cpu_env; + +void set_cpu_env(CPUState *env) +{ + if (kvm_enabled()) + kvm_set_cpu_env(env); + else + curr_cpu_env = env; +} + +CPUState *get_cpu_env(void) +{ + if (kvm_enabled()) + return kvm_get_cpu_env(); + else + return curr_cpu_env; +} 
+ static void sig_ipi_handler(int n) { } @@ -195,8 +229,6 @@ static int kvm_eat_signal(struct qemu_kv return 0; e = errno; pthread_mutex_lock(&qemu_mutex); - if (env && vcpu) - cpu_single_env = vcpu->env; if (r == -1 && !(errno == EAGAIN || errno == EINTR)) { printf("sigtimedwait: %s\n", strerror(e)); exit(1); @@ -235,7 +267,6 @@ static void kvm_main_loop_wait(CPUState pthread_mutex_unlock(&qemu_mutex); kvm_eat_signals(env, timeout); pthread_mutex_lock(&qemu_mutex); - cpu_single_env = env; vcpu_info[env->cpu_index].signalled = 0; } @@ -316,7 +347,6 @@ static int kvm_main_loop_cpu(CPUState *e kvm_tpr_vcpu_start(env); #endif - cpu_single_env = env; while (1) { while (!has_work(env)) kvm_main_loop_wait(env, 10); @@ -428,9 +458,10 @@ int kvm_main_loop(void) io_thread = pthread_self(); pthread_mutex_unlock(&qemu_mutex); while (1) { + if (get_cpu_env()) + hw_error("io thread has valid env\n"); kvm_eat_signal(&io_signal_table, NULL, 1000); pthread_mutex_lock(&qemu_mutex); - cpu_single_env = NULL; main_loop_wait(0); if (qemu_shutdown_requested()) break; @@ -449,7 +480,7 @@ int kvm_main_loop(void) static int kvm_debug(void *opaque, int vcpu) { - CPUState *env = cpu_single_env; + CPUState *env = get_cpu_env(); env->exception_index = EXCP_DEBUG; return 1; @@ -579,7 +610,7 @@ static struct kvm_callbacks qemu_kvm_ops int kvm_qemu_init() { /* Try to initialize kvm */ - kvm_context = kvm_init(&qemu_kvm_ops, cpu_single_env); + kvm_context = kvm_init(&qemu_kvm_ops, get_cpu_env()); if (!kvm_context) { return -1; } @@ -798,17 +829,16 @@ void qemu_kvm_aio_wait_start(void) void qemu_kvm_aio_wait(void) { - CPUState *cpu_single = cpu_single_env; + CPUState *env = get_cpu_env(); - if (!cpu_single_env) { + /* io thread */ + if (!env) { pthread_mutex_unlock(&qemu_mutex); kvm_eat_signal(&io_signal_table, NULL, 1000); pthread_mutex_lock(&qemu_mutex); - cpu_single_env = NULL; - } else { + /* vcpu thread */ + } else pthread_cond_wait(&qemu_aio_cond, &qemu_mutex); - cpu_single_env = cpu_single; 
- } } void qemu_kvm_aio_wait_end(void) Index: kvm-userspace.io/qemu/softmmu_header.h =================================================================== --- kvm-userspace.io.orig/qemu/softmmu_header.h +++ kvm-userspace.io/qemu/softmmu_header.h @@ -46,12 +46,12 @@ #elif ACCESS_TYPE == (NB_MMU_MODES) -#define CPU_MMU_INDEX (cpu_mmu_index(env)) +#define CPU_MMU_INDEX (cpu_mmu_index(get_cpu_env())) #define MMUSUFFIX _mmu #elif ACCESS_TYPE == (NB_MMU_MODES + 1) -#define CPU_MMU_INDEX (cpu_mmu_index(env)) +#define CPU_MMU_INDEX (cpu_mmu_index(get_cpu_env())) #define MMUSUFFIX _cmmu #else @@ -227,6 +227,7 @@ static inline RES_TYPE glue(glue(ld, USU target_ulong addr; unsigned long physaddr; int mmu_idx; + CPUState *env = get_cpu_env(); addr = ptr; index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1); @@ -248,6 +249,7 @@ static inline int glue(glue(lds, SUFFIX) target_ulong addr; unsigned long physaddr; int mmu_idx; + CPUState *env = get_cpu_env(); addr = ptr; index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1); @@ -273,6 +275,7 @@ static inline void glue(glue(st, SUFFIX) target_ulong addr; unsigned long physaddr; int mmu_idx; + CPUState *env = get_cpu_env(); addr = ptr; index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1); Index: kvm-userspace.io/qemu/target-i386/helper.c =================================================================== --- kvm-userspace.io.orig/qemu/target-i386/helper.c +++ kvm-userspace.io/qemu/target-i386/helper.c @@ -3933,7 +3933,7 @@ void tlb_fill(target_ulong addr, int is_ /* XXX: hack to restore env in all cases, even if not called from generated code */ saved_env = env; - env = cpu_single_env; + env = get_cpu_env(); ret = cpu_x86_handle_mmu_fault(env, addr, is_write, mmu_idx, 1); if (ret) { Index: kvm-userspace.io/qemu/vl.c =================================================================== --- kvm-userspace.io.orig/qemu/vl.c +++ kvm-userspace.io/qemu/vl.c @@ -7577,7 +7577,7 @@ int qemu_bh_poll(void) void qemu_bh_schedule(QEMUBH *bh) { 
- CPUState *env = cpu_single_env; + CPUState *env = get_cpu_env(); if (bh->scheduled) return; bh->scheduled = 1; @@ -7585,9 +7585,8 @@ void qemu_bh_schedule(QEMUBH *bh) first_bh = bh; /* stop the currently executing CPU to execute the BH ASAP */ - if (env) { - cpu_interrupt(env, CPU_INTERRUPT_EXIT); - } + cpu_interrupt(env, CPU_INTERRUPT_EXIT); + if (kvm_enabled()) qemu_kvm_notify_work(); } @@ -7795,22 +7794,20 @@ void qemu_system_reset_request(void) } else { reset_requested = 1; } - if (cpu_single_env) - cpu_interrupt(cpu_single_env, CPU_INTERRUPT_EXIT); + cpu_interrupt(get_cpu_env(), CPU_INTERRUPT_EXIT); } void qemu_system_shutdown_request(void) { shutdown_requested = 1; - if (cpu_single_env) - cpu_interrupt(cpu_single_env, CPU_INTERRUPT_EXIT); + + cpu_interrupt(get_cpu_env(), CPU_INTERRUPT_EXIT); } void qemu_system_powerdown_request(void) { powerdown_requested = 1; - if (cpu_single_env) - cpu_interrupt(cpu_single_env, CPU_INTERRUPT_EXIT); + cpu_interrupt(get_cpu_env(), CPU_INTERRUPT_EXIT); } void main_loop_wait(int timeout) Index: kvm-userspace.io/qemu/softmmu_template.h =================================================================== --- kvm-userspace.io.orig/qemu/softmmu_template.h +++ kvm-userspace.io/qemu/softmmu_template.h @@ -55,6 +55,9 @@ static inline DATA_TYPE glue(io_read, SU { DATA_TYPE res; int index; +#ifdef USE_KQEMU + CPUState *env = get_cpu_env(); +#endif index = (tlb_addr >> IO_MEM_SHIFT) & (IO_MEM_NB_ENTRIES - 1); #if SHIFT <= 2 @@ -83,6 +86,7 @@ DATA_TYPE REGPARM glue(glue(__ld, SUFFIX target_ulong tlb_addr; target_phys_addr_t physaddr; void *retaddr; + CPUState *env = get_cpu_env(); /* test if there is match for unaligned or IO access */ /* XXX: could done more in memory macro in a non portable way */ @@ -137,6 +141,7 @@ static DATA_TYPE glue(glue(slow_ld, SUFF int index, shift; target_phys_addr_t physaddr; target_ulong tlb_addr, addr1, addr2; + CPUState *env = get_cpu_env(); index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1); redo: 
@@ -189,6 +194,7 @@ static inline void glue(io_write, SUFFIX void *retaddr) { int index; + CPUState *env = get_cpu_env(); index = (tlb_addr >> IO_MEM_SHIFT) & (IO_MEM_NB_ENTRIES - 1); env->mem_write_vaddr = tlb_addr; @@ -217,6 +223,7 @@ void REGPARM glue(glue(__st, SUFFIX), MM target_ulong tlb_addr; void *retaddr; int index; + CPUState *env = get_cpu_env(); index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1); redo: @@ -268,6 +275,7 @@ static void glue(glue(slow_st, SUFFIX), target_phys_addr_t physaddr; target_ulong tlb_addr; int index, i; + CPUState *env = get_cpu_env(); index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1); redo: -- |
From: Marcelo T. <mto...@re...> - 2008-04-17 20:32:40
|
Introduce QEMUDevice, making the ioport/iomem->device relationship visible. At the moment it only contains a lock, but could be extended. With it the following is possible: - vcpus to read/write via ioports/iomem while the iothread is working on some unrelated device, or just copying data from the kernel. - vcpus to read/write via ioports/iomem to different devices simultaneously. This patchset is only a proof of concept kind of thing, so only serial+raw image are supported. Tried two benchmarks, iperf and tiobench. With tiobench the reported latency is significantly lower (20%+), but throughput with IDE is only slightly higher. Expect to see larger improvements with a higher performing IO scheme (SCSI still buggy, looking at it). The iperf numbers are pretty good. Performance of UP guests increases slightly but SMP is quite significant. Note that workloads with multiple busy devices (such as databases, web servers) should be the real winners. What is the feeling on this? It's not _that_ intrusive and can be easily NOP'ed out for QEMU. 
iperf -c 4 -i 60 ---- e1000 UP guest: global lock [SUM] 0.0-10.0 sec 156 MBytes 131 Mbits/sec [SUM] 0.0-10.0 sec 151 MBytes 126 Mbits/sec [SUM] 0.0-10.0 sec 151 MBytes 126 Mbits/sec [SUM] 0.0-10.0 sec 151 MBytes 127 Mbits/sec per-device lock [SUM] 0.0-10.0 sec 164 MBytes 137 Mbits/sec [SUM] 0.0-10.0 sec 161 MBytes 135 Mbits/sec [SUM] 0.0-10.0 sec 158 MBytes 133 Mbits/sec [SUM] 0.0-10.0 sec 171 MBytes 143 Mbits/sec SMP guest (4-way) global lock [SUM] 0.0-13.0 sec 402 MBytes 259 Mbits/sec [SUM] 0.0-10.1 sec 469 MBytes 391 Mbits/sec [SUM] 0.0-10.1 sec 477 MBytes 397 Mbits/sec [SUM] 0.0-10.0 sec 469 MBytes 393 Mbits/sec per-device lock [SUM] 0.0-13.0 sec 471 MBytes 304 Mbits/sec [SUM] 0.0-10.2 sec 532 MBytes 439 Mbits/sec [SUM] 0.0-10.1 sec 510 MBytes 423 Mbits/sec [SUM] 0.0-10.1 sec 529 MBytes 441 Mbits/sec ----- virtio-net UP guest: global lock [SUM] 0.0-13.0 sec 192 MBytes 124 Mbits/sec [SUM] 0.0-10.0 sec 213 MBytes 178 Mbits/sec [SUM] 0.0-10.0 sec 213 MBytes 178 Mbits/sec [SUM] 0.0-10.0 sec 213 MBytes 178 Mbits/sec per-device lock [SUM] 0.0-13.0 sec 193 MBytes 125 Mbits/sec [SUM] 0.0-10.0 sec 210 MBytes 176 Mbits/sec [SUM] 0.0-10.0 sec 218 MBytes 183 Mbits/sec [SUM] 0.0-10.0 sec 216 MBytes 181 Mbits/sec SMP guest: global lock [SUM] 0.0-13.0 sec 446 MBytes 288 Mbits/sec [SUM] 0.0-10.0 sec 521 MBytes 437 Mbits/sec [SUM] 0.0-10.0 sec 525 MBytes 440 Mbits/sec [SUM] 0.0-10.0 sec 533 MBytes 446 Mbits/sec per-device lock [SUM] 0.0-13.0 sec 512 MBytes 331 Mbits/sec [SUM] 0.0-10.0 sec 617 MBytes 517 Mbits/sec [SUM] 0.0-10.1 sec 631 MBytes 527 Mbits/sec [SUM] 0.0-10.0 sec 626 MBytes 524 Mbits/sec -- |
From: Anthony L. <ali...@us...> - 2008-04-17 20:07:08
|
Blue Swirl wrote: > > I fixed the bug, now pcnet works. Performance is improved by a few > percent. The problem was that the vector was not freed. Maybe dynamic > allocation is a bit fragile. In this case, the length of the vector is > known, so it could be allocated once at init time. But would this > work? > For you, yes, but not for me. virtio scatter/gather lists can be very long. The API tries not to make assumptions about who's allocating what so you should be able to get away without a dynamic allocation if you were sufficiently motivated. > The next step would be to add a vector version for packet receive. For > ESP/SCSI, in addition to bdrv_readv/writev, AIO versions would need to > be added. Last year I made a patch (attached) that made SLIRP use my > version of IOVector, I could update it to this model. > Yes, the vector version of packet receive is tough. I'll take a look at your patch. Basically, you need to associate a set of RX vectors with each VLANClientState and then when it comes time to deliver a packet to the VLAN, before calling fd_read, see if there is an RX vector available for the client. In the case of tap, I want to optimize further and do the initial readv() to one of the clients RX buffers and then copy that RX buffer to the rest of the clients if necessary. Regards, Anthony Liguori >>> IMHO the read/write functions should be a property of the bus so that >>> they are hidden from the device, for pcnet it does not matter as we >>> have to do the swapping anyway. >>> >>> >>> >> For an IOMMU that has a per-device mapping, the read/write functions have >> to operate on a per-device basis. >> > > No, I meant that there could be a bus layer that did the memory access > and provided a specialized version of iovector_new without the > handlers. But I think we can live with this, if things get too ugly we > can add the layering later. > |
From: Anthony L. <ali...@us...> - 2008-04-17 20:06:10
|
Daniel P. Berrange wrote: > If QEMU can't discover cases where it won't work, what criteria should > the end user use to decide between the impls, or for that matter, what > criteria should a management api/app like libvirt use ? If the only decision > logic is 'try it & benchmark your VM' then its not a particularly useful > option. > > I've basically got a choice of making libvirt always ad '-aio linux' > or never add it at all. My inclination is to the latter since it is > compatible with existing QEMU which has no -aio option. Presumably > '-aio linux' is intended to provide some performance benefit so it'd > be nice to use it. If we can't express some criteria under which it > should be turned on, I can't enable it; where as if you can express > some criteria, then QEMU should apply them automatically. > > Pushing this choice of AIO impls to the app or user invoking QEMU just > does not seem like a win here. > The one thing we could possibly do is detect the cache where we see a block device and then automagically enable cache=off and -aio linux. Without cache=off, -aio linux is not so useful ATM. At the same time though, not all users are going to want to disable the use of the host page cache. It's not necessarily an easy decision either way. For libvirt, I'd recommend just never using -aio linux. We'll have a better AIO option in the near future (based on Rusty's vringfd work) and I'd like to detect and enable that by default. Regards, Anthony Liguori > Dan. > |
From: Daniel P. B. <ber...@re...> - 2008-04-17 20:02:29
|
On Thu, Apr 17, 2008 at 02:41:32PM -0500, Anthony Liguori wrote: > Daniel P. Berrange wrote: > >On Thu, Apr 17, 2008 at 02:26:50PM -0500, Anthony Liguori wrote: > > > >>Posix AIO, especially as used by QEMU, is not very efficient for disk IO. > >>This patch introduces an AIO abstract to allow multiple AIO implements to > >>be > >>used. We can't simply replace posix-aio by linux-aio because linux-aio > >>only > >>works on some filesystems and only with files opened with O_DIRECT. > >> > >>This patch adds a command line option (-aio) to select the AIO > >>implementation > >>to be used. It avoids code motion to allow for easy review. The next > >>patch > >>separates out the posix-aio implementation. > >> > > > >This is not a very pleasant user experiance. They can not & should not be > >expected to figure out which AIO impl works with their particular > >filesystem. > >If the linux-aio impl doesn't work in some cases, then the code should > >detect > >these and automatically fallback to posix-aio. The user should not have to > >use a -aio flag to make it work. > > > > Those cases aren't always discoverable. Linux-aio just falls back to > using synchronous IO. It's pretty terrible. We need a new AIO > interface for Linux (and yes, we're working on this). Once we have > something better, we'll change that to be the default and things will > Just Work for most users. If QEMU can't discover cases where it won't work, what criteria should the end user use to decide between the impls, or for that matter, what criteria should a management api/app like libvirt use ? If the only decision logic is 'try it & benchmark your VM' then it's not a particularly useful option. I've basically got a choice of making libvirt always add '-aio linux' or never add it at all. My inclination is to the latter since it is compatible with existing QEMU which has no -aio option. Presumably '-aio linux' is intended to provide some performance benefit so it'd be nice to use it. 
If we can't express some criteria under which it should be turned on, I can't enable it; whereas if you can express some criteria, then QEMU should apply them automatically. Pushing this choice of AIO impls to the app or user invoking QEMU just does not seem like a win here. Dan. -- |: Red Hat, Engineering, Boston -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://ovirt.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :| |
From: Конкурс <sf...@ab...> - 2008-04-17 19:48:39
|
Размeщeниe госудаpствeнного и муниципального заказа на тоpгах: споpныe вопpосы совpeмeнной пpактики 25 апpeля 2008, г. Москва Пpогpамма сeминаpа - Анализ конкpeтных аpбитpажных дeл, являющихся наиболee pаспpостpанeнными (типичными) в судeбно-аpбитpажной пpактикe оспаpивания pазмeщeния госудаpствeнного и муниципального заказа. - Аналитичeский коммeнтаpий споpных ситуаций, возникающих как в ходe пpовeдeния тоpгов, так и в пpоцeссe заключeния и исполнeния госудаpствeнных (муниципальных) контpактов), заключeнных по их итогам. - Исслeдованиe тоpгов носит комплeксный хаpактep, т.к. затpагиваются и пpоцeссуальныe особeнности pассмотpeния споpов о нeдeйствитeльности тоpгов. - Отвeты на ключeвыe вопpосы сeминаpа аpгумeнтиpуются со ссылкой на научно-пpикладныe исслeдования, матepиалы заpубeжной и мeждунаpодной пpактики пpовeдeния аукционов и конкуpсов. - По каждой тeмe пpогpаммы могут быть пpоанализиpованы пpоблeмныe ситуации из пpактики слушатeлeй, по ходу обсуждeния автоpом даются конкpeтныe peкомeндации. Ключeвыe вопpосы пpогpаммы ∙ Сколько можeт быть побeдитeлeй на тоpгах? Впpавe ли участники объeдинять свои пpeдложeния до или в пpоцeссe пpовeдeния тоpгов? ∙ Какиe лица впpавe заявить иск о пpизнании тоpгов нeдeйствитeльными? Как опpeдeлить "заинтepeсованность" в оспаpивании peзультатов тоpгов? ∙ Что означаeт "нeдeйствитeльность" аукциона или конкуpса: оспоpимость или ничтожность? ∙ Как оцeнить состязатeльность участников? Есть ли основания пpизнать тоpги нeсостоявшимися, eсли участников было двоe? ∙ Чeм обeспeчиваeтся заявка на участиe в аукционe (конкуpсe)? Можeт ли оpганизатоp тоpгов пpинимать от участников банковскиe гаpантии, пpостыe вeксeля или дeнeжныe сpeдства на условиях залога? ∙ В чeм пpинципиальныe отличия пpавового статуса участника тоpгов и участника pазмeщeния заказа? ∙ Каким обpазом фоpмулиpуются кpитepии конкуpсного (аукционного) отбоpа и можно ли от них отступить пpи опpeдeлeнии побeдитeля? 
∙ Что дeлать пpи получeнии одинаковых пpeдложeний от нeскольких участников? ∙ Нeобходимо ли пpоводить повтоpныe тоpги, eсли побeдитeль нe исполняeт заключeнный на тоpгах договоp / уклоняeтся от eго заключeния? ∙ Можeт ли суд, пpизнав факты наpушeния законодатeльства, оставить в силe peзультаты тоpгов на pазмeщeниe госудаpствeнного и муниципального заказа? Опоpныe тeмы пpогpаммы ∙ Пpeимущeства и нeдостатки заключeния договоpов путeм пpовeдeния тоpгов. Основныe pазновидности аукционов и конкуpсов. ∙ Пpавовыe пpоблeмы участия в тоpгах договоpных объeдинeний участников, а такжe аффилиpованных лиц. ∙ Пpоцeссуальныe особeнности pассмотpeния споpов о нeдeйствитeльности тоpгов. ∙ Основания для пpизнания тоpгов нeсостоявшимися. ∙ Пpавовоe значeниe обeспeчeния аукционной или конкуpсной заявки. ∙ Тpeбования законодатeльства к извeщeнию о пpовeдeнии тоpгов, хаpактepистика "нeнадлeжащих" извeщeний. ∙ Опpeдeлeниe кpитepиeв конкуpсного или аукционного отбоpа, пpавовыe pамки pаботы конкуpсной (аукционной) комиссии. ∙ "Аннулиpованиe тоpгов", "Пpизнаниe тоpгов нeдeйствитeльными", "Объявлeниe тоpгов нeсостоявшимися": pазличия пpоцeдуp и их пpавовыe послeдствия. ∙ Соотношeниe администpативного и судeбного способов защиты пpав и законных интepeсов участников pазмeщeния заказа. Пpодолжитeльность обучeния: с 10 до 17 часов (с пepepывом на обeд и кофe-паузу). Мeсто обучeния: г. Москва, 5 мин. пeшком от м. Акадeмичeская. Стоимость обучeния: 4900 pуб. (с НДС). (В стоимость вxодит: pаздаточный матepиал, кофe-пауза, обeд в peстоpанe). Пpи отсутствии возможности посeтить сeминаp, мы пpeдлагаeм пpиобpeсти eго видeовepсию на DVD/CD дискаx или видeокассeтаx (пpилагаeтся автоpский pаздаточный матepиал). Цeна видeокуpса - 3500 pублeй, с учeтом НДС. Для peгистpации на сeминаp нeобxодимо отпpавить нам по факсу: peквизиты оpганизации, тeму и дату сeминаpа, полноe ФИо участников, контактный тeлeфон и факс. 
Для заказа видeокуpса нeобxодимо отпpавить нам по факсу: peквизиты оpганизации, тeму видeокуpса, указать носитeль (ДВД или СД диски), тeлeфон, факс, контактноe лицо и точный адpeс доставки. Получить дополнитeльную инфоpмацию и заpeгистpиpоваться можно: по т/ф: (495) 543-88-46 |
From: Anthony L. <ali...@us...> - 2008-04-17 19:41:42
|
Daniel P. Berrange wrote: > On Thu, Apr 17, 2008 at 02:26:50PM -0500, Anthony Liguori wrote: > >> Posix AIO, especially as used by QEMU, is not very efficient for disk IO. >> This patch introduces an AIO abstract to allow multiple AIO implements to be >> used. We can't simply replace posix-aio by linux-aio because linux-aio only >> works on some filesystems and only with files opened with O_DIRECT. >> >> This patch adds a command line option (-aio) to select the AIO implementation >> to be used. It avoids code motion to allow for easy review. The next patch >> separates out the posix-aio implementation. >> > > This is not a very pleasant user experiance. They can not & should not be > expected to figure out which AIO impl works with their particular filesystem. > If the linux-aio impl doesn't work in some cases, then the code should detect > these and automatically fallback to posix-aio. The user should not have to > use a -aio flag to make it work. > Those cases aren't always discoverable. Linux-aio just falls back to using synchronous IO. It's pretty terrible. We need a new AIO interface for Linux (and yes, we're working on this). Once we have something better, we'll change that to be the default and things will Just Work for most users. Regards, Anthony Liguori > Dan. > |
From: Daniel P. B. <ber...@re...> - 2008-04-17 19:38:11
|
On Thu, Apr 17, 2008 at 02:26:50PM -0500, Anthony Liguori wrote: > Posix AIO, especially as used by QEMU, is not very efficient for disk IO. > This patch introduces an AIO abstract to allow multiple AIO implements to be > used. We can't simply replace posix-aio by linux-aio because linux-aio only > works on some filesystems and only with files opened with O_DIRECT. > > This patch adds a command line option (-aio) to select the AIO implementation > to be used. It avoids code motion to allow for easy review. The next patch > separates out the posix-aio implementation. This is not a very pleasant user experience. They can not & should not be expected to figure out which AIO impl works with their particular filesystem. If the linux-aio impl doesn't work in some cases, then the code should detect these and automatically fall back to posix-aio. The user should not have to use a -aio flag to make it work. Dan. -- |: Red Hat, Engineering, Boston -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://ovirt.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :| |
From: Anthony L. <ali...@us...> - 2008-04-17 19:28:16
|
This patch introduces a Linux-aio backend that is disabled by default. To use this backend effectively, the user should disable caching and select it with the appropriate -aio option. For instance: qemu-system-x86_64 -drive foo.img,cache=off -aio linux There's no universal way to asynchronous wait with linux-aio. At some point, signals were added to signal completion. More recently, and eventfd interface was added. This patch relies on the later. We try hard to detect whether the right support is available in configure to avoid compile failures. Signed-off-by: Anthony Liguori <ali...@us...> diff --git a/Makefile.target b/Makefile.target index f635d68..289887c 100644 --- a/Makefile.target +++ b/Makefile.target @@ -487,6 +487,9 @@ OBJS+=block-raw-win32.o else OBJS+=block-raw-posix.o aio-posix.o endif +ifdef CONFIG_LINUX_AIO +OBJS+=aio-linux.o +endif LIBS+=-lz ifdef CONFIG_ALSA diff --git a/aio-linux.c b/aio-linux.c new file mode 100644 index 0000000..f5c222b --- /dev/null +++ b/aio-linux.c @@ -0,0 +1,210 @@ +/* + * QEMU Linux AIO Support + * + * Copyright IBM, Corp. 2008 + * + * Authors: + * Anthony Liguori <ali...@us...> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. 
+ * + */ + +#include "qemu-common.h" +#include "qemu-char.h" +#include "block.h" +#include "block_int.h" +#include "block-aio.h" +#include "sysemu.h" + +#include <sys/types.h> +#include <sys/syscall.h> +#include <linux/aio_abi.h> + +int eventfd(unsigned int initval) +{ + return syscall(SYS_eventfd, initval); +} + +int io_setup(unsigned nr_reqs, aio_context_t *ctx_id) +{ + return syscall(SYS_io_setup, nr_reqs, ctx_id); +} + +int io_destroy(aio_context_t ctx_id) +{ + return syscall(SYS_io_destroy, ctx_id); +} + +int io_getevents(aio_context_t ctx_id, long min_nr, long nr, + struct io_event *events, struct timespec *timeout) +{ + return syscall(SYS_io_getevents, ctx_id, min_nr, nr, events, timeout); +} + +int io_submit(aio_context_t ctx_id, long nr, struct iocb **iocb) +{ + return syscall(SYS_io_submit, ctx_id, nr, iocb); +} + +int io_cancel(aio_context_t ctx_id, struct iocb *iocb, struct io_event *result) +{ + return syscall(SYS_io_cancel, ctx_id, iocb, result); +} + +typedef struct LinuxAIOCB { + BlockDriverAIOCB common; + struct iocb iocb; +} LinuxAIOCB; + +static int aio_efd; +static aio_context_t aio_ctxt_id; +static int outstanding_requests; + +static BlockDriverAIOCB *la_submit(BlockDriverState *bs, + int fd, int64_t sector_num, + void *buf, int nb_sectors, int write, + BlockDriverCompletionFunc *cb, + void *opaque) +{ + LinuxAIOCB *aiocb; + struct iocb *iocbs[1]; + int err; + + aiocb = qemu_aio_get(bs, cb, opaque); + if (!aiocb) { + printf("returning null??\n"); + return NULL; + } + + if (write) + aiocb->iocb.aio_lio_opcode = IOCB_CMD_PWRITE; + else + aiocb->iocb.aio_lio_opcode = IOCB_CMD_PREAD; + + aiocb->iocb.aio_data = (unsigned long)aiocb; + aiocb->iocb.aio_fildes = fd; + aiocb->iocb.aio_flags = IOCB_FLAG_RESFD; + aiocb->iocb.aio_resfd = aio_efd; + aiocb->iocb.aio_buf = (unsigned long)buf; + aiocb->iocb.aio_nbytes = nb_sectors * 512; + aiocb->iocb.aio_offset = sector_num * 512; + + iocbs[0] = &aiocb->iocb; + + do { + err = io_submit(aio_ctxt_id, 1, iocbs); 
+ } while (err == -1 && errno == EINTR); + + if (err != 1) { + fprintf(stderr, "failed to submit aio request: %m\n"); + exit(1); + } + + outstanding_requests++; + + return &aiocb->common; +} + +static void la_wait(void) +{ + main_loop_wait(10); +} + +static void la_flush(void) +{ + while (outstanding_requests) + la_wait(); +} + +static void la_cancel(BlockDriverAIOCB *baiocb) +{ + LinuxAIOCB *aiocb = (void *)baiocb; + struct io_event result; + int err; + + do { + err = io_cancel(aio_ctxt_id, &aiocb->iocb, &result); + } while (err == -1 && errno == EINTR); + + /* it may have happened... we probably should check and complete */ + + outstanding_requests--; + + qemu_aio_release(aiocb); +} + +static void la_completion(void *opaque) +{ + struct io_event events[256]; + struct timespec ts = {0, 0}; + uint64_t count; + int i, ret; + + do { + ret = read(aio_efd, &count, sizeof(count)); + } while (ret == -1 && errno == EINTR); + + if (ret != 8) { + fprintf(stderr, "bad read from eventfd\n"); + exit(1); + } + + do { + ret = io_getevents(aio_ctxt_id, count, ARRAY_SIZE(events), + events, &ts); + } while (ret == -1 && errno == EINTR); + + if (ret < count) { + fprintf(stderr, "io_getevents failed\n"); + exit(1); + } + + for (i = 0; i < ret; i++) { + LinuxAIOCB *aiocb; + int res; + + aiocb = (LinuxAIOCB *)(unsigned long)events[i].data; + res = events[i].res; + + if (res > 0) + res = 0; + + aiocb->common.cb(aiocb->common.opaque, res); + qemu_aio_release(aiocb); + + outstanding_requests--; + } +} + +static void la_init(void) +{ + aio_efd = eventfd(0); + if (aio_efd == -1) { + fprintf(stderr, "failed to allocate aio fd\n"); + exit(1); + } + + if (io_setup(256, &aio_ctxt_id) == -1) { + fprintf(stderr, "failed to initialize linux aio\n"); + exit(1); + } + + qemu_set_fd_handler2(aio_efd, NULL, la_completion, NULL, NULL); +} + +static AIODriver linux_aio_drv = { + .name = "linux", + .aiocb_size = sizeof(LinuxAIOCB), + .aio_init = la_init, + .aio_wait = la_wait, + .aio_flush = la_flush, + 
.aio_submit = la_submit, + .aio_cancel = la_cancel, +}; + +int linux_aio_init(void) +{ + return qemu_register_aio(&linux_aio_drv); +} diff --git a/block-aio.h b/block-aio.h index 2fe8c58..6e82cb5 100644 --- a/block-aio.h +++ b/block-aio.h @@ -42,5 +42,6 @@ int qemu_set_aio_driver(const char *name); extern AIODriver *aio_drv; int posix_aio_init(void); +int linux_aio_init(void); #endif diff --git a/block.c b/block.c index 44cb747..259bf3a 100644 --- a/block.c +++ b/block.c @@ -1349,6 +1349,11 @@ void bdrv_init(void) bdrv_register(&bdrv_qcow2); bdrv_register(&bdrv_parallels); #ifndef _WIN32 +#ifndef QEMU_IMG +#ifdef CONFIG_LINUX_AIO + linux_aio_init(); +#endif +#endif posix_aio_init(); #endif } diff --git a/configure b/configure index 85cb68a..95fb88f 100755 --- a/configure +++ b/configure @@ -109,6 +109,7 @@ darwin_user="no" build_docs="no" uname_release="" curses="yes" +linux_aio="yes" # OS specific targetos=`uname -s` @@ -326,6 +327,8 @@ for opt do ;; --disable-curses) curses="no" ;; + --disable-linux-aio) linux_aio="no" + ;; *) echo "ERROR: unknown option $opt"; show_help="yes" ;; esac @@ -418,6 +421,7 @@ echo " --enable-fmod enable FMOD audio driver" echo " --enable-dsound enable DirectSound audio driver" echo " --disable-vnc-tls disable TLS encryption for VNC server" echo " --disable-curses disable curses output" +echo " --disable-linux-aio disable Linux AIO support" echo " --enable-system enable all system emulation targets" echo " --disable-system disable all system emulation targets" echo " --enable-linux-user enable all linux usermode emulation targets" @@ -687,6 +691,24 @@ EOF fi fi # test "$curses" +# linux aio probe + +if test "$linux_aio" = "yes" ; then + linux_aio=no + cat > $TMPC <<EOF +#include <linux/aio_abi.h> +#include <unistd.h> +#include <sys/syscall.h> +#ifndef SYS_eventfd +#error No eventfd support +#endif +int main(void) { struct iocb iocb; (void)iocb.aio_resfd; return 0; } +EOF + if $cc $ARCH_CFLAGS -o $TMPE $TMPC 2> /dev/null ; then + 
linux_aio=yes + fi +fi + # Check if tools are available to build documentation. if [ -x "`which texi2html 2>/dev/null`" ] && \ [ -x "`which pod2man 2>/dev/null`" ]; then @@ -738,6 +760,7 @@ echo "SDL support $sdl" if test "$sdl" != "no" ; then echo "SDL static link $sdl_static" fi +echo "Linux AIO support $linux_aio" echo "curses support $curses" echo "mingw32 support $mingw32" echo "Adlib support $adlib" @@ -1001,6 +1024,10 @@ if test "$curses" = "yes" ; then echo "CONFIG_CURSES=yes" >> $config_mak echo "CURSES_LIBS=-lcurses" >> $config_mak fi +if test "$linux_aio" = "yes" ; then + echo "#define CONFIG_LINUX_AIO 1" >> $config_h + echo "CONFIG_LINUX_AIO=yes" >> $config_mak +fi # XXX: suppress that if [ "$bsd" = "yes" ] ; then |
From: Blue S. <bla...@gm...> - 2008-04-17 19:27:33
|
On 4/16/08, Anthony Liguori <ali...@us...> wrote: > Blue Swirl wrote: > > > On 4/16/08, Anthony Liguori <ali...@us...> wrote: > > > > > > > This patch introduces a DMA API and plumbs support through the DMA > layer. We > > > use a mostly opaque structure, IOVector to represent a scatter/gather > list of > > > physical memory. Associated with each IOVector is a read/write > function and > > > an opaque pointer. This allows arbitrary transformation/mapping of the > > > data while providing an easy mechanism to short-cut the zero-copy case > > > in the block/net backends. > > > > > > > > > > This looks much better also for Sparc uses. I converted pcnet to use > > the IOVectors (see patch), it does not work yet but looks doable. > > > > > > Excellent! I fixed the bug, now pcnet works. Performance is improved by a few percent. The problem was that the vector was not freed. Maybe dynamic allocation is a bit fragile. In this case, the length of the vector is known, so it could be allocated once at init time. But would this work? The next step would be to add a vector version for packet receive. For ESP/SCSI, in addition to bdrv_readv/writev, AIO versions would need to be added. Last year I made a patch (attached) that made SLIRP use my version of IOVector, I could update it to this model. > > IMHO the read/write functions should be a property of the bus so that > > they are hidden from the device, for pcnet it does not matter as we > > have to do the swapping anyway. > > > > > > For an IOMMU that has a per-device mapping, the read/write functions have > to operate on a per-device basis. No, I meant that there could be a bus layer that did the memory access and provided a specialized version of iovector_new without the handlers. But I think we can live with this, if things get too ugly we can add the layering later. |
From: Anthony L. <ali...@us...> - 2008-04-17 19:27:25
|
Posix AIO, especially as used by QEMU, is not very efficient for disk IO. This patch introduces an AIO abstract to allow multiple AIO implements to be used. We can't simply replace posix-aio by linux-aio because linux-aio only works on some filesystems and only with files opened with O_DIRECT. This patch adds a command line option (-aio) to select the AIO implementation to be used. It avoids code motion to allow for easy review. The next patch separates out the posix-aio implementation. Signed-off-by: Anthony Liguori <ali...@us...> diff --git a/block-aio.h b/block-aio.h new file mode 100644 index 0000000..2fe8c58 --- /dev/null +++ b/block-aio.h @@ -0,0 +1,46 @@ +/* + * QEMU Block AIO API + * + * Copyright IBM, Corp. 2008 + * + * Authors: + * Anthony Liguori <ali...@us...> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#ifndef QEMU_AIO_H +#define QEMU_AIO_H + +#include "qemu-common.h" +#include "block.h" + +typedef struct AIODriver +{ + const char *name; + size_t aiocb_size; + void (*aio_init)(void); + void (*aio_wait_start)(void); + void (*aio_wait)(void); + void (*aio_wait_end)(void); + void (*aio_flush)(void); + BlockDriverAIOCB *(*aio_submit)(BlockDriverState *bs, int fd, + int64_t sector_num, void *buf, + int sectors, int write, + BlockDriverCompletionFunc *cb, + void *opaque); + void (*aio_cancel)(BlockDriverAIOCB *aiocb); + struct AIODriver *next; +} AIODriver; + +int qemu_register_aio(AIODriver *drv); + +int qemu_set_aio_driver(const char *name); + +extern AIODriver *aio_drv; + +int posix_aio_init(void); + +#endif diff --git a/block-raw-posix.c b/block-raw-posix.c index 6b0009e..fee8422 100644 --- a/block-raw-posix.c +++ b/block-raw-posix.c @@ -27,6 +27,7 @@ #include "exec-all.h" #endif #include "block_int.h" +#include "block-aio.h" #include <assert.h> #include <aio.h> @@ -243,6 +244,11 @@ static int aio_sig_num = SIGUSR2; static RawAIOCB *first_aio; /* AIO issued */ 
static int aio_initialized = 0; +static void pa_poll(void *opaque); +static void pa_wait_start(void); +static void pa_wait(void); +static void pa_wait_end(void); + static void aio_signal_handler(int signum) { #ifndef QEMU_IMG @@ -259,11 +265,13 @@ static void aio_signal_handler(int signum) #endif } -void qemu_aio_init(void) +static void pa_init(void) { struct sigaction act; - aio_initialized = 1; +#ifndef QEMU_IMG + qemu_register_poll(pa_poll, NULL); +#endif sigfillset(&act.sa_mask); act.sa_flags = 0; /* do not restart syscalls to interrupt select() */ @@ -284,7 +292,7 @@ void qemu_aio_init(void) #endif } -void qemu_aio_poll(void) +static void pa_poll(void *opaque) { RawAIOCB *acb, **pacb; int ret; @@ -326,31 +334,29 @@ void qemu_aio_poll(void) } /* Wait for all IO requests to complete. */ -void qemu_aio_flush(void) +static void pa_flush(void) { - qemu_aio_wait_start(); - qemu_aio_poll(); + pa_wait_start(); + pa_poll(NULL); while (first_aio) { - qemu_aio_wait(); + pa_wait(); } - qemu_aio_wait_end(); + pa_wait_end(); } /* wait until at least one AIO was handled */ static sigset_t wait_oset; -void qemu_aio_wait_start(void) +static void pa_wait_start(void) { sigset_t set; - if (!aio_initialized) - qemu_aio_init(); sigemptyset(&set); sigaddset(&set, aio_sig_num); sigprocmask(SIG_BLOCK, &set, &wait_oset); } -void qemu_aio_wait(void) +static void pa_wait(void) { sigset_t set; int nb_sigs; @@ -362,19 +368,18 @@ void qemu_aio_wait(void) sigemptyset(&set); sigaddset(&set, aio_sig_num); sigwait(&set, &nb_sigs); - qemu_aio_poll(); + pa_poll(NULL); } -void qemu_aio_wait_end(void) +static void pa_wait_end(void) { sigprocmask(SIG_SETMASK, &wait_oset, NULL); } -static RawAIOCB *raw_aio_setup(BlockDriverState *bs, +static RawAIOCB *raw_aio_setup(BlockDriverState *bs, int fd, int64_t sector_num, uint8_t *buf, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque) { - BDRVRawState *s = bs->opaque; RawAIOCB *acb; if (fd_open(bs) < 0) @@ -383,7 +388,7 @@ static RawAIOCB 
*raw_aio_setup(BlockDriverState *bs, acb = qemu_aio_get(bs, cb, opaque); if (!acb) return NULL; - acb->aiocb.aio_fildes = s->fd; + acb->aiocb.aio_fildes = fd; acb->aiocb.aio_sigevent.sigev_signo = aio_sig_num; acb->aiocb.aio_sigevent.sigev_notify = SIGEV_SIGNAL; acb->aiocb.aio_buf = buf; @@ -397,39 +402,32 @@ static RawAIOCB *raw_aio_setup(BlockDriverState *bs, return acb; } -static BlockDriverAIOCB *raw_aio_read(BlockDriverState *bs, - int64_t sector_num, uint8_t *buf, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque) +static BlockDriverAIOCB *pa_submit(BlockDriverState *bs, + int fd, int64_t sector_num, + void *buf, int nb_sectors, int write, + BlockDriverCompletionFunc *cb, + void *opaque) { RawAIOCB *acb; + int err; - acb = raw_aio_setup(bs, sector_num, buf, nb_sectors, cb, opaque); + acb = raw_aio_setup(bs, fd, sector_num, buf, nb_sectors, cb, opaque); if (!acb) return NULL; - if (aio_read(&acb->aiocb) < 0) { - qemu_aio_release(acb); - return NULL; - } - return &acb->common; -} -static BlockDriverAIOCB *raw_aio_write(BlockDriverState *bs, - int64_t sector_num, const uint8_t *buf, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque) -{ - RawAIOCB *acb; + if (write) + err = aio_write(&acb->aiocb); + else + err = aio_read(&acb->aiocb); - acb = raw_aio_setup(bs, sector_num, (uint8_t*)buf, nb_sectors, cb, opaque); - if (!acb) - return NULL; - if (aio_write(&acb->aiocb) < 0) { + if (err < 0) { qemu_aio_release(acb); return NULL; } return &acb->common; } -static void raw_aio_cancel(BlockDriverAIOCB *blockacb) +static void pa_cancel(BlockDriverAIOCB *blockacb) { int ret; RawAIOCB *acb = (RawAIOCB *)blockacb; @@ -456,6 +454,91 @@ static void raw_aio_cancel(BlockDriverAIOCB *blockacb) } } +static AIODriver posix_aio_drv = { + .name = "posix", + .aiocb_size = sizeof(RawAIOCB), + .aio_init = pa_init, + .aio_wait_start = pa_wait_start, + .aio_wait = pa_wait, + .aio_wait_end = pa_wait_end, + .aio_flush = pa_flush, + .aio_submit = pa_submit, + 
.aio_cancel = pa_cancel, +}; + +int posix_aio_init(void) +{ + return qemu_register_aio(&posix_aio_drv); +} + +void qemu_aio_init(void) +{ + if (aio_initialized) + return; + + aio_initialized = 1; + bdrv_host_device.aiocb_size = aio_drv->aiocb_size; + bdrv_raw.aiocb_size = aio_drv->aiocb_size; + if (aio_drv->aio_init) + aio_drv->aio_init(); +} + +void qemu_aio_flush(void) +{ + qemu_aio_init(); + aio_drv->aio_flush(); +} + +void qemu_aio_wait_start(void) +{ + qemu_aio_init(); + if (aio_drv->aio_wait_start) + aio_drv->aio_wait_start(); +} + +void qemu_aio_wait(void) +{ + qemu_aio_init(); + aio_drv->aio_wait(); +} + +void qemu_aio_wait_end(void) +{ + if (aio_drv->aio_wait_end) + aio_drv->aio_wait_end(); +} + +static BlockDriverAIOCB *raw_aio_read(BlockDriverState *bs, + int64_t sector_num, uint8_t *buf, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) +{ + BDRVRawState *s = bs->opaque; + + if (fd_open(bs) < 0) + return NULL; + + return aio_drv->aio_submit(bs, s->fd, sector_num, buf, nb_sectors, 0, + cb, opaque); +} + +static BlockDriverAIOCB *raw_aio_write(BlockDriverState *bs, + int64_t sector_num, const uint8_t *buf, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) +{ + BDRVRawState *s = bs->opaque; + + if (fd_open(bs) < 0) + return NULL; + + return aio_drv->aio_submit(bs, s->fd, sector_num, (void *)buf, nb_sectors, + 1, cb, opaque); +} + +static void raw_aio_cancel(BlockDriverAIOCB *blockacb) +{ + aio_drv->aio_cancel(blockacb); +} + static void raw_close(BlockDriverState *bs) { BDRVRawState *s = bs->opaque; @@ -559,7 +642,6 @@ BlockDriver bdrv_raw = { .bdrv_aio_read = raw_aio_read, .bdrv_aio_write = raw_aio_write, .bdrv_aio_cancel = raw_aio_cancel, - .aiocb_size = sizeof(RawAIOCB), .protocol_name = "file", .bdrv_pread = raw_pread, .bdrv_pwrite = raw_pwrite, @@ -911,7 +993,6 @@ BlockDriver bdrv_host_device = { .bdrv_aio_read = raw_aio_read, .bdrv_aio_write = raw_aio_write, .bdrv_aio_cancel = raw_aio_cancel, - .aiocb_size = 
sizeof(RawAIOCB), .bdrv_pread = raw_pread, .bdrv_pwrite = raw_pwrite, .bdrv_getlength = raw_getlength, diff --git a/block-raw-win32.c b/block-raw-win32.c index 43d3f6c..6b40a27 100644 --- a/block-raw-win32.c +++ b/block-raw-win32.c @@ -350,10 +350,6 @@ void qemu_aio_init(void) { } -void qemu_aio_poll(void) -{ -} - void qemu_aio_flush(void) { } diff --git a/block.c b/block.c index eb610e0..44cb747 100644 --- a/block.c +++ b/block.c @@ -26,6 +26,7 @@ #include "console.h" #endif #include "block_int.h" +#include "block-aio.h" #ifdef _BSD #include <sys/types.h> @@ -1347,6 +1348,9 @@ void bdrv_init(void) bdrv_register(&bdrv_vvfat); bdrv_register(&bdrv_qcow2); bdrv_register(&bdrv_parallels); +#ifndef _WIN32 + posix_aio_init(); +#endif } void *qemu_aio_get(BlockDriverState *bs, BlockDriverCompletionFunc *cb, @@ -1378,6 +1382,40 @@ void qemu_aio_release(void *p) drv->free_aiocb = acb; } +static AIODriver *aio_driver_list; +AIODriver *aio_drv; + +int qemu_register_aio(AIODriver *drv) +{ + drv->next = aio_driver_list; + aio_driver_list = drv; + aio_drv = aio_driver_list; + + return 0; +} + +int qemu_set_aio_driver(const char *name) +{ + AIODriver *drv; + + if (!strcmp(name, "?")) { + printf("Available aio drivers:\n"); + for (drv = aio_driver_list; drv; drv = drv->next) { + printf("%s\n", drv->name); + } + exit(0); + } + + for (drv = aio_driver_list; drv; drv = drv->next) { + if (!strcmp(name, drv->name)) + break; + } + + aio_drv = drv; + + return 0; +} + /**************************************************************/ /* removable device support */ diff --git a/block.h b/block.h index 9d30db2..ff19425 100644 --- a/block.h +++ b/block.h @@ -94,7 +94,6 @@ BlockDriverAIOCB *bdrv_aio_write(BlockDriverState *bs, int64_t sector_num, void bdrv_aio_cancel(BlockDriverAIOCB *acb); void qemu_aio_init(void); -void qemu_aio_poll(void); void qemu_aio_flush(void); void qemu_aio_wait_start(void); void qemu_aio_wait(void); diff --git a/sysemu.h b/sysemu.h index 0078190..9931139 100644 --- 
a/sysemu.h +++ b/sysemu.h @@ -41,6 +41,8 @@ void qemu_system_powerdown(void); #endif void qemu_system_reset(void); +void qemu_register_poll(IOHandler *poll, void *opaque); + void cpu_save(QEMUFile *f, void *opaque); int cpu_load(QEMUFile *f, void *opaque, int version_id); diff --git a/vl.c b/vl.c index cc328b0..cebcdc3 100644 --- a/vl.c +++ b/vl.c @@ -36,6 +36,7 @@ #include "qemu-timer.h" #include "qemu-char.h" #include "block.h" +#include "block-aio.h" #include "audio/audio.h" #include "balloon.h" @@ -7371,6 +7372,33 @@ void qemu_bh_delete(QEMUBH *bh) qemu_free(bh); } + /***********************************************************/ +/* poll handlers */ + +typedef struct PollHandler +{ + IOHandler *func; + void *opaque; + struct PollHandler *next; +} PollHandler; + +static PollHandler *poll_handlers; + +void qemu_register_poll(IOHandler *poll, void *opaque) +{ + PollHandler *p; + + p = qemu_mallocz(sizeof(*p)); + if (p == NULL) + return; + + p->func = poll; + p->opaque = opaque; + p->next = poll_handlers; + + poll_handlers = p; +} + /***********************************************************/ /* machine registration */ @@ -7689,7 +7717,12 @@ void main_loop_wait(int timeout) slirp_select_poll(&rfds, &wfds, &xfds); } #endif - qemu_aio_poll(); + if (poll_handlers) { + PollHandler *poll; + + for (poll = poll_handlers; poll; poll = poll->next) + poll->func(poll->opaque); + } if (vm_running) { qemu_run_timers(&active_timers[QEMU_TIMER_VIRTUAL], @@ -7928,6 +7961,8 @@ static void help(int exitcode) "-clock force the use of the given methods for timer alarm.\n" " To see what timers are available use -clock ?\n" "-startdate select initial date of the clock\n" + "-aio string Force aio type `string'\n" + " Use -aio ? 
to see available aio types.\n" "\n" "During emulation, the following keys are useful:\n" "ctrl-alt-f toggle full screen\n" @@ -8031,6 +8066,7 @@ enum { QEMU_OPTION_old_param, QEMU_OPTION_clock, QEMU_OPTION_startdate, + QEMU_OPTION_aio, }; typedef struct QEMUOption { @@ -8142,6 +8178,7 @@ const QEMUOption qemu_options[] = { #endif { "clock", HAS_ARG, QEMU_OPTION_clock }, { "startdate", HAS_ARG, QEMU_OPTION_startdate }, + { "aio", HAS_ARG, QEMU_OPTION_aio }, { NULL }, }; @@ -8417,6 +8454,7 @@ int main(int argc, char **argv) int fds[2]; const char *pid_file = NULL; VLANState *vlan; + const char *aio_opt = NULL; LIST_INIT (&vm_change_state_head); #ifndef _WIN32 @@ -8991,6 +9029,9 @@ int main(int argc, char **argv) } } break; + case QEMU_OPTION_aio: + aio_opt = optarg; + break; } } } @@ -9075,7 +9116,6 @@ int main(int argc, char **argv) init_timers(); init_timer_alarm(); - qemu_aio_init(); #ifdef _WIN32 socket_init(); @@ -9146,6 +9186,11 @@ int main(int argc, char **argv) bdrv_init(); + if (aio_opt) + qemu_set_aio_driver(aio_opt); + + qemu_aio_init(); + /* we always create the cdrom drive, even if no disk is there */ if (nb_drives_opt < MAX_DRIVES) |
From: Anthony L. <ali...@us...> - 2008-04-17 19:27:25
|
This patch moves the posix-aio code into a separate file. It's strictly code motion, no new functionality is introduced. Signed-off-by: Anthony Liguori <ali...@us...> diff --git a/Makefile b/Makefile index a8df278..916f071 100644 --- a/Makefile +++ b/Makefile @@ -139,7 +139,7 @@ QEMU_IMG_BLOCK_OBJS = $(BLOCK_OBJS) ifdef CONFIG_WIN32 QEMU_IMG_BLOCK_OBJS += qemu-img-block-raw-win32.o else -QEMU_IMG_BLOCK_OBJS += qemu-img-block-raw-posix.o +QEMU_IMG_BLOCK_OBJS += qemu-img-block-raw-posix.o qemu-img-aio-posix.o endif ###################################################################### diff --git a/Makefile.target b/Makefile.target index 75de753..f635d68 100644 --- a/Makefile.target +++ b/Makefile.target @@ -485,7 +485,7 @@ OBJS=vl.o osdep.o monitor.o pci.o loader.o isa_mmio.o ifdef CONFIG_WIN32 OBJS+=block-raw-win32.o else -OBJS+=block-raw-posix.o +OBJS+=block-raw-posix.o aio-posix.o endif LIBS+=-lz diff --git a/aio-posix.c b/aio-posix.c new file mode 100644 index 0000000..b5fea7d --- /dev/null +++ b/aio-posix.c @@ -0,0 +1,290 @@ +/* + * Block driver for RAW files (posix) + * + * Copyright (c) 2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#ifndef QEMU_IMG +#include "qemu-timer.h" +#include "exec-all.h" +#endif +#include "sysemu.h" +#include "block_int.h" +#include "block-aio.h" +#include <assert.h> +#include <aio.h> + +#ifdef CONFIG_COCOA +#include <paths.h> +#include <sys/param.h> +#include <IOKit/IOKitLib.h> +#include <IOKit/IOBSD.h> +#include <IOKit/storage/IOMediaBSDClient.h> +#include <IOKit/storage/IOMedia.h> +#include <IOKit/storage/IOCDMedia.h> +//#include <IOKit/storage/IOCDTypes.h> +#include <CoreFoundation/CoreFoundation.h> +#endif + +#ifdef __sun__ +#define _POSIX_PTHREAD_SEMANTICS 1 +#include <signal.h> +#include <sys/dkio.h> +#endif +#ifdef __linux__ +#include <sys/ioctl.h> +#include <linux/cdrom.h> +#include <linux/fd.h> +#endif +#ifdef __FreeBSD__ +#include <sys/disk.h> +#endif + +/***********************************************************/ +/* Unix AIO using POSIX AIO */ + +typedef struct RawAIOCB { + BlockDriverAIOCB common; + struct aiocb aiocb; + struct RawAIOCB *next; +} RawAIOCB; + +static int aio_sig_num = SIGUSR2; +static RawAIOCB *first_aio; /* AIO issued */ + +static void aio_signal_handler(int signum) +{ +#ifndef QEMU_IMG + CPUState *env = cpu_single_env; + if (env) { + /* stop the currently executing cpu because a timer occured */ + cpu_interrupt(env, CPU_INTERRUPT_EXIT); +#ifdef USE_KQEMU + if (env->kqemu_enabled) { + kqemu_cpu_interrupt(env); + } +#endif + } +#endif +} + +static void pa_poll(void *opaque) +{ + RawAIOCB *acb, **pacb; + int ret; + + for(;;) { + pacb = &first_aio; + for(;;) { + acb = *pacb; + if (!acb) + goto the_end; + ret = aio_error(&acb->aiocb); + if (ret == ECANCELED) { + /* remove the request */ + *pacb = acb->next; + qemu_aio_release(acb); + } 
else if (ret != EINPROGRESS) { + /* end of aio */ + if (ret == 0) { + ret = aio_return(&acb->aiocb); + if (ret == acb->aiocb.aio_nbytes) + ret = 0; + else + ret = -EINVAL; + } else { + ret = -ret; + } + /* remove the request */ + *pacb = acb->next; + /* call the callback */ + acb->common.cb(acb->common.opaque, ret); + qemu_aio_release(acb); + break; + } else { + pacb = &acb->next; + } + } + } + the_end: ; +} + +static void pa_init(void) +{ + struct sigaction act; + +#ifndef QEMU_IMG + qemu_register_poll(pa_poll, NULL); +#endif + + sigfillset(&act.sa_mask); + act.sa_flags = 0; /* do not restart syscalls to interrupt select() */ + act.sa_handler = aio_signal_handler; + sigaction(aio_sig_num, &act, NULL); + +#if defined(__GLIBC__) && defined(__linux__) + { + /* XXX: aio thread exit seems to hang on RedHat 9 and this init + seems to fix the problem. */ + struct aioinit ai; + memset(&ai, 0, sizeof(ai)); + ai.aio_threads = 1; + ai.aio_num = 1; + ai.aio_idle_time = 365 * 100000; + aio_init(&ai); + } +#endif +} + +/* wait until at least one AIO was handled */ +static sigset_t wait_oset; + +static void pa_wait_start(void) +{ + sigset_t set; + + sigemptyset(&set); + sigaddset(&set, aio_sig_num); + sigprocmask(SIG_BLOCK, &set, &wait_oset); +} + +static void pa_wait(void) +{ + sigset_t set; + int nb_sigs; + +#ifndef QEMU_IMG + if (qemu_bh_poll()) + return; +#endif + sigemptyset(&set); + sigaddset(&set, aio_sig_num); + sigwait(&set, &nb_sigs); + pa_poll(NULL); +} + +static void pa_wait_end(void) +{ + sigprocmask(SIG_SETMASK, &wait_oset, NULL); +} + +/* Wait for all IO requests to complete. 
*/ +static void pa_flush(void) +{ + pa_wait_start(); + pa_poll(NULL); + while (first_aio) { + pa_wait(); + } + pa_wait_end(); +} + +static RawAIOCB *raw_aio_setup(BlockDriverState *bs, int fd, + int64_t sector_num, uint8_t *buf, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) +{ + RawAIOCB *acb; + + acb = qemu_aio_get(bs, cb, opaque); + if (!acb) + return NULL; + acb->aiocb.aio_fildes = fd; + acb->aiocb.aio_sigevent.sigev_signo = aio_sig_num; + acb->aiocb.aio_sigevent.sigev_notify = SIGEV_SIGNAL; + acb->aiocb.aio_buf = buf; + if (nb_sectors < 0) + acb->aiocb.aio_nbytes = -nb_sectors; + else + acb->aiocb.aio_nbytes = nb_sectors * 512; + acb->aiocb.aio_offset = sector_num * 512; + acb->next = first_aio; + first_aio = acb; + return acb; +} + +static BlockDriverAIOCB *pa_submit(BlockDriverState *bs, + int fd, int64_t sector_num, + void *buf, int nb_sectors, int write, + BlockDriverCompletionFunc *cb, + void *opaque) +{ + RawAIOCB *acb; + int err; + + acb = raw_aio_setup(bs, fd, sector_num, buf, nb_sectors, cb, opaque); + if (!acb) + return NULL; + + if (write) + err = aio_write(&acb->aiocb); + else + err = aio_read(&acb->aiocb); + + if (err < 0) { + qemu_aio_release(acb); + return NULL; + } + return &acb->common; +} + +static void pa_cancel(BlockDriverAIOCB *blockacb) +{ + int ret; + RawAIOCB *acb = (RawAIOCB *)blockacb; + RawAIOCB **pacb; + + ret = aio_cancel(acb->aiocb.aio_fildes, &acb->aiocb); + if (ret == AIO_NOTCANCELED) { + /* fail safe: if the aio could not be canceled, we wait for + it */ + while (aio_error(&acb->aiocb) == EINPROGRESS); + } + + /* remove the callback from the queue */ + pacb = &first_aio; + for(;;) { + if (*pacb == NULL) { + break; + } else if (*pacb == acb) { + *pacb = acb->next; + qemu_aio_release(acb); + break; + } + pacb = &acb->next; + } +} + +static AIODriver posix_aio_drv = { + .name = "posix", + .aiocb_size = sizeof(RawAIOCB), + .aio_init = pa_init, + .aio_wait_start = pa_wait_start, + .aio_wait = pa_wait, + .aio_wait_end 
= pa_wait_end, + .aio_flush = pa_flush, + .aio_submit = pa_submit, + .aio_cancel = pa_cancel, +}; + +int posix_aio_init(void) +{ + return qemu_register_aio(&posix_aio_drv); +} diff --git a/block-raw-posix.c b/block-raw-posix.c index fee8422..f0a111a 100644 --- a/block-raw-posix.c +++ b/block-raw-posix.c @@ -29,7 +29,6 @@ #include "block_int.h" #include "block-aio.h" #include <assert.h> -#include <aio.h> #ifdef CONFIG_COCOA #include <paths.h> @@ -232,245 +231,10 @@ label__raw_write__success: } /***********************************************************/ -/* Unix AIO using POSIX AIO */ - -typedef struct RawAIOCB { - BlockDriverAIOCB common; - struct aiocb aiocb; - struct RawAIOCB *next; -} RawAIOCB; - -static int aio_sig_num = SIGUSR2; -static RawAIOCB *first_aio; /* AIO issued */ +/* AIO Interface */ + static int aio_initialized = 0; -static void pa_poll(void *opaque); -static void pa_wait_start(void); -static void pa_wait(void); -static void pa_wait_end(void); - -static void aio_signal_handler(int signum) -{ -#ifndef QEMU_IMG - CPUState *env = cpu_single_env; - if (env) { - /* stop the currently executing cpu because a timer occured */ - cpu_interrupt(env, CPU_INTERRUPT_EXIT); -#ifdef USE_KQEMU - if (env->kqemu_enabled) { - kqemu_cpu_interrupt(env); - } -#endif - } -#endif -} - -static void pa_init(void) -{ - struct sigaction act; - -#ifndef QEMU_IMG - qemu_register_poll(pa_poll, NULL); -#endif - - sigfillset(&act.sa_mask); - act.sa_flags = 0; /* do not restart syscalls to interrupt select() */ - act.sa_handler = aio_signal_handler; - sigaction(aio_sig_num, &act, NULL); - -#if defined(__GLIBC__) && defined(__linux__) - { - /* XXX: aio thread exit seems to hang on RedHat 9 and this init - seems to fix the problem. 
*/ - struct aioinit ai; - memset(&ai, 0, sizeof(ai)); - ai.aio_threads = 1; - ai.aio_num = 1; - ai.aio_idle_time = 365 * 100000; - aio_init(&ai); - } -#endif -} - -static void pa_poll(void *opaque) -{ - RawAIOCB *acb, **pacb; - int ret; - - for(;;) { - pacb = &first_aio; - for(;;) { - acb = *pacb; - if (!acb) - goto the_end; - ret = aio_error(&acb->aiocb); - if (ret == ECANCELED) { - /* remove the request */ - *pacb = acb->next; - qemu_aio_release(acb); - } else if (ret != EINPROGRESS) { - /* end of aio */ - if (ret == 0) { - ret = aio_return(&acb->aiocb); - if (ret == acb->aiocb.aio_nbytes) - ret = 0; - else - ret = -EINVAL; - } else { - ret = -ret; - } - /* remove the request */ - *pacb = acb->next; - /* call the callback */ - acb->common.cb(acb->common.opaque, ret); - qemu_aio_release(acb); - break; - } else { - pacb = &acb->next; - } - } - } - the_end: ; -} - -/* Wait for all IO requests to complete. */ -static void pa_flush(void) -{ - pa_wait_start(); - pa_poll(NULL); - while (first_aio) { - pa_wait(); - } - pa_wait_end(); -} - -/* wait until at least one AIO was handled */ -static sigset_t wait_oset; - -static void pa_wait_start(void) -{ - sigset_t set; - - sigemptyset(&set); - sigaddset(&set, aio_sig_num); - sigprocmask(SIG_BLOCK, &set, &wait_oset); -} - -static void pa_wait(void) -{ - sigset_t set; - int nb_sigs; - -#ifndef QEMU_IMG - if (qemu_bh_poll()) - return; -#endif - sigemptyset(&set); - sigaddset(&set, aio_sig_num); - sigwait(&set, &nb_sigs); - pa_poll(NULL); -} - -static void pa_wait_end(void) -{ - sigprocmask(SIG_SETMASK, &wait_oset, NULL); -} - -static RawAIOCB *raw_aio_setup(BlockDriverState *bs, int fd, - int64_t sector_num, uint8_t *buf, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque) -{ - RawAIOCB *acb; - - if (fd_open(bs) < 0) - return NULL; - - acb = qemu_aio_get(bs, cb, opaque); - if (!acb) - return NULL; - acb->aiocb.aio_fildes = fd; - acb->aiocb.aio_sigevent.sigev_signo = aio_sig_num; - acb->aiocb.aio_sigevent.sigev_notify 
= SIGEV_SIGNAL; - acb->aiocb.aio_buf = buf; - if (nb_sectors < 0) - acb->aiocb.aio_nbytes = -nb_sectors; - else - acb->aiocb.aio_nbytes = nb_sectors * 512; - acb->aiocb.aio_offset = sector_num * 512; - acb->next = first_aio; - first_aio = acb; - return acb; -} - -static BlockDriverAIOCB *pa_submit(BlockDriverState *bs, - int fd, int64_t sector_num, - void *buf, int nb_sectors, int write, - BlockDriverCompletionFunc *cb, - void *opaque) -{ - RawAIOCB *acb; - int err; - - acb = raw_aio_setup(bs, fd, sector_num, buf, nb_sectors, cb, opaque); - if (!acb) - return NULL; - - if (write) - err = aio_write(&acb->aiocb); - else - err = aio_read(&acb->aiocb); - - if (err < 0) { - qemu_aio_release(acb); - return NULL; - } - return &acb->common; -} - -static void pa_cancel(BlockDriverAIOCB *blockacb) -{ - int ret; - RawAIOCB *acb = (RawAIOCB *)blockacb; - RawAIOCB **pacb; - - ret = aio_cancel(acb->aiocb.aio_fildes, &acb->aiocb); - if (ret == AIO_NOTCANCELED) { - /* fail safe: if the aio could not be canceled, we wait for - it */ - while (aio_error(&acb->aiocb) == EINPROGRESS); - } - - /* remove the callback from the queue */ - pacb = &first_aio; - for(;;) { - if (*pacb == NULL) { - break; - } else if (*pacb == acb) { - *pacb = acb->next; - qemu_aio_release(acb); - break; - } - pacb = &acb->next; - } -} - -static AIODriver posix_aio_drv = { - .name = "posix", - .aiocb_size = sizeof(RawAIOCB), - .aio_init = pa_init, - .aio_wait_start = pa_wait_start, - .aio_wait = pa_wait, - .aio_wait_end = pa_wait_end, - .aio_flush = pa_flush, - .aio_submit = pa_submit, - .aio_cancel = pa_cancel, -}; - -int posix_aio_init(void) -{ - return qemu_register_aio(&posix_aio_drv); -} - void qemu_aio_init(void) { if (aio_initialized) |
From: Christoph L. <cla...@sg...> - 2008-04-17 19:10:59
|
On Thu, 17 Apr 2008, Andrea Arcangeli wrote: > Also note, EMM isn't using the clean hlist_del, it's implementing list > by hand (with zero runtime gain) so all the debugging may not be > existent in EMM, so if it's really a mm_lock race, and it only > triggers with mmu notifiers and not with EMM, it doesn't necessarily > mean EMM is bug free. If you've a full stack trace it would greatly > help to verify what is mangling over the list when the oops triggers. EMM was/is using a single linked list which allows atomic updates. Looked cleaner to me since doubly linked list must update two pointers. I have not seen docs on the locking so not sure why you use rcu operations here? Isnt the requirement to have either rmap locks or mmap_sem held enough to guarantee the consistency of the doubly linked list? |
From: Ian K. <bl...@bl...> - 2008-04-17 19:04:39
|
Avi Kivity wrote: > I do this regularly, basically you need to install kernel-devel and > that's it. Yes, that is very easy isn't it. Oops to my stupidity. I've got it built and will give it a go tomorrow and report back on each test case. |
From: Avi K. <av...@qu...> - 2008-04-17 18:40:48
|
Ian Kirk wrote: > Avi Kivity wrote: > > >> 1. Boot with 'noexec=off' on the host kernel command line >> 2. Loading the kernel modules that come with kvm-66 >> > > I'll have a go at no.2, I have not had much luck compiling the modules > within the constraints of a Fedora kernel. > > I do this regularly, basically you need to install kernel-devel and that's it. -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. |
From: SourceForge.net <no...@so...> - 2008-04-17 17:36:29
|
Bugs item #1945129, was opened at 2008-04-17 14:36 Message generated for change (Tracker Item Submitted) made by Item Submitter You can respond by visiting: https://sourceforge.net/tracker/?func=detail&atid=893831&aid=1945129&group_id=180599 Please note that this message will contain a full copy of the comment thread, including the initial issue submission, for this request, not just the latest update. Category: None Group: None Status: Open Resolution: None Priority: 5 Private: No Submitted By: Duilio J. Protti (dprotti) Assigned to: Nobody/Anonymous (nobody) Summary: Compilation fails when configure with -DDEBUG Initial Comment: When configured kvm-65 with ./configure --prefix=/usr/devel --qemu-cflags=-DDEBUG Compilation fails with the following error: gcc34 -I /root/linux/kvm-65/qemu/../libkvm -DDEBUG -DCONFIG_X86 -Wall -O2 -g -fno-strict-aliasing -m32 -I. -I/root/linux/kvm-65/qemu -MMD -MP -D_GNU_SOURCE -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -I/root/linux/kvm-65/qemu/slirp -c -o smbus_eeprom.o /root/linux/kvm-65/qemu/hw/smbus_eeprom.c /root/linux/kvm-65/qemu/hw/smbus_eeprom.c: In function `eeprom_quick_cmd': /root/linux/kvm-65/qemu/hw/smbus_eeprom.c:40: error: structure has no member named `addr' /root/linux/kvm-65/qemu/hw/smbus_eeprom.c: In function `eeprom_send_byte': /root/linux/kvm-65/qemu/hw/smbus_eeprom.c:48: error: structure has no member named `addr' /root/linux/kvm-65/qemu/hw/smbus_eeprom.c: In function `eeprom_receive_byte': /root/linux/kvm-65/qemu/hw/smbus_eeprom.c:58: error: structure has no member named `addr' /root/linux/kvm-65/qemu/hw/smbus_eeprom.c: In function `eeprom_write_data': /root/linux/kvm-65/qemu/hw/smbus_eeprom.c:68: error: structure has no member named `addr' make[1]: *** [smbus_eeprom.o] Error 1 make[1]: Leaving directory `/root/linux/kvm-65/qemu' make: *** [qemu] Error 2 The error comes because structure SMBusDevice does not have a field named 'addr' (probably name has changed since the debugging code was written). 
Code conditionally compiled with a DEBUG macro appears in many places in the Qemu code, but it is not driven by a central --debug option or similar, so it seems that the configuration in question (with -DDEBUG) is a little odd and a not supported one. However, may be the code should be changed or else removed to avoid these useless lines there. Attached is a simple patch which resolves the issue. Regards, Duilio Protti. ---------------------------------------------------------------------- You can respond by visiting: https://sourceforge.net/tracker/?func=detail&atid=893831&aid=1945129&group_id=180599 |
From: Robin H. <ho...@sg...> - 2008-04-17 17:25:55
|
On Thu, Apr 17, 2008 at 07:14:43PM +0200, Andrea Arcangeli wrote: > On Thu, Apr 17, 2008 at 11:36:42AM -0500, Robin Holt wrote: > > In this case, we are not making the call to unregister, we are waiting > > for the _release callout which has already removed it from the list. > > > > In the event that the user has removed all the grants, we use unregister. > > That typically does not occur. We merely wait for exit processing to > > clean up the structures. > > Then it's very strange. LIST_POISON1 is set in n->next. If it was a > second hlist_del triggering the bug in theory list_poison2 should > trigger first, so perhaps it's really a notifier running despite a > mm_lock is taken? Could you post a full stack trace so I can see who's > running into LIST_POISON1? If it's really a notifier running outside > of some mm_lock that will be _immediately_ visible from the stack > trace that triggered the LIST_POISON1! > > Also note, EMM isn't using the clean hlist_del, it's implementing list > by hand (with zero runtime gain) so all the debugging may not be > existent in EMM, so if it's really a mm_lock race, and it only > triggers with mmu notifiers and not with EMM, it doesn't necessarily > mean EMM is bug free. If you've a full stack trace it would greatly > help to verify what is mangling over the list when the oops triggers. The stack trace is below. I did not do this level of testing on emm so I can not compare the two in this area. This is for a different, but equivalent failure. I just reproduce the LIST_POISON1 failure without trying to reproduce the exact same failure as I had documented earlier (lost that stack trace, sorry). 
Thanks, Robin <1>Unable to handle kernel paging request at virtual address 0000000000100100 <4>mpi006.f.x[23403]: Oops 11012296146944 [1] <4>Modules linked in: nfs lockd sunrpc binfmt_misc thermal processor fan button loop md_mod dm_mod xpmem xp mspec sg <4> <4>Pid: 23403, CPU 114, comm: mpi006.f.x <4>psr : 0000121008526010 ifs : 800000000000038b ip : [<a00000010015d6a1>] Not tainted (2.6.25-rc8) <4>ip is at __mmu_notifier_invalidate_range_start+0x81/0x120 <4>unat: 0000000000000000 pfs : 000000000000038b rsc : 0000000000000003 <4>rnat: a000000100149a00 bsps: a000000000010740 pr : 66555666a9599aa9 <4>ldrs: 0000000000000000 ccv : 0000000000000000 fpsr: 0009804c0270033f <4>csd : 0000000000000000 ssd : 0000000000000000 <4>b0 : a00000010015d670 b6 : a0000002101ddb40 b7 : a00000010000eb50 <4>f6 : 1003e2222222222222222 f7 : 000000000000000000000 <4>f8 : 000000000000000000000 f9 : 000000000000000000000 <4>f10 : 000000000000000000000 f11 : 000000000000000000000 <4>r1 : a000000100ef1190 r2 : e0000e6080cc1940 r3 : a0000002101edd10 <4>r8 : e0000e6080cc1970 r9 : 0000000000000000 r10 : e0000e6080cc19c8 <4>r11 : 20000003a6480000 r12 : e0000c60d31efb90 r13 : e0000c60d31e0000 <4>r14 : 000000000000004d r15 : e0000e6080cc1914 r16 : e0000e6080cc1970 <4>r17 : 20000003a6480000 r18 : 20000007bf900000 r19 : 0000000000040000 <4>r20 : e0000c60d31e0000 r21 : 0000000000000010 r22 : e0000e6080cc19a8 <4>r23 : e0000c60c55f1120 r24 : e0000c60d31efda0 r25 : e0000c60d31efd98 <4>r26 : e0000e60812166d0 r27 : e0000c60d31efdc0 r28 : e0000c60d31efdb8 <4>r29 : e0000c60d31e0b60 r30 : 0000000000000000 r31 : 0000000000000081 <4> <4>Call Trace: <4> [<a000000100014a20>] show_stack+0x40/0xa0 <4> sp=e0000c60d31ef760 bsp=e0000c60d31e11f0 <4> [<a000000100015330>] show_regs+0x850/0x8a0 <4> sp=e0000c60d31ef930 bsp=e0000c60d31e1198 <4> [<a000000100035ed0>] die+0x1b0/0x2e0 <4> sp=e0000c60d31ef930 bsp=e0000c60d31e1150 <4> [<a000000100060e90>] ia64_do_page_fault+0x8d0/0xa40 <4> sp=e0000c60d31ef930 bsp=e0000c60d31e1100 
<4> [<a00000010000ab00>] ia64_leave_kernel+0x0/0x270 <4> sp=e0000c60d31ef9c0 bsp=e0000c60d31e1100 <4> [<a00000010015d6a0>] __mmu_notifier_invalidate_range_start+0x80/0x120 <4> sp=e0000c60d31efb90 bsp=e0000c60d31e10a8 <4> [<a00000010011b1d0>] unmap_vmas+0x70/0x14c0 <4> sp=e0000c60d31efb90 bsp=e0000c60d31e0fa8 <4> [<a00000010011c660>] zap_page_range+0x40/0x60 <4> sp=e0000c60d31efda0 bsp=e0000c60d31e0f70 <4> [<a0000002101d62d0>] xpmem_clear_PTEs+0x350/0x560 [xpmem] <4> sp=e0000c60d31efdb0 bsp=e0000c60d31e0ef0 <4> [<a0000002101d1e30>] xpmem_remove_seg+0x3f0/0x700 [xpmem] <4> sp=e0000c60d31efde0 bsp=e0000c60d31e0ea8 <4> [<a0000002101d2500>] xpmem_remove_segs_of_tg+0x80/0x140 [xpmem] <4> sp=e0000c60d31efe10 bsp=e0000c60d31e0e78 <4> [<a0000002101dda40>] xpmem_mmu_notifier_release+0x40/0x80 [xpmem] <4> sp=e0000c60d31efe10 bsp=e0000c60d31e0e58 <4> [<a00000010015d7f0>] __mmu_notifier_release+0xb0/0x100 <4> sp=e0000c60d31efe10 bsp=e0000c60d31e0e38 <4> [<a000000100124430>] exit_mmap+0x50/0x180 <4> sp=e0000c60d31efe10 bsp=e0000c60d31e0e10 <4> [<a00000010008fb30>] mmput+0x70/0x180 <4> sp=e0000c60d31efe20 bsp=e0000c60d31e0dd8 <4> [<a000000100098df0>] exit_mm+0x1f0/0x220 <4> sp=e0000c60d31efe20 bsp=e0000c60d31e0da0 <4> [<a00000010009ca60>] do_exit+0x4e0/0xf40 <4> sp=e0000c60d31efe20 bsp=e0000c60d31e0d58 <4> [<a00000010009d640>] do_group_exit+0x180/0x1c0 <4> sp=e0000c60d31efe30 bsp=e0000c60d31e0d20 <4> [<a00000010009d6a0>] sys_exit_group+0x20/0x40 <4> sp=e0000c60d31efe30 bsp=e0000c60d31e0cc8 <4> [<a00000010000a960>] ia64_ret_from_syscall+0x0/0x20 <4> sp=e0000c60d31efe30 bsp=e0000c60d31e0cc8 <4> [<a000000000010720>] __kernel_syscall_via_break+0x0/0x20 <4> sp=e0000c60d31f0000 bsp=e0000c60d31e0cc8 |
From: Andrea A. <an...@qu...> - 2008-04-17 17:14:45
|
On Thu, Apr 17, 2008 at 11:36:42AM -0500, Robin Holt wrote: > In this case, we are not making the call to unregister, we are waiting > for the _release callout which has already removed it from the list. > > In the event that the user has removed all the grants, we use unregister. > That typically does not occur. We merely wait for exit processing to > clean up the structures. Then it's very strange. LIST_POISON1 is set in n->next. If it was a second hlist_del triggering the bug in theory list_poison2 should trigger first, so perhaps it's really a notifier running despite a mm_lock being taken? Could you post a full stack trace so I can see who's running into LIST_POISON1? If it's really a notifier running outside of some mm_lock that will be _immediately_ visible from the stack trace that triggered the LIST_POISON1! Also note, EMM isn't using the clean hlist_del, it's implementing the list by hand (with zero runtime gain) so all the debugging may not exist in EMM, so if it's really a mm_lock race, and it only triggers with mmu notifiers and not with EMM, it doesn't necessarily mean EMM is bug free. If you've a full stack trace it would greatly help to verify what is mangling the list when the oops triggers. Thanks! Andrea |
From: Ian K. <bl...@bl...> - 2008-04-17 16:57:40
|
Avi Kivity wrote: > 1. Boot with 'noexec=off' on the host kernel command line > 2. Loading the kernel modules that come with kvm-66 I'll have a go at no.2, I have not had much luck compiling the modules within the constraints of a Fedora kernel. No.1 I can do tomorrow when in the same building as the server (in case of user error) Ian. |
From: Robin H. <ho...@sg...> - 2008-04-17 16:36:40
|
On Thu, Apr 17, 2008 at 05:51:57PM +0200, Andrea Arcangeli wrote: > On Wed, Apr 16, 2008 at 11:35:38AM -0700, Christoph Lameter wrote: > > On Wed, 16 Apr 2008, Robin Holt wrote: > > > > > I don't think this lock mechanism is completely working. I have > > > gotten a few failures trying to dereference 0x100100 which appears to > > > be LIST_POISON1. > > > > How does xpmem unregistering of notifiers work? > > Especially are you using mmu_notifier_unregister? In this case, we are not making the call to unregister, we are waiting for the _release callout which has already removed it from the list. In the event that the user has removed all the grants, we use unregister. That typically does not occur. We merely wait for exit processing to clean up the structures. Thanks, Robin |
From: Avi K. <av...@qu...> - 2008-04-17 16:36:02
|
Avi Kivity wrote: > > Not good. kvm doesn't really care about the host page table layout, I > don't see how this can matter. Actually kvm is affected by pae: it enables nx support. Please try (separately) 1. Boot with 'noexec=off' on the host kernel command line 2. Loading the kernel modules that come with kvm-66 -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. |
From: Avi K. <av...@qu...> - 2008-04-17 15:52:36
|
Ian Kirk wrote: > Avi Kivity wrote: > > >>> If so, I might as well drop back to non-PAE and get to use 3GB. >>> >> If it works, this provides a clue as to what goes wrong, and we can fix >> it. >> >> >>> Are these known issues? Can I do anything to help/test which might >>> make it work for other people? >>> >> Just let us know if it works or not. If it does, I'll try to prepare a >> patch which fixes the problem with pae and the entire 4GB. If it >> doesn't, we'll have to do some more searching. >> > > 2.6.24.4-64.fc8PAE mem=2000: > Linux explodes (forgot the m!) > > Sorry. > 2.6.24.4-64.fc8PAE mem=2000m: > kvm solaris still dies as before, free(1) reports ~2GB. > > Not good. kvm doesn't really care about the host page table layout, I don't see how this can matter. > 2.6.24.4-64.fc8: > kvm solaris works, free(1) reports ~3GB. > > Currently installed KVM-64 based on previous post, kernel module is stock > Fedora 8 from the above (not sure which version that is) > It's kvm-60-ish IIRC. -- error compiling committee.c: too many arguments to function |
From: Andrea A. <an...@qu...> - 2008-04-17 15:52:26
|
On Wed, Apr 16, 2008 at 11:35:38AM -0700, Christoph Lameter wrote: > On Wed, 16 Apr 2008, Robin Holt wrote: > > > I don't think this lock mechanism is completely working. I have > > gotten a few failures trying to dereference 0x100100 which appears to > > be LIST_POISON1. > > How does xpmem unregistering of notifiers work? In particular, are you using mmu_notifier_unregister? |
From: Ian K. <bl...@bl...> - 2008-04-17 15:46:15
|
Avi Kivity wrote: > > If so, I might as well drop back to non-PAE and get to use 3GB. > > If it works, this provides a clue as to what goes wrong, and we can fix > it. > > > Are these known issues? Can I do anything to help/test which might > > make it work for other people? > > Just let us know if it works or not. If it does, I'll try to prepare a > patch which fixes the problem with pae and the entire 4GB. If it > doesn't, we'll have to do some more searching. 2.6.24.4-64.fc8PAE mem=2000: Linux explodes (forgot the m!) 2.6.24.4-64.fc8PAE mem=2000m: kvm solaris still dies as before, free(1) reports ~2GB. 2.6.24.4-64.fc8: kvm solaris works, free(1) reports ~3GB. Currently installed KVM-64 based on previous post, kernel module is stock Fedora 8 from the above (not sure which version that is) HTH, Ian. |