From: Andrea A. <an...@qu...> - 2008-01-09 18:19:07
|
Hello, This patch is a first basic implementation of the mmu notifiers. More methods can be added in the future. In short when the linux VM decides to free a page, it will unmap it from the linux pagetables. However when a page is mapped not just by the regular linux ptes, but also from the shadow pagetables, it's currently unfreeable by the linux VM. This patch allows the shadow pagetables to be dropped and the page to be freed after that, if the linux VM decides to unmap the page from the main ptes because it wants to swap out the page. In my basic initial patch I only track the tlb flushes which should be the minimum required to have a nice linux-VM controlled swapping behavior of the KVM gphysical memory. The shadow-ptes works much like a TLB, so the same way we flush the tlb after clearing the ptes, we should also issue the mmu_notifier invalidate_page/range/release methods. Quadrics needs much more than that to optimize things but it's easy to add more methods to the below code to fit their needs if the basic is ok. This follows the model of Avi's original patch, however I guess it would also be possible to track when the VM shrink_cache methods wants to free a certain host-page_t instead of tracking when the tlb is flushed. Not sure what's better, but the below should be enough for KVM to swap nicely with minimal overhead to the host kernel even if KVM is unused. About the locking perhaps I'm underestimating it, but by following the TLB flushing analogy, by simply clearing the shadow ptes (with kvm mmu_lock spinlock) and flushing the shadow-pte after clearing the main linux pte, it should be enough to serialize against shadow-pte page faults that would call into get_user_pages. Flushing the host TLB before or after the shadow-ptes shouldn't matter. Comments welcome... especially from Quadrics. Patch is mostly untested, tomorrow I'll try to plug KVM on top of the below and see if it survives swap. 
Signed-off-by: Andrea Arcangeli <an...@qu...> diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -86,6 +86,7 @@ do { \ pte_t __pte; \ __pte = ptep_get_and_clear((__vma)->vm_mm, __address, __ptep); \ flush_tlb_page(__vma, __address); \ + mmu_notifier(invalidate_page, (__vma)->vm_mm, __address); \ __pte; \ }) #endif diff --git a/include/linux/mm.h b/include/linux/mm.h --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -13,6 +13,7 @@ #include <linux/debug_locks.h> #include <linux/mm_types.h> #include <linux/security.h> +#include <linux/mmu_notifier.h> struct mempolicy; struct anon_vma; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -219,6 +219,10 @@ struct mm_struct { /* aio bits */ rwlock_t ioctx_list_lock; struct kioctx *ioctx_list; + +#ifdef CONFIG_MMU_NOTIFIER + struct hlist_head mmu_notifier; /* MMU notifier list */ +#endif }; #endif /* _LINUX_MM_TYPES_H */ diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h new file mode 100644 --- /dev/null +++ b/include/linux/mmu_notifier.h @@ -0,0 +1,53 @@ +#ifndef _LINUX_MMU_NOTIFIER_H +#define _LINUX_MMU_NOTIFIER_H + +#include <linux/list.h> +#include <linux/mm_types.h> + +#ifdef CONFIG_MMU_NOTIFIER + +struct mmu_notifier; + +struct mmu_notifier_ops { + void (*release)(struct mmu_notifier * mn, + struct mm_struct *mm); + void (*invalidate_page)(struct mmu_notifier * mn, + struct mm_struct *mm, + unsigned long address); + void (*invalidate_range)(struct mmu_notifier * mn, + struct mm_struct *mm, + unsigned long start, unsigned long end); +}; + +struct mmu_notifier { + struct hlist_node hlist; + const struct mmu_notifier_ops *ops; +}; + +extern void mmu_notifier_register(struct mmu_notifier *mn, + struct mm_struct *mm); +extern void mmu_notifier_unregister(struct mmu_notifier *mn); +extern void mmu_notifier_release(struct 
mm_struct *mm); + +#define mmu_notifier(function, mm, args...) \ + do { \ + struct mmu_notifier *__mn; \ + struct hlist_node *__n; \ + \ + hlist_for_each_entry(__mn, __n, &(mm)->mmu_notifier, hlist) \ + if (__mn->ops->function) \ + __mn->ops->function(__mn, mm, args); \ + } while (0) + +#else /* CONFIG_MMU_NOTIFIER */ + +#define mmu_notifier_register(mn, mm) do {} while(0) +#define mmu_notifier_unregister(mn) do {} while (0) +#define mmu_notifier_release(mm) do {} while (0) + +#define mmu_notifier(function, mm, args...) \ + do { } while (0) + +#endif /* CONFIG_MMU_NOTIFIER */ + +#endif /* _LINUX_MMU_NOTIFIER_H */ diff --git a/mm/Kconfig b/mm/Kconfig --- a/mm/Kconfig +++ b/mm/Kconfig @@ -193,3 +193,7 @@ config VIRT_TO_BUS config VIRT_TO_BUS def_bool y depends on !ARCH_NO_VIRT_TO_BUS + +config MMU_NOTIFIER + def_bool y + bool "MMU notifier, for paging KVM/RDMA" diff --git a/mm/Makefile b/mm/Makefile --- a/mm/Makefile +++ b/mm/Makefile @@ -30,4 +30,5 @@ obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_SMP) += allocpercpu.o obj-$(CONFIG_QUICKLIST) += quicklist.o +obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o diff --git a/mm/hugetlb.c b/mm/hugetlb.c --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -753,6 +753,7 @@ void __unmap_hugepage_range(struct vm_ar } spin_unlock(&mm->page_table_lock); flush_tlb_range(vma, start, end); + mmu_notifier(invalidate_range, mm, start, end); list_for_each_entry_safe(page, tmp, &page_list, lru) { list_del(&page->lru); put_page(page); diff --git a/mm/memory.c b/mm/memory.c --- a/mm/memory.c +++ b/mm/memory.c @@ -889,6 +889,7 @@ unsigned long zap_page_range(struct vm_a end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); if (tlb) tlb_finish_mmu(tlb, address, end); + mmu_notifier(invalidate_range, mm, address, end); return end; } @@ -1358,6 +1359,7 @@ int remap_pfn_range(struct vm_area_struc if (err) break; } while (pgd++, addr = next, addr != end); + mmu_notifier(invalidate_range, mm, 
end-PAGE_ALIGN(size), end); return err; } EXPORT_SYMBOL(remap_pfn_range); @@ -1452,6 +1454,7 @@ int apply_to_page_range(struct mm_struct if (err) break; } while (pgd++, addr = next, addr != end); + mmu_notifier(invalidate_range, mm, end-size, end); return err; } EXPORT_SYMBOL_GPL(apply_to_page_range); diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1747,6 +1747,7 @@ static void unmap_region(struct mm_struc free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, next? next->vm_start: 0); tlb_finish_mmu(tlb, start, end); + mmu_notifier(invalidate_range, mm, start, end); } /* @@ -2043,6 +2044,7 @@ void exit_mmap(struct mm_struct *mm) vm_unacct_memory(nr_accounted); free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); tlb_finish_mmu(tlb, 0, end); + mmu_notifier_release(mm); /* * Walk the list again, actually closing and freeing it, diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c new file mode 100644 --- /dev/null +++ b/mm/mmu_notifier.c @@ -0,0 +1,35 @@ +/* + * linux/mm/mmu_notifier.c + * + * Copyright (C) 2008 Qumranet, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include <linux/mmu_notifier.h> +#include <linux/module.h> + +void mmu_notifier_release(struct mm_struct *mm) +{ + struct mmu_notifier *mn; + struct hlist_node *n, *tmp; + + hlist_for_each_entry_safe(mn, n, tmp, &mm->mmu_notifier, hlist) { + if (mn->ops->release) + mn->ops->release(mn, mm); + hlist_del(n); + } +} + +void mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) +{ + hlist_add_head(&mn->hlist, &mm->mmu_notifier); +} +EXPORT_SYMBOL_GPL(mmu_notifier_register); + +void mmu_notifier_unregister(struct mmu_notifier *mn) +{ + hlist_del(&mn->hlist); +} +EXPORT_SYMBOL_GPL(mmu_notifier_unregister); |
From: Christoph L. <cla...@sg...> - 2008-01-09 21:55:09
|
On Wed, 9 Jan 2008, Andrea Arcangeli wrote: > This patch is a first basic implementation of the mmu notifiers. More > methods can be added in the future. > > In short when the linux VM decides to free a page, it will unmap it > from the linux pagetables. However when a page is mapped not just by > the regular linux ptes, but also from the shadow pagetables, it's > currently unfreeable by the linux VM. Such a patch would also address issues that SGI has with exporting mappings via XPMEM. Plus a variety of other uses. Go ahead and let's do more in this area. Are the KVM folks interested in exporting memory from one guest to another? That may also become possible with some of the work that we have in progress and that also requires a patch like this. |
From: Avi K. <av...@qu...> - 2008-01-10 11:44:20
|
Christoph Lameter wrote: > On Wed, 9 Jan 2008, Andrea Arcangeli wrote: > > >> This patch is a first basic implementation of the mmu notifiers. More >> methods can be added in the future. >> >> In short when the linux VM decides to free a page, it will unmap it >> from the linux pagetables. However when a page is mapped not just by >> the regular linux ptes, but also from the shadow pagetables, it's >> currently unfreeable by the linux VM. >> > > Such a patch would also address issues that SGI has with exporting > mappings via XPMEM. Plus a variety of other uses. Go ahead and lets do > more in this area. > > Are the KVM folks interested in exporting memory from one guest to > another? That may also become possible with some of the work that we have > in progress and that also requires a patch like this. > > Actually sharing memory is possible even without this patch; one simply mmap()s a file into the address space of both guests. Or are you referring to something else? The patch does enable some nifty things; one example you may be familiar with is using page migration to move a guest from one numa node to another. -- error compiling committee.c: too many arguments to function |
From: Christoph L. <cla...@sg...> - 2008-01-10 19:04:46
|
On Thu, 10 Jan 2008, Avi Kivity wrote: > Actually sharing memory is possible even without this patch; one simply > mmap()s a file into the address space of both guests. Or are you referring to > something else? A file from where? If a file is read by two guests then they will have distinct page structs. |
From: Avi K. <av...@qu...> - 2008-01-12 19:58:42
|
Christoph Lameter wrote: > On Thu, 10 Jan 2008, Avi Kivity wrote: > > >> Actually sharing memory is possible even without this patch; one simply >> mmap()s a file into the address space of both guests. Or are you referring to >> something else? >> > > A file from where? If a file is read by two guests then they will have > distinct page structs. > > Two kvm instances mmap() the file (from anywhere) into the guest address space. That memory is shared, and will be backed by the same page structs at the same offset. -- Any sufficiently difficult bug is indistinguishable from a feature. |
From: Christoph L. <cla...@sg...> - 2008-01-14 19:49:48
|
On Sat, 12 Jan 2008, Avi Kivity wrote: > Two kvm instances mmap() the file (from anywhere) into the guest address > space. That memory is shared, and will be backed by the same page structs at > the same offset. Duh. Impossible. Two instances of Linux cannot share page structs. So how are you doing this? Or is this just an idea? |
From: Avi K. <av...@qu...> - 2008-01-15 07:38:33
|
Christoph Lameter wrote: > On Sat, 12 Jan 2008, Avi Kivity wrote: > > >> Two kvm instances mmap() the file (from anywhere) into the guest address >> space. That memory is shared, and will be backed by the same page structs at >> the same offset. >> > > Duh. Impossible. Two instances of Linux cannot share page structs. So how > are you doing this? Or is this just an idea? > > I was describing one Linux host running two guest instances. The page structs are in the host, so they are shared by mmap(). kvm userspace is just an ordinary host process, it can mmap() any file it likes and then assign that virtual memory range to the guest (as guest physical memory). -- error compiling committee.c: too many arguments to function |
From: Christoph L. <cla...@sg...> - 2008-01-15 17:39:22
|
On Tue, 15 Jan 2008, Avi Kivity wrote: > > Duh. Impossible. Two instances of Linux cannot share page structs. So how > > are you doing this? Or is this just an idea? > > I was describing one Linux host running two guest instances. The page structs > are in the host, so they are shared by mmap(). Ahh.. Okay I was talking about a guest exporting its memory to another guest. > kvm userspace is just an ordinary host process, it can mmap() any file it > likes and then assign that virtual memory range to the guest (as guest > physical memory). But then the guest does not have its own page struct to manage the memory. |
From: Christoph L. <cla...@sg...> - 2008-01-14 19:51:23
|
On Sun, 13 Jan 2008, Avi Kivity wrote: > I was just explaining how kvm shares memory among guests (which does not > require mmu notifiers); if you have some other configuration that can benefit > from mmu notifiers, then, well, great. I think you have two page tables pointing to the same memory location right (not to page structs but two ptes)? Without a mmu notifier the pages in this memory range cannot be evicted because otherwise ptes of the other instance will point to a page that is now used for a different purpose. |
From: Avi K. <av...@qu...> - 2008-01-15 07:43:27
|
Christoph Lameter wrote: > On Sun, 13 Jan 2008, Avi Kivity wrote: > > >> I was just explaining how kvm shares memory among guests (which does not >> require mmu notifiers); if you have some other configuration that can benefit >> from mmu notifiers, then, well, great. >> > > I think you have two page tables pointing to the same memory location > right (not to page structs but two ptes)? Without a mmu notifier the pages > in this memory range cannot be evicted because otherwise ptes of the other > instance will point to a page that is now used for a different purpose. > Even with just one guest we can't swap well without mmu notifiers. kvm constructs new page tables for the guest that the Linux vm doesn't know about, so when Linux removes all the ptes, we need a callback to remove the kvm private ptes (and tlb entries). -- error compiling committee.c: too many arguments to function |
From: Robin H. <ho...@sg...> - 2008-01-10 13:16:10
|
On Thu, Jan 10, 2008 at 01:44:18PM +0200, Avi Kivity wrote: > Christoph Lameter wrote: >> On Wed, 9 Jan 2008, Andrea Arcangeli wrote: >> >> >>> This patch is a first basic implementation of the mmu notifiers. More >>> methods can be added in the future. >>> >>> In short when the linux VM decides to free a page, it will unmap it >>> from the linux pagetables. However when a page is mapped not just by >>> the regular linux ptes, but also from the shadow pagetables, it's >>> currently unfreeable by the linux VM. >>> >> >> Such a patch would also address issues that SGI has with exporting >> mappings via XPMEM. Plus a variety of other uses. Go ahead and lets do >> more in this area. >> >> Are the KVM folks interested in exporting memory from one guest to >> another? That may also become possible with some of the work that we have >> in progress and that also requires a patch like this. >> >> > > Actually sharing memory is possible even without this patch; one simply > mmap()s a file into the address space of both guests. Or are you referring > to something else? He is referring to the xpmem work SGI has pushed in the past. It was rejected precisely because this type of functionality did not exist. We were trying to determine the cleanest yet smallest acceptable implementation when this suddenly sprang up. I would expect Dean Nelson or myself to repost the xpmem patch set again based upon this patch. > The patch does enable some nifty things; one example you may be familiar > with is using page migration to move a guest from one numa node to another. xpmem allows one MPI rank to "export" his address space, a different MPI rank to "import" that address space, and they share the same pages. This allows sharing of things like stack and heap space. XPMEM also provides a mechanism to share that PFN information across partition boundaries so the pages become available on a different host. 
This, of course, is dependent upon hardware that supports direct access to the memory by the processor. Thanks, Robin |
From: Avi K. <av...@qu...> - 2008-01-10 13:27:24
|
Robin Holt wrote: > >> The patch does enable some nifty things; one example you may be familiar >> with is using page migration to move a guest from one numa node to another. >> > > xpmem allows one MPI rank to "export" his address space, a different > MPI rank to "import" that address space, and they share the same pages. > This allows sharing of things like stack and heap space. XPMEM also > provides a mechanism to share that PFN information across partition > boundaries so the pages become available on a different host. This, > of course, is dependent upon hardware that supports direct access to > the memory by the processor. > > So this is yet another instance of hardware that has a tlb that needs to be kept in sync with the page tables, yes? Excellent, the more users the patch has, the easier it will be to justify it. -- error compiling committee.c: too many arguments to function |
From: Robin H. <ho...@sg...> - 2008-01-10 14:50:27
|
On Thu, Jan 10, 2008 at 03:27:24PM +0200, Avi Kivity wrote: > Robin Holt wrote: >> >>> The patch does enable some nifty things; one example you may be familiar >>> with is using page migration to move a guest from one numa node to >>> another. >>> >> >> xpmem allows one MPI rank to "export" his address space, a different >> MPI rank to "import" that address space, and they share the same pages. >> This allows sharing of things like stack and heap space. XPMEM also >> provides a mechanism to share that PFN information across partition >> boundaries so the pages become available on a different host. This, >> of course, is dependent upon hardware that supports direct access to >> the memory by the processor. >> >> > > So this is yet another instance of hardware that has a tlb that needs to be > kept in sync with the page tables, yes? Yep, the external TLBs happen to be cpus in a different OS instance, but you get the idea. > Excellent, the more users the patch has, the easier it will be to justify > it. I think we have another hardware device driver that will use it first. It is sort of a hardware coprocessor that is available from user space to do operations against a process's address space. That driver will probably be first out the door. Looking at the mmu_notifiers patch, there are locks held which will preclude the use of invalidate_page for xpmem. In that circumstance, the clearing operation will need to be messaged to the other OS instance and that will certainly involve putting the current task to sleep. We will work on that detail later. First, we will focus on getting the other driver submitted to the community. Thanks, Robin |
From: Christoph L. <cla...@sg...> - 2008-01-10 19:06:00
|
On Thu, 10 Jan 2008, Avi Kivity wrote: > So this is yet another instance of hardware that has a tlb that needs to be > kept in sync with the page tables, yes? Correct. > Excellent, the more users the patch has, the easier it will be to justify it. We'd like to make sure though that we can sleep when the hooks have been called. We may have to sent a message to kick remote ptes out when local pte changes happen. |
From: Avi K. <av...@qu...> - 2008-01-12 20:03:17
|
Christoph Lameter wrote: > >> Excellent, the more users the patch has, the easier it will be to justify it. >> > > We'd like to make sure though that we can sleep when the hooks have been > called. We may have to sent a message to kick remote ptes out when local > pte changes happen. > > It may be as simple as moving the notifier calls down to a sleeping context, away from the pte lock and any friends. kvm also needs to send a message on an mmu notification, but that's just an IPI within the same host. -- Any sufficiently difficult bug is indistinguishable from a feature. |
From: Robin H. <ho...@sg...> - 2008-01-13 12:09:36
|
On Sat, Jan 12, 2008 at 09:51:56PM +0200, Avi Kivity wrote: > Christoph Lameter wrote: >> On Thu, 10 Jan 2008, Avi Kivity wrote: >> >> >>> Actually sharing memory is possible even without this patch; one simply >>> mmap()s a file into the address space of both guests. Or are you >>> referring to >>> something else? >>> >> >> A file from where? If a file is read by two guests then they will have >> distinct page structs. >> >> > > Two kvm instances mmap() the file (from anywhere) into the guest address > space. That memory is shared, and will be backed by the same page structs > at the same offset. That sounds nice, but... For larger machine configurations, we have different memory access capabilities. When a partition that is located close to the home node of the memory accesses memory, it is normal access. When it is further away, they get special access to the line. Before the shared line is sent to the reading node, it is converted by the memory controller into an exclusive request and the reading node is handed the only copy of the line. If we gave a remote kernel access to the page, we would also open the entire owning nodes page tables up to speculative references which effectively would be viewed by hardware as cache-line contention. Additionally, we have needs beyond memory backed by files. Including special devices which do not have struct pages at all (see mspec.c). Thanks, Robin |
From: Avi K. <av...@qu...> - 2008-01-13 12:28:03
|
Robin Holt wrote: > On Sat, Jan 12, 2008 at 09:51:56PM +0200, Avi Kivity wrote: > >> Christoph Lameter wrote: >> >>> On Thu, 10 Jan 2008, Avi Kivity wrote: >>> >>> >>> >>>> Actually sharing memory is possible even without this patch; one simply >>>> mmap()s a file into the address space of both guests. Or are you >>>> referring to >>>> something else? >>>> >>>> >>> A file from where? If a file is read by two guests then they will have >>> distinct page structs. >>> >>> >>> >> Two kvm instances mmap() the file (from anywhere) into the guest address >> space. That memory is shared, and will be backed by the same page structs >> at the same offset. >> > > That sounds nice, but... > > For larger machine configurations, we have different memory access > capabilities. When a partition that is located close to the home node > of the memory accesses memory, it is normal access. When it is further > away, they get special access to the line. Before the shared line is > sent to the reading node, it is converted by the memory controller into > an exclusive request and the reading node is handed the only copy of > the line. If we gave a remote kernel access to the page, we would also > open the entire owning nodes page tables up to speculative references > which effectively would be viewed by hardware as cache-line contention. > > Additionally, we have needs beyond memory backed by files. Including > special devices which do not have struct pages at all (see mspec.c). > I don't understand. I was just explaining how kvm shares memory among guests (which does not require mmu notifiers); if you have some other configuration that can benefit from mmu notifiers, then, well, great. -- error compiling committee.c: too many arguments to function |
From: Avi K. <av...@qu...> - 2008-01-15 17:53:17
|
Christoph Lameter wrote: > On Tue, 15 Jan 2008, Avi Kivity wrote: > > >>> Duh. Impossible. Two instances of Linux cannot share page structs. So how >>> are you doing this? Or is this just an idea? >>> >> I was describing one Linux host running two guest instances. The page structs >> are in the host, so they are shared by mmap(). >> > > Ahh.. Okay I was talking about a guest exporting its memory to another > guest. > That's not very different, if they are on the same host? > > >> kvm userspace is just an ordinary host process, it can mmap() any file it >> likes and then assign that virtual memory range to the guest (as guest >> physical memory). >> > > But then the guest does not have its own page struct to manage the memory. > > Why not? It's just a block of memory as far as the guest is concerned. It's entirely up to it whether to create page structs or not. Example: qemu 1: p = mmap("/dev/shm/blah", size, ... ); ioctl(vm_fd, KVM_CREATE_MEMORY_REGION_USER, { p, size, 0x10000000, ... }); qemu 2: p = mmap("/dev/shm/blah", size, ... ); ioctl(vm_fd, KVM_CREATE_MEMORY_REGION_USER, { p, size, 0x10000000, ... }); Physical address 0x10000000, of both guests, would map to the same page. Of course, ordinary Linux kernels can't do much with memory that is shared with another guest. I've a feeling we need a whiteboard. -- error compiling committee.c: too many arguments to function |
From: Christoph L. <cla...@sg...> - 2008-01-15 17:57:40
|
On Tue, 15 Jan 2008, Avi Kivity wrote: > > Ahh.. Okay I was talking about a guest exporting its memory to another > > guest. > > > > That's not very different, if they are on the same host? But each guest has its own page structs. They cannot share page structs. Concurrent access of two independent kernel instances for synchronization and status maintenance to a single page struct? |
From: Avi K. <av...@qu...> - 2008-01-15 18:05:57
|
Christoph Lameter wrote: > On Tue, 15 Jan 2008, Avi Kivity wrote: > > >>> Ahh.. Okay I was talking about a guest exporting its memory to another >>> guest. >>> >>> >> That's not very different, if they are on the same host? >> > > But each guest has its own page structs. They cannot share page structs. > Concurrent access of two independent kernel instances for synchronization > and status maintenance to a single page struct? > There's a host page struct (that the guest knows nothing about and cannot touch), and optionally a guest page struct for each guest (that the host and the other guest know nothing about). The guest page struct is optional, since it is up to the guest to create it. kvm doesn't care. If the guest isn't Linux, there certainly won't be a page struct. The host page struct may disappear if the host decides to swap the page into its backing store and free the page. The guest page structs (if any) would remain. -- error compiling committee.c: too many arguments to function |
From: Christoph L. <cla...@sg...> - 2008-01-15 18:16:15
|
On Tue, 15 Jan 2008, Avi Kivity wrote: > > But each guest has its own page structs. They cannot share page structs. > > Concurrent access of two independent kernel instances for synchronization > > and status maintenance to a single page struct? > > > > There's a host page struct (that the guest know nothing about and cannot > touch), and optionally a guest page struct for each guest (that the host and > the other guest know nothing about). Ok so if two linux guests want to share memory three page structs are involved: 1. Host page struct 2. Guest #1 page struct 3. Guest #2 page struct I can understand that 1 and 2 point to the same physical page. Even all three could point to the same page if the page is readonly. However, lets say that Guest #1 allocates some anonymous memory and wants to share it with Guest #2. In that case something like PFNMAP is likely going to be used? Or are you remapping the physical page so that #1 and #2 share it? In that case two page struct describe state of the same physical page and we have no effective synchronization for writeback etc. > The host page struct may disappear if the host decides to swap the page into > its backing store and free the page. The guest page structs (if any) would > remain. Page structs never disappear. The pte's may disappear and the page may be unmapped from an address space of a process but the page struct stays. Page struct can only disappear if memory hotplug is activated and memory is taken out of the system. |
From: Avi K. <av...@qu...> - 2008-01-16 07:39:27
|
Christoph Lameter wrote: > On Tue, 15 Jan 2008, Avi Kivity wrote: > > >>> But each guest has its own page structs. They cannot share page structs. >>> Concurrent access of two independent kernel instances for synchronization >>> and status maintenance to a single page struct? >>> >>> >> There's a host page struct (that the guest know nothing about and cannot >> touch), and optionally a guest page struct for each guest (that the host and >> the other guest know nothing about). >> > > Ok so if two linux guests want to share memory three page structs are > involved: > > 1. Host page struct > 2. Guest #1 page struct > 3. Guest #2 page struct > > I can understand that 1 and 2 point to the same physical page. Even all > three could point to the same page if the page is readonly. > > However, lets say that Guest #1 allocates some anonymous memory and wants > to share it with Guest #2. In that case something like PFNMAP is likely > going to be used? Or are you remapping the physical page so that #1 and #2 > share it? In that case two page struct describe state of the same physical > page and we have no effective synchronization for writeback etc. > > Like I said, out of the box Linux doesn't support using memory that is shared with other instances as main memory. One usage (by the s390 folk) was to put a read-only filesystem with execute-in-place support on this memory, and so reduce the memory usage of guests. >> The host page struct may disappear if the host decides to swap the page into >> its backing store and free the page. The guest page structs (if any) would >> remain. >> > > Page structs never disappear. The pte's may disappear and the page may be > unmapped from an address space of a process but the page struct stays. > Page struct can only disappear if memory hotplug is activated and memory > is taken out of the system. > Yes, that was poorly phrased. The page and its page struct may be reallocated for other purposes. 
-- error compiling committee.c: too many arguments to function |
From: Christoph L. <cla...@sg...> - 2008-01-16 18:08:49
|
On Wed, 16 Jan 2008, Avi Kivity wrote: > Yes, that was poorly phrased. The page and its page struct may be reallocated > for other purposes. Its better to say "reused". Otherwise one may think that an allocation of page structs is needed. |