From: <bla...@ya...> - 2005-07-28 18:42:03
|
There is a lot of code which is duplicated between the 2 and 3 level implementation, with the only difference that the 3-level implementation is a bit more generalized (instead of accessing directly pte_t.pte, it uses the appropriate access macros). So this code is joined together. As obvious, a "core code nice cleanup" is not a "stability-friendly patch" so usual care applies. Signed-off-by: Paolo 'Blaisorblade' Giarrusso <bla...@ya...> --- linux-2.6.git-paolo/include/asm-um/pgtable-2level.h | 27 ----------- linux-2.6.git-paolo/include/asm-um/pgtable-3level.h | 36 --------------- linux-2.6.git-paolo/include/asm-um/pgtable.h | 46 ++++++++++++++++++++ 3 files changed, 46 insertions(+), 63 deletions(-) diff -puN include/asm-um/pgtable-2level.h~uml-join-page-bits-handling include/asm-um/pgtable-2level.h --- linux-2.6.git/include/asm-um/pgtable-2level.h~uml-join-page-bits-handling 2005-07-24 00:49:30.000000000 +0200 +++ linux-2.6.git-paolo/include/asm-um/pgtable-2level.h 2005-07-24 00:49:30.000000000 +0200 @@ -35,35 +35,8 @@ static inline int pgd_newpage(pgd_t pgd) { return 0; } static inline void pgd_mkuptodate(pgd_t pgd) { } -#define pte_present(x) (pte_val(x) & (_PAGE_PRESENT | _PAGE_PROTNONE)) - -static inline pte_t pte_mknewprot(pte_t pte) -{ - pte_val(pte) |= _PAGE_NEWPROT; - return(pte); -} - -static inline pte_t pte_mknewpage(pte_t pte) -{ - pte_val(pte) |= _PAGE_NEWPAGE; - return(pte); -} - -static inline void set_pte(pte_t *pteptr, pte_t pteval) -{ - /* If it's a swap entry, it needs to be marked _PAGE_NEWPAGE so - * fix_range knows to unmap it. _PAGE_NEWPROT is specific to - * mapped pages. 
- */ - *pteptr = pte_mknewpage(pteval); - if(pte_present(*pteptr)) *pteptr = pte_mknewprot(*pteptr); -} -#define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep,pteval) - #define set_pmd(pmdptr, pmdval) (*(pmdptr) = (pmdval)) -#define pte_page(x) pfn_to_page(pte_pfn(x)) -#define pte_none(x) !(pte_val(x) & ~_PAGE_NEWPAGE) #define pte_pfn(x) phys_to_pfn(pte_val(x)) #define pfn_pte(pfn, prot) __pte(pfn_to_phys(pfn) | pgprot_val(prot)) #define pfn_pmd(pfn, prot) __pmd(pfn_to_phys(pfn) | pgprot_val(prot)) diff -puN include/asm-um/pgtable-3level.h~uml-join-page-bits-handling include/asm-um/pgtable-3level.h --- linux-2.6.git/include/asm-um/pgtable-3level.h~uml-join-page-bits-handling 2005-07-24 00:49:30.000000000 +0200 +++ linux-2.6.git-paolo/include/asm-um/pgtable-3level.h 2005-07-24 00:49:30.000000000 +0200 @@ -57,35 +57,6 @@ static inline int pgd_newpage(pgd_t pgd) static inline void pgd_mkuptodate(pgd_t pgd) { pgd_val(pgd) &= ~_PAGE_NEWPAGE; } - -#define pte_present(x) pte_get_bits(x, (_PAGE_PRESENT | _PAGE_PROTNONE)) - -static inline pte_t pte_mknewprot(pte_t pte) -{ - pte_set_bits(pte, _PAGE_NEWPROT); - return(pte); -} - -static inline pte_t pte_mknewpage(pte_t pte) -{ - pte_set_bits(pte, _PAGE_NEWPAGE); - return(pte); -} - -static inline void set_pte(pte_t *pteptr, pte_t pteval) -{ - pte_copy(*pteptr, pteval); - - /* If it's a swap entry, it needs to be marked _PAGE_NEWPAGE so - * fix_range knows to unmap it. _PAGE_NEWPROT is specific to - * mapped pages. 
- */ - - *pteptr = pte_mknewpage(*pteptr); - if(pte_present(*pteptr)) *pteptr = pte_mknewprot(*pteptr); -} -#define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep,pteval) - #define set_pmd(pmdptr, pmdval) set_64bit((phys_t *) (pmdptr), pmd_val(pmdval)) static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) @@ -113,13 +84,6 @@ static inline void pud_clear (pud_t * pu #define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \ pmd_index(address)) -#define pte_page(x) pfn_to_page(pte_pfn(x)) - -static inline int pte_none(pte_t pte) -{ - return pte_is_zero(pte); -} - static inline unsigned long pte_pfn(pte_t pte) { return phys_to_pfn(pte_val(pte)); diff -puN include/asm-um/pgtable.h~uml-join-page-bits-handling include/asm-um/pgtable.h --- linux-2.6.git/include/asm-um/pgtable.h~uml-join-page-bits-handling 2005-07-24 00:49:30.000000000 +0200 +++ linux-2.6.git-paolo/include/asm-um/pgtable.h 2005-07-24 06:11:58.000000000 +0200 @@ -151,10 +151,24 @@ extern unsigned long pg0[1024]; #define pmd_page(pmd) phys_to_page(pmd_val(pmd) & PAGE_MASK) +#define pte_page(x) pfn_to_page(pte_pfn(x)) #define pte_address(x) (__va(pte_val(x) & PAGE_MASK)) #define mk_phys(a, r) ((a) + (((unsigned long) r) << REGION_SHIFT)) #define phys_addr(p) ((p) & ~REGION_MASK) +#define pte_present(x) pte_get_bits(x, (_PAGE_PRESENT | _PAGE_PROTNONE)) + +/* + * ================================= + * Flags checking section. + * ================================= + */ + +static inline int pte_none(pte_t pte) +{ + return pte_is_zero(pte); +} + /* * The following only work if pte_present() is true. * Undefined behaviour if not.. @@ -210,6 +224,18 @@ static inline int pte_newprot(pte_t pte) return(pte_present(pte) && (pte_get_bits(pte, _PAGE_NEWPROT))); } +/* + * ================================= + * Flags setting section. 
+ * ================================= + */ + +static inline pte_t pte_mknewprot(pte_t pte) +{ + pte_set_bits(pte, _PAGE_NEWPROT); + return(pte); +} + static inline pte_t pte_rdprotect(pte_t pte) { pte_clear_bits(pte, _PAGE_USER); @@ -278,6 +304,26 @@ static inline pte_t pte_mkuptodate(pte_t return(pte); } +static inline pte_t pte_mknewpage(pte_t pte) +{ + pte_set_bits(pte, _PAGE_NEWPAGE); + return(pte); +} + +static inline void set_pte(pte_t *pteptr, pte_t pteval) +{ + pte_copy(*pteptr, pteval); + + /* If it's a swap entry, it needs to be marked _PAGE_NEWPAGE so + * fix_range knows to unmap it. _PAGE_NEWPROT is specific to + * mapped pages. + */ + + *pteptr = pte_mknewpage(*pteptr); + if(pte_present(*pteptr)) *pteptr = pte_mknewprot(*pteptr); +} +#define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep,pteval) + extern phys_t page_to_phys(struct page *page); /* _ |
From: Jeff D. <jd...@ad...> - 2005-07-31 12:50:07
|
On Thu, Jul 28, 2005 at 08:56:53PM +0200, bla...@ya... wrote: > As obvious, a "core code nice cleanup" is not a "stability-friendly patch" so > usual care applies. These look reasonable, as they are what we discussed in Ottawa. I'll put them in my tree and see if I see any problems. I would suggest sending these in early after 2.6.13 if they seem OK. Jeff |
From: Blaisorblade <bla...@ya...> - 2005-07-30 19:05:00
|
On Saturday 30 July 2005 18:02, Jeff Dike wrote: > On Thu, Jul 28, 2005 at 08:56:53PM +0200, bla...@ya... wrote: > > As obvious, a "core code nice cleanup" is not a "stability-friendly > > patch" so usual care applies. > > These look reasonable, as they are what we discussed in Ottawa. > > I'll put them in my tree and see if I see any problems. I would > suggest sending these in early after 2.6.13 if they seem OK. Excluding the accessed handling it's ok, for the accessed handling I'm doubtful. Could you check if this was introduced recently (for instance by the introduction of flush_range_common, which IIRC is recent) or if it's old? For instance, in latest 2.4, and/or in 2.6.9. If this is a regression the fix for accessed handling can be merged too. -- Inform me of my mistakes, so I can keep imitating Homer Simpson's "Doh!". Paolo Giarrusso, aka Blaisorblade (Skype ID "PaoloGiarrusso", ICQ 215621894) http://www.user-mode-linux.org/~blaisorblade ___________________________________ Yahoo! Mail: gratis 1GB per i messaggi e allegati da 10MB http://mail.yahoo.it |
From: Blaisorblade <bla...@ya...> - 2005-08-12 18:41:49
|
On Saturday 30 July 2005 18:02, Jeff Dike wrote: > On Thu, Jul 28, 2005 at 08:56:53PM +0200, bla...@ya... wrote: > > As obvious, a "core code nice cleanup" is not a "stability-friendly > > patch" so usual care applies. > These look reasonable, as they are what we discussed in Ottawa. > I'll put them in my tree and see if I see any problems. I would > suggest sending these in early after 2.6.13 if they seem OK. I've discovered that we're not the only one to miss dirty / accessed "hardware" bits: see include/asm-alpha/pgtable.h (they don't have the accessed bit). So maybe we could drop the "fault-on-access" thing. Also, note the comment before handle_pte_fault: /* * These routines also need to handle stuff like marking pages dirty * and/or accessed for architectures that don't do it in hardware (most * RISC architectures). The early dirtying is also good on the i386. */ I'm not able to find where we clean the dirty bit on a pte, however it's not only done by pte_mkclean, there are some macros like ptep_clear... in asm-generic/pgtable.h -- Inform me of my mistakes, so I can keep imitating Homer Simpson's "Doh!". Paolo Giarrusso, aka Blaisorblade (Skype ID "PaoloGiarrusso", ICQ 215621894) http://www.user-mode-linux.org/~blaisorblade ___________________________________ Yahoo! Mail: gratis 1GB per i messaggi e allegati da 10MB http://mail.yahoo.it |
From: Blaisorblade <bla...@ya...> - 2005-08-12 13:05:47
Attachments:
uml-avoid-already-done-dirtying.patch
|
On Saturday 30 July 2005 18:02, Jeff Dike wrote: > On Thu, Jul 28, 2005 at 08:56:53PM +0200, bla...@ya... wrote: > > As obvious, a "core code nice cleanup" is not a "stability-friendly > > patch" so usual care applies. > > These look reasonable, as they are what we discussed in Ottawa. > > I'll put them in my tree and see if I see any problems. I would > suggest sending these in early after 2.6.13 if they seem OK. Just noticed: you can drop them (except the first, which is a nice cleanup). set_pte handles that, and include/asm-generic/pgtable.h uses coherently set_pte_at. I've checked UML by examining "grep pte", and either mk_pte or set_pte are used. Exceptions: fixaddr_user_init (but that should be ok as we shouldn't map it actually), pte_modify() (which handles that only for present pages). But pte_modify is used with set_pte, so probably we could as well drop that handling. Also look, on the "set_pte" theme, at the attached patch. I realized this when I needed those lines to work - I was getting a segfault loop. After using set_pte(), things worked. I have now an almost perfectly working implementation of remap_file_pages with protection support. There will probably be some other things to update, like swapping locations, but I can't get this kernel to fail (it's easier to find bugs in the test-program, it grew quite complex). And, I'd like to note, original Ingo's version *DID NOT* work properly (it was not safe against swapout, it didn't allow write-protecting a page successfully). I'm going to clean up the code and write changelogs, to send then the patches for -mm (hoping the page fault scalability patches don't get in the way). -- Inform me of my mistakes, so I can keep imitating Homer Simpson's "Doh!". Paolo Giarrusso, aka Blaisorblade (Skype ID "PaoloGiarrusso", ICQ 215621894) http://www.user-mode-linux.org/~blaisorblade |
From: Jeff D. <jd...@ad...> - 2005-08-12 17:35:20
|
On Wed, Aug 10, 2005 at 09:37:28PM +0200, Blaisorblade wrote: > Just noticed: you can drop them (except the first, which is a nice cleanup). > > set_pte handles that, and include/asm-generic/pgtable.h uses coherently > set_pte_at. I've checked UML by examining "grep pte", and either mk_pte or > set_pte are used. > > Exceptions: fixaddr_user_init (but that should be ok as we shouldn't map it > actually), pte_modify() (which handles that only for present pages). > > But pte_modify is used with set_pte, so probably we could as well drop that > handling. > > Also look, on the "set_pte" theme, at the attached patch. I realized this when > I needed those lines to work - I was getting a segfault loop. OK, this sounds right. To recap, we were concerned that when the page scanner went around clearing dirty (accessed) bits, that fix_range_common wasn't write (read) protecting the page in order to emulate the cleared bits. However, set_pte does set _PAGE_NEWPAGE, which forces the page through a full update, including dirty/accessed bit emulation. Correct? However, I did add some parentheses to your patch: WARN_ON(!pte_young(*pte) || (pte_write(*pte) && !pte_dirty(*pte))); This seems clearer to me. So, your pte consolidation patch is still in my tree, the other two pte patches are dropped. > After using set_pte(), things worked. I have now an almost perfectly working > implementation of remap_file_pages with protection support. > There will probably be some other things to update, like swapping locations, > but I can't get this kernel to fail (it's easier to find bugs in the > test-program, it grew quite complex). Excellent. > I'm going to clean up the code and write changelogs, to send then the patches > for -mm (hoping the page fault scalability patches don't get in the > way). Good. Jeff |
From: Jeff D. <jd...@ad...> - 2005-09-02 21:19:19
|
On Wed, Aug 10, 2005 at 09:37:28PM +0200, Blaisorblade wrote: > Also look, on the "set_pte" theme, at the attached patch. + WARN_ON(!pte_young(*pte) || pte_write(*pte) && !pte_dirty(*pte)); This one has been firing on me, and I decided to figure out why. The culprit is this code in do_no_page: if (pte_none(*page_table)) { if (!PageReserved(new_page)) inc_mm_counter(mm, rss); flush_icache_page(vma, new_page); entry = mk_pte(new_page, vma->vm_page_prot); if (write_access) entry = maybe_mkwrite(pte_mkdirty(entry), vma); set_pte_at(mm, address, page_table, entry); The first mk_pte immediately sets the pte to the protection limits of the VMA, regardless of the access type. So, if it's a read access on a writeable page, we get a writeable, but not dirty pte, since the mkdirty never happens. This exercises the warning you added. This seems somewhat bogus to me. If we set the pte protection to its limits, then the maybe_mkwrite is unnecessary. If we are the process in this address space, and we have a write access, then the maybe_mkwrite doesn't do anything because the pte is already writeable because the VMA has to be writeable, or we would have been faulted already. If we are a debugger changing the process memory, then the vma may be read-only, and maybe_mkwrite is explicitly a no-op in this case. This doesn't seem to harm our dirty bit emulation. fix_range_common checks the dirty and accessed bits and disables read and write protection as appropriate. So, it seems like the warning could be dropped, or perhaps made more selective, like checking for is_write == 0 and VM_WRITE, but then the test is getting complicated. Heff |
From: Hugh D. <hu...@ve...> - 2005-09-03 05:06:45
|
On Fri, 2 Sep 2005, Jeff Dike wrote: > On Wed, Aug 10, 2005 at 09:37:28PM +0200, Blaisorblade wrote: > > Also look, on the "set_pte" theme, at the attached patch. > > + WARN_ON(!pte_young(*pte) || pte_write(*pte) && !pte_dirty(*pte)); > > This one has been firing on me, and I decided to figure out why. The > culprit is this code in do_no_page: > > if (pte_none(*page_table)) { > if (!PageReserved(new_page)) > inc_mm_counter(mm, rss); > > flush_icache_page(vma, new_page); > entry = mk_pte(new_page, vma->vm_page_prot); > if (write_access) > entry = maybe_mkwrite(pte_mkdirty(entry), vma); > set_pte_at(mm, address, page_table, entry); > > The first mk_pte immediately sets the pte to the protection limits of > the VMA, regardless of the access type. So, if it's a read access on > a writeable page, we get a writeable, but not dirty pte, since the > mkdirty never happens. The exercises the warning you added. > > This seems somewhat bogus to me. If we set the pte protection to its > limits, then the maybe_mkwrite is unneccesary. > > If we are the process in this address space, and we have a write > access, then the maybe_mkwrite doesn't do anything because the pte is > already writeable because the VMA has to be writeable, or we would > have been faulted already. Not at all. The private, COW areas. They may be writeable in the sense that VM_WRITE is set, but pte_write permission cannot be in vm_page_prot, or we'd never get the fault when to Copy On Write. What is bogus, I think, are those places where we do the reverse: like do_anonymous_page's pte_wrprotect of its mkpte of the ZERO_PAGE. > If we are a debugger changing the process memory, then the vma may be > read-only, and maybe_mkwrite is explicitly a no-op in this case. > > This doesn't seem to harm our dirty bit emulation. fix_range_common > checks the dirty and accessed bits and disables read and write > protection as appropriate. 
> > So, it seems like the warning could be dropped, or perhaps made more > selective, like checking for is_write == 0 and VM_WRITE, but then the > test is getting complicated. > > Heff Jugh |
From: Blaisorblade <bla...@ya...> - 2005-09-04 11:36:45
|
On Friday 02 September 2005 22:17, Jeff Dike wrote: > On Wed, Aug 10, 2005 at 09:37:28PM +0200, Blaisorblade wrote: > > Also look, on the "set_pte" theme, at the attached patch. > + WARN_ON(!pte_young(*pte) || pte_write(*pte) && !pte_dirty(*pte)); > This one has been firing on me, and I decided to figure out why. The > culprit is this code in do_no_page: > if (pte_none(*page_table)) { > if (!PageReserved(new_page)) > inc_mm_counter(mm, rss); > > flush_icache_page(vma, new_page); > entry = mk_pte(new_page, vma->vm_page_prot); > if (write_access) > entry = maybe_mkwrite(pte_mkdirty(entry), vma); > set_pte_at(mm, address, page_table, entry); > > The first mk_pte immediately sets the pte to the protection limits of > the VMA, regardless of the access type. > So, if it's a read access on > a writeable page, we get a writeable, but not dirty pte, since the > mkdirty never happens. The exercises the warning you added. Thanks for noticing - I had really this doubt when writing some code (in the patch, I've added a dirty PTEs on read accesses because I was unsure, and even because of my warning). > This seems somewhat bogus to me. If we set the pte protection to its > limits, then the maybe_mkwrite is unneccesary. > This doesn't seem to harm our dirty bit emulation. fix_range_common > checks the dirty and accessed bits and disables read and write > protection as appropriate. > So, it seems like the warning could be dropped, or perhaps made more > selective, like checking for is_write == 0 and VM_WRITE, but then the > test is getting complicated. No, just replace pte_write() with is_write, as below. They might not coincide, but if on a write fault we return with a clean PTE, we'll loop indefinitely (experienced while hacking on remap_f_p), so the warning above is definitely correct. WARN_ON(!pte_young(*pte) || is_write && !pte_dirty(*pte)); > Heff -- Inform me of my mistakes, so I can keep imitating Homer Simpson's "Doh!". 
Paolo Giarrusso, aka Blaisorblade (Skype ID "PaoloGiarrusso", ICQ 215621894) http://www.user-mode-linux.org/~blaisorblade ___________________________________ Yahoo! Messenger: chiamate gratuite in tutto il mondo http://it.beta.messenger.yahoo.com |