[lc-checkins] CVS: linux/mm Makefile,1.5,1.6 filemap.c,1.42,1.43 memory.c,1.36,1.37 mmap.c,1.8,1.9 o
Update of /cvsroot/linuxcompressed/linux/mm In directory sc8-pr-cvs1:/tmp/cvs-serv25395/mm Modified Files: Makefile filemap.c memory.c mmap.c oom_kill.c page_alloc.c page_io.c shmem.c swap_state.c swapfile.c vmscan.c Log Message: o Port code to 2.4.20 Bug fix (?) o Changes checks in vswap.c to avoid oopses. It will BUG() instead. Some of the checks were done after the value had been accessed. Note o Virtual swap addresses are temporarily disabled, due to debugging sessions related to the use of swap files instead of swap partitions. Index: Makefile =================================================================== RCS file: /cvsroot/linuxcompressed/linux/mm/Makefile,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -r1.5 -r1.6 *** Makefile 12 Dec 2001 20:45:46 -0000 1.5 --- Makefile 19 May 2003 01:38:47 -0000 1.6 *************** *** 10,14 **** O_TARGET := mm.o ! export-objs := shmem.o filemap.o obj-y := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \ --- 10,14 ---- O_TARGET := mm.o ! export-objs := shmem.o filemap.o memory.o page_alloc.o obj-y := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \ Index: filemap.c =================================================================== RCS file: /cvsroot/linuxcompressed/linux/mm/filemap.c,v retrieving revision 1.42 retrieving revision 1.43 diff -C2 -r1.42 -r1.43 *** filemap.c 29 Nov 2002 21:23:02 -0000 1.42 --- filemap.c 19 May 2003 01:38:47 -0000 1.43 *************** *** 24,28 **** #include <linux/mm.h> #include <linux/iobuf.h> - #include <linux/compiler.h> #include <linux/comp_cache.h> --- 24,27 ---- *************** *** 55,59 **** ! spinlock_t pagecache_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; /* * NOTE: to avoid deadlocking you must never acquire the pagemap_lru_lock --- 54,58 ---- ! spinlock_cacheline_t pagecache_lock_cacheline = {SPIN_LOCK_UNLOCKED}; /* * NOTE: to avoid deadlocking you must never acquire the pagemap_lru_lock *************** *** 65,69 **** * pagecache_lock */ ! spinlock_t pagemap_lru_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; #define CLUSTER_PAGES (1 << page_cluster) --- 64,68 ---- * pagecache_lock */ ! spinlock_cacheline_t pagemap_lru_lock_cacheline = {SPIN_LOCK_UNLOCKED}; #define CLUSTER_PAGES (1 << page_cluster) *************** *** 122,126 **** void __remove_inode_page(struct page *page) { ! if (PageDirty(page)) BUG(); remove_page_from_inode_queue(page); remove_page_from_hash_queue(page); --- 121,126 ---- void __remove_inode_page(struct page *page) { ! if (PageDirty(page) && !PageSwapCache(page)) ! BUG(); remove_page_from_inode_queue(page); remove_page_from_hash_queue(page); *************** *** 156,164 **** if (mapping) { spin_lock(&pagecache_lock); ! list_del(&page->list); ! list_add(&page->list, &mapping->dirty_pages); spin_unlock(&pagecache_lock); ! if (mapping->host) mark_inode_dirty_pages(mapping->host); #ifdef CONFIG_COMP_CACHE --- 156,167 ---- if (mapping) { spin_lock(&pagecache_lock); ! mapping = page->mapping; ! if (mapping) { /* may have been truncated */ ! list_del(&page->list); ! list_add(&page->list, &mapping->dirty_pages); ! } spin_unlock(&pagecache_lock); ! if (mapping && mapping->host) mark_inode_dirty_pages(mapping->host); #ifdef CONFIG_COMP_CACHE *************** *** 582,586 **** while (!list_empty(&mapping->dirty_pages)) { ! struct page *page = list_entry(mapping->dirty_pages.next, struct page, list); list_del(&page->list); --- 585,589 ---- while (!list_empty(&mapping->dirty_pages)) { ! 
struct page *page = list_entry(mapping->dirty_pages.prev, struct page, list); list_del(&page->list); *************** *** 816,819 **** --- 819,882 ---- } + /* + * Knuth recommends primes in approximately golden ratio to the maximum + * integer representable by a machine word for multiplicative hashing. + * Chuck Lever verified the effectiveness of this technique: + * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf + * + * These primes are chosen to be bit-sparse, that is operations on + * them can use shifts and additions instead of multiplications for + * machines where multiplications are slow. + */ + #if BITS_PER_LONG == 32 + /* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */ + #define GOLDEN_RATIO_PRIME 0x9e370001UL + #elif BITS_PER_LONG == 64 + /* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */ + #define GOLDEN_RATIO_PRIME 0x9e37fffffffc0001UL + #else + #error Define GOLDEN_RATIO_PRIME for your wordsize. + #endif + + /* + * In order to wait for pages to become available there must be + * waitqueues associated with pages. By using a hash table of + * waitqueues where the bucket discipline is to maintain all + * waiters on the same queue and wake all when any of the pages + * become available, and for the woken contexts to check to be + * sure the appropriate page became available, this saves space + * at a cost of "thundering herd" phenomena during rare hash + * collisions. + */ + static inline wait_queue_head_t *page_waitqueue(struct page *page) + { + const zone_t *zone = page_zone(page); + wait_queue_head_t *wait = zone->wait_table; + unsigned long hash = (unsigned long)page; + + #if BITS_PER_LONG == 64 + /* Sigh, gcc can't optimise this alone like it does for 32 bits. */ + unsigned long n = hash; + n <<= 18; + hash -= n; + n <<= 33; + hash -= n; + n <<= 3; + hash += n; + n <<= 3; + hash -= n; + n <<= 4; + hash += n; + n <<= 2; + hash += n; + #else + /* On some cpus multiply is faster, on others gcc will do shifts */ + hash *= GOLDEN_RATIO_PRIME; + #endif + hash >>= zone->wait_table_shift; + + return &wait[hash]; + } + /* * Wait for a page to get unlocked. *************** *** 822,832 **** * ie with increased "page->count" so that the page won't * go away during the wait.. */ void ___wait_on_page(struct page *page) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); ! add_wait_queue(&page->wait, &wait); do { set_task_state(tsk, TASK_UNINTERRUPTIBLE); --- 885,911 ---- * ie with increased "page->count" so that the page won't * go away during the wait.. + * + * The waiting strategy is to get on a waitqueue determined + * by hashing. Waiters will then collide, and the newly woken + * task must then determine whether it was woken for the page + * it really wanted, and go back to sleep on the waitqueue if + * that wasn't it. With the waitqueue semantics, it never leaves + * the waitqueue unless it calls, so the loop moves forward one + * iteration every time there is + * (1) a collision + * and + * (2) one of the colliding pages is woken + * + * This is the thundering herd problem, but it is expected to + * be very rare due to the few pages that are actually being + * waited on at any given time and the quality of the hash function. */ void ___wait_on_page(struct page *page) { + wait_queue_head_t *waitqueue = page_waitqueue(page); struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); ! add_wait_queue(waitqueue, &wait); do { set_task_state(tsk, TASK_UNINTERRUPTIBLE); *************** *** 836,852 **** schedule(); } while (PageLocked(page)); ! 
tsk->state = TASK_RUNNING; ! remove_wait_queue(&page->wait, &wait); } void unlock_page(struct page *page) { ! clear_bit(PG_launder, &(page)->flags); smp_mb__before_clear_bit(); if (!test_and_clear_bit(PG_locked, &(page)->flags)) BUG(); smp_mb__after_clear_bit(); ! if (waitqueue_active(&(page)->wait)) ! wake_up(&(page)->wait); } --- 915,946 ---- schedule(); } while (PageLocked(page)); ! __set_task_state(tsk, TASK_RUNNING); ! remove_wait_queue(waitqueue, &wait); } + /* + * unlock_page() is the other half of the story just above + * __wait_on_page(). Here a couple of quick checks are done + * and a couple of flags are set on the page, and then all + * of the waiters for all of the pages in the appropriate + * wait queue are woken. + */ void unlock_page(struct page *page) { ! wait_queue_head_t *waitqueue = page_waitqueue(page); ! ClearPageLaunder(page); smp_mb__before_clear_bit(); if (!test_and_clear_bit(PG_locked, &(page)->flags)) BUG(); smp_mb__after_clear_bit(); ! ! /* ! * Although the default semantics of wake_up() are ! * to wake all, here the specific function is used ! * to make it even more explicit that a number of ! * pages are being waited on here. ! */ ! if (waitqueue_active(waitqueue)) ! wake_up_all(waitqueue); } *************** *** 857,864 **** static void __lock_page(struct page *page) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); ! add_wait_queue_exclusive(&page->wait, &wait); for (;;) { set_task_state(tsk, TASK_UNINTERRUPTIBLE); --- 951,959 ---- static void __lock_page(struct page *page) { + wait_queue_head_t *waitqueue = page_waitqueue(page); struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); ! add_wait_queue_exclusive(waitqueue, &wait); for (;;) { set_task_state(tsk, TASK_UNINTERRUPTIBLE); *************** *** 870,877 **** break; } ! tsk->state = TASK_RUNNING; ! remove_wait_queue(&page->wait, &wait); } - /* --- 965,971 ---- break; } ! __set_task_state(tsk, TASK_RUNNING); ! remove_wait_queue(waitqueue, &wait); } /* *************** *** 1091,1103 **** /* - * Returns locked page at given index in given cache, creating it if needed. - */ - struct page *grab_cache_page(struct address_space *mapping, unsigned long index) - { - return find_or_create_page(mapping, index, mapping->gfp_mask); - } - - - /* * Same as grab_cache_page, but do not wait if the page is unavailable. * This is intended for speculative data generators, where the data can --- 1185,1188 ---- *************** *** 1381,1388 **** * Mark a page as having seen activity. * ! * If it was already so marked, move it ! * to the active queue and drop the referenced ! * bit. Otherwise, just mark it for future ! * action.. */ void mark_page_accessed(struct page *page) --- 1466,1471 ---- * Mark a page as having seen activity. * ! * If it was already so marked, move it to the active queue and drop ! * the referenced bit. Otherwise, just mark it for future action.. */ void mark_page_accessed(struct page *page) *************** *** 1391,1399 **** activate_page(page); ClearPageReferenced(page); ! return; ! } ! ! /* Mark the page referenced, AFTER checking for previous usage.. */ ! SetPageReferenced(page); } --- 1474,1479 ---- activate_page(page); ClearPageReferenced(page); ! } else ! 
SetPageReferenced(page); } *************** *** 1634,1637 **** --- 1714,1718 ---- struct address_space * mapping = filp->f_dentry->d_inode->i_mapping; struct inode * inode = mapping->host; + loff_t size = inode->i_size; new_iobuf = 0; *************** *** 1659,1662 **** --- 1740,1746 ---- goto out_free; + if ((rw == READ) && (offset + count > size)) + count = size - offset; + /* * Flush to disk exclusively the _data_, metadata must remain *************** *** 1689,1692 **** --- 1773,1777 ---- count -= retval; buf += retval; + /* warning: weird semantics here, we're reporting a read behind the end of the file */ progress += retval; } *************** *** 1778,1783 **** size = inode->i_size; if (pos < size) { - if (pos + count > size) - count = size - pos; retval = generic_file_direct_IO(READ, filp, buf, count, pos); if (retval > 0) --- 1863,1866 ---- *************** *** 2307,2310 **** --- 2390,2396 ---- struct file * file = vma->vm_file; + if ( (flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED) ) + return -EBUSY; + if (file && (vma->vm_flags & VM_SHARED)) { ret = filemap_sync(vma, start, end-start, flags); *************** *** 2348,2351 **** --- 2434,2440 ---- if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) goto out; + if ((flags & MS_ASYNC) && (flags & MS_SYNC)) + goto out; + error = 0; if (end == start) *************** *** 2353,2357 **** /* * If the interval [start,end) covers some unmapped address ranges, ! * just ignore them, but return -EFAULT at the end. */ vma = find_vma(current->mm, start); --- 2442,2446 ---- /* * If the interval [start,end) covers some unmapped address ranges, ! * just ignore them, but return -ENOMEM at the end. */ vma = find_vma(current->mm, start); *************** *** 2359,2368 **** for (;;) { /* Still start < end. */ ! error = -EFAULT; if (!vma) goto out; /* Here start < vma->vm_end. */ if (start < vma->vm_start) { ! unmapped_error = -EFAULT; start = vma->vm_start; } --- 2448,2457 ---- for (;;) { /* Still start < end. */ ! error = -ENOMEM; if (!vma) goto out; /* Here start < vma->vm_end. */ if (start < vma->vm_start) { ! unmapped_error = -ENOMEM; start = vma->vm_start; } *************** *** 2512,2516 **** /* This caps the number of vma's this process can own */ ! if (vma->vm_mm->map_count > MAX_MAP_COUNT) return -ENOMEM; --- 2601,2605 ---- /* This caps the number of vma's this process can own */ ! if (vma->vm_mm->map_count > max_map_count) return -ENOMEM; *************** *** 3077,3081 **** err = -EFBIG; ! if (limit != RLIM_INFINITY) { if (pos >= limit) { send_sig(SIGXFSZ, current, 0); --- 3166,3170 ---- err = -EFBIG; ! 
if (!S_ISBLK(inode->i_mode) && limit != RLIM_INFINITY) { if (pos >= limit) { send_sig(SIGXFSZ, current, 0); Index: memory.c =================================================================== RCS file: /cvsroot/linuxcompressed/linux/mm/memory.c,v retrieving revision 1.36 retrieving revision 1.37 diff -C2 -r1.36 -r1.37 *** memory.c 10 Sep 2002 16:43:12 -0000 1.36 --- memory.c 19 May 2003 01:38:48 -0000 1.37 *************** *** 45,48 **** --- 45,49 ---- #include <linux/highmem.h> #include <linux/pagemap.h> + #include <linux/module.h> #include <linux/comp_cache.h> *************** *** 53,56 **** --- 54,58 ---- unsigned long max_mapnr; unsigned long num_physpages; + unsigned long num_mappedpages; void * high_memory; struct page *highmem_start_page; *************** *** 529,532 **** --- 531,536 ---- } + EXPORT_SYMBOL(get_user_pages); + /* * Force in an entire range of pages from the current process's user VA, *************** *** 587,590 **** --- 591,596 ---- * size of the kiobuf, so we have to stop marking pages dirty once the * requested byte count has been reached. + * + * Must be called from process context - set_page_dirty() takes VFS locks. */ *************** *** 604,608 **** if (!PageReserved(page)) ! SetPageDirty(page); remaining -= (PAGE_SIZE - offset); --- 610,614 ---- if (!PageReserved(page)) ! set_page_dirty(page); remaining -= (PAGE_SIZE - offset); *************** *** 1500,1502 **** --- 1506,1529 ---- len, write, 0, NULL, NULL); return ret == len ? 0 : -1; + } + + struct page * vmalloc_to_page(void * vmalloc_addr) + { + unsigned long addr = (unsigned long) vmalloc_addr; + struct page *page = NULL; + pmd_t *pmd; + pte_t *pte; + pgd_t *pgd; + + pgd = pgd_offset_k(addr); + if (!pgd_none(*pgd)) { + pmd = pmd_offset(pgd, addr); + if (!pmd_none(*pmd)) { + pte = pte_offset(pmd, addr); + if (pte_present(*pte)) { + page = pte_page(*pte); + } + } + } + return page; } Index: mmap.c =================================================================== RCS file: /cvsroot/linuxcompressed/linux/mm/mmap.c,v retrieving revision 1.8 retrieving revision 1.9 diff -C2 -r1.8 -r1.9 *** mmap.c 28 Apr 2002 20:51:34 -0000 1.8 --- mmap.c 19 May 2003 01:38:48 -0000 1.9 *************** *** 47,50 **** --- 47,51 ---- int sysctl_overcommit_memory; + int max_map_count = DEFAULT_MAX_MAP_COUNT; /* Check that a process has enough memory to allocate a *************** *** 420,424 **** /* Too many mappings? */ ! if (mm->map_count > MAX_MAP_COUNT) return -ENOMEM; --- 421,425 ---- /* Too many mappings? */ ! if (mm->map_count > max_map_count) return -ENOMEM; *************** *** 485,489 **** /* Clear old maps */ - error = -ENOMEM; munmap_back: vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); --- 486,489 ---- *************** *** 555,559 **** * f_op->mmap method. -DaveM */ ! addr = vma->vm_start; vma_link(mm, vma, prev, rb_link, rb_parent); --- 555,582 ---- * f_op->mmap method. -DaveM */ ! if (addr != vma->vm_start) { ! /* ! * It is a bit too late to pretend changing the virtual ! * area of the mapping, we just corrupted userspace ! * in the do_munmap, so FIXME (not in 2.4 to avoid breaking ! * the driver API). ! */ ! struct vm_area_struct * stale_vma; ! /* Since addr changed, we rely on the mmap op to prevent ! * collisions with existing vmas and just use find_vma_prepare ! * to update the tree pointers. ! */ ! addr = vma->vm_start; ! stale_vma = find_vma_prepare(mm, addr, &prev, ! &rb_link, &rb_parent); ! /* ! * Make sure the lowlevel driver did its job right. ! */ ! 
if (unlikely(stale_vma && stale_vma->vm_start < vma->vm_end)) { ! printk(KERN_ERR "buggy mmap operation: [<%p>]\n", ! file ? file->f_op->mmap : NULL); ! BUG(); ! } ! } vma_link(mm, vma, prev, rb_link, rb_parent); *************** *** 926,930 **** /* If we'll make "hole", check the vm areas limit */ if ((mpnt->vm_start < addr && mpnt->vm_end > addr+len) ! && mm->map_count >= MAX_MAP_COUNT) return -ENOMEM; --- 949,953 ---- /* If we'll make "hole", check the vm areas limit */ if ((mpnt->vm_start < addr && mpnt->vm_end > addr+len) ! && mm->map_count >= max_map_count) return -ENOMEM; *************** *** 1047,1051 **** return -ENOMEM; ! if (mm->map_count > MAX_MAP_COUNT) return -ENOMEM; --- 1070,1074 ---- return -ENOMEM; ! if (mm->map_count > max_map_count) return -ENOMEM; *************** *** 1053,1060 **** return -ENOMEM; ! flags = calc_vm_flags(PROT_READ|PROT_WRITE|PROT_EXEC, ! MAP_FIXED|MAP_PRIVATE) | mm->def_flags; ! ! flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; /* Can we just expand an old anonymous mapping? */ --- 1076,1080 ---- return -ENOMEM; ! flags = VM_DATA_DEFAULT_FLAGS | mm->def_flags; /* Can we just expand an old anonymous mapping? */ *************** *** 1140,1144 **** mpnt = next; } - flush_tlb_mm(mm); /* This is just debugging */ --- 1160,1163 ---- *************** *** 1147,1150 **** --- 1166,1171 ---- clear_page_tables(mm, FIRST_USER_PGD_NR, USER_PTRS_PER_PGD); + + flush_tlb_mm(mm); } Index: oom_kill.c =================================================================== RCS file: /cvsroot/linuxcompressed/linux/mm/oom_kill.c,v retrieving revision 1.8 retrieving revision 1.9 diff -C2 -r1.8 -r1.9 *** oom_kill.c 14 Jan 2002 12:05:08 -0000 1.8 --- oom_kill.c 19 May 2003 01:38:48 -0000 1.9 *************** *** 112,117 **** /* * Simple selection loop. We chose the process with the highest ! * number of 'points'. We need the locks to make sure that the ! * list of task structs doesn't change while we look the other way. * * (not docbooked, we don't want this one cluttering up the manual) --- 112,116 ---- /* * Simple selection loop. We chose the process with the highest ! * number of 'points'. We expect the caller will lock the tasklist. * * (not docbooked, we don't want this one cluttering up the manual) *************** *** 123,127 **** struct task_struct *chosen = NULL; - read_lock(&tasklist_lock); for_each_task(p) { if (p->pid) { --- 122,125 ---- *************** *** 133,137 **** } } - read_unlock(&tasklist_lock); return chosen; } --- 131,134 ---- *************** *** 172,176 **** static void oom_kill(void) { ! struct task_struct *p = select_bad_process(), *q; /* Found nothing?!?! Either we hang forever, or we panic. */ --- 169,176 ---- static void oom_kill(void) { ! struct task_struct *p, *q; ! ! read_lock(&tasklist_lock); ! p = select_bad_process(); /* Found nothing?!?! Either we hang forever, or we panic. */ *************** *** 179,185 **** /* kill all processes that share the ->mm (i.e. all threads) */ - read_lock(&tasklist_lock); for_each_task(q) { ! if(q->mm == p->mm) oom_kill_task(q); } read_unlock(&tasklist_lock); --- 179,185 ---- /* kill all processes that share the ->mm (i.e. all threads) */ for_each_task(q) { ! if (q->mm == p->mm) ! oom_kill_task(q); } read_unlock(&tasklist_lock); *************** *** 190,195 **** * for more memory. */ ! current->policy |= SCHED_YIELD; ! schedule(); return; } --- 190,194 ---- * for more memory. */ ! yield(); return; } *************** *** 200,204 **** void out_of_memory(void) { ! 
static unsigned long first, last, count; unsigned long now, since; --- 199,203 ---- void out_of_memory(void) { ! static unsigned long first, last, count, lastkill; unsigned long now, since; *************** *** 243,248 **** --- 242,257 ---- /* + * If we just killed a process, wait a while + * to give that task a chance to exit. This + * avoids killing multiple processes needlessly. + */ + since = now - lastkill; + if (since < HZ*5) + return; + + /* * Ok, really out of memory. Kill something. */ + lastkill = now; oom_kill(); Index: page_alloc.c =================================================================== RCS file: /cvsroot/linuxcompressed/linux/mm/page_alloc.c,v retrieving revision 1.26 retrieving revision 1.27 diff -C2 -r1.26 -r1.27 *** page_alloc.c 29 Nov 2002 21:23:02 -0000 1.26 --- page_alloc.c 19 May 2003 01:38:48 -0000 1.27 *************** *** 2,5 **** --- 2,8 ---- * linux/mm/page_alloc.c * + * Manages the free list, the system allocates free pages here. + * Note that kmalloc() lives in slab.c + * * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds * Swap reorganised 29.12.95, Stephen Tweedie *************** *** 18,22 **** #include <linux/bootmem.h> #include <linux/slab.h> ! #include <linux/compiler.h> #include <linux/comp_cache.h> --- 21,25 ---- #include <linux/bootmem.h> #include <linux/slab.h> ! #include <linux/module.h> #include <linux/comp_cache.h> *************** *** 24,31 **** int nr_active_pages; int nr_inactive_pages; ! struct list_head inactive_list; ! struct list_head active_list; pg_data_t *pgdat_list; static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; #ifdef CONFIG_COMP_CACHE --- 27,43 ---- int nr_active_pages; int nr_inactive_pages; ! LIST_HEAD(inactive_list); ! LIST_HEAD(active_list); pg_data_t *pgdat_list; + /* + * + * The zone_table array is used to look up the address of the + * struct zone corresponding to a given zone number (ZONE_DMA, + * ZONE_NORMAL, or ZONE_HIGHMEM). + */ + zone_t *zone_table[MAX_NR_ZONES*MAX_NR_NODES]; + EXPORT_SYMBOL(zone_table); + static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; #ifdef CONFIG_COMP_CACHE *************** *** 40,71 **** /* - * Free_page() adds the page to the free lists. This is optimized for - * fast normal cases (no error jumps taken normally). - * - * The way to optimize jumps for gcc-2.2.2 is to: - * - select the "normal" case and put it inside the if () { XXX } - * - no else-statements if you can avoid them - * - * With the above two rules, you get a straight-line execution path - * for the normal case, giving better asm-code. - */ - - #define memlist_init(x) INIT_LIST_HEAD(x) - #define memlist_add_head list_add - #define memlist_add_tail list_add_tail - #define memlist_del list_del - #define memlist_entry list_entry - #define memlist_next(x) ((x)->next) - #define memlist_prev(x) ((x)->prev) - - /* * Temporary debugging check. */ ! #define BAD_RANGE(zone,x) (((zone) != (x)->zone) || (((x)-mem_map) < (zone)->zone_start_mapnr) || (((x)-mem_map) >= (zone)->zone_start_mapnr+(zone)->size)) /* ! * Buddy system. Hairy. You really aren't expected to understand this * ! * Hint: -mask = 1+~mask */ --- 52,87 ---- /* * Temporary debugging check. */ ! #define BAD_RANGE(zone, page) \ ! ( \ ! (((page) - mem_map) >= ((zone)->zone_start_mapnr+(zone)->size)) \ ! || (((page) - mem_map) < (zone)->zone_start_mapnr) \ ! || ((zone) != page_zone(page)) \ ! ) /* ! * Freeing function for a buddy system allocator. ! * Contrary to prior comments, this is *NOT* hairy, and there ! 
* is no reason for anyone not to understand it. * ! * The concept of a buddy system is to maintain direct-mapped tables ! * (containing bit values) for memory blocks of various "orders". ! * The bottom level table contains the map for the smallest allocatable ! * units of memory (here, pages), and each level above it describes ! * pairs of units from the levels below, hence, "buddies". ! * At a high level, all that happens here is marking the table entry ! * at the bottom level available, and propagating the changes upward ! * as necessary, plus some accounting needed to play nicely with other ! * parts of the VM system. ! * At each level, we keep one bit for each pair of blocks, which ! * is set to 1 iff only one of the pair is allocated. So when we ! * are allocating or freeing one, we can derive the state of the ! * other. That is, if we allocate a small block, and both were ! * free, the remainder of the region must be split into blocks. ! * If a block is freed, and its buddy is also free, then this ! * triggers coalescing into a block of larger size. ! * ! * -- wli */ *************** *** 78,86 **** zone_t *zone; ! /* Yes, think what happens when other parts of the kernel take * a reference to a page in order to pin it for io. -ben */ ! if (PageLRU(page)) lru_cache_del(page); if (page->buffers) --- 94,106 ---- zone_t *zone; ! /* ! * Yes, think what happens when other parts of the kernel take * a reference to a page in order to pin it for io. -ben */ ! if (PageLRU(page)) { ! if (unlikely(in_interrupt())) ! BUG(); lru_cache_del(page); + } if (page->buffers) *************** *** 90,99 **** if (!VALID_PAGE(page)) BUG(); - if (PageSwapCache(page)) - BUG(); if (PageLocked(page)) BUG(); - if (PageLRU(page)) - BUG(); if (PageActive(page)) BUG(); --- 110,115 ---- *************** *** 104,108 **** back_local_freelist: ! zone = page->zone; mask = (~0UL) << order; --- 120,124 ---- back_local_freelist: ! zone = page_zone(page); mask = (~0UL) << order; *************** *** 131,134 **** --- 147,152 ---- /* * Move the buddy up one level. + * This code is taking advantage of the identity: + * -mask = 1+~mask */ buddy1 = base + (page_idx ^ -mask); *************** *** 139,143 **** BUG(); ! memlist_del(&buddy1->list); mask <<= 1; area++; --- 157,161 ---- BUG(); ! list_del(&buddy1->list); mask <<= 1; area++; *************** *** 145,149 **** page_idx &= mask; } ! memlist_add_head(&(base + page_idx)->list, &area->free_list); spin_unlock_irqrestore(&zone->lock, flags); --- 163,167 ---- page_idx &= mask; } ! list_add(&(base + page_idx)->list, &area->free_list); spin_unlock_irqrestore(&zone->lock, flags); *************** *** 175,179 **** high--; size >>= 1; ! memlist_add_head(&(page)->list, &(area)->free_list); MARK_USED(index, high, area); index += size; --- 193,197 ---- high--; size >>= 1; ! list_add(&(page)->list, &(area)->free_list); MARK_USED(index, high, area); index += size; *************** *** 197,209 **** do { head = &area->free_list; ! curr = memlist_next(head); if (curr != head) { unsigned int index; ! page = memlist_entry(curr, struct page, list); if (BAD_RANGE(zone,page)) BUG(); ! memlist_del(curr); index = page - zone->zone_mem_map; if (curr_order != MAX_ORDER-1) --- 215,227 ---- do { head = &area->free_list; ! curr = head->next; if (curr != head) { unsigned int index; ! page = list_entry(curr, struct page, list); if (BAD_RANGE(zone,page)) BUG(); ! 
list_del(curr); index = page - zone->zone_mem_map; if (curr_order != MAX_ORDER-1) *************** *** 253,257 **** current->flags |= PF_MEMALLOC | PF_FREE_PAGES; ! __freed = try_to_free_pages(classzone, gfp_mask, order); current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES); --- 271,275 ---- current->flags |= PF_MEMALLOC | PF_FREE_PAGES; ! __freed = try_to_free_pages_zone(classzone, gfp_mask); current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES); *************** *** 269,273 **** do { tmp = list_entry(entry, struct page, list); ! if (tmp->index == order && memclass(tmp->zone, classzone)) { list_del(entry); current->nr_local_pages--; --- 287,291 ---- do { tmp = list_entry(entry, struct page, list); ! if (tmp->index == order && memclass(page_zone(tmp), classzone)) { list_del(entry); current->nr_local_pages--; *************** *** 281,286 **** if (!VALID_PAGE(page)) BUG(); - if (PageSwapCache(page)) - BUG(); if (PageLocked(page)) BUG(); --- 299,302 ---- *************** *** 325,328 **** --- 341,346 ---- zone = zonelist->zones; classzone = *zone; + if (classzone == NULL) + return NULL; min = 1UL << order; for (;;) { *************** *** 408,414 **** /* Yield for kswapd, and try again */ ! current->policy |= SCHED_YIELD; ! __set_current_state(TASK_RUNNING); ! schedule(); goto rebalance; } --- 426,430 ---- /* Yield for kswapd, and try again */ ! yield(); goto rebalance; } *************** *** 457,470 **** unsigned int nr_free_pages (void) { ! unsigned int sum; zone_t *zone; - pg_data_t *pgdat = pgdat_list; ! sum = 0; ! while (pgdat) { ! for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++) ! sum += zone->free_pages; ! pgdat = pgdat->node_next; ! } return sum; } --- 473,482 ---- unsigned int nr_free_pages (void) { ! unsigned int sum = 0; zone_t *zone; ! for_each_zone(zone) ! sum += zone->free_pages; ! return sum; } *************** *** 475,482 **** unsigned int nr_free_buffer_pages (void) { ! pg_data_t *pgdat = pgdat_list; unsigned int sum = 0; ! do { zonelist_t *zonelist = pgdat->node_zonelists + (GFP_USER & GFP_ZONEMASK); zone_t **zonep = zonelist->zones; --- 487,494 ---- unsigned int nr_free_buffer_pages (void) { ! pg_data_t *pgdat; unsigned int sum = 0; ! for_each_pgdat(pgdat) { zonelist_t *zonelist = pgdat->node_zonelists + (GFP_USER & GFP_ZONEMASK); zone_t **zonep = zonelist->zones; *************** *** 489,495 **** sum += size - high; } ! ! pgdat = pgdat->node_next; ! } while (pgdat); return sum; --- 501,505 ---- sum += size - high; } ! } return sum; *************** *** 499,509 **** unsigned int nr_free_highpages (void) { ! pg_data_t *pgdat = pgdat_list; unsigned int pages = 0; ! while (pgdat) { pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages; ! pgdat = pgdat->node_next; ! } return pages; } --- 509,518 ---- unsigned int nr_free_highpages (void) { ! pg_data_t *pgdat; unsigned int pages = 0; ! for_each_pgdat(pgdat) pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages; ! return pages; } *************** *** 560,565 **** nr = 0; for (;;) { ! curr = memlist_next(curr); ! if (curr == head) break; nr++; --- 569,573 ---- nr = 0; for (;;) { ! if ((curr = curr->next) == head) break; nr++; *************** *** 631,634 **** --- 639,684 ---- } + /* + * Helper functions to size the waitqueue hash table. + * Essentially these want to choose hash table sizes sufficiently + * large so that collisions trying to wait on pages are rare. + * But in fact, the number of active page waitqueues on typical + * systems is ridiculously low, less than 200. 
So this is even + * conservative, even though it seems large. + * + * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to + * waitqueues, i.e. the size of the waitq table given the number of pages. + */ + #define PAGES_PER_WAITQUEUE 256 + + static inline unsigned long wait_table_size(unsigned long pages) + { + unsigned long size = 1; + + pages /= PAGES_PER_WAITQUEUE; + + while (size < pages) + size <<= 1; + + /* + * Once we have dozens or even hundreds of threads sleeping + * on IO we've got bigger problems than wait queue collision. + * Limit the size of the wait table to a reasonable size. + */ + size = min(size, 4096UL); + + return size; + } + + /* + * This is an integer logarithm so that shifts can be used later + * to extract the more random high bits from the multiplicative + * hash function before the remainder is taken. + */ + static inline unsigned long wait_table_bits(unsigned long size) + { + return ffz(~size); + } + #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) *************** *** 682,686 **** unsigned long *zholes_size, struct page *lmem_map) { - struct page *p; unsigned long i, j; unsigned long map_size; --- 732,735 ---- *************** *** 703,709 **** printk("On node %d totalpages: %lu\n", nid, realtotalpages); - INIT_LIST_HEAD(&active_list); - INIT_LIST_HEAD(&inactive_list); - /* * Some architectures (with lots of mem and discontinous memory --- 752,755 ---- *************** *** 725,740 **** pgdat->nr_zones = 0; - /* - * Initially all pages are reserved - free ones are freed - * up by free_all_bootmem() once the early boot process is - * done. - */ - for (p = lmem_map; p < lmem_map + totalpages; p++) { - set_page_count(p, 0); - SetPageReserved(p); - init_waitqueue_head(&p->wait); - memlist_init(&p->list); - } - offset = lmem_map - mem_map; for (j = 0; j < MAX_NR_ZONES; j++) { --- 771,774 ---- *************** *** 743,746 **** --- 777,781 ---- unsigned long size, realsize; + zone_table[nid * MAX_NR_ZONES + j] = zone; realsize = size = zones_size[j]; if (zholes_size) *************** *** 757,760 **** --- 792,809 ---- continue; + /* + * The per-page waitqueue mechanism uses hashed waitqueues + * per zone. + */ + zone->wait_table_size = wait_table_size(size); + zone->wait_table_shift = + BITS_PER_LONG - wait_table_bits(zone->wait_table_size); + zone->wait_table = (wait_queue_head_t *) + alloc_bootmem_node(pgdat, zone->wait_table_size + * sizeof(wait_queue_head_t)); + + for(i = 0; i < zone->wait_table_size; ++i) + init_waitqueue_head(zone->wait_table + i); + pgdat->nr_zones = j+1; *************** *** 775,783 **** printk("BUG: wrong zone alignment, it will crash\n"); for (i = 0; i < size; i++) { struct page *page = mem_map + offset + i; ! page->zone = zone; if (j != ZONE_HIGHMEM) ! page->virtual = __va(zone_start_paddr); zone_start_paddr += PAGE_SIZE; } --- 824,840 ---- printk("BUG: wrong zone alignment, it will crash\n"); + /* + * Initially all pages are reserved - free ones are freed + * up by free_all_bootmem() once the early boot process is + * done. Non-atomic initialization, single-pass. + */ for (i = 0; i < size; i++) { struct page *page = mem_map + offset + i; ! set_page_zone(page, nid * MAX_NR_ZONES + j); ! set_page_count(page, 0); ! SetPageReserved(page); ! INIT_LIST_HEAD(&page->list); if (j != ZONE_HIGHMEM) ! set_page_address(page, __va(zone_start_paddr)); zone_start_paddr += PAGE_SIZE; } *************** *** 787,791 **** unsigned long bitmap_size; ! 
memlist_init(&zone->free_area[i].free_list); if (i == MAX_ORDER-1) { zone->free_area[i].map = NULL; --- 844,848 ---- unsigned long bitmap_size; ! INIT_LIST_HEAD(&zone->free_area[i].free_list); if (i == MAX_ORDER-1) { zone->free_area[i].map = NULL; Index: page_io.c =================================================================== RCS file: /cvsroot/linuxcompressed/linux/mm/page_io.c,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -r1.6 -r1.7 *** page_io.c 10 Sep 2002 16:43:15 -0000 1.6 --- page_io.c 19 May 2003 01:38:49 -0000 1.7 *************** *** 73,81 **** /* block_size == PAGE_SIZE/zones_used */ brw_page(rw, page, dev, zones, block_size); - - /* Note! For consistency we do all of the logic, - * decrementing the page count, and unlocking the page in the - * swap lock map - in the IO completion handler. - */ return 1; } --- 73,76 ---- *************** *** 100,105 **** if (!PageSwapCache(page)) PAGE_BUG(page); - if (page->mapping != &swapper_space) - PAGE_BUG(page); if (!rw_swap_page_base(rw, entry, page)) UnlockPage(page); --- 95,98 ---- *************** *** 117,129 **** if (!PageLocked(page)) PAGE_BUG(page); - if (PageSwapCache(page)) - PAGE_BUG(page); if (page->mapping) PAGE_BUG(page); /* needs sync_page to wait I/O completation */ page->mapping = &swapper_space; ! if (!rw_swap_page_base(rw, entry, page)) ! UnlockPage(page); ! wait_on_page(page); page->mapping = NULL; } --- 110,122 ---- if (!PageLocked(page)) PAGE_BUG(page); if (page->mapping) PAGE_BUG(page); /* needs sync_page to wait I/O completation */ page->mapping = &swapper_space; ! if (rw_swap_page_base(rw, entry, page)) ! lock_page(page); ! if (!block_flushpage(page, 0)) ! PAGE_BUG(page); page->mapping = NULL; + UnlockPage(page); } Index: shmem.c =================================================================== RCS file: /cvsroot/linuxcompressed/linux/mm/shmem.c,v retrieving revision 1.22 retrieving revision 1.23 diff -C2 -r1.22 -r1.23 *** shmem.c 10 Sep 2002 16:43:16 -0000 1.22 --- shmem.c 19 May 2003 01:38:49 -0000 1.23 *************** *** 36,39 **** --- 36,47 ---- #define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long)) + #define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) + + #define SHMEM_MAX_INDEX (SHMEM_NR_DIRECT + ENTRIES_PER_PAGE * (ENTRIES_PER_PAGE/2) * (ENTRIES_PER_PAGE+1)) + #define SHMEM_MAX_BYTES ((unsigned long long)SHMEM_MAX_INDEX << PAGE_CACHE_SHIFT) + #define VM_ACCT(size) (((size) + PAGE_CACHE_SIZE - 1) >> PAGE_SHIFT) + + /* Pretend that each entry is of this size in directory's i_size */ + #define BOGO_DIRENT_SIZE 20 #define SHMEM_SB(sb) (&sb->u.shmem_sb) *************** *** 43,47 **** static struct file_operations shmem_file_operations; static struct inode_operations shmem_inode_operations; - static struct file_operations shmem_dir_operations; static struct inode_operations shmem_dir_inode_operations; static struct vm_operations_struct shmem_vm_ops; --- 51,54 ---- *************** *** 51,55 **** atomic_t shmem_nrpages = ATOMIC_INIT(0); /* Not used right now */ ! #define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) /* --- 58,62 ---- atomic_t shmem_nrpages = ATOMIC_INIT(0); /* Not used right now */ ! 
static struct page *shmem_getpage_locked(struct shmem_inode_info *, struct inode *, unsigned long); /* *************** *** 128,134 **** * +-> 52-55 */ - - #define SHMEM_MAX_BLOCKS (SHMEM_NR_DIRECT + ENTRIES_PER_PAGE * ENTRIES_PER_PAGE/2*(ENTRIES_PER_PAGE+1)) - static swp_entry_t * shmem_swp_entry (struct shmem_inode_info *info, unsigned long index, unsigned long page) { --- 135,138 ---- *************** *** 183,187 **** swp_entry_t * res; ! if (index >= SHMEM_MAX_BLOCKS) return ERR_PTR(-EFBIG); --- 187,191 ---- swp_entry_t * res; ! if (index >= SHMEM_MAX_INDEX) return ERR_PTR(-EFBIG); *************** *** 315,318 **** --- 319,323 ---- { unsigned long index; + unsigned long partial; unsigned long freed = 0; struct shmem_inode_info * info = SHMEM_I(inode); *************** *** 322,325 **** --- 327,352 ---- spin_lock (&info->lock); index = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + partial = inode->i_size & ~PAGE_CACHE_MASK; + + if (partial) { + swp_entry_t *entry = shmem_swp_entry(info, index-1, 0); + struct page *page; + /* + * This check is racy: it's faintly possible that page + * was assigned to swap during truncate_inode_pages, + * and now assigned to file; but better than nothing. + */ + if (!IS_ERR(entry) && entry->val) { + spin_unlock(&info->lock); + page = shmem_getpage_locked(info, inode, index-1); + if (!IS_ERR(page)) { + memclear_highpage_flush(page, partial, + PAGE_CACHE_SIZE - partial); + UnlockPage(page); + page_cache_release(page); + } + spin_lock(&info->lock); + } + } while (index < info->next_index) *************** *** 336,344 **** struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); ! inode->i_size = 0; ! if (inode->i_op->truncate == shmem_truncate){ spin_lock (&shmem_ilock); list_del (&SHMEM_I(inode)->list); spin_unlock (&shmem_ilock); shmem_truncate (inode); } --- 363,371 ---- struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); ! if (inode->i_op->truncate == shmem_truncate) { spin_lock (&shmem_ilock); list_del (&SHMEM_I(inode)->list); spin_unlock (&shmem_ilock); + inode->i_size = 0; shmem_truncate (inode); } *************** *** 349,374 **** } ! static int shmem_clear_swp (swp_entry_t entry, swp_entry_t *ptr, int size) { swp_entry_t *test; ! for (test = ptr; test < ptr + size; test++) { ! if (test->val == entry.val) { ! swap_free (entry); ! *test = (swp_entry_t) {0}; return test - ptr; - } } return -1; } ! static int shmem_unuse_inode (struct shmem_inode_info *info, swp_entry_t entry, struct page *page) { swp_entry_t *ptr; unsigned long idx; int offset; ! idx = 0; spin_lock (&info->lock); ! offset = shmem_clear_swp (entry, info->i_direct, SHMEM_NR_DIRECT); if (offset >= 0) goto found; --- 376,403 ---- } ! static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *ptr, swp_entry_t *eptr) ! { swp_entry_t *test; ! for (test = ptr; test < eptr; test++) { ! if (test->val == entry.val) return test - ptr; } return -1; } ! static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page) { swp_entry_t *ptr; unsigned long idx; int offset; ! idx = 0; + ptr = info->i_direct; spin_lock (&info->lock); ! offset = info->next_index; ! if (offset > SHMEM_NR_DIRECT) ! offset = SHMEM_NR_DIRECT; ! offset = shmem_find_swp(entry, ptr, ptr + offset); if (offset >= 0) goto found; *************** *** 379,383 **** if (IS_ERR(ptr)) continue; ! offset = shmem_clear_swp (entry, ptr, ENTRIES_PER_PAGE); if (offset >= 0) goto found; --- 408,415 ---- if (IS_ERR(ptr)) continue; ! offset = info->next_index - idx; ! if (offset > ENTRIES_PER_PAGE) ! 
offset = ENTRIES_PER_PAGE; ! offset = shmem_find_swp(entry, ptr, ptr + offset); if (offset >= 0) goto found; *************** *** 387,391 **** found: if (PageCompressed(page)) ! decompress_swap_cache_page(page); delete_from_swap_cache(page); add_to_page_cache(page, info->inode->i_mapping, offset + idx); --- 419,425 ---- found: if (PageCompressed(page)) ! decompress_swap_cache_page(page); ! swap_free(entry); ! ptr[offset] = (swp_entry_t) {0}; delete_from_swap_cache(page); add_to_page_cache(page, info->inode->i_mapping, offset + idx); *************** *** 398,402 **** /* ! * unuse_shmem() search for an eventually swapped out shmem page. */ void shmem_unuse(swp_entry_t entry, struct page *page) --- 432,436 ---- /* ! * shmem_unuse() search for an eventually swapped out shmem page. */ void shmem_unuse(swp_entry_t entry, struct page *page) *************** *** 409,414 **** info = list_entry(p, struct shmem_inode_info, list); ! if (shmem_unuse_inode(info, entry, page)) break; } spin_unlock (&shmem_ilock); --- 443,452 ---- info = list_entry(p, struct shmem_inode_info, list); ! if (info->swapped && shmem_unuse_inode(info, entry, page)) { ! /* move head to start search for next from here */ ! list_del(&shmem_inodes); ! list_add_tail(&shmem_inodes, p); break; + } } spin_unlock (&shmem_ilock); *************** *** 531,535 **** /* Look it up and read it in.. */ ! page = find_get_page(&swapper_space, entry->val); if (!page) { swp_entry_t swap = *entry; --- 569,573 ---- /* Look it up and read it in.. */ ! page = lookup_swap_cache(*entry); if (!page) { swp_entry_t swap = *entry; *************** *** 588,591 **** --- 626,630 ---- return ERR_PTR(-ENOMEM); clear_highpage(page); + flush_dcache_page(page); inode->i_blocks += BLOCKS_PER_PAGE; add_to_page_cache (page, mapping, idx); *************** *** 707,717 **** inode->i_fop = &shmem_file_operations; spin_lock (&shmem_ilock); ! list_add (&SHMEM_I(inode)->list, &shmem_inodes); spin_unlock (&shmem_ilock); break; case S_IFDIR: inode->i_nlink++; inode->i_op = &shmem_dir_inode_operations; ! inode->i_fop = &shmem_dir_operations; break; case S_IFLNK: --- 746,758 ---- inode->i_fop = &shmem_file_operations; spin_lock (&shmem_ilock); ! list_add_tail(&info->list, &shmem_inodes); spin_unlock (&shmem_ilock); break; case S_IFDIR: inode->i_nlink++; + /* Some things misbehave if size == 0 on a directory */ + inode->i_size = 2 * BOGO_DIRENT_SIZE; inode->i_op = &shmem_dir_inode_operations; ! inode->i_fop = &dcache_dir_ops; break; case S_IFLNK: *************** *** 884,888 **** status = -EFAULT; ClearPageUptodate(page); - kunmap(page); goto unlock; } --- 925,928 ---- *************** *** 979,983 **** buf->f_ffree = sbinfo->free_inodes; spin_unlock (&sbinfo->stat_lock); ! buf->f_namelen = 255; return 0; } --- 1019,1023 ---- buf->f_ffree = sbinfo->free_inodes; spin_unlock (&sbinfo->stat_lock); ! 
buf->f_namelen = NAME_MAX; return 0; } *************** *** 1001,1006 **** int error = -ENOSPC; - dir->i_ctime = dir->i_mtime = CURRENT_TIME; if (inode) { d_instantiate(dentry, inode); dget(dentry); /* Extra count - pin the dentry in core */ --- 1041,1047 ---- int error = -ENOSPC; if (inode) { + dir->i_size += BOGO_DIRENT_SIZE; + dir->i_ctime = dir->i_mtime = CURRENT_TIME; d_instantiate(dentry, inode); dget(dentry); /* Extra count - pin the dentry in core */ *************** *** 1035,1038 **** --- 1076,1080 ---- return -EPERM; + dir->i_size += BOGO_DIRENT_SIZE; inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; inode->i_nlink++; *************** *** 1079,1082 **** --- 1121,1126 ---- { struct inode *inode = dentry->d_inode; + + dir->i_size -= BOGO_DIRENT_SIZE; inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; inode->i_nlink--; *************** *** 1102,1123 **** static int shmem_rename(struct inode * old_dir, struct dentry *old_dentry, struct inode * new_dir,struct dentry *new_dentry) { ! int error = -ENOTEMPTY; ! if (shmem_empty(new_dentry)) { ! struct inode *inode = new_dentry->d_inode; ! if (inode) { ! inode->i_ctime = CURRENT_TIME; ! inode->i_nlink--; ! dput(new_dentry); ! } ! error = 0; ! old_dentry->d_inode->i_ctime = old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; } ! return error; } static int shmem_symlink(struct inode * dir, struct dentry *dentry, const char * symname) { - int error; int len; struct inode *inode; --- 1146,1174 ---- static int shmem_rename(struct inode * old_dir, struct dentry *old_dentry, struct inode * new_dir,struct dentry *new_dentry) { ! struct inode *inode = old_dentry->d_inode; ! int they_are_dirs = S_ISDIR(inode->i_mode); ! if (!shmem_empty(new_dentry)) ! return -ENOTEMPTY; ! ! if (new_dentry->d_inode) { ! (void) shmem_unlink(new_dir, new_dentry); ! if (they_are_dirs) ! old_dir->i_nlink--; ! } else if (they_are_dirs) { ! old_dir->i_nlink--; ! new_dir->i_nlink++; } ! ! old_dir->i_size -= BOGO_DIRENT_SIZE; ! new_dir->i_size += BOGO_DIRENT_SIZE; ! old_dir->i_ctime = old_dir->i_mtime = ! new_dir->i_ctime = new_dir->i_mtime = ! inode->i_ctime = CURRENT_TIME; ! return 0; } static int shmem_symlink(struct inode * dir, struct dentry *dentry, const char * symname) { int len; struct inode *inode; *************** *** 1126,1138 **** struct shmem_inode_info * info; - error = shmem_mknod(dir, dentry, S_IFLNK | S_IRWXUGO, 0); - if (error) - return error; - len = strlen(symname) + 1; if (len > PAGE_CACHE_SIZE) return -ENAMETOOLONG; ! ! inode = dentry->d_inode; info = SHMEM_I(inode); inode->i_size = len-1; --- 1177,1188 ---- struct shmem_inode_info * info; len = strlen(symname) + 1; if (len > PAGE_CACHE_SIZE) return -ENAMETOOLONG; ! ! inode = shmem_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0); ! if (!inode) ! return -ENOSPC; ! 
info = SHMEM_I(inode); inode->i_size = len-1; *************** *** 1142,1154 **** inode->i_op = &shmem_symlink_inline_operations; } else { - spin_lock (&shmem_ilock); - list_add (&info->list, &shmem_inodes); - spin_unlock (&shmem_ilock); down(&info->sem); page = shmem_getpage_locked(info, inode, 0); if (IS_ERR(page)) { up(&info->sem); return PTR_ERR(page); } kaddr = kmap(page); memcpy(kaddr, symname, len); --- 1192,1206 ---- inode->i_op = &shmem_symlink_inline_operations; } else { down(&info->sem); page = shmem_getpage_locked(info, inode, 0); if (IS_ERR(page)) { up(&info->sem); + iput(inode); return PTR_ERR(page); } + inode->i_op = &shmem_symlink_inode_operations; + spin_lock (&shmem_ilock); + list_add_tail(&info->list, &shmem_inodes); + spin_unlock (&shmem_ilock); kaddr = kmap(page); memcpy(kaddr, symname, len); *************** *** 1158,1164 **** page_cache_release(page); up(&info->sem); - inode->i_op = &shmem_symlink_inode_operations; } dir->i_ctime = dir->i_mtime = CURRENT_TIME; return 0; } --- 1210,1218 ---- page_cache_release(page); up(&info->sem); } + dir->i_size += BOGO_DIRENT_SIZE; dir->i_ctime = dir->i_mtime = CURRENT_TIME; + d_instantiate(dentry, inode); + dget(dentry); return 0; } *************** *** 1321,1325 **** sbinfo->max_inodes = inodes; sbinfo->free_inodes = inodes; ! sb->s_maxbytes = (unsigned long long) SHMEM_MAX_BLOCKS << PAGE_CACHE_SHIFT; sb->s_blocksize = PAGE_CACHE_SIZE; sb->s_blocksize_bits = PAGE_CACHE_SHIFT; --- 1375,1379 ---- sbinfo->max_inodes = inodes; sbinfo->free_inodes = inodes; ! sb->s_maxbytes = SHMEM_MAX_BYTES; sb->s_blocksize = PAGE_CACHE_SIZE; sb->s_blocksize_bits = PAGE_CACHE_SHIFT; *************** *** 1360,1371 **** }; - static struct file_operations shmem_dir_operations = { - read: generic_read_dir, - readdir: dcache_readdir, - #ifdef CONFIG_TMPFS - fsync: shmem_sync_file, - #endif - }; - static struct inode_operations shmem_dir_inode_operations = { #ifdef CONFIG_TMPFS --- 1414,1417 ---- *************** *** 1463,1470 **** int vm_enough_memory(long pages); ! if (size > (unsigned long long) SHMEM_MAX_BLOCKS << PAGE_CACHE_SHIFT) return ERR_PTR(-EINVAL); ! if (!vm_enough_memory((size) >> PAGE_CACHE_SHIFT)) return ERR_PTR(-ENOMEM); --- 1509,1516 ---- int vm_enough_memory(long pages); ! if (size > SHMEM_MAX_BYTES) return ERR_PTR(-EINVAL); ! if (!vm_enough_memory(VM_ACCT(size))) return ERR_PTR(-ENOMEM); *************** *** 1488,1498 **** d_instantiate(dentry, inode); ! dentry->d_inode->i_size = size; ! shmem_truncate(inode); file->f_vfsmnt = mntget(shm_mnt); file->f_dentry = dentry; file->f_op = &shmem_file_operations; file->f_mode = FMODE_WRITE | FMODE_READ; - inode->i_nlink = 0; /* It is unlinked */ return(file); --- 1534,1543 ---- d_instantiate(dentry, inode); ! inode->i_size = size; ! inode->i_nlink = 0; /* It is unlinked */ file->f_vfsmnt = mntget(shm_mnt); file->f_dentry = dentry; file->f_op = &shmem_file_operations; file->f_mode = FMODE_WRITE | FMODE_READ; return(file); *************** *** 1503,1506 **** --- 1548,1552 ---- return ERR_PTR(error); } + /* * shmem_zero_setup - setup a shared anonymous mapping Index: swap_state.c =================================================================== RCS file: /cvsroot/linuxcompressed/linux/mm/swap_state.c,v retrieving revision 1.42 retrieving revision 1.43 diff -C2 -r1.42 -r1.43 *** swap_state.c 6 Dec 2002 19:29:21 -0000 1.42 --- swap_state.c 19 May 2003 01:38:49 -0000 1.43 *************** *** 127,131 **** BUG(); ! block_flushpage(page, 0); entry.val = page->index; --- 127,132 ---- BUG(); ! 
if (unlikely(!block_flushpage(page, 0))) ! BUG(); /* an anonymous page cannot have page->buffers set */ entry.val = page->index; Index: swapfile.c =================================================================== RCS file: /cvsroot/linuxcompressed/linux/mm/swapfile.c,v retrieving revision 1.38 retrieving revision 1.39 diff -C2 -r1.38 -r1.39 *** swapfile.c 6 Dec 2002 19:29:21 -0000 1.38 --- swapfile.c 19 May 2003 01:38:49 -0000 1.39 *************** *** 15,19 **** #include <linux/pagemap.h> #include <linux/shm.h> - #include <linux/compiler.h> #include <linux/comp_cache.h> --- 15,18 ---- *************** *** 944,956 **** * Note shmem_unuse already deleted its from swap cache. */ ! swcount = swap_map_count(*swap_map); ! if ((swcount > 0) != PageSwapCache(page)) ! BUG(); ! if ((swcount > 1) && PageDirty(page)) { rw_swap_page(WRITE, page); lock_page(page); } ! if (PageCompressed(page)) ! decompress_swap_cache_page(page); if (PageSwapCache(page)) delete_from_swap_cache(page); --- 943,952 ---- * Note shmem_unuse already deleted its from swap cache. */ ! if ((swap_map_count(*swap_map) > 1) && PageDirty(page) && PageSwapCache(page)) { rw_swap_page(WRITE, page); lock_page(page); } ! if (PageCompressed(page)) ! decompress_swap_cache_page(page); if (PageSwapCache(page)) delete_from_swap_cache(page); Index: vmscan.c =================================================================== RCS file: /cvsroot/linuxcompressed/linux/mm/vmscan.c,v retrieving revision 1.44 retrieving revision 1.45 diff -C2 -r1.44 -r1.45 *** vmscan.c 22 Nov 2002 16:01:36 -0000 1.44 --- vmscan.c 19 May 2003 01:38:50 -0000 1.45 *************** *** 2,5 **** --- 2,8 ---- * linux/mm/vmscan.c * + * The pageout daemon, decides which pages to evict (swap out) and + * does the actual work of freeing them. + * * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds * *************** *** 21,25 **** #include <linux/highmem.h> #include <linux/file.h> - #include <linux/compiler.h> #include <linux/comp_cache.h> --- 24,27 ---- *************** *** 60,64 **** /* Don't bother replenishing zones not under pressure.. */ ! if (!memclass(page->zone, classzone)) return 0; --- 62,66 ---- /* Don't bother replenishing zones not under pressure.. */ ! if (!memclass(page_zone(page), classzone)) return 0; *************** *** 241,246 **** end = vma->vm_end; ! if (address >= end) ! BUG(); do { count = swap_out_pgd(mm, vma, pgdir, address, end, count, classzone); --- 243,247 ---- end = vma->vm_end; ! BUG_ON(address >= end); do { count = swap_out_pgd(mm, vma, pgdir, address, end, count, classzone); *************** *** 361,368 **** page = list_entry(entry, struct page, lru); ! if (unlikely(!PageLRU(page))) ! BUG(); ! if (unlikely(PageActive(page))) ! BUG(); list_del(entry); --- 362,367 ---- page = list_entry(entry, struct page, lru); ! BUG_ON(!PageLRU(page)); ! BUG_ON(PageActive(page)); list_del(entry); *************** *** 376,380 **** continue; ! if (!memclass(page->zone, classzone)) continue; --- 375,379 ---- continue; ! if (!memclass(page_zone(page), classzone)) continue; *************** *** 643,647 **** } ! int try_to_free_pages(zone_t *classzone, unsigned int gfp_mask, unsigned int order) { int priority = DEF_PRIORITY; --- 642,646 ---- } ! 
int try_to_free_pages_zone(zone_t *classzone, unsigned int gfp_mask) { int priority = DEF_PRIORITY; *************** *** 663,666 **** --- 662,684 ---- } + int try_to_free_pages(unsigned int gfp_mask) + { + pg_data_t *pgdat; + zonelist_t *zonelist; + unsigned long pf_free_pages; + int error = 0; + + pf_free_pages = current->flags & PF_FREE_PAGES; + current->flags &= ~PF_FREE_PAGES; + + for_each_pgdat(pgdat) { + zonelist = pgdat->node_zonelists + (gfp_mask & GFP_ZONEMASK); + error |= try_to_free_pages_zone(zonelist->zones[0], gfp_mask); + } + + current->flags |= pf_free_pages; + return error; + } + DECLARE_WAIT_QUEUE_HEAD(kswapd_wait); *************** *** 689,693 **** if (!zone->need_balance) continue; ! if (!try_to_free_pages(zone, GFP_KSWAPD, 0)) { zone->need_balance = 0; __set_current_state(TASK_INTERRUPTIBLE); --- 707,711 ---- if (!zone->need_balance) continue; ! if (!try_to_free_pages_zone(zone, GFP_KSWAPD)) { zone->need_balance = 0; __set_current_state(TASK_INTERRUPTIBLE); *************** *** 711,718 **** do { need_more_balance = 0; ! pgdat = pgdat_list; ! do need_more_balance |= kswapd_balance_pgdat(pgdat); - while ((pgdat = pgdat->node_next)); } while (need_more_balance); } --- 729,735 ---- do { need_more_balance = 0; ! ! for_each_pgdat(pgdat) need_more_balance |= kswapd_balance_pgdat(pgdat); } while (need_more_balance); } *************** *** 737,746 **** pg_data_t * pgdat; ! pgdat = pgdat_list; ! do { ! if (kswapd_can_sleep_pgdat(pgdat)) ! continue; ! return 0; ! } while ((pgdat = pgdat->node_next)); return 1; --- 754,761 ---- pg_data_t * pgdat; ! for_each_pgdat(pgdat) { ! if (!kswapd_can_sleep_pgdat(pgdat)) ! return 0; ! } return 1; |
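
For anyone reading the filemap.c and page_alloc.c hunks above, here is a minimal standalone sketch of the hashed page-waitqueue lookup this port pulls in from 2.4.20. It mirrors wait_table_size(), wait_table_bits() and the 32-bit multiplicative hash used by page_waitqueue(); it is an illustration only, not kernel code, and the main() harness, the example zone size and the example page address are made up.

#include <stdio.h>
#include <stdint.h>

/* Constants taken from the hunks above (32-bit case). */
#define GOLDEN_RATIO_PRIME  0x9e370001UL
#define PAGES_PER_WAITQUEUE 256

/* Mirror of wait_table_size(): smallest power of two >= pages/256, capped at 4096. */
static unsigned long wait_table_size(unsigned long pages)
{
	unsigned long size = 1;

	pages /= PAGES_PER_WAITQUEUE;
	while (size < pages)
		size <<= 1;
	if (size > 4096UL)
		size = 4096UL;
	return size;
}

/* Mirror of wait_table_bits(): log2 of the (power-of-two) table size. */
static unsigned long wait_table_bits(unsigned long size)
{
	unsigned long bits = 0;

	while ((size >>= 1) != 0)
		bits++;
	return bits;
}

/*
 * Hash a page address into a bucket index the way page_waitqueue() does on
 * a 32-bit box: multiply by the bit-sparse golden-ratio prime and keep the
 * top wait_table_bits() bits of the product.
 */
static unsigned long page_waitqueue_index(uint32_t page_addr, unsigned long table_size)
{
	unsigned long shift = 32 - wait_table_bits(table_size);
	uint32_t hash = page_addr * (uint32_t)GOLDEN_RATIO_PRIME;

	return hash >> shift;
}

int main(void)
{
	unsigned long zone_pages = 131072;	/* hypothetical 512 MB zone of 4 KB pages */
	unsigned long size = wait_table_size(zone_pages);
	uint32_t addr = 0xc1000020;		/* hypothetical struct page address */

	printf("wait table: %lu buckets for %lu pages\n", size, zone_pages);
	printf("page at 0x%lx hashes to bucket %lu\n",
	       (unsigned long)addr, page_waitqueue_index(addr, size));
	return 0;
}

The point of the bit-sparse prime is that the multiply can be replaced by shifts and adds on CPUs where multiplication is slow, while the right shift by wait_table_shift keeps the well-mixed high bits of the product as the bucket index, so colliding waiters (the "thundering herd" case described in the filemap.c comment) stay rare.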