From: Andrew M. <ak...@zi...> - 2002-05-08 05:19:51
Here's one possible solution to the problem of ZONE_NORMAL exhaustion
on large-memory x86 machines in the 2.4 kernel.  Basically: hunt down
buffer_heads and kill them wherever possible.

This patch is a rough quickie.  I'd appreciate it if someone who has
the interest and the hardware could test it and indicate whether it is
worth further development.

Some people may voice objections to the proactive stripping of disk
mappings.  I can shoot that down, and benchmarks talk.

Patch is against 2.4.19-pre8.

--- 2.4.19-pre8/include/linux/pagemap.h~nuke-buffers	Tue May  7 22:01:20 2002
+++ 2.4.19-pre8-akpm/include/linux/pagemap.h	Tue May  7 22:03:09 2002
@@ -89,13 +89,7 @@ extern void add_to_page_cache(struct pag
 extern void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index);
 extern int add_to_page_cache_unique(struct page * page, struct address_space *mapping, unsigned long index, struct page **hash);
 
-extern void ___wait_on_page(struct page *);
-
-static inline void wait_on_page(struct page * page)
-{
-	if (PageLocked(page))
-		___wait_on_page(page);
-}
+extern void wait_on_page(struct page *);
 
 extern struct page * grab_cache_page (struct address_space *, unsigned long);
 extern struct page * grab_cache_page_nowait (struct address_space *, unsigned long);
--- 2.4.19-pre8/mm/filemap.c~nuke-buffers	Tue May  7 22:01:20 2002
+++ 2.4.19-pre8-akpm/mm/filemap.c	Tue May  7 22:01:20 2002
@@ -608,7 +608,7 @@ int filemap_fdatawait(struct address_spa
 		page_cache_get(page);
 		spin_unlock(&pagecache_lock);
 
-		___wait_on_page(page);
+		wait_on_page(page);
 
 		if (PageError(page))
 			ret = -EIO;
@@ -805,33 +805,29 @@ static inline wait_queue_head_t *page_wa
 	return &wait[hash];
 }
 
-/*
- * Wait for a page to get unlocked.
+static void kill_buffers(struct page *page)
+{
+	if (!PageLocked(page))
+		BUG();
+	if (page->buffers)
+		try_to_release_page(page, GFP_NOIO);
+}
+
+/*
+ * Wait for a page to come unlocked.  Then try to ditch its buffer_heads.
  *
- * This must be called with the caller "holding" the page,
- * ie with increased "page->count" so that the page won't
- * go away during the wait..
+ * FIXME: Make the ditching dependent on CONFIG_MONSTER_BOX or something.
  */
-void ___wait_on_page(struct page *page)
+void wait_on_page(struct page *page)
 {
-	wait_queue_head_t *waitqueue = page_waitqueue(page);
-	struct task_struct *tsk = current;
-	DECLARE_WAITQUEUE(wait, tsk);
-
-	add_wait_queue(waitqueue, &wait);
-	do {
-		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
-		if (!PageLocked(page))
-			break;
-		sync_page(page);
-		schedule();
-	} while (PageLocked(page));
-	__set_task_state(tsk, TASK_RUNNING);
-	remove_wait_queue(waitqueue, &wait);
+	lock_page(page);
+	kill_buffers(page);
+	unlock_page(page);
 }
+EXPORT_SYMBOL(wait_on_page);
 
 /*
- * Unlock the page and wake up sleepers in ___wait_on_page.
+ * Unlock the page and wake up sleepers in lock_page.
  */
 void unlock_page(struct page *page)
 {
@@ -1400,6 +1396,11 @@ found_page:
 
 	if (!Page_Uptodate(page))
 		goto page_not_up_to_date;
+	if (page->buffers) {
+		lock_page(page);
+		kill_buffers(page);
+		unlock_page(page);
+	}
 	generic_file_readahead(reada_ok, filp, inode, page);
 page_ok:
 	/* If users can be writing to this page using arbitrary
@@ -1457,6 +1458,7 @@ page_not_up_to_date:
 
 	/* Did somebody else fill it already? */
 	if (Page_Uptodate(page)) {
+		kill_buffers(page);
 		UnlockPage(page);
 		goto page_ok;
 	}
@@ -1948,6 +1950,11 @@ retry_find:
 	 */
 	if (!Page_Uptodate(page))
 		goto page_not_uptodate;
+	if (page->buffers) {
+		lock_page(page);
+		kill_buffers(page);
+		unlock_page(page);
+	}
 
 success:
 	/*
@@ -2006,6 +2013,7 @@ page_not_uptodate:
 
 	/* Did somebody else get it up-to-date? */
 	if (Page_Uptodate(page)) {
+		kill_buffers(page);
 		UnlockPage(page);
 		goto success;
 	}
@@ -2033,6 +2041,7 @@ page_not_uptodate:
 
 	/* Somebody else successfully read it in? */
 	if (Page_Uptodate(page)) {
+		kill_buffers(page);
 		UnlockPage(page);
 		goto success;
 	}
@@ -2850,6 +2859,7 @@ retry:
 		goto retry;
 	}
 	if (Page_Uptodate(page)) {
+		kill_buffers(page);
 		UnlockPage(page);
 		goto out;
 	}
--- 2.4.19-pre8/kernel/ksyms.c~nuke-buffers	Tue May  7 22:01:20 2002
+++ 2.4.19-pre8-akpm/kernel/ksyms.c	Tue May  7 22:01:20 2002
@@ -202,7 +202,6 @@ EXPORT_SYMBOL(ll_rw_block);
 EXPORT_SYMBOL(submit_bh);
 EXPORT_SYMBOL(unlock_buffer);
 EXPORT_SYMBOL(__wait_on_buffer);
-EXPORT_SYMBOL(___wait_on_page);
 EXPORT_SYMBOL(generic_direct_IO);
 EXPORT_SYMBOL(discard_bh_page);
 EXPORT_SYMBOL(block_write_full_page);
--- 2.4.19-pre8/mm/vmscan.c~nuke-buffers	Tue May  7 22:01:20 2002
+++ 2.4.19-pre8-akpm/mm/vmscan.c	Tue May  7 22:01:20 2002
@@ -365,8 +365,13 @@ static int shrink_cache(int nr_pages, zo
 		if (unlikely(!page_count(page)))
 			continue;
 
-		if (!memclass(page_zone(page), classzone))
+		if (!memclass(page_zone(page), classzone)) {
+			if (page->buffers && !TryLockPage(page)) {
+				try_to_release_page(page, GFP_NOIO);
+				unlock_page(page);
+			}
 			continue;
+		}
 
 		/* Racy check to avoid trylocking when not worthwhile */
 		if (!page->buffers && (page_count(page) != 1 || !page->mapping))
@@ -562,6 +567,11 @@ static int shrink_caches(zone_t * classz
 	nr_pages -= kmem_cache_reap(gfp_mask);
 	if (nr_pages <= 0)
 		return 0;
+	if ((gfp_mask & __GFP_WAIT) && (shrink_buffer_cache() > 16)) {
+		nr_pages -= kmem_cache_reap(gfp_mask);
+		if (nr_pages <= 0)
+			return 0;
+	}
 
 	nr_pages = chunk_size;
 	/* try to keep the active list 2/3 of the size of the cache */
--- 2.4.19-pre8/fs/buffer.c~nuke-buffers	Tue May  7 22:01:20 2002
+++ 2.4.19-pre8-akpm/fs/buffer.c	Tue May  7 22:17:36 2002
@@ -1500,6 +1500,10 @@ static int __block_write_full_page(struc
 	/* Stage 3: submit the IO */
 	do {
 		struct buffer_head *next = bh->b_this_page;
+		/*
+		 * Stick it on BUF_LOCKED so shrink_buffer_cache() can nail it.
+		 */
+		refile_buffer(bh);
 		submit_bh(WRITE, bh);
 		bh = next;
 	} while (bh != head);
@@ -2615,6 +2619,25 @@ static int sync_page_buffers(struct buff
 int try_to_free_buffers(struct page * page, unsigned int gfp_mask)
 {
 	struct buffer_head * tmp, * bh = page->buffers;
+	int was_uptodate = 1;
+
+	if (!PageLocked(page))
+		BUG();
+
+	if (!bh)
+		return 1;
+	/*
+	 * Quick check for freeable buffers before we go take three
+	 * global locks.
+	 */
+	if (!(gfp_mask & __GFP_IO)) {
+		tmp = bh;
+		do {
+			if (buffer_busy(tmp))
+				return 0;
+			tmp = tmp->b_this_page;
+		} while (tmp != bh);
+	}
 
 cleaned_buffers_try_again:
 	spin_lock(&lru_list_lock);
@@ -2637,7 +2660,8 @@ cleaned_buffers_try_again:
 		tmp = tmp->b_this_page;
 
 		if (p->b_dev == B_FREE) BUG();
-
+		if (!buffer_uptodate(bh))
+			was_uptodate = 0;
 		remove_inode_queue(p);
 		__remove_from_queues(p);
 		__put_unused_buffer_head(p);
@@ -2645,7 +2669,14 @@ cleaned_buffers_try_again:
 	spin_unlock(&unused_list_lock);
 
 	/* Wake up anyone waiting for buffer heads */
-	wake_up(&buffer_wait);
+	if (waitqueue_active(&buffer_wait))
+		wake_up(&buffer_wait);
+
+	/*
+	 * Make sure we don't read buffers again when they are reattached
+	 */
+	if (was_uptodate)
+		SetPageUptodate(page);
 
 	/* And free the page */
 	page->buffers = NULL;
@@ -2674,6 +2705,62 @@ busy_buffer_page:
 }
 EXPORT_SYMBOL(try_to_free_buffers);
 
+/*
+ * Returns the number of pages which might have become freeable
+ */
+int shrink_buffer_cache(void)
+{
+	struct buffer_head *bh;
+	int nr_todo;
+	int nr_shrunk = 0;
+
+	/*
+	 * Move any clean unlocked buffers from BUF_LOCKED onto BUF_CLEAN
+	 */
+	spin_lock(&lru_list_lock);
+	for ( ; ; ) {
+		bh = lru_list[BUF_LOCKED];
+		if (!bh || buffer_locked(bh))
+			break;
+		__refile_buffer(bh);
+	}
+
+	/*
+	 * Now start liberating buffers
+	 */
+	nr_todo = nr_buffers_type[BUF_CLEAN];
+	while (nr_todo--) {
+		struct page *page;
+
+		bh = lru_list[BUF_CLEAN];
+		if (!bh)
+			break;
+
+		/*
+		 * Park the buffer on BUF_LOCKED so we don't revisit it on
+		 * this pass.
+		 */
+		__remove_from_lru_list(bh);
+		bh->b_list = BUF_LOCKED;
+		__insert_into_lru_list(bh, BUF_LOCKED);
+		page = bh->b_page;
+		if (TryLockPage(page))
+			continue;
+
+		page_cache_get(page);
+		spin_unlock(&lru_list_lock);
+		if (try_to_release_page(page, GFP_NOIO))
+			nr_shrunk++;
+		unlock_page(page);
+		page_cache_release(page);
+		spin_lock(&lru_list_lock);
+	}
+	spin_unlock(&lru_list_lock);
+//	printk("%s: liberated %d page's worth of buffer_heads\n",
+//		__FUNCTION__, nr_shrunk);
+	return (nr_shrunk * sizeof(struct buffer_head)) / PAGE_CACHE_SIZE;
+}
+
 /* ================== Debugging =================== */
 
 void show_buffers(void)
@@ -2988,6 +3075,7 @@ int kupdate(void *startup)
 #ifdef DEBUG
 		printk(KERN_DEBUG "kupdate() activated...\n");
 #endif
+		shrink_buffer_cache();
 		sync_old_buffers();
 		run_task_queue(&tq_disk);
 	}
--- 2.4.19-pre8/include/linux/fs.h~nuke-buffers	Tue May  7 22:01:20 2002
+++ 2.4.19-pre8-akpm/include/linux/fs.h	Tue May  7 22:01:20 2002
@@ -1116,6 +1116,7 @@ extern int FASTCALL(try_to_free_buffers(
 extern void refile_buffer(struct buffer_head * buf);
 extern void create_empty_buffers(struct page *, kdev_t, unsigned long);
 extern void end_buffer_io_sync(struct buffer_head *bh, int uptodate);
+extern int shrink_buffer_cache(void);
 
 /* reiserfs_writepage needs this */
 extern void set_buffer_async_io(struct buffer_head *bh) ;

-
From: Andrea A. <an...@su...> - 2002-05-24 03:17:01
On Tue, May 07, 2002 at 10:22:23PM -0700, Andrew Morton wrote:
> --- 2.4.19-pre8/fs/buffer.c~nuke-buffers	Tue May  7 22:01:20 2002
> @@ -2637,7 +2660,8 @@ cleaned_buffers_try_again:
> 		tmp = tmp->b_this_page;
>
> 		if (p->b_dev == B_FREE) BUG();
> -
> +		if (!buffer_uptodate(bh))
> +			was_uptodate = 0;
> 		remove_inode_queue(p);
> 		__remove_from_queues(p);
> 		__put_unused_buffer_head(p);

I was merging this optimization (which is unrelated to the bh-highmem
imbalance work) into -aa and I noticed the implementation is buggy:
bh points at the same buffer_head throughout the loop, so the hunk
above can generate fs corruption by marking a page completely uptodate
while it isn't.

The check has to be done against "p", not against "bh".

> @@ -2645,7 +2669,14 @@ cleaned_buffers_try_again:
> 	spin_unlock(&unused_list_lock);
>
> 	/* Wake up anyone waiting for buffer heads */
> -	wake_up(&buffer_wait);
> +	if (waitqueue_active(&buffer_wait))
> +		wake_up(&buffer_wait);

As I said to Andrew a few days ago, this also introduces an SMP race
condition that can deadlock a waiter on buffer_wait.  Original text to
Andrew:

	this introduces an SMP race condition (even on x86, due to the
	write semantics on x86): the spin_unlock semantics allow the
	"if (waitqueue_active(&buffer_wait))" check to be speculatively
	executed before the buffer is put back onto the unused list, so
	the waiter can deadlock (that cannot happen on archs like alpha,
	where spin_lock is a strong mb both ways).

However, this is a minor issue compared to the fs corruption one above.
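Concretely, the two fixes might look like this (an untested sketch
against the patched try_to_free_buffers(); smp_mb() is the conventional
barrier for ordering the list insertions against the
waitqueue_active() check):

	tmp = bh;
	do {
		struct buffer_head *p = tmp;

		tmp = tmp->b_this_page;
		if (p->b_dev == B_FREE)
			BUG();
		/* test the buffer being walked, not the page's first one */
		if (!buffer_uptodate(p))
			was_uptodate = 0;
		remove_inode_queue(p);
		__remove_from_queues(p);
		__put_unused_buffer_head(p);
	} while (tmp != bh);

and, later in the same function:

	spin_unlock(&unused_list_lock);

	/*
	 * Full barrier so the waitqueue_active() read cannot pass the
	 * stores that put the buffer_heads back onto the unused list.
	 */
	smp_mb();
	if (waitqueue_active(&buffer_wait))
		wake_up(&buffer_wait);

Andrea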
From: Andrew M. <ak...@zi...> - 2002-05-24 04:04:59
Andrea Arcangeli wrote:
>
> On Tue, May 07, 2002 at 10:22:23PM -0700, Andrew Morton wrote:
> > --- 2.4.19-pre8/fs/buffer.c~nuke-buffers	Tue May  7 22:01:20 2002
> > @@ -2637,7 +2660,8 @@ cleaned_buffers_try_again:
> > 		tmp = tmp->b_this_page;
> >
> > 		if (p->b_dev == B_FREE) BUG();
> > -
> > +		if (!buffer_uptodate(bh))
> > +			was_uptodate = 0;
> > 		remove_inode_queue(p);
> > 		__remove_from_queues(p);
> > 		__put_unused_buffer_head(p);
>
> I was merging this optimization (which is unrelated to the bh-highmem
> imbalance work) into -aa and I noticed the implementation is buggy:
> bh points at the same buffer_head throughout the loop, so the hunk
> above can generate fs corruption by marking a page completely uptodate
> while it isn't.
>
> The check has to be done against "p", not against "bh".

oop.  I guess nobody tested it on blocksize != pagesize...

This stuff needs testing very carefully, really.  Once you get a
significant number of buffer-free pages floating about, you start to
exercise rarely-used and somewhat flakey code paths in 2.4.

Think about what happens when you do a write(2) into a page which
has no buffers.  Buffers get attached in prepare_write().  But they're
not uptodate.  So we need to propagate PageUptodate into BH_uptodate
to avoid unnecessary reads.  But once you do that, you get buffers
which are uptodate and unmapped.  And if a page has uptodate not-mapped
buffers, block_truncate_page() says "gee, that buffer is over a hole"
and forgets to zero the final block outside the truncation point.  You
get to spend several days working out what went wrong ;)

Fun stuff.  Test carefully.

Frankly, given that this is an x86-only hack, I'd strongly, strongly
advise you to only drop buffers from pages which have blocksize ==
PAGE_CACHE_SIZE.  That avoids a lot of the complexity, inefficiency
and bugs.
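To make the block_truncate_page() trap concrete, its hole test does
roughly this (condensed and paraphrased; a sketch, not the verbatim
2.4 source):

	/* bh covers the block containing the truncation point */
	err = 0;
	if (!buffer_mapped(bh)) {
		/* uptodate + unmapped is assumed to be a hole, so the
		 * partial block is left un-zeroed - that's the trap */
		if (buffer_uptodate(bh))
			goto unlock;
		get_block(inode, iblock, bh, 0);
		/* still unmapped after get_block: a real hole */
		if (!buffer_mapped(bh))
			goto unlock;
	}
	/* only mapped buffers get this far */
	memset(kmap(page) + offset, 0, length);

Once buffers can inherit uptodateness from PG_uptodate without ever
having been mapped, that first goto fires for real data.

-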
From: Andrea A. <an...@su...> - 2002-05-24 04:42:59
On Thu, May 23, 2002 at 09:09:11PM -0700, Andrew Morton wrote:
> Andrea Arcangeli wrote:
> >
> > On Tue, May 07, 2002 at 10:22:23PM -0700, Andrew Morton wrote:
> > > --- 2.4.19-pre8/fs/buffer.c~nuke-buffers	Tue May  7 22:01:20 2002
> > > @@ -2637,7 +2660,8 @@ cleaned_buffers_try_again:
> > > 		tmp = tmp->b_this_page;
> > >
> > > 		if (p->b_dev == B_FREE) BUG();
> > > -
> > > +		if (!buffer_uptodate(bh))
> > > +			was_uptodate = 0;
> > > 		remove_inode_queue(p);
> > > 		__remove_from_queues(p);
> > > 		__put_unused_buffer_head(p);
> >
> > I was merging this optimization (which is unrelated to the bh-highmem
> > imbalance work) into -aa and I noticed the implementation is buggy:
> > bh points at the same buffer_head throughout the loop, so the hunk
> > above can generate fs corruption by marking a page completely uptodate
> > while it isn't.
> >
> > The check has to be done against "p", not against "bh".
>
> oop.  I guess nobody tested it on blocksize != pagesize...
>
> This stuff needs testing very carefully, really.  Once you get a
> significant number of buffer-free pages floating about, you start to
> exercise rarely-used and somewhat flakey code paths in 2.4.
>
> Think about what happens when you do a write(2) into a page which
> has no buffers.  Buffers get attached in prepare_write().  But they're
> not uptodate.  So we need to propagate PageUptodate into BH_uptodate
> to avoid unnecessary reads.  But once you do that, you get buffers
> which are uptodate and unmapped.  And if a page has uptodate not-mapped
> buffers, block_truncate_page() says "gee, that buffer is over a hole"
> and forgets to zero the final block outside the truncation point.  You
> get to spend several days working out what went wrong ;)

What you mention is a generic bug, not one introduced by the
optimization in try_to_free_buffers.

When all the buffers are uptodate, the page should be marked uptodate
too: this is an invariant.  What you describe is a bug that can
trigger anyway, as soon as an uptodate page loses its buffers (i.e.
after the bh is shrunk from the pagecache after a read).  That bug can
happen every time a page is uptodate and without buffers, and you
don't need your optimization to trigger it.  The bug should be fixed;
the optimization shouldn't be dropped.

The fix is very simple: nobody is allowed to mark a buffer uptodate
because the page is uptodate before first doing a get_block, so the
bug is in prepare_write.
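The invariant is easy to pin down as a debug check;
check_page_buffer_coherency() is a hypothetical helper, sketched here
only to state the rule:

static void check_page_buffer_coherency(struct page *page)
{
	struct buffer_head *bh = page->buffers, *tmp = bh;

	if (!bh)
		return;
	do {
		/* any non-uptodate buffer: nothing to check */
		if (!buffer_uptodate(tmp))
			return;
		tmp = tmp->b_this_page;
	} while (tmp != bh);
	/* all buffers uptodate: the page must be marked uptodate too */
	if (!Page_Uptodate(page))
		BUG();
}

Andrea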
From: Andrea A. <an...@su...> - 2002-05-24 04:49:28
On Fri, May 24, 2002 at 06:42:06AM +0200, Andrea Arcangeli wrote:
> On Thu, May 23, 2002 at 09:09:11PM -0700, Andrew Morton wrote:
> > Andrea Arcangeli wrote:
> > >
> > > On Tue, May 07, 2002 at 10:22:23PM -0700, Andrew Morton wrote:
> > > > --- 2.4.19-pre8/fs/buffer.c~nuke-buffers	Tue May  7 22:01:20 2002
> > > > @@ -2637,7 +2660,8 @@ cleaned_buffers_try_again:
> > > > 		tmp = tmp->b_this_page;
> > > >
> > > > 		if (p->b_dev == B_FREE) BUG();
> > > > -
> > > > +		if (!buffer_uptodate(bh))
> > > > +			was_uptodate = 0;
> > > > 		remove_inode_queue(p);
> > > > 		__remove_from_queues(p);
> > > > 		__put_unused_buffer_head(p);
> > >
> > > I was merging this optimization (which is unrelated to the bh-highmem
> > > imbalance work) into -aa and I noticed the implementation is buggy:
> > > bh points at the same buffer_head throughout the loop, so the hunk
> > > above can generate fs corruption by marking a page completely uptodate
> > > while it isn't.
> > >
> > > The check has to be done against "p", not against "bh".
> >
> > oop.  I guess nobody tested it on blocksize != pagesize...
> >
> > This stuff needs testing very carefully, really.  Once you get a
> > significant number of buffer-free pages floating about, you start to
> > exercise rarely-used and somewhat flakey code paths in 2.4.
> >
> > Think about what happens when you do a write(2) into a page which
> > has no buffers.  Buffers get attached in prepare_write().  But they're
> > not uptodate.  So we need to propagate PageUptodate into BH_uptodate
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> > to avoid unnecessary reads.  But once you do that, you get buffers
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> > which are uptodate and unmapped.  And if a page has uptodate not-mapped
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> > buffers, block_truncate_page() says "gee, that buffer is over a hole"
> > and forgets to zero the final block outside the truncation point.  You
> > get to spend several days working out what went wrong ;)
>
> What you mention is a generic bug, not one introduced by the
> optimization in try_to_free_buffers.
>
> When all the buffers are uptodate, the page should be marked uptodate
> too: this is an invariant.  What you describe is a bug that can
> trigger anyway, as soon as an uptodate page loses its buffers (i.e.
> after the bh is shrunk from the pagecache after a read).  That bug can
> happen every time a page is uptodate and without buffers, and you
> don't need your optimization to trigger it.  The bug should be fixed;
> the optimization shouldn't be dropped.
>
> The fix is very simple: nobody is allowed to mark a buffer uptodate
> because the page is uptodate before first doing a get_block, so the
> bug is in prepare_write.

Hmm, prepare_write isn't doing the underlined action: it correctly
marks uptodate the bhs belonging to uptodate pages, but only after
running get_block on them, so it looks all right; I don't see the
problem.  So if a buffer is uptodate and unmapped by the time we get
to block_truncate_page(), it really means it was a hole.
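For reference, the ordering I mean looks roughly like this in
__block_prepare_write() (condensed and paraphrased, not the verbatim
2.4 source):

	for (bh = head, block_start = 0; bh != head || !block_start;
	     block++, block_start = block_end, bh = bh->b_this_page) {
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to)
			continue;
		if (!buffer_mapped(bh)) {
			/* map the buffer first... */
			err = get_block(inode, block, bh, 1);
			if (err)
				goto out;
		}
		if (Page_Uptodate(page)) {
			/* ...and only then inherit the page's state */
			set_bit(BH_Uptodate, &bh->b_state);
			continue;
		}
		if (!buffer_uptodate(bh) &&
		    (block_start < from || block_end > to))
			ll_rw_block(READ, 1, &bh);
	}

Andrea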
From: Andrew M. <ak...@zi...> - 2002-05-24 05:35:33
Andrea Arcangeli wrote:
>
> Hmm, prepare_write isn't doing the underlined action: it correctly
> marks uptodate the bhs belonging to uptodate pages, but only after
> running get_block on them, so it looks all right; I don't see the
> problem.

hrm.  OK, it looks like block_prepare_write() will bring the buffers
uptodate later on, without performing I/O.

But if you drop buffers from a metadata page, that doesn't apply: the
kernel will perform unnecessary reads in bread(), because the kernel
never propagates all-buffers-uptodate into PG_uptodate for metadata.
So the try_to_free_buffers() optimisation is definitely needed to
prevent that from happening.

It would be cleaner to propagate the page's uptodate state into the
buffers at attachment time, for all mappings, not just blockdev
mappings.  Just keep the darn things coherent.  Which triggers the
block_truncate_page() problem.
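A hypothetical, untested version of create_empty_buffers() with that
propagation added (the Page_Uptodate() test is the new part; the rest
paraphrases the 2.4 function):

void create_empty_buffers(struct page *page, kdev_t dev,
			  unsigned long blocksize)
{
	struct buffer_head *bh, *head, *tail;

	head = create_buffers(page, blocksize, 1);
	if (page->buffers)
		BUG();

	bh = head;
	do {
		bh->b_dev = dev;
		bh->b_blocknr = 0;
		bh->b_end_io = NULL;
		/* proposed: keep bh state coherent with the page */
		if (Page_Uptodate(page))
			set_bit(BH_Uptodate, &bh->b_state);
		tail = bh;
		bh = bh->b_this_page;
	} while (bh);
	tail->b_this_page = head;
	page->buffers = head;
	page_cache_get(page);
}

That way every buffer attached to an uptodate page is born uptodate,
for every mapping - but block_truncate_page()'s uptodate-means-hole
shortcut would have to be audited first.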