Update of /cvsroot/linux-vax/kernel-2.4/fs In directory usw-pr-cvs1:/tmp/cvs-serv27691 Modified Files: dcache.c devices.c dquot.c exec.c fcntl.c file_table.c filesystems.c inode.c iobuf.c Log Message: sync 2.4.15 commit 11 Index: dcache.c =================================================================== RCS file: /cvsroot/linux-vax/kernel-2.4/fs/dcache.c,v retrieving revision 1.1.1.2 retrieving revision 1.2 diff -u -r1.1.1.2 -r1.2 --- dcache.c 25 Feb 2001 23:14:46 -0000 1.1.1.2 +++ dcache.c 9 Apr 2002 13:11:16 -0000 1.2 @@ -22,6 +22,7 @@ #include <linux/init.h> #include <linux/smp_lock.h> #include <linux/cache.h> +#include <linux/module.h> #include <asm/uaccess.h> @@ -138,10 +139,6 @@ goto kill_it; list_add(&dentry->d_lru, &dentry_unused); dentry_stat.nr_unused++; - /* - * Update the timestamp - */ - dentry->d_reftime = jiffies; spin_unlock(&dcache_lock); return; @@ -223,8 +220,7 @@ atomic_inc(&dentry->d_count); if (atomic_read(&dentry->d_count) == 1) { dentry_stat.nr_unused--; - list_del(&dentry->d_lru); - INIT_LIST_HEAD(&dentry->d_lru); /* make "list_empty()" work */ + list_del_init(&dentry->d_lru); } return dentry; } @@ -337,10 +333,10 @@ dentry = list_entry(tmp, struct dentry, d_lru); /* If the dentry was recently referenced, don't free it. */ - if (dentry->d_flags & DCACHE_REFERENCED) { - dentry->d_flags &= ~DCACHE_REFERENCED; + if (dentry->d_vfs_flags & DCACHE_REFERENCED) { + dentry->d_vfs_flags &= ~DCACHE_REFERENCED; list_add(&dentry->d_lru, &dentry_unused); - goto next; + continue; } dentry_stat.nr_unused--; @@ -349,7 +345,6 @@ BUG(); prune_one_dentry(dentry); - next: if (!--count) break; } @@ -413,8 +408,7 @@ if (atomic_read(&dentry->d_count)) continue; dentry_stat.nr_unused--; - list_del(tmp); - INIT_LIST_HEAD(tmp); + list_del_init(tmp); prune_one_dentry(dentry); goto repeat; } @@ -553,7 +547,7 @@ * ... * 6 - base-level: try to shrink a bit. 
*/ -void shrink_dcache_memory(int priority, unsigned int gfp_mask) +int shrink_dcache_memory(int priority, unsigned int gfp_mask) { int count = 0; @@ -568,14 +562,14 @@ * We should make sure we don't hold the superblock lock over * block allocations, but for now: */ - if (!(gfp_mask & __GFP_IO)) - return; + if (!(gfp_mask & __GFP_FS)) + return 0; - if (priority) - count = dentry_stat.nr_unused / priority; + count = dentry_stat.nr_unused / priority; prune_dcache(count); kmem_cache_shrink(dentry_cache); + return 0; } #define NAME_ALLOC_LEN(len) ((len+16) & ~15) @@ -612,6 +606,7 @@ str[name->len] = 0; atomic_set(&dentry->d_count, 1); + dentry->d_vfs_flags = 0; dentry->d_flags = 0; dentry->d_inode = NULL; dentry->d_parent = NULL; @@ -621,7 +616,7 @@ dentry->d_name.hash = name->hash; dentry->d_op = NULL; dentry->d_fsdata = NULL; - INIT_LIST_HEAD(&dentry->d_vfsmnt); + dentry->d_mounted = 0; INIT_LIST_HEAD(&dentry->d_hash); INIT_LIST_HEAD(&dentry->d_lru); INIT_LIST_HEAD(&dentry->d_subdirs); @@ -656,6 +651,7 @@ void d_instantiate(struct dentry *entry, struct inode * inode) { + if (!list_empty(&entry->d_alias)) BUG(); spin_lock(&dcache_lock); if (inode) list_add(&entry->d_alias, &inode->i_dentry); @@ -734,7 +730,7 @@ continue; } __dget_locked(dentry); - dentry->d_flags |= DCACHE_REFERENCED; + dentry->d_vfs_flags |= DCACHE_REFERENCED; spin_unlock(&dcache_lock); return dentry; } @@ -744,58 +740,48 @@ /** * d_validate - verify dentry provided from insecure source - * @dentry: The dentry alleged to be valid - * @dparent: The parent dentry + * @dentry: The dentry alleged to be valid child of @dparent + * @dparent: The parent dentry (known to be valid) * @hash: Hash of the dentry * @len: Length of the name * * An insecure source has sent us a dentry, here we verify it and dget() it. * This is used by ncpfs in its readdir implementation. * Zero is returned in the dentry is invalid. - * - * NOTE: This function does _not_ dereference the pointers before we have - * validated them. We can test the pointer values, but we - * must not actually use them until we have found a valid - * copy of the pointer in kernel space.. */ -int d_validate(struct dentry *dentry, struct dentry *dparent, - unsigned int hash, unsigned int len) +int d_validate(struct dentry *dentry, struct dentry *dparent) { + unsigned long dent_addr = (unsigned long) dentry; + unsigned long min_addr = PAGE_OFFSET; + unsigned long align_mask = 0x0F; struct list_head *base, *lhp; - int valid = 1; - spin_lock(&dcache_lock); - if (dentry != dparent) { - base = d_hash(dparent, hash); - lhp = base; - while ((lhp = lhp->next) != base) { - if (dentry == list_entry(lhp, struct dentry, d_hash)) { - __dget_locked(dentry); - goto out; - } - } - } else { - /* - * Special case: local mount points don't live in - * the hashes, so we search the super blocks. 
- */ - struct super_block *sb = sb_entry(super_blocks.next); + if (dent_addr < min_addr) + goto out; + if (dent_addr > (unsigned long)high_memory - sizeof(struct dentry)) + goto out; + if (dent_addr & align_mask) + goto out; + if ((!kern_addr_valid(dent_addr)) || (!kern_addr_valid(dent_addr -1 + + sizeof(struct dentry)))) + goto out; - for (; sb != sb_entry(&super_blocks); - sb = sb_entry(sb->s_list.next)) { - if (!sb->s_dev) - continue; - if (sb->s_root == dentry) { - __dget_locked(dentry); - goto out; - } + if (dentry->d_parent != dparent) + goto out; + + spin_lock(&dcache_lock); + lhp = base = d_hash(dparent, dentry->d_name.hash); + while ((lhp = lhp->next) != base) { + if (dentry == list_entry(lhp, struct dentry, d_hash)) { + __dget_locked(dentry); + spin_unlock(&dcache_lock); + return 1; } } - valid = 0; -out: spin_unlock(&dcache_lock); - return valid; +out: + return 0; } /* @@ -848,6 +834,7 @@ void d_rehash(struct dentry * entry) { struct list_head *list = d_hash(entry->d_parent, entry->d_name.hash); + if (!list_empty(&entry->d_hash)) BUG(); spin_lock(&dcache_lock); list_add(&entry->d_hash, list); spin_unlock(&dcache_lock); @@ -922,8 +909,7 @@ list_add(&dentry->d_hash, &target->d_hash); /* Unhash the target: dput() will then get rid of it */ - list_del(&target->d_hash); - INIT_LIST_HEAD(&target->d_hash); + list_del_init(&target->d_hash); list_del(&dentry->d_child); list_del(&target->d_child); @@ -1239,6 +1225,18 @@ } while (i); } +static void init_buffer_head(void * foo, kmem_cache_t * cachep, unsigned long flags) +{ + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) + { + struct buffer_head * bh = (struct buffer_head *) foo; + + memset(bh, 0, sizeof(*bh)); + init_waitqueue_head(&bh->b_wait); + } +} + /* SLAB cache for __getname() consumers */ kmem_cache_t *names_cachep; @@ -1250,12 +1248,16 @@ /* SLAB cache for buffer_head structures */ kmem_cache_t *bh_cachep; +EXPORT_SYMBOL(bh_cachep); + +extern void bdev_cache_init(void); +extern void cdev_cache_init(void); void __init vfs_caches_init(unsigned long mempages) { bh_cachep = kmem_cache_create("buffer_head", sizeof(struct buffer_head), 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); + SLAB_HWCACHE_ALIGN, init_buffer_head, NULL); if(!bh_cachep) panic("Cannot create buffer head SLAB cache"); @@ -1280,4 +1282,8 @@ #endif dcache_init(mempages); + inode_init(mempages); + mnt_init(mempages); + bdev_cache_init(); + cdev_cache_init(); } Index: devices.c =================================================================== RCS file: /cvsroot/linux-vax/kernel-2.4/fs/devices.c,v retrieving revision 1.1.1.1 retrieving revision 1.2 diff -u -r1.1.1.1 -r1.2 --- devices.c 14 Jan 2001 16:25:21 -0000 1.1.1.1 +++ devices.c 9 Apr 2002 13:11:16 -0000 1.2 @@ -203,10 +203,10 @@ if (S_ISCHR(mode)) { inode->i_fop = &def_chr_fops; inode->i_rdev = to_kdev_t(rdev); + inode->i_cdev = cdget(rdev); } else if (S_ISBLK(mode)) { inode->i_fop = &def_blk_fops; inode->i_rdev = to_kdev_t(rdev); - inode->i_bdev = bdget(rdev); } else if (S_ISFIFO(mode)) inode->i_fop = &def_fifo_fops; else if (S_ISSOCK(mode)) Index: dquot.c =================================================================== RCS file: /cvsroot/linux-vax/kernel-2.4/fs/dquot.c,v retrieving revision 1.1.1.2 retrieving revision 1.2 diff -u -r1.1.1.2 -r1.2 --- dquot.c 25 Feb 2001 23:14:46 -0000 1.1.1.2 +++ dquot.c 9 Apr 2002 13:11:16 -0000 1.2 @@ -26,7 +26,7 @@ * dquot_incr_...() to calling functions. * invalidate_dquots() now writes modified dquots. 
* Serialized quota_off() and quota_on() for mount point. - * Fixed a few bugs in grow_dquots. + * Fixed a few bugs in grow_dquots(). * Fixed deadlock in write_dquot() - we no longer account quotas on * quota files * remove_dquot_ref() moved to inode.c - it now traverses through inodes @@ -34,13 +34,24 @@ * Added check for bogus uid and fixed check for group in quotactl. * Jan Kara, <ja...@su...>, sponsored by SuSE CR, 10-11/99 [...1541 lines suppressed...] + ret = quota_on(sb, type, (char *) addr); goto out; case Q_QUOTAOFF: ret = quota_off(sb, type); @@ -1597,12 +1468,12 @@ goto out; } - flags |= QUOTA_SYSCALL; - - ret = -ESRCH; + ret = -NODEV; if (sb && sb_has_quota_enabled(sb, type)) ret = set_dqblk(sb, id, type, flags, (struct dqblk *) addr); out: + if (sb) + drop_super(sb); unlock_kernel(); return ret; } Index: exec.c =================================================================== RCS file: /cvsroot/linux-vax/kernel-2.4/fs/exec.c,v retrieving revision 1.1.1.2 retrieving revision 1.2 diff -u -r1.1.1.2 -r1.2 --- exec.c 25 Feb 2001 23:14:45 -0000 1.1.1.2 +++ exec.c 9 Apr 2002 13:11:16 -0000 1.2 @@ -34,6 +34,7 @@ #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/spinlock.h> +#include <linux/personality.h> #define __NO_VERSION__ #include <linux/module.h> @@ -45,6 +46,8 @@ #include <linux/kmod.h> #endif +int core_uses_pid; + static struct linux_binfmt *formats; static rwlock_t binfmt_lock = RW_LOCK_UNLOCKED; @@ -159,11 +162,9 @@ if (argv != NULL) { for (;;) { char * p; - int error; - error = get_user(p,argv); - if (error) - return error; + if (get_user(p, argv)) + return -EFAULT; if (!p) break; argv++; @@ -186,7 +187,7 @@ int len; unsigned long pos; - if (get_user(str, argv+argc) || !str || !(len = strnlen_user(str, bprm->p))) + if (get_user(str, argv+argc) || !(len = strnlen_user(str, bprm->p))) return -EFAULT; if (bprm->p < len) return -E2BIG; @@ -252,6 +253,8 @@ /* * This routine is used to map in a page into an address space: needed by * execve() for the initial stack and environment pages. + * + * tsk->mmap_sem is held for writing. 
*/ void put_dirty_page(struct task_struct * tsk, struct page *page, unsigned long address) { @@ -260,29 +263,32 @@ pte_t * pte; if (page_count(page) != 1) - printk("mem_map disagrees with %p at %08lx\n", page, address); + printk(KERN_ERR "mem_map disagrees with %p at %08lx\n", page, address); pgd = pgd_offset(tsk->mm, address); - pmd = pmd_alloc(pgd, address); - if (!pmd) { - __free_page(page); - force_sig(SIGKILL, tsk); - return; - } - pte = pte_alloc(pmd, address); - if (!pte) { - __free_page(page); - force_sig(SIGKILL, tsk); - return; - } - if (!pte_none(*pte)) { - pte_ERROR(*pte); - __free_page(page); - return; - } + + spin_lock(&tsk->mm->page_table_lock); + pmd = pmd_alloc(tsk->mm, pgd, address); + if (!pmd) + goto out; + pte = pte_alloc(tsk->mm, pmd, address); + if (!pte) + goto out; + if (!pte_none(*pte)) + goto out; + lru_cache_add(page); flush_dcache_page(page); flush_page_to_ram(page); set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, PAGE_COPY)))); -/* no need for flush_tlb */ + tsk->mm->rss++; + spin_unlock(&tsk->mm->page_table_lock); + + /* no need for flush_tlb */ + return; +out: + spin_unlock(&tsk->mm->page_table_lock); + __free_page(page); + force_sig(SIGKILL, tsk); + return; } int setup_arg_pages(struct linux_binprm *bprm) @@ -302,7 +308,7 @@ if (!mpnt) return -ENOMEM; - down(¤t->mm->mmap_sem); + down_write(¤t->mm->mmap_sem); { mpnt->vm_mm = current->mm; mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p; @@ -321,12 +327,11 @@ struct page *page = bprm->page[i]; if (page) { bprm->page[i] = NULL; - current->mm->rss++; put_dirty_page(current,page,stack_base); } stack_base += PAGE_SIZE; } - up(¤t->mm->mmap_sem); + up_write(¤t->mm->mmap_sem); return 0; } @@ -344,8 +349,11 @@ if (!err) { inode = nd.dentry->d_inode; file = ERR_PTR(-EACCES); - if (!IS_NOEXEC(inode) && S_ISREG(inode->i_mode)) { + if (!(nd.mnt->mnt_flags & MNT_NOEXEC) && + S_ISREG(inode->i_mode)) { int err = permission(inode, MAY_EXEC); + if (!err && !(inode->i_mode & 0111)) + err = -EACCES; file = ERR_PTR(err); if (!err) { file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); @@ -545,7 +553,7 @@ current->sas_ss_sp = current->sas_ss_size = 0; if (current->euid == current->uid && current->egid == current->gid) - current->dumpable = 1; + current->mm->dumpable = 1; name = bprm->filename; for (i=0; (ch = *(name++)) != '\0';) { if (ch == '/') @@ -562,7 +570,7 @@ if (bprm->e_uid != current->euid || bprm->e_gid != current->egid || permission(bprm->file->f_dentry->d_inode,MAY_READ)) - current->dumpable = 0; + current->mm->dumpable = 0; /* An exec changes our domain. We are no longer part of the thread group */ @@ -577,9 +585,10 @@ mmap_failed: flush_failed: spin_lock_irq(¤t->sigmask_lock); - if (current->sig != oldsig) + if (current->sig != oldsig) { kfree(current->sig); - current->sig = oldsig; + current->sig = oldsig; + } spin_unlock_irq(¤t->sigmask_lock); return retval; } @@ -590,7 +599,7 @@ */ static inline int must_not_trace_exec(struct task_struct * p) { - return (p->ptrace & PT_PTRACED) && !cap_raised(p->p_pptr->cap_effective, CAP_SYS_PTRACE); + return (p->ptrace & PT_PTRACED) && !(p->ptrace & PT_PTRACE_CAP); } /* @@ -603,7 +612,10 @@ struct inode * inode = bprm->file->f_dentry->d_inode; mode = inode->i_mode; - /* Huh? We had already checked for MAY_EXEC, WTF do we check this? 
*/ + /* + * Check execute perms again - if the caller has CAP_DAC_OVERRIDE, + * vfs_permission lets a non-executable through + */ if (!(mode & 0111)) /* with at least _one_ execute bit set */ return -EACCES; if (bprm->file->f_op == NULL) @@ -612,7 +624,7 @@ bprm->e_uid = current->euid; bprm->e_gid = current->egid; - if(!IS_NOSUID(inode)) { + if(!(bprm->file->f_vfsmnt->mnt_flags & MNT_NOSUID)) { /* Set-uid? */ if (mode & S_ISUID) bprm->e_uid = inode->i_uid; @@ -680,7 +692,7 @@ if (bprm->e_uid != current->uid || bprm->e_gid != current->gid || !cap_issubset(new_permitted, current->cap_permitted)) { - current->dumpable = 0; + current->mm->dumpable = 0; lock_kernel(); if (must_not_trace_exec(current) @@ -759,7 +771,6 @@ if (!bprm->loader && eh->fh.f_magic == 0x183 && (eh->fh.f_flags & 0x3000) == 0x3000) { - char * dynloader[] = { "/sbin/loader" }; struct file * file; unsigned long loader; @@ -769,10 +780,14 @@ loader = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *); - file = open_exec(dynloader[0]); + file = open_exec("/sbin/loader"); retval = PTR_ERR(file); if (IS_ERR(file)) return retval; + + /* Remember if the application is TASO. */ + bprm->sh_bang = eh->ah.entry < 0x100000000; + bprm->file = file; bprm->loader = loader; retval = prepare_binprm(bprm); @@ -783,6 +798,9 @@ } } #endif + /* kernel module loader fixup */ + /* so we don't try to load run modprobe in kernel space. */ + set_fs(USER_DS); for (try=0; try<2; try++) { read_lock(&binfmt_lock); for (fmt = formats ; fmt ; fmt = fmt->next) { @@ -918,26 +936,25 @@ int do_coredump(long signr, struct pt_regs * regs) { struct linux_binfmt * binfmt; - char corename[6+sizeof(current->comm)]; + char corename[6+sizeof(current->comm)+10]; struct file * file; struct inode * inode; + int retval = 0; lock_kernel(); binfmt = current->binfmt; if (!binfmt || !binfmt->core_dump) goto fail; - if (!current->dumpable || atomic_read(¤t->mm->mm_users) != 1) + if (!current->mm->dumpable) goto fail; - current->dumpable = 0; + current->mm->dumpable = 0; if (current->rlim[RLIMIT_CORE].rlim_cur < binfmt->min_coredump) goto fail; memcpy(corename,"core.", 5); -#if 0 - memcpy(corename+5,current->comm,sizeof(current->comm)); -#else corename[4] = '\0'; -#endif + if (core_uses_pid || atomic_read(¤t->mm->mm_users) != 1) + sprintf(&corename[4], ".%d", current->pid); file = filp_open(corename, O_CREAT | 2 | O_NOFOLLOW, 0600); if (IS_ERR(file)) goto fail; @@ -955,15 +972,14 @@ goto close_fail; if (do_truncate(file->f_dentry, 0) != 0) goto close_fail; - if (!binfmt->core_dump(signr, regs, file)) - goto close_fail; - unlock_kernel(); - filp_close(file, NULL); - return 1; + + down_read(¤t->mm->mmap_sem); + retval = binfmt->core_dump(signr, regs, file); + up_read(¤t->mm->mmap_sem); close_fail: filp_close(file, NULL); fail: unlock_kernel(); - return 0; + return retval; } Index: fcntl.c =================================================================== RCS file: /cvsroot/linux-vax/kernel-2.4/fs/fcntl.c,v retrieving revision 1.1.1.1 retrieving revision 1.2 diff -u -r1.1.1.1 -r1.2 --- fcntl.c 14 Jan 2001 16:25:03 -0000 1.1.1.1 +++ fcntl.c 9 Apr 2002 13:11:16 -0000 1.2 @@ -10,6 +10,7 @@ #include <linux/dnotify.h> #include <linux/smp_lock.h> #include <linux/slab.h> +#include <linux/iobuf.h> #include <asm/poll.h> #include <asm/siginfo.h> @@ -194,7 +195,7 @@ return ret; } -#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | FASYNC) +#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | FASYNC | O_DIRECT) static int setfl(int fd, struct file * filp, unsigned long arg) { @@ -217,6 +218,25 @@ 
} } + if (arg & O_DIRECT) { + /* + * alloc_kiovec() can sleep and we are only serialized by + * the big kernel lock here, so abuse the i_sem to serialize + * this case too. We of course wouldn't need to go deep down + * to the inode layer, we could stay at the file layer, but + * we don't want to pay for the memory of a semaphore in each + * file structure too and we use the inode semaphore that we just + * pay for anyways. + */ + error = 0; + down(&inode->i_sem); + if (!filp->f_iobuf) + error = alloc_kiovec(1, &filp->f_iobuf); + up(&inode->i_sem); + if (error < 0) + return error; + } + /* required for strict SunOS emulation */ if (O_NONBLOCK != O_NDELAY) if (arg & O_NDELAY) @@ -338,7 +358,6 @@ if (!filp) goto out; - lock_kernel(); switch (cmd) { case F_GETLK64: err = fcntl_getlk64(fd, (struct flock64 *) arg); @@ -353,7 +372,6 @@ err = do_fcntl(fd, cmd, arg, filp); break; } - unlock_kernel(); fput(filp); out: return err; Index: file_table.c =================================================================== RCS file: /cvsroot/linux-vax/kernel-2.4/fs/file_table.c,v retrieving revision 1.1.1.1 retrieving revision 1.2 diff -u -r1.1.1.1 -r1.2 --- file_table.c 14 Jan 2001 16:24:51 -0000 1.1.1.1 +++ file_table.c 9 Apr 2002 13:11:16 -0000 1.2 @@ -11,6 +11,7 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/smp_lock.h> +#include <linux/iobuf.h> /* sysctl tunables... */ struct files_stat_struct files_stat = {0, 0, NR_FILE}; @@ -66,10 +67,10 @@ goto new_one; } /* Big problems... */ - printk("VFS: filp allocation failed\n"); + printk(KERN_WARNING "VFS: filp allocation failed\n"); } else if (files_stat.max_files > old_max) { - printk("VFS: file-max limit %d reached\n", files_stat.max_files); + printk(KERN_INFO "VFS: file-max limit %d reached\n", files_stat.max_files); old_max = files_stat.max_files; } file_list_unlock(); @@ -104,21 +105,24 @@ if (atomic_dec_and_test(&file->f_count)) { locks_remove_flock(file); + + if (file->f_iobuf) + free_kiovec(1, &file->f_iobuf); + if (file->f_op && file->f_op->release) file->f_op->release(inode, file); fops_put(file->f_op); - file->f_dentry = NULL; - file->f_vfsmnt = NULL; if (file->f_mode & FMODE_WRITE) put_write_access(inode); - dput(dentry); - if (mnt) - mntput(mnt); file_list_lock(); + file->f_dentry = NULL; + file->f_vfsmnt = NULL; list_del(&file->f_list); list_add(&file->f_list, &free_list); files_stat.nr_free_files++; file_list_unlock(); + dput(dentry); + mntput(mnt); } } @@ -158,14 +162,6 @@ file_list_unlock(); } -void file_moveto(struct file *new, struct file *old) -{ - file_list_lock(); - list_del(&new->f_list); - list_add(&new->f_list, &old->f_list); - file_list_unlock(); -} - int fs_may_remount_ro(struct super_block *sb) { struct list_head *p; @@ -174,12 +170,7 @@ file_list_lock(); for (p = sb->s_files.next; p != &sb->s_files; p = p->next) { struct file *file = list_entry(p, struct file, f_list); - struct inode *inode; - - if (!file->f_dentry) - continue; - - inode = file->f_dentry->d_inode; + struct inode *inode = file->f_dentry->d_inode; /* File with pending delete? 
*/ if (inode->i_nlink == 0) Index: filesystems.c =================================================================== RCS file: /cvsroot/linux-vax/kernel-2.4/fs/filesystems.c,v retrieving revision 1.1.1.1 retrieving revision 1.2 diff -u -r1.1.1.1 -r1.2 --- filesystems.c 14 Jan 2001 16:25:21 -0000 1.1.1.1 +++ filesystems.c 9 Apr 2002 13:11:16 -0000 1.2 @@ -7,36 +7,11 @@ */ #include <linux/config.h> -#include <linux/fs.h> - -#include <linux/devfs_fs_kernel.h> -#include <linux/nfs_fs.h> -#include <linux/auto_fs.h> -#include <linux/devpts_fs.h> -#include <linux/major.h> -#include <linux/smp.h> +#include <linux/module.h> +#include <linux/sched.h> #include <linux/smp_lock.h> #include <linux/kmod.h> -#include <linux/init.h> -#include <linux/module.h> #include <linux/nfsd/interface.h> - -#ifdef CONFIG_DEVPTS_FS -extern int init_devpts_fs(void); -#endif - -void __init filesystem_setup(void) -{ - init_devfs_fs(); /* Header file may make this empty */ - -#ifdef CONFIG_NFS_FS - init_nfs_fs(); -#endif - -#ifdef CONFIG_DEVPTS_FS - init_devpts_fs(); -#endif -} #if defined(CONFIG_NFSD_MODULE) struct nfsd_linkage *nfsd_linkage = NULL; Index: inode.c =================================================================== RCS file: /cvsroot/linux-vax/kernel-2.4/fs/inode.c,v retrieving revision 1.1.1.2 retrieving revision 1.2 diff -u -r1.1.1.2 -r1.2 --- inode.c 25 Feb 2001 23:14:45 -0000 1.1.1.2 +++ inode.c 9 Apr 2002 13:11:16 -0000 1.2 @@ -13,6 +13,10 @@ #include <linux/quotaops.h> #include <linux/slab.h> #include <linux/cache.h> +#include <linux/swap.h> +#include <linux/swapctl.h> +#include <linux/prefetch.h> +#include <linux/locks.h> /* * New inode.c implementation. @@ -62,7 +66,7 @@ * NOTE! You also have to own the lock if you change * the i_state of an inode while it is in use.. */ -spinlock_t inode_lock = SPIN_LOCK_UNLOCKED; +static spinlock_t inode_lock = SPIN_LOCK_UNLOCKED; /* * Statistics gathering.. @@ -75,7 +79,7 @@ ((struct inode *) kmem_cache_alloc(inode_cachep, SLAB_KERNEL)) static void destroy_inode(struct inode *inode) { - if (!list_empty(&inode->i_dirty_buffers)) + if (inode_has_buffers(inode)) BUG(); kmem_cache_free(inode_cachep, (inode)); } @@ -101,6 +105,8 @@ INIT_LIST_HEAD(&inode->i_data.locked_pages); INIT_LIST_HEAD(&inode->i_dentry); INIT_LIST_HEAD(&inode->i_dirty_buffers); + INIT_LIST_HEAD(&inode->i_dirty_data_buffers); + INIT_LIST_HEAD(&inode->i_devices); sema_init(&inode->i_sem, 1); sema_init(&inode->i_zombie, 1); spin_lock_init(&inode->i_data.i_shared_lock); @@ -123,36 +129,38 @@ /** * __mark_inode_dirty - internal function * @inode: inode to mark - * - * Mark an inode as dirty. Callers should use mark_inode_dirty. + * @flags: what kind of dirty (i.e. I_DIRTY_SYNC) + * Mark an inode as dirty. Callers should use mark_inode_dirty or + * mark_inode_dirty_sync. 
*/ void __mark_inode_dirty(struct inode *inode, int flags) { struct super_block * sb = inode->i_sb; - if (sb) { - /* Don't do this for I_DIRTY_PAGES - that doesn't actually dirty the inode itself */ - if (flags & (I_DIRTY | I_DIRTY_SYNC)) { - if (sb->s_op && sb->s_op->dirty_inode) - sb->s_op->dirty_inode(inode); - } + if (!sb) + return; - /* avoid the locking if we can */ - if ((inode->i_state & flags) == flags) - return; + /* Don't do this for I_DIRTY_PAGES - that doesn't actually dirty the inode itself */ + if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { + if (sb->s_op && sb->s_op->dirty_inode) + sb->s_op->dirty_inode(inode); + } - spin_lock(&inode_lock); - if ((inode->i_state & flags) != flags) { - inode->i_state |= flags; - /* Only add valid (ie hashed) inodes to the dirty list */ - if (!list_empty(&inode->i_hash)) { - list_del(&inode->i_list); - list_add(&inode->i_list, &sb->s_dirty); - } + /* avoid the locking if we can */ + if ((inode->i_state & flags) == flags) + return; + + spin_lock(&inode_lock); + if ((inode->i_state & flags) != flags) { + inode->i_state |= flags; + /* Only add valid (ie hashed) inodes to the dirty list */ + if (!(inode->i_state & I_LOCK) && !list_empty(&inode->i_hash)) { + list_del(&inode->i_list); + list_add(&inode->i_list, &sb->s_dirty); } - spin_unlock(&inode_lock); } + spin_unlock(&inode_lock); } static void __wait_on_inode(struct inode * inode) @@ -179,7 +187,7 @@ static inline void write_inode(struct inode *inode, int sync) { - if (inode->i_sb && inode->i_sb->s_op && inode->i_sb->s_op->write_inode) + if (inode->i_sb && inode->i_sb->s_op && inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) inode->i_sb->s_op->write_inode(inode, sync); } @@ -190,13 +198,53 @@ return; } atomic_inc(&inode->i_count); - if (!(inode->i_state & I_DIRTY)) { + if (!(inode->i_state & (I_DIRTY|I_LOCK))) { list_del(&inode->i_list); list_add(&inode->i_list, &inode_in_use); } inodes_stat.nr_unused--; } +static inline void __sync_one(struct inode *inode, int sync) +{ + unsigned dirty; + + list_del(&inode->i_list); + list_add(&inode->i_list, &inode->i_sb->s_locked_inodes); + + if (inode->i_state & I_LOCK) + BUG(); + + /* Set I_LOCK, reset I_DIRTY */ + dirty = inode->i_state & I_DIRTY; + inode->i_state |= I_LOCK; + inode->i_state &= ~I_DIRTY; + spin_unlock(&inode_lock); + + filemap_fdatasync(inode->i_mapping); + + /* Don't write the inode if only I_DIRTY_PAGES was set */ + if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) + write_inode(inode, sync); + + filemap_fdatawait(inode->i_mapping); + + spin_lock(&inode_lock); + inode->i_state &= ~I_LOCK; + if (!(inode->i_state & I_FREEING)) { + struct list_head *to; + if (inode->i_state & I_DIRTY) + to = &inode->i_sb->s_dirty; + else if (atomic_read(&inode->i_count)) + to = &inode_in_use; + else + to = &inode_unused; + list_del(&inode->i_list); + list_add(&inode->i_list, to); + } + wake_up(&inode->i_wait); +} + static inline void sync_one(struct inode *inode, int sync) { if (inode->i_state & I_LOCK) { @@ -206,38 +254,117 @@ iput(inode); spin_lock(&inode_lock); } else { - unsigned dirty; + __sync_one(inode, sync); + } +} - list_del(&inode->i_list); - list_add(&inode->i_list, atomic_read(&inode->i_count) - ? 
&inode_in_use - : &inode_unused); - /* Set I_LOCK, reset I_DIRTY */ - dirty = inode->i_state & I_DIRTY; - inode->i_state |= I_LOCK; - inode->i_state &= ~I_DIRTY; +static inline void sync_list(struct list_head *head) +{ + struct list_head * tmp; + + while ((tmp = head->prev) != head) + __sync_one(list_entry(tmp, struct inode, i_list), 0); +} + +static inline void wait_on_locked(struct list_head *head) +{ + struct list_head * tmp; + while ((tmp = head->prev) != head) { + struct inode *inode = list_entry(tmp, struct inode, i_list); + __iget(inode); spin_unlock(&inode_lock); + __wait_on_inode(inode); + iput(inode); + spin_lock(&inode_lock); + } +} - filemap_fdatasync(inode->i_mapping); +static inline int try_to_sync_unused_list(struct list_head *head, int nr_inodes) +{ + struct list_head *tmp = head; + struct inode *inode; - /* Don't write the inode if only I_DIRTY_PAGES was set */ - if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) - write_inode(inode, sync); + while (nr_inodes && (tmp = tmp->prev) != head) { + inode = list_entry(tmp, struct inode, i_list); - filemap_fdatawait(inode->i_mapping); + if (!atomic_read(&inode->i_count)) { + __sync_one(inode, 0); + nr_inodes--; - spin_lock(&inode_lock); - inode->i_state &= ~I_LOCK; - wake_up(&inode->i_wait); + /* + * __sync_one moved the inode to another list, + * so we have to start looking from the list head. + */ + tmp = head; + } + } + + return nr_inodes; +} + +void sync_inodes_sb(struct super_block *sb) +{ + spin_lock(&inode_lock); + while (!list_empty(&sb->s_dirty)||!list_empty(&sb->s_locked_inodes)) { + sync_list(&sb->s_dirty); + wait_on_locked(&sb->s_locked_inodes); } + spin_unlock(&inode_lock); } -static inline void sync_list(struct list_head *head) +/* + * Note: + * We don't need to grab a reference to superblock here. If it has non-empty + * ->s_dirty it's hadn't been killed yet and kill_super() won't proceed + * past sync_inodes_sb() until both ->s_dirty and ->s_locked_inodes are + * empty. Since __sync_one() regains inode_lock before it finally moves + * inode from superblock lists we are OK. + */ + +void sync_unlocked_inodes(void) { - struct list_head * tmp; + struct super_block * sb; + spin_lock(&inode_lock); + spin_lock(&sb_lock); + sb = sb_entry(super_blocks.next); + for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.next)) { + if (!list_empty(&sb->s_dirty)) { + spin_unlock(&sb_lock); + sync_list(&sb->s_dirty); + spin_lock(&sb_lock); + } + } + spin_unlock(&sb_lock); + spin_unlock(&inode_lock); +} - while ((tmp = head->prev) != head) - sync_one(list_entry(tmp, struct inode, i_list), 0); +/* + * Find a superblock with inodes that need to be synced + */ + +static struct super_block *get_super_to_sync(void) +{ + struct list_head *p; +restart: + spin_lock(&inode_lock); + spin_lock(&sb_lock); + list_for_each(p, &super_blocks) { + struct super_block *s = list_entry(p,struct super_block,s_list); + if (list_empty(&s->s_dirty) && list_empty(&s->s_locked_inodes)) + continue; + s->s_count++; + spin_unlock(&sb_lock); + spin_unlock(&inode_lock); + down_read(&s->s_umount); + if (!s->s_root) { + drop_super(s); + goto restart; + } + return s; + } + spin_unlock(&sb_lock); + spin_unlock(&inode_lock); + return NULL; } /** @@ -247,42 +374,48 @@ * sync_inodes goes through the super block's dirty list, * writes them out, and puts them back on the normal list. */ - + void sync_inodes(kdev_t dev) { - struct super_block * sb = sb_entry(super_blocks.next); + struct super_block * s; /* * Search the super_blocks array for the device(s) to sync. 
*/ - spin_lock(&inode_lock); - for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.next)) { - if (!sb->s_dev) - continue; - if (dev && sb->s_dev != dev) - continue; - - sync_list(&sb->s_dirty); - - if (dev) - break; + if (dev) { + if ((s = get_super(dev)) != NULL) { + sync_inodes_sb(s); + drop_super(s); + } + } else { + while ((s = get_super_to_sync()) != NULL) { + sync_inodes_sb(s); + drop_super(s); + } } - spin_unlock(&inode_lock); } -/* - * Called with the spinlock already held.. - */ -static void sync_all_inodes(void) +static void try_to_sync_unused_inodes(void * arg) { - struct super_block * sb = sb_entry(super_blocks.next); - for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.next)) { - if (!sb->s_dev) + struct super_block * sb; + int nr_inodes = inodes_stat.nr_unused; + + spin_lock(&inode_lock); + spin_lock(&sb_lock); + sb = sb_entry(super_blocks.next); + for (; nr_inodes && sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.next)) { + if (list_empty(&sb->s_dirty)) continue; - sync_list(&sb->s_dirty); + spin_unlock(&sb_lock); + nr_inodes = try_to_sync_unused_list(&sb->s_dirty, nr_inodes); + spin_lock(&sb_lock); } + spin_unlock(&sb_lock); + spin_unlock(&inode_lock); } +static struct tq_struct unused_inodes_flush_task; + /** * write_inode_now - write an inode to disk * @inode: inode to write to disk @@ -301,9 +434,11 @@ while (inode->i_state & I_DIRTY) sync_one(inode, sync); spin_unlock(&inode_lock); + if (sync) + wait_on_inode(inode); } else - printk("write_inode_now: no super block\n"); + printk(KERN_ERR "write_inode_now: no super block\n"); } /** @@ -315,9 +450,9 @@ * O_SYNC flag set, to flush dirty writes to disk. */ -int generic_osync_inode(struct inode *inode, int datasync) +int generic_osync_inode(struct inode *inode, int what) { - int err; + int err = 0, err2 = 0, need_write_inode_now = 0; /* * WARNING @@ -340,23 +475,24 @@ * every O_SYNC write, not just the synchronous I/Os. 
--sct */ -#ifdef WRITERS_QUEUE_IO - err = osync_inode_buffers(inode); -#else - err = fsync_inode_buffers(inode); -#endif + if (what & OSYNC_METADATA) + err = fsync_inode_buffers(inode); + if (what & OSYNC_DATA) + err2 = fsync_inode_data_buffers(inode); + if (!err) + err = err2; spin_lock(&inode_lock); - if (!(inode->i_state & I_DIRTY)) - goto out; - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) - goto out; + if ((inode->i_state & I_DIRTY) && + ((what & OSYNC_INODE) || (inode->i_state & I_DIRTY_DATASYNC))) + need_write_inode_now = 1; spin_unlock(&inode_lock); - write_inode_now(inode, 1); - return err; - out: - spin_unlock(&inode_lock); + if (need_write_inode_now) + write_inode_now(inode, 1); + else + wait_on_inode(inode); + return err; } @@ -371,8 +507,7 @@ void clear_inode(struct inode *inode) { - if (!list_empty(&inode->i_dirty_buffers)) - invalidate_inode_buffers(inode); + invalidate_inode_buffers(inode); if (inode->i_data.nrpages) BUG(); @@ -381,13 +516,14 @@ if (inode->i_state & I_CLEAR) BUG(); wait_on_inode(inode); - if (IS_QUOTAINIT(inode)) - DQUOT_DROP(inode); + DQUOT_DROP(inode); if (inode->i_sb && inode->i_sb->s_op && inode->i_sb->s_op->clear_inode) inode->i_sb->s_op->clear_inode(inode); - if (inode->i_bdev) { - bdput(inode->i_bdev); - inode->i_bdev = NULL; + if (inode->i_bdev) + bd_forget(inode); + else if (inode->i_cdev) { + cdput(inode->i_cdev); + inode->i_cdev = NULL; } inode->i_state = I_CLEAR; } @@ -435,8 +571,7 @@ continue; invalidate_inode_buffers(inode); if (!atomic_read(&inode->i_count)) { - list_del(&inode->i_hash); - INIT_LIST_HEAD(&inode->i_hash); + list_del_init(&inode->i_hash); list_del(&inode->i_list); list_add(&inode->i_list, dispose); inode->i_state |= I_FREEING; @@ -476,12 +611,39 @@ busy = invalidate_list(&inode_in_use, sb, &throw_away); busy |= invalidate_list(&inode_unused, sb, &throw_away); busy |= invalidate_list(&sb->s_dirty, sb, &throw_away); + busy |= invalidate_list(&sb->s_locked_inodes, sb, &throw_away); spin_unlock(&inode_lock); dispose_list(&throw_away); return busy; } + +int invalidate_device(kdev_t dev, int do_sync) +{ + struct super_block *sb; + int res; + + if (do_sync) + fsync_dev(dev); + + res = 0; + sb = get_super(dev); + if (sb) { + /* + * no need to lock the super, get_super holds the + * read semaphore so the filesystem cannot go away + * under us (->put_super runs with the write lock + * hold). + */ + shrink_dcache_sb(sb); + res = invalidate_inodes(sb); + drop_super(sb); + } + invalidate_buffers(dev); + return res; +} + /* * This is called with the inode lock held. It searches @@ -503,13 +665,12 @@ { LIST_HEAD(list); struct list_head *entry, *freeable = &list; - int count = 0; + int count; struct inode * inode; spin_lock(&inode_lock); - /* go simple and safe syncing everything before starting */ - sync_all_inodes(); + count = 0; entry = inode_unused.prev; while (entry != &inode_unused) { @@ -517,12 +678,12 @@ entry = entry->prev; inode = INODE(tmp); - if (inode->i_state & (I_FREEING|I_CLEAR)) - BUG(); + if (inode->i_state & (I_FREEING|I_CLEAR|I_LOCK)) + continue; if (!CAN_UNUSE(inode)) continue; if (atomic_read(&inode->i_count)) - BUG(); + continue; list_del(tmp); list_del(&inode->i_hash); INIT_LIST_HEAD(&inode->i_hash); @@ -536,9 +697,18 @@ spin_unlock(&inode_lock); dispose_list(freeable); + + /* + * If we didn't freed enough clean inodes schedule + * a sync of the dirty inodes, we cannot do it + * from here or we're either synchronously dogslow + * or we deadlock with oom. 
+ */ + if (goal) + schedule_task(&unused_inodes_flush_task); } -void shrink_icache_memory(int priority, int gfp_mask) +int shrink_icache_memory(int priority, int gfp_mask) { int count = 0; @@ -549,14 +719,14 @@ * want to recurse into the FS that called us * in clear_inode() and friends.. */ - if (!(gfp_mask & __GFP_IO)) - return; + if (!(gfp_mask & __GFP_FS)) + return 0; - if (priority) - count = inodes_stat.nr_unused / priority; + count = inodes_stat.nr_unused / priority; prune_icache(count); kmem_cache_shrink(inode_cachep); + return 0; } /* @@ -607,12 +777,15 @@ inode->i_nlink = 1; atomic_set(&inode->i_writecount, 0); inode->i_size = 0; + inode->i_blocks = 0; inode->i_generation = 0; memset(&inode->i_dquot, 0, sizeof(inode->i_dquot)); inode->i_pipe = NULL; inode->i_bdev = NULL; + inode->i_cdev = NULL; inode->i_data.a_ops = &empty_aops; inode->i_data.host = inode; + inode->i_data.gfp_mask = GFP_HIGHUSER; inode->i_mapping = &inode->i_data; } @@ -634,6 +807,8 @@ static unsigned long last_ino; struct inode * inode; + spin_lock_prefetch(&inode_lock); + inode = alloc_inode(); if (inode) { @@ -642,6 +817,7 @@ list_add(&inode->i_list, &inode_in_use); inode->i_sb = NULL; inode->i_dev = 0; + inode->i_blkbits = 0; inode->i_ino = ++last_ino; inode->i_flags = 0; atomic_set(&inode->i_count, 1); @@ -675,6 +851,7 @@ list_add(&inode->i_hash, head); inode->i_sb = sb; inode->i_dev = sb->s_dev; + inode->i_blkbits = sb->s_blocksize_bits; inode->i_ino = ino; inode->i_flags = 0; atomic_set(&inode->i_count, 1); @@ -781,8 +958,6 @@ */ inode = NULL; spin_unlock(&inode_lock); - if (inode) - wait_on_inode(inode); return inode; } @@ -853,10 +1028,14 @@ void iput(struct inode *inode) { if (inode) { + struct super_block *sb = inode->i_sb; struct super_operations *op = NULL; - if (inode->i_sb && inode->i_sb->s_op) - op = inode->i_sb->s_op; + if (inode->i_state == I_CLEAR) + BUG(); + + if (sb && sb->s_op) + op = sb->s_op; if (op && op->put_inode) op->put_inode(inode); @@ -877,6 +1056,8 @@ if (op && op->delete_inode) { void (*delete)(struct inode *) = op->delete_inode; + if (!is_bad_inode(inode)) + DQUOT_INIT(inode); /* s_op->delete_inode internally recalls clear_inode() */ delete(inode); } else @@ -884,22 +1065,22 @@ if (inode->i_state != I_CLEAR) BUG(); } else { - if (!list_empty(&inode->i_hash)) { - if (!(inode->i_state & I_DIRTY)) { + if (!list_empty(&inode->i_hash) && sb && sb->s_root) { + if (!(inode->i_state & (I_DIRTY|I_LOCK))) { list_del(&inode->i_list); - list_add(&inode->i_list, - &inode_unused); + list_add(&inode->i_list, &inode_unused); } inodes_stat.nr_unused++; spin_unlock(&inode_lock); return; } else { - /* magic nfs path */ - list_del(&inode->i_list); - INIT_LIST_HEAD(&inode->i_list); + list_del_init(&inode->i_list); + list_del_init(&inode->i_hash); inode->i_state|=I_FREEING; inodes_stat.nr_inodes--; spin_unlock(&inode_lock); + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); clear_inode(inode); } } @@ -988,6 +1169,8 @@ NULL); if (!inode_cachep) panic("cannot create inode slab cache"); + + unused_inodes_flush_task.routine = try_to_sync_unused_inodes; } /** @@ -1018,38 +1201,40 @@ void put_dquot_list(struct list_head *); int remove_inode_dquot_ref(struct inode *, short, struct list_head *); -void remove_dquot_ref(kdev_t dev, short type) +void remove_dquot_ref(struct super_block *sb, short type) { - struct super_block *sb = get_super(dev); struct inode *inode; struct list_head *act_head; LIST_HEAD(tofree_head); - if (!sb || !sb->dq_op) + if (!sb->dq_op) return; /* nothing to do */ - /* 
We have to be protected against other CPUs */ - spin_lock(&inode_lock); + lock_kernel(); /* This lock is for quota code */ + spin_lock(&inode_lock); /* This lock is for inodes code */ - for (act_head = inode_in_use.next; act_head != &inode_in_use; act_head = act_head->next) { + list_for_each(act_head, &inode_in_use) { inode = list_entry(act_head, struct inode, i_list); - if (inode->i_sb != sb || !IS_QUOTAINIT(inode)) - continue; - remove_inode_dquot_ref(inode, type, &tofree_head); + if (inode->i_sb == sb && IS_QUOTAINIT(inode)) + remove_inode_dquot_ref(inode, type, &tofree_head); } - for (act_head = inode_unused.next; act_head != &inode_unused; act_head = act_head->next) { + list_for_each(act_head, &inode_unused) { inode = list_entry(act_head, struct inode, i_list); - if (inode->i_sb != sb || !IS_QUOTAINIT(inode)) - continue; - remove_inode_dquot_ref(inode, type, &tofree_head); + if (inode->i_sb == sb && IS_QUOTAINIT(inode)) + remove_inode_dquot_ref(inode, type, &tofree_head); } - for (act_head = sb->s_dirty.next; act_head != &sb->s_dirty; act_head = act_head->next) { + list_for_each(act_head, &sb->s_dirty) { inode = list_entry(act_head, struct inode, i_list); - if (!IS_QUOTAINIT(inode)) - continue; - remove_inode_dquot_ref(inode, type, &tofree_head); + if (IS_QUOTAINIT(inode)) + remove_inode_dquot_ref(inode, type, &tofree_head); + } + list_for_each(act_head, &sb->s_locked_inodes) { + inode = list_entry(act_head, struct inode, i_list); + if (IS_QUOTAINIT(inode)) + remove_inode_dquot_ref(inode, type, &tofree_head); } spin_unlock(&inode_lock); + unlock_kernel(); put_dquot_list(&tofree_head); } Index: iobuf.c =================================================================== RCS file: /cvsroot/linux-vax/kernel-2.4/fs/iobuf.c,v retrieving revision 1.1.1.2 retrieving revision 1.2 diff -u -r1.1.1.2 -r1.2 --- iobuf.c 25 Feb 2001 23:14:46 -0000 1.1.1.2 +++ iobuf.c 9 Apr 2002 13:11:16 -0000 1.2 @@ -8,9 +8,7 @@ #include <linux/iobuf.h> #include <linux/slab.h> - -static kmem_cache_t *kiobuf_cachep; - +#include <linux/vmalloc.h> void end_kio_request(struct kiobuf *kiobuf, int uptodate) { @@ -24,18 +22,7 @@ } } - -void __init kiobuf_setup(void) -{ - kiobuf_cachep = kmem_cache_create("kiobuf", - sizeof(struct kiobuf), - 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); - if(!kiobuf_cachep) - panic("Cannot create kernel iobuf cache\n"); -} - -void kiobuf_init(struct kiobuf *iobuf) +static void kiobuf_init(struct kiobuf *iobuf) { memset(iobuf, 0, sizeof(*iobuf)); init_waitqueue_head(&iobuf->wait_queue); @@ -43,19 +30,49 @@ iobuf->maplist = iobuf->map_array; } +int alloc_kiobuf_bhs(struct kiobuf * kiobuf) +{ + int i; + + for (i = 0; i < KIO_MAX_SECTORS; i++) + if (!(kiobuf->bh[i] = kmem_cache_alloc(bh_cachep, SLAB_KERNEL))) { + while (i--) { + kmem_cache_free(bh_cachep, kiobuf->bh[i]); + kiobuf->bh[i] = NULL; + } + return -ENOMEM; + } + return 0; +} + +void free_kiobuf_bhs(struct kiobuf * kiobuf) +{ + int i; + + for (i = 0; i < KIO_MAX_SECTORS; i++) { + kmem_cache_free(bh_cachep, kiobuf->bh[i]); + kiobuf->bh[i] = NULL; + } +} + int alloc_kiovec(int nr, struct kiobuf **bufp) { int i; struct kiobuf *iobuf; for (i = 0; i < nr; i++) { - iobuf = kmem_cache_alloc(kiobuf_cachep, SLAB_KERNEL); + iobuf = vmalloc(sizeof(struct kiobuf)); if (!iobuf) { free_kiovec(i, bufp); return -ENOMEM; } kiobuf_init(iobuf); - *bufp++ = iobuf; + if (alloc_kiobuf_bhs(iobuf)) { + vfree(iobuf); + free_kiovec(i, bufp); + return -ENOMEM; + } + bufp[i] = iobuf; } return 0; @@ -72,7 +89,8 @@ unlock_kiovec(1, &iobuf); if (iobuf->array_len > 
KIO_STATIC_PAGES) kfree (iobuf->maplist); - kmem_cache_free(kiobuf_cachep, bufp[i]); + free_kiobuf_bhs(iobuf); + vfree(bufp[i]); } } @@ -115,11 +133,12 @@ add_wait_queue(&kiobuf->wait_queue, &wait); repeat: - run_task_queue(&tq_disk); set_task_state(tsk, TASK_UNINTERRUPTIBLE); if (atomic_read(&kiobuf->io_count) != 0) { + run_task_queue(&tq_disk); schedule(); - goto repeat; + if (atomic_read(&kiobuf->io_count) != 0) + goto repeat; } tsk->state = TASK_RUNNING; remove_wait_queue(&kiobuf->wait_queue, &wait); |
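
A recurring cleanup in this sync (dcache.c and inode.c) replaces the
open-coded pair list_del() + INIT_LIST_HEAD() with the list_del_init()
helper from <linux/list.h>. The helper is equivalent to the two calls it
replaces; roughly, as a sketch (the real inline routes through the
internal __list_del() helper):

	static inline void list_del_init(struct list_head *entry)
	{
		/* unlink entry from whatever list it is currently on */
		entry->next->prev = entry->prev;
		entry->prev->next = entry->next;
		/* re-point entry at itself so list_empty(entry) is true */
		entry->next = entry;
		entry->prev = entry;
	}

The reinitialisation is exactly what the '/* make "list_empty()" work */'
comment removed from dcache.c was there for: the entry can be tested and
re-linked later without tripping over stale pointers.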
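
In the same spirit, the rewritten remove_dquot_ref() and the new
get_super_to_sync() walk their lists with the list_for_each() iterator
instead of hand-rolled next-pointer loops. As a sketch of the 2.4-era
macro (the stock kernel version also adds prefetch() hints, omitted here):

	/* iterate pos over every entry of the list anchored at head */
	#define list_for_each(pos, head) \
		for (pos = (head)->next; pos != (head); pos = pos->next)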
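
The fcntl.c change teaches F_SETFL to accept O_DIRECT: the first time the
flag is set on a file, setfl() allocates the per-file kiovec
(filp->f_iobuf) under inode->i_sem so that later direct I/O has its
buffers ready. From userspace the new path is reached by the usual
read-modify-write of the status flags; a minimal, hypothetical snippet
(set_odirect() and fd are illustrative names, and glibc wants _GNU_SOURCE
defined before O_DIRECT is visible):

	#define _GNU_SOURCE
	#include <fcntl.h>

	int set_odirect(int fd)
	{
		int flags = fcntl(fd, F_GETFL);	/* current status flags */
		if (flags < 0)
			return -1;
		/* F_SETFL with O_DIRECT lands in the new setfl() code */
		return fcntl(fd, F_SETFL, flags | O_DIRECT);
	}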