Update of /cvsroot/linux-vax/kernel-2.4/fs In directory usw-pr-cvs1:/tmp/cvs-serv27691 Modified Files: dcache.c devices.c dquot.c exec.c fcntl.c file_table.c filesystems.c inode.c iobuf.c Log Message: sync 2.4.15 commit 11 Index: dcache.c =================================================================== RCS file: /cvsroot/linux-vax/kernel-2.4/fs/dcache.c,v retrieving revision 1.1.1.2 retrieving revision 1.2 diff -u -r1.1.1.2 -r1.2 --- dcache.c 25 Feb 2001 23:14:46 -0000 1.1.1.2 +++ dcache.c 9 Apr 2002 13:11:16 -0000 1.2 @@ -22,6 +22,7 @@ #include <linux/init.h> #include <linux/smp_lock.h> #include <linux/cache.h> +#include <linux/module.h> #include <asm/uaccess.h> @@ -138,10 +139,6 @@ goto kill_it; list_add(&dentry->d_lru, &dentry_unused); dentry_stat.nr_unused++; - /* - * Update the timestamp - */ - dentry->d_reftime = jiffies; spin_unlock(&dcache_lock); return; @@ -223,8 +220,7 @@ atomic_inc(&dentry->d_count); if (atomic_read(&dentry->d_count) == 1) { dentry_stat.nr_unused--; - list_del(&dentry->d_lru); - INIT_LIST_HEAD(&dentry->d_lru); /* make "list_empty()" work */ + list_del_init(&dentry->d_lru); } return dentry; } @@ -337,10 +333,10 @@ dentry = list_entry(tmp, struct dentry, d_lru); /* If the dentry was recently referenced, don't free it. */ - if (dentry->d_flags & DCACHE_REFERENCED) { - dentry->d_flags &= ~DCACHE_REFERENCED; + if (dentry->d_vfs_flags & DCACHE_REFERENCED) { + dentry->d_vfs_flags &= ~DCACHE_REFERENCED; list_add(&dentry->d_lru, &dentry_unused); - goto next; + continue; } dentry_stat.nr_unused--; @@ -349,7 +345,6 @@ BUG(); prune_one_dentry(dentry); - next: if (!--count) break; } @@ -413,8 +408,7 @@ if (atomic_read(&dentry->d_count)) continue; dentry_stat.nr_unused--; - list_del(tmp); - INIT_LIST_HEAD(tmp); + list_del_init(tmp); prune_one_dentry(dentry); goto repeat; } @@ -553,7 +547,7 @@ * ... * 6 - base-level: try to shrink a bit. 
*/ -void shrink_dcache_memory(int priority, unsigned int gfp_mask) +int shrink_dcache_memory(int priority, unsigned int gfp_mask) { int count = 0; @@ -568,14 +562,14 @@ * We should make sure we don't hold the superblock lock over * block allocations, but for now: */ - if (!(gfp_mask & __GFP_IO)) - return; + if (!(gfp_mask & __GFP_FS)) + return 0; - if (priority) - count = dentry_stat.nr_unused / priority; + count = dentry_stat.nr_unused / priority; prune_dcache(count); kmem_cache_shrink(dentry_cache); + return 0; } #define NAME_ALLOC_LEN(len) ((len+16) & ~15) @@ -612,6 +606,7 @@ str[name->len] = 0; atomic_set(&dentry->d_count, 1); + dentry->d_vfs_flags = 0; dentry->d_flags = 0; dentry->d_inode = NULL; dentry->d_parent = NULL; @@ -621,7 +616,7 @@ dentry->d_name.hash = name->hash; dentry->d_op = NULL; dentry->d_fsdata = NULL; - INIT_LIST_HEAD(&dentry->d_vfsmnt); + dentry->d_mounted = 0; INIT_LIST_HEAD(&dentry->d_hash); INIT_LIST_HEAD(&dentry->d_lru); INIT_LIST_HEAD(&dentry->d_subdirs); @@ -656,6 +651,7 @@ void d_instantiate(struct dentry *entry, struct inode * inode) { + if (!list_empty(&entry->d_alias)) BUG(); spin_lock(&dcache_lock); if (inode) list_add(&entry->d_alias, &inode->i_dentry); @@ -734,7 +730,7 @@ continue; } __dget_locked(dentry); - dentry->d_flags |= DCACHE_REFERENCED; + dentry->d_vfs_flags |= DCACHE_REFERENCED; spin_unlock(&dcache_lock); return dentry; } @@ -744,58 +740,48 @@ /** * d_validate - verify dentry provided from insecure source - * @dentry: The dentry alleged to be valid - * @dparent: The parent dentry + * @dentry: The dentry alleged to be valid child of @dparent + * @dparent: The parent dentry (known to be valid) * @hash: Hash of the dentry * @len: Length of the name * * An insecure source has sent us a dentry, here we verify it and dget() it. * This is used by ncpfs in its readdir implementation. * Zero is returned in the dentry is invalid. - * - * NOTE: This function does _not_ dereference the pointers before we have - * validated them. We can test the pointer values, but we - * must not actually use them until we have found a valid - * copy of the pointer in kernel space.. */ -int d_validate(struct dentry *dentry, struct dentry *dparent, - unsigned int hash, unsigned int len) +int d_validate(struct dentry *dentry, struct dentry *dparent) { + unsigned long dent_addr = (unsigned long) dentry; + unsigned long min_addr = PAGE_OFFSET; + unsigned long align_mask = 0x0F; struct list_head *base, *lhp; - int valid = 1; - spin_lock(&dcache_lock); - if (dentry != dparent) { - base = d_hash(dparent, hash); - lhp = base; - while ((lhp = lhp->next) != base) { - if (dentry == list_entry(lhp, struct dentry, d_hash)) { - __dget_locked(dentry); - goto out; - } - } - } else { - /* - * Special case: local mount points don't live in - * the hashes, so we search the super blocks. 
- */ - struct super_block *sb = sb_entry(super_blocks.next); + if (dent_addr < min_addr) + goto out; + if (dent_addr > (unsigned long)high_memory - sizeof(struct dentry)) + goto out; + if (dent_addr & align_mask) + goto out; + if ((!kern_addr_valid(dent_addr)) || (!kern_addr_valid(dent_addr -1 + + sizeof(struct dentry)))) + goto out; - for (; sb != sb_entry(&super_blocks); - sb = sb_entry(sb->s_list.next)) { - if (!sb->s_dev) - continue; - if (sb->s_root == dentry) { - __dget_locked(dentry); - goto out; - } + if (dentry->d_parent != dparent) + goto out; + + spin_lock(&dcache_lock); + lhp = base = d_hash(dparent, dentry->d_name.hash); + while ((lhp = lhp->next) != base) { + if (dentry == list_entry(lhp, struct dentry, d_hash)) { + __dget_locked(dentry); + spin_unlock(&dcache_lock); + return 1; } } - valid = 0; -out: spin_unlock(&dcache_lock); - return valid; +out: + return 0; } /* @@ -848,6 +834,7 @@ void d_rehash(struct dentry * entry) { struct list_head *list = d_hash(entry->d_parent, entry->d_name.hash); + if (!list_empty(&entry->d_hash)) BUG(); spin_lock(&dcache_lock); list_add(&entry->d_hash, list); spin_unlock(&dcache_lock); @@ -922,8 +909,7 @@ list_add(&dentry->d_hash, &target->d_hash); /* Unhash the target: dput() will then get rid of it */ - list_del(&target->d_hash); - INIT_LIST_HEAD(&target->d_hash); + list_del_init(&target->d_hash); list_del(&dentry->d_child); list_del(&target->d_child); @@ -1239,6 +1225,18 @@ } while (i); } +static void init_buffer_head(void * foo, kmem_cache_t * cachep, unsigned long flags) +{ + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) + { + struct buffer_head * bh = (struct buffer_head *) foo; + + memset(bh, 0, sizeof(*bh)); + init_waitqueue_head(&bh->b_wait); + } +} + /* SLAB cache for __getname() consumers */ kmem_cache_t *names_cachep; @@ -1250,12 +1248,16 @@ /* SLAB cache for buffer_head structures */ kmem_cache_t *bh_cachep; +EXPORT_SYMBOL(bh_cachep); + +extern void bdev_cache_init(void); +extern void cdev_cache_init(void); void __init vfs_caches_init(unsigned long mempages) { bh_cachep = kmem_cache_create("buffer_head", sizeof(struct buffer_head), 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); + SLAB_HWCACHE_ALIGN, init_buffer_head, NULL); if(!bh_cachep) panic("Cannot create buffer head SLAB cache"); @@ -1280,4 +1282,8 @@ #endif dcache_init(mempages); + inode_init(mempages); + mnt_init(mempages); + bdev_cache_init(); + cdev_cache_init(); } Index: devices.c =================================================================== RCS file: /cvsroot/linux-vax/kernel-2.4/fs/devices.c,v retrieving revision 1.1.1.1 retrieving revision 1.2 diff -u -r1.1.1.1 -r1.2 --- devices.c 14 Jan 2001 16:25:21 -0000 1.1.1.1 +++ devices.c 9 Apr 2002 13:11:16 -0000 1.2 @@ -203,10 +203,10 @@ if (S_ISCHR(mode)) { inode->i_fop = &def_chr_fops; inode->i_rdev = to_kdev_t(rdev); + inode->i_cdev = cdget(rdev); } else if (S_ISBLK(mode)) { inode->i_fop = &def_blk_fops; inode->i_rdev = to_kdev_t(rdev); - inode->i_bdev = bdget(rdev); } else if (S_ISFIFO(mode)) inode->i_fop = &def_fifo_fops; else if (S_ISSOCK(mode)) Index: dquot.c =================================================================== RCS file: /cvsroot/linux-vax/kernel-2.4/fs/dquot.c,v retrieving revision 1.1.1.2 retrieving revision 1.2 diff -u -r1.1.1.2 -r1.2 --- dquot.c 25 Feb 2001 23:14:46 -0000 1.1.1.2 +++ dquot.c 9 Apr 2002 13:11:16 -0000 1.2 @@ -26,7 +26,7 @@ * dquot_incr_...() to calling functions. * invalidate_dquots() now writes modified dquots. 
* Serialized quota_off() and quota_on() for mount point. - * Fixed a few bugs in grow_dquots. + * Fixed a few bugs in grow_dquots(). * Fixed deadlock in write_dquot() - we no longer account quotas on * quota files * remove_dquot_ref() moved to inode.c - it now traverses through inodes @@ -34,13 +34,24 @@ * Added check for bogus uid and fixed check for group in quotactl. * Jan Kara, <ja...@su...>, sponsored by SuSE CR, 10-11/99 [...1541 lines suppressed...] + ret = quota_on(sb, type, (char *) addr); goto out; case Q_QUOTAOFF: ret = quota_off(sb, type); @@ -1597,12 +1468,12 @@ goto out; } - flags |= QUOTA_SYSCALL; - - ret = -ESRCH; + ret = -NODEV; if (sb && sb_has_quota_enabled(sb, type)) ret = set_dqblk(sb, id, type, flags, (struct dqblk *) addr); out: + if (sb) + drop_super(sb); unlock_kernel(); return ret; } Index: exec.c =================================================================== RCS file: /cvsroot/linux-vax/kernel-2.4/fs/exec.c,v retrieving revision 1.1.1.2 retrieving revision 1.2 diff -u -r1.1.1.2 -r1.2 --- exec.c 25 Feb 2001 23:14:45 -0000 1.1.1.2 +++ exec.c 9 Apr 2002 13:11:16 -0000 1.2 @@ -34,6 +34,7 @@ #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/spinlock.h> +#include <linux/personality.h> #define __NO_VERSION__ #include <linux/module.h> @@ -45,6 +46,8 @@ #include <linux/kmod.h> #endif +int core_uses_pid; + static struct linux_binfmt *formats; static rwlock_t binfmt_lock = RW_LOCK_UNLOCKED; @@ -159,11 +162,9 @@ if (argv != NULL) { for (;;) { char * p; - int error; - error = get_user(p,argv); - if (error) - return error; + if (get_user(p, argv)) + return -EFAULT; if (!p) break; argv++; @@ -186,7 +187,7 @@ int len; unsigned long pos; - if (get_user(str, argv+argc) || !str || !(len = strnlen_user(str, bprm->p))) + if (get_user(str, argv+argc) || !(len = strnlen_user(str, bprm->p))) return -EFAULT; if (bprm->p < len) return -E2BIG; @@ -252,6 +253,8 @@ /* * This routine is used to map in a page into an address space: needed by * execve() for the initial stack and environment pages. + * + * tsk->mmap_sem is held for writing. 
*/ void put_dirty_page(struct task_struct * tsk, struct page *page, unsigned long address) { @@ -260,29 +263,32 @@ pte_t * pte; if (page_count(page) != 1) - printk("mem_map disagrees with %p at %08lx\n", page, address); + printk(KERN_ERR "mem_map disagrees with %p at %08lx\n", page, address); pgd = pgd_offset(tsk->mm, address); - pmd = pmd_alloc(pgd, address); - if (!pmd) { - __free_page(page); - force_sig(SIGKILL, tsk); - return; - } - pte = pte_alloc(pmd, address); - if (!pte) { - __free_page(page); - force_sig(SIGKILL, tsk); - return; - } - if (!pte_none(*pte)) { - pte_ERROR(*pte); - __free_page(page); - return; - } + + spin_lock(&tsk->mm->page_table_lock); + pmd = pmd_alloc(tsk->mm, pgd, address); + if (!pmd) + goto out; + pte = pte_alloc(tsk->mm, pmd, address); + if (!pte) + goto out; + if (!pte_none(*pte)) + goto out; + lru_cache_add(page); flush_dcache_page(page); flush_page_to_ram(page); set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, PAGE_COPY)))); -/* no need for flush_tlb */ + tsk->mm->rss++; + spin_unlock(&tsk->mm->page_table_lock); + + /* no need for flush_tlb */ + return; +out: + spin_unlock(&tsk->mm->page_table_lock); + __free_page(page); + force_sig(SIGKILL, tsk); + return; } int setup_arg_pages(struct linux_binprm *bprm) @@ -302,7 +308,7 @@ if (!mpnt) return -ENOMEM; - down(¤t->mm->mmap_sem); + down_write(¤t->mm->mmap_sem); { mpnt->vm_mm = current->mm; mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p; @@ -321,12 +327,11 @@ struct page *page = bprm->page[i]; if (page) { bprm->page[i] = NULL; - current->mm->rss++; put_dirty_page(current,page,stack_base); } stack_base += PAGE_SIZE; } - up(¤t->mm->mmap_sem); + up_write(¤t->mm->mmap_sem); return 0; } @@ -344,8 +349,11 @@ if (!err) { inode = nd.dentry->d_inode; file = ERR_PTR(-EACCES); - if (!IS_NOEXEC(inode) && S_ISREG(inode->i_mode)) { + if (!(nd.mnt->mnt_flags & MNT_NOEXEC) && + S_ISREG(inode->i_mode)) { int err = permission(inode, MAY_EXEC); + if (!err && !(inode->i_mode & 0111)) + err = -EACCES; file = ERR_PTR(err); if (!err) { file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); @@ -545,7 +553,7 @@ current->sas_ss_sp = current->sas_ss_size = 0; if (current->euid == current->uid && current->egid == current->gid) - current->dumpable = 1; + current->mm->dumpable = 1; name = bprm->filename; for (i=0; (ch = *(name++)) != '\0';) { if (ch == '/') @@ -562,7 +570,7 @@ if (bprm->e_uid != current->euid || bprm->e_gid != current->egid || permission(bprm->file->f_dentry->d_inode,MAY_READ)) - current->dumpable = 0; + current->mm->dumpable = 0; /* An exec changes our domain. We are no longer part of the thread group */ @@ -577,9 +585,10 @@ mmap_failed: flush_failed: spin_lock_irq(¤t->sigmask_lock); - if (current->sig != oldsig) + if (current->sig != oldsig) { kfree(current->sig); - current->sig = oldsig; + current->sig = oldsig; + } spin_unlock_irq(¤t->sigmask_lock); return retval; } @@ -590,7 +599,7 @@ */ static inline int must_not_trace_exec(struct task_struct * p) { - return (p->ptrace & PT_PTRACED) && !cap_raised(p->p_pptr->cap_effective, CAP_SYS_PTRACE); + return (p->ptrace & PT_PTRACED) && !(p->ptrace & PT_PTRACE_CAP); } /* @@ -603,7 +612,10 @@ struct inode * inode = bprm->file->f_dentry->d_inode; mode = inode->i_mode; - /* Huh? We had already checked for MAY_EXEC, WTF do we check this? 
*/ + /* + * Check execute perms again - if the caller has CAP_DAC_OVERRIDE, + * vfs_permission lets a non-executable through + */ if (!(mode & 0111)) /* with at least _one_ execute bit set */ return -EACCES; if (bprm->file->f_op == NULL) @@ -612,7 +624,7 @@ bprm->e_uid = current->euid; bprm->e_gid = current->egid; - if(!IS_NOSUID(inode)) { + if(!(bprm->file->f_vfsmnt->mnt_flags & MNT_NOSUID)) { /* Set-uid? */ if (mode & S_ISUID) bprm->e_uid = inode->i_uid; @@ -680,7 +692,7 @@ if (bprm->e_uid != current->uid || bprm->e_gid != current->gid || !cap_issubset(new_permitted, current->cap_permitted)) { - current->dumpable = 0; + current->mm->dumpable = 0; lock_kernel(); if (must_not_trace_exec(current) @@ -759,7 +771,6 @@ if (!bprm->loader && eh->fh.f_magic == 0x183 && (eh->fh.f_flags & 0x3000) == 0x3000) { - char * dynloader[] = { "/sbin/loader" }; struct file * file; unsigned long loader; @@ -769,10 +780,14 @@ loader = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *); - file = open_exec(dynloader[0]); + file = open_exec("/sbin/loader"); retval = PTR_ERR(file); if (IS_ERR(file)) return retval; + + /* Remember if the application is TASO. */ + bprm->sh_bang = eh->ah.entry < 0x100000000; + bprm->file = file; bprm->loader = loader; retval = prepare_binprm(bprm); @@ -783,6 +798,9 @@ } } #endif + /* kernel module loader fixup */ + /* so we don't try to load run modprobe in kernel space. */ + set_fs(USER_DS); for (try=0; try<2; try++) { read_lock(&binfmt_lock); for (fmt = formats ; fmt ; fmt = fmt->next) { @@ -918,26 +936,25 @@ int do_coredump(long signr, struct pt_regs * regs) { struct linux_binfmt * binfmt; - char corename[6+sizeof(current->comm)]; + char corename[6+sizeof(current->comm)+10]; struct file * file; struct inode * inode; + int retval = 0; lock_kernel(); binfmt = current->binfmt; if (!binfmt || !binfmt->core_dump) goto fail; - if (!current->dumpable || atomic_read(¤t->mm->mm_users) != 1) + if (!current->mm->dumpable) goto fail; - current->dumpable = 0; + current->mm->dumpable = 0; if (current->rlim[RLIMIT_CORE].rlim_cur < binfmt->min_coredump) goto fail; memcpy(corename,"core.", 5); -#if 0 - memcpy(corename+5,current->comm,sizeof(current->comm)); -#else corename[4] = '\0'; -#endif + if (core_uses_pid || atomic_read(¤t->mm->mm_users) != 1) + sprintf(&corename[4], ".%d", current->pid); file = filp_open(corename, O_CREAT | 2 | O_NOFOLLOW, 0600); if (IS_ERR(file)) goto fail; @@ -955,15 +972,14 @@ goto close_fail; if (do_truncate(file->f_dentry, 0) != 0) goto close_fail; - if (!binfmt->core_dump(signr, regs, file)) - goto close_fail; - unlock_kernel(); - filp_close(file, NULL); - return 1; + + down_read(¤t->mm->mmap_sem); + retval = binfmt->core_dump(signr, regs, file); + up_read(¤t->mm->mmap_sem); close_fail: filp_close(file, NULL); fail: unlock_kernel(); - return 0; + return retval; } Index: fcntl.c =================================================================== RCS file: /cvsroot/linux-vax/kernel-2.4/fs/fcntl.c,v retrieving revision 1.1.1.1 retrieving revision 1.2 diff -u -r1.1.1.1 -r1.2 --- fcntl.c 14 Jan 2001 16:25:03 -0000 1.1.1.1 +++ fcntl.c 9 Apr 2002 13:11:16 -0000 1.2 @@ -10,6 +10,7 @@ #include <linux/dnotify.h> #include <linux/smp_lock.h> #include <linux/slab.h> +#include <linux/iobuf.h> #include <asm/poll.h> #include <asm/siginfo.h> @@ -194,7 +195,7 @@ return ret; } -#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | FASYNC) +#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | FASYNC | O_DIRECT) static int setfl(int fd, struct file * filp, unsigned long arg) { @@ -217,6 +218,25 @@ 
} } + if (arg & O_DIRECT) { + /* + * alloc_kiovec() can sleep and we are only serialized by + * the big kernel lock here, so abuse the i_sem to serialize + * this case too. We of course wouldn't need to go deep down + * to the inode layer, we could stay at the file layer, but + * we don't want to pay for the memory of a semaphore in each + * file structure too and we use the inode semaphore that we just + * pay for anyways. + */ + error = 0; + down(&inode->i_sem); + if (!filp->f_iobuf) + error = alloc_kiovec(1, &filp->f_iobuf); + up(&inode->i_sem); + if (error < 0) + return error; + } + /* required for strict SunOS emulation */ if (O_NONBLOCK != O_NDELAY) if (arg & O_NDELAY) @@ -338,7 +358,6 @@ if (!filp) goto out; - lock_kernel(); switch (cmd) { case F_GETLK64: err = fcntl_getlk64(fd, (struct flock64 *) arg); @@ -353,7 +372,6 @@ err = do_fcntl(fd, cmd, arg, filp); break; } - unlock_kernel(); fput(filp); out: return err; Index: file_table.c =================================================================== RCS file: /cvsroot/linux-vax/kernel-2.4/fs/file_table.c,v retrieving revision 1.1.1.1 retrieving revision 1.2 diff -u -r1.1.1.1 -r1.2 --- file_table.c 14 Jan 2001 16:24:51 -0000 1.1.1.1 +++ file_table.c 9 Apr 2002 13:11:16 -0000 1.2 @@ -11,6 +11,7 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/smp_lock.h> +#include <linux/iobuf.h> /* sysctl tunables... */ struct files_stat_struct files_stat = {0, 0, NR_FILE}; @@ -66,10 +67,10 @@ goto new_one; } /* Big problems... */ - printk("VFS: filp allocation failed\n"); + printk(KERN_WARNING "VFS: filp allocation failed\n"); } else if (files_stat.max_files > old_max) { - printk("VFS: file-max limit %d reached\n", files_stat.max_files); + printk(KERN_INFO "VFS: file-max limit %d reached\n", files_stat.max_files); old_max = files_stat.max_files; } file_list_unlock(); @@ -104,21 +105,24 @@ if (atomic_dec_and_test(&file->f_count)) { locks_remove_flock(file); + + if (file->f_iobuf) + free_kiovec(1, &file->f_iobuf); + if (file->f_op && file->f_op->release) file->f_op->release(inode, file); fops_put(file->f_op); - file->f_dentry = NULL; - file->f_vfsmnt = NULL; if (file->f_mode & FMODE_WRITE) put_write_access(inode); - dput(dentry); - if (mnt) - mntput(mnt); file_list_lock(); + file->f_dentry = NULL; + file->f_vfsmnt = NULL; list_del(&file->f_list); list_add(&file->f_list, &free_list); files_stat.nr_free_files++; file_list_unlock(); + dput(dentry); + mntput(mnt); } } @@ -158,14 +162,6 @@ file_list_unlock(); } -void file_moveto(struct file *new, struct file *old) -{ - file_list_lock(); - list_del(&new->f_list); - list_add(&new->f_list, &old->f_list); - file_list_unlock(); -} - int fs_may_remount_ro(struct super_block *sb) { struct list_head *p; @@ -174,12 +170,7 @@ file_list_lock(); for (p = sb->s_files.next; p != &sb->s_files; p = p->next) { struct file *file = list_entry(p, struct file, f_list); - struct inode *inode; - - if (!file->f_dentry) - continue; - - inode = file->f_dentry->d_inode; + struct inode *inode = file->f_dentry->d_inode; /* File with pending delete? 
*/ if (inode->i_nlink == 0) Index: filesystems.c =================================================================== RCS file: /cvsroot/linux-vax/kernel-2.4/fs/filesystems.c,v retrieving revision 1.1.1.1 retrieving revision 1.2 diff -u -r1.1.1.1 -r1.2 --- filesystems.c 14 Jan 2001 16:25:21 -0000 1.1.1.1 +++ filesystems.c 9 Apr 2002 13:11:16 -0000 1.2 @@ -7,36 +7,11 @@ */ #include <linux/config.h> -#include <linux/fs.h> - -#include <linux/devfs_fs_kernel.h> -#include <linux/nfs_fs.h> -#include <linux/auto_fs.h> -#include <linux/devpts_fs.h> -#include <linux/major.h> -#include <linux/smp.h> +#include <linux/module.h> +#include <linux/sched.h> #include <linux/smp_lock.h> #include <linux/kmod.h> -#include <linux/init.h> -#include <linux/module.h> #include <linux/nfsd/interface.h> - -#ifdef CONFIG_DEVPTS_FS -extern int init_devpts_fs(void); -#endif - -void __init filesystem_setup(void) -{ - init_devfs_fs(); /* Header file may make this empty */ - -#ifdef CONFIG_NFS_FS - init_nfs_fs(); -#endif - -#ifdef CONFIG_DEVPTS_FS - init_devpts_fs(); -#endif -} #if defined(CONFIG_NFSD_MODULE) struct nfsd_linkage *nfsd_linkage = NULL; Index: inode.c =================================================================== RCS file: /cvsroot/linux-vax/kernel-2.4/fs/inode.c,v retrieving revision 1.1.1.2 retrieving revision 1.2 diff -u -r1.1.1.2 -r1.2 --- inode.c 25 Feb 2001 23:14:45 -0000 1.1.1.2 +++ inode.c 9 Apr 2002 13:11:16 -0000 1.2 @@ -13,6 +13,10 @@ #include <linux/quotaops.h> #include <linux/slab.h> #include <linux/cache.h> +#include <linux/swap.h> +#include <linux/swapctl.h> +#include <linux/prefetch.h> +#include <linux/locks.h> /* * New inode.c implementation. @@ -62,7 +66,7 @@ * NOTE! You also have to own the lock if you change * the i_state of an inode while it is in use.. */ -spinlock_t inode_lock = SPIN_LOCK_UNLOCKED; +static spinlock_t inode_lock = SPIN_LOCK_UNLOCKED; /* * Statistics gathering.. @@ -75,7 +79,7 @@ ((struct inode *) kmem_cache_alloc(inode_cachep, SLAB_KERNEL)) static void destroy_inode(struct inode *inode) { - if (!list_empty(&inode->i_dirty_buffers)) + if (inode_has_buffers(inode)) BUG(); kmem_cache_free(inode_cachep, (inode)); } @@ -101,6 +105,8 @@ INIT_LIST_HEAD(&inode->i_data.locked_pages); INIT_LIST_HEAD(&inode->i_dentry); INIT_LIST_HEAD(&inode->i_dirty_buffers); + INIT_LIST_HEAD(&inode->i_dirty_data_buffers); + INIT_LIST_HEAD(&inode->i_devices); sema_init(&inode->i_sem, 1); sema_init(&inode->i_zombie, 1); spin_lock_init(&inode->i_data.i_shared_lock); @@ -123,36 +129,38 @@ /** * __mark_inode_dirty - internal function * @inode: inode to mark - * - * Mark an inode as dirty. Callers should use mark_inode_dirty. + * @flags: what kind of dirty (i.e. I_DIRTY_SYNC) + * Mark an inode as dirty. Callers should use mark_inode_dirty or + * mark_inode_dirty_sync. 
*/ void __mark_inode_dirty(struct inode *inode, int flags) { struct super_block * sb = inode->i_sb; - if (sb) { - /* Don't do this for I_DIRTY_PAGES - that doesn't actually dirty the inode itself */ - if (flags & (I_DIRTY | I_DIRTY_SYNC)) { - if (sb->s_op && sb->s_op->dirty_inode) - sb->s_op->dirty_inode(inode); - } + if (!sb) + return; - /* avoid the locking if we can */ - if ((inode->i_state & flags) == flags) - return; + /* Don't do this for I_DIRTY_PAGES - that doesn't actually dirty the inode itself */ + if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { + if (sb->s_op && sb->s_op->dirty_inode) + sb->s_op->dirty_inode(inode); + } - spin_lock(&inode_lock); - if ((inode->i_state & flags) != flags) { - inode->i_state |= flags; - /* Only add valid (ie hashed) inodes to the dirty list */ - if (!list_empty(&inode->i_hash)) { - list_del(&inode->i_list); - list_add(&inode->i_list, &sb->s_dirty); - } + /* avoid the locking if we can */ + if ((inode->i_state & flags) == flags) + return; + + spin_lock(&inode_lock); + if ((inode->i_state & flags) != flags) { + inode->i_state |= flags; + /* Only add valid (ie hashed) inodes to the dirty list */ + if (!(inode->i_state & I_LOCK) && !list_empty(&inode->i_hash)) { + list_del(&inode->i_list); + list_add(&inode->i_list, &sb->s_dirty); } - spin_unlock(&inode_lock); } + spin_unlock(&inode_lock); } static void __wait_on_inode(struct inode * inode) @@ -179,7 +187,7 @@ static inline void write_inode(struct inode *inode, int sync) { - if (inode->i_sb && inode->i_sb->s_op && inode->i_sb->s_op->write_inode) + if (inode->i_sb && inode->i_sb->s_op && inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) inode->i_sb->s_op->write_inode(inode, sync); } @@ -190,13 +198,53 @@ return; } atomic_inc(&inode->i_count); - if (!(inode->i_state & I_DIRTY)) { + if (!(inode->i_state & (I_DIRTY|I_LOCK))) { list_del(&inode->i_list); list_add(&inode->i_list, &inode_in_use); } inodes_stat.nr_unused--; } +static inline void __sync_one(struct inode *inode, int sync) +{ + unsigned dirty; + + list_del(&inode->i_list); + list_add(&inode->i_list, &inode->i_sb->s_locked_inodes); + + if (inode->i_state & I_LOCK) + BUG(); + + /* Set I_LOCK, reset I_DIRTY */ + dirty = inode->i_state & I_DIRTY; + inode->i_state |= I_LOCK; + inode->i_state &= ~I_DIRTY; + spin_unlock(&inode_lock); + + filemap_fdatasync(inode->i_mapping); + + /* Don't write the inode if only I_DIRTY_PAGES was set */ + if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) + write_inode(inode, sync); + + filemap_fdatawait(inode->i_mapping); + + spin_lock(&inode_lock); + inode->i_state &= ~I_LOCK; + if (!(inode->i_state & I_FREEING)) { + struct list_head *to; + if (inode->i_state & I_DIRTY) + to = &inode->i_sb->s_dirty; + else if (atomic_read(&inode->i_count)) + to = &inode_in_use; + else + to = &inode_unused; + list_del(&inode->i_list); + list_add(&inode->i_list, to); + } + wake_up(&inode->i_wait); +} + static inline void sync_one(struct inode *inode, int sync) { if (inode->i_state & I_LOCK) { @@ -206,38 +254,117 @@ iput(inode); spin_lock(&inode_lock); } else { - unsigned dirty; + __sync_one(inode, sync); + } +} - list_del(&inode->i_list); - list_add(&inode->i_list, atomic_read(&inode->i_count) - ? 
&inode_in_use - : &inode_unused); - /* Set I_LOCK, reset I_DIRTY */ - dirty = inode->i_state & I_DIRTY; - inode->i_state |= I_LOCK; - inode->i_state &= ~I_DIRTY; +static inline void sync_list(struct list_head *head) +{ + struct list_head * tmp; + + while ((tmp = head->prev) != head) + __sync_one(list_entry(tmp, struct inode, i_list), 0); +} + +static inline void wait_on_locked(struct list_head *head) +{ + struct list_head * tmp; + while ((tmp = head->prev) != head) { + struct inode *inode = list_entry(tmp, struct inode, i_list); + __iget(inode); spin_unlock(&inode_lock); + __wait_on_inode(inode); + iput(inode); + spin_lock(&inode_lock); + } +} - filemap_fdatasync(inode->i_mapping); +static inline int try_to_sync_unused_list(struct list_head *head, int nr_inodes) +{ + struct list_head *tmp = head; + struct inode *inode; - /* Don't write the inode if only I_DIRTY_PAGES was set */ - if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) - write_inode(inode, sync); + while (nr_inodes && (tmp = tmp->prev) != head) { + inode = list_entry(tmp, struct inode, i_list); - filemap_fdatawait(inode->i_mapping); + if (!atomic_read(&inode->i_count)) { + __sync_one(inode, 0); + nr_inodes--; - spin_lock(&inode_lock); - inode->i_state &= ~I_LOCK; - wake_up(&inode->i_wait); + /* + * __sync_one moved the inode to another list, + * so we have to start looking from the list head. + */ + tmp = head; + } + } + + return nr_inodes; +} + +void sync_inodes_sb(struct super_block *sb) +{ + spin_lock(&inode_lock); + while (!list_empty(&sb->s_dirty)||!list_empty(&sb->s_locked_inodes)) { + sync_list(&sb->s_dirty); + wait_on_locked(&sb->s_locked_inodes); } + spin_unlock(&inode_lock); } -static inline void sync_list(struct list_head *head) +/* + * Note: + * We don't need to grab a reference to superblock here. If it has non-empty + * ->s_dirty it's hadn't been killed yet and kill_super() won't proceed + * past sync_inodes_sb() until both ->s_dirty and ->s_locked_inodes are + * empty. Since __sync_one() regains inode_lock before it finally moves + * inode from superblock lists we are OK. + */ + +void sync_unlocked_inodes(void) { - struct list_head * tmp; + struct super_block * sb; + spin_lock(&inode_lock); + spin_lock(&sb_lock); + sb = sb_entry(super_blocks.next); + for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.next)) { + if (!list_empty(&sb->s_dirty)) { + spin_unlock(&sb_lock); + sync_list(&sb->s_dirty); + spin_lock(&sb_lock); + } + } + spin_unlock(&sb_lock); + spin_unlock(&inode_lock); +} - while ((tmp = head->prev) != head) - sync_one(list_entry(tmp, struct inode, i_list), 0); +/* + * Find a superblock with inodes that need to be synced + */ + +static struct super_block *get_super_to_sync(void) +{ + struct list_head *p; +restart: + spin_lock(&inode_lock); + spin_lock(&sb_lock); + list_for_each(p, &super_blocks) { + struct super_block *s = list_entry(p,struct super_block,s_list); + if (list_empty(&s->s_dirty) && list_empty(&s->s_locked_inodes)) + continue; + s->s_count++; + spin_unlock(&sb_lock); + spin_unlock(&inode_lock); + down_read(&s->s_umount); + if (!s->s_root) { + drop_super(s); + goto restart; + } + return s; + } + spin_unlock(&sb_lock); + spin_unlock(&inode_lock); + return NULL; } /** @@ -247,42 +374,48 @@ * sync_inodes goes through the super block's dirty list, * writes them out, and puts them back on the normal list. */ - + void sync_inodes(kdev_t dev) { - struct super_block * sb = sb_entry(super_blocks.next); + struct super_block * s; /* * Search the super_blocks array for the device(s) to sync. 
*/ - spin_lock(&inode_lock); - for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.next)) { - if (!sb->s_dev) - continue; - if (dev && sb->s_dev != dev) - continue; - - sync_list(&sb->s_dirty); - - if (dev) - break; + if (dev) { + if ((s = get_super(dev)) != NULL) { + sync_inodes_sb(s); + drop_super(s); + } + } else { + while ((s = get_super_to_sync()) != NULL) { + sync_inodes_sb(s); + drop_super(s); + } } - spin_unlock(&inode_lock); } -/* - * Called with the spinlock already held.. - */ -static void sync_all_inodes(void) +static void try_to_sync_unused_inodes(void * arg) { - struct super_block * sb = sb_entry(super_blocks.next); - for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.next)) { - if (!sb->s_dev) + struct super_block * sb; + int nr_inodes = inodes_stat.nr_unused; + + spin_lock(&inode_lock); + spin_lock(&sb_lock); + sb = sb_entry(super_blocks.next); + for (; nr_inodes && sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.next)) { + if (list_empty(&sb->s_dirty)) continue; - sync_list(&sb->s_dirty); + spin_unlock(&sb_lock); + nr_inodes = try_to_sync_unused_list(&sb->s_dirty, nr_inodes); + spin_lock(&sb_lock); } + spin_unlock(&sb_lock); + spin_unlock(&inode_lock); } +static struct tq_struct unused_inodes_flush_task; + /** * write_inode_now - write an inode to disk * @inode: inode to write to disk @@ -301,9 +434,11 @@ while (inode->i_state & I_DIRTY) sync_one(inode, sync); spin_unlock(&inode_lock); + if (sync) + wait_on_inode(inode); } else - printk("write_inode_now: no super block\n"); + printk(KERN_ERR "write_inode_now: no super block\n"); } /** @@ -315,9 +450,9 @@ * O_SYNC flag set, to flush dirty writes to disk. */ -int generic_osync_inode(struct inode *inode, int datasync) +int generic_osync_inode(struct inode *inode, int what) { - int err; + int err = 0, err2 = 0, need_write_inode_now = 0; /* * WARNING @@ -340,23 +475,24 @@ * every O_SYNC write, not just the synchronous I/Os. 
--sct */ -#ifdef WRITERS_QUEUE_IO - err = osync_inode_buffers(inode); -#else - err = fsync_inode_buffers(inode); -#endif + if (what & OSYNC_METADATA) + err = fsync_inode_buffers(inode); + if (what & OSYNC_DATA) + err2 = fsync_inode_data_buffers(inode); + if (!err) + err = err2; spin_lock(&inode_lock); - if (!(inode->i_state & I_DIRTY)) - goto out; - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) - goto out; + if ((inode->i_state & I_DIRTY) && + ((what & OSYNC_INODE) || (inode->i_state & I_DIRTY_DATASYNC))) + need_write_inode_now = 1; spin_unlock(&inode_lock); - write_inode_now(inode, 1); - return err; - out: - spin_unlock(&inode_lock); + if (need_write_inode_now) + write_inode_now(inode, 1); + else + wait_on_inode(inode); + return err; } @@ -371,8 +507,7 @@ void clear_inode(struct inode *inode) { - if (!list_empty(&inode->i_dirty_buffers)) - invalidate_inode_buffers(inode); + invalidate_inode_buffers(inode); if (inode->i_data.nrpages) BUG(); @@ -381,13 +516,14 @@ if (inode->i_state & I_CLEAR) BUG(); wait_on_inode(inode); - if (IS_QUOTAINIT(inode)) - DQUOT_DROP(inode); + DQUOT_DROP(inode); if (inode->i_sb && inode->i_sb->s_op && inode->i_sb->s_op->clear_inode) inode->i_sb->s_op->clear_inode(inode); - if (inode->i_bdev) { - bdput(inode->i_bdev); - inode->i_bdev = NULL; + if (inode->i_bdev) + bd_forget(inode); + else if (inode->i_cdev) { + cdput(inode->i_cdev); + inode->i_cdev = NULL; } inode->i_state = I_CLEAR; } @@ -435,8 +571,7 @@ continue; invalidate_inode_buffers(inode); if (!atomic_read(&inode->i_count)) { - list_del(&inode->i_hash); - INIT_LIST_HEAD(&inode->i_hash); + list_del_init(&inode->i_hash); list_del(&inode->i_list); list_add(&inode->i_list, dispose); inode->i_state |= I_FREEING; @@ -476,12 +611,39 @@ busy = invalidate_list(&inode_in_use, sb, &throw_away); busy |= invalidate_list(&inode_unused, sb, &throw_away); busy |= invalidate_list(&sb->s_dirty, sb, &throw_away); + busy |= invalidate_list(&sb->s_locked_inodes, sb, &throw_away); spin_unlock(&inode_lock); dispose_list(&throw_away); return busy; } + +int invalidate_device(kdev_t dev, int do_sync) +{ + struct super_block *sb; + int res; + + if (do_sync) + fsync_dev(dev); + + res = 0; + sb = get_super(dev); + if (sb) { + /* + * no need to lock the super, get_super holds the + * read semaphore so the filesystem cannot go away + * under us (->put_super runs with the write lock + * hold). + */ + shrink_dcache_sb(sb); + res = invalidate_inodes(sb); + drop_super(sb); + } + invalidate_buffers(dev); + return res; +} + /* * This is called with the inode lock held. It searches @@ -503,13 +665,12 @@ { LIST_HEAD(list); struct list_head *entry, *freeable = &list; - int count = 0; + int count; struct inode * inode; spin_lock(&inode_lock); - /* go simple and safe syncing everything before starting */ - sync_all_inodes(); + count = 0; entry = inode_unused.prev; while (entry != &inode_unused) { @@ -517,12 +678,12 @@ entry = entry->prev; inode = INODE(tmp); - if (inode->i_state & (I_FREEING|I_CLEAR)) - BUG(); + if (inode->i_state & (I_FREEING|I_CLEAR|I_LOCK)) + continue; if (!CAN_UNUSE(inode)) continue; if (atomic_read(&inode->i_count)) - BUG(); + continue; list_del(tmp); list_del(&inode->i_hash); INIT_LIST_HEAD(&inode->i_hash); @@ -536,9 +697,18 @@ spin_unlock(&inode_lock); dispose_list(freeable); + + /* + * If we didn't freed enough clean inodes schedule + * a sync of the dirty inodes, we cannot do it + * from here or we're either synchronously dogslow + * or we deadlock with oom. 
+ */ + if (goal) + schedule_task(&unused_inodes_flush_task); } -void shrink_icache_memory(int priority, int gfp_mask) +int shrink_icache_memory(int priority, int gfp_mask) { int count = 0; @@ -549,14 +719,14 @@ * want to recurse into the FS that called us * in clear_inode() and friends.. */ - if (!(gfp_mask & __GFP_IO)) - return; + if (!(gfp_mask & __GFP_FS)) + return 0; - if (priority) - count = inodes_stat.nr_unused / priority; + count = inodes_stat.nr_unused / priority; prune_icache(count); kmem_cache_shrink(inode_cachep); + return 0; } /* @@ -607,12 +777,15 @@ inode->i_nlink = 1; atomic_set(&inode->i_writecount, 0); inode->i_size = 0; + inode->i_blocks = 0; inode->i_generation = 0; memset(&inode->i_dquot, 0, sizeof(inode->i_dquot)); inode->i_pipe = NULL; inode->i_bdev = NULL; + inode->i_cdev = NULL; inode->i_data.a_ops = &empty_aops; inode->i_data.host = inode; + inode->i_data.gfp_mask = GFP_HIGHUSER; inode->i_mapping = &inode->i_data; } @@ -634,6 +807,8 @@ static unsigned long last_ino; struct inode * inode; + spin_lock_prefetch(&inode_lock); + inode = alloc_inode(); if (inode) { @@ -642,6 +817,7 @@ list_add(&inode->i_list, &inode_in_use); inode->i_sb = NULL; inode->i_dev = 0; + inode->i_blkbits = 0; inode->i_ino = ++last_ino; inode->i_flags = 0; atomic_set(&inode->i_count, 1); @@ -675,6 +851,7 @@ list_add(&inode->i_hash, head); inode->i_sb = sb; inode->i_dev = sb->s_dev; + inode->i_blkbits = sb->s_blocksize_bits; inode->i_ino = ino; inode->i_flags = 0; atomic_set(&inode->i_count, 1); @@ -781,8 +958,6 @@ */ inode = NULL; spin_unlock(&inode_lock); - if (inode) - wait_on_inode(inode); return inode; } @@ -853,10 +1028,14 @@ void iput(struct inode *inode) { if (inode) { + struct super_block *sb = inode->i_sb; struct super_operations *op = NULL; - if (inode->i_sb && inode->i_sb->s_op) - op = inode->i_sb->s_op; + if (inode->i_state == I_CLEAR) + BUG(); + + if (sb && sb->s_op) + op = sb->s_op; if (op && op->put_inode) op->put_inode(inode); @@ -877,6 +1056,8 @@ if (op && op->delete_inode) { void (*delete)(struct inode *) = op->delete_inode; + if (!is_bad_inode(inode)) + DQUOT_INIT(inode); /* s_op->delete_inode internally recalls clear_inode() */ delete(inode); } else @@ -884,22 +1065,22 @@ if (inode->i_state != I_CLEAR) BUG(); } else { - if (!list_empty(&inode->i_hash)) { - if (!(inode->i_state & I_DIRTY)) { + if (!list_empty(&inode->i_hash) && sb && sb->s_root) { + if (!(inode->i_state & (I_DIRTY|I_LOCK))) { list_del(&inode->i_list); - list_add(&inode->i_list, - &inode_unused); + list_add(&inode->i_list, &inode_unused); } inodes_stat.nr_unused++; spin_unlock(&inode_lock); return; } else { - /* magic nfs path */ - list_del(&inode->i_list); - INIT_LIST_HEAD(&inode->i_list); + list_del_init(&inode->i_list); + list_del_init(&inode->i_hash); inode->i_state|=I_FREEING; inodes_stat.nr_inodes--; spin_unlock(&inode_lock); + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); clear_inode(inode); } } @@ -988,6 +1169,8 @@ NULL); if (!inode_cachep) panic("cannot create inode slab cache"); + + unused_inodes_flush_task.routine = try_to_sync_unused_inodes; } /** @@ -1018,38 +1201,40 @@ void put_dquot_list(struct list_head *); int remove_inode_dquot_ref(struct inode *, short, struct list_head *); -void remove_dquot_ref(kdev_t dev, short type) +void remove_dquot_ref(struct super_block *sb, short type) { - struct super_block *sb = get_super(dev); struct inode *inode; struct list_head *act_head; LIST_HEAD(tofree_head); - if (!sb || !sb->dq_op) + if (!sb->dq_op) return; /* nothing to do */ - /* 
We have to be protected against other CPUs */ - spin_lock(&inode_lock); + lock_kernel(); /* This lock is for quota code */ + spin_lock(&inode_lock); /* This lock is for inodes code */ - for (act_head = inode_in_use.next; act_head != &inode_in_use; act_head = act_head->next) { + list_for_each(act_head, &inode_in_use) { inode = list_entry(act_head, struct inode, i_list); - if (inode->i_sb != sb || !IS_QUOTAINIT(inode)) - continue; - remove_inode_dquot_ref(inode, type, &tofree_head); + if (inode->i_sb == sb && IS_QUOTAINIT(inode)) + remove_inode_dquot_ref(inode, type, &tofree_head); } - for (act_head = inode_unused.next; act_head != &inode_unused; act_head = act_head->next) { + list_for_each(act_head, &inode_unused) { inode = list_entry(act_head, struct inode, i_list); - if (inode->i_sb != sb || !IS_QUOTAINIT(inode)) - continue; - remove_inode_dquot_ref(inode, type, &tofree_head); + if (inode->i_sb == sb && IS_QUOTAINIT(inode)) + remove_inode_dquot_ref(inode, type, &tofree_head); } - for (act_head = sb->s_dirty.next; act_head != &sb->s_dirty; act_head = act_head->next) { + list_for_each(act_head, &sb->s_dirty) { inode = list_entry(act_head, struct inode, i_list); - if (!IS_QUOTAINIT(inode)) - continue; - remove_inode_dquot_ref(inode, type, &tofree_head); + if (IS_QUOTAINIT(inode)) + remove_inode_dquot_ref(inode, type, &tofree_head); + } + list_for_each(act_head, &sb->s_locked_inodes) { + inode = list_entry(act_head, struct inode, i_list); + if (IS_QUOTAINIT(inode)) + remove_inode_dquot_ref(inode, type, &tofree_head); } spin_unlock(&inode_lock); + unlock_kernel(); put_dquot_list(&tofree_head); } Index: iobuf.c =================================================================== RCS file: /cvsroot/linux-vax/kernel-2.4/fs/iobuf.c,v retrieving revision 1.1.1.2 retrieving revision 1.2 diff -u -r1.1.1.2 -r1.2 --- iobuf.c 25 Feb 2001 23:14:46 -0000 1.1.1.2 +++ iobuf.c 9 Apr 2002 13:11:16 -0000 1.2 @@ -8,9 +8,7 @@ #include <linux/iobuf.h> #include <linux/slab.h> - -static kmem_cache_t *kiobuf_cachep; - +#include <linux/vmalloc.h> void end_kio_request(struct kiobuf *kiobuf, int uptodate) { @@ -24,18 +22,7 @@ } } - -void __init kiobuf_setup(void) -{ - kiobuf_cachep = kmem_cache_create("kiobuf", - sizeof(struct kiobuf), - 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); - if(!kiobuf_cachep) - panic("Cannot create kernel iobuf cache\n"); -} - -void kiobuf_init(struct kiobuf *iobuf) +static void kiobuf_init(struct kiobuf *iobuf) { memset(iobuf, 0, sizeof(*iobuf)); init_waitqueue_head(&iobuf->wait_queue); @@ -43,19 +30,49 @@ iobuf->maplist = iobuf->map_array; } +int alloc_kiobuf_bhs(struct kiobuf * kiobuf) +{ + int i; + + for (i = 0; i < KIO_MAX_SECTORS; i++) + if (!(kiobuf->bh[i] = kmem_cache_alloc(bh_cachep, SLAB_KERNEL))) { + while (i--) { + kmem_cache_free(bh_cachep, kiobuf->bh[i]); + kiobuf->bh[i] = NULL; + } + return -ENOMEM; + } + return 0; +} + +void free_kiobuf_bhs(struct kiobuf * kiobuf) +{ + int i; + + for (i = 0; i < KIO_MAX_SECTORS; i++) { + kmem_cache_free(bh_cachep, kiobuf->bh[i]); + kiobuf->bh[i] = NULL; + } +} + int alloc_kiovec(int nr, struct kiobuf **bufp) { int i; struct kiobuf *iobuf; for (i = 0; i < nr; i++) { - iobuf = kmem_cache_alloc(kiobuf_cachep, SLAB_KERNEL); + iobuf = vmalloc(sizeof(struct kiobuf)); if (!iobuf) { free_kiovec(i, bufp); return -ENOMEM; } kiobuf_init(iobuf); - *bufp++ = iobuf; + if (alloc_kiobuf_bhs(iobuf)) { + vfree(iobuf); + free_kiovec(i, bufp); + return -ENOMEM; + } + bufp[i] = iobuf; } return 0; @@ -72,7 +89,8 @@ unlock_kiovec(1, &iobuf); if (iobuf->array_len > 
KIO_STATIC_PAGES) kfree (iobuf->maplist); - kmem_cache_free(kiobuf_cachep, bufp[i]); + free_kiobuf_bhs(iobuf); + vfree(bufp[i]); } } @@ -115,11 +133,12 @@ add_wait_queue(&kiobuf->wait_queue, &wait); repeat: - run_task_queue(&tq_disk); set_task_state(tsk, TASK_UNINTERRUPTIBLE); if (atomic_read(&kiobuf->io_count) != 0) { + run_task_queue(&tq_disk); schedule(); - goto repeat; + if (atomic_read(&kiobuf->io_count) != 0) + goto repeat; } tsk->state = TASK_RUNNING; remove_wait_queue(&kiobuf->wait_queue, &wait); |
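
A recurring cleanup in this sync (dcache.c and inode.c) replaces the
open-coded pair list_del() + INIT_LIST_HEAD() with the list_del_init()
helper from <linux/list.h>. The helper is equivalent to the two calls it
replaces; roughly, as a sketch (the real inline routes through the
internal __list_del() helper):

	static inline void list_del_init(struct list_head *entry)
	{
		/* unlink entry from whatever list it is currently on */
		entry->next->prev = entry->prev;
		entry->prev->next = entry->next;
		/* re-point entry at itself so list_empty(entry) is true */
		entry->next = entry;
		entry->prev = entry;
	}

The reinitialisation is exactly what the '/* make "list_empty()" work */'
comment removed from dcache.c was there for: the entry can be tested and
re-linked later without tripping over stale pointers.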
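
In the same spirit, the rewritten remove_dquot_ref() and the new
get_super_to_sync() walk their lists with the list_for_each() iterator
instead of hand-rolled next-pointer loops. As a sketch of the 2.4-era
macro (the stock kernel version also adds prefetch() hints, omitted here):

	/* iterate pos over every entry of the list anchored at head */
	#define list_for_each(pos, head) \
		for (pos = (head)->next; pos != (head); pos = pos->next)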
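
The fcntl.c change teaches F_SETFL to accept O_DIRECT: the first time the
flag is set on a file, setfl() allocates the per-file kiovec
(filp->f_iobuf) under inode->i_sem so that later direct I/O has its
buffers ready. From userspace the new path is reached by the usual
read-modify-write of the status flags; a minimal, hypothetical snippet
(set_odirect() and fd are illustrative names, and glibc wants _GNU_SOURCE
defined before O_DIRECT is visible):

	#define _GNU_SOURCE
	#include <fcntl.h>

	int set_odirect(int fd)
	{
		int flags = fcntl(fd, F_GETFL);	/* current status flags */
		if (flags < 0)
			return -1;
		/* F_SETFL with O_DIRECT lands in the new setfl() code */
		return fcntl(fd, F_SETFL, flags | O_DIRECT);
	}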