From: Anand V. A. <av...@re...> - 2012-07-19 09:12:34
|
This patch implements readdirplus support in FUSE, similar to NFS. The payload returned in the readdirplus call contains a 'fuse_entry_out' structure, thereby providing all the necessary inputs for 'faking' a lookup() operation on the spot. If the dentry and inode already existed (e.g. in a re-run of ls -l) then just the inode attribute timeout and dentry timeout are refreshed. With a simple client->network->server implementation of a FUSE-based filesystem, the following performance observations were made: Test: Performing a filesystem crawl over 20,000 files with sh# time ls -lR /mnt Without readdirplus: Run 1: 18.1s Run 2: 16.0s Run 3: 16.2s With readdirplus: Run 1: 4.1s Run 2: 3.8s Run 3: 3.8s The performance improvement is significant as it avoids 20,000 lookup upcalls. Cache consistency is no worse than it already is. RFC: 1. Is it preferred to implement this support as an init flag, or to autodetect with ENOSYS and fall back to readdir? 2. When an inode link cannot be performed, we need to send a FORGET for the entry's nodeid. However, to guarantee delivery of the FORGET, we need to allocate a fuse_forget_link in advance. However, for readdir-like calls we do not know how many entries (and therefore how many fuse_forget_link structures) there will be. The code currently performs a best effort by trying to allocate on demand; however, this might be useless because the need to send the FORGET is very likely caused by a failure to allocate some other memory. Is it possible somehow to allocate just one fuse_forget_link at the start of the call and queue+wait for every failed inode linkage? Signed-off-by: Anand V. 
Avati <av...@re...> --- fs/fuse/dir.c | 175 +++++++++++++++++++++++++++++++++++++++++++++++-- fs/fuse/fuse_i.h | 3 + fs/fuse/inode.c | 4 +- include/linux/fuse.h | 15 ++++- 4 files changed, 186 insertions(+), 11 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 334e0b1..6fdb454 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1096,10 +1096,14 @@ static int fuse_permission(struct inode *inode, int mask) static int parse_dirfile(char *buf, size_t nbytes, struct file *file, void *dstbuf, filldir_t filldir) { - while (nbytes >= FUSE_NAME_OFFSET) { - struct fuse_dirent *dirent = (struct fuse_dirent *) buf; - size_t reclen = FUSE_DIRENT_SIZE(dirent); - int over; + struct fuse_dirent *dirent; + size_t reclen; + int over; + + while (nbytes >= FUSE_NAME_OFFSET(dirent)) { + dirent = (struct fuse_dirent *) buf; + reclen = FUSE_DIRENT_SIZE(dirent); + if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) return -EIO; if (reclen > nbytes) @@ -1118,6 +1122,151 @@ static int parse_dirfile(char *buf, size_t nbytes, struct file *file, return 0; } +void fuse_alloc_and_forget(struct fuse_conn *fc, u64 nodeid) +{ + struct fuse_forget_link *forget; + + forget = fuse_alloc_forget(); + if (!forget) + return; + fuse_queue_forget(fc, forget, nodeid, 1); +} + +static int fuse_direntp_link(struct dentry *parent, + struct fuse_direntp *direntp) +{ + int ret = -1; + struct qstr name = QSTR_INIT(direntp->name, direntp->namelen); + struct dentry *dentry; + struct dentry *alias; + struct inode *dir = parent->d_inode; + struct fuse_conn *fc; + struct fuse_entry_out *feo = &direntp->feo; + struct inode *inode; + struct fuse_inode *fi; + u64 attr_version; + + if (!direntp->feo.nodeid) { + /* Unlike in the case of fuse_lookup, zero nodeid does + not mean ENOENT. Instead, it only means the userspace + filesystem did not want to return attributes/handle for + this entry. + + So do nothing. 
+ */ + return 0; + } + + if (name.name[0] == '.') { + /* We could potentially refresh the attributes of the + directory and its parent? + */ + if (name.len == 1) + return 0; + if (name.name[1] == '.' && name.len == 2) + return 0; + } + name.hash = full_name_hash(name.name, name.len); + + dentry = d_lookup(parent, &name); + if (dentry && dentry->d_inode) { + inode = dentry->d_inode; + if (get_node_id(inode) == direntp->feo.nodeid) { + goto found; + } else { + fuse_alloc_and_forget(fc, get_node_id(inode)); + d_drop(dentry); + dput(dentry); + dentry = NULL; + } + } + + dentry = d_alloc(parent, &name); + if (!dentry) + goto out; + + inode = fuse_iget(dir->i_sb, feo->nodeid, feo->generation, + &feo->attr, entry_attr_timeout(feo), + attr_version); + if (!inode || IS_ERR(inode)) + goto out; + + alias = d_materialise_unique(dentry, inode); + if (IS_ERR(alias)) + goto out; + +found: + fi = get_fuse_inode(inode); + + fc = get_fuse_conn(dir); + + attr_version = fuse_get_attr_version(fc); + + fuse_change_attributes(inode, &feo->attr, + entry_attr_timeout(feo), + attr_version); + if (alias) + fuse_change_entry_timeout(alias, feo); + else + fuse_change_entry_timeout(dentry, feo); + + spin_lock(&fc->lock); + fi->nlookup++; + spin_unlock(&fc->lock); + + ret = 0; +out: + if (dentry) + dput(dentry); + if (alias) + dput(alias); + return ret; +} + +static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, + void *dstbuf, filldir_t filldir) +{ + struct fuse_direntp *direntp; + size_t reclen; + int over = 0; + int ret; + + while (nbytes >= FUSE_NAME_OFFSET(direntp)) { + direntp = (struct fuse_direntp *) buf; + reclen = FUSE_DIRENT_SIZE(direntp); + + if (!direntp->namelen || direntp->namelen > FUSE_NAME_MAX) + return -EIO; + if (reclen > nbytes) + break; + + if (!over) { + /* We fill entries into dstbuf only as much as + it can hold. But we still continue iterating + over remaining entries to link them. 
If not, + we need to send a FORGET for each of those + which we did not link. + */ + over = filldir(dstbuf, direntp->name, direntp->namelen, + file->f_pos, direntp->ino, + direntp->type); + file->f_pos = direntp->off; + } + + buf += reclen; + nbytes -= reclen; + + ret = fuse_direntp_link(file->f_path.dentry, direntp); + if (ret) { + struct fuse_conn *fc; + fc = get_fuse_conn(file->f_path.dentry->d_inode); + fuse_alloc_and_forget(fc, direntp->feo.nodeid); + } + } + + return 0; +} + static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) { int err; @@ -1142,14 +1291,24 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) req->out.argpages = 1; req->num_pages = 1; req->pages[0] = page; - fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, FUSE_READDIR); + if (fc->do_readdirplus) + fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, + FUSE_READDIRPLUS); + else + fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, + FUSE_READDIR); fuse_request_send(fc, req); nbytes = req->out.args[0].size; err = req->out.h.error; fuse_put_request(fc, req); - if (!err) - err = parse_dirfile(page_address(page), nbytes, file, dstbuf, - filldir); + if (!err) { + if (fc->do_readdirplus) + err = parse_dirplusfile(page_address(page), nbytes, + file, dstbuf, filldir); + else + err = parse_dirfile(page_address(page), nbytes, file, + dstbuf, filldir); + } __free_page(page); fuse_invalidate_attr(inode); /* atime changed */ diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 771fb63..2ed4259 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -484,6 +484,9 @@ struct fuse_conn { /** Is fallocate not implemented by fs? */ unsigned no_fallocate:1; + /** Does the filesystem support readdir-plus? 
*/ + unsigned do_readdirplus:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 1cd6165..5a10c8c 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -834,6 +834,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->big_writes = 1; if (arg->flags & FUSE_DONT_MASK) fc->dont_mask = 1; + if (arg->flags & FUSE_DO_READDIRPLUS) + fc->do_readdirplus = 1; } else { ra_pages = fc->max_read / PAGE_CACHE_SIZE; fc->no_lock = 1; @@ -859,7 +861,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE; arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | - FUSE_FLOCK_LOCKS; + FUSE_FLOCK_LOCKS | FUSE_DO_READDIRPLUS; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); diff --git a/include/linux/fuse.h b/include/linux/fuse.h index 9303348..3be2f39 100644 --- a/include/linux/fuse.h +++ b/include/linux/fuse.h @@ -175,6 +175,7 @@ struct fuse_file_lock { #define FUSE_EXPORT_SUPPORT (1 << 4) #define FUSE_BIG_WRITES (1 << 5) #define FUSE_DONT_MASK (1 << 6) +#define FUSE_DO_READDIRPLUS (1 << 7) #define FUSE_FLOCK_LOCKS (1 << 10) /** @@ -282,6 +283,7 @@ enum fuse_opcode { FUSE_NOTIFY_REPLY = 41, FUSE_BATCH_FORGET = 42, FUSE_FALLOCATE = 43, + FUSE_READDIRPLUS = 44, /* CUSE specific operations */ CUSE_INIT = 4096, @@ -608,10 +610,19 @@ struct fuse_dirent { char name[]; }; -#define FUSE_NAME_OFFSET offsetof(struct fuse_dirent, name) +struct fuse_direntp { + __u64 ino; + __u64 off; + __u32 namelen; + __u32 type; + struct fuse_entry_out feo; + char name[]; +}; + +#define FUSE_NAME_OFFSET(d) offsetof(typeof(*d), name) #define FUSE_DIRENT_ALIGN(x) (((x) + sizeof(__u64) - 1) & ~(sizeof(__u64) - 1)) #define FUSE_DIRENT_SIZE(d) \ - FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + (d)->namelen) + 
FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET(d) + (d)->namelen) struct fuse_notify_inval_inode_out { __u64 ino; -- 1.7.4.4 |