diff --git a/include/linux/vfs_compat.h b/include/linux/vfs_compat.h index c4e1771ae..c9fa76ece 100644 --- a/include/linux/vfs_compat.h +++ b/include/linux/vfs_compat.h @@ -94,6 +94,14 @@ bdi_setup_and_register(struct backing_dev_info *bdi,char *name,unsigned int cap) } #endif /* HAVE_BDI && !HAVE_BDI_SETUP_AND_REGISTER */ +/* + * 2.6.38 API change, + * LOOKUP_RCU flag introduced to distinguish rcu-walk from ref-walk cases. + */ +#ifndef LOOKUP_RCU +#define LOOKUP_RCU 0x0 +#endif /* LOOKUP_RCU */ + /* * 3.2-rc1 API change, * Add set_nlink() if it is not exported by the Linux kernel. diff --git a/include/sys/zfs_vfsops.h b/include/sys/zfs_vfsops.h index 4dd46710f..f685c1296 100644 --- a/include/sys/zfs_vfsops.h +++ b/include/sys/zfs_vfsops.h @@ -69,6 +69,7 @@ typedef struct zfs_sb { krwlock_t z_teardown_inactive_lock; list_t z_all_znodes; /* all znodes in the fs */ uint64_t z_nr_znodes; /* number of znodes in the fs */ + unsigned long z_rollback_time;/* last online rollback time */ kmutex_t z_znodes_lock; /* lock for z_all_znodes */ struct inode *z_ctldir; /* .zfs directory inode */ avl_tree_t z_ctldir_snaps; /* .zfs/snapshot entries */ diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h index 0b75d5295..41233547b 100644 --- a/include/sys/zfs_znode.h +++ b/include/sys/zfs_znode.h @@ -216,6 +216,7 @@ typedef struct znode { boolean_t z_is_zvol; /* are we used by the zvol */ boolean_t z_is_mapped; /* are we mmap'ed */ boolean_t z_is_ctldir; /* are we .zfs entry */ + boolean_t z_is_stale; /* are we stale due to rollback? 
*/ struct inode z_inode; /* generic vfs inode */ } znode_t; diff --git a/include/sys/zpl.h b/include/sys/zpl.h index e34b323bd..61a57ef29 100644 --- a/include/sys/zpl.h +++ b/include/sys/zpl.h @@ -28,18 +28,20 @@ #include #include #include +#include #include #include #include /* zpl_inode.c */ extern void zpl_vap_init(vattr_t *vap, struct inode *dir, - struct dentry *dentry, zpl_umode_t mode, cred_t *cr); + zpl_umode_t mode, cred_t *cr); extern const struct inode_operations zpl_inode_operations; extern const struct inode_operations zpl_dir_inode_operations; extern const struct inode_operations zpl_symlink_inode_operations; extern const struct inode_operations zpl_special_inode_operations; +extern dentry_operations_t zpl_dentry_operations; /* zpl_file.c */ extern ssize_t zpl_read_common(struct inode *ip, const char *buf, diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c index 622d865df..b3801d494 100644 --- a/module/zfs/zfs_ctldir.c +++ b/module/zfs/zfs_ctldir.c @@ -197,6 +197,7 @@ zfsctl_inode_alloc(zfs_sb_t *zsb, uint64_t id, zp->z_is_mapped = B_FALSE; zp->z_is_ctldir = B_TRUE; zp->z_is_sa = B_FALSE; + zp->z_is_stale = B_FALSE; ip->i_ino = id; ip->i_mode = (S_IFDIR | S_IRUGO | S_IXUGO); ip->i_uid = 0; diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c index fc5c2ba39..ac5c317ce 100644 --- a/module/zfs/zfs_vfsops.c +++ b/module/zfs/zfs_vfsops.c @@ -1032,7 +1032,7 @@ EXPORT_SYMBOL(zfs_sb_prune); #endif /* HAVE_SHRINK */ /* - * Teardown the zfs_sb_t::z_os. + * Teardown the zfs_sb_t. * * Note, if 'unmounting' if FALSE, we return with the 'z_teardown_lock' * and 'z_teardown_inactive_lock' held. @@ -1053,7 +1053,6 @@ zfs_sb_teardown(zfs_sb_t *zsb, boolean_t unmounting) * for non-snapshots. 
*/ shrink_dcache_sb(zsb->z_parent->z_sb); - (void) spl_invalidate_inodes(zsb->z_parent->z_sb, 0); } /* @@ -1079,25 +1078,26 @@ zfs_sb_teardown(zfs_sb_t *zsb, boolean_t unmounting) } /* - * At this point there are no vops active, and any new vops will - * fail with EIO since we have z_teardown_lock for writer (only - * relavent for forced unmount). + * At this point there are no VFS ops active, and any new VFS ops + * will fail with EIO since we have z_teardown_lock for writer (only + * relevant for forced unmount). * * Release all holds on dbufs. */ mutex_enter(&zsb->z_znodes_lock); for (zp = list_head(&zsb->z_all_znodes); zp != NULL; - zp = list_next(&zsb->z_all_znodes, zp)) + zp = list_next(&zsb->z_all_znodes, zp)) { if (zp->z_sa_hdl) { ASSERT(atomic_read(&ZTOI(zp)->i_count) > 0); zfs_znode_dmu_fini(zp); } + } mutex_exit(&zsb->z_znodes_lock); /* - * If we are unmounting, set the unmounted flag and let new vops + * If we are unmounting, set the unmounted flag and let new VFS ops * unblock. zfs_inactive will have the unmounted behavior, and all - * other vops will fail with EIO. + * other VFS ops will fail with EIO. */ if (unmounting) { zsb->z_unmounted = B_TRUE; @@ -1392,7 +1392,7 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp) EXPORT_SYMBOL(zfs_vget); /* - * Block out VOPs and close zfs_sb_t::z_os + * Block out VFS ops and close zfs_sb_t * * Note, if successful, then we return with the 'z_teardown_lock' and * 'z_teardown_inactive_lock' write held. @@ -1404,6 +1404,7 @@ zfs_suspend_fs(zfs_sb_t *zsb) if ((error = zfs_sb_teardown(zsb, B_FALSE)) != 0) return (error); + dmu_objset_disown(zsb->z_os, zsb); return (0); @@ -1411,7 +1412,7 @@ zfs_suspend_fs(zfs_sb_t *zsb) EXPORT_SYMBOL(zfs_suspend_fs); /* - * Reopen zfs_sb_t::z_os and release VOPs. + * Reopen zfs_sb_t and release VFS ops. 
*/ int zfs_resume_fs(zfs_sb_t *zsb, const char *osname) @@ -1440,30 +1441,37 @@ zfs_resume_fs(zfs_sb_t *zsb, const char *osname) goto bail; VERIFY(zfs_sb_setup(zsb, B_FALSE) == 0); + zsb->z_rollback_time = jiffies; /* - * Attempt to re-establish all the active znodes with - * their dbufs. If a zfs_rezget() fails, then we'll let - * any potential callers discover that via ZFS_ENTER_VERIFY_VP - * when they try to use their znode. + * Attempt to re-establish all the active inodes with their + * dbufs. If a zfs_rezget() fails, then we unhash the inode + * and mark it stale. This prevents a collision if a new + * inode/object is created which must use the same inode + * number. The stale inode will be released when the + * VFS prunes the dentry holding the remaining references + * on the stale inode. */ mutex_enter(&zsb->z_znodes_lock); for (zp = list_head(&zsb->z_all_znodes); zp; zp = list_next(&zsb->z_all_znodes, zp)) { - (void) zfs_rezget(zp); + err2 = zfs_rezget(zp); + if (err2) { + remove_inode_hash(ZTOI(zp)); + zp->z_is_stale = B_TRUE; + } } mutex_exit(&zsb->z_znodes_lock); - } bail: - /* release the VOPs */ + /* release the VFS ops */ rw_exit(&zsb->z_teardown_inactive_lock); rrw_exit(&zsb->z_teardown_lock, FTAG); if (err) { /* - * Since we couldn't reopen zfs_sb_t::z_os, force + * Since we couldn't reopen zfs_sb_t, force * unmount this file system. 
*/ (void) zfs_umount(zsb->z_sb); diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c index 8074f1d00..9bf26a734 100644 --- a/module/zfs/zfs_znode.c +++ b/module/zfs/zfs_znode.c @@ -274,8 +274,10 @@ zfs_inode_destroy(struct inode *ip) zfsctl_inode_destroy(ip); mutex_enter(&zsb->z_znodes_lock); - list_remove(&zsb->z_all_znodes, zp); - zsb->z_nr_znodes--; + if (list_link_active(&zp->z_link_node)) { + list_remove(&zsb->z_all_znodes, zp); + zsb->z_nr_znodes--; + } mutex_exit(&zsb->z_znodes_lock); if (zp->z_acl_cached) { @@ -348,7 +350,7 @@ zfs_inode_set_ops(zfs_sb_t *zsb, struct inode *ip) static znode_t * zfs_znode_alloc(zfs_sb_t *zsb, dmu_buf_t *db, int blksz, dmu_object_type_t obj_type, uint64_t obj, sa_handle_t *hdl, - struct dentry *dentry, struct inode *dip) + struct inode *dip) { znode_t *zp; struct inode *ip; @@ -379,6 +381,7 @@ zfs_znode_alloc(zfs_sb_t *zsb, dmu_buf_t *db, int blksz, zp->z_is_zvol = B_FALSE; zp->z_is_mapped = B_FALSE; zp->z_is_ctldir = B_FALSE; + zp->z_is_stale = B_FALSE; zfs_znode_sa_init(zsb, zp, db, obj_type, hdl); @@ -414,11 +417,15 @@ zfs_znode_alloc(zfs_sb_t *zsb, dmu_buf_t *db, int blksz, zfs_inode_update(zp); zfs_inode_set_ops(zsb, ip); - if (insert_inode_locked(ip)) - goto error; - - if (dentry) - d_instantiate(dentry, ip); + /* + * The only way insert_inode_locked() can fail is if the ip->i_ino + * number is already hashed for this super block. This can never + * happen because the inode numbers map 1:1 with the object numbers. + * + * The one exception is rolling back a mounted file system, but in + * this case all the active inodes are unhashed during the rollback. 
+ */ + VERIFY3S(insert_inode_locked(ip), ==, 0); mutex_enter(&zsb->z_znodes_lock); list_insert_tail(&zsb->z_all_znodes, zp); @@ -720,9 +727,9 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, if (!(flag & IS_ROOT_NODE)) { *zpp = zfs_znode_alloc(zsb, db, 0, obj_type, obj, sa_hdl, - vap->va_dentry, ZTOI(dzp)); - ASSERT(*zpp != NULL); - ASSERT(dzp != NULL); + ZTOI(dzp)); + VERIFY(*zpp != NULL); + VERIFY(dzp != NULL); } else { /* * If we are creating the root node, the "parent" we @@ -931,7 +938,7 @@ again: * bonus buffer. */ zp = zfs_znode_alloc(zsb, db, doi.doi_data_block_size, - doi.doi_bonus_type, obj_num, NULL, NULL, NULL); + doi.doi_bonus_type, obj_num, NULL, NULL); if (zp == NULL) { err = ENOENT; } else { @@ -961,8 +968,20 @@ zfs_rezget(znode_t *zp) zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = NULL; } - mutex_exit(&zp->z_acl_lock); + + rw_enter(&zp->z_xattr_lock, RW_WRITER); + if (zp->z_xattr_cached) { + nvlist_free(zp->z_xattr_cached); + zp->z_xattr_cached = NULL; + } + + if (zp->z_xattr_parent) { + iput(ZTOI(zp->z_xattr_parent)); + zp->z_xattr_parent = NULL; + } + rw_exit(&zp->z_xattr_lock); + ASSERT(zp->z_sa_hdl == NULL); err = sa_buf_hold(zsb->z_os, obj_num, NULL, &db); if (err) { @@ -1016,6 +1035,7 @@ zfs_rezget(znode_t *zp) zp->z_unlinked = (zp->z_links == 0); zp->z_blksz = doi.doi_data_block_size; + zfs_inode_update(zp); ZFS_OBJ_HOLD_EXIT(zsb, obj_num); diff --git a/module/zfs/zpl_ctldir.c b/module/zfs/zpl_ctldir.c index 2e5209f8c..54bdbe409 100644 --- a/module/zfs/zpl_ctldir.c +++ b/module/zfs/zpl_ctldir.c @@ -366,7 +366,7 @@ zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, zpl_umode_t mode) crhold(cr); vap = kmem_zalloc(sizeof(vattr_t), KM_SLEEP); - zpl_vap_init(vap, dip, dentry, mode | S_IFDIR, cr); + zpl_vap_init(vap, dip, mode | S_IFDIR, cr); error = -zfsctl_snapdir_mkdir(dip, dname(dentry), vap, &ip, cr, 0); if (error == 0) { diff --git a/module/zfs/zpl_inode.c b/module/zfs/zpl_inode.c index 6175c2e93..15ee0f610 
100644 --- a/module/zfs/zpl_inode.c +++ b/module/zfs/zpl_inode.c @@ -46,6 +46,10 @@ zpl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) ASSERT3S(error, <=, 0); crfree(cr); + spin_lock(&dentry->d_lock); + dentry->d_time = jiffies; + spin_unlock(&dentry->d_lock); + if (error) { if (error == -ENOENT) return d_splice_alias(NULL, dentry); @@ -57,12 +61,10 @@ zpl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) } void -zpl_vap_init(vattr_t *vap, struct inode *dir, struct dentry *dentry, - zpl_umode_t mode, cred_t *cr) +zpl_vap_init(vattr_t *vap, struct inode *dir, zpl_umode_t mode, cred_t *cr) { vap->va_mask = ATTR_MODE; vap->va_mode = mode; - vap->va_dentry = dentry; vap->va_uid = crgetfsuid(cr); if (dir && dir->i_mode & S_ISGID) { @@ -90,12 +92,14 @@ zpl_create(struct inode *dir, struct dentry *dentry, zpl_umode_t mode, crhold(cr); vap = kmem_zalloc(sizeof(vattr_t), KM_SLEEP); - zpl_vap_init(vap, dir, dentry, mode, cr); + zpl_vap_init(vap, dir, mode, cr); error = -zfs_create(dir, dname(dentry), vap, 0, mode, &ip, cr, 0, NULL); if (error == 0) { error = zpl_xattr_security_init(ip, dir, &dentry->d_name); VERIFY3S(error, ==, 0); + d_instantiate(dentry, ip); + d_set_d_op(dentry, &zpl_dentry_operations); } kmem_free(vap, sizeof(vattr_t)); @@ -123,11 +127,15 @@ zpl_mknod(struct inode *dir, struct dentry *dentry, zpl_umode_t mode, crhold(cr); vap = kmem_zalloc(sizeof(vattr_t), KM_SLEEP); - zpl_vap_init(vap, dir, dentry, mode, cr); + zpl_vap_init(vap, dir, mode, cr); vap->va_rdev = rdev; - error = -zfs_create(dir, (char *)dentry->d_name.name, - vap, 0, mode, &ip, cr, 0, NULL); + error = -zfs_create(dir, dname(dentry), vap, 0, mode, &ip, cr, 0, NULL); + if (error == 0) { + d_instantiate(dentry, ip); + d_set_d_op(dentry, &zpl_dentry_operations); + } + kmem_free(vap, sizeof(vattr_t)); crfree(cr); ASSERT3S(error, <=, 0); @@ -159,9 +167,14 @@ zpl_mkdir(struct inode *dir, struct dentry *dentry, zpl_umode_t mode) crhold(cr); vap = 
kmem_zalloc(sizeof(vattr_t), KM_SLEEP); - zpl_vap_init(vap, dir, dentry, mode | S_IFDIR, cr); + zpl_vap_init(vap, dir, mode | S_IFDIR, cr); error = -zfs_mkdir(dir, dname(dentry), vap, &ip, cr, 0, NULL); + if (error == 0) { + d_instantiate(dentry, ip); + d_set_d_op(dentry, &zpl_dentry_operations); + } + kmem_free(vap, sizeof(vattr_t)); crfree(cr); ASSERT3S(error, <=, 0); @@ -262,9 +275,14 @@ zpl_symlink(struct inode *dir, struct dentry *dentry, const char *name) crhold(cr); vap = kmem_zalloc(sizeof(vattr_t), KM_SLEEP); - zpl_vap_init(vap, dir, dentry, S_IFLNK | S_IRWXUGO, cr); + zpl_vap_init(vap, dir, S_IFLNK | S_IRWXUGO, cr); error = -zfs_symlink(dir, dname(dentry), vap, (char *)name, &ip, cr, 0); + if (error == 0) { + d_instantiate(dentry, ip); + d_set_d_op(dentry, &zpl_dentry_operations); + } + kmem_free(vap, sizeof(vattr_t)); crfree(cr); ASSERT3S(error, <=, 0); @@ -334,6 +352,7 @@ zpl_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) } d_instantiate(dentry, ip); + d_set_d_op(dentry, &zpl_dentry_operations); out: crfree(cr); ASSERT3S(error, <=, 0); @@ -378,6 +397,44 @@ zpl_fallocate(struct inode *ip, int mode, loff_t offset, loff_t len) } #endif /* HAVE_INODE_FALLOCATE */ +static int +#ifdef HAVE_D_REVALIDATE_NAMEIDATA +zpl_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + unsigned int flags = nd->flags; +#else +zpl_revalidate(struct dentry *dentry, unsigned int flags) +{ +#endif /* HAVE_D_REVALIDATE_NAMEIDATA */ + zfs_sb_t *zsb = dentry->d_sb->s_fs_info; + int error; + + if (flags & LOOKUP_RCU) + return (-ECHILD); + + /* + * After a rollback negative dentries created before the rollback + * time must be invalidated. Otherwise they can obscure files which + * are only present in the rolled back dataset. 
+ */ + if (dentry->d_inode == NULL) { + spin_lock(&dentry->d_lock); + error = time_before(dentry->d_time, zsb->z_rollback_time); + spin_unlock(&dentry->d_lock); + + if (error) + return (0); + } + + /* + * The dentry may reference a stale inode if a mounted file system + * was rolled back to a point in time where the object didn't exist. + */ + if (dentry->d_inode && ITOZ(dentry->d_inode)->z_is_stale) + return (0); + + return (1); +} const struct inode_operations zpl_inode_operations = { .create = zpl_create, @@ -440,3 +497,7 @@ const struct inode_operations zpl_special_inode_operations = { .removexattr = generic_removexattr, .listxattr = zpl_xattr_list, }; + +dentry_operations_t zpl_dentry_operations = { + .d_revalidate = zpl_revalidate, +};