From 040dab993936d832df4c7624bbcdb71c3fb9b34b Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Thu, 19 Jan 2017 13:56:36 -0800 Subject: [PATCH] Suspend/resume zvol for recv and rollback When doing recv and rollback, dsl_dataset_clone_swap_sync_impl will be called to swap out the ds_objset and do dmu_objset_evict on the old one. However, currently zv->zv_objset will not be swapped out accordingly, so if anyone currently holds a fd on the zvol, we risk hitting a use-after-free. We fix this by introducing the suspend and resume mechanism of zsb to zv. Before recv or rollback, we use zvol_suspend to block all access to zv_objset and shut it down. After the recv or rollback, we use zvol_resume to swap in zv_objset with the new ds_objset and unblock the access. Reviewed-by: Brian Behlendorf Signed-off-by: Chunwei Chen Closes #4866 Closes #5609 --- include/sys/zvol.h | 7 +- module/zfs/dsl_dataset.c | 5 +- module/zfs/zfs_ioctl.c | 8 ++ module/zfs/zvol.c | 190 +++++++++++++++++++++++++++++++-------- 4 files changed, 168 insertions(+), 42 deletions(-) diff --git a/include/sys/zvol.h b/include/sys/zvol.h index 2fb20fbf9..f149da977 100644 --- a/include/sys/zvol.h +++ b/include/sys/zvol.h @@ -35,14 +35,14 @@ #define SPEC_MAXOFFSET_T ((1LL << ((NBBY * sizeof (daddr32_t)) + \ DEV_BSHIFT - 1)) - 1) -extern void *zvol_tag; - extern void zvol_create_minors(spa_t *spa, const char *name, boolean_t async); extern void zvol_remove_minors(spa_t *spa, const char *name, boolean_t async); extern void zvol_rename_minors(spa_t *spa, const char *oldname, const char *newname, boolean_t async); #ifdef _KERNEL +typedef struct zvol_state zvol_state_t; + extern int zvol_check_volsize(uint64_t volsize, uint64_t blocksize); extern int zvol_check_volblocksize(const char *name, uint64_t volblocksize); extern int zvol_get_stats(objset_t *os, nvlist_t *nv); @@ -51,6 +51,9 @@ extern void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); extern int zvol_set_volsize(const char *, uint64_t); extern int zvol_set_volblocksize(const char *, uint64_t); extern int zvol_set_snapdev(const char *, zprop_source_t, uint64_t); +extern zvol_state_t *zvol_suspend(const char *); +extern int zvol_resume(zvol_state_t *); +extern void *zvol_tag(zvol_state_t *); extern int zvol_init(void); extern void zvol_fini(void); diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 43fecf28b..ddab3eddb 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -2124,8 +2124,7 @@ dsl_dataset_rename_snapshot(const char *fsname, * only one long hold on the dataset. We're not allowed to change anything here * so we don't permanently release the long hold or regular hold here. We want * to do this only when syncing to avoid the dataset unexpectedly going away - * when we release the long hold. Allow a long hold to exist for volumes, this - * may occur when asynchronously registering the minor with the kernel. + * when we release the long hold. */ static int dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx) @@ -2140,7 +2139,7 @@ dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx) dsl_dataset_long_rele(ds, owner); } - held = (dsl_dataset_long_held(ds) && (ds->ds_owner != zvol_tag)); + held = dsl_dataset_long_held(ds); if (owner != NULL) dsl_dataset_long_hold(ds, owner); diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index ba4e0ee3f..659345917 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -3641,6 +3641,7 @@ static int zfs_ioc_rollback(const char *fsname, nvlist_t *args, nvlist_t *outnvl) { zfs_sb_t *zsb; + zvol_state_t *zv; int error; if (get_zfs_sb(fsname, &zsb) == 0) { @@ -3653,6 +3654,9 @@ zfs_ioc_rollback(const char *fsname, nvlist_t *args, nvlist_t *outnvl) error = error ? error : resume_err; } deactivate_super(zsb->z_sb); + } else if ((zv = zvol_suspend(fsname)) != NULL) { + error = dsl_dataset_rollback(fsname, zvol_tag(zv), outnvl); + zvol_resume(zv); } else { error = dsl_dataset_rollback(fsname, NULL, outnvl); } @@ -4240,6 +4244,7 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, if (error == 0) { zfs_sb_t *zsb = NULL; + zvol_state_t *zv = NULL; if (get_zfs_sb(tofs, &zsb) == 0) { /* online recv */ @@ -4255,6 +4260,9 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, error = zfs_resume_fs(zsb, tofs); error = error ? error : end_err; deactivate_super(zsb->z_sb); + } else if ((zv = zvol_suspend(tofs)) != NULL) { + error = dmu_recv_end(&drc, zvol_tag(zv)); + zvol_resume(zv); } else { error = dmu_recv_end(&drc, NULL); } diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 10c963ca5..aad110b1b 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -61,7 +61,6 @@ unsigned long zvol_max_discard_blocks = 16384; static kmutex_t zvol_state_lock; static list_t zvol_state_list; -void *zvol_tag = "zvol_tag"; #define ZVOL_HT_SIZE 1024 static struct hlist_head *zvol_htable; @@ -71,7 +70,7 @@ static DEFINE_IDA(zvol_ida); /* * The in-core state of each volume. */ -typedef struct zvol_state { +struct zvol_state { char zv_name[MAXNAMELEN]; /* name */ uint64_t zv_volsize; /* advertised space */ uint64_t zv_volblocksize; /* volume block size */ @@ -88,7 +87,9 @@ typedef struct zvol_state { list_node_t zv_next; /* next zvol_state_t linkage */ uint64_t zv_hash; /* name hash */ struct hlist_node zv_hlink; /* hash link */ -} zvol_state_t; + atomic_t zv_suspend_ref; /* refcount for suspend */ + krwlock_t zv_suspend_lock; /* suspend lock */ +}; typedef enum { ZVOL_ASYNC_CREATE_MINORS, @@ -373,6 +374,7 @@ zvol_set_volsize(const char *name, uint64_t volsize) if (zv != NULL) zv->zv_objset = os; } else { + rw_enter(&zv->zv_suspend_lock, RW_READER); os = zv->zv_objset; } @@ -392,6 +394,8 @@ out: dmu_objset_disown(os, FTAG); if (zv != NULL) zv->zv_objset = NULL; + } else { + rw_exit(&zv->zv_suspend_lock); } mutex_exit(&zvol_state_lock); return (error); @@ -457,6 +461,8 @@ zvol_set_volblocksize(const char *name, uint64_t volblocksize) goto out; } + rw_enter(&zv->zv_suspend_lock, RW_READER); + tx = dmu_tx_create(zv->zv_objset); dmu_tx_hold_bonus(tx, ZVOL_OBJ); error = dmu_tx_assign(tx, TXG_WAIT); @@ -471,6 +477,7 @@ zvol_set_volblocksize(const char *name, uint64_t volblocksize) if (error == 0) zv->zv_volblocksize = volblocksize; } + rw_exit(&zv->zv_suspend_lock); out: mutex_exit(&zvol_state_lock); @@ -785,6 +792,8 @@ zvol_request(struct request_queue *q, struct bio *bio) #endif int error = 0; + rw_enter(&zv->zv_suspend_lock, RW_READER); + uio.uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)]; uio.uio_skip = BIO_BI_SKIP(bio); uio.uio_resid = BIO_BI_SIZE(bio); @@ -836,6 +845,7 @@ out2: generic_end_io_acct(rw, &zv->zv_disk->part0, start); out1: BIO_END_IO(bio, -error); + rw_exit(&zv->zv_suspend_lock); spl_fstrans_unmark(cookie); #ifdef HAVE_MAKE_REQUEST_FN_RET_INT return (0); @@ -947,32 +957,28 @@ zvol_remove(zvol_state_t *zv) hlist_del(&zv->zv_hlink); } +/* + * Setup zv after we just own the zv->objset + */ static int -zvol_first_open(zvol_state_t *zv) +zvol_setup_zv(zvol_state_t *zv) { - objset_t *os; uint64_t volsize; int error; uint64_t ro; - - /* lie and say we're read-only */ - error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, 1, zvol_tag, &os); - if (error) - return (SET_ERROR(-error)); - - zv->zv_objset = os; + objset_t *os = zv->zv_objset; error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL); if (error) - goto out_owned; + return (SET_ERROR(error)); error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); if (error) - goto out_owned; + return (SET_ERROR(error)); - error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf); + error = dmu_bonus_hold(os, ZVOL_OBJ, zv, &zv->zv_dbuf); if (error) - goto out_owned; + return (SET_ERROR(error)); set_capacity(zv->zv_disk, volsize >> 9); zv->zv_volsize = volsize; @@ -986,23 +992,20 @@ zvol_first_open(zvol_state_t *zv) set_disk_ro(zv->zv_disk, 0); zv->zv_flags &= ~ZVOL_RDONLY; } - -out_owned: - if (error) { - dmu_objset_disown(os, zvol_tag); - zv->zv_objset = NULL; - } - - return (SET_ERROR(-error)); + return (0); } +/* + * Shutdown every zv_objset related stuff except zv_objset itself. + * The is the reverse of zvol_setup_zv. + */ static void -zvol_last_close(zvol_state_t *zv) +zvol_shutdown_zv(zvol_state_t *zv) { zil_close(zv->zv_zilog); zv->zv_zilog = NULL; - dmu_buf_rele(zv->zv_dbuf, zvol_tag); + dmu_buf_rele(zv->zv_dbuf, zv); zv->zv_dbuf = NULL; /* @@ -1012,8 +1015,98 @@ zvol_last_close(zvol_state_t *zv) !(zv->zv_flags & ZVOL_RDONLY)) txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); (void) dmu_objset_evict_dbufs(zv->zv_objset); +} - dmu_objset_disown(zv->zv_objset, zvol_tag); +/* + * return the proper tag for rollback and recv + */ +void * +zvol_tag(zvol_state_t *zv) +{ + ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock)); + return (zv->zv_open_count > 0 ? zv : NULL); +} + +/* + * Suspend the zvol for recv and rollback. + */ +zvol_state_t * +zvol_suspend(const char *name) +{ + zvol_state_t *zv; + + mutex_enter(&zvol_state_lock); + zv = zvol_find_by_name(name); + if (zv == NULL) + goto out; + + /* block all I/O, release in zvol_resume. */ + rw_enter(&zv->zv_suspend_lock, RW_WRITER); + + atomic_inc(&zv->zv_suspend_ref); + + if (zv->zv_open_count > 0) + zvol_shutdown_zv(zv); +out: + mutex_exit(&zvol_state_lock); + return (zv); +} + +int +zvol_resume(zvol_state_t *zv) +{ + int error = 0; + + ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock)); + if (zv->zv_open_count > 0) { + VERIFY0(dmu_objset_hold(zv->zv_name, zv, &zv->zv_objset)); + VERIFY3P(zv->zv_objset->os_dsl_dataset->ds_owner, ==, zv); + VERIFY(dsl_dataset_long_held(zv->zv_objset->os_dsl_dataset)); + dmu_objset_rele(zv->zv_objset, zv); + + error = zvol_setup_zv(zv); + } + rw_exit(&zv->zv_suspend_lock); + /* + * We need this because we don't hold zvol_state_lock while releasing + * zv_suspend_lock. zvol_remove_minors_impl thus cannot check + * zv_suspend_lock to determine it is safe to free because rwlock is + * not inherent atomic. + */ + atomic_dec(&zv->zv_suspend_ref); + + return (SET_ERROR(error)); +} + +static int +zvol_first_open(zvol_state_t *zv) +{ + objset_t *os; + int error; + + /* lie and say we're read-only */ + error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, 1, zv, &os); + if (error) + return (SET_ERROR(-error)); + + zv->zv_objset = os; + + error = zvol_setup_zv(zv); + + if (error) { + dmu_objset_disown(os, zv); + zv->zv_objset = NULL; + } + + return (SET_ERROR(-error)); +} + +static void +zvol_last_close(zvol_state_t *zv) +{ + zvol_shutdown_zv(zv); + + dmu_objset_disown(zv->zv_objset, zv); zv->zv_objset = NULL; } @@ -1021,7 +1114,7 @@ static int zvol_open(struct block_device *bdev, fmode_t flag) { zvol_state_t *zv; - int error = 0, drop_mutex = 0; + int error = 0, drop_mutex = 0, drop_suspend = 0; /* * If the caller is already holding the mutex do not take it @@ -1047,6 +1140,10 @@ zvol_open(struct block_device *bdev, fmode_t flag) } if (zv->zv_open_count == 0) { + /* make sure zvol is not suspended when first open */ + rw_enter(&zv->zv_suspend_lock, RW_READER); + drop_suspend = 1; + error = zvol_first_open(zv); if (error) goto out_mutex; @@ -1064,8 +1161,9 @@ zvol_open(struct block_device *bdev, fmode_t flag) out_open_count: if (zv->zv_open_count == 0) zvol_last_close(zv); - out_mutex: + if (drop_suspend) + rw_exit(&zv->zv_suspend_lock); if (drop_mutex) mutex_exit(&zvol_state_lock); @@ -1089,9 +1187,15 @@ zvol_release(struct gendisk *disk, fmode_t mode) drop_mutex = 1; } + /* make sure zvol is not suspended when last close */ + if (zv->zv_open_count == 1) + rw_enter(&zv->zv_suspend_lock, RW_READER); + zv->zv_open_count--; - if (zv->zv_open_count == 0) + if (zv->zv_open_count == 0) { zvol_last_close(zv); + rw_exit(&zv->zv_suspend_lock); + } if (drop_mutex) mutex_exit(&zvol_state_lock); @@ -1110,6 +1214,7 @@ zvol_ioctl(struct block_device *bdev, fmode_t mode, ASSERT(zv && zv->zv_open_count > 0); + rw_enter(&zv->zv_suspend_lock, RW_READER); switch (cmd) { case BLKFLSBUF: zil_commit(zv->zv_zilog, ZVOL_OBJ); @@ -1121,8 +1226,8 @@ zvol_ioctl(struct block_device *bdev, fmode_t mode, default: error = -ENOTTY; break; - } + rw_exit(&zv->zv_suspend_lock); return (SET_ERROR(error)); } @@ -1296,6 +1401,7 @@ zvol_alloc(dev_t dev, const char *name) strlcpy(zv->zv_name, name, MAXNAMELEN); zfs_rlock_init(&zv->zv_range_lock); + rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); zv->zv_disk->major = zvol_major; zv->zv_disk->first_minor = (dev & MINORMASK); @@ -1325,6 +1431,7 @@ zvol_free_impl(void *arg) zvol_state_t *zv = arg; ASSERT(zv->zv_open_count == 0); + rw_destroy(&zv->zv_suspend_lock); zfs_rlock_destroy(&zv->zv_range_lock); zv->zv_disk->private_data = NULL; @@ -1380,7 +1487,7 @@ zvol_create_minor_impl(const char *name) doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); - error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, zvol_tag, &os); + error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os); if (error) goto out_doi; @@ -1446,7 +1553,7 @@ zvol_create_minor_impl(const char *name) zv->zv_objset = NULL; out_dmu_objset_disown: - dmu_objset_disown(os, zvol_tag); + dmu_objset_disown(os, FTAG); out_doi: kmem_free(doi, sizeof (dmu_object_info_t)); out: @@ -1479,7 +1586,14 @@ zvol_rename_minor(zvol_state_t *zv, const char *newname) ASSERT(MUTEX_HELD(&zvol_state_lock)); + rw_enter(&zv->zv_suspend_lock, RW_READER); strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); + rw_exit(&zv->zv_suspend_lock); + + /* move to new hashtable entry */ + zv->zv_hash = zvol_name_hash(zv->zv_name); + hlist_del(&zv->zv_hlink); + hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); /* * The block device's read-only state is briefly changed causing @@ -1512,11 +1626,11 @@ zvol_prefetch_minors_impl(void *arg) char *dsname = job->name; objset_t *os = NULL; - job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, zvol_tag, + job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, FTAG, &os); if (job->error == 0) { dmu_prefetch(os, ZVOL_OBJ, 0, 0, 0, ZIO_PRIORITY_SYNC_READ); - dmu_objset_disown(os, zvol_tag); + dmu_objset_disown(os, FTAG); } } @@ -1718,7 +1832,8 @@ zvol_remove_minors_impl(const char *name) zv->zv_name[namelen] == '@'))) { /* If in use, leave alone */ - if (zv->zv_open_count > 0) + if (zv->zv_open_count > 0 || + atomic_read(&zv->zv_suspend_ref)) continue; zvol_remove(zv); @@ -1759,7 +1874,8 @@ zvol_remove_minor_impl(const char *name) if (strcmp(zv->zv_name, name) == 0) { /* If in use, leave alone */ - if (zv->zv_open_count > 0) + if (zv->zv_open_count > 0 || + atomic_read(&zv->zv_suspend_ref)) continue; zvol_remove(zv); zvol_free(zv);