diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index ed0a97c45..a1abe20da 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -498,6 +498,7 @@ typedef struct taskq { #define TASKQID_INVALID ((taskqid_t)0) extern taskq_t *system_taskq; +extern taskq_t *system_delay_taskq; extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); #define taskq_create_proc(a, b, c, d, e, p, f) \ diff --git a/lib/libzpool/taskq.c b/lib/libzpool/taskq.c index c1f87e173..791d50981 100644 --- a/lib/libzpool/taskq.c +++ b/lib/libzpool/taskq.c @@ -32,6 +32,7 @@ int taskq_now; taskq_t *system_taskq; +taskq_t *system_delay_taskq; #define TASKQ_ACTIVE 0x00010000 @@ -353,6 +354,8 @@ system_taskq_init(void) { system_taskq = taskq_create("system_taskq", 64, maxclsyspri, 4, 512, TASKQ_DYNAMIC | TASKQ_PREPOPULATE); + system_delay_taskq = taskq_create("delay_taskq", 4, maxclsyspri, 4, + 512, TASKQ_DYNAMIC | TASKQ_PREPOPULATE); } void @@ -360,4 +363,6 @@ system_taskq_fini(void) { taskq_destroy(system_taskq); system_taskq = NULL; /* defensive */ + taskq_destroy(system_delay_taskq); + system_delay_taskq = NULL; } diff --git a/module/zfs/spa.c b/module/zfs/spa.c index c55225a10..5203ea826 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1208,7 +1208,7 @@ spa_deactivate(spa_t *spa) list_destroy(&spa->spa_evicting_os_list); list_destroy(&spa->spa_state_dirty_list); - taskq_cancel_id(system_taskq, spa->spa_deadman_tqid); + taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); for (t = 0; t < ZIO_TYPES; t++) { for (q = 0; q < ZIO_TASKQ_TYPES; q++) { @@ -6515,8 +6515,8 @@ spa_sync(spa_t *spa, uint64_t txg) tx = dmu_tx_create_assigned(dp, txg); spa->spa_sync_starttime = gethrtime(); - taskq_cancel_id(system_taskq, spa->spa_deadman_tqid); - spa->spa_deadman_tqid = taskq_dispatch_delay(system_taskq, + taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); + spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq, spa_deadman, 
spa, TQ_SLEEP, ddi_get_lbolt() + NSEC_TO_TICK(spa->spa_deadman_synctime)); @@ -6704,7 +6704,7 @@ spa_sync(spa_t *spa, uint64_t txg) } dmu_tx_commit(tx); - taskq_cancel_id(system_taskq, spa->spa_deadman_tqid); + taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); spa->spa_deadman_tqid = 0; /* diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 909002cf5..8ae5fb559 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -530,7 +530,7 @@ spa_deadman(void *arg) if (zfs_deadman_enabled) vdev_deadman(spa->spa_root_vdev); - spa->spa_deadman_tqid = taskq_dispatch_delay(system_taskq, + spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq, spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + NSEC_TO_TICK(spa->spa_deadman_synctime)); } diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c index c7a93edfc..53674d975 100644 --- a/module/zfs/zfs_ctldir.c +++ b/module/zfs/zfs_ctldir.c @@ -111,11 +111,6 @@ static krwlock_t zfs_snapshot_lock; int zfs_expire_snapshot = ZFSCTL_EXPIRE_SNAPSHOT; int zfs_admin_snapshot = 1; -/* - * Dedicated task queue for unmounting snapshots. 
- */ -static taskq_t *zfs_expire_taskq; - typedef struct { char *se_name; /* full snapshot name */ char *se_path; /* full mount path */ @@ -365,7 +360,7 @@ zfsctl_snapshot_unmount_cancel(zfs_snapentry_t *se) { ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock)); - if (taskq_cancel_id(zfs_expire_taskq, se->se_taskqid) == 0) { + if (taskq_cancel_id(system_delay_taskq, se->se_taskqid) == 0) { se->se_taskqid = TASKQID_INVALID; zfsctl_snapshot_rele(se); } @@ -383,7 +378,7 @@ zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay) return; zfsctl_snapshot_hold(se); - se->se_taskqid = taskq_dispatch_delay(zfs_expire_taskq, + se->se_taskqid = taskq_dispatch_delay(system_delay_taskq, snapentry_expire, se, TQ_SLEEP, ddi_get_lbolt() + delay * HZ); } @@ -1257,9 +1252,6 @@ zfsctl_init(void) sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node_objsetid)); rw_init(&zfs_snapshot_lock, NULL, RW_DEFAULT, NULL); - - zfs_expire_taskq = taskq_create("z_unmount", 1, defclsyspri, - 1, 8, TASKQ_PREPOPULATE); } /* @@ -1269,8 +1261,6 @@ zfsctl_init(void) void zfsctl_fini(void) { - taskq_destroy(zfs_expire_taskq); - avl_destroy(&zfs_snapshots_by_name); avl_destroy(&zfs_snapshots_by_objsetid); rw_destroy(&zfs_snapshot_lock); diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c index 5417f2422..39e92ce21 100644 --- a/module/zfs/zfs_vfsops.c +++ b/module/zfs/zfs_vfsops.c @@ -1922,6 +1922,7 @@ zfs_fini(void) /* * we don't use outstanding because zpl_posix_acl_free might add more. 
*/ + taskq_wait(system_delay_taskq); taskq_wait(system_taskq); unregister_filesystem(&zpl_fs_type); zfs_znode_fini(); diff --git a/module/zfs/zpl_xattr.c b/module/zfs/zpl_xattr.c index cec870824..9ab27f1c2 100644 --- a/module/zfs/zpl_xattr.c +++ b/module/zfs/zpl_xattr.c @@ -1511,8 +1511,8 @@ zpl_posix_acl_free(void *arg) } if (refire) - taskq_dispatch_delay(system_taskq, zpl_posix_acl_free, NULL, - TQ_SLEEP, new_time); + taskq_dispatch_delay(system_delay_taskq, zpl_posix_acl_free, + NULL, TQ_SLEEP, new_time); while (freelist) { a = freelist; @@ -1537,7 +1537,7 @@ zpl_posix_acl_release_impl(struct posix_acl *acl) *prev = a; /* if it was empty before, schedule the free task */ if (prev == &acl_rel_head) - taskq_dispatch_delay(system_taskq, zpl_posix_acl_free, NULL, - TQ_SLEEP, ddi_get_lbolt() + ACL_REL_SCHED); + taskq_dispatch_delay(system_delay_taskq, zpl_posix_acl_free, + NULL, TQ_SLEEP, ddi_get_lbolt() + ACL_REL_SCHED); } #endif diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index ea6997b5b..61d0538a3 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -63,6 +63,11 @@ static kmutex_t zvol_state_lock; static list_t zvol_state_list; void *zvol_tag = "zvol_tag"; +#define ZVOL_HT_SIZE 1024 +static struct hlist_head *zvol_htable; +#define ZVOL_HT_HEAD(hash) (&zvol_htable[(hash) & (ZVOL_HT_SIZE-1)]) +static DEFINE_IDA(zvol_ida); + /* * The in-core state of each volume. */ @@ -81,6 +86,8 @@ typedef struct zvol_state { struct gendisk *zv_disk; /* generic disk */ struct request_queue *zv_queue; /* request queue */ list_node_t zv_next; /* next zvol_state_t linkage */ + uint64_t zv_hash; /* name hash */ + struct hlist_node zv_hlink; /* hash link */ } zvol_state_t; typedef enum { @@ -102,30 +109,17 @@ typedef struct { #define ZVOL_RDONLY 0x1 -/* - * Find the next available range of ZVOL_MINORS minor numbers. The - * zvol_state_list is kept in ascending minor order so we simply need - * to scan the list for the first gap in the sequence. 
This allows us - * to recycle minor number as devices are created and removed. - */ -static int -zvol_find_minor(unsigned *minor) +static uint64_t +zvol_name_hash(const char *name) { - zvol_state_t *zv; - - *minor = 0; - ASSERT(MUTEX_HELD(&zvol_state_lock)); - for (zv = list_head(&zvol_state_list); zv != NULL; - zv = list_next(&zvol_state_list, zv), *minor += ZVOL_MINORS) { - if (MINOR(zv->zv_dev) != MINOR(*minor)) - break; + int i; + uint64_t crc = -1ULL; + uint8_t *p = (uint8_t *)name; + ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); + for (i = 0; i < MAXNAMELEN - 1 && *p; i++, p++) { + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (*p)) & 0xFF]; } - - /* All minors are in use */ - if (*minor >= (1 << MINORBITS)) - return (SET_ERROR(ENXIO)); - - return (0); + return (crc); } /* @@ -146,22 +140,32 @@ zvol_find_by_dev(dev_t dev) return (NULL); } +/* + * Find a zvol_state_t given the name and hash generated by zvol_name_hash. + */ +static zvol_state_t * +zvol_find_by_name_hash(const char *name, uint64_t hash) +{ + zvol_state_t *zv; + struct hlist_node *p; + + ASSERT(MUTEX_HELD(&zvol_state_lock)); + hlist_for_each(p, ZVOL_HT_HEAD(hash)) { + zv = hlist_entry(p, zvol_state_t, zv_hlink); + if (zv->zv_hash == hash && + strncmp(zv->zv_name, name, MAXNAMELEN) == 0) + return (zv); + } + return (NULL); +} + /* * Find a zvol_state_t given the name provided at zvol_alloc() time. */ static zvol_state_t * zvol_find_by_name(const char *name) { - zvol_state_t *zv; - - ASSERT(MUTEX_HELD(&zvol_state_lock)); - for (zv = list_head(&zvol_state_list); zv != NULL; - zv = list_next(&zvol_state_list, zv)) { - if (strncmp(zv->zv_name, name, MAXNAMELEN) == 0) - return (zv); - } - - return (NULL); + return (zvol_find_by_name_hash(name, zvol_name_hash(name))); } @@ -921,32 +925,26 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) } /* - * The zvol_state_t's are inserted in increasing MINOR(dev_t) order. + * The zvol_state_t's are inserted into zvol_state_list and zvol_htable. 
*/ static void -zvol_insert(zvol_state_t *zv_insert) +zvol_insert(zvol_state_t *zv) { - zvol_state_t *zv = NULL; - ASSERT(MUTEX_HELD(&zvol_state_lock)); - ASSERT3U(MINOR(zv_insert->zv_dev) & ZVOL_MINOR_MASK, ==, 0); - for (zv = list_head(&zvol_state_list); zv != NULL; - zv = list_next(&zvol_state_list, zv)) { - if (MINOR(zv->zv_dev) > MINOR(zv_insert->zv_dev)) - break; - } - - list_insert_before(&zvol_state_list, zv, zv_insert); + ASSERT3U(MINOR(zv->zv_dev) & ZVOL_MINOR_MASK, ==, 0); + list_insert_head(&zvol_state_list, zv); + hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); } /* * Simply remove the zvol from to list of zvols. */ static void -zvol_remove(zvol_state_t *zv_remove) +zvol_remove(zvol_state_t *zv) { ASSERT(MUTEX_HELD(&zvol_state_lock)); - list_remove(&zvol_state_list, zv_remove); + list_remove(&zvol_state_list, zv); + hlist_del(&zv->zv_hlink); } static int @@ -1038,7 +1036,7 @@ zvol_open(struct block_device *bdev, fmode_t flag) /* * Obtain a copy of private_data under the lock to make sure - * that either the result of zvol_freeg() setting + * that either the result of zvol_free() setting * bdev->bd_disk->private_data to NULL is observed, or zvol_free() * is not called on this zv because of the positive zv_open_count. */ @@ -1318,12 +1316,13 @@ out_kmem: } /* - * Cleanup then free a zvol_state_t which was created by zvol_alloc(). + * Taskq-callable free routine. If called outside zvol_state_lock, the caller + * must first clear zv_disk->private_data while holding the lock. */ static void -zvol_free(zvol_state_t *zv) +zvol_free_impl(void *arg) { - ASSERT(MUTEX_HELD(&zvol_state_lock)); + zvol_state_t *zv = arg; ASSERT(zv->zv_open_count == 0); zfs_rlock_destroy(&zv->zv_range_lock); @@ -1334,9 +1333,20 @@ zvol_free(zvol_state_t *zv) blk_cleanup_queue(zv->zv_queue); put_disk(zv->zv_disk); + ida_simple_remove(&zvol_ida, MINOR(zv->zv_dev) >> ZVOL_MINOR_BITS); kmem_free(zv, sizeof (zvol_state_t)); } +/* + * Cleanup then free a zvol_state_t which was created by zvol_alloc(). 
+ */ +static void +zvol_free(zvol_state_t *zv) +{ + ASSERT(MUTEX_HELD(&zvol_state_lock)); + zvol_free_impl(zv); +} + /* * Create a block device minor node and setup the linkage between it * and the specified volume. Once this function returns the block @@ -1352,10 +1362,17 @@ zvol_create_minor_impl(const char *name) uint64_t len; unsigned minor = 0; int error = 0; + int idx; + uint64_t hash = zvol_name_hash(name); + + idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP)); + if (idx < 0) + return (SET_ERROR(-idx)); + minor = idx << ZVOL_MINOR_BITS; mutex_enter(&zvol_state_lock); - zv = zvol_find_by_name(name); + zv = zvol_find_by_name_hash(name, hash); if (zv) { error = SET_ERROR(EEXIST); goto out; @@ -1375,15 +1392,12 @@ zvol_create_minor_impl(const char *name) if (error) goto out_dmu_objset_disown; - error = zvol_find_minor(&minor); - if (error) - goto out_dmu_objset_disown; - zv = zvol_alloc(MKDEV(zvol_major, minor), name); if (zv == NULL) { error = SET_ERROR(EAGAIN); goto out_dmu_objset_disown; } + zv->zv_hash = hash; if (dmu_objset_is_snapshot(os)) zv->zv_flags |= ZVOL_RDONLY; @@ -1449,6 +1463,7 @@ out: add_disk(zv->zv_disk); } else { mutex_exit(&zvol_state_lock); + ida_simple_remove(&zvol_ida, idx); } return (SET_ERROR(error)); @@ -1478,6 +1493,32 @@ zvol_rename_minor(zvol_state_t *zv, const char *newname) set_disk_ro(zv->zv_disk, readonly); } +typedef struct minors_job { + list_t *list; + list_node_t link; + /* input */ + char *name; + /* output: result of dmu_objset_own() */ + int error; +} minors_job_t; + +/* + * Taskq callback: prefetch the zvol dnode (ZVOL_OBJ) for one minors_job. + */ +static void +zvol_prefetch_minors_impl(void *arg) +{ + minors_job_t *job = arg; + char *dsname = job->name; + objset_t *os = NULL; + + job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, zvol_tag, + &os); + if (job->error == 0) { + dmu_prefetch(os, ZVOL_OBJ, 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + dmu_objset_disown(os, zvol_tag); + } +} /* * Mask errors to continue dmu_objset_find() traversal @@ -1485,7 +1526,9 @@ 
zvol_rename_minor(zvol_state_t *zv, const char *newname) static int zvol_create_snap_minor_cb(const char *dsname, void *arg) { - const char *name = (const char *)arg; + minors_job_t *j = arg; + list_t *minors_list = j->list; + const char *name = j->name; ASSERT0(MUTEX_HELD(&spa_namespace_lock)); @@ -1498,7 +1541,19 @@ zvol_create_snap_minor_cb(const char *dsname, void *arg) dprintf("zvol_create_snap_minor_cb(): " "%s is not a shapshot name\n", dsname); } else { - (void) zvol_create_minor_impl(dsname); + minors_job_t *job; + char *n = strdup(dsname); + if (n == NULL) + return (0); + + job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP); + job->name = n; + job->list = minors_list; + job->error = 0; + list_insert_tail(minors_list, job); + /* don't care if dispatch fails, because job->error is 0 */ + taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job, + TQ_SLEEP); } return (0); @@ -1512,6 +1567,7 @@ zvol_create_minors_cb(const char *dsname, void *arg) { uint64_t snapdev; int error; + list_t *minors_list = arg; ASSERT0(MUTEX_HELD(&spa_namespace_lock)); @@ -1527,19 +1583,28 @@ zvol_create_minors_cb(const char *dsname, void *arg) * snapshots and create device minor nodes for those. 
*/ if (strchr(dsname, '@') == 0) { - /* create minor for the 'dsname' explicitly */ - error = zvol_create_minor_impl(dsname); - if ((error == 0 || error == EEXIST) && - (snapdev == ZFS_SNAPDEV_VISIBLE)) { - fstrans_cookie_t cookie = spl_fstrans_mark(); + minors_job_t *job; + char *n = strdup(dsname); + if (n == NULL) + return (0); + + job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP); + job->name = n; + job->list = minors_list; + job->error = 0; + list_insert_tail(minors_list, job); + /* don't care if dispatch fails, because job->error is 0 */ + taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job, + TQ_SLEEP); + + if (snapdev == ZFS_SNAPDEV_VISIBLE) { /* * traverse snapshots only, do not traverse children, * and skip the 'dsname' */ error = dmu_objset_find((char *)dsname, - zvol_create_snap_minor_cb, (void *)dsname, + zvol_create_snap_minor_cb, (void *)job, DS_FIND_SNAPSHOTS); - spl_fstrans_unmark(cookie); } } else { dprintf("zvol_create_minors_cb(): %s is not a zvol name\n", @@ -1572,10 +1637,24 @@ zvol_create_minors_impl(const char *name) int error = 0; fstrans_cookie_t cookie; char *atp, *parent; + list_t minors_list; + minors_job_t *job; if (zvol_inhibit_dev) return (0); + /* + * This is the list for prefetch jobs. Whenever we find a match + * during dmu_objset_find, we insert a minors_job to the list and do + * taskq_dispatch to parallel prefetch zvol dnodes. Note we don't need + * any lock because all list operations are done on the current thread. + * + * We will use this list to do zvol_create_minor_impl after prefetch + * so we don't have to traverse using dmu_objset_find again. 
+ */ + list_create(&minors_list, sizeof (minors_job_t), + offsetof(minors_job_t, link)); + parent = kmem_alloc(MAXPATHLEN, KM_SLEEP); (void) strlcpy(parent, name, MAXPATHLEN); @@ -1591,11 +1670,26 @@ zvol_create_minors_impl(const char *name) } else { cookie = spl_fstrans_mark(); error = dmu_objset_find(parent, zvol_create_minors_cb, - NULL, DS_FIND_CHILDREN); + &minors_list, DS_FIND_CHILDREN); spl_fstrans_unmark(cookie); } kmem_free(parent, MAXPATHLEN); + taskq_wait_outstanding(system_taskq, 0); + + /* + * Prefetch is completed, we can do zvol_create_minor_impl + * sequentially. + */ + while ((job = list_head(&minors_list)) != NULL) { + list_remove(&minors_list, job); + if (!job->error) + zvol_create_minor_impl(job->name); + strfree(job->name); + kmem_free(job, sizeof (minors_job_t)); + } + + list_destroy(&minors_list); return (SET_ERROR(error)); } @@ -1608,6 +1702,7 @@ zvol_remove_minors_impl(const char *name) { zvol_state_t *zv, *zv_next; int namelen = ((name) ? strlen(name) : 0); + taskqid_t t, tid = TASKQID_INVALID; if (zvol_inhibit_dev) return; @@ -1627,11 +1722,22 @@ zvol_remove_minors_impl(const char *name) continue; zvol_remove(zv); - zvol_free(zv); + + /* clear this so zvol_open won't open it */ + zv->zv_disk->private_data = NULL; + + /* try parallel zvol_free, if failed do it in place */ + t = taskq_dispatch(system_taskq, zvol_free_impl, zv, + TQ_SLEEP); + if (t == TASKQID_INVALID) + zvol_free(zv); + else + tid = t; } } - mutex_exit(&zvol_state_lock); + if (tid != TASKQID_INVALID) + taskq_wait_outstanding(system_taskq, tid); } /* Remove minor for this specific snapshot only */ @@ -1933,16 +2039,25 @@ zvol_rename_minors(spa_t *spa, const char *name1, const char *name2, int zvol_init(void) { - int error; + int i, error; list_create(&zvol_state_list, sizeof (zvol_state_t), offsetof(zvol_state_t, zv_next)); mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL); + zvol_htable = kmem_alloc(ZVOL_HT_SIZE * sizeof (struct hlist_head), + KM_SLEEP); + if 
(!zvol_htable) { + error = ENOMEM; + goto out; + } + for (i = 0; i < ZVOL_HT_SIZE; i++) + INIT_HLIST_HEAD(&zvol_htable[i]); + error = register_blkdev(zvol_major, ZVOL_DRIVER); if (error) { printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error); - goto out; + goto out_free; } blk_register_region(MKDEV(zvol_major, 0), 1UL << MINORBITS, @@ -1950,6 +2065,8 @@ zvol_init(void) return (0); +out_free: + kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head)); out: mutex_destroy(&zvol_state_lock); list_destroy(&zvol_state_list); @@ -1964,6 +2081,7 @@ zvol_fini(void) blk_unregister_region(MKDEV(zvol_major, 0), 1UL << MINORBITS); unregister_blkdev(zvol_major, ZVOL_DRIVER); + kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head)); list_destroy(&zvol_state_list); mutex_destroy(&zvol_state_lock);