diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index a32848941..4abafcc6f 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -22,6 +22,9 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ #ifndef _SYS_ZFS_CONTEXT_H #define _SYS_ZFS_CONTEXT_H @@ -365,6 +368,16 @@ typedef struct taskq taskq_t; typedef uintptr_t taskqid_t; typedef void (task_func_t)(void *); +typedef struct taskq_ent { + struct taskq_ent *tqent_next; + struct taskq_ent *tqent_prev; + task_func_t *tqent_func; + void *tqent_arg; + uintptr_t tqent_flags; +} taskq_ent_t; + +#define TQENT_FLAG_PREALLOC 0x1 /* taskq_dispatch_ent used */ + #define TASKQ_PREPOPULATE 0x0001 #define TASKQ_CPR_SAFE 0x0002 /* Use CPR safe protocol */ #define TASKQ_DYNAMIC 0x0004 /* Use dynamic thread scheduling */ @@ -385,6 +398,10 @@ extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); #define taskq_create_sysdc(a, b, d, e, p, dc, f) \ (taskq_create(a, b, maxclsyspri, d, e, f)) extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t); +extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t, + taskq_ent_t *); +extern int taskq_empty_ent(taskq_ent_t *); +extern void taskq_init_ent(taskq_ent_t *); extern void taskq_destroy(taskq_t *); extern void taskq_wait(taskq_t *); extern int taskq_member(taskq_t *, kthread_t *); diff --git a/include/sys/zio.h b/include/sys/zio.h index a46918174..c0da4e2d7 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -22,6 +22,9 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ #ifndef _ZIO_H #define _ZIO_H @@ -423,6 +426,9 @@ struct zio { /* FMA state */ zio_cksum_report_t *io_cksum_report; uint64_t io_ena; + + /* Taskq dispatching state */ + taskq_ent_t io_tqent; }; extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, diff --git a/lib/libzpool/taskq.c b/lib/libzpool/taskq.c index 36c0ec7df..6143a9189 100644 --- a/lib/libzpool/taskq.c +++ b/lib/libzpool/taskq.c @@ -22,19 +22,15 @@ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ #include int taskq_now; taskq_t *system_taskq; -typedef struct task { - struct task *task_next; - struct task *task_prev; - task_func_t *task_func; - void *task_arg; -} task_t; - #define TASKQ_ACTIVE 0x00010000 struct taskq { @@ -51,18 +47,19 @@ struct taskq { int tq_maxalloc; kcondvar_t tq_maxalloc_cv; int tq_maxalloc_wait; - task_t *tq_freelist; - task_t tq_task; + taskq_ent_t *tq_freelist; + taskq_ent_t tq_task; }; -static task_t * +static taskq_ent_t * task_alloc(taskq_t *tq, int tqflags) { - task_t *t; + taskq_ent_t *t; int rv; again: if ((t = tq->tq_freelist) != NULL && tq->tq_nalloc >= tq->tq_minalloc) { - tq->tq_freelist = t->task_next; + ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); + tq->tq_freelist = t->tqent_next; } else { if (tq->tq_nalloc >= tq->tq_maxalloc) { if (!(tqflags & KM_SLEEP)) @@ -87,25 +84,28 @@ again: if ((t = tq->tq_freelist) != NULL && tq->tq_nalloc >= tq->tq_minalloc) { } mutex_exit(&tq->tq_lock); - t = kmem_alloc(sizeof (task_t), tqflags); + t = kmem_alloc(sizeof (taskq_ent_t), tqflags); mutex_enter(&tq->tq_lock); - if (t != NULL) + if (t != NULL) { + /* Make sure we start without any flags */ + t->tqent_flags = 0; tq->tq_nalloc++; + } } return (t); } static void -task_free(taskq_t *tq, task_t *t) +task_free(taskq_t *tq, taskq_ent_t *t) { if (tq->tq_nalloc <= tq->tq_minalloc) { - t->task_next = tq->tq_freelist; + t->tqent_next = tq->tq_freelist; tq->tq_freelist = t; } else { tq->tq_nalloc--; mutex_exit(&tq->tq_lock); - kmem_free(t, sizeof (task_t)); + kmem_free(t, sizeof (taskq_ent_t)); mutex_enter(&tq->tq_lock); } @@ -116,7 +116,7 @@ task_free(taskq_t *tq, task_t *t) taskqid_t taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t tqflags) { - task_t *t; + taskq_ent_t *t; if (taskq_now) { func(arg); @@ -130,26 +130,77 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t tqflags) return (0); } if (tqflags & TQ_FRONT) { - t->task_next = tq->tq_task.task_next; - t->task_prev = &tq->tq_task; + t->tqent_next = tq->tq_task.tqent_next; + t->tqent_prev = &tq->tq_task; } else { - t->task_next = &tq->tq_task; - t->task_prev = tq->tq_task.task_prev; + t->tqent_next = &tq->tq_task; + t->tqent_prev = tq->tq_task.tqent_prev; } - t->task_next->task_prev = t; - t->task_prev->task_next = t; - t->task_func = func; - t->task_arg = arg; + t->tqent_next->tqent_prev = t; + t->tqent_prev->tqent_next = t; + t->tqent_func = func; + t->tqent_arg = arg; + + ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); + cv_signal(&tq->tq_dispatch_cv); mutex_exit(&tq->tq_lock); return (1); } +int +taskq_empty_ent(taskq_ent_t *t) +{ + return t->tqent_next == NULL; +} + +void +taskq_init_ent(taskq_ent_t *t) +{ + t->tqent_next = NULL; + t->tqent_prev = NULL; + t->tqent_func = NULL; + t->tqent_arg = NULL; + t->tqent_flags = 0; +} + +void +taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, + taskq_ent_t *t) +{ + ASSERT(func != NULL); + ASSERT(!(tq->tq_flags & TASKQ_DYNAMIC)); + + /* + * Mark it as a prealloc'd task. This is important + * to ensure that we don't free it later. + */ + t->tqent_flags |= TQENT_FLAG_PREALLOC; + /* + * Enqueue the task to the underlying queue. + */ + mutex_enter(&tq->tq_lock); + + if (flags & TQ_FRONT) { + t->tqent_next = tq->tq_task.tqent_next; + t->tqent_prev = &tq->tq_task; + } else { + t->tqent_next = &tq->tq_task; + t->tqent_prev = tq->tq_task.tqent_prev; + } + t->tqent_next->tqent_prev = t; + t->tqent_prev->tqent_next = t; + t->tqent_func = func; + t->tqent_arg = arg; + cv_signal(&tq->tq_dispatch_cv); + mutex_exit(&tq->tq_lock); +} + void taskq_wait(taskq_t *tq) { mutex_enter(&tq->tq_lock); - while (tq->tq_task.task_next != &tq->tq_task || tq->tq_active != 0) + while (tq->tq_task.tqent_next != &tq->tq_task || tq->tq_active != 0) cv_wait(&tq->tq_wait_cv, &tq->tq_lock); mutex_exit(&tq->tq_lock); } @@ -158,27 +209,32 @@ static void taskq_thread(void *arg) { taskq_t *tq = arg; - task_t *t; + taskq_ent_t *t; + boolean_t prealloc; mutex_enter(&tq->tq_lock); while (tq->tq_flags & TASKQ_ACTIVE) { - if ((t = tq->tq_task.task_next) == &tq->tq_task) { + if ((t = tq->tq_task.tqent_next) == &tq->tq_task) { if (--tq->tq_active == 0) cv_broadcast(&tq->tq_wait_cv); cv_wait(&tq->tq_dispatch_cv, &tq->tq_lock); tq->tq_active++; continue; } - t->task_prev->task_next = t->task_next; - t->task_next->task_prev = t->task_prev; + t->tqent_prev->tqent_next = t->tqent_next; + t->tqent_next->tqent_prev = t->tqent_prev; + t->tqent_next = NULL; + t->tqent_prev = NULL; + prealloc = t->tqent_flags & TQENT_FLAG_PREALLOC; mutex_exit(&tq->tq_lock); rw_enter(&tq->tq_threadlock, RW_READER); - t->task_func(t->task_arg); + t->tqent_func(t->tqent_arg); rw_exit(&tq->tq_threadlock); mutex_enter(&tq->tq_lock); - task_free(tq, t); + if (!prealloc) + task_free(tq, t); } tq->tq_nthreads--; cv_broadcast(&tq->tq_wait_cv); @@ -217,8 +273,8 @@ taskq_create(const char *name, int nthreads, pri_t pri, tq->tq_nthreads = nthreads; tq->tq_minalloc = minalloc; tq->tq_maxalloc = maxalloc; - tq->tq_task.task_next = &tq->tq_task; - tq->tq_task.task_prev = &tq->tq_task; + tq->tq_task.tqent_next = &tq->tq_task; + tq->tq_task.tqent_prev = &tq->tq_task; tq->tq_threadlist = kmem_alloc(nthreads*sizeof(kthread_t *), KM_SLEEP); if (flags & TASKQ_PREPOPULATE) { diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 868a0d9d2..0b9649723 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -22,6 +22,9 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ /* * This file contains all the routines used when modifying on-disk SPA state. @@ -665,7 +668,7 @@ spa_create_zio_taskqs(spa_t *spa) const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; enum zti_modes mode = ztip->zti_mode; uint_t value = ztip->zti_value; - uint_t flags = TASKQ_PREPOPULATE; + uint_t flags = 0; char name[32]; if (t == ZIO_TYPE_WRITE) diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 6b03be6f3..c96442d0b 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. */ #include @@ -570,6 +571,8 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_add_child(pio, zio); } + taskq_init_ent(&zio->io_tqent); + return (zio); } @@ -1073,7 +1076,7 @@ zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline) { spa_t *spa = zio->io_spa; zio_type_t t = zio->io_type; - int flags = TQ_NOSLEEP | (cutinline ? TQ_FRONT : 0); + int flags = (cutinline ? TQ_FRONT : 0); /* * If we're a config writer or a probe, the normal issue and @@ -1098,8 +1101,14 @@ zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline) ASSERT3U(q, <, ZIO_TASKQ_TYPES); - while (taskq_dispatch(spa->spa_zio_taskq[t][q], - (task_func_t *)zio_execute, zio, flags) == 0); /* do nothing */ + /* + * NB: We are assuming that the zio can only be dispatched + * to a single taskq at a time. It would be a grievous error + * to dispatch the zio to another taskq at the same time. + */ + ASSERT(taskq_empty_ent(&zio->io_tqent)); + taskq_dispatch_ent(spa->spa_zio_taskq[t][q], + (task_func_t *)zio_execute, zio, flags, &zio->io_tqent); } static boolean_t @@ -2947,9 +2956,11 @@ zio_done(zio_t *zio) * Reexecution is potentially a huge amount of work. * Hand it off to the otherwise-unused claim taskq. */ - (void) taskq_dispatch( + ASSERT(taskq_empty_ent(&zio->io_tqent)); + (void) taskq_dispatch_ent( zio->io_spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE], - (task_func_t *)zio_reexecute, zio, TQ_SLEEP); + (task_func_t *)zio_reexecute, zio, 0, + &zio->io_tqent); } return (ZIO_PIPELINE_STOP); }