/* * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. * Copyright (C) 2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Brian Behlendorf . * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * The SPL is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. * * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . * * Solaris Porting Layer (SPL) Task Queue Implementation. */ /* * Copyright (c) 2024, Klara Inc. * Copyright (c) 2024, Syneto */ #include #include #include #include #include #include #include #include #ifdef HAVE_CPU_HOTPLUG #include #endif typedef struct taskq_kstats { /* static values, for completeness */ kstat_named_t tqks_threads_max; kstat_named_t tqks_entry_pool_min; kstat_named_t tqks_entry_pool_max; /* gauges (inc/dec counters, current value) */ kstat_named_t tqks_threads_active; kstat_named_t tqks_threads_idle; kstat_named_t tqks_threads_total; kstat_named_t tqks_tasks_pending; kstat_named_t tqks_tasks_priority; kstat_named_t tqks_tasks_total; kstat_named_t tqks_tasks_delayed; kstat_named_t tqks_entries_free; /* counters (inc only, since taskq creation) */ kstat_named_t tqks_threads_created; kstat_named_t tqks_threads_destroyed; kstat_named_t tqks_tasks_dispatched; kstat_named_t tqks_tasks_dispatched_delayed; kstat_named_t tqks_tasks_executed_normal; kstat_named_t tqks_tasks_executed_priority; kstat_named_t tqks_tasks_executed; kstat_named_t tqks_tasks_delayed_requeued; kstat_named_t tqks_tasks_cancelled; kstat_named_t tqks_thread_wakeups; kstat_named_t tqks_thread_wakeups_nowork; kstat_named_t tqks_thread_sleeps; } taskq_kstats_t; static taskq_kstats_t taskq_kstats_template = { { "threads_max", KSTAT_DATA_UINT64 }, { "entry_pool_min", KSTAT_DATA_UINT64 }, { "entry_pool_max", KSTAT_DATA_UINT64 }, { "threads_active", KSTAT_DATA_UINT64 }, { "threads_idle", KSTAT_DATA_UINT64 }, { "threads_total", KSTAT_DATA_UINT64 }, { "tasks_pending", KSTAT_DATA_UINT64 }, { "tasks_priority", KSTAT_DATA_UINT64 }, { "tasks_total", KSTAT_DATA_UINT64 }, { "tasks_delayed", KSTAT_DATA_UINT64 }, { "entries_free", KSTAT_DATA_UINT64 }, { "threads_created", KSTAT_DATA_UINT64 }, { "threads_destroyed", KSTAT_DATA_UINT64 }, { "tasks_dispatched", KSTAT_DATA_UINT64 }, { "tasks_dispatched_delayed", KSTAT_DATA_UINT64 }, { "tasks_executed_normal", KSTAT_DATA_UINT64 }, { "tasks_executed_priority", KSTAT_DATA_UINT64 }, { "tasks_executed", KSTAT_DATA_UINT64 }, { "tasks_delayed_requeued", KSTAT_DATA_UINT64 }, { "tasks_cancelled", KSTAT_DATA_UINT64 }, { "thread_wakeups", KSTAT_DATA_UINT64 }, { "thread_wakeups_nowork", KSTAT_DATA_UINT64 }, { "thread_sleeps", KSTAT_DATA_UINT64 }, }; #define TQSTAT_INC(tq, stat) wmsum_add(&tq->tq_sums.tqs_##stat, 1) #define TQSTAT_DEC(tq, stat) wmsum_add(&tq->tq_sums.tqs_##stat, -1) #define _TQSTAT_MOD_LIST(mod, tq, t) do { \ switch (t->tqent_flags & TQENT_LIST_MASK) { \ case TQENT_LIST_NONE: ASSERT(list_empty(&t->tqent_list)); break;\ case TQENT_LIST_PENDING: mod(tq, tasks_pending); break; \ case TQENT_LIST_PRIORITY: mod(tq, tasks_priority); break; \ case TQENT_LIST_DELAY: mod(tq, tasks_delayed); break; \ } \ } while (0) #define TQSTAT_INC_LIST(tq, t) _TQSTAT_MOD_LIST(TQSTAT_INC, tq, t) #define TQSTAT_DEC_LIST(tq, t) _TQSTAT_MOD_LIST(TQSTAT_DEC, tq, t) #define TQENT_SET_LIST(t, l) \ t->tqent_flags = (t->tqent_flags & ~TQENT_LIST_MASK) | l; static int spl_taskq_thread_bind = 0; module_param(spl_taskq_thread_bind, int, 0644); MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default"); static uint_t spl_taskq_thread_timeout_ms = 5000; /* BEGIN CSTYLED */ module_param(spl_taskq_thread_timeout_ms, uint, 0644); /* END CSTYLED */ MODULE_PARM_DESC(spl_taskq_thread_timeout_ms, "Minimum idle threads exit interval for dynamic taskqs"); static int spl_taskq_thread_dynamic = 1; module_param(spl_taskq_thread_dynamic, int, 0444); MODULE_PARM_DESC(spl_taskq_thread_dynamic, "Allow dynamic taskq threads"); static int spl_taskq_thread_priority = 1; module_param(spl_taskq_thread_priority, int, 0644); MODULE_PARM_DESC(spl_taskq_thread_priority, "Allow non-default priority for taskq threads"); static uint_t spl_taskq_thread_sequential = 4; /* BEGIN CSTYLED */ module_param(spl_taskq_thread_sequential, uint, 0644); /* END CSTYLED */ MODULE_PARM_DESC(spl_taskq_thread_sequential, "Create new taskq threads after N sequential tasks"); /* * Global system-wide dynamic task queue available for all consumers. This * taskq is not intended for long-running tasks; instead, a dedicated taskq * should be created. */ taskq_t *system_taskq; EXPORT_SYMBOL(system_taskq); /* Global dynamic task queue for long delay */ taskq_t *system_delay_taskq; EXPORT_SYMBOL(system_delay_taskq); /* Private dedicated taskq for creating new taskq threads on demand. */ static taskq_t *dynamic_taskq; static taskq_thread_t *taskq_thread_create(taskq_t *); #ifdef HAVE_CPU_HOTPLUG /* Multi-callback id for cpu hotplugging. */ static int spl_taskq_cpuhp_state; #endif /* List of all taskqs */ LIST_HEAD(tq_list); struct rw_semaphore tq_list_sem; static uint_t taskq_tsd; static int task_km_flags(uint_t flags) { if (flags & TQ_NOSLEEP) return (KM_NOSLEEP); if (flags & TQ_PUSHPAGE) return (KM_PUSHPAGE); return (KM_SLEEP); } /* * taskq_find_by_name - Find the largest instance number of a named taskq. */ static int taskq_find_by_name(const char *name) { struct list_head *tql = NULL; taskq_t *tq; list_for_each_prev(tql, &tq_list) { tq = list_entry(tql, taskq_t, tq_taskqs); if (strcmp(name, tq->tq_name) == 0) return (tq->tq_instance); } return (-1); } /* * NOTE: Must be called with tq->tq_lock held, returns a list_t which * is not attached to the free, work, or pending taskq lists. */ static taskq_ent_t * task_alloc(taskq_t *tq, uint_t flags, unsigned long *irqflags) { taskq_ent_t *t; int count = 0; ASSERT(tq); retry: /* Acquire taskq_ent_t's from free list if available */ if (!list_empty(&tq->tq_free_list) && !(flags & TQ_NEW)) { t = list_entry(tq->tq_free_list.next, taskq_ent_t, tqent_list); ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); ASSERT(!(t->tqent_flags & TQENT_FLAG_CANCEL)); ASSERT(!timer_pending(&t->tqent_timer)); list_del_init(&t->tqent_list); TQSTAT_DEC(tq, entries_free); return (t); } /* Free list is empty and memory allocations are prohibited */ if (flags & TQ_NOALLOC) return (NULL); /* Hit maximum taskq_ent_t pool size */ if (tq->tq_nalloc >= tq->tq_maxalloc) { if (flags & TQ_NOSLEEP) return (NULL); /* * Sleep periodically polling the free list for an available * taskq_ent_t. Dispatching with TQ_SLEEP should always succeed * but we cannot block forever waiting for an taskq_ent_t to * show up in the free list, otherwise a deadlock can happen. * * Therefore, we need to allocate a new task even if the number * of allocated tasks is above tq->tq_maxalloc, but we still * end up delaying the task allocation by one second, thereby * throttling the task dispatch rate. */ spin_unlock_irqrestore(&tq->tq_lock, *irqflags); schedule_timeout_interruptible(HZ / 100); spin_lock_irqsave_nested(&tq->tq_lock, *irqflags, tq->tq_lock_class); if (count < 100) { count++; goto retry; } } spin_unlock_irqrestore(&tq->tq_lock, *irqflags); t = kmem_alloc(sizeof (taskq_ent_t), task_km_flags(flags)); spin_lock_irqsave_nested(&tq->tq_lock, *irqflags, tq->tq_lock_class); if (t) { taskq_init_ent(t); tq->tq_nalloc++; } return (t); } /* * NOTE: Must be called with tq->tq_lock held, expects the taskq_ent_t * to already be removed from the free, work, or pending taskq lists. */ static void task_free(taskq_t *tq, taskq_ent_t *t) { ASSERT(tq); ASSERT(t); ASSERT(list_empty(&t->tqent_list)); ASSERT(!timer_pending(&t->tqent_timer)); kmem_free(t, sizeof (taskq_ent_t)); tq->tq_nalloc--; } /* * NOTE: Must be called with tq->tq_lock held, either destroys the * taskq_ent_t if too many exist or moves it to the free list for later use. */ static void task_done(taskq_t *tq, taskq_ent_t *t) { ASSERT(tq); ASSERT(t); ASSERT(list_empty(&t->tqent_list)); /* Wake tasks blocked in taskq_wait_id() */ wake_up_all(&t->tqent_waitq); if (tq->tq_nalloc <= tq->tq_minalloc) { t->tqent_id = TASKQID_INVALID; t->tqent_func = NULL; t->tqent_arg = NULL; t->tqent_flags = 0; list_add_tail(&t->tqent_list, &tq->tq_free_list); TQSTAT_INC(tq, entries_free); } else { task_free(tq, t); } } /* * When a delayed task timer expires remove it from the delay list and * add it to the priority list in order for immediate processing. */ static void task_expire_impl(taskq_ent_t *t) { taskq_ent_t *w; taskq_t *tq = t->tqent_taskq; struct list_head *l = NULL; unsigned long flags; spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); if (t->tqent_flags & TQENT_FLAG_CANCEL) { ASSERT(list_empty(&t->tqent_list)); spin_unlock_irqrestore(&tq->tq_lock, flags); return; } t->tqent_birth = jiffies; DTRACE_PROBE1(taskq_ent__birth, taskq_ent_t *, t); /* * The priority list must be maintained in strict task id order * from lowest to highest for lowest_id to be easily calculable. */ list_del(&t->tqent_list); list_for_each_prev(l, &tq->tq_prio_list) { w = list_entry(l, taskq_ent_t, tqent_list); if (w->tqent_id < t->tqent_id) { list_add(&t->tqent_list, l); break; } } if (l == &tq->tq_prio_list) list_add(&t->tqent_list, &tq->tq_prio_list); spin_unlock_irqrestore(&tq->tq_lock, flags); wake_up(&tq->tq_work_waitq); TQSTAT_INC(tq, tasks_delayed_requeued); } static void task_expire(spl_timer_list_t tl) { struct timer_list *tmr = (struct timer_list *)tl; taskq_ent_t *t = from_timer(t, tmr, tqent_timer); task_expire_impl(t); } /* * Returns the lowest incomplete taskqid_t. The taskqid_t may * be queued on the pending list, on the priority list, on the * delay list, or on the work list currently being handled, but * it is not 100% complete yet. */ static taskqid_t taskq_lowest_id(taskq_t *tq) { taskqid_t lowest_id = tq->tq_next_id; taskq_ent_t *t; taskq_thread_t *tqt; if (!list_empty(&tq->tq_pend_list)) { t = list_entry(tq->tq_pend_list.next, taskq_ent_t, tqent_list); lowest_id = MIN(lowest_id, t->tqent_id); } if (!list_empty(&tq->tq_prio_list)) { t = list_entry(tq->tq_prio_list.next, taskq_ent_t, tqent_list); lowest_id = MIN(lowest_id, t->tqent_id); } if (!list_empty(&tq->tq_delay_list)) { t = list_entry(tq->tq_delay_list.next, taskq_ent_t, tqent_list); lowest_id = MIN(lowest_id, t->tqent_id); } if (!list_empty(&tq->tq_active_list)) { tqt = list_entry(tq->tq_active_list.next, taskq_thread_t, tqt_active_list); ASSERT(tqt->tqt_id != TASKQID_INVALID); lowest_id = MIN(lowest_id, tqt->tqt_id); } return (lowest_id); } /* * Insert a task into a list keeping the list sorted by increasing taskqid. */ static void taskq_insert_in_order(taskq_t *tq, taskq_thread_t *tqt) { taskq_thread_t *w; struct list_head *l = NULL; ASSERT(tq); ASSERT(tqt); list_for_each_prev(l, &tq->tq_active_list) { w = list_entry(l, taskq_thread_t, tqt_active_list); if (w->tqt_id < tqt->tqt_id) { list_add(&tqt->tqt_active_list, l); break; } } if (l == &tq->tq_active_list) list_add(&tqt->tqt_active_list, &tq->tq_active_list); } /* * Find and return a task from the given list if it exists. The list * must be in lowest to highest task id order. */ static taskq_ent_t * taskq_find_list(taskq_t *tq, struct list_head *lh, taskqid_t id) { struct list_head *l = NULL; taskq_ent_t *t; list_for_each(l, lh) { t = list_entry(l, taskq_ent_t, tqent_list); if (t->tqent_id == id) return (t); if (t->tqent_id > id) break; } return (NULL); } /* * Find an already dispatched task given the task id regardless of what * state it is in. If a task is still pending it will be returned. * If a task is executing, then -EBUSY will be returned instead. * If the task has already been run then NULL is returned. */ static taskq_ent_t * taskq_find(taskq_t *tq, taskqid_t id) { taskq_thread_t *tqt; struct list_head *l = NULL; taskq_ent_t *t; t = taskq_find_list(tq, &tq->tq_delay_list, id); if (t) return (t); t = taskq_find_list(tq, &tq->tq_prio_list, id); if (t) return (t); t = taskq_find_list(tq, &tq->tq_pend_list, id); if (t) return (t); list_for_each(l, &tq->tq_active_list) { tqt = list_entry(l, taskq_thread_t, tqt_active_list); if (tqt->tqt_id == id) { /* * Instead of returning tqt_task, we just return a non * NULL value to prevent misuse, since tqt_task only * has two valid fields. */ return (ERR_PTR(-EBUSY)); } } return (NULL); } /* * Theory for the taskq_wait_id(), taskq_wait_outstanding(), and * taskq_wait() functions below. * * Taskq waiting is accomplished by tracking the lowest outstanding task * id and the next available task id. As tasks are dispatched they are * added to the tail of the pending, priority, or delay lists. As worker * threads become available the tasks are removed from the heads of these * lists and linked to the worker threads. This ensures the lists are * kept sorted by lowest to highest task id. * * Therefore the lowest outstanding task id can be quickly determined by * checking the head item from all of these lists. This value is stored * with the taskq as the lowest id. It only needs to be recalculated when * either the task with the current lowest id completes or is canceled. * * By blocking until the lowest task id exceeds the passed task id the * taskq_wait_outstanding() function can be easily implemented. Similarly, * by blocking until the lowest task id matches the next task id taskq_wait() * can be implemented. * * Callers should be aware that when there are multiple worked threads it * is possible for larger task ids to complete before smaller ones. Also * when the taskq contains delay tasks with small task ids callers may * block for a considerable length of time waiting for them to expire and * execute. */ static int taskq_wait_id_check(taskq_t *tq, taskqid_t id) { int rc; unsigned long flags; spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); rc = (taskq_find(tq, id) == NULL); spin_unlock_irqrestore(&tq->tq_lock, flags); return (rc); } /* * The taskq_wait_id() function blocks until the passed task id completes. * This does not guarantee that all lower task ids have completed. */ void taskq_wait_id(taskq_t *tq, taskqid_t id) { wait_event(tq->tq_wait_waitq, taskq_wait_id_check(tq, id)); } EXPORT_SYMBOL(taskq_wait_id); static int taskq_wait_outstanding_check(taskq_t *tq, taskqid_t id) { int rc; unsigned long flags; spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); rc = (id < tq->tq_lowest_id); spin_unlock_irqrestore(&tq->tq_lock, flags); return (rc); } /* * The taskq_wait_outstanding() function will block until all tasks with a * lower taskqid than the passed 'id' have been completed. Note that all * task id's are assigned monotonically at dispatch time. Zero may be * passed for the id to indicate all tasks dispatch up to this point, * but not after, should be waited for. */ void taskq_wait_outstanding(taskq_t *tq, taskqid_t id) { id = id ? id : tq->tq_next_id - 1; wait_event(tq->tq_wait_waitq, taskq_wait_outstanding_check(tq, id)); } EXPORT_SYMBOL(taskq_wait_outstanding); static int taskq_wait_check(taskq_t *tq) { int rc; unsigned long flags; spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); rc = (tq->tq_lowest_id == tq->tq_next_id); spin_unlock_irqrestore(&tq->tq_lock, flags); return (rc); } /* * The taskq_wait() function will block until the taskq is empty. * This means that if a taskq re-dispatches work to itself taskq_wait() * callers will block indefinitely. */ void taskq_wait(taskq_t *tq) { wait_event(tq->tq_wait_waitq, taskq_wait_check(tq)); } EXPORT_SYMBOL(taskq_wait); int taskq_member(taskq_t *tq, kthread_t *t) { return (tq == (taskq_t *)tsd_get_by_thread(taskq_tsd, t)); } EXPORT_SYMBOL(taskq_member); taskq_t * taskq_of_curthread(void) { return (tsd_get(taskq_tsd)); } EXPORT_SYMBOL(taskq_of_curthread); /* * Cancel an already dispatched task given the task id. Still pending tasks * will be immediately canceled, and if the task is active the function will * block until it completes. Preallocated tasks which are canceled must be * freed by the caller. */ int taskq_cancel_id(taskq_t *tq, taskqid_t id) { taskq_ent_t *t; int rc = ENOENT; unsigned long flags; ASSERT(tq); spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); t = taskq_find(tq, id); if (t && t != ERR_PTR(-EBUSY)) { list_del_init(&t->tqent_list); TQSTAT_DEC_LIST(tq, t); t->tqent_flags |= TQENT_FLAG_CANCEL; TQSTAT_INC(tq, tasks_cancelled); /* * When canceling the lowest outstanding task id we * must recalculate the new lowest outstanding id. */ if (tq->tq_lowest_id == t->tqent_id) { tq->tq_lowest_id = taskq_lowest_id(tq); ASSERT3S(tq->tq_lowest_id, >, t->tqent_id); } /* * The task_expire() function takes the tq->tq_lock so drop * drop the lock before synchronously cancelling the timer. */ if (timer_pending(&t->tqent_timer)) { spin_unlock_irqrestore(&tq->tq_lock, flags); del_timer_sync(&t->tqent_timer); spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); } if (!(t->tqent_flags & TQENT_FLAG_PREALLOC)) task_done(tq, t); rc = 0; } spin_unlock_irqrestore(&tq->tq_lock, flags); if (t == ERR_PTR(-EBUSY)) { taskq_wait_id(tq, id); rc = EBUSY; } return (rc); } EXPORT_SYMBOL(taskq_cancel_id); static int taskq_thread_spawn(taskq_t *tq); taskqid_t taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) { taskq_ent_t *t; taskqid_t rc = TASKQID_INVALID; unsigned long irqflags; ASSERT(tq); ASSERT(func); spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class); /* Taskq being destroyed and all tasks drained */ if (!(tq->tq_flags & TASKQ_ACTIVE)) goto out; /* Do not queue the task unless there is idle thread for it */ ASSERT(tq->tq_nactive <= tq->tq_nthreads); if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) { /* Dynamic taskq may be able to spawn another thread */ if (taskq_thread_spawn(tq) == 0) goto out; } if ((t = task_alloc(tq, flags, &irqflags)) == NULL) goto out; spin_lock(&t->tqent_lock); /* Queue to the front of the list to enforce TQ_NOQUEUE semantics */ if (flags & TQ_NOQUEUE) { TQENT_SET_LIST(t, TQENT_LIST_PRIORITY); list_add(&t->tqent_list, &tq->tq_prio_list); /* Queue to the priority list instead of the pending list */ } else if (flags & TQ_FRONT) { TQENT_SET_LIST(t, TQENT_LIST_PRIORITY); list_add_tail(&t->tqent_list, &tq->tq_prio_list); } else { TQENT_SET_LIST(t, TQENT_LIST_PENDING); list_add_tail(&t->tqent_list, &tq->tq_pend_list); } TQSTAT_INC_LIST(tq, t); TQSTAT_INC(tq, tasks_total); t->tqent_id = rc = tq->tq_next_id; tq->tq_next_id++; t->tqent_func = func; t->tqent_arg = arg; t->tqent_taskq = tq; t->tqent_timer.function = NULL; t->tqent_timer.expires = 0; t->tqent_birth = jiffies; DTRACE_PROBE1(taskq_ent__birth, taskq_ent_t *, t); ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); spin_unlock(&t->tqent_lock); wake_up(&tq->tq_work_waitq); TQSTAT_INC(tq, tasks_dispatched); /* Spawn additional taskq threads if required. */ if (!(flags & TQ_NOQUEUE) && tq->tq_nactive == tq->tq_nthreads) (void) taskq_thread_spawn(tq); out: spin_unlock_irqrestore(&tq->tq_lock, irqflags); return (rc); } EXPORT_SYMBOL(taskq_dispatch); taskqid_t taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, uint_t flags, clock_t expire_time) { taskqid_t rc = TASKQID_INVALID; taskq_ent_t *t; unsigned long irqflags; ASSERT(tq); ASSERT(func); spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class); /* Taskq being destroyed and all tasks drained */ if (!(tq->tq_flags & TASKQ_ACTIVE)) goto out; if ((t = task_alloc(tq, flags, &irqflags)) == NULL) goto out; spin_lock(&t->tqent_lock); /* Queue to the delay list for subsequent execution */ list_add_tail(&t->tqent_list, &tq->tq_delay_list); TQENT_SET_LIST(t, TQENT_LIST_DELAY); TQSTAT_INC_LIST(tq, t); t->tqent_id = rc = tq->tq_next_id; tq->tq_next_id++; t->tqent_func = func; t->tqent_arg = arg; t->tqent_taskq = tq; t->tqent_timer.function = task_expire; t->tqent_timer.expires = (unsigned long)expire_time; add_timer(&t->tqent_timer); ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); spin_unlock(&t->tqent_lock); TQSTAT_INC(tq, tasks_dispatched_delayed); /* Spawn additional taskq threads if required. */ if (tq->tq_nactive == tq->tq_nthreads) (void) taskq_thread_spawn(tq); out: spin_unlock_irqrestore(&tq->tq_lock, irqflags); return (rc); } EXPORT_SYMBOL(taskq_dispatch_delay); void taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, taskq_ent_t *t) { unsigned long irqflags; ASSERT(tq); ASSERT(func); spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class); /* Taskq being destroyed and all tasks drained */ if (!(tq->tq_flags & TASKQ_ACTIVE)) { t->tqent_id = TASKQID_INVALID; goto out; } if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) { /* Dynamic taskq may be able to spawn another thread */ if (taskq_thread_spawn(tq) == 0) goto out; flags |= TQ_FRONT; } spin_lock(&t->tqent_lock); /* * Make sure the entry is not on some other taskq; it is important to * ASSERT() under lock */ ASSERT(taskq_empty_ent(t)); /* * Mark it as a prealloc'd task. This is important * to ensure that we don't free it later. */ t->tqent_flags |= TQENT_FLAG_PREALLOC; /* Queue to the priority list instead of the pending list */ if (flags & TQ_FRONT) { TQENT_SET_LIST(t, TQENT_LIST_PRIORITY); list_add_tail(&t->tqent_list, &tq->tq_prio_list); } else { TQENT_SET_LIST(t, TQENT_LIST_PENDING); list_add_tail(&t->tqent_list, &tq->tq_pend_list); } TQSTAT_INC_LIST(tq, t); TQSTAT_INC(tq, tasks_total); t->tqent_id = tq->tq_next_id; tq->tq_next_id++; t->tqent_func = func; t->tqent_arg = arg; t->tqent_taskq = tq; t->tqent_birth = jiffies; DTRACE_PROBE1(taskq_ent__birth, taskq_ent_t *, t); spin_unlock(&t->tqent_lock); wake_up(&tq->tq_work_waitq); TQSTAT_INC(tq, tasks_dispatched); /* Spawn additional taskq threads if required. */ if (tq->tq_nactive == tq->tq_nthreads) (void) taskq_thread_spawn(tq); out: spin_unlock_irqrestore(&tq->tq_lock, irqflags); } EXPORT_SYMBOL(taskq_dispatch_ent); int taskq_empty_ent(taskq_ent_t *t) { return (list_empty(&t->tqent_list)); } EXPORT_SYMBOL(taskq_empty_ent); void taskq_init_ent(taskq_ent_t *t) { spin_lock_init(&t->tqent_lock); init_waitqueue_head(&t->tqent_waitq); timer_setup(&t->tqent_timer, NULL, 0); INIT_LIST_HEAD(&t->tqent_list); t->tqent_id = 0; t->tqent_func = NULL; t->tqent_arg = NULL; t->tqent_flags = 0; t->tqent_taskq = NULL; } EXPORT_SYMBOL(taskq_init_ent); /* * Return the next pending task, preference is given to tasks on the * priority list which were dispatched with TQ_FRONT. */ static taskq_ent_t * taskq_next_ent(taskq_t *tq) { struct list_head *list; if (!list_empty(&tq->tq_prio_list)) list = &tq->tq_prio_list; else if (!list_empty(&tq->tq_pend_list)) list = &tq->tq_pend_list; else return (NULL); return (list_entry(list->next, taskq_ent_t, tqent_list)); } /* * Spawns a new thread for the specified taskq. */ static void taskq_thread_spawn_task(void *arg) { taskq_t *tq = (taskq_t *)arg; unsigned long flags; if (taskq_thread_create(tq) == NULL) { /* restore spawning count if failed */ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); tq->tq_nspawn--; spin_unlock_irqrestore(&tq->tq_lock, flags); } } /* * Spawn addition threads for dynamic taskqs (TASKQ_DYNAMIC) the current * number of threads is insufficient to handle the pending tasks. These * new threads must be created by the dedicated dynamic_taskq to avoid * deadlocks between thread creation and memory reclaim. The system_taskq * which is also a dynamic taskq cannot be safely used for this. */ static int taskq_thread_spawn(taskq_t *tq) { int spawning = 0; if (!(tq->tq_flags & TASKQ_DYNAMIC)) return (0); tq->lastspawnstop = jiffies; if ((tq->tq_nthreads + tq->tq_nspawn < tq->tq_maxthreads) && (tq->tq_flags & TASKQ_ACTIVE)) { spawning = (++tq->tq_nspawn); taskq_dispatch(dynamic_taskq, taskq_thread_spawn_task, tq, TQ_NOSLEEP); } return (spawning); } /* * Threads in a dynamic taskq may exit once there is no more work to do. * To prevent threads from being created and destroyed too often limit * the exit rate to one per spl_taskq_thread_timeout_ms. * * The first thread is the thread list is treated as the primary thread. * There is nothing special about the primary thread but in order to avoid * all the taskq pids from changing we opt to make it long running. */ static int taskq_thread_should_stop(taskq_t *tq, taskq_thread_t *tqt) { ASSERT(!taskq_next_ent(tq)); if (!(tq->tq_flags & TASKQ_DYNAMIC) || !spl_taskq_thread_dynamic) return (0); if (!(tq->tq_flags & TASKQ_ACTIVE)) return (1); if (list_first_entry(&(tq->tq_thread_list), taskq_thread_t, tqt_thread_list) == tqt) return (0); ASSERT3U(tq->tq_nthreads, >, 1); if (tq->tq_nspawn != 0) return (0); if (time_before(jiffies, tq->lastspawnstop + msecs_to_jiffies(spl_taskq_thread_timeout_ms))) return (0); tq->lastspawnstop = jiffies; return (1); } static int taskq_thread(void *args) { DECLARE_WAITQUEUE(wait, current); sigset_t blocked; taskq_thread_t *tqt = args; taskq_t *tq; taskq_ent_t *t; int seq_tasks = 0; unsigned long flags; taskq_ent_t dup_task = {}; ASSERT(tqt); ASSERT(tqt->tqt_tq); tq = tqt->tqt_tq; current->flags |= PF_NOFREEZE; (void) spl_fstrans_mark(); sigfillset(&blocked); sigprocmask(SIG_BLOCK, &blocked, NULL); flush_signals(current); tsd_set(taskq_tsd, tq); spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); /* * If we are dynamically spawned, decrease spawning count. Note that * we could be created during taskq_create, in which case we shouldn't * do the decrement. But it's fine because taskq_create will reset * tq_nspawn later. */ if (tq->tq_flags & TASKQ_DYNAMIC) tq->tq_nspawn--; /* Immediately exit if more threads than allowed were created. */ if (tq->tq_nthreads >= tq->tq_maxthreads) goto error; tq->tq_nthreads++; list_add_tail(&tqt->tqt_thread_list, &tq->tq_thread_list); wake_up(&tq->tq_wait_waitq); set_current_state(TASK_INTERRUPTIBLE); TQSTAT_INC(tq, threads_total); while (!kthread_should_stop()) { if (list_empty(&tq->tq_pend_list) && list_empty(&tq->tq_prio_list)) { if (taskq_thread_should_stop(tq, tqt)) break; add_wait_queue_exclusive(&tq->tq_work_waitq, &wait); spin_unlock_irqrestore(&tq->tq_lock, flags); TQSTAT_INC(tq, thread_sleeps); TQSTAT_INC(tq, threads_idle); schedule(); seq_tasks = 0; TQSTAT_DEC(tq, threads_idle); TQSTAT_INC(tq, thread_wakeups); spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); remove_wait_queue(&tq->tq_work_waitq, &wait); } else { __set_current_state(TASK_RUNNING); } if ((t = taskq_next_ent(tq)) != NULL) { list_del_init(&t->tqent_list); TQSTAT_DEC_LIST(tq, t); TQSTAT_DEC(tq, tasks_total); /* * A TQENT_FLAG_PREALLOC task may be reused or freed * during the task function call. Store tqent_id and * tqent_flags here. * * Also use an on stack taskq_ent_t for tqt_task * assignment in this case; we want to make sure * to duplicate all fields, so the values are * correct when it's accessed via DTRACE_PROBE*. */ tqt->tqt_id = t->tqent_id; tqt->tqt_flags = t->tqent_flags; if (t->tqent_flags & TQENT_FLAG_PREALLOC) { dup_task = *t; t = &dup_task; } tqt->tqt_task = t; taskq_insert_in_order(tq, tqt); tq->tq_nactive++; spin_unlock_irqrestore(&tq->tq_lock, flags); TQSTAT_INC(tq, threads_active); DTRACE_PROBE1(taskq_ent__start, taskq_ent_t *, t); /* Perform the requested task */ t->tqent_func(t->tqent_arg); DTRACE_PROBE1(taskq_ent__finish, taskq_ent_t *, t); TQSTAT_DEC(tq, threads_active); if ((t->tqent_flags & TQENT_LIST_MASK) == TQENT_LIST_PENDING) TQSTAT_INC(tq, tasks_executed_normal); else TQSTAT_INC(tq, tasks_executed_priority); TQSTAT_INC(tq, tasks_executed); spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); tq->tq_nactive--; list_del_init(&tqt->tqt_active_list); tqt->tqt_task = NULL; /* For prealloc'd tasks, we don't free anything. */ if (!(tqt->tqt_flags & TQENT_FLAG_PREALLOC)) task_done(tq, t); /* * When the current lowest outstanding taskqid is * done calculate the new lowest outstanding id */ if (tq->tq_lowest_id == tqt->tqt_id) { tq->tq_lowest_id = taskq_lowest_id(tq); ASSERT3S(tq->tq_lowest_id, >, tqt->tqt_id); } /* Spawn additional taskq threads if required. */ if ((++seq_tasks) > spl_taskq_thread_sequential && taskq_thread_spawn(tq)) seq_tasks = 0; tqt->tqt_id = TASKQID_INVALID; tqt->tqt_flags = 0; wake_up_all(&tq->tq_wait_waitq); } else TQSTAT_INC(tq, thread_wakeups_nowork); set_current_state(TASK_INTERRUPTIBLE); } __set_current_state(TASK_RUNNING); tq->tq_nthreads--; list_del_init(&tqt->tqt_thread_list); TQSTAT_DEC(tq, threads_total); TQSTAT_INC(tq, threads_destroyed); error: kmem_free(tqt, sizeof (taskq_thread_t)); spin_unlock_irqrestore(&tq->tq_lock, flags); tsd_set(taskq_tsd, NULL); thread_exit(); return (0); } static taskq_thread_t * taskq_thread_create(taskq_t *tq) { static int last_used_cpu = 0; taskq_thread_t *tqt; tqt = kmem_alloc(sizeof (*tqt), KM_PUSHPAGE); INIT_LIST_HEAD(&tqt->tqt_thread_list); INIT_LIST_HEAD(&tqt->tqt_active_list); tqt->tqt_tq = tq; tqt->tqt_id = TASKQID_INVALID; tqt->tqt_thread = spl_kthread_create(taskq_thread, tqt, "%s", tq->tq_name); if (tqt->tqt_thread == NULL) { kmem_free(tqt, sizeof (taskq_thread_t)); return (NULL); } if (spl_taskq_thread_bind) { last_used_cpu = (last_used_cpu + 1) % num_online_cpus(); kthread_bind(tqt->tqt_thread, last_used_cpu); } if (spl_taskq_thread_priority) set_user_nice(tqt->tqt_thread, PRIO_TO_NICE(tq->tq_pri)); wake_up_process(tqt->tqt_thread); TQSTAT_INC(tq, threads_created); return (tqt); } static void taskq_stats_init(taskq_t *tq) { taskq_sums_t *tqs = &tq->tq_sums; wmsum_init(&tqs->tqs_threads_active, 0); wmsum_init(&tqs->tqs_threads_idle, 0); wmsum_init(&tqs->tqs_threads_total, 0); wmsum_init(&tqs->tqs_tasks_pending, 0); wmsum_init(&tqs->tqs_tasks_priority, 0); wmsum_init(&tqs->tqs_tasks_total, 0); wmsum_init(&tqs->tqs_tasks_delayed, 0); wmsum_init(&tqs->tqs_entries_free, 0); wmsum_init(&tqs->tqs_threads_created, 0); wmsum_init(&tqs->tqs_threads_destroyed, 0); wmsum_init(&tqs->tqs_tasks_dispatched, 0); wmsum_init(&tqs->tqs_tasks_dispatched_delayed, 0); wmsum_init(&tqs->tqs_tasks_executed_normal, 0); wmsum_init(&tqs->tqs_tasks_executed_priority, 0); wmsum_init(&tqs->tqs_tasks_executed, 0); wmsum_init(&tqs->tqs_tasks_delayed_requeued, 0); wmsum_init(&tqs->tqs_tasks_cancelled, 0); wmsum_init(&tqs->tqs_thread_wakeups, 0); wmsum_init(&tqs->tqs_thread_wakeups_nowork, 0); wmsum_init(&tqs->tqs_thread_sleeps, 0); } static void taskq_stats_fini(taskq_t *tq) { taskq_sums_t *tqs = &tq->tq_sums; wmsum_fini(&tqs->tqs_threads_active); wmsum_fini(&tqs->tqs_threads_idle); wmsum_fini(&tqs->tqs_threads_total); wmsum_fini(&tqs->tqs_tasks_pending); wmsum_fini(&tqs->tqs_tasks_priority); wmsum_fini(&tqs->tqs_tasks_total); wmsum_fini(&tqs->tqs_tasks_delayed); wmsum_fini(&tqs->tqs_entries_free); wmsum_fini(&tqs->tqs_threads_created); wmsum_fini(&tqs->tqs_threads_destroyed); wmsum_fini(&tqs->tqs_tasks_dispatched); wmsum_fini(&tqs->tqs_tasks_dispatched_delayed); wmsum_fini(&tqs->tqs_tasks_executed_normal); wmsum_fini(&tqs->tqs_tasks_executed_priority); wmsum_fini(&tqs->tqs_tasks_executed); wmsum_fini(&tqs->tqs_tasks_delayed_requeued); wmsum_fini(&tqs->tqs_tasks_cancelled); wmsum_fini(&tqs->tqs_thread_wakeups); wmsum_fini(&tqs->tqs_thread_wakeups_nowork); wmsum_fini(&tqs->tqs_thread_sleeps); } static int taskq_kstats_update(kstat_t *ksp, int rw) { if (rw == KSTAT_WRITE) return (EACCES); taskq_t *tq = ksp->ks_private; taskq_kstats_t *tqks = ksp->ks_data; tqks->tqks_threads_max.value.ui64 = tq->tq_maxthreads; tqks->tqks_entry_pool_min.value.ui64 = tq->tq_minalloc; tqks->tqks_entry_pool_max.value.ui64 = tq->tq_maxalloc; taskq_sums_t *tqs = &tq->tq_sums; tqks->tqks_threads_active.value.ui64 = wmsum_value(&tqs->tqs_threads_active); tqks->tqks_threads_idle.value.ui64 = wmsum_value(&tqs->tqs_threads_idle); tqks->tqks_threads_total.value.ui64 = wmsum_value(&tqs->tqs_threads_total); tqks->tqks_tasks_pending.value.ui64 = wmsum_value(&tqs->tqs_tasks_pending); tqks->tqks_tasks_priority.value.ui64 = wmsum_value(&tqs->tqs_tasks_priority); tqks->tqks_tasks_total.value.ui64 = wmsum_value(&tqs->tqs_tasks_total); tqks->tqks_tasks_delayed.value.ui64 = wmsum_value(&tqs->tqs_tasks_delayed); tqks->tqks_entries_free.value.ui64 = wmsum_value(&tqs->tqs_entries_free); tqks->tqks_threads_created.value.ui64 = wmsum_value(&tqs->tqs_threads_created); tqks->tqks_threads_destroyed.value.ui64 = wmsum_value(&tqs->tqs_threads_destroyed); tqks->tqks_tasks_dispatched.value.ui64 = wmsum_value(&tqs->tqs_tasks_dispatched); tqks->tqks_tasks_dispatched_delayed.value.ui64 = wmsum_value(&tqs->tqs_tasks_dispatched_delayed); tqks->tqks_tasks_executed_normal.value.ui64 = wmsum_value(&tqs->tqs_tasks_executed_normal); tqks->tqks_tasks_executed_priority.value.ui64 = wmsum_value(&tqs->tqs_tasks_executed_priority); tqks->tqks_tasks_executed.value.ui64 = wmsum_value(&tqs->tqs_tasks_executed); tqks->tqks_tasks_delayed_requeued.value.ui64 = wmsum_value(&tqs->tqs_tasks_delayed_requeued); tqks->tqks_tasks_cancelled.value.ui64 = wmsum_value(&tqs->tqs_tasks_cancelled); tqks->tqks_thread_wakeups.value.ui64 = wmsum_value(&tqs->tqs_thread_wakeups); tqks->tqks_thread_wakeups_nowork.value.ui64 = wmsum_value(&tqs->tqs_thread_wakeups_nowork); tqks->tqks_thread_sleeps.value.ui64 = wmsum_value(&tqs->tqs_thread_sleeps); return (0); } static void taskq_kstats_init(taskq_t *tq) { char name[TASKQ_NAMELEN+5]; /* 5 for dot, 3x instance digits, null */ snprintf(name, sizeof (name), "%s.%d", tq->tq_name, tq->tq_instance); kstat_t *ksp = kstat_create("taskq", 0, name, "misc", KSTAT_TYPE_NAMED, sizeof (taskq_kstats_t) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (ksp == NULL) return; ksp->ks_private = tq; ksp->ks_update = taskq_kstats_update; ksp->ks_data = kmem_alloc(sizeof (taskq_kstats_t), KM_SLEEP); memcpy(ksp->ks_data, &taskq_kstats_template, sizeof (taskq_kstats_t)); kstat_install(ksp); tq->tq_ksp = ksp; } static void taskq_kstats_fini(taskq_t *tq) { if (tq->tq_ksp == NULL) return; kmem_free(tq->tq_ksp->ks_data, sizeof (taskq_kstats_t)); kstat_delete(tq->tq_ksp); tq->tq_ksp = NULL; } taskq_t * taskq_create(const char *name, int threads_arg, pri_t pri, int minalloc, int maxalloc, uint_t flags) { taskq_t *tq; taskq_thread_t *tqt; int count = 0, rc = 0, i; unsigned long irqflags; int nthreads = threads_arg; ASSERT(name != NULL); ASSERT(minalloc >= 0); ASSERT(!(flags & (TASKQ_CPR_SAFE))); /* Unsupported */ /* Scale the number of threads using nthreads as a percentage */ if (flags & TASKQ_THREADS_CPU_PCT) { ASSERT(nthreads <= 100); ASSERT(nthreads >= 0); nthreads = MIN(threads_arg, 100); nthreads = MAX(nthreads, 0); nthreads = MAX((num_online_cpus() * nthreads) /100, 1); } tq = kmem_alloc(sizeof (*tq), KM_PUSHPAGE); if (tq == NULL) return (NULL); tq->tq_hp_support = B_FALSE; #ifdef HAVE_CPU_HOTPLUG if (flags & TASKQ_THREADS_CPU_PCT) { tq->tq_hp_support = B_TRUE; if (cpuhp_state_add_instance_nocalls(spl_taskq_cpuhp_state, &tq->tq_hp_cb_node) != 0) { kmem_free(tq, sizeof (*tq)); return (NULL); } } #endif spin_lock_init(&tq->tq_lock); INIT_LIST_HEAD(&tq->tq_thread_list); INIT_LIST_HEAD(&tq->tq_active_list); tq->tq_name = kmem_strdup(name); tq->tq_nactive = 0; tq->tq_nthreads = 0; tq->tq_nspawn = 0; tq->tq_maxthreads = nthreads; tq->tq_cpu_pct = threads_arg; tq->tq_pri = pri; tq->tq_minalloc = minalloc; tq->tq_maxalloc = maxalloc; tq->tq_nalloc = 0; tq->tq_flags = (flags | TASKQ_ACTIVE); tq->tq_next_id = TASKQID_INITIAL; tq->tq_lowest_id = TASKQID_INITIAL; tq->lastspawnstop = jiffies; INIT_LIST_HEAD(&tq->tq_free_list); INIT_LIST_HEAD(&tq->tq_pend_list); INIT_LIST_HEAD(&tq->tq_prio_list); INIT_LIST_HEAD(&tq->tq_delay_list); init_waitqueue_head(&tq->tq_work_waitq); init_waitqueue_head(&tq->tq_wait_waitq); tq->tq_lock_class = TQ_LOCK_GENERAL; INIT_LIST_HEAD(&tq->tq_taskqs); taskq_stats_init(tq); if (flags & TASKQ_PREPOPULATE) { spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class); for (i = 0; i < minalloc; i++) task_done(tq, task_alloc(tq, TQ_PUSHPAGE | TQ_NEW, &irqflags)); spin_unlock_irqrestore(&tq->tq_lock, irqflags); } if ((flags & TASKQ_DYNAMIC) && spl_taskq_thread_dynamic) nthreads = 1; for (i = 0; i < nthreads; i++) { tqt = taskq_thread_create(tq); if (tqt == NULL) rc = 1; else count++; } /* Wait for all threads to be started before potential destroy */ wait_event(tq->tq_wait_waitq, tq->tq_nthreads == count); /* * taskq_thread might have touched nspawn, but we don't want them to * because they're not dynamically spawned. So we reset it to 0 */ tq->tq_nspawn = 0; if (rc) { taskq_destroy(tq); return (NULL); } down_write(&tq_list_sem); tq->tq_instance = taskq_find_by_name(name) + 1; list_add_tail(&tq->tq_taskqs, &tq_list); up_write(&tq_list_sem); /* Install kstats late, because the name includes tq_instance */ taskq_kstats_init(tq); return (tq); } EXPORT_SYMBOL(taskq_create); void taskq_destroy(taskq_t *tq) { struct task_struct *thread; taskq_thread_t *tqt; taskq_ent_t *t; unsigned long flags; ASSERT(tq); spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); tq->tq_flags &= ~TASKQ_ACTIVE; spin_unlock_irqrestore(&tq->tq_lock, flags); #ifdef HAVE_CPU_HOTPLUG if (tq->tq_hp_support) { VERIFY0(cpuhp_state_remove_instance_nocalls( spl_taskq_cpuhp_state, &tq->tq_hp_cb_node)); } #endif /* * When TASKQ_ACTIVE is clear new tasks may not be added nor may * new worker threads be spawned for dynamic taskq. */ if (dynamic_taskq != NULL) taskq_wait_outstanding(dynamic_taskq, 0); taskq_wait(tq); taskq_kstats_fini(tq); /* remove taskq from global list used by the kstats */ down_write(&tq_list_sem); list_del(&tq->tq_taskqs); up_write(&tq_list_sem); spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); /* wait for spawning threads to insert themselves to the list */ while (tq->tq_nspawn) { spin_unlock_irqrestore(&tq->tq_lock, flags); schedule_timeout_interruptible(1); spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); } /* * Signal each thread to exit and block until it does. Each thread * is responsible for removing itself from the list and freeing its * taskq_thread_t. This allows for idle threads to opt to remove * themselves from the taskq. They can be recreated as needed. */ while (!list_empty(&tq->tq_thread_list)) { tqt = list_entry(tq->tq_thread_list.next, taskq_thread_t, tqt_thread_list); thread = tqt->tqt_thread; spin_unlock_irqrestore(&tq->tq_lock, flags); kthread_stop(thread); spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); } while (!list_empty(&tq->tq_free_list)) { t = list_entry(tq->tq_free_list.next, taskq_ent_t, tqent_list); ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); list_del_init(&t->tqent_list); task_free(tq, t); } ASSERT0(tq->tq_nthreads); ASSERT0(tq->tq_nalloc); ASSERT0(tq->tq_nspawn); ASSERT(list_empty(&tq->tq_thread_list)); ASSERT(list_empty(&tq->tq_active_list)); ASSERT(list_empty(&tq->tq_free_list)); ASSERT(list_empty(&tq->tq_pend_list)); ASSERT(list_empty(&tq->tq_prio_list)); ASSERT(list_empty(&tq->tq_delay_list)); spin_unlock_irqrestore(&tq->tq_lock, flags); taskq_stats_fini(tq); kmem_strfree(tq->tq_name); kmem_free(tq, sizeof (taskq_t)); } EXPORT_SYMBOL(taskq_destroy); /* * Create a taskq with a specified number of pool threads. Allocate * and return an array of nthreads kthread_t pointers, one for each * thread in the pool. The array is not ordered and must be freed * by the caller. */ taskq_t * taskq_create_synced(const char *name, int nthreads, pri_t pri, int minalloc, int maxalloc, uint_t flags, kthread_t ***ktpp) { taskq_t *tq; taskq_thread_t *tqt; int i = 0; kthread_t **kthreads = kmem_zalloc(sizeof (*kthreads) * nthreads, KM_SLEEP); flags &= ~(TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT | TASKQ_DC_BATCH); /* taskq_create spawns all the threads before returning */ tq = taskq_create(name, nthreads, minclsyspri, nthreads, INT_MAX, flags | TASKQ_PREPOPULATE); VERIFY(tq != NULL); VERIFY(tq->tq_nthreads == nthreads); list_for_each_entry(tqt, &tq->tq_thread_list, tqt_thread_list) { kthreads[i] = tqt->tqt_thread; i++; } ASSERT3S(i, ==, nthreads); *ktpp = kthreads; return (tq); } EXPORT_SYMBOL(taskq_create_synced); static kstat_t *taskq_summary_ksp = NULL; static int spl_taskq_kstat_headers(char *buf, size_t size) { size_t n = snprintf(buf, size, "%-20s | %-17s | %-23s\n" "%-20s | %-17s | %-23s\n" "%-20s | %-17s | %-23s\n", "", "threads", "tasks on queue", "taskq name", "tot [act idl] max", " pend [ norm high] dly", "--------------------", "-----------------", "-----------------------"); return (n >= size ? ENOMEM : 0); } static int spl_taskq_kstat_data(char *buf, size_t size, void *data) { struct list_head *tql = NULL; taskq_t *tq; char name[TASKQ_NAMELEN+5]; /* 5 for dot, 3x instance digits, null */ char threads[25]; char tasks[30]; size_t n; int err = 0; down_read(&tq_list_sem); list_for_each_prev(tql, &tq_list) { tq = list_entry(tql, taskq_t, tq_taskqs); mutex_enter(tq->tq_ksp->ks_lock); taskq_kstats_update(tq->tq_ksp, KSTAT_READ); taskq_kstats_t *tqks = tq->tq_ksp->ks_data; snprintf(name, sizeof (name), "%s.%d", tq->tq_name, tq->tq_instance); snprintf(threads, sizeof (threads), "%3llu [%3llu %3llu] %3llu", tqks->tqks_threads_total.value.ui64, tqks->tqks_threads_active.value.ui64, tqks->tqks_threads_idle.value.ui64, tqks->tqks_threads_max.value.ui64); snprintf(tasks, sizeof (tasks), "%5llu [%5llu %5llu] %3llu", tqks->tqks_tasks_total.value.ui64, tqks->tqks_tasks_pending.value.ui64, tqks->tqks_tasks_priority.value.ui64, tqks->tqks_tasks_delayed.value.ui64); mutex_exit(tq->tq_ksp->ks_lock); n = snprintf(buf, size, "%-20s | %-17s | %-23s\n", name, threads, tasks); if (n >= size) { err = ENOMEM; break; } buf = &buf[n]; size -= n; } up_read(&tq_list_sem); return (err); } static void spl_taskq_kstat_init(void) { kstat_t *ksp = kstat_create("taskq", 0, "summary", "misc", KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); if (ksp == NULL) return; ksp->ks_data = (void *)(uintptr_t)1; ksp->ks_ndata = 1; kstat_set_raw_ops(ksp, spl_taskq_kstat_headers, spl_taskq_kstat_data, NULL); kstat_install(ksp); taskq_summary_ksp = ksp; } static void spl_taskq_kstat_fini(void) { if (taskq_summary_ksp == NULL) return; kstat_delete(taskq_summary_ksp); taskq_summary_ksp = NULL; } static unsigned int spl_taskq_kick = 0; /* * 2.6.36 API Change * module_param_cb is introduced to take kernel_param_ops and * module_param_call is marked as obsolete. Also set and get operations * were changed to take a 'const struct kernel_param *'. */ static int #ifdef module_param_cb param_set_taskq_kick(const char *val, const struct kernel_param *kp) #else param_set_taskq_kick(const char *val, struct kernel_param *kp) #endif { int ret; taskq_t *tq = NULL; taskq_ent_t *t; unsigned long flags; ret = param_set_uint(val, kp); if (ret < 0 || !spl_taskq_kick) return (ret); /* reset value */ spl_taskq_kick = 0; down_read(&tq_list_sem); list_for_each_entry(tq, &tq_list, tq_taskqs) { spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); /* Check if the first pending is older than 5 seconds */ t = taskq_next_ent(tq); if (t && time_after(jiffies, t->tqent_birth + 5*HZ)) { (void) taskq_thread_spawn(tq); printk(KERN_INFO "spl: Kicked taskq %s/%d\n", tq->tq_name, tq->tq_instance); } spin_unlock_irqrestore(&tq->tq_lock, flags); } up_read(&tq_list_sem); return (ret); } #ifdef module_param_cb static const struct kernel_param_ops param_ops_taskq_kick = { .set = param_set_taskq_kick, .get = param_get_uint, }; module_param_cb(spl_taskq_kick, ¶m_ops_taskq_kick, &spl_taskq_kick, 0644); #else module_param_call(spl_taskq_kick, param_set_taskq_kick, param_get_uint, &spl_taskq_kick, 0644); #endif MODULE_PARM_DESC(spl_taskq_kick, "Write nonzero to kick stuck taskqs to spawn more threads"); #ifdef HAVE_CPU_HOTPLUG /* * This callback will be called exactly once for each core that comes online, * for each dynamic taskq. We attempt to expand taskqs that have * TASKQ_THREADS_CPU_PCT set. We need to redo the percentage calculation every * time, to correctly determine whether or not to add a thread. */ static int spl_taskq_expand(unsigned int cpu, struct hlist_node *node) { taskq_t *tq = list_entry(node, taskq_t, tq_hp_cb_node); unsigned long flags; int err = 0; ASSERT(tq); spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); if (!(tq->tq_flags & TASKQ_ACTIVE)) { spin_unlock_irqrestore(&tq->tq_lock, flags); return (err); } ASSERT(tq->tq_flags & TASKQ_THREADS_CPU_PCT); int nthreads = MIN(tq->tq_cpu_pct, 100); nthreads = MAX(((num_online_cpus() + 1) * nthreads) / 100, 1); tq->tq_maxthreads = nthreads; if (!((tq->tq_flags & TASKQ_DYNAMIC) && spl_taskq_thread_dynamic) && tq->tq_maxthreads > tq->tq_nthreads) { spin_unlock_irqrestore(&tq->tq_lock, flags); taskq_thread_t *tqt = taskq_thread_create(tq); if (tqt == NULL) err = -1; return (err); } spin_unlock_irqrestore(&tq->tq_lock, flags); return (err); } /* * While we don't support offlining CPUs, it is possible that CPUs will fail * to online successfully. We do need to be able to handle this case * gracefully. */ static int spl_taskq_prepare_down(unsigned int cpu, struct hlist_node *node) { taskq_t *tq = list_entry(node, taskq_t, tq_hp_cb_node); unsigned long flags; ASSERT(tq); spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); if (!(tq->tq_flags & TASKQ_ACTIVE)) goto out; ASSERT(tq->tq_flags & TASKQ_THREADS_CPU_PCT); int nthreads = MIN(tq->tq_cpu_pct, 100); nthreads = MAX(((num_online_cpus()) * nthreads) / 100, 1); tq->tq_maxthreads = nthreads; if (!((tq->tq_flags & TASKQ_DYNAMIC) && spl_taskq_thread_dynamic) && tq->tq_maxthreads < tq->tq_nthreads) { ASSERT3U(tq->tq_maxthreads, ==, tq->tq_nthreads - 1); taskq_thread_t *tqt = list_entry(tq->tq_thread_list.next, taskq_thread_t, tqt_thread_list); struct task_struct *thread = tqt->tqt_thread; spin_unlock_irqrestore(&tq->tq_lock, flags); kthread_stop(thread); return (0); } out: spin_unlock_irqrestore(&tq->tq_lock, flags); return (0); } #endif int spl_taskq_init(void) { init_rwsem(&tq_list_sem); tsd_create(&taskq_tsd, NULL); #ifdef HAVE_CPU_HOTPLUG spl_taskq_cpuhp_state = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "fs/spl_taskq:online", spl_taskq_expand, spl_taskq_prepare_down); #endif system_taskq = taskq_create("spl_system_taskq", MAX(boot_ncpus, 64), maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC); if (system_taskq == NULL) return (-ENOMEM); system_delay_taskq = taskq_create("spl_delay_taskq", MAX(boot_ncpus, 4), maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC); if (system_delay_taskq == NULL) { #ifdef HAVE_CPU_HOTPLUG cpuhp_remove_multi_state(spl_taskq_cpuhp_state); #endif taskq_destroy(system_taskq); return (-ENOMEM); } dynamic_taskq = taskq_create("spl_dynamic_taskq", 1, maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE); if (dynamic_taskq == NULL) { #ifdef HAVE_CPU_HOTPLUG cpuhp_remove_multi_state(spl_taskq_cpuhp_state); #endif taskq_destroy(system_taskq); taskq_destroy(system_delay_taskq); return (-ENOMEM); } /* * This is used to annotate tq_lock, so * taskq_dispatch -> taskq_thread_spawn -> taskq_dispatch * does not trigger a lockdep warning re: possible recursive locking */ dynamic_taskq->tq_lock_class = TQ_LOCK_DYNAMIC; spl_taskq_kstat_init(); return (0); } void spl_taskq_fini(void) { spl_taskq_kstat_fini(); taskq_destroy(dynamic_taskq); dynamic_taskq = NULL; taskq_destroy(system_delay_taskq); system_delay_taskq = NULL; taskq_destroy(system_taskq); system_taskq = NULL; tsd_destroy(&taskq_tsd); #ifdef HAVE_CPU_HOTPLUG cpuhp_remove_multi_state(spl_taskq_cpuhp_state); spl_taskq_cpuhp_state = 0; #endif }