Illumos #3581 spa_zio_taskq[ZIO_TYPE_FREE][ZIO_TASKQ_ISSUE]->tq_lock contention

3581 spa_zio_taskq[ZIO_TYPE_FREE][ZIO_TASKQ_ISSUE]->tq_lock is piping hot

Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Gordon Ross <gordon.ross@nexenta.com>
Approved by: Richard Lowe <richlowe@richlowe.net>

References:
  illumos/illumos-gate@ec94d32
  https://illumos.org/issues/3581

Notes for Linux port:

Earlier commit 08d08eb reduced contention on this taskq lock by simply
reducing the number of z_fr_iss threads from 100 to one-per-CPU.  We
also optimized the taskq implementation in zfsonlinux/spl@3c6ed54.
These changes significantly improved unlink performance to acceptable
levels.

This patch further reduces time spent spinning on this lock by
randomly dispatching the work items over multiple independent task
queues.  The Illumos ZFS developers stated that this lock contention
only arose after "3329 spa_sync() spends 10-20% of its time in
spa_free_sync_cb()" was landed.  It's not clear if 3329 affects the
Linux port or not.  I didn't see spa_free_sync_cb() show up in
oprofile sessions while unlinking large files, but I may just not
have used the right test case.

I tested unlinking a 1 TB of data with and without the patch and
didn't observe a meaningful difference in elapsed time.  However,
oprofile showed that the percent time spent in taskq_thread() was
reduced from about 16% to about 5%.  Aside from a possible slight
performance benefit this may be worth landing if only for the sake of
maintaining consistency with upstream.

Ported-by: Ned Bass <bass6@llnl.gov>
Closes #1327
This commit is contained in:
Adam Leventhal 2013-05-06 12:24:30 -07:00 committed by Brian Behlendorf
parent 55d85d5a8c
commit 7ef5e54e2e
4 changed files with 174 additions and 83 deletions

View File

@ -80,16 +80,16 @@ typedef struct spa_config_dirent {
char *scd_path; char *scd_path;
} spa_config_dirent_t; } spa_config_dirent_t;
enum zio_taskq_type { typedef enum zio_taskq_type {
ZIO_TASKQ_ISSUE = 0, ZIO_TASKQ_ISSUE = 0,
ZIO_TASKQ_ISSUE_HIGH, ZIO_TASKQ_ISSUE_HIGH,
ZIO_TASKQ_INTERRUPT, ZIO_TASKQ_INTERRUPT,
ZIO_TASKQ_INTERRUPT_HIGH, ZIO_TASKQ_INTERRUPT_HIGH,
ZIO_TASKQ_TYPES ZIO_TASKQ_TYPES
}; } zio_taskq_type_t;
/* /*
* State machine for the zpool-pooname process. The states transitions * State machine for the zpool-poolname process. The states transitions
* are done as follows: * are done as follows:
* *
* From To Routine * From To Routine
@ -107,6 +107,11 @@ typedef enum spa_proc_state {
SPA_PROC_GONE /* spa_thread() is exiting, spa_proc = &p0 */ SPA_PROC_GONE /* spa_thread() is exiting, spa_proc = &p0 */
} spa_proc_state_t; } spa_proc_state_t;
typedef struct spa_taskqs {
uint_t stqs_count;
taskq_t **stqs_taskq;
} spa_taskqs_t;
struct spa { struct spa {
/* /*
* Fields protected by spa_namespace_lock. * Fields protected by spa_namespace_lock.
@ -125,7 +130,7 @@ struct spa {
uint8_t spa_sync_on; /* sync threads are running */ uint8_t spa_sync_on; /* sync threads are running */
spa_load_state_t spa_load_state; /* current load operation */ spa_load_state_t spa_load_state; /* current load operation */
uint64_t spa_import_flags; /* import specific flags */ uint64_t spa_import_flags; /* import specific flags */
taskq_t *spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES]; spa_taskqs_t spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES];
dsl_pool_t *spa_dsl_pool; dsl_pool_t *spa_dsl_pool;
boolean_t spa_is_initializing; /* true while opening pool */ boolean_t spa_is_initializing; /* true while opening pool */
metaslab_class_t *spa_normal_class; /* normal data class */ metaslab_class_t *spa_normal_class; /* normal data class */
@ -243,6 +248,9 @@ struct spa {
extern char *spa_config_path; extern char *spa_config_path;
extern void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

View File

@ -78,23 +78,25 @@
#include "zfs_comutil.h" #include "zfs_comutil.h"
typedef enum zti_modes { typedef enum zti_modes {
zti_mode_fixed, /* value is # of threads (min 1) */ ZTI_MODE_FIXED, /* value is # of threads (min 1) */
zti_mode_online_percent, /* value is % of online CPUs */ ZTI_MODE_ONLINE_PERCENT, /* value is % of online CPUs */
zti_mode_batch, /* cpu-intensive; value is ignored */ ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */
zti_mode_null, /* don't create a taskq */ ZTI_MODE_NULL, /* don't create a taskq */
zti_nmodes ZTI_NMODES
} zti_modes_t; } zti_modes_t;
#define ZTI_FIX(n) { zti_mode_fixed, (n) } #define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) }
#define ZTI_PCT(n) { zti_mode_online_percent, (n) } #define ZTI_PCT(n) { ZTI_MODE_ONLINE_PERCENT, (n), 1 }
#define ZTI_BATCH { zti_mode_batch, 0 } #define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 }
#define ZTI_NULL { zti_mode_null, 0 } #define ZTI_NULL { ZTI_MODE_NULL, 0, 0 }
#define ZTI_ONE ZTI_FIX(1) #define ZTI_N(n) ZTI_P(n, 1)
#define ZTI_ONE ZTI_N(1)
typedef struct zio_taskq_info { typedef struct zio_taskq_info {
enum zti_modes zti_mode; zti_modes_t zti_mode;
uint_t zti_value; uint_t zti_value;
uint_t zti_count;
} zio_taskq_info_t; } zio_taskq_info_t;
static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
@ -102,17 +104,30 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
}; };
/* /*
* Define the taskq threads for the following I/O types: * This table defines the taskq settings for each ZFS I/O type. When
* NULL, READ, WRITE, FREE, CLAIM, and IOCTL * initializing a pool, we use this table to create an appropriately sized
* taskq. Some operations are low volume and therefore have a small, static
* number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
* macros. Other operations process a large amount of data; the ZTI_BATCH
* macro causes us to create a taskq oriented for throughput. Some operations
* are so high frequency and short-lived that the taskq itself can become a a
* point of lock contention. The ZTI_P(#, #) macro indicates that we need an
* additional degree of parallelism specified by the number of threads per-
* taskq and the number of taskqs; when dispatching an event in this case, the
* particular taskq is chosen at random.
*
* The different taskq priorities are to handle the different contexts (issue
* and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
* need to be handled with minimum delay.
*/ */
const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
/* ISSUE ISSUE_HIGH INTR INTR_HIGH */ /* ISSUE ISSUE_HIGH INTR INTR_HIGH */
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */
{ ZTI_FIX(8), ZTI_NULL, ZTI_BATCH, ZTI_NULL }, { ZTI_N(8), ZTI_NULL, ZTI_BATCH, ZTI_NULL }, /* READ */
{ ZTI_BATCH, ZTI_FIX(5), ZTI_FIX(16), ZTI_FIX(5) }, { ZTI_BATCH, ZTI_N(5), ZTI_N(16), ZTI_N(5) }, /* WRITE */
{ ZTI_FIX(8), ZTI_NULL, ZTI_ONE, ZTI_NULL }, { ZTI_P(4, 8), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */
}; };
static dsl_syncfunc_t spa_sync_version; static dsl_syncfunc_t spa_sync_version;
@ -794,48 +809,121 @@ spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
offsetof(spa_error_entry_t, se_avl)); offsetof(spa_error_entry_t, se_avl));
} }
static taskq_t * static void
spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode, spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
uint_t value)
{ {
uint_t flags = TASKQ_PREPOPULATE; const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
enum zti_modes mode = ztip->zti_mode;
uint_t value = ztip->zti_value;
uint_t count = ztip->zti_count;
spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
char name[32];
uint_t i, flags = 0;
boolean_t batch = B_FALSE; boolean_t batch = B_FALSE;
switch (mode) { if (mode == ZTI_MODE_NULL) {
case zti_mode_null: tqs->stqs_count = 0;
return (NULL); /* no taskq needed */ tqs->stqs_taskq = NULL;
return;
case zti_mode_fixed:
ASSERT3U(value, >=, 1);
value = MAX(value, 1);
break;
case zti_mode_batch:
batch = B_TRUE;
flags |= TASKQ_THREADS_CPU_PCT;
value = zio_taskq_batch_pct;
break;
case zti_mode_online_percent:
flags |= TASKQ_THREADS_CPU_PCT;
break;
default:
panic("unrecognized mode for %s taskq (%u:%u) in "
"spa_activate()",
name, mode, value);
break;
} }
if (zio_taskq_sysdc && spa->spa_proc != &p0) { ASSERT3U(count, >, 0);
if (batch)
flags |= TASKQ_DC_BATCH;
return (taskq_create_sysdc(name, value, 50, INT_MAX, tqs->stqs_count = count;
spa->spa_proc, zio_taskq_basedc, flags)); tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);
for (i = 0; i < count; i++) {
taskq_t *tq;
switch (mode) {
case ZTI_MODE_FIXED:
ASSERT3U(value, >=, 1);
value = MAX(value, 1);
break;
case ZTI_MODE_BATCH:
batch = B_TRUE;
flags |= TASKQ_THREADS_CPU_PCT;
value = zio_taskq_batch_pct;
break;
case ZTI_MODE_ONLINE_PERCENT:
flags |= TASKQ_THREADS_CPU_PCT;
break;
default:
panic("unrecognized mode for %s_%s taskq (%u:%u) in "
"spa_activate()",
zio_type_name[t], zio_taskq_types[q], mode, value);
break;
}
if (count > 1) {
(void) snprintf(name, sizeof (name), "%s_%s_%u",
zio_type_name[t], zio_taskq_types[q], i);
} else {
(void) snprintf(name, sizeof (name), "%s_%s",
zio_type_name[t], zio_taskq_types[q]);
}
if (zio_taskq_sysdc && spa->spa_proc != &p0) {
if (batch)
flags |= TASKQ_DC_BATCH;
tq = taskq_create_sysdc(name, value, 50, INT_MAX,
spa->spa_proc, zio_taskq_basedc, flags);
} else {
tq = taskq_create_proc(name, value, maxclsyspri, 50,
INT_MAX, spa->spa_proc, flags);
}
tqs->stqs_taskq[i] = tq;
} }
return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX, }
spa->spa_proc, flags));
static void
spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
uint_t i;
if (tqs->stqs_taskq == NULL) {
ASSERT3U(tqs->stqs_count, ==, 0);
return;
}
for (i = 0; i < tqs->stqs_count; i++) {
ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
taskq_destroy(tqs->stqs_taskq[i]);
}
kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
tqs->stqs_taskq = NULL;
}
/*
* Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
* Note that a type may have multiple discrete taskqs to avoid lock contention
* on the taskq itself. In that case we choose which taskq at random by using
* the low bits of gethrtime().
*/
void
spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
{
spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
taskq_t *tq;
ASSERT3P(tqs->stqs_taskq, !=, NULL);
ASSERT3U(tqs->stqs_count, !=, 0);
if (tqs->stqs_count == 1) {
tq = tqs->stqs_taskq[0];
} else {
tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count];
}
taskq_dispatch_ent(tq, func, arg, flags, ent);
} }
static void static void
@ -845,16 +933,7 @@ spa_create_zio_taskqs(spa_t *spa)
for (t = 0; t < ZIO_TYPES; t++) { for (t = 0; t < ZIO_TYPES; t++) {
for (q = 0; q < ZIO_TASKQ_TYPES; q++) { for (q = 0; q < ZIO_TASKQ_TYPES; q++) {
const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; spa_taskqs_init(spa, t, q);
enum zti_modes mode = ztip->zti_mode;
uint_t value = ztip->zti_value;
char name[32];
(void) snprintf(name, sizeof (name),
"%s_%s", zio_type_name[t], zio_taskq_types[q]);
spa->spa_zio_taskq[t][q] =
spa_taskq_create(spa, name, mode, value);
} }
} }
} }
@ -1017,9 +1096,7 @@ spa_deactivate(spa_t *spa)
for (t = 0; t < ZIO_TYPES; t++) { for (t = 0; t < ZIO_TYPES; t++) {
for (q = 0; q < ZIO_TASKQ_TYPES; q++) { for (q = 0; q < ZIO_TASKQ_TYPES; q++) {
if (spa->spa_zio_taskq[t][q] != NULL) spa_taskqs_fini(spa, t, q);
taskq_destroy(spa->spa_zio_taskq[t][q]);
spa->spa_zio_taskq[t][q] = NULL;
} }
} }

View File

@ -185,7 +185,7 @@ vdev_file_io_start(zio_t *zio)
return (ZIO_PIPELINE_CONTINUE); return (ZIO_PIPELINE_CONTINUE);
} }
taskq_dispatch_ent(spa->spa_zio_taskq[ZIO_TYPE_FREE][ZIO_TASKQ_ISSUE], spa_taskq_dispatch_ent(spa, ZIO_TYPE_FREE, ZIO_TASKQ_ISSUE,
vdev_file_io_strategy, zio, 0, &zio->io_tqent); vdev_file_io_strategy, zio, 0, &zio->io_tqent);
return (ZIO_PIPELINE_STOP); return (ZIO_PIPELINE_STOP);

View File

@ -1165,7 +1165,7 @@ zio_free_bp_init(zio_t *zio)
*/ */
static void static void
zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline) zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
{ {
spa_t *spa = zio->io_spa; spa_t *spa = zio->io_spa;
zio_type_t t = zio->io_type; zio_type_t t = zio->io_type;
@ -1186,10 +1186,11 @@ zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline)
t = ZIO_TYPE_NULL; t = ZIO_TYPE_NULL;
/* /*
* If this is a high priority I/O, then use the high priority taskq. * If this is a high priority I/O, then use the high priority taskq if
* available.
*/ */
if (zio->io_priority == ZIO_PRIORITY_NOW && if (zio->io_priority == ZIO_PRIORITY_NOW &&
spa->spa_zio_taskq[t][q + 1] != NULL) spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
q++; q++;
ASSERT3U(q, <, ZIO_TASKQ_TYPES); ASSERT3U(q, <, ZIO_TASKQ_TYPES);
@ -1200,20 +1201,25 @@ zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline)
* to dispatch the zio to another taskq at the same time. * to dispatch the zio to another taskq at the same time.
*/ */
ASSERT(taskq_empty_ent(&zio->io_tqent)); ASSERT(taskq_empty_ent(&zio->io_tqent));
taskq_dispatch_ent(spa->spa_zio_taskq[t][q], spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
(task_func_t *)zio_execute, zio, flags, &zio->io_tqent); flags, &zio->io_tqent);
} }
static boolean_t static boolean_t
zio_taskq_member(zio_t *zio, enum zio_taskq_type q) zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
{ {
kthread_t *executor = zio->io_executor; kthread_t *executor = zio->io_executor;
spa_t *spa = zio->io_spa; spa_t *spa = zio->io_spa;
zio_type_t t; zio_type_t t;
for (t = 0; t < ZIO_TYPES; t++) for (t = 0; t < ZIO_TYPES; t++) {
if (taskq_member(spa->spa_zio_taskq[t][q], executor)) spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
return (B_TRUE); uint_t i;
for (i = 0; i < tqs->stqs_count; i++) {
if (taskq_member(tqs->stqs_taskq[i], executor))
return (B_TRUE);
}
}
return (B_FALSE); return (B_FALSE);
} }
@ -3078,8 +3084,8 @@ zio_done(zio_t *zio)
* Hand it off to the otherwise-unused claim taskq. * Hand it off to the otherwise-unused claim taskq.
*/ */
ASSERT(taskq_empty_ent(&zio->io_tqent)); ASSERT(taskq_empty_ent(&zio->io_tqent));
taskq_dispatch_ent( spa_taskq_dispatch_ent(zio->io_spa,
zio->io_spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE], ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE,
(task_func_t *)zio_reexecute, zio, 0, (task_func_t *)zio_reexecute, zio, 0,
&zio->io_tqent); &zio->io_tqent);
} }