Illumos 6815179, 6844191

6815179 zpool import with a large number of LUNs is too slow
6844191 zpool import, scanning of disks should be multi-threaded

References:
  https://github.com/illumos/illumos-gate/commit/4f67d75

Porting notes:
- This change was originally never ported to Linux due to it
  dependence on the thread pool interface.  This patch solves
  that issue by switching the code to use the existing taskq
  implementation which provides the same basic functionality.
  However, in order for this to work properly thread_init()
  and thread_fini() must be called around to taskq consumer
  to perform the needed thread initialization.

- The check_one_slice, nozpool_all_slices, and check_slices
  functions have been disabled for Linux.  They are difficult,
  but possible, to implement for Linux due to how partitions
  are get names.  Since this is only an optimization this code
  can be added at a latter date.

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
This commit is contained in:
Brian Behlendorf 2016-01-20 15:58:29 -08:00
parent 19d55079ae
commit 519129ff4e
3 changed files with 293 additions and 63 deletions

View File

@ -652,6 +652,8 @@ extern int random_get_pseudo_bytes(uint8_t *ptr, size_t len);
extern void kernel_init(int); extern void kernel_init(int);
extern void kernel_fini(void); extern void kernel_fini(void);
extern void thread_init(void);
extern void thread_fini(void);
struct spa; struct spa;
extern void nicenum(uint64_t num, char *buf); extern void nicenum(uint64_t num, char *buf);

View File

@ -933,6 +933,242 @@ zpool_read_label(int fd, nvlist_t **config, int *num_labels)
return (0); return (0);
} }
typedef struct rdsk_node {
char *rn_name;
int rn_num_labels;
int rn_dfd;
libzfs_handle_t *rn_hdl;
nvlist_t *rn_config;
avl_tree_t *rn_avl;
avl_node_t rn_node;
boolean_t rn_nozpool;
} rdsk_node_t;
static int
slice_cache_compare(const void *arg1, const void *arg2)
{
const char *nm1 = ((rdsk_node_t *)arg1)->rn_name;
const char *nm2 = ((rdsk_node_t *)arg2)->rn_name;
char *nm1slice, *nm2slice;
int rv;
/*
* partitions one and three (slices zero and two) are the most
* likely to provide results, so put those first
*/
nm1slice = strstr(nm1, "part1");
nm2slice = strstr(nm2, "part1");
if (nm1slice && !nm2slice) {
return (-1);
}
if (!nm1slice && nm2slice) {
return (1);
}
nm1slice = strstr(nm1, "part3");
nm2slice = strstr(nm2, "part3");
if (nm1slice && !nm2slice) {
return (-1);
}
if (!nm1slice && nm2slice) {
return (1);
}
rv = strcmp(nm1, nm2);
if (rv == 0)
return (0);
return (rv > 0 ? 1 : -1);
}
#ifndef __linux__
static void
check_one_slice(avl_tree_t *r, char *diskname, uint_t partno,
diskaddr_t size, uint_t blksz)
{
rdsk_node_t tmpnode;
rdsk_node_t *node;
char sname[MAXNAMELEN];
tmpnode.rn_name = &sname[0];
(void) snprintf(tmpnode.rn_name, MAXNAMELEN, "%s%u",
diskname, partno);
/* too small to contain a zpool? */
if ((size < (SPA_MINDEVSIZE / blksz)) &&
(node = avl_find(r, &tmpnode, NULL)))
node->rn_nozpool = B_TRUE;
}
#endif
static void
nozpool_all_slices(avl_tree_t *r, const char *sname)
{
#ifndef __linux__
char diskname[MAXNAMELEN];
char *ptr;
int i;
(void) strncpy(diskname, sname, MAXNAMELEN);
if (((ptr = strrchr(diskname, 's')) == NULL) &&
((ptr = strrchr(diskname, 'p')) == NULL))
return;
ptr[0] = 's';
ptr[1] = '\0';
for (i = 0; i < NDKMAP; i++)
check_one_slice(r, diskname, i, 0, 1);
ptr[0] = 'p';
for (i = 0; i <= FD_NUMPART; i++)
check_one_slice(r, diskname, i, 0, 1);
#endif
}
static void
check_slices(avl_tree_t *r, int fd, const char *sname)
{
#ifndef __linux__
struct extvtoc vtoc;
struct dk_gpt *gpt;
char diskname[MAXNAMELEN];
char *ptr;
int i;
(void) strncpy(diskname, sname, MAXNAMELEN);
if ((ptr = strrchr(diskname, 's')) == NULL || !isdigit(ptr[1]))
return;
ptr[1] = '\0';
if (read_extvtoc(fd, &vtoc) >= 0) {
for (i = 0; i < NDKMAP; i++)
check_one_slice(r, diskname, i,
vtoc.v_part[i].p_size, vtoc.v_sectorsz);
} else if (efi_alloc_and_read(fd, &gpt) >= 0) {
/*
* on x86 we'll still have leftover links that point
* to slices s[9-15], so use NDKMAP instead
*/
for (i = 0; i < NDKMAP; i++)
check_one_slice(r, diskname, i,
gpt->efi_parts[i].p_size, gpt->efi_lbasize);
/* nodes p[1-4] are never used with EFI labels */
ptr[0] = 'p';
for (i = 1; i <= FD_NUMPART; i++)
check_one_slice(r, diskname, i, 0, 1);
efi_free(gpt);
}
#endif
}
static void
zpool_open_func(void *arg)
{
rdsk_node_t *rn = arg;
struct stat64 statbuf;
nvlist_t *config;
int num_labels;
int fd;
if (rn->rn_nozpool)
return;
#ifdef __linux__
/*
* Skip devices with well known prefixes there can be side effects
* when opening devices which need to be avoided.
*
* core - Symlink to /proc/kcore
* fd* - Floppy interface.
* fuse - Fuse control device.
* hpet - High Precision Event Timer
* lp* - Printer interface.
* parport* - Parallel port interface.
* ppp - Generic PPP driver.
* random - Random device
* rtc - Real Time Clock
* tty* - Generic serial interface.
* urandom - Random device.
* usbmon* - USB IO monitor.
* vcs* - Virtual console memory.
* watchdog - Watchdog must be closed in a special way.
*/
if ((strncmp(rn->rn_name, "core", 4) == 0) ||
(strncmp(rn->rn_name, "fd", 2) == 0) ||
(strncmp(rn->rn_name, "fuse", 4) == 0) ||
(strncmp(rn->rn_name, "hpet", 4) == 0) ||
(strncmp(rn->rn_name, "lp", 2) == 0) ||
(strncmp(rn->rn_name, "parport", 7) == 0) ||
(strncmp(rn->rn_name, "ppp", 3) == 0) ||
(strncmp(rn->rn_name, "random", 6) == 0) ||
(strncmp(rn->rn_name, "rtc", 3) == 0) ||
(strncmp(rn->rn_name, "tty", 3) == 0) ||
(strncmp(rn->rn_name, "urandom", 7) == 0) ||
(strncmp(rn->rn_name, "usbmon", 6) == 0) ||
(strncmp(rn->rn_name, "vcs", 3) == 0) ||
(strncmp(rn->rn_name, "watchdog", 8) == 0))
return;
/*
* Ignore failed stats. We only want regular files and block devices.
*/
if (fstatat64(rn->rn_dfd, rn->rn_name, &statbuf, 0) != 0 ||
(!S_ISREG(statbuf.st_mode) && !S_ISBLK(statbuf.st_mode)))
return;
if ((fd = openat64(rn->rn_dfd, rn->rn_name, O_RDONLY)) < 0) {
/* symlink to a device that's no longer there */
if (errno == ENOENT)
nozpool_all_slices(rn->rn_avl, rn->rn_name);
return;
}
#else
if ((fd = openat64(rn->rn_dfd, rn->rn_name, O_RDONLY)) < 0) {
/* symlink to a device that's no longer there */
if (errno == ENOENT)
nozpool_all_slices(rn->rn_avl, rn->rn_name);
return;
}
/*
* Ignore failed stats. We only want regular
* files, character devs and block devs.
*/
if (fstat64(fd, &statbuf) != 0 ||
(!S_ISREG(statbuf.st_mode) &&
!S_ISCHR(statbuf.st_mode) &&
!S_ISBLK(statbuf.st_mode))) {
(void) close(fd);
return;
}
#endif
/* this file is too small to hold a zpool */
if (S_ISREG(statbuf.st_mode) &&
statbuf.st_size < SPA_MINDEVSIZE) {
(void) close(fd);
return;
} else if (!S_ISREG(statbuf.st_mode)) {
/*
* Try to read the disk label first so we don't have to
* open a bunch of minor nodes that can't have a zpool.
*/
check_slices(rn->rn_avl, fd, rn->rn_name);
}
if ((zpool_read_label(fd, &config, &num_labels)) != 0) {
(void) close(fd);
(void) no_memory(rn->rn_hdl);
return;
}
if (num_labels == 0) {
(void) close(fd);
nvlist_free(config);
return;
}
(void) close(fd);
rn->rn_config = config;
rn->rn_num_labels = num_labels;
if (config != NULL) {
assert(rn->rn_nozpool == B_FALSE);
}
}
/* /*
* Given a file descriptor, clear (zero) the label information. This function * Given a file descriptor, clear (zero) the label information. This function
* is used in the appliance stack as part of the ZFS sysevent module and * is used in the appliance stack as part of the ZFS sysevent module and
@ -1058,20 +1294,21 @@ zpool_default_import_path[DEFAULT_IMPORT_PATH_SIZE] = {
static nvlist_t * static nvlist_t *
zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg) zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg)
{ {
int i, num_labels, dirs = iarg->paths; int i, dirs = iarg->paths;
DIR *dirp = NULL; DIR *dirp = NULL;
struct dirent64 *dp; struct dirent64 *dp;
char path[MAXPATHLEN]; char path[MAXPATHLEN];
char *end, **dir = iarg->path; char *end, **dir = iarg->path;
size_t pathleft; size_t pathleft;
struct stat64 statbuf; nvlist_t *ret = NULL;
nvlist_t *ret = NULL, *config;
int fd;
pool_list_t pools = { 0 }; pool_list_t pools = { 0 };
pool_entry_t *pe, *penext; pool_entry_t *pe, *penext;
vdev_entry_t *ve, *venext; vdev_entry_t *ve, *venext;
config_entry_t *ce, *cenext; config_entry_t *ce, *cenext;
name_entry_t *ne, *nenext; name_entry_t *ne, *nenext;
avl_tree_t slice_cache;
rdsk_node_t *slice;
void *cookie;
verify(iarg->poolname == NULL || iarg->guid == 0); verify(iarg->poolname == NULL || iarg->guid == 0);
@ -1096,6 +1333,7 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg)
* and toplevel GUID. * and toplevel GUID.
*/ */
for (i = 0; i < dirs; i++) { for (i = 0; i < dirs; i++) {
taskq_t *t;
char *rdsk; char *rdsk;
int dfd; int dfd;
@ -1135,6 +1373,9 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg)
goto error; goto error;
} }
avl_create(&slice_cache, slice_cache_compare,
sizeof (rdsk_node_t), offsetof(rdsk_node_t, rn_node));
/* /*
* This is not MT-safe, but we have no MT consumers of libzfs * This is not MT-safe, but we have no MT consumers of libzfs
*/ */
@ -1144,65 +1385,49 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg)
(name[1] == 0 || (name[1] == '.' && name[2] == 0))) (name[1] == 0 || (name[1] == '.' && name[2] == 0)))
continue; continue;
/* slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
* Skip checking devices with well known prefixes: slice->rn_name = zfs_strdup(hdl, name);
* watchdog - A special close is required to avoid slice->rn_avl = &slice_cache;
* triggering it and resetting the system. slice->rn_dfd = dfd;
* fuse - Fuse control device. slice->rn_hdl = hdl;
* ppp - Generic PPP driver. slice->rn_nozpool = B_FALSE;
* tty* - Generic serial interface. avl_add(&slice_cache, slice);
* vcs* - Virtual console memory.
* parport* - Parallel port interface.
* lp* - Printer interface.
* fd* - Floppy interface.
* hpet - High Precision Event Timer, crashes qemu
* when accessed from a virtual machine.
* core - Symlink to /proc/kcore, causes a crash
* when access from Xen dom0.
*/
if ((strncmp(name, "watchdog", 8) == 0) ||
(strncmp(name, "fuse", 4) == 0) ||
(strncmp(name, "ppp", 3) == 0) ||
(strncmp(name, "tty", 3) == 0) ||
(strncmp(name, "vcs", 3) == 0) ||
(strncmp(name, "parport", 7) == 0) ||
(strncmp(name, "lp", 2) == 0) ||
(strncmp(name, "fd", 2) == 0) ||
(strncmp(name, "hpet", 4) == 0) ||
(strncmp(name, "core", 4) == 0))
continue;
/*
* Ignore failed stats. We only want regular
* files and block devices.
*/
if ((fstatat64(dfd, name, &statbuf, 0) != 0) ||
(!S_ISREG(statbuf.st_mode) &&
!S_ISBLK(statbuf.st_mode)))
continue;
if ((fd = openat64(dfd, name, O_RDONLY)) < 0)
continue;
if ((zpool_read_label(fd, &config, &num_labels))) {
(void) close(fd);
(void) no_memory(hdl);
goto error;
} }
/*
* create a thread pool to do all of this in parallel;
* rn_nozpool is not protected, so this is racy in that
* multiple tasks could decide that the same slice can
* not hold a zpool, which is benign. Also choose
* double the number of processors; we hold a lot of
* locks in the kernel, so going beyond this doesn't
* buy us much.
*/
thread_init();
t = taskq_create("z_import", 2 * boot_ncpus, defclsyspri,
2 * boot_ncpus, INT_MAX, TASKQ_PREPOPULATE);
for (slice = avl_first(&slice_cache); slice;
(slice = avl_walk(&slice_cache, slice,
AVL_AFTER)))
(void) taskq_dispatch(t, zpool_open_func, slice,
TQ_SLEEP);
taskq_wait(t);
taskq_destroy(t);
thread_fini();
(void) close(fd); cookie = NULL;
while ((slice = avl_destroy_nodes(&slice_cache,
if (config != NULL) { &cookie)) != NULL) {
if (slice->rn_config != NULL) {
nvlist_t *config = slice->rn_config;
boolean_t matched = B_TRUE; boolean_t matched = B_TRUE;
if (iarg->poolname != NULL) {
char *pname; char *pname;
if ((iarg->poolname != NULL) && matched = nvlist_lookup_string(config,
(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
ZPOOL_CONFIG_POOL_NAME, &pname) == 0)) { &pname) == 0 &&
strcmp(iarg->poolname, pname) == 0;
if (strcmp(iarg->poolname, pname))
matched = B_FALSE;
} else if (iarg->guid != 0) { } else if (iarg->guid != 0) {
uint64_t this_guid; uint64_t this_guid;
@ -1217,12 +1442,15 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg)
continue; continue;
} }
/* use the non-raw path for the config */ /* use the non-raw path for the config */
(void) strlcpy(end, name, pathleft); (void) strlcpy(end, slice->rn_name, pathleft);
if (add_config(hdl, &pools, path, i+1, if (add_config(hdl, &pools, path, i+1,
num_labels, config)) slice->rn_num_labels, config))
goto error; goto error;
} }
free(slice->rn_name);
free(slice);
} }
avl_destroy(&slice_cache);
(void) closedir(dirp); (void) closedir(dirp);
dirp = NULL; dirp = NULL;

View File

@ -68,7 +68,7 @@ pthread_mutex_t kthread_lock = PTHREAD_MUTEX_INITIALIZER;
pthread_key_t kthread_key; pthread_key_t kthread_key;
int kthread_nr = 0; int kthread_nr = 0;
static void void
thread_init(void) thread_init(void)
{ {
kthread_t *kt; kthread_t *kt;
@ -87,7 +87,7 @@ thread_init(void)
kthread_nr = 1; kthread_nr = 1;
} }
static void void
thread_fini(void) thread_fini(void)
{ {
kthread_t *kt = curthread; kthread_t *kt = curthread;