mirror of
				https://git.proxmox.com/git/mirror_zfs.git
				synced 2025-10-20 23:15:01 +03:00 
			
		
		
		
	Illumos 6815179, 6844191
6815179 zpool import with a large number of LUNs is too slow 6844191 zpool import, scanning of disks should be multi-threaded References: https://github.com/illumos/illumos-gate/commit/4f67d75 Porting notes: - This change was originally never ported to Linux due to it dependence on the thread pool interface. This patch solves that issue by switching the code to use the existing taskq implementation which provides the same basic functionality. However, in order for this to work properly thread_init() and thread_fini() must be called around to taskq consumer to perform the needed thread initialization. - The check_one_slice, nozpool_all_slices, and check_slices functions have been disabled for Linux. They are difficult, but possible, to implement for Linux due to how partitions are get names. Since this is only an optimization this code can be added at a latter date. Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
This commit is contained in:
		
							parent
							
								
									19d55079ae
								
							
						
					
					
						commit
						519129ff4e
					
				| @ -652,6 +652,8 @@ extern int random_get_pseudo_bytes(uint8_t *ptr, size_t len); | ||||
| 
 | ||||
| extern void kernel_init(int); | ||||
| extern void kernel_fini(void); | ||||
| extern void thread_init(void); | ||||
| extern void thread_fini(void); | ||||
| 
 | ||||
| struct spa; | ||||
| extern void nicenum(uint64_t num, char *buf); | ||||
|  | ||||
| @ -933,6 +933,242 @@ zpool_read_label(int fd, nvlist_t **config, int *num_labels) | ||||
| 	return (0); | ||||
| } | ||||
| 
 | ||||
| typedef struct rdsk_node { | ||||
| 	char *rn_name; | ||||
| 	int rn_num_labels; | ||||
| 	int rn_dfd; | ||||
| 	libzfs_handle_t *rn_hdl; | ||||
| 	nvlist_t *rn_config; | ||||
| 	avl_tree_t *rn_avl; | ||||
| 	avl_node_t rn_node; | ||||
| 	boolean_t rn_nozpool; | ||||
| } rdsk_node_t; | ||||
| 
 | ||||
| static int | ||||
| slice_cache_compare(const void *arg1, const void *arg2) | ||||
| { | ||||
| 	const char  *nm1 = ((rdsk_node_t *)arg1)->rn_name; | ||||
| 	const char  *nm2 = ((rdsk_node_t *)arg2)->rn_name; | ||||
| 	char *nm1slice, *nm2slice; | ||||
| 	int rv; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * partitions one and three (slices zero and two) are the most | ||||
| 	 * likely to provide results, so put those first | ||||
| 	 */ | ||||
| 	nm1slice = strstr(nm1, "part1"); | ||||
| 	nm2slice = strstr(nm2, "part1"); | ||||
| 	if (nm1slice && !nm2slice) { | ||||
| 		return (-1); | ||||
| 	} | ||||
| 	if (!nm1slice && nm2slice) { | ||||
| 		return (1); | ||||
| 	} | ||||
| 	nm1slice = strstr(nm1, "part3"); | ||||
| 	nm2slice = strstr(nm2, "part3"); | ||||
| 	if (nm1slice && !nm2slice) { | ||||
| 		return (-1); | ||||
| 	} | ||||
| 	if (!nm1slice && nm2slice) { | ||||
| 		return (1); | ||||
| 	} | ||||
| 
 | ||||
| 	rv = strcmp(nm1, nm2); | ||||
| 	if (rv == 0) | ||||
| 		return (0); | ||||
| 	return (rv > 0 ? 1 : -1); | ||||
| } | ||||
| 
 | ||||
| #ifndef __linux__ | ||||
| static void | ||||
| check_one_slice(avl_tree_t *r, char *diskname, uint_t partno, | ||||
|     diskaddr_t size, uint_t blksz) | ||||
| { | ||||
| 	rdsk_node_t tmpnode; | ||||
| 	rdsk_node_t *node; | ||||
| 	char sname[MAXNAMELEN]; | ||||
| 
 | ||||
| 	tmpnode.rn_name = &sname[0]; | ||||
| 	(void) snprintf(tmpnode.rn_name, MAXNAMELEN, "%s%u", | ||||
| 	    diskname, partno); | ||||
| 	/* too small to contain a zpool? */ | ||||
| 	if ((size < (SPA_MINDEVSIZE / blksz)) && | ||||
| 	    (node = avl_find(r, &tmpnode, NULL))) | ||||
| 		node->rn_nozpool = B_TRUE; | ||||
| } | ||||
| #endif | ||||
| 
 | ||||
| static void | ||||
| nozpool_all_slices(avl_tree_t *r, const char *sname) | ||||
| { | ||||
| #ifndef __linux__ | ||||
| 	char diskname[MAXNAMELEN]; | ||||
| 	char *ptr; | ||||
| 	int i; | ||||
| 
 | ||||
| 	(void) strncpy(diskname, sname, MAXNAMELEN); | ||||
| 	if (((ptr = strrchr(diskname, 's')) == NULL) && | ||||
| 	    ((ptr = strrchr(diskname, 'p')) == NULL)) | ||||
| 		return; | ||||
| 	ptr[0] = 's'; | ||||
| 	ptr[1] = '\0'; | ||||
| 	for (i = 0; i < NDKMAP; i++) | ||||
| 		check_one_slice(r, diskname, i, 0, 1); | ||||
| 	ptr[0] = 'p'; | ||||
| 	for (i = 0; i <= FD_NUMPART; i++) | ||||
| 		check_one_slice(r, diskname, i, 0, 1); | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| static void | ||||
| check_slices(avl_tree_t *r, int fd, const char *sname) | ||||
| { | ||||
| #ifndef __linux__ | ||||
| 	struct extvtoc vtoc; | ||||
| 	struct dk_gpt *gpt; | ||||
| 	char diskname[MAXNAMELEN]; | ||||
| 	char *ptr; | ||||
| 	int i; | ||||
| 
 | ||||
| 	(void) strncpy(diskname, sname, MAXNAMELEN); | ||||
| 	if ((ptr = strrchr(diskname, 's')) == NULL || !isdigit(ptr[1])) | ||||
| 		return; | ||||
| 	ptr[1] = '\0'; | ||||
| 
 | ||||
| 	if (read_extvtoc(fd, &vtoc) >= 0) { | ||||
| 		for (i = 0; i < NDKMAP; i++) | ||||
| 			check_one_slice(r, diskname, i, | ||||
| 			    vtoc.v_part[i].p_size, vtoc.v_sectorsz); | ||||
| 	} else if (efi_alloc_and_read(fd, &gpt) >= 0) { | ||||
| 		/*
 | ||||
| 		 * on x86 we'll still have leftover links that point | ||||
| 		 * to slices s[9-15], so use NDKMAP instead | ||||
| 		 */ | ||||
| 		for (i = 0; i < NDKMAP; i++) | ||||
| 			check_one_slice(r, diskname, i, | ||||
| 			    gpt->efi_parts[i].p_size, gpt->efi_lbasize); | ||||
| 		/* nodes p[1-4] are never used with EFI labels */ | ||||
| 		ptr[0] = 'p'; | ||||
| 		for (i = 1; i <= FD_NUMPART; i++) | ||||
| 			check_one_slice(r, diskname, i, 0, 1); | ||||
| 		efi_free(gpt); | ||||
| 	} | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| static void | ||||
| zpool_open_func(void *arg) | ||||
| { | ||||
| 	rdsk_node_t *rn = arg; | ||||
| 	struct stat64 statbuf; | ||||
| 	nvlist_t *config; | ||||
| 	int num_labels; | ||||
| 	int fd; | ||||
| 
 | ||||
| 	if (rn->rn_nozpool) | ||||
| 		return; | ||||
| #ifdef __linux__ | ||||
| 	/*
 | ||||
| 	 * Skip devices with well known prefixes there can be side effects | ||||
| 	 * when opening devices which need to be avoided. | ||||
| 	 * | ||||
| 	 * core     - Symlink to /proc/kcore | ||||
| 	 * fd*      - Floppy interface. | ||||
| 	 * fuse     - Fuse control device. | ||||
| 	 * hpet     - High Precision Event Timer | ||||
| 	 * lp*      - Printer interface. | ||||
| 	 * parport* - Parallel port interface. | ||||
| 	 * ppp      - Generic PPP driver. | ||||
| 	 * random   - Random device | ||||
| 	 * rtc      - Real Time Clock | ||||
| 	 * tty*     - Generic serial interface. | ||||
| 	 * urandom  - Random device. | ||||
| 	 * usbmon*  - USB IO monitor. | ||||
| 	 * vcs*     - Virtual console memory. | ||||
| 	 * watchdog - Watchdog must be closed in a special way. | ||||
| 	 */ | ||||
| 	if ((strncmp(rn->rn_name, "core", 4) == 0) || | ||||
| 	    (strncmp(rn->rn_name, "fd", 2) == 0) || | ||||
| 	    (strncmp(rn->rn_name, "fuse", 4) == 0) || | ||||
| 	    (strncmp(rn->rn_name, "hpet", 4) == 0) || | ||||
| 	    (strncmp(rn->rn_name, "lp", 2) == 0) || | ||||
| 	    (strncmp(rn->rn_name, "parport", 7) == 0) || | ||||
| 	    (strncmp(rn->rn_name, "ppp", 3) == 0) || | ||||
| 	    (strncmp(rn->rn_name, "random", 6) == 0) || | ||||
| 	    (strncmp(rn->rn_name, "rtc", 3) == 0) || | ||||
| 	    (strncmp(rn->rn_name, "tty", 3) == 0) || | ||||
| 	    (strncmp(rn->rn_name, "urandom", 7) == 0) || | ||||
| 	    (strncmp(rn->rn_name, "usbmon", 6) == 0) || | ||||
| 	    (strncmp(rn->rn_name, "vcs", 3) == 0) || | ||||
| 	    (strncmp(rn->rn_name, "watchdog", 8) == 0)) | ||||
| 		return; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Ignore failed stats.  We only want regular files and block devices. | ||||
| 	 */ | ||||
| 	if (fstatat64(rn->rn_dfd, rn->rn_name, &statbuf, 0) != 0 || | ||||
| 	    (!S_ISREG(statbuf.st_mode) && !S_ISBLK(statbuf.st_mode))) | ||||
| 		return; | ||||
| 
 | ||||
| 	if ((fd = openat64(rn->rn_dfd, rn->rn_name, O_RDONLY)) < 0) { | ||||
| 		/* symlink to a device that's no longer there */ | ||||
| 		if (errno == ENOENT) | ||||
| 			nozpool_all_slices(rn->rn_avl, rn->rn_name); | ||||
| 		return; | ||||
| 	} | ||||
| #else | ||||
| 	if ((fd = openat64(rn->rn_dfd, rn->rn_name, O_RDONLY)) < 0) { | ||||
| 		/* symlink to a device that's no longer there */ | ||||
| 		if (errno == ENOENT) | ||||
| 			nozpool_all_slices(rn->rn_avl, rn->rn_name); | ||||
| 		return; | ||||
| 	} | ||||
| 	/*
 | ||||
| 	 * Ignore failed stats.  We only want regular | ||||
| 	 * files, character devs and block devs. | ||||
| 	 */ | ||||
| 	if (fstat64(fd, &statbuf) != 0 || | ||||
| 	    (!S_ISREG(statbuf.st_mode) && | ||||
| 	    !S_ISCHR(statbuf.st_mode) && | ||||
| 	    !S_ISBLK(statbuf.st_mode))) { | ||||
| 		(void) close(fd); | ||||
| 		return; | ||||
| 	} | ||||
| #endif | ||||
| 	/* this file is too small to hold a zpool */ | ||||
| 	if (S_ISREG(statbuf.st_mode) && | ||||
| 	    statbuf.st_size < SPA_MINDEVSIZE) { | ||||
| 		(void) close(fd); | ||||
| 		return; | ||||
| 	} else if (!S_ISREG(statbuf.st_mode)) { | ||||
| 		/*
 | ||||
| 		 * Try to read the disk label first so we don't have to | ||||
| 		 * open a bunch of minor nodes that can't have a zpool. | ||||
| 		 */ | ||||
| 		check_slices(rn->rn_avl, fd, rn->rn_name); | ||||
| 	} | ||||
| 
 | ||||
| 	if ((zpool_read_label(fd, &config, &num_labels)) != 0) { | ||||
| 		(void) close(fd); | ||||
| 		(void) no_memory(rn->rn_hdl); | ||||
| 		return; | ||||
| 	} | ||||
| 
 | ||||
| 	if (num_labels == 0) { | ||||
| 		(void) close(fd); | ||||
| 		nvlist_free(config); | ||||
| 		return; | ||||
| 	} | ||||
| 
 | ||||
| 	(void) close(fd); | ||||
| 
 | ||||
| 	rn->rn_config = config; | ||||
| 	rn->rn_num_labels = num_labels; | ||||
| 	if (config != NULL) { | ||||
| 		assert(rn->rn_nozpool == B_FALSE); | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Given a file descriptor, clear (zero) the label information.  This function | ||||
|  * is used in the appliance stack as part of the ZFS sysevent module and | ||||
| @ -1058,20 +1294,21 @@ zpool_default_import_path[DEFAULT_IMPORT_PATH_SIZE] = { | ||||
| static nvlist_t * | ||||
| zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg) | ||||
| { | ||||
| 	int i, num_labels, dirs = iarg->paths; | ||||
| 	int i, dirs = iarg->paths; | ||||
| 	DIR *dirp = NULL; | ||||
| 	struct dirent64 *dp; | ||||
| 	char path[MAXPATHLEN]; | ||||
| 	char *end, **dir = iarg->path; | ||||
| 	size_t pathleft; | ||||
| 	struct stat64 statbuf; | ||||
| 	nvlist_t *ret = NULL, *config; | ||||
| 	int fd; | ||||
| 	nvlist_t *ret = NULL; | ||||
| 	pool_list_t pools = { 0 }; | ||||
| 	pool_entry_t *pe, *penext; | ||||
| 	vdev_entry_t *ve, *venext; | ||||
| 	config_entry_t *ce, *cenext; | ||||
| 	name_entry_t *ne, *nenext; | ||||
| 	avl_tree_t slice_cache; | ||||
| 	rdsk_node_t *slice; | ||||
| 	void *cookie; | ||||
| 
 | ||||
| 	verify(iarg->poolname == NULL || iarg->guid == 0); | ||||
| 
 | ||||
| @ -1096,6 +1333,7 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg) | ||||
| 	 * and toplevel GUID. | ||||
| 	 */ | ||||
| 	for (i = 0; i < dirs; i++) { | ||||
| 		taskq_t *t; | ||||
| 		char *rdsk; | ||||
| 		int dfd; | ||||
| 
 | ||||
| @ -1135,6 +1373,9 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg) | ||||
| 			goto error; | ||||
| 		} | ||||
| 
 | ||||
| 		avl_create(&slice_cache, slice_cache_compare, | ||||
| 		    sizeof (rdsk_node_t), offsetof(rdsk_node_t, rn_node)); | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * This is not MT-safe, but we have no MT consumers of libzfs | ||||
| 		 */ | ||||
| @ -1144,65 +1385,49 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg) | ||||
| 			    (name[1] == 0 || (name[1] == '.' && name[2] == 0))) | ||||
| 				continue; | ||||
| 
 | ||||
| 			/*
 | ||||
| 			 * Skip checking devices with well known prefixes: | ||||
| 			 * watchdog - A special close is required to avoid | ||||
| 			 *            triggering it and resetting the system. | ||||
| 			 * fuse     - Fuse control device. | ||||
| 			 * ppp      - Generic PPP driver. | ||||
| 			 * tty*     - Generic serial interface. | ||||
| 			 * vcs*     - Virtual console memory. | ||||
| 			 * parport* - Parallel port interface. | ||||
| 			 * lp*      - Printer interface. | ||||
| 			 * fd*      - Floppy interface. | ||||
| 			 * hpet     - High Precision Event Timer, crashes qemu | ||||
| 			 *            when accessed from a virtual machine. | ||||
| 			 * core     - Symlink to /proc/kcore, causes a crash | ||||
| 			 *            when access from Xen dom0. | ||||
| 			 */ | ||||
| 			if ((strncmp(name, "watchdog", 8) == 0) || | ||||
| 			    (strncmp(name, "fuse", 4) == 0) || | ||||
| 			    (strncmp(name, "ppp", 3) == 0) || | ||||
| 			    (strncmp(name, "tty", 3) == 0) || | ||||
| 			    (strncmp(name, "vcs", 3) == 0) || | ||||
| 			    (strncmp(name, "parport", 7) == 0) || | ||||
| 			    (strncmp(name, "lp", 2) == 0) || | ||||
| 			    (strncmp(name, "fd", 2) == 0) || | ||||
| 			    (strncmp(name, "hpet", 4) == 0) || | ||||
| 			    (strncmp(name, "core", 4) == 0)) | ||||
| 				continue; | ||||
| 			slice = zfs_alloc(hdl, sizeof (rdsk_node_t)); | ||||
| 			slice->rn_name = zfs_strdup(hdl, name); | ||||
| 			slice->rn_avl = &slice_cache; | ||||
| 			slice->rn_dfd = dfd; | ||||
| 			slice->rn_hdl = hdl; | ||||
| 			slice->rn_nozpool = B_FALSE; | ||||
| 			avl_add(&slice_cache, slice); | ||||
| 		} | ||||
| 		/*
 | ||||
| 		 * create a thread pool to do all of this in parallel; | ||||
| 		 * rn_nozpool is not protected, so this is racy in that | ||||
| 		 * multiple tasks could decide that the same slice can | ||||
| 		 * not hold a zpool, which is benign.  Also choose | ||||
| 		 * double the number of processors; we hold a lot of | ||||
| 		 * locks in the kernel, so going beyond this doesn't | ||||
| 		 * buy us much. | ||||
| 		 */ | ||||
| 		thread_init(); | ||||
| 		t = taskq_create("z_import", 2 * boot_ncpus, defclsyspri, | ||||
| 		    2 * boot_ncpus, INT_MAX, TASKQ_PREPOPULATE); | ||||
| 		for (slice = avl_first(&slice_cache); slice; | ||||
| 		    (slice = avl_walk(&slice_cache, slice, | ||||
| 		    AVL_AFTER))) | ||||
| 			(void) taskq_dispatch(t, zpool_open_func, slice, | ||||
| 			    TQ_SLEEP); | ||||
| 		taskq_wait(t); | ||||
| 		taskq_destroy(t); | ||||
| 		thread_fini(); | ||||
| 
 | ||||
| 			/*
 | ||||
| 			 * Ignore failed stats.  We only want regular | ||||
| 			 * files and block devices. | ||||
| 			 */ | ||||
| 			if ((fstatat64(dfd, name, &statbuf, 0) != 0) || | ||||
| 			    (!S_ISREG(statbuf.st_mode) && | ||||
| 			    !S_ISBLK(statbuf.st_mode))) | ||||
| 				continue; | ||||
| 
 | ||||
| 			if ((fd = openat64(dfd, name, O_RDONLY)) < 0) | ||||
| 				continue; | ||||
| 
 | ||||
| 			if ((zpool_read_label(fd, &config, &num_labels))) { | ||||
| 				(void) close(fd); | ||||
| 				(void) no_memory(hdl); | ||||
| 				goto error; | ||||
| 			} | ||||
| 
 | ||||
| 			(void) close(fd); | ||||
| 
 | ||||
| 			if (config != NULL) { | ||||
| 		cookie = NULL; | ||||
| 		while ((slice = avl_destroy_nodes(&slice_cache, | ||||
| 		    &cookie)) != NULL) { | ||||
| 			if (slice->rn_config != NULL) { | ||||
| 				nvlist_t *config = slice->rn_config; | ||||
| 				boolean_t matched = B_TRUE; | ||||
| 				char *pname; | ||||
| 
 | ||||
| 				if ((iarg->poolname != NULL) && | ||||
| 				    (nvlist_lookup_string(config, | ||||
| 				    ZPOOL_CONFIG_POOL_NAME, &pname) == 0)) { | ||||
| 
 | ||||
| 					if (strcmp(iarg->poolname, pname)) | ||||
| 						matched = B_FALSE; | ||||
| 				if (iarg->poolname != NULL) { | ||||
| 					char *pname; | ||||
| 
 | ||||
| 					matched = nvlist_lookup_string(config, | ||||
| 					    ZPOOL_CONFIG_POOL_NAME, | ||||
| 					    &pname) == 0 && | ||||
| 					    strcmp(iarg->poolname, pname) == 0; | ||||
| 				} else if (iarg->guid != 0) { | ||||
| 					uint64_t this_guid; | ||||
| 
 | ||||
| @ -1217,12 +1442,15 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg) | ||||
| 					continue; | ||||
| 				} | ||||
| 				/* use the non-raw path for the config */ | ||||
| 				(void) strlcpy(end, name, pathleft); | ||||
| 				(void) strlcpy(end, slice->rn_name, pathleft); | ||||
| 				if (add_config(hdl, &pools, path, i+1, | ||||
| 				    num_labels, config)) | ||||
| 				    slice->rn_num_labels, config)) | ||||
| 					goto error; | ||||
| 			} | ||||
| 			free(slice->rn_name); | ||||
| 			free(slice); | ||||
| 		} | ||||
| 		avl_destroy(&slice_cache); | ||||
| 
 | ||||
| 		(void) closedir(dirp); | ||||
| 		dirp = NULL; | ||||
|  | ||||
| @ -68,7 +68,7 @@ pthread_mutex_t kthread_lock = PTHREAD_MUTEX_INITIALIZER; | ||||
| pthread_key_t kthread_key; | ||||
| int kthread_nr = 0; | ||||
| 
 | ||||
| static void | ||||
| void | ||||
| thread_init(void) | ||||
| { | ||||
| 	kthread_t *kt; | ||||
| @ -87,7 +87,7 @@ thread_init(void) | ||||
| 	kthread_nr = 1; | ||||
| } | ||||
| 
 | ||||
| static void | ||||
| void | ||||
| thread_fini(void) | ||||
| { | ||||
| 	kthread_t *kt = curthread; | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user
	 Brian Behlendorf
						Brian Behlendorf