Print this page
zpool import is braindead
@@ -20,10 +20,11 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright 2014 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2014 RackTop Systems.
*/
/*
* Pool import support functions.
*
@@ -432,16 +433,16 @@
get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok)
{
pool_entry_t *pe;
vdev_entry_t *ve;
config_entry_t *ce;
- nvlist_t *ret = NULL, *config = NULL, *tmp, *nvtop, *nvroot;
+ nvlist_t *ret = NULL, *config = NULL, *tmp = NULL, *nvtop, *nvroot;
nvlist_t **spares, **l2cache;
uint_t i, nspares, nl2cache;
boolean_t config_seen;
uint64_t best_txg;
- char *name, *hostname;
+ char *name, *hostname = NULL;
uint64_t guid;
uint_t children = 0;
nvlist_t **child = NULL;
uint_t holes;
uint64_t *hole_array, max_id;
@@ -903,146 +904,129 @@
free(label);
*config = NULL;
return (0);
}
-typedef struct rdsk_node {
- char *rn_name;
- int rn_dfd;
- libzfs_handle_t *rn_hdl;
- nvlist_t *rn_config;
- avl_tree_t *rn_avl;
- avl_node_t rn_node;
- boolean_t rn_nozpool;
-} rdsk_node_t;
+/*
+ * One slice or partition node (an 's' or 'p' suffix plus a number) that
+ * belongs to a disk. The slices of a disk form a singly linked list whose
+ * head is the disk's dn_slices.
+ */
+typedef struct slice_node {
+ char *sn_name; /* slice suffix split off by get_disk_slice() */
+ nvlist_t *sn_config; /* label config read from this slice, if any */
+ boolean_t sn_nozpool; /* B_TRUE: cannot hold a pool, do not open */
+ int sn_partno; /* number that follows the 's'/'p' in sn_name */
+ struct disk_node *sn_disk; /* disk this slice belongs to */
+ struct slice_node *sn_next; /* next slice of the same disk */
+} slice_node_t;
+
+/*
+ * One disk, or a device/file without a slice suffix, found while scanning a
+ * directory. Each disk_node is handed to zpool_open_func() as one task.
+ */
+typedef struct disk_node {
+ char *dn_name; /* entry name, meant to exclude the slice suffix */
+ int dn_dfd; /* directory fd used as the base for openat64() */
+ libzfs_handle_t *dn_hdl; /* libzfs handle used to report errors */
+ nvlist_t *dn_config; /* label config when the disk has no slices */
+ struct slice_node *dn_slices; /* slice list; NULL: open dn_name itself */
+ struct disk_node *dn_next; /* next disk found in the same directory */
+} disk_node_t;
+
+/*
+ * Suffix appended to a disk name to open the node covering the whole disk.
+ * NOTE(review): 'sparc' is only predefined in non-strict compiler modes;
+ * confirm whether '__sparc' should be tested instead.
+ */
+#ifdef sparc
+#define WHOLE_DISK "s2"
+#else
+#define WHOLE_DISK "p0"
+#endif
-static int
-slice_cache_compare(const void *arg1, const void *arg2)
+/*
+ * This function splits the slice from the device name. Currently it supports
+ * VTOC slices (s[0-16]) and DOS/FDISK partitions (p[0-4]). If this function
+ * is updated to support other slice types then the check_slices function will
+ * also need to be updated.
+ *
+ * The suffix is only recognised at the very end of the name: an 's' or 'p'
+ * followed by one or more digits. On success B_TRUE is returned, 'disk' is
+ * truncated in place so that it holds the name without the suffix, '*slice'
+ * receives a copy of the suffix made with zfs_strdup() (the caller frees it)
+ * and '*partno' receives the number it contains. Otherwise B_FALSE is
+ * returned and neither 'disk' nor the output arguments are touched.
+ */
+static boolean_t
+get_disk_slice(libzfs_handle_t *hdl, char *disk, char **slice, int *partno)
{
- const char *nm1 = ((rdsk_node_t *)arg1)->rn_name;
- const char *nm2 = ((rdsk_node_t *)arg2)->rn_name;
- char *nm1slice, *nm2slice;
- int rv;
+ char *p = disk + strlen(disk);
- /*
- * slices zero and two are the most likely to provide results,
- * so put those first
- */
- nm1slice = strstr(nm1, "s0");
- nm2slice = strstr(nm2, "s0");
- if (nm1slice && !nm2slice) {
- return (-1);
- }
- if (!nm1slice && nm2slice) {
- return (1);
- }
- nm1slice = strstr(nm1, "s2");
- nm2slice = strstr(nm2, "s2");
- if (nm1slice && !nm2slice) {
- return (-1);
- }
- if (!nm1slice && nm2slice) {
- return (1);
- }
+ /* step back over the digits that end the name */
+ while (p > disk && isdigit((unsigned char)p[-1]))
+ p--;
- rv = strcmp(nm1, nm2);
- if (rv == 0)
- return (0);
- return (rv > 0 ? 1 : -1);
+ /*
+ * There must be at least one digit and it must directly follow an
+ * 's' or 'p' marker, otherwise the name carries no slice suffix.
+ */
+ if (*p == '\0' || p == disk || (p[-1] != 's' && p[-1] != 'p'))
+ return (B_FALSE);
+ p--;
+
+ *slice = zfs_strdup(hdl, p);
+ *partno = atoi(p + 1);
+
+ /* cut the suffix off so that 'disk' holds only the disk name */
+ *p = '\0';
+ return (B_TRUE);
}
+/*
+ * Flag 'slice' as unable to hold a zpool when its size, given as 'size'
+ * blocks of 'blksz' bytes each, is below SPA_MINDEVSIZE. A slice that is
+ * large enough is left untouched.
+ */
static void
-check_one_slice(avl_tree_t *r, char *diskname, uint_t partno,
- diskaddr_t size, uint_t blksz)
+check_one_slice(slice_node_t *slice, diskaddr_t size, uint_t blksz)
{
- rdsk_node_t tmpnode;
- rdsk_node_t *node;
- char sname[MAXNAMELEN];
-
- tmpnode.rn_name = &sname[0];
- (void) snprintf(tmpnode.rn_name, MAXNAMELEN, "%s%u",
- diskname, partno);
/*
* protect against division by zero for disk labels that
* contain a bogus sector size
*/
if (blksz == 0)
blksz = DEV_BSIZE;
/* too small to contain a zpool? */
- if ((size < (SPA_MINDEVSIZE / blksz)) &&
- (node = avl_find(r, &tmpnode, NULL)))
- node->rn_nozpool = B_TRUE;
+ if (size < (SPA_MINDEVSIZE / blksz))
+ slice->sn_nozpool = B_TRUE;
}
+/*
+ * Use the disk label readable through 'fd' to flag the entries of the
+ * 'slices' list that cannot hold a zpool, so they need not be opened.
+ *
+ * With a VTOC label every 's' slice is checked against the size of its
+ * partition and 'p' nodes are left alone. With an EFI label the nodes
+ * p[1-4] are flagged and every 's' slice is checked against the size of its
+ * GPT entry. A slice whose number lies outside the label's partition table
+ * is flagged as well instead of indexing past the table. If neither label
+ * can be read nothing is flagged.
+ */
static void
-nozpool_all_slices(avl_tree_t *r, const char *sname)
-{
- char diskname[MAXNAMELEN];
- char *ptr;
- int i;
-
- (void) strncpy(diskname, sname, MAXNAMELEN);
- if (((ptr = strrchr(diskname, 's')) == NULL) &&
- ((ptr = strrchr(diskname, 'p')) == NULL))
- return;
- ptr[0] = 's';
- ptr[1] = '\0';
- for (i = 0; i < NDKMAP; i++)
- check_one_slice(r, diskname, i, 0, 1);
- ptr[0] = 'p';
- for (i = 0; i <= FD_NUMPART; i++)
- check_one_slice(r, diskname, i, 0, 1);
-}
-
-static void
-check_slices(avl_tree_t *r, int fd, const char *sname)
+check_slices(slice_node_t *slices, int fd)
{
struct extvtoc vtoc;
struct dk_gpt *gpt;
- char diskname[MAXNAMELEN];
- char *ptr;
- int i;
-
- (void) strncpy(diskname, sname, MAXNAMELEN);
- if ((ptr = strrchr(diskname, 's')) == NULL || !isdigit(ptr[1]))
- return;
- ptr[1] = '\0';
+ slice_node_t *slice;
+ diskaddr_t size;
if (read_extvtoc(fd, &vtoc) >= 0) {
- for (i = 0; i < NDKMAP; i++)
- check_one_slice(r, diskname, i,
- vtoc.v_part[i].p_size, vtoc.v_sectorsz);
+ for (slice = slices; slice; slice = slice->sn_next) {
+ if (slice->sn_name[0] == 'p')
+ continue;
+ /* the label only describes slices 0 .. NDKMAP - 1 */
+ if (slice->sn_partno < 0 || slice->sn_partno >= NDKMAP) {
+ slice->sn_nozpool = B_TRUE;
+ continue;
+ }
+ size = vtoc.v_part[slice->sn_partno].p_size;
+ check_one_slice(slice, size, vtoc.v_sectorsz);
+ }
} else if (efi_alloc_and_read(fd, &gpt) >= 0) {
- /*
- * on x86 we'll still have leftover links that point
- * to slices s[9-15], so use NDKMAP instead
- */
- for (i = 0; i < NDKMAP; i++)
- check_one_slice(r, diskname, i,
- gpt->efi_parts[i].p_size, gpt->efi_lbasize);
+ for (slice = slices; slice; slice = slice->sn_next) {
/* nodes p[1-4] are never used with EFI labels */
- ptr[0] = 'p';
- for (i = 1; i <= FD_NUMPART; i++)
- check_one_slice(r, diskname, i, 0, 1);
+ if (slice->sn_name[0] == 'p') {
+ if (slice->sn_partno > 0)
+ slice->sn_nozpool = B_TRUE;
+ continue;
+ }
+ /* slice nodes without a GPT entry have no extent */
+ if (slice->sn_partno < 0 ||
+ (uint_t)slice->sn_partno >= gpt->efi_nparts) {
+ slice->sn_nozpool = B_TRUE;
+ continue;
+ }
+ size = gpt->efi_parts[slice->sn_partno].p_size;
+ check_one_slice(slice, size, gpt->efi_lbasize);
+ }
efi_free(gpt);
}
}
+/*
+ * Thread pool task that probes one disk for pool labels. 'arg' is the
+ * disk_node_t to examine.
+ *
+ * A disk without slices is opened by its own name and the label config is
+ * stored in dn_config. Otherwise the WHOLE_DISK node is opened so that
+ * check_slices() can rule out slices from the disk label, and the label of
+ * every remaining slice is read into that slice's sn_config.
+ *
+ * Nodes that cannot be opened are skipped silently. Allocation failures and
+ * zpool_read_label() failures are reported through no_memory() and end the
+ * task.
+ */
static void
zpool_open_func(void *arg)
{
- rdsk_node_t *rn = arg;
+ disk_node_t *disk = arg;
struct stat64 statbuf;
+ slice_node_t *slice;
nvlist_t *config;
+ char *devname = NULL;
int fd;
- if (rn->rn_nozpool)
+ /*
+ * If the disk has no slices we open it directly, otherwise we try
+ * to open the whole disk slice.
+ */
+ if (disk->dn_slices == NULL) {
+ devname = strdup(disk->dn_name);
+ } else if (asprintf(&devname, "%s" WHOLE_DISK, disk->dn_name) < 0) {
+ /* the pointer is unspecified after a failed asprintf() */
+ devname = NULL;
+ }
+
+ if (devname == NULL) {
+ (void) no_memory(disk->dn_hdl);
return;
- if ((fd = openat64(rn->rn_dfd, rn->rn_name, O_RDONLY)) < 0) {
- /* symlink to a device that's no longer there */
- if (errno == ENOENT)
- nozpool_all_slices(rn->rn_avl, rn->rn_name);
+ }
+
+ if ((fd = openat64(disk->dn_dfd, devname, O_RDONLY)) < 0) {
+ free(devname);
return;
}
/*
* Ignore failed stats. We only want regular
* files, character devs and block devs.
@@ -1050,36 +1034,79 @@
if (fstat64(fd, &statbuf) != 0 ||
(!S_ISREG(statbuf.st_mode) &&
!S_ISCHR(statbuf.st_mode) &&
!S_ISBLK(statbuf.st_mode))) {
(void) close(fd);
+ free(devname);
return;
}
/* this file is too small to hold a zpool */
- if (S_ISREG(statbuf.st_mode) &&
- statbuf.st_size < SPA_MINDEVSIZE) {
+ if (S_ISREG(statbuf.st_mode) && statbuf.st_size < SPA_MINDEVSIZE) {
(void) close(fd);
+ free(devname);
return;
- } else if (!S_ISREG(statbuf.st_mode)) {
+ } else if (!S_ISREG(statbuf.st_mode) && disk->dn_slices != NULL) {
/*
* Try to read the disk label first so we don't have to
* open a bunch of minor nodes that can't have a zpool.
*/
- check_slices(rn->rn_avl, fd, rn->rn_name);
+ check_slices(disk->dn_slices, fd);
}
- if ((zpool_read_label(fd, &config)) != 0) {
+ /*
+ * If we're working with the device directly (it has no slices)
+ * then we can just read the config and we're done.
+ */
+ if (disk->dn_slices == NULL) {
+ if (zpool_read_label(fd, &config) != 0) {
+ (void) no_memory(disk->dn_hdl);
(void) close(fd);
- (void) no_memory(rn->rn_hdl);
+ free(devname);
return;
}
+ disk->dn_config = config;
+ (void) close(fd);
+ free(devname);
+ return;
+ }
+
(void) close(fd);
+ free(devname);
+
+ /*
+ * Go through and read the label off each slice. The check_slices
+ * function has already performed some basic checks and set the
+ * sn_nozpool flag on any slices which just can't contain a zpool.
+ */
+ for (slice = disk->dn_slices; slice; slice = slice->sn_next) {
+ if (slice->sn_nozpool == B_TRUE)
+ continue;
+
+ /*
+ * Test the return value: devname still holds the pointer
+ * freed above and is unspecified after a failed asprintf().
+ */
+ if (asprintf(&devname, "%s%s", disk->dn_name,
+ slice->sn_name) < 0) {
+ (void) no_memory(disk->dn_hdl);
+ return;
+ }
+
+ if ((fd = openat64(disk->dn_dfd, devname, O_RDONLY)) < 0) {
+ free(devname);
+ continue;
+ }
+
+ if ((zpool_read_label(fd, &config)) != 0) {
+ (void) no_memory(disk->dn_hdl);
+ (void) close(fd);
+ free(devname);
+ return;
+ }
- rn->rn_config = config;
- if (config != NULL) {
- assert(rn->rn_nozpool == B_FALSE);
+ slice->sn_config = config;
+ (void) close(fd);
+ free(devname);
}
}
/*
* Given a file descriptor, clear (zero) the label information. This function
@@ -1132,12 +1159,10 @@
pool_list_t pools = { 0 };
pool_entry_t *pe, *penext;
vdev_entry_t *ve, *venext;
config_entry_t *ce, *cenext;
name_entry_t *ne, *nenext;
- avl_tree_t slice_cache;
- rdsk_node_t *slice;
void *cookie;
if (dirs == 0) {
dirs = 1;
dir = &default_dir;
@@ -1150,10 +1175,12 @@
*/
for (i = 0; i < dirs; i++) {
tpool_t *t;
char *rdsk;
int dfd;
+ disk_node_t *disks = NULL, *curdisk = NULL;
+ slice_node_t *curslice = NULL;
/* use realpath to normalize the path */
if (realpath(dir[i], path) == 0) {
(void) zfs_error_fmt(hdl, EZFS_BADPATH,
dgettext(TEXT_DOMAIN, "cannot open '%s'"), dir[i]);
@@ -1181,54 +1208,107 @@
dgettext(TEXT_DOMAIN, "cannot open '%s'"),
rdsk);
goto error;
}
- avl_create(&slice_cache, slice_cache_compare,
- sizeof (rdsk_node_t), offsetof(rdsk_node_t, rn_node));
/*
* This is not MT-safe, but we have no MT consumers of libzfs
*/
while ((dp = readdir64(dirp)) != NULL) {
- const char *name = dp->d_name;
- if (name[0] == '.' &&
- (name[1] == 0 || (name[1] == '.' && name[2] == 0)))
+ boolean_t isslice;
+ char *name, *sname;
+ int partno;
+
+ if (dp->d_name[0] == '.' && (dp->d_name[1] == '\0' ||
+ (dp->d_name[1] == '.' && dp->d_name[2] == '\0')))
continue;
- slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
- slice->rn_name = zfs_strdup(hdl, name);
- slice->rn_avl = &slice_cache;
- slice->rn_dfd = dfd;
- slice->rn_hdl = hdl;
- slice->rn_nozpool = B_FALSE;
- avl_add(&slice_cache, slice);
+ name = zfs_strdup(hdl, dp->d_name);
+
+ /*
+ * We create a new disk node every time we encounter
+ * a disk with no slices or the disk name changes.
+ */
+ isslice = get_disk_slice(hdl, name, &sname, &partno);
+ if (isslice == B_FALSE || curdisk == NULL ||
+ strcmp(curdisk->dn_name, name) != 0) {
+ disk_node_t *newdisk;
+
+ newdisk = zfs_alloc(hdl, sizeof (disk_node_t));
+ newdisk->dn_name = name;
+ newdisk->dn_dfd = dfd;
+ newdisk->dn_hdl = hdl;
+
+ if (curdisk != NULL)
+ curdisk->dn_next = newdisk;
+ else
+ disks = newdisk;
+
+ curdisk = newdisk;
+ curslice = NULL;
+ }
+
+ assert(curdisk != NULL);
+
+ /*
+ * Add a new slice node to the current disk node.
+ * We do this for all slices including zero slices.
+ */
+ if (isslice == B_TRUE) {
+ slice_node_t *newslice;
+
+ newslice = zfs_alloc(hdl,
+ sizeof (slice_node_t));
+ newslice->sn_name = sname;
+ newslice->sn_partno = partno;
+ newslice->sn_disk = curdisk;
+
+ if (curslice != NULL)
+ curslice->sn_next = newslice;
+ else
+ curdisk->dn_slices = newslice;
+
+ curslice = newslice;
+ }
}
/*
* create a thread pool to do all of this in parallel;
- * rn_nozpool is not protected, so this is racy in that
- * multiple tasks could decide that the same slice can
- * not hold a zpool, which is benign. Also choose
- * double the number of processors; we hold a lot of
- * locks in the kernel, so going beyond this doesn't
- * buy us much.
+ * choose double the number of processors; we hold a lot
+ * of locks in the kernel, so going beyond this doesn't
+ * buy us much. Each disk (and any slices it might have)
+ * is handled inside a single thread.
*/
t = tpool_create(1, 2 * sysconf(_SC_NPROCESSORS_ONLN),
0, NULL);
- for (slice = avl_first(&slice_cache); slice;
- (slice = avl_walk(&slice_cache, slice,
- AVL_AFTER)))
- (void) tpool_dispatch(t, zpool_open_func, slice);
+ for (curdisk = disks; curdisk; curdisk = curdisk->dn_next)
+ (void) tpool_dispatch(t, zpool_open_func, curdisk);
tpool_wait(t);
tpool_destroy(t);
- cookie = NULL;
- while ((slice = avl_destroy_nodes(&slice_cache,
- &cookie)) != NULL) {
- if (slice->rn_config != NULL) {
- nvlist_t *config = slice->rn_config;
+ curdisk = disks;
+ while (curdisk != NULL) {
+ nvlist_t *config;
+ disk_node_t *prevdisk;
+
+ /*
+ * If the device has slices we examine the config on
+ * each of those. If not we use the config directly
+ * from the device instead.
+ */
+ curslice = curdisk->dn_slices;
+
+ if (curslice != NULL)
+ config = curslice->sn_config;
+ else
+ config = curdisk->dn_config;
+
+ do {
boolean_t matched = B_TRUE;
+ if (config == NULL)
+ goto next;
+
if (iarg->poolname != NULL) {
char *pname;
matched = nvlist_lookup_string(config,
ZPOOL_CONFIG_POOL_NAME,
@@ -1240,24 +1320,56 @@
matched = nvlist_lookup_uint64(config,
ZPOOL_CONFIG_POOL_GUID,
&this_guid) == 0 &&
iarg->guid == this_guid;
}
+
if (!matched) {
nvlist_free(config);
- config = NULL;
- continue;
+ goto next;
}
+
/* use the non-raw path for the config */
- (void) strlcpy(end, slice->rn_name, pathleft);
+ if (curslice != NULL)
+ (void) snprintf(end, pathleft, "%s%s",
+ curdisk->dn_name,
+ curslice->sn_name);
+ else
+ (void) strlcpy(end, curdisk->dn_name,
+ pathleft);
if (add_config(hdl, &pools, path, config) != 0)
goto error;
+
+next:
+ /*
+ * If we're looking at slices free this one
+ * and go move onto the next.
+ */
+ if (curslice != NULL) {
+ slice_node_t *prevslice;
+
+ prevslice = curslice;
+ curslice = curslice->sn_next;
+
+ free(prevslice->sn_name);
+ free(prevslice);
+
+ if (curslice != NULL) {
+ config = curslice->sn_config;
+ }
}
- free(slice->rn_name);
- free(slice);
+ } while (curslice != NULL);
+
+ /*
+ * Free this disk and move onto the next one.
+ */
+ prevdisk = curdisk;
+ curdisk = curdisk->dn_next;
+
+ free(prevdisk->dn_name);
+ free(prevdisk);
}
- avl_destroy(&slice_cache);
(void) closedir(dirp);
dirp = NULL;
}