mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-23 10:54:35 +03:00
Illumos 5960, 5925
5960 zfs recv should prefetch indirect blocks
5925 zfs receive -o origin=

Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>

References:
  https://www.illumos.org/issues/5960
  https://www.illumos.org/issues/5925
  https://github.com/illumos/illumos-gate/commit/a2cdcdd

Porting notes:
- [lib/libzfs/libzfs_sendrecv.c]
  - b8864a2 Fix gcc cast warnings
  - 325f023 Add linux kernel device support
  - 5c3f61e Increase Linux pipe buffer size on 'zfs receive'
- [module/zfs/zfs_vnops.c]
  - 3558fd7 Prototype/structure update for Linux
  - c12e3a5 Restructure zfs_readdir() to fix regressions
- [module/zfs/zvol.c]
  - Function @zvol_map_block() isn't needed in ZoL
  - 9965059 Prefetch start and end of volumes
- [module/zfs/dmu.c]
  - Fixed ISO C90 - mixed declarations and code
  - Function dmu_prefetch() 'int i' is initialized before the
    following code block (c90 vs. c99)
- [module/zfs/dbuf.c]
  - fc5bb51 Fix stack dbuf_hold_impl()
  - 9b67f60 Illumos 4757, 4913
  - 34229a2 Reduce stack usage for recursive traverse_visitbp()
- [module/zfs/dmu_send.c]
  - Fixed ISO C90 - mixed declarations and code
  - b58986e Use large stacks when available
  - 241b541 Illumos 5959 - clean up per-dataset feature count code
  - 77aef6f Use vmem_alloc() for nvlists
  - 00b4602 Add linux kernel memory support

Ported-by: kernelOfTruth kerneloftruth@gmail.com
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
This commit is contained in:
committed by
Brian Behlendorf
parent
00af2ff6f2
commit
fcff0f35bd
+120
-27
@@ -63,6 +63,9 @@ int zio_delay_max = ZIO_DELAY_MAX;
|
||||
#define ZIO_PIPELINE_CONTINUE 0x100
|
||||
#define ZIO_PIPELINE_STOP 0x101
|
||||
|
||||
/*
 * BP_SPANB: span, in blocks, of one block at indirection level `level`,
 * computed as 1 << (level * (indblkshift - SPA_BLKPTRSHIFT)).  At level 0
 * the shift count is 0, so the span is 1.
 */
#define BP_SPANB(indblkshift, level) \
|
||||
(((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
|
||||
/*
 * Offset that zbookmark_compare() adds to the level of a bookmark located
 * in the meta-dnode, so that its level ranks above any ordinary level value.
 */
#define COMPARE_META_LEVEL 0x80000000ul
|
||||
/*
|
||||
* The following actions directly affect the spa's sync-to-convergence logic.
|
||||
* The values below define the sync pass when we start performing the action.
|
||||
@@ -3450,39 +3453,129 @@ static zio_pipe_stage_t *zio_pipeline[] = {
|
||||
zio_done
|
||||
};
|
||||
|
||||
/* dnp is the dnode for zb1->zb_object */
|
||||
boolean_t
|
||||
zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_phys_t *zb1,
|
||||
const zbookmark_phys_t *zb2)
|
||||
{
|
||||
uint64_t zb1nextL0, zb2thisobj;
|
||||
|
||||
ASSERT(zb1->zb_objset == zb2->zb_objset);
|
||||
ASSERT(zb2->zb_level == 0);
|
||||
|
||||
|
||||
/*
|
||||
* Compare two zbookmark_phys_t's to see which we would reach first in a
|
||||
* pre-order traversal of the object tree.
|
||||
*
|
||||
* This is simple in every case aside from the meta-dnode object. For all other
|
||||
* objects, we traverse them in order (object 1 before object 2, and so on).
|
||||
* However, all of these objects are traversed while traversing object 0, since
|
||||
* the data it points to is the list of objects. Thus, we need to convert to a
|
||||
* canonical representation so we can compare meta-dnode bookmarks to
|
||||
* non-meta-dnode bookmarks.
|
||||
*
|
||||
* We do this by calculating "equivalents" for each field of the zbookmark.
|
||||
* zbookmarks outside of the meta-dnode use their own object and level, and
|
||||
* calculate the level 0 equivalent (the first L0 blkid that is contained in the
|
||||
* blocks this bookmark refers to) by multiplying their blkid by their span
|
||||
* (the number of L0 blocks contained within one block at their level).
|
||||
* zbookmarks inside the meta-dnode calculate their object equivalent
|
||||
* (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use
|
||||
* level + 1<<31 (any value larger than a level could ever be) for their level.
|
||||
* This causes them to always compare before a bookmark in their object
|
||||
* equivalent, compare appropriately to bookmarks in other objects, and to
|
||||
* compare appropriately to other bookmarks in the meta-dnode.
|
||||
*/
|
||||
int
|
||||
zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2,
|
||||
const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2)
|
||||
{
|
||||
/*
|
||||
* These variables represent the "equivalent" values for the zbookmark,
|
||||
* after converting zbookmarks inside the meta dnode to their
|
||||
* normal-object equivalents.
|
||||
*/
|
||||
uint64_t zb1obj, zb2obj;
|
||||
uint64_t zb1L0, zb2L0;
|
||||
uint64_t zb1level, zb2level;
|
||||
|
||||
if (zb1->zb_object == zb2->zb_object &&
|
||||
zb1->zb_level == zb2->zb_level &&
|
||||
zb1->zb_blkid == zb2->zb_blkid)
|
||||
return (0);
|
||||
|
||||
/*
|
||||
* BP_SPANB calculates the span in blocks.
|
||||
*/
|
||||
zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level);
|
||||
zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level);
|
||||
|
||||
if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
|
||||
zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
|
||||
zb1L0 = 0;
|
||||
zb1level = zb1->zb_level + COMPARE_META_LEVEL;
|
||||
} else {
|
||||
zb1obj = zb1->zb_object;
|
||||
zb1level = zb1->zb_level;
|
||||
}
|
||||
|
||||
if (zb2->zb_object == DMU_META_DNODE_OBJECT) {
|
||||
zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
|
||||
zb2L0 = 0;
|
||||
zb2level = zb2->zb_level + COMPARE_META_LEVEL;
|
||||
} else {
|
||||
zb2obj = zb2->zb_object;
|
||||
zb2level = zb2->zb_level;
|
||||
}
|
||||
|
||||
/* Now that we have a canonical representation, do the comparison. */
|
||||
if (zb1obj != zb2obj)
|
||||
return (zb1obj < zb2obj ? -1 : 1);
|
||||
else if (zb1L0 != zb2L0)
|
||||
return (zb1L0 < zb2L0 ? -1 : 1);
|
||||
else if (zb1level != zb2level)
|
||||
return (zb1level > zb2level ? -1 : 1);
|
||||
/*
|
||||
* This can (theoretically) happen if the bookmarks have the same object
|
||||
* and level, but different blkids, if the block sizes are not the same.
|
||||
* There is presently no way to change the indirect block sizes
|
||||
*/
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* This function checks the following: given that last_block is the place that
|
||||
* our traversal stopped last time, does that guarantee that we've visited
|
||||
* every node under subtree_root? Therefore, we can't just use the raw output
|
||||
* of zbookmark_compare. We have to pass in a modified version of
|
||||
* subtree_root; by incrementing the block id, and then checking whether
|
||||
* last_block is before or equal to that, we can tell whether or not having
|
||||
* visited last_block implies that all of subtree_root's children have been
|
||||
* visited.
|
||||
*/
|
||||
boolean_t
|
||||
zbookmark_subtree_completed(const dnode_phys_t *dnp,
|
||||
const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block)
|
||||
{
|
||||
zbookmark_phys_t mod_zb = *subtree_root;
|
||||
mod_zb.zb_blkid++;
|
||||
ASSERT(last_block->zb_level == 0);
|
||||
|
||||
/* The objset_phys_t isn't before anything. */
|
||||
if (dnp == NULL)
|
||||
return (B_FALSE);
|
||||
|
||||
zb1nextL0 = (zb1->zb_blkid + 1) <<
|
||||
((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
|
||||
|
||||
zb2thisobj = zb2->zb_object ? zb2->zb_object :
|
||||
zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
|
||||
|
||||
if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
|
||||
uint64_t nextobj = zb1nextL0 *
|
||||
(dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
|
||||
return (nextobj <= zb2thisobj);
|
||||
}
|
||||
|
||||
if (zb1->zb_object < zb2thisobj)
|
||||
return (B_TRUE);
|
||||
if (zb1->zb_object > zb2thisobj)
|
||||
return (B_FALSE);
|
||||
if (zb2->zb_object == DMU_META_DNODE_OBJECT)
|
||||
return (B_FALSE);
|
||||
return (zb1nextL0 <= zb2->zb_blkid);
|
||||
/*
|
||||
* We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the
|
||||
* data block size in sectors, because that variable is only used if
|
||||
* the bookmark refers to a block in the meta-dnode. Since we don't
|
||||
* know without examining it what object it refers to, and there's no
|
||||
* harm in passing in this value in other cases, we always pass it in.
|
||||
*
|
||||
* We pass in 0 for the indirect block size shift because zb2 must be
|
||||
* level 0. The indirect block size is only used to calculate the span
|
||||
* of the bookmark, but since the bookmark must be level 0, the span is
|
||||
* always 1, so the math works out.
|
||||
*
|
||||
* If you make changes to how the zbookmark_compare code works, be sure
|
||||
* to make sure that this code still works afterwards.
|
||||
*/
|
||||
return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift,
|
||||
1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb,
|
||||
last_block) <= 0);
|
||||
}
|
||||
|
||||
#if defined(_KERNEL) && defined(HAVE_SPL)
|
||||
|
||||
Reference in New Issue
Block a user