mirror of
				https://git.proxmox.com/git/mirror_zfs.git
				synced 2025-10-25 09:25:00 +03:00 
			
		
		
		
	 4e0f33ffe0
			
		
	
	
		4e0f33ffe0
		
	
	
	
	
		
			
			6214 zpools going south
Reviewed by: Igor Kozhukhov <ikozhukhov@gmail.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Reviewed by: Saso Kiselkov <skiselkov.ml@gmail.com>
References:
  https://www.illumos.org/issues/6214
  http://cr.illumos.org/~webrev/sensille/6214_zpools_going_south/
Porting Notes:
Reintroduce b_compress to the l2arc_buf_hdr_t.  In commit b9541d6
the compression flags were moved to the generic b_flags in the
arc_buf_hdr_t.  This is a problem because l2arc_compress_buf()
may manipulate the compression flags and this can only be done
safely under the hash lock which is not held.  See Illumos 6214
for a detailed analysis of the race.
HDR_GET_COMPRESS() macro was removed from arc_buf_info().
Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3757
		
	
			
		
			
				
	
	
		
			229 lines
		
	
	
		
			7.5 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			229 lines
		
	
	
		
			7.5 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /*
 | |
|  * CDDL HEADER START
 | |
|  *
 | |
|  * The contents of this file are subject to the terms of the
 | |
|  * Common Development and Distribution License (the "License").
 | |
|  * You may not use this file except in compliance with the License.
 | |
|  *
 | |
|  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 | |
|  * or http://www.opensolaris.org/os/licensing.
 | |
|  * See the License for the specific language governing permissions
 | |
|  * and limitations under the License.
 | |
|  *
 | |
|  * When distributing Covered Code, include this CDDL HEADER in each
 | |
|  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 | |
|  * If applicable, add the following below this CDDL HEADER, with the
 | |
|  * fields enclosed by brackets "[]" replaced with your own identifying
 | |
|  * information: Portions Copyright [yyyy] [name of copyright owner]
 | |
|  *
 | |
|  * CDDL HEADER END
 | |
|  */
 | |
| /*
 | |
|  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 | |
|  * Copyright (c) 2013 by Delphix. All rights reserved.
 | |
|  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 | |
|  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
 | |
|  */
 | |
| 
 | |
| #ifndef _SYS_ARC_IMPL_H
 | |
| #define	_SYS_ARC_IMPL_H
 | |
| 
 | |
| #include <sys/arc.h>
 | |
| 
 | |
| #ifdef __cplusplus
 | |
| extern "C" {
 | |
| #endif
 | |
| 
 | |
| /*
 | |
|  * Note that buffers can be in one of 6 states:
 | |
|  *	ARC_anon	- anonymous (discussed below)
 | |
|  *	ARC_mru		- recently used, currently cached
 | |
|  *	ARC_mru_ghost	- recently used, no longer in cache
 | |
|  *	ARC_mfu		- frequently used, currently cached
 | |
|  *	ARC_mfu_ghost	- frequently used, no longer in cache
 | |
|  *	ARC_l2c_only	- exists in L2ARC but not other states
 | |
|  * When there are no active references to the buffer, they are
 | |
|  * linked onto a list in one of these arc states.  These are
 | |
|  * the only buffers that can be evicted or deleted.  Within each
 | |
|  * state there are multiple lists, one for meta-data and one for
 | |
|  * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 | |
|  * etc.) is tracked separately so that it can be managed more
 | |
|  * explicitly: favored over data, limited explicitly.
 | |
|  *
 | |
|  * Anonymous buffers are buffers that are not associated with
 | |
|  * a DVA.  These are buffers that hold dirty block copies
 | |
|  * before they are written to stable storage.  By definition,
 | |
|  * they are "ref'd" and are considered part of arc_mru
 | |
|  * that cannot be freed.  Generally, they will acquire a DVA
 | |
|  * as they are written and migrate onto the arc_mru list.
 | |
|  *
 | |
|  * The ARC_l2c_only state is for buffers that are in the second
 | |
|  * level ARC but no longer in any of the ARC_m* lists.  The second
 | |
|  * level ARC itself may also contain buffers that are in any of
 | |
|  * the ARC_m* states - meaning that a buffer can exist in two
 | |
|  * places.  The reason for the ARC_l2c_only state is to keep the
 | |
|  * buffer header in the hash table, so that reads that hit the
 | |
|  * second level ARC benefit from these fast lookups.
 | |
|  */
 | |
| 
 | |
| typedef struct arc_state {
 | |
| 	/*
 | |
| 	 * Bookkeeping for a single ARC state.
 | |
| 	 *
 | |
| 	 * list of evictable buffers; one multilist per buffer type
 | |
| 	 * (ARC_BUFC_NUMTYPES entries, e.g. data and meta-data)
 | |
| 	 */
 | |
| 	multilist_t arcs_list[ARC_BUFC_NUMTYPES];
 | |
| 	/*
 | |
| 	 * total amount of evictable data in this state, kept separately
 | |
| 	 * for each buffer type (same indexing as arcs_list)
 | |
| 	 */
 | |
| 	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];
 | |
| 	/*
 | |
| 	 * total amount of data in this state; this includes: evictable,
 | |
| 	 * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
 | |
| 	 */
 | |
| 	refcount_t arcs_size;
 | |
| 	/*
 | |
| 	 * type tag for this state (presumably identifies which of the
 | |
| 	 * ARC_* states this is -- confirm); supports the "dbufs" kstat
 | |
| 	 */
 | |
| 	arc_state_type_t arcs_state;
 | |
| } arc_state_t;
 | |
| 
 | |
| typedef struct arc_callback arc_callback_t;
 | |
| /*
 | |
|  * A callback record: a done-function pointer together with a private
 | |
|  * pointer and a buffer pointer.  Records carry a pointer to another
 | |
|  * record (acb_next), so several can be chained; an l1arc_buf_hdr_t
 | |
|  * references one through its b_acb member.
 | |
|  */
 | |
| struct arc_callback {
 | |
| 	void			*acb_private;	/* opaque; presumably handed to acb_done */
 | |
| 	arc_done_func_t		*acb_done;	/* completion function */
 | |
| 	arc_buf_t		*acb_buf;	/* buffer for this callback */
 | |
| 	zio_t			*acb_zio_dummy;	/* NOTE(review): purpose not shown here */
 | |
| 	arc_callback_t		*acb_next;	/* next record in the chain */
 | |
| };
 | |
| 
 | |
| typedef struct arc_write_callback arc_write_callback_t;
 | |
| /*
 | |
|  * Callback state for a write: three function pointers of type
 | |
|  * arc_done_func_t plus a private pointer and the buffer involved.
 | |
|  * NOTE(review): the names suggest the functions correspond to the
 | |
|  * ready, physical-done and done stages of the write -- confirm
 | |
|  * against the code that fills this structure in.
 | |
|  */
 | |
| struct arc_write_callback {
 | |
| 	void		*awcb_private;	/* opaque; presumably passed to the funcs */
 | |
| 	arc_done_func_t	*awcb_ready;	/* "ready" callback */
 | |
| 	arc_done_func_t	*awcb_physdone;	/* "physdone" callback */
 | |
| 	arc_done_func_t	*awcb_done;	/* "done" callback */
 | |
| 	arc_buf_t	*awcb_buf;	/* buffer associated with the write */
 | |
| };
 | |
| 
 | |
| /*
 | |
|  * ARC buffers are separated into multiple structs as a memory saving measure:
 | |
|  *   - Common fields struct, always defined, and embedded within it:
 | |
|  *       - L2-only fields, always allocated but undefined when not in L2ARC
 | |
|  *       - L1-only fields, only allocated when in L1ARC
 | |
|  *
 | |
|  *           Buffer in L1                     Buffer only in L2
 | |
|  *    +------------------------+          +------------------------+
 | |
|  *    | arc_buf_hdr_t          |          | arc_buf_hdr_t          |
 | |
|  *    |                        |          |                        |
 | |
|  *    |                        |          |                        |
 | |
|  *    |                        |          |                        |
 | |
|  *    +------------------------+          +------------------------+
 | |
|  *    | l2arc_buf_hdr_t        |          | l2arc_buf_hdr_t        |
 | |
|  *    | (undefined if L1-only) |          |                        |
 | |
|  *    +------------------------+          +------------------------+
 | |
|  *    | l1arc_buf_hdr_t        |
 | |
|  *    |                        |
 | |
|  *    |                        |
 | |
|  *    |                        |
 | |
|  *    |                        |
 | |
|  *    +------------------------+
 | |
|  *
 | |
|  * Because it's possible for the L2ARC to become extremely large, we can wind
 | |
|  * up eating a lot of memory in L2ARC buffer headers, so the size of a header
 | |
|  * is minimized by only allocating the fields necessary for an L1-cached buffer
 | |
|  * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
 | |
|  * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
 | |
|  * words in pointers. arc_hdr_realloc() is used to switch a header between
 | |
|  * these two allocation states.
 | |
|  */
 | |
| typedef struct l1arc_buf_hdr {	/* L1-only part of an arc_buf_hdr_t */
 | |
| 	/* NOTE(review): presumably guards the freeze checksum -- confirm */
 | |
| 	kmutex_t		b_freeze_lock;
 | |
| 
 | |
| 	arc_buf_t		*b_buf;		/* buffer(s) for this header */
 | |
| 	uint32_t		b_datacnt;	/* presumably # of bufs on b_buf */
 | |
| 	/* for waiting on writes to complete */
 | |
| 	kcondvar_t		b_cv;
 | |
| 
 | |
| 
 | |
| 	/* protected by arc state mutex */
 | |
| 	arc_state_t		*b_state;	/* state this header is in */
 | |
| 	multilist_node_t	b_arc_node;	/* link for a multilist */
 | |
| 
 | |
| 	/* updated atomically */
 | |
| 	clock_t			b_arc_access;	/* presumably last access time */
 | |
| 	uint32_t		b_mru_hits;	/* hit counters, one per kind */
 | |
| 	uint32_t		b_mru_ghost_hits;
 | |
| 	uint32_t		b_mfu_hits;
 | |
| 	uint32_t		b_mfu_ghost_hits;
 | |
| 	uint32_t		b_l2_hits;
 | |
| 
 | |
| 	/* self protecting */
 | |
| 	refcount_t		b_refcnt;
 | |
| 
 | |
| 	arc_callback_t		*b_acb;		/* chain of callback records */
 | |
| 	/* temporary buffer holder for in-flight compressed data */
 | |
| 	void			*b_tmp_cdata;
 | |
| } l1arc_buf_hdr_t;
 | |
| 
 | |
| typedef struct l2arc_dev {	/* state kept for one L2ARC device */
 | |
| 	vdev_t			*l2ad_vdev;	/* vdev of this device */
 | |
| 	spa_t			*l2ad_spa;	/* spa */
 | |
| 	uint64_t		l2ad_hand;	/* next write location */
 | |
| 	uint64_t		l2ad_start;	/* first addr on device */
 | |
| 	uint64_t		l2ad_end;	/* last addr on device */
 | |
| 	boolean_t		l2ad_first;	/* first sweep through device */
 | |
| 	boolean_t		l2ad_writing;	/* a write is in progress */
 | |
| 	kmutex_t		l2ad_mtx;	/* lock for l2ad_buflist */
 | |
| 	list_t			l2ad_buflist;	/* buffer list (presumably linked */
 | |
| 	list_node_t		l2ad_node;	/* via b_l2node); device list node */
 | |
| 	refcount_t		l2ad_alloc;	/* allocated bytes */
 | |
| } l2arc_dev_t;
 | |
| 
 | |
| typedef struct l2arc_buf_hdr {	/* L2ARC part of an arc_buf_hdr_t */
 | |
| 	/* protected by arc_buf_hdr mutex */
 | |
| 	l2arc_dev_t		*b_dev;		/* L2ARC device */
 | |
| 	uint64_t		b_daddr;	/* disk address, offset byte */
 | |
| 	/* b_asize: real alloc'd buffer size, depends on b_compress applied */
 | |
| 	uint32_t		b_hits;		/* hit count (presumably L2 reads) */
 | |
| 	int32_t			b_asize;	/* see comment above b_hits */
 | |
| 	/*
 | |
| 	 * Compression state of the L2ARC copy.  Kept here rather than in
 | |
| 	 * arc_buf_hdr_t.b_flags because l2arc_compress_buf() may change
 | |
| 	 * it without holding the hash lock (see Illumos 6214).
 | |
| 	 */
 | |
| 	uint8_t			b_compress;
 | |
| 
 | |
| 	list_node_t		b_l2node;	/* list linkage */
 | |
| } l2arc_buf_hdr_t;
 | |
| 
 | |
| typedef struct l2arc_write_callback {	/* data for an L2ARC write callback */
 | |
| 	l2arc_dev_t	*l2wcb_dev;		/* device info */
 | |
| 	arc_buf_hdr_t	*l2wcb_head;		/* head of write buflist */
 | |
| } l2arc_write_callback_t;
 | |
| 
 | |
| struct arc_buf_hdr {	/* common header; embeds the L2 and L1 sub-headers */
 | |
| 	/* protected by hash lock */
 | |
| 	dva_t			b_dva;		/* DVA of the block */
 | |
| 	uint64_t		b_birth;	/* presumably birth txg -- confirm */
 | |
| 	/*
 | |
| 	 * Even though this checksum is only set/verified when a buffer is in
 | |
| 	 * the L1 cache, it needs to be in the set of common fields because it
 | |
| 	 * must be preserved from the time before a buffer is written out to
 | |
| 	 * L2ARC until after it is read back in.
 | |
| 	 */
 | |
| 	zio_cksum_t		*b_freeze_cksum;
 | |
| 
 | |
| 	arc_buf_hdr_t		*b_hash_next;	/* next header (hash chain link) */
 | |
| 	arc_flags_t		b_flags;	/* ARC flags for this header */
 | |
| 
 | |
| 	/* immutable */
 | |
| 	int32_t			b_size;		/* buffer size */
 | |
| 	uint64_t		b_spa;		/* presumably identifies the spa */
 | |
| 
 | |
| 	/* L2ARC fields. Undefined when not in L2ARC. */
 | |
| 	l2arc_buf_hdr_t		b_l2hdr;
 | |
| 	/*
 | |
| 	 * L1ARC fields. Undefined when in l2arc_only state.
 | |
| 	 * NOTE(review): appears to need to stay the last member, since the
 | |
| 	 * L1 fields are only allocated while the header is in the L1 cache.
 | |
| 	 */
 | |
| 	l1arc_buf_hdr_t		b_l1hdr;
 | |
| };
 | |
| #ifdef __cplusplus
 | |
| }
 | |
| #endif
 | |
| 
 | |
| #endif /* _SYS_ARC_IMPL_H */
 |