Merge branch 'lock-contention-on-arcs_mtx-final'

Signed-off-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Brian Behlendorf
Closes #3115
Closes #3481
Committed by Brian Behlendorf, 2015-06-11 10:27:36 -07:00
commit 06358ea16e
33 changed files with 3041 additions and 1507 deletions

View File

@ -191,12 +191,10 @@ def get_arc_summary(Kstat):
### ARC Misc. ###
deleted = Kstat["kstat.zfs.misc.arcstats.deleted"]
mutex_miss = Kstat["kstat.zfs.misc.arcstats.mutex_miss"]
recycle_miss = Kstat["kstat.zfs.misc.arcstats.recycle_miss"]
evict_skip = Kstat["kstat.zfs.misc.arcstats.evict_skip"]
### ARC Misc. ###
output["arc_misc"] = {}
output["arc_misc"]["deleted"] = fHits(deleted)
output["arc_misc"]['recycle_miss'] = fHits(recycle_miss)
output["arc_misc"]['mutex_miss'] = fHits(mutex_miss)
output["arc_misc"]['evict_skips'] = fHits(mutex_miss)
@ -302,8 +300,6 @@ def _arc_summary(Kstat):
### ARC Misc. ###
sys.stdout.write("ARC Misc:\n")
sys.stdout.write("\tDeleted:\t\t\t\t%s\n" % arc['arc_misc']['deleted'])
sys.stdout.write("\tRecycle Misses:\t\t\t\t%s\n" %
arc['arc_misc']['recycle_miss'])
sys.stdout.write("\tMutex Misses:\t\t\t\t%s\n" %
arc['arc_misc']['mutex_miss'])
sys.stdout.write("\tEvict Skips:\t\t\t\t%s\n" %

View File

@ -82,7 +82,6 @@ cols = {
"mrug": [4, 1000, "MRU Ghost List hits per second"],
"eskip": [5, 1000, "evict_skip per second"],
"mtxmis": [6, 1000, "mutex_miss per second"],
"rmis": [4, 1000, "recycle_miss per second"],
"dread": [5, 1000, "Demand accesses per second"],
"pread": [5, 1000, "Prefetch accesses per second"],
"l2hits": [6, 1000, "L2ARC hits per second"],
@ -406,7 +405,6 @@ def calculate():
v["mrug"] = d["mru_ghost_hits"] / sint
v["mfug"] = d["mfu_ghost_hits"] / sint
v["eskip"] = d["evict_skip"] / sint
v["rmis"] = d["recycle_miss"] / sint
v["mtxmis"] = d["mutex_miss"] / sint
if l2exist:

View File

@ -1250,7 +1250,7 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
print_indirect(bp, zb, dnp);
if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) {
uint32_t flags = ARC_WAIT;
arc_flags_t flags = ARC_FLAG_WAIT;
int i;
blkptr_t *cbp;
int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;

View File

@ -4042,7 +4042,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
* assign an arcbuf to a dbuf.
*/
for (j = 0; j < s; j++) {
if (i != 5) {
if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {
bigbuf_arcbufs[j] =
dmu_request_arcbuf(bonus_db, chunksize);
} else {
@ -4066,7 +4066,8 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
umem_free(packbuf, packsize);
umem_free(bigbuf, bigsize);
for (j = 0; j < s; j++) {
if (i != 5) {
if (i != 5 ||
chunksize < (SPA_MINBLOCKSIZE * 2)) {
dmu_return_arcbuf(bigbuf_arcbufs[j]);
} else {
dmu_return_arcbuf(
@ -4111,7 +4112,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
}
for (off = bigoff, j = 0; j < s; j++, off += chunksize) {
dmu_buf_t *dbt;
if (i != 5) {
if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {
bcopy((caddr_t)bigbuf + (off - bigoff),
bigbuf_arcbufs[j]->b_data, chunksize);
} else {
@ -4128,7 +4129,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
VERIFY(dmu_buf_hold(os, bigobj, off,
FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0);
}
if (i != 5) {
if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {
dmu_assign_arcbuf(bonus_db, off,
bigbuf_arcbufs[j], tx);
} else {

View File

@ -33,6 +33,7 @@ COMMON_H = \
$(top_srcdir)/include/sys/efi_partition.h \
$(top_srcdir)/include/sys/metaslab.h \
$(top_srcdir)/include/sys/metaslab_impl.h \
$(top_srcdir)/include/sys/multilist.h \
$(top_srcdir)/include/sys/nvpair.h \
$(top_srcdir)/include/sys/nvpair_impl.h \
$(top_srcdir)/include/sys/range_tree.h \
@ -53,6 +54,7 @@ COMMON_H = \
$(top_srcdir)/include/sys/trace_dbuf.h \
$(top_srcdir)/include/sys/trace_dmu.h \
$(top_srcdir)/include/sys/trace_dnode.h \
$(top_srcdir)/include/sys/trace_multilist.h \
$(top_srcdir)/include/sys/trace_txg.h \
$(top_srcdir)/include/sys/trace_zil.h \
$(top_srcdir)/include/sys/trace_zrlock.h \

View File

@ -38,6 +38,12 @@ extern "C" {
#include <sys/spa.h>
#include <sys/refcount.h>
/*
* Used by arc_flush() to inform arc_evict_state() that it should evict
* all available buffers from the arc state being passed in.
*/
#define ARC_EVICT_ALL -1ULL
typedef struct arc_buf_hdr arc_buf_hdr_t;
typedef struct arc_buf arc_buf_t;
typedef struct arc_prune arc_prune_t;
@ -53,10 +59,65 @@ arc_done_func_t arc_getbuf_func;
struct arc_prune {
arc_prune_func_t *p_pfunc;
void *p_private;
uint64_t p_adjust;
list_node_t p_node;
refcount_t p_refcnt;
};
typedef enum arc_strategy {
ARC_STRATEGY_META_ONLY = 0, /* Evict only meta data buffers */
ARC_STRATEGY_META_BALANCED = 1, /* Evict data buffers if needed */
} arc_strategy_t;
typedef enum arc_flags
{
/*
* Public flags that can be passed into the ARC by external consumers.
*/
ARC_FLAG_NONE = 1 << 0, /* No flags set */
ARC_FLAG_WAIT = 1 << 1, /* perform sync I/O */
ARC_FLAG_NOWAIT = 1 << 2, /* perform async I/O */
ARC_FLAG_PREFETCH = 1 << 3, /* I/O is a prefetch */
ARC_FLAG_CACHED = 1 << 4, /* I/O was in cache */
ARC_FLAG_L2CACHE = 1 << 5, /* cache in L2ARC */
ARC_FLAG_L2COMPRESS = 1 << 6, /* compress in L2ARC */
/*
* Private ARC flags. These flags are private ARC only flags that
* will show up in b_flags in the arc_buf_hdr_t. These flags should
* only be set by ARC code.
*/
ARC_FLAG_IN_HASH_TABLE = 1 << 7, /* buffer is hashed */
ARC_FLAG_IO_IN_PROGRESS = 1 << 8, /* I/O in progress */
ARC_FLAG_IO_ERROR = 1 << 9, /* I/O failed for buf */
ARC_FLAG_FREED_IN_READ = 1 << 10, /* freed during read */
ARC_FLAG_BUF_AVAILABLE = 1 << 11, /* block not in use */
ARC_FLAG_INDIRECT = 1 << 12, /* indirect block */
ARC_FLAG_L2_WRITING = 1 << 13, /* write in progress */
ARC_FLAG_L2_EVICTED = 1 << 14, /* evicted during I/O */
ARC_FLAG_L2_WRITE_HEAD = 1 << 15, /* head of write list */
/* indicates that the buffer contains metadata (otherwise, data) */
ARC_FLAG_BUFC_METADATA = 1 << 16,
/* Flags specifying whether optional hdr struct fields are defined */
ARC_FLAG_HAS_L1HDR = 1 << 17,
ARC_FLAG_HAS_L2HDR = 1 << 18,
/*
* The arc buffer's compression mode is stored in the top 7 bits of the
* flags field, so these dummy flags are included so that MDB can
* interpret the enum properly.
*/
ARC_FLAG_COMPRESS_0 = 1 << 24,
ARC_FLAG_COMPRESS_1 = 1 << 25,
ARC_FLAG_COMPRESS_2 = 1 << 26,
ARC_FLAG_COMPRESS_3 = 1 << 27,
ARC_FLAG_COMPRESS_4 = 1 << 28,
ARC_FLAG_COMPRESS_5 = 1 << 29,
ARC_FLAG_COMPRESS_6 = 1 << 30
} arc_flags_t;
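/*
 * Illustrative sketch (hypothetical names, not the commit's own
 * accessors): the 7-bit compression value occupies bits 24-30 of
 * b_flags, so it can be extracted and stored with a shift and mask.
 */
#define ARC_FLAG_COMPRESS_SHIFT	24
#define ARC_FLAG_COMPRESS_MASK	(0x7fU << ARC_FLAG_COMPRESS_SHIFT)

static inline unsigned int
arc_flags_get_compress(arc_flags_t flags)
{
	return (((unsigned int)flags & ARC_FLAG_COMPRESS_MASK) >>
	    ARC_FLAG_COMPRESS_SHIFT);
}

static inline arc_flags_t
arc_flags_set_compress(arc_flags_t flags, unsigned int comp)
{
	return ((arc_flags_t)(((unsigned int)flags &
	    ~ARC_FLAG_COMPRESS_MASK) | (comp << ARC_FLAG_COMPRESS_SHIFT)));
}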
struct arc_buf {
arc_buf_hdr_t *b_hdr;
arc_buf_t *b_next;
@ -71,15 +132,6 @@ typedef enum arc_buf_contents {
ARC_BUFC_METADATA, /* buffer contains metadata */
ARC_BUFC_NUMTYPES
} arc_buf_contents_t;
/*
* These are the flags we pass into calls to the arc
*/
#define ARC_WAIT (1 << 1) /* perform I/O synchronously */
#define ARC_NOWAIT (1 << 2) /* perform I/O asynchronously */
#define ARC_PREFETCH (1 << 3) /* I/O is a prefetch */
#define ARC_CACHED (1 << 4) /* I/O was already in cache */
#define ARC_L2CACHE (1 << 5) /* cache in L2ARC */
#define ARC_L2COMPRESS (1 << 6) /* compress in L2ARC */
/*
* The following breakdows of arc_size exist for kstat only.
@ -106,7 +158,6 @@ typedef enum arc_state_type {
typedef struct arc_buf_info {
arc_state_type_t abi_state_type;
arc_buf_contents_t abi_state_contents;
uint64_t abi_state_index;
uint32_t abi_flags;
uint32_t abi_datacnt;
uint64_t abi_size;
@ -146,7 +197,7 @@ int arc_referenced(arc_buf_t *buf);
int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
arc_done_func_t *done, void *private, zio_priority_t priority, int flags,
uint32_t *arc_flags, const zbookmark_phys_t *zb);
arc_flags_t *arc_flags, const zbookmark_phys_t *zb);
zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
@ -160,7 +211,7 @@ void arc_freed(spa_t *spa, const blkptr_t *bp);
void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private);
boolean_t arc_clear_callback(arc_buf_t *buf);
void arc_flush(spa_t *spa);
void arc_flush(spa_t *spa, boolean_t retry);
void arc_tempreserve_clear(uint64_t reserve);
int arc_tempreserve_space(uint64_t reserve, uint64_t txg);

View File

@ -67,15 +67,25 @@ extern "C" {
*/
typedef struct arc_state {
list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */
uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
uint64_t arcs_size; /* total amount of data in this state */
kmutex_t arcs_mtx;
/*
* list of evictable buffers
*/
multilist_t arcs_list[ARC_BUFC_NUMTYPES];
/*
* total amount of evictable data in this state
*/
uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];
/*
* total amount of data in this state; this includes: evictable,
* non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
*/
uint64_t arcs_size;
/*
* supports the "dbufs" kstat
*/
arc_state_type_t arcs_state;
} arc_state_t;
typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
typedef struct arc_callback arc_callback_t;
struct arc_callback {
@ -96,31 +106,49 @@ struct arc_write_callback {
arc_buf_t *awcb_buf;
};
struct arc_buf_hdr {
/* protected by hash lock */
dva_t b_dva;
uint64_t b_birth;
uint64_t b_cksum0;
/*
* ARC buffers are separated into multiple structs as a memory saving measure:
* - Common fields struct, always defined, and embedded within it:
* - L2-only fields, always allocated but undefined when not in L2ARC
* - L1-only fields, only allocated when in L1ARC
*
* Buffer in L1 Buffer only in L2
* +------------------------+ +------------------------+
* | arc_buf_hdr_t | | arc_buf_hdr_t |
* | | | |
* | | | |
* | | | |
* +------------------------+ +------------------------+
* | l2arc_buf_hdr_t | | l2arc_buf_hdr_t |
* | (undefined if L1-only) | | |
* +------------------------+ +------------------------+
* | l1arc_buf_hdr_t |
* | |
* | |
* | |
* | |
* +------------------------+
*
* Because it's possible for the L2ARC to become extremely large, we can wind
* up eating a lot of memory in L2ARC buffer headers, so the size of a header
* is minimized by only allocating the fields necessary for an L1-cached buffer
* when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
* l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
* words in pointers. arc_hdr_realloc() is used to switch a header between
* these two allocation states.
*/
typedef struct l1arc_buf_hdr {
kmutex_t b_freeze_lock;
zio_cksum_t *b_freeze_cksum;
arc_buf_hdr_t *b_hash_next;
arc_buf_t *b_buf;
uint32_t b_flags;
uint32_t b_datacnt;
arc_callback_t *b_acb;
/* for waiting on writes to complete */
kcondvar_t b_cv;
/* immutable */
arc_buf_contents_t b_type;
uint64_t b_size;
uint64_t b_spa;
/* protected by arc state mutex */
arc_state_t *b_state;
list_node_t b_arc_node;
multilist_node_t b_arc_node;
/* updated atomically */
clock_t b_arc_access;
@ -133,9 +161,10 @@ struct arc_buf_hdr {
/* self protecting */
refcount_t b_refcnt;
l2arc_buf_hdr_t *b_l2hdr;
list_node_t b_l2node;
};
arc_callback_t *b_acb;
/* temporary buffer holder for in-flight compressed data */
void *b_tmp_cdata;
} l1arc_buf_hdr_t;
typedef struct l2arc_dev {
vdev_t *l2ad_vdev; /* vdev */
@ -146,15 +175,51 @@ typedef struct l2arc_dev {
uint64_t l2ad_evict; /* last addr eviction reached */
boolean_t l2ad_first; /* first sweep through */
boolean_t l2ad_writing; /* currently writing */
list_t *l2ad_buflist; /* buffer list */
kmutex_t l2ad_mtx; /* lock for buffer list */
list_t l2ad_buflist; /* buffer list */
list_node_t l2ad_node; /* device list node */
} l2arc_dev_t;
typedef struct l2arc_buf_hdr {
/* protected by arc_buf_hdr mutex */
l2arc_dev_t *b_dev; /* L2ARC device */
uint64_t b_daddr; /* disk address, offset byte */
uint32_t b_hits;
/* real alloc'd buffer size depending on b_compress applied */
int32_t b_asize;
list_node_t b_l2node;
} l2arc_buf_hdr_t;
typedef struct l2arc_write_callback {
l2arc_dev_t *l2wcb_dev; /* device info */
arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
} l2arc_write_callback_t;
struct arc_buf_hdr {
/* protected by hash lock */
dva_t b_dva;
uint64_t b_birth;
/*
* Even though this checksum is only set/verified when a buffer is in
* the L1 cache, it needs to be in the set of common fields because it
* must be preserved from the time before a buffer is written out to
* L2ARC until after it is read back in.
*/
zio_cksum_t *b_freeze_cksum;
arc_buf_hdr_t *b_hash_next;
arc_flags_t b_flags;
/* immutable */
int32_t b_size;
uint64_t b_spa;
/* L2ARC fields. Undefined when not in L2ARC. */
l2arc_buf_hdr_t b_l2hdr;
/* L1ARC fields. Undefined when in l2arc_only state */
l1arc_buf_hdr_t b_l1hdr;
};
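/*
 * Illustrative sketch (assumed names; the commit's own helpers live in
 * arc.c): accesses to b_l1hdr and b_l2hdr must be gated on the flag
 * bits that record whether those sub-headers are currently defined.
 */
#define HDR_HAS_L1HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
#define HDR_HAS_L2HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)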
#ifdef __cplusplus
}
#endif

include/sys/multilist.h (new file, 105 lines)
View File

@ -0,0 +1,105 @@
/*
* CDDL HEADER START
*
* This file and its contents are supplied under the terms of the
* Common Development and Distribution License ("CDDL"), version 1.0.
* You may only use this file in accordance with the terms of version
* 1.0 of the CDDL.
*
* A full copy of the text of the CDDL should have accompanied this
* source. A copy of the CDDL is also available via the Internet at
* http://www.illumos.org/license/CDDL.
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2013, 2014 by Delphix. All rights reserved.
*/
#ifndef _SYS_MULTILIST_H
#define _SYS_MULTILIST_H
#include <sys/zfs_context.h>
#ifdef __cplusplus
extern "C" {
#endif
typedef list_node_t multilist_node_t;
typedef struct multilist multilist_t;
typedef struct multilist_sublist multilist_sublist_t;
typedef unsigned int multilist_sublist_index_func_t(multilist_t *, void *);
struct multilist_sublist {
/*
* The mutex used internally to implement thread safe insertions
* and removals to this individual sublist. It can also be locked
* by a consumer using multilist_sublist_{lock,unlock}, which is
* useful if a consumer needs to traverse the list in a thread
* safe manner.
*/
kmutex_t mls_lock;
/*
* The actual list object containing all objects in this sublist.
*/
list_t mls_list;
/*
* Pad to cache line, in an effort to try and prevent cache line
* contention.
*/
} ____cacheline_aligned;
struct multilist {
/*
* This is used to get to the multilist_node_t structure given
* the void *object contained on the list.
*/
size_t ml_offset;
/*
* The number of sublists used internally by this multilist.
*/
uint64_t ml_num_sublists;
/*
* The array of pointers to the actual sublists.
*/
multilist_sublist_t *ml_sublists;
/*
* Pointer to function which determines the sublist to use
* when inserting and removing objects from this multilist.
* Please see the comment above multilist_create for details.
*/
multilist_sublist_index_func_t *ml_index_func;
};
void multilist_destroy(multilist_t *);
void multilist_create(multilist_t *, size_t, size_t, unsigned int,
multilist_sublist_index_func_t *);
void multilist_insert(multilist_t *, void *);
void multilist_remove(multilist_t *, void *);
int multilist_is_empty(multilist_t *);
unsigned int multilist_get_num_sublists(multilist_t *);
unsigned int multilist_get_random_index(multilist_t *);
multilist_sublist_t *multilist_sublist_lock(multilist_t *, unsigned int);
void multilist_sublist_unlock(multilist_sublist_t *);
void multilist_sublist_insert_head(multilist_sublist_t *, void *);
void multilist_sublist_insert_tail(multilist_sublist_t *, void *);
void multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj);
void multilist_sublist_remove(multilist_sublist_t *, void *);
void *multilist_sublist_head(multilist_sublist_t *);
void *multilist_sublist_tail(multilist_sublist_t *);
void *multilist_sublist_next(multilist_sublist_t *, void *);
void *multilist_sublist_prev(multilist_sublist_t *, void *);
void multilist_link_init(multilist_node_t *);
int multilist_link_active(multilist_node_t *);
#ifdef __cplusplus
}
#endif
#endif /* _SYS_MULTILIST_H */
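A usage sketch, not part of this commit: a consumer embeds a multilist_node_t in its own structure and supplies an index function that is stable per object and stays within [0, number of sublists). All names below are hypothetical.

#include <sys/multilist.h>

/* Hypothetical consumer structure embedding a multilist_node_t. */
typedef struct my_obj {
	uint64_t		mo_id;
	multilist_node_t	mo_node;
} my_obj_t;

/* Stable for a given object; always within [0, num sublists). */
static unsigned int
my_obj_index_func(multilist_t *ml, void *obj)
{
	return (((uintptr_t)obj >> 6) % multilist_get_num_sublists(ml));
}

static multilist_t my_list;

static void
my_list_init(unsigned int num_sublists)
{
	multilist_create(&my_list, sizeof (my_obj_t),
	    offsetof(my_obj_t, mo_node), num_sublists, my_obj_index_func);
}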

View File

@ -45,7 +45,6 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class,
TP_STRUCT__entry(
__array(uint64_t, hdr_dva_word, 2)
__field(uint64_t, hdr_birth)
__field(uint64_t, hdr_cksum0)
__field(uint32_t, hdr_flags)
__field(uint32_t, hdr_datacnt)
__field(arc_buf_contents_t, hdr_type)
@ -64,27 +63,25 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class,
__entry->hdr_dva_word[0] = ab->b_dva.dva_word[0];
__entry->hdr_dva_word[1] = ab->b_dva.dva_word[1];
__entry->hdr_birth = ab->b_birth;
__entry->hdr_cksum0 = ab->b_cksum0;
__entry->hdr_flags = ab->b_flags;
__entry->hdr_datacnt = ab->b_datacnt;
__entry->hdr_type = ab->b_type;
__entry->hdr_datacnt = ab->b_l1hdr.b_datacnt;
__entry->hdr_size = ab->b_size;
__entry->hdr_spa = ab->b_spa;
__entry->hdr_state_type = ab->b_state->arcs_state;
__entry->hdr_access = ab->b_arc_access;
__entry->hdr_mru_hits = ab->b_mru_hits;
__entry->hdr_mru_ghost_hits = ab->b_mru_ghost_hits;
__entry->hdr_mfu_hits = ab->b_mfu_hits;
__entry->hdr_mfu_ghost_hits = ab->b_mfu_ghost_hits;
__entry->hdr_l2_hits = ab->b_l2_hits;
__entry->hdr_refcount = ab->b_refcnt.rc_count;
__entry->hdr_state_type = ab->b_l1hdr.b_state->arcs_state;
__entry->hdr_access = ab->b_l1hdr.b_arc_access;
__entry->hdr_mru_hits = ab->b_l1hdr.b_mru_hits;
__entry->hdr_mru_ghost_hits = ab->b_l1hdr.b_mru_ghost_hits;
__entry->hdr_mfu_hits = ab->b_l1hdr.b_mfu_hits;
__entry->hdr_mfu_ghost_hits = ab->b_l1hdr.b_mfu_ghost_hits;
__entry->hdr_l2_hits = ab->b_l1hdr.b_l2_hits;
__entry->hdr_refcount = ab->b_l1hdr.b_refcnt.rc_count;
),
TP_printk("hdr { dva 0x%llx:0x%llx birth %llu cksum0 0x%llx "
TP_printk("hdr { dva 0x%llx:0x%llx birth %llu "
"flags 0x%x datacnt %u type %u size %llu spa %llu "
"state_type %u access %lu mru_hits %u mru_ghost_hits %u "
"mfu_hits %u mfu_ghost_hits %u l2_hits %u refcount %lli }",
__entry->hdr_dva_word[0], __entry->hdr_dva_word[1],
__entry->hdr_birth, __entry->hdr_cksum0, __entry->hdr_flags,
__entry->hdr_birth, __entry->hdr_flags,
__entry->hdr_datacnt, __entry->hdr_type, __entry->hdr_size,
__entry->hdr_spa, __entry->hdr_state_type,
__entry->hdr_access, __entry->hdr_mru_hits,
@ -261,7 +258,6 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class,
TP_STRUCT__entry(
__array(uint64_t, hdr_dva_word, 2)
__field(uint64_t, hdr_birth)
__field(uint64_t, hdr_cksum0)
__field(uint32_t, hdr_flags)
__field(uint32_t, hdr_datacnt)
__field(arc_buf_contents_t, hdr_type)
@ -292,20 +288,18 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class,
__entry->hdr_dva_word[0] = hdr->b_dva.dva_word[0];
__entry->hdr_dva_word[1] = hdr->b_dva.dva_word[1];
__entry->hdr_birth = hdr->b_birth;
__entry->hdr_cksum0 = hdr->b_cksum0;
__entry->hdr_flags = hdr->b_flags;
__entry->hdr_datacnt = hdr->b_datacnt;
__entry->hdr_type = hdr->b_type;
__entry->hdr_datacnt = hdr->b_l1hdr.b_datacnt;
__entry->hdr_size = hdr->b_size;
__entry->hdr_spa = hdr->b_spa;
__entry->hdr_state_type = hdr->b_state->arcs_state;
__entry->hdr_access = hdr->b_arc_access;
__entry->hdr_mru_hits = hdr->b_mru_hits;
__entry->hdr_mru_ghost_hits = hdr->b_mru_ghost_hits;
__entry->hdr_mfu_hits = hdr->b_mfu_hits;
__entry->hdr_mfu_ghost_hits = hdr->b_mfu_ghost_hits;
__entry->hdr_l2_hits = hdr->b_l2_hits;
__entry->hdr_refcount = hdr->b_refcnt.rc_count;
__entry->hdr_state_type = hdr->b_l1hdr.b_state->arcs_state;
__entry->hdr_access = hdr->b_l1hdr.b_arc_access;
__entry->hdr_mru_hits = hdr->b_l1hdr.b_mru_hits;
__entry->hdr_mru_ghost_hits = hdr->b_l1hdr.b_mru_ghost_hits;
__entry->hdr_mfu_hits = hdr->b_l1hdr.b_mfu_hits;
__entry->hdr_mfu_ghost_hits = hdr->b_l1hdr.b_mfu_ghost_hits;
__entry->hdr_l2_hits = hdr->b_l1hdr.b_l2_hits;
__entry->hdr_refcount = hdr->b_l1hdr.b_refcnt.rc_count;
__entry->bp_dva0[0] = bp->blk_dva[0].dva_word[0];
__entry->bp_dva0[1] = bp->blk_dva[0].dva_word[1];
@ -325,8 +319,8 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class,
__entry->zb_level = zb->zb_level;
__entry->zb_blkid = zb->zb_blkid;
),
TP_printk("hdr { dva 0x%llx:0x%llx birth %llu cksum0 0x%llx "
"flags 0x%x datacnt %u type %u size %llu spa %llu state_type %u "
TP_printk("hdr { dva 0x%llx:0x%llx birth %llu "
"flags 0x%x datacnt %u size %llu spa %llu state_type %u "
"access %lu mru_hits %u mru_ghost_hits %u mfu_hits %u "
"mfu_ghost_hits %u l2_hits %u refcount %lli } "
"bp { dva0 0x%llx:0x%llx dva1 0x%llx:0x%llx dva2 "
@ -334,8 +328,8 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class,
"lsize %llu } zb { objset %llu object %llu level %lli "
"blkid %llu }",
__entry->hdr_dva_word[0], __entry->hdr_dva_word[1],
__entry->hdr_birth, __entry->hdr_cksum0, __entry->hdr_flags,
__entry->hdr_datacnt, __entry->hdr_type, __entry->hdr_size,
__entry->hdr_birth, __entry->hdr_flags,
__entry->hdr_datacnt, __entry->hdr_size,
__entry->hdr_spa, __entry->hdr_state_type, __entry->hdr_access,
__entry->hdr_mru_hits, __entry->hdr_mru_ghost_hits,
__entry->hdr_mfu_hits, __entry->hdr_mfu_ghost_hits,

View File

@ -0,0 +1,76 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
#if defined(_KERNEL) && defined(HAVE_DECLARE_EVENT_CLASS)
#undef TRACE_SYSTEM
#define TRACE_SYSTEM zfs
#if !defined(_TRACE_MULTILIST_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_MULTILIST_H
#include <linux/tracepoint.h>
#include <sys/types.h>
/*
* Generic support for three argument tracepoints of the form:
*
* DTRACE_PROBE3(...,
* multilist_t *, ...,
* unsigned int, ...,
* void *, ...);
*/
DECLARE_EVENT_CLASS(zfs_multilist_insert_remove_class,
TP_PROTO(multilist_t *ml, unsigned sublist_idx, void *obj),
TP_ARGS(ml, sublist_idx, obj),
TP_STRUCT__entry(
__field(size_t, ml_offset)
__field(uint64_t, ml_num_sublists)
__field(unsigned int, sublist_idx)
),
TP_fast_assign(
__entry->ml_offset = ml->ml_offset;
__entry->ml_num_sublists = ml->ml_num_sublists;
__entry->sublist_idx = sublist_idx;
),
TP_printk("ml { offset %ld numsublists %llu sublistidx %u } ",
__entry->ml_offset, __entry->ml_num_sublists, __entry->sublist_idx)
);
#define DEFINE_MULTILIST_INSERT_REMOVE_EVENT(name) \
DEFINE_EVENT(zfs_multilist_insert_remove_class, name, \
TP_PROTO(multilist_t *ml, unsigned int sublist_idx, void *obj), \
TP_ARGS(ml, sublist_idx, obj))
DEFINE_MULTILIST_INSERT_REMOVE_EVENT(zfs_multilist__insert);
DEFINE_MULTILIST_INSERT_REMOVE_EVENT(zfs_multilist__remove);
#endif /* _TRACE_MULTILIST_H */
#undef TRACE_INCLUDE_PATH
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_PATH sys
#define TRACE_INCLUDE_FILE trace_multilist
#include <trace/define_trace.h>
#endif /* _KERNEL && HAVE_DECLARE_EVENT_CLASS */

View File

@ -468,6 +468,7 @@ extern void taskq_init_ent(taskq_ent_t *);
extern void taskq_destroy(taskq_t *);
extern void taskq_wait(taskq_t *);
extern void taskq_wait_id(taskq_t *, taskqid_t);
extern void taskq_wait_outstanding(taskq_t *, taskqid_t);
extern int taskq_member(taskq_t *, kthread_t *);
extern int taskq_cancel_id(taskq_t *, taskqid_t);
extern void system_taskq_init(void);
@ -609,6 +610,7 @@ extern void delay(clock_t ticks);
} while (0);
#define max_ncpus 64
#define num_online_cpus() (sysconf(_SC_NPROCESSORS_ONLN))
#define minclsyspri 60
#define maxclsyspri 99

View File

@ -55,6 +55,7 @@ libzpool_la_SOURCES = \
$(top_srcdir)/module/zfs/lzjb.c \
$(top_srcdir)/module/zfs/lz4.c \
$(top_srcdir)/module/zfs/metaslab.c \
$(top_srcdir)/module/zfs/multilist.c \
$(top_srcdir)/module/zfs/range_tree.c \
$(top_srcdir)/module/zfs/refcount.c \
$(top_srcdir)/module/zfs/rrwlock.c \

View File

@ -220,6 +220,12 @@ taskq_wait_id(taskq_t *tq, taskqid_t id)
taskq_wait(tq);
}
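/*
 * Userland shim note: this taskq implementation keeps no per-id
 * completion state, so waiting for all outstanding tasks below is a
 * conservative substitute for waiting on tasks up through 'id'.
 */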
void
taskq_wait_outstanding(taskq_t *tq, taskqid_t id)
{
taskq_wait(tq);
}
static void
taskq_thread(void *arg)
{

View File

@ -347,6 +347,19 @@ increased to reduce the memory footprint.
Default value: \fB8192\fR.
.RE
.sp
.ne 2
.na
\fBzfs_arc_evict_batch_limit\fR (int)
.ad
.RS 12n
Number of ARC headers to evict per sub-list before proceeding to another sub-list.
This batch-style operation prevents entire sub-lists from being evicted at once
but comes at a cost of additional unlocking and locking.
.sp
Default value: \fB10\fR.
.RE
.sp
.ne 2
.na
@ -395,6 +408,19 @@ for meta data.
Default value: \fB0\fR.
.RE
.sp
.ne 2
.na
\fBzfs_arc_meta_min\fR (ulong)
.ad
.RS 12n
The minimum allowed size in bytes that meta data buffers may consume in
the ARC. This value defaults to 0, which disables a floor on the amount
of the ARC devoted to meta data.
.sp
Default value: \fB0\fR.
.RE
.sp
.ne 2
.na
@ -447,6 +473,40 @@ Min life of prefetch block
Default value: \fB100\fR.
.RE
.sp
.ne 2
.na
\fBzfs_arc_num_sublists_per_state\fR (int)
.ad
.RS 12n
To allow more fine-grained locking, each ARC state contains a series
of lists for both data and meta data objects. Locking is performed at
the level of these "sub-lists". This parameters controls the number of
sub-lists per ARC state.
.sp
Default value: 1 or the number of online CPUs, whichever is greater.
.RE
.sp
.ne 2
.na
\fBzfs_arc_overflow_shift\fR (int)
.ad
.RS 12n
The ARC size is considered to be overflowing if it exceeds the current
ARC target size (arc_c) by a threshold determined by this parameter.
The threshold is calculated as a fraction of arc_c using the formula
"arc_c >> \fBzfs_arc_overflow_shift\fR".
The default value of 8 causes the ARC to be considered overflowing
if it exceeds the target size by 1/256th (about 0.4%) of the target size.
When the ARC is overflowing, new buffer allocations are stalled until
the reclaim thread catches up and the overflow condition no longer exists.
.sp
Default value: \fB8\fR.
.RE
.sp
.ne 2
.na

View File

@ -37,6 +37,7 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/gzip.o
$(MODULE)-objs += @top_srcdir@/module/zfs/lzjb.o
$(MODULE)-objs += @top_srcdir@/module/zfs/lz4.o
$(MODULE)-objs += @top_srcdir@/module/zfs/metaslab.o
$(MODULE)-objs += @top_srcdir@/module/zfs/multilist.o
$(MODULE)-objs += @top_srcdir@/module/zfs/range_tree.o
$(MODULE)-objs += @top_srcdir@/module/zfs/refcount.o
$(MODULE)-objs += @top_srcdir@/module/zfs/rrwlock.o

File diff suppressed because it is too large.

View File

@ -653,7 +653,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
{
dnode_t *dn;
zbookmark_phys_t zb;
uint32_t aflags = ARC_NOWAIT;
uint32_t aflags = ARC_FLAG_NOWAIT;
int err;
DB_DNODE_ENTER(db);
@ -707,9 +707,9 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
mutex_exit(&db->db_mtx);
if (DBUF_IS_L2CACHEABLE(db))
aflags |= ARC_L2CACHE;
aflags |= ARC_FLAG_L2CACHE;
if (DBUF_IS_L2COMPRESSIBLE(db))
aflags |= ARC_L2COMPRESS;
aflags |= ARC_FLAG_L2COMPRESS;
SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
@ -721,7 +721,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
(*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
&aflags, &zb);
if (aflags & ARC_CACHED)
if (aflags & ARC_FLAG_CACHED)
*flags |= DB_RF_CACHED;
return (SET_ERROR(err));
@ -2028,7 +2028,8 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp, NULL) == 0) {
if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
arc_flags_t aflags =
ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
zbookmark_phys_t zb;
SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,

View File

@ -48,12 +48,12 @@ dbuf_stats_hash_table_headers(char *buf, size_t size)
(void) snprintf(buf, size,
"%-88s | %-124s | %s\n"
"%-16s %-8s %-8s %-8s %-8s %-8s %-8s %-5s %-5s %5s | "
"%-5s %-5s %-6s %-8s %-6s %-8s %-12s "
"%-5s %-5s %-8s %-6s %-8s %-12s "
"%-6s %-6s %-6s %-6s %-6s %-8s %-8s %-8s %-5s | "
"%-6s %-6s %-8s %-8s %-6s %-6s %-5s %-8s %-8s\n",
"dbuf", "arcbuf", "dnode", "pool", "objset", "object", "level",
"blkid", "offset", "dbsize", "meta", "state", "dbholds", "list",
"atype", "index", "flags", "count", "asize", "access",
"atype", "flags", "count", "asize", "access",
"mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize",
"l2_comp", "aholds", "dtype", "btype", "data_bs", "meta_bs",
"bsize", "lvls", "dholds", "blocks", "dsize");
@ -77,7 +77,7 @@ __dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db)
nwritten = snprintf(buf, size,
"%-16s %-8llu %-8lld %-8lld %-8lld %-8llu %-8llu %-5d %-5d %-5lu | "
"%-5d %-5d %-6lld 0x%-6x %-6lu %-8llu %-12llu "
"%-5d %-5d 0x%-6x %-6lu %-8llu %-12llu "
"%-6lu %-6lu %-6lu %-6lu %-6lu %-8llu %-8llu %-8d %-5lu | "
"%-6d %-6d %-8lu %-8lu %-6llu %-6lu %-5lu %-8llu %-8llu\n",
/* dmu_buf_impl_t */
@ -94,7 +94,6 @@ __dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db)
/* arc_buf_info_t */
abi.abi_state_type,
abi.abi_state_contents,
(longlong_t)abi.abi_state_index,
abi.abi_flags,
(ulong_t)abi.abi_datacnt,
(u_longlong_t)abi.abi_size,

View File

@ -129,7 +129,7 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
} else if (zb->zb_level == 0) {
dnode_phys_t *blk;
arc_buf_t *abuf;
uint32_t aflags = ARC_WAIT;
arc_flags_t aflags = ARC_FLAG_WAIT;
int blksz = BP_GET_LSIZE(bp);
int i;

View File

@ -306,15 +306,15 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
os->os_spa = spa;
os->os_rootbp = bp;
if (!BP_IS_HOLE(os->os_rootbp)) {
uint32_t aflags = ARC_WAIT;
arc_flags_t aflags = ARC_FLAG_WAIT;
zbookmark_phys_t zb;
SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
if (DMU_OS_IS_L2CACHEABLE(os))
aflags |= ARC_L2CACHE;
aflags |= ARC_FLAG_L2CACHE;
if (DMU_OS_IS_L2COMPRESSIBLE(os))
aflags |= ARC_L2COMPRESS;
aflags |= ARC_FLAG_L2COMPRESS;
dprintf_bp(os->os_rootbp, "reading %s", "");
err = arc_read(NULL, spa, os->os_rootbp,

View File

@ -486,7 +486,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
dnode_phys_t *blk;
int i;
int blksz = BP_GET_LSIZE(bp);
uint32_t aflags = ARC_WAIT;
arc_flags_t aflags = ARC_FLAG_WAIT;
arc_buf_t *abuf;
if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
@ -504,7 +504,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
}
(void) arc_buf_remove_ref(abuf, &abuf);
} else if (type == DMU_OT_SA) {
uint32_t aflags = ARC_WAIT;
arc_flags_t aflags = ARC_FLAG_WAIT;
arc_buf_t *abuf;
int blksz = BP_GET_LSIZE(bp);
@ -521,8 +521,8 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
err = dump_write_embedded(dsp, zb->zb_object,
zb->zb_blkid * blksz, blksz, bp);
} else { /* it's a level-0 block of a regular object */
uint32_t aflags = ARC_WAIT;
uint64_t offset;
arc_flags_t aflags = ARC_FLAG_WAIT;
arc_buf_t *abuf;
int blksz = BP_GET_LSIZE(bp);

View File

@ -177,7 +177,7 @@ static void
traverse_prefetch_metadata(traverse_data_t *td,
const blkptr_t *bp, const zbookmark_phys_t *zb)
{
uint32_t flags = ARC_NOWAIT | ARC_PREFETCH;
arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA))
return;
@ -273,7 +273,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
}
if (BP_GET_LEVEL(bp) > 0) {
uint32_t flags = ARC_WAIT;
uint32_t flags = ARC_FLAG_WAIT;
int32_t i;
int32_t epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
zbookmark_phys_t *czb;
@ -307,7 +307,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
kmem_free(czb, sizeof (zbookmark_phys_t));
} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
uint32_t flags = ARC_WAIT;
uint32_t flags = ARC_FLAG_WAIT;
int32_t i;
int32_t epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
dnode_phys_t *cdnp;
@ -331,7 +331,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
break;
}
} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
uint32_t flags = ARC_WAIT;
arc_flags_t flags = ARC_FLAG_WAIT;
objset_phys_t *osp;
dnode_phys_t *mdnp, *gdnp, *udnp;
@ -448,7 +448,7 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
prefetch_data_t *pfd = arg;
uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
ASSERT(pfd->pd_bytes_fetched >= 0);
if (pfd->pd_cancel)
@ -545,7 +545,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
/* See comment on ZIL traversal in dsl_scan_visitds. */
if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) {
uint32_t flags = ARC_WAIT;
uint32_t flags = ARC_FLAG_WAIT;
objset_phys_t *osp;
arc_buf_t *buf;

View File

@ -317,7 +317,14 @@ dsl_pool_close(dsl_pool_t *dp)
txg_list_destroy(&dp->dp_sync_tasks);
txg_list_destroy(&dp->dp_dirty_dirs);
arc_flush(dp->dp_spa);
/*
* We can't set retry to TRUE since we're explicitly specifying
* a spa to flush. This is good enough; any missed buffers for
* this spa won't cause trouble, and they'll eventually fall
* out of the ARC just like any other unused buffer.
*/
arc_flush(dp->dp_spa, FALSE);
txg_fini(dp);
dsl_scan_fini(dp);
dmu_buf_user_evict_wait();

View File

@ -590,7 +590,7 @@ dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp,
uint64_t objset, uint64_t object, uint64_t blkid)
{
zbookmark_phys_t czb;
uint32_t flags = ARC_NOWAIT | ARC_PREFETCH;
arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
if (zfs_no_scrub_prefetch)
return;
@ -655,7 +655,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
int err;
if (BP_GET_LEVEL(bp) > 0) {
uint32_t flags = ARC_WAIT;
arc_flags_t flags = ARC_FLAG_WAIT;
int i;
blkptr_t *cbp;
int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
@ -682,7 +682,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
}
(void) arc_buf_remove_ref(buf, &buf);
} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
uint32_t flags = ARC_WAIT;
arc_flags_t flags = ARC_FLAG_WAIT;
dnode_phys_t *cdnp;
int i, j;
int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
@ -708,7 +708,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
(void) arc_buf_remove_ref(buf, &buf);
} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
uint32_t flags = ARC_WAIT;
arc_flags_t flags = ARC_FLAG_WAIT;
objset_phys_t *osp;
arc_buf_t *buf;

View File

@ -556,7 +556,7 @@ metaslab_group_passivate(metaslab_group_t *mg)
return;
}
taskq_wait(mg->mg_taskq);
taskq_wait_outstanding(mg->mg_taskq, 0);
metaslab_group_alloc_update(mg);
mgprev = mg->mg_prev;
@ -1596,7 +1596,7 @@ metaslab_group_preload(metaslab_group_t *mg)
int m = 0;
if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
taskq_wait(mg->mg_taskq);
taskq_wait_outstanding(mg->mg_taskq, 0);
return;
}

module/zfs/multilist.c (new file, 375 lines)
View File

@ -0,0 +1,375 @@
/*
* CDDL HEADER START
*
* This file and its contents are supplied under the terms of the
* Common Development and Distribution License ("CDDL"), version 1.0.
* You may only use this file in accordance with the terms of version
* 1.0 of the CDDL.
*
* A full copy of the text of the CDDL should have accompanied this
* source. A copy of the CDDL is also available via the Internet at
* http://www.illumos.org/license/CDDL.
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2013, 2014 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/multilist.h>
#include <sys/trace_multilist.h>
/* needed for spa_get_random() */
#include <sys/spa.h>
/*
* Given an object contained on the list, return a pointer to the
* multilist_node_t structure embedded within it.
*/
#ifdef DEBUG
static multilist_node_t *
multilist_d2l(multilist_t *ml, void *obj)
{
return ((multilist_node_t *)((char *)obj + ml->ml_offset));
}
#endif
/*
* Initialize a new multilist using the parameters specified.
*
* - 'size' denotes the size of the structure containing the
* multilist_node_t.
* - 'offset' denotes the byte offset of the multilist_node_t within
* the structure that contains it.
* - 'num' specifies the number of internal sublists to create.
* - 'index_func' is used to determine which sublist to insert into
* when the multilist_insert() function is called; as well as which
* sublist to remove from when multilist_remove() is called. The
* requirements this function must meet are the following:
*
* - It must always return the same value when called on the same
* object (to ensure the object is removed from the list it was
* inserted into).
*
* - It must return a value in the range [0, number of sublists).
* The multilist_get_num_sublists() function may be used to
* determine the number of sublists in the multilist.
*
* Also, in order to reduce internal contention between the sublists
* during insertion and removal, this function should choose evenly
* between all available sublists when inserting. This isn't a hard
* requirement, but a general rule of thumb in order to garner the
* best multi-threaded performance out of the data structure.
*/
void
multilist_create(multilist_t *ml, size_t size, size_t offset, unsigned int num,
multilist_sublist_index_func_t *index_func)
{
int i;
ASSERT3P(ml, !=, NULL);
ASSERT3U(size, >, 0);
ASSERT3U(size, >=, offset + sizeof (multilist_node_t));
ASSERT3U(num, >, 0);
ASSERT3P(index_func, !=, NULL);
ml->ml_offset = offset;
ml->ml_num_sublists = num;
ml->ml_index_func = index_func;
ml->ml_sublists = kmem_zalloc(sizeof (multilist_sublist_t) *
ml->ml_num_sublists, KM_SLEEP);
ASSERT3P(ml->ml_sublists, !=, NULL);
for (i = 0; i < ml->ml_num_sublists; i++) {
multilist_sublist_t *mls = &ml->ml_sublists[i];
mutex_init(&mls->mls_lock, NULL, MUTEX_DEFAULT, NULL);
list_create(&mls->mls_list, size, offset);
}
}
/*
* Destroy the given multilist object, and free up any memory it holds.
*/
void
multilist_destroy(multilist_t *ml)
{
int i;
ASSERT(multilist_is_empty(ml));
for (i = 0; i < ml->ml_num_sublists; i++) {
multilist_sublist_t *mls = &ml->ml_sublists[i];
ASSERT(list_is_empty(&mls->mls_list));
list_destroy(&mls->mls_list);
mutex_destroy(&mls->mls_lock);
}
ASSERT3P(ml->ml_sublists, !=, NULL);
kmem_free(ml->ml_sublists,
sizeof (multilist_sublist_t) * ml->ml_num_sublists);
ml->ml_num_sublists = 0;
ml->ml_offset = 0;
}
/*
* Insert the given object into the multilist.
*
* This function will insert the object specified into the sublist
* determined using the function given at multilist creation time.
*
* The sublist locks are automatically acquired if not already held, to
* ensure consistency when inserting and removing from multiple threads.
*/
void
multilist_insert(multilist_t *ml, void *obj)
{
unsigned int sublist_idx = ml->ml_index_func(ml, obj);
multilist_sublist_t *mls;
boolean_t need_lock;
DTRACE_PROBE3(multilist__insert, multilist_t *, ml,
unsigned int, sublist_idx, void *, obj);
ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
mls = &ml->ml_sublists[sublist_idx];
/*
* Note: Callers may already hold the sublist lock by calling
* multilist_sublist_lock(). Here we rely on MUTEX_HELD()
* returning TRUE if and only if the current thread holds the
* lock. While it's a little ugly to make the lock recursive in
* this way, it works and allows the calling code to be much
* simpler -- otherwise it would have to pass around a flag
* indicating that it already has the lock.
*/
need_lock = !MUTEX_HELD(&mls->mls_lock);
if (need_lock)
mutex_enter(&mls->mls_lock);
ASSERT(!multilist_link_active(multilist_d2l(ml, obj)));
multilist_sublist_insert_head(mls, obj);
if (need_lock)
mutex_exit(&mls->mls_lock);
}
/*
* Remove the given object from the multilist.
*
* This function will remove the object specified from the sublist
* determined using the function given at multilist creation time.
*
* The necessary sublist locks are automatically acquired, to ensure
* consistency when inserting and removing from multiple threads.
*/
void
multilist_remove(multilist_t *ml, void *obj)
{
unsigned int sublist_idx = ml->ml_index_func(ml, obj);
multilist_sublist_t *mls;
boolean_t need_lock;
DTRACE_PROBE3(multilist__remove, multilist_t *, ml,
unsigned int, sublist_idx, void *, obj);
ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
mls = &ml->ml_sublists[sublist_idx];
/* See comment in multilist_insert(). */
need_lock = !MUTEX_HELD(&mls->mls_lock);
if (need_lock)
mutex_enter(&mls->mls_lock);
ASSERT(multilist_link_active(multilist_d2l(ml, obj)));
multilist_sublist_remove(mls, obj);
if (need_lock)
mutex_exit(&mls->mls_lock);
}
/*
* Check to see if this multilist object is empty.
*
* This will return TRUE if it finds all of the sublists of this
* multilist to be empty, and FALSE otherwise. Each sublist lock will be
* automatically acquired as necessary.
*
* If concurrent insertions and removals are occurring, the semantics
* of this function become a little fuzzy. Instead of locking all
* sublists for the entire call time of the function, each sublist is
* only locked as it is individually checked for emptiness. Thus, it's
* possible for this function to return TRUE with non-empty sublists at
* the time the function returns. This would be due to another thread
* inserting into a given sublist, after that specific sublist was checked
* and deemed empty, but before all sublists have been checked.
*/
int
multilist_is_empty(multilist_t *ml)
{
int i;
for (i = 0; i < ml->ml_num_sublists; i++) {
multilist_sublist_t *mls = &ml->ml_sublists[i];
/* See comment in multilist_insert(). */
boolean_t need_lock = !MUTEX_HELD(&mls->mls_lock);
if (need_lock)
mutex_enter(&mls->mls_lock);
if (!list_is_empty(&mls->mls_list)) {
if (need_lock)
mutex_exit(&mls->mls_lock);
return (FALSE);
}
if (need_lock)
mutex_exit(&mls->mls_lock);
}
return (TRUE);
}
/* Return the number of sublists composing this multilist */
unsigned int
multilist_get_num_sublists(multilist_t *ml)
{
return (ml->ml_num_sublists);
}
/* Return a randomly selected, valid sublist index for this multilist */
unsigned int
multilist_get_random_index(multilist_t *ml)
{
return (spa_get_random(ml->ml_num_sublists));
}
/* Lock and return the sublist specified at the given index */
multilist_sublist_t *
multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx)
{
multilist_sublist_t *mls;
ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
mls = &ml->ml_sublists[sublist_idx];
mutex_enter(&mls->mls_lock);
return (mls);
}
void
multilist_sublist_unlock(multilist_sublist_t *mls)
{
mutex_exit(&mls->mls_lock);
}
/*
* We're allowing any object to be inserted into this specific sublist,
* but this can lead to trouble if multilist_remove() is called to
* remove this object. Specifically, if calling ml_index_func on this
* object returns an index for a sublist different than what is passed as
* a parameter here, any call to multilist_remove() with this newly
* inserted object is undefined! (the call to multilist_remove() will
* remove the object from a list that it isn't contained in)
*/
void
multilist_sublist_insert_head(multilist_sublist_t *mls, void *obj)
{
ASSERT(MUTEX_HELD(&mls->mls_lock));
list_insert_head(&mls->mls_list, obj);
}
/* please see comment above multilist_sublist_insert_head */
void
multilist_sublist_insert_tail(multilist_sublist_t *mls, void *obj)
{
ASSERT(MUTEX_HELD(&mls->mls_lock));
list_insert_tail(&mls->mls_list, obj);
}
/*
* Move the object one element forward in the list.
*
* This function will move the given object forward in the list (towards
* the head) by one object. So, in essence, it will swap its position in
* the list with its "prev" pointer. If the given object is already at the
* head of the list, it cannot be moved forward any more than it already
* is, so no action is taken.
*
* NOTE: This function **must not** remove any object from the list other
* than the object given as the parameter. This is relied upon in
* arc_evict_state_impl().
*/
void
multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj)
{
void *prev = list_prev(&mls->mls_list, obj);
ASSERT(MUTEX_HELD(&mls->mls_lock));
ASSERT(!list_is_empty(&mls->mls_list));
/* 'obj' must be at the head of the list, nothing to do */
if (prev == NULL)
return;
list_remove(&mls->mls_list, obj);
list_insert_before(&mls->mls_list, prev, obj);
}
void
multilist_sublist_remove(multilist_sublist_t *mls, void *obj)
{
ASSERT(MUTEX_HELD(&mls->mls_lock));
list_remove(&mls->mls_list, obj);
}
void *
multilist_sublist_head(multilist_sublist_t *mls)
{
ASSERT(MUTEX_HELD(&mls->mls_lock));
return (list_head(&mls->mls_list));
}
void *
multilist_sublist_tail(multilist_sublist_t *mls)
{
ASSERT(MUTEX_HELD(&mls->mls_lock));
return (list_tail(&mls->mls_list));
}
void *
multilist_sublist_next(multilist_sublist_t *mls, void *obj)
{
ASSERT(MUTEX_HELD(&mls->mls_lock));
return (list_next(&mls->mls_list, obj));
}
void *
multilist_sublist_prev(multilist_sublist_t *mls, void *obj)
{
ASSERT(MUTEX_HELD(&mls->mls_lock));
return (list_prev(&mls->mls_list, obj));
}
void
multilist_link_init(multilist_node_t *link)
{
list_link_init(link);
}
int
multilist_link_active(multilist_node_t *link)
{
return (list_link_active(link));
}
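To round out the API above, a hedged sketch (illustrative names, not code from this commit) of the consumer-side traversal pattern that multilist_sublist_lock() is meant to support:

/* Walk every object, one locked sublist at a time. */
static void
my_list_walk(multilist_t *ml)
{
	unsigned int i;

	for (i = 0; i < multilist_get_num_sublists(ml); i++) {
		multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
		void *obj;

		for (obj = multilist_sublist_head(mls); obj != NULL;
		    obj = multilist_sublist_next(mls, obj)) {
			/* Inspect obj while mls_lock is held. */
		}

		multilist_sublist_unlock(mls);
	}
}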

View File

@ -200,7 +200,7 @@ spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags)
if (zfs_read_history == 0 && ssh->size == 0)
return;
if (zfs_read_history_hits == 0 && (aflags & ARC_CACHED))
if (zfs_read_history_hits == 0 && (aflags & ARC_FLAG_CACHED))
return;
srh = kmem_zalloc(sizeof (spa_read_history_t), KM_SLEEP);

View File

@ -23,6 +23,7 @@
* (and only one) C file, so this dummy file exists for that purpose.
*/
#include <sys/multilist.h>
#include <sys/arc_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
@ -31,6 +32,7 @@
#include <sys/dsl_dataset.h>
#include <sys/dmu_tx.h>
#include <sys/dnode.h>
#include <sys/multilist.h>
#include <sys/zfs_znode.h>
#include <sys/zil_impl.h>
#include <sys/zrlock.h>
@ -42,6 +44,7 @@
#include <sys/trace_dbuf.h>
#include <sys/trace_dmu.h>
#include <sys/trace_dnode.h>
#include <sys/trace_multilist.h>
#include <sys/trace_txg.h>
#include <sys/trace_zil.h>
#include <sys/trace_zrlock.h>

View File

@ -471,7 +471,7 @@ txg_wait_callbacks(dsl_pool_t *dp)
tx_state_t *tx = &dp->dp_tx;
if (tx->tx_commit_cb_taskq != NULL)
taskq_wait(tx->tx_commit_cb_taskq);
taskq_wait_outstanding(tx->tx_commit_cb_taskq, 0);
}
static void

View File

@ -1152,8 +1152,8 @@ zfs_sb_teardown(zfs_sb_t *zsb, boolean_t unmounting)
*/
int round = 0;
while (zsb->z_nr_znodes > 0) {
taskq_wait(dsl_pool_iput_taskq(dmu_objset_pool(
zsb->z_os)));
taskq_wait_outstanding(dsl_pool_iput_taskq(
dmu_objset_pool(zsb->z_os)), 0);
if (++round > 1 && !unmounting)
break;
}
@ -1740,7 +1740,7 @@ zfs_init(void)
void
zfs_fini(void)
{
taskq_wait(system_taskq);
taskq_wait_outstanding(system_taskq, 0);
unregister_filesystem(&zpl_fs_type);
zfs_znode_fini();
zfsctl_fini();

View File

@ -204,7 +204,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
char **end)
{
enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
uint32_t aflags = ARC_WAIT;
arc_flags_t aflags = ARC_FLAG_WAIT;
arc_buf_t *abuf = NULL;
zbookmark_phys_t zb;
int error;
@ -280,7 +280,7 @@ zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
{
enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
const blkptr_t *bp = &lr->lr_blkptr;
uint32_t aflags = ARC_WAIT;
arc_flags_t aflags = ARC_FLAG_WAIT;
arc_buf_t *abuf = NULL;
zbookmark_phys_t zb;
int error;

View File

@ -2241,7 +2241,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
if (ddp->ddp_phys_birth != 0) {
arc_buf_t *abuf = NULL;
uint32_t aflags = ARC_WAIT;
arc_flags_t aflags = ARC_FLAG_WAIT;
blkptr_t blk = *zio->io_bp;
int error;

View File

@ -439,7 +439,11 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
* fault injection isn't a performance critical path.
*/
if (flags & ZINJECT_FLUSH_ARC)
arc_flush(NULL);
/*
* We must use FALSE to ensure arc_flush returns, since
* we're not preventing concurrent ARC insertions.
*/
arc_flush(NULL, FALSE);
return (0);
}