Improvements on persistent L2ARC

Functional changes:

We implement refcounts of log blocks and their aligned size on the
cache device along with two corresponding arcstats. The refcounts are
reflected in the header of the device and provide valuable information
as to whether log blocks are accounted for correctly. These are
dynamically adjusted as log blocks are committed/evicted. zdb also uses
this information in the device header and compares it to the
corresponding values as reported by dump_l2arc_log_blocks() which
emulates l2arc_rebuild(). If the refcounts saved in the device header
report higher values, zdb exits with an error. For this feature to work
correctly there should be no active writes on the device. This is also
employed in the tests of persistent L2ARC. We extend the structure of
the cache device header by adding the two new variables mirroring the
refcounts after the existing variables to preserve backward
compatibility in terms of persistent L2ARC.

1) a new arcstat "l2_log_blk_asize" and refcount "l2ad_lb_asize" which
   reflect the total aligned size of log blocks on the device. This is
   also reflected in the header of the cache device as "dh_lb_asize".
2) a new arcstat "l2arc_log_blk_count" and refcount "l2ad_lb_count"
   which reflect the total number of L2ARC log blocks present on cache
   devices.  It is also reflected in the header of the cache device as
   "dh_lb_count".

In l2arc_rebuild_vdev() if the amount of committed log entries in a log
block is 0 and the device header is valid we update the device header.
This will facilitate trimming of the whole device in this case when
TRIM for L2ARC is implemented.

Improve loop protection in l2arc_rebuild() by using the starting offset
of the payload of each log block instead of the starting offset of the
log block.

If the zio in l2arc_write_buffers() fails, restore the lbps array in the
header of the device to its previous state in l2arc_write_done().

If l2arc_rebuild() ends the rebuild process without restoring any L2ARC
log blocks in ARC and without any other error, this means that the lbps
array in the header is pointing to non-existent or invalid log blocks.
Reset the device header in this case.

In l2arc_rebuild() change the zfs_dbgmsg messages to
spa_history_log_internal() making them user visible with zpool history
command.

Non-functional changes:

Make the first test in persistent L2ARC use `zdb -lll` to increase
coverage in `zdb.c`.

Rename psize with asize when referring to log blocks, since
L2ARC_SET_PSIZE stores the vdev aligned size for log blocks. Also
rename dh_log_blk_entries to dh_log_entries to make it clear that
it is a mirror of l2ad_log_entries. Added comments for both changes.

Fix inaccurate comments for example in l2arc_log_blk_restore().

Add asserts at the end in l2arc_evict() and l2arc_write_buffers().

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: George Amanakis <gamanakis@gmail.com>
Closes #10228
This commit is contained in:
George Amanakis 2020-05-07 19:34:03 -04:00 committed by GitHub
parent 108a454a46
commit 657fd33bcf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 278 additions and 126 deletions

View File

@ -3493,12 +3493,13 @@ print_l2arc_log_blocks(void)
static void
dump_l2arc_log_entries(uint64_t log_entries,
l2arc_log_ent_phys_t *le, int i)
l2arc_log_ent_phys_t *le, uint64_t i)
{
for (int j = 0; j < log_entries; j++) {
dva_t dva = le[j].le_dva;
(void) printf("lb[%4d]\tle[%4d]\tDVA asize: %llu, "
"vdev: %llu, offset: %llu\n", i, j + 1,
(void) printf("lb[%4llu]\tle[%4d]\tDVA asize: %llu, "
"vdev: %llu, offset: %llu\n",
(u_longlong_t)i, j + 1,
(u_longlong_t)DVA_GET_ASIZE(&dva),
(u_longlong_t)DVA_GET_VDEV(&dva),
(u_longlong_t)DVA_GET_OFFSET(&dva));
@ -3533,7 +3534,7 @@ dump_l2arc_log_blkptr(l2arc_log_blkptr_t lbps)
(u_longlong_t)lbps.lbp_payload_start);
(void) printf("|\t\tlsize: %llu\n",
(u_longlong_t)L2BLK_GET_LSIZE((&lbps)->lbp_prop));
(void) printf("|\t\tpsize: %llu\n",
(void) printf("|\t\tasize: %llu\n",
(u_longlong_t)L2BLK_GET_PSIZE((&lbps)->lbp_prop));
(void) printf("|\t\tcompralgo: %llu\n",
(u_longlong_t)L2BLK_GET_COMPRESS((&lbps)->lbp_prop));
@ -3543,17 +3544,19 @@ dump_l2arc_log_blkptr(l2arc_log_blkptr_t lbps)
}
static void
dump_l2arc_log_blocks(int fd, l2arc_dev_hdr_phys_t l2dhdr)
dump_l2arc_log_blocks(int fd, l2arc_dev_hdr_phys_t l2dhdr,
l2arc_dev_hdr_phys_t *rebuild)
{
l2arc_log_blk_phys_t this_lb;
uint64_t psize;
uint64_t asize;
l2arc_log_blkptr_t lbps[2];
abd_t *abd;
zio_cksum_t cksum;
int i = 0, failed = 0;
int failed = 0;
l2arc_dev_t dev;
print_l2arc_log_blocks();
if (!dump_opt['q'])
print_l2arc_log_blocks();
bcopy((&l2dhdr)->dh_start_lbps, lbps, sizeof (lbps));
dev.l2ad_evict = l2dhdr.dh_evict;
@ -3562,8 +3565,10 @@ dump_l2arc_log_blocks(int fd, l2arc_dev_hdr_phys_t l2dhdr)
if (l2dhdr.dh_start_lbps[0].lbp_daddr == 0) {
/* no log blocks to read */
(void) printf("No log blocks to read\n");
(void) printf("\n");
if (!dump_opt['q']) {
(void) printf("No log blocks to read\n");
(void) printf("\n");
}
return;
} else {
dev.l2ad_hand = lbps[0].lbp_daddr +
@ -3576,17 +3581,23 @@ dump_l2arc_log_blocks(int fd, l2arc_dev_hdr_phys_t l2dhdr)
if (!l2arc_log_blkptr_valid(&dev, &lbps[0]))
break;
psize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);
if (pread64(fd, &this_lb, psize, lbps[0].lbp_daddr) != psize) {
(void) printf("Error while reading next log block\n\n");
/* L2BLK_GET_PSIZE returns aligned size for log blocks */
asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);
if (pread64(fd, &this_lb, asize, lbps[0].lbp_daddr) != asize) {
if (!dump_opt['q']) {
(void) printf("Error while reading next log "
"block\n\n");
}
break;
}
fletcher_4_native_varsize(&this_lb, psize, &cksum);
fletcher_4_native_varsize(&this_lb, asize, &cksum);
if (!ZIO_CHECKSUM_EQUAL(cksum, lbps[0].lbp_cksum)) {
failed++;
(void) printf("Invalid cksum\n");
dump_l2arc_log_blkptr(lbps[0]);
if (!dump_opt['q']) {
(void) printf("Invalid cksum\n");
dump_l2arc_log_blkptr(lbps[0]);
}
break;
}
@ -3594,11 +3605,11 @@ dump_l2arc_log_blocks(int fd, l2arc_dev_hdr_phys_t l2dhdr)
case ZIO_COMPRESS_OFF:
break;
case ZIO_COMPRESS_LZ4:
abd = abd_alloc_for_io(psize, B_TRUE);
abd_copy_from_buf_off(abd, &this_lb, 0, psize);
abd = abd_alloc_for_io(asize, B_TRUE);
abd_copy_from_buf_off(abd, &this_lb, 0, asize);
zio_decompress_data(L2BLK_GET_COMPRESS(
(&lbps[0])->lbp_prop), abd, &this_lb,
psize, sizeof (this_lb));
asize, sizeof (this_lb));
abd_free(abd);
break;
default:
@ -3608,39 +3619,52 @@ dump_l2arc_log_blocks(int fd, l2arc_dev_hdr_phys_t l2dhdr)
if (this_lb.lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
byteswap_uint64_array(&this_lb, sizeof (this_lb));
if (this_lb.lb_magic != L2ARC_LOG_BLK_MAGIC) {
(void) printf("Invalid log block magic\n\n");
if (!dump_opt['q'])
(void) printf("Invalid log block magic\n\n");
break;
}
i++;
if (dump_opt['l'] > 1) {
(void) printf("lb[%4d]\tmagic: %llu\n", i,
rebuild->dh_lb_count++;
rebuild->dh_lb_asize += asize;
if (dump_opt['l'] > 1 && !dump_opt['q']) {
(void) printf("lb[%4llu]\tmagic: %llu\n",
(u_longlong_t)rebuild->dh_lb_count,
(u_longlong_t)this_lb.lb_magic);
dump_l2arc_log_blkptr(lbps[0]);
}
if (dump_opt['l'] > 2)
dump_l2arc_log_entries(l2dhdr.dh_log_blk_ent,
this_lb.lb_entries, i);
if (dump_opt['l'] > 2 && !dump_opt['q'])
dump_l2arc_log_entries(l2dhdr.dh_log_entries,
this_lb.lb_entries,
rebuild->dh_lb_count);
if (l2arc_range_check_overlap(lbps[1].lbp_daddr,
lbps[0].lbp_daddr, dev.l2ad_evict) && !dev.l2ad_first)
if (l2arc_range_check_overlap(lbps[1].lbp_payload_start,
lbps[0].lbp_payload_start, dev.l2ad_evict) &&
!dev.l2ad_first)
break;
lbps[0] = lbps[1];
lbps[1] = this_lb.lb_prev_lbp;
}
(void) printf("log_blk_count:\t %d with valid cksum\n", i);
(void) printf("\t\t %d with invalid cksum\n\n", failed);
if (!dump_opt['q']) {
(void) printf("log_blk_count:\t %llu with valid cksum\n",
(u_longlong_t)rebuild->dh_lb_count);
(void) printf("\t\t %d with invalid cksum\n", failed);
(void) printf("log_blk_asize:\t %llu\n\n",
(u_longlong_t)rebuild->dh_lb_asize);
}
}
static void
static int
dump_l2arc_header(int fd)
{
l2arc_dev_hdr_phys_t l2dhdr;
l2arc_dev_hdr_phys_t l2dhdr, rebuild;
int error = B_FALSE;
bzero(&l2dhdr, sizeof (l2dhdr));
bzero(&rebuild, sizeof (rebuild));
if (pread64(fd, &l2dhdr, sizeof (l2dhdr),
VDEV_LABEL_START_SIZE) != sizeof (l2dhdr)) {
error = B_TRUE;
@ -3654,6 +3678,8 @@ dump_l2arc_header(int fd)
if (error) {
(void) printf("L2ARC device header not found\n\n");
/* Do not return an error here for backward compatibility */
return (0);
} else if (!dump_opt['q']) {
print_l2arc_header();
@ -3672,16 +3698,39 @@ dump_l2arc_header(int fd)
(u_longlong_t)
l2dhdr.dh_start_lbps[1].lbp_daddr);
(void) printf(" log_blk_ent: %llu\n",
(u_longlong_t)l2dhdr.dh_log_blk_ent);
(u_longlong_t)l2dhdr.dh_log_entries);
(void) printf(" start: %llu\n",
(u_longlong_t)l2dhdr.dh_start);
(void) printf(" end: %llu\n",
(u_longlong_t)l2dhdr.dh_end);
(void) printf(" evict: %llu\n\n",
(void) printf(" evict: %llu\n",
(u_longlong_t)l2dhdr.dh_evict);
dump_l2arc_log_blocks(fd, l2dhdr);
(void) printf(" lb_asize_refcount: %llu\n",
(u_longlong_t)l2dhdr.dh_lb_asize);
(void) printf(" lb_count_refcount: %llu\n\n",
(u_longlong_t)l2dhdr.dh_lb_count);
}
dump_l2arc_log_blocks(fd, l2dhdr, &rebuild);
/*
* The total aligned size of log blocks and the number of log blocks
* reported in the header of the device may be less than what zdb
* reports by dump_l2arc_log_blocks() which emulates l2arc_rebuild().
* This happens because dump_l2arc_log_blocks() lacks the memory
* pressure valve that l2arc_rebuild() has. Thus, if we are on a system
* with low memory, l2arc_rebuild will exit prematurely and dh_lb_asize
* and dh_lb_count will be lower to begin with than what exists on the
* device. This is normal and zdb should not exit with an error. The
* opposite case should never happen though, the values reported in the
* header should never be higher than what dump_l2arc_log_blocks() and
* l2arc_rebuild() report. If this happens there is a leak in the
* accounting of log blocks.
*/
if (l2dhdr.dh_lb_asize > rebuild.dh_lb_asize ||
l2dhdr.dh_lb_count > rebuild.dh_lb_count)
return (1);
return (0);
}
static void
@ -4009,7 +4058,7 @@ dump_label(const char *dev)
* Dump the L2ARC header, if existent.
*/
if (read_l2arc_header)
dump_l2arc_header(fd);
error |= dump_l2arc_header(fd);
cookie = NULL;
while ((node = avl_destroy_nodes(&config_tree, &cookie)) != NULL)

View File

@ -200,7 +200,7 @@ typedef struct l2arc_log_blkptr {
/*
* lbp_prop has the following format:
* * logical size (in bytes)
* * physical (compressed) size (in bytes)
* * aligned (after compression) size (in bytes)
* * compression algorithm (we always LZ4-compress l2arc logs)
* * checksum algorithm (used for lbp_cksum)
*/
@ -221,22 +221,26 @@ typedef struct l2arc_dev_hdr_phys {
*/
uint64_t dh_spa_guid;
uint64_t dh_vdev_guid;
uint64_t dh_log_blk_ent; /* entries per log blk */
uint64_t dh_log_entries; /* mirror of l2ad_log_entries */
uint64_t dh_evict; /* evicted offset in bytes */
uint64_t dh_flags; /* l2arc_dev_hdr_flags_t */
/*
* Used in zdb.c for determining if a log block is valid, in the same
* way that l2arc_rebuild() does.
*/
uint64_t dh_start;
uint64_t dh_end;
uint64_t dh_start; /* mirror of l2ad_start */
uint64_t dh_end; /* mirror of l2ad_end */
/*
* Start of log block chain. [0] -> newest log, [1] -> one older (used
* for initiating prefetch).
*/
l2arc_log_blkptr_t dh_start_lbps[2];
const uint64_t dh_pad[34]; /* pad to 512 bytes */
/*
* Aligned size of all log blocks as accounted by vdev_space_update().
*/
uint64_t dh_lb_asize; /* mirror of l2ad_lb_asize */
uint64_t dh_lb_count; /* mirror of l2ad_lb_count */
const uint64_t dh_pad[32]; /* pad to 512 bytes */
zio_eck_t dh_tail;
} l2arc_dev_hdr_phys_t;
CTASSERT_GLOBAL(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE);
@ -387,6 +391,14 @@ typedef struct l2arc_dev {
uint64_t l2ad_evict; /* evicted offset in bytes */
/* List of pointers to log blocks present in the L2ARC device */
list_t l2ad_lbptr_list;
/*
* Aligned size of all log blocks as accounted by vdev_space_update().
*/
zfs_refcount_t l2ad_lb_asize;
/*
* Number of log blocks present on the device.
*/
zfs_refcount_t l2ad_lb_count;
} l2arc_dev_t;
/*
@ -738,14 +750,18 @@ typedef struct arc_stats {
*/
kstat_named_t arcstat_l2_log_blk_writes;
/*
* Moving average of the physical size of the L2ARC log blocks, in
* Moving average of the aligned size of the L2ARC log blocks, in
* bytes. Updated during L2ARC rebuild and during writing of L2ARC
* log blocks.
*/
kstat_named_t arcstat_l2_log_blk_avg_size;
kstat_named_t arcstat_l2_log_blk_avg_asize;
/* Aligned size of L2ARC log blocks on L2ARC devices. */
kstat_named_t arcstat_l2_log_blk_asize;
/* Number of L2ARC log blocks present on L2ARC devices. */
kstat_named_t arcstat_l2_log_blk_count;
/*
* Moving average of the physical size of L2ARC restored data, in bytes,
* to the physical size of their metadata in ARC, in bytes.
* Moving average of the aligned size of L2ARC restored data, in bytes,
* to the aligned size of their metadata in L2ARC, in bytes.
* Updated during L2ARC rebuild and during writing of L2ARC log blocks.
*/
kstat_named_t arcstat_l2_data_to_meta_ratio;
@ -780,6 +796,8 @@ typedef struct arc_stats {
kstat_named_t arcstat_l2_rebuild_abort_lowmem;
/* Logical size of L2ARC restored data, in bytes. */
kstat_named_t arcstat_l2_rebuild_size;
/* Aligned size of L2ARC restored data, in bytes. */
kstat_named_t arcstat_l2_rebuild_asize;
/*
* Number of L2ARC log entries (buffers) that were successfully
* restored in ARC.
@ -790,8 +808,6 @@ typedef struct arc_stats {
* were not restored again.
*/
kstat_named_t arcstat_l2_rebuild_bufs_precached;
/* Physical size of L2ARC restored data, in bytes. */
kstat_named_t arcstat_l2_rebuild_psize;
/*
* Number of L2ARC log blocks that were restored successfully. Each
* log block may hold up to L2ARC_LOG_BLK_MAX_ENTRIES buffers.

View File

@ -216,7 +216,10 @@ Read the vdev labels and L2ARC header from the specified device.
.Nm Fl l
will return 0 if valid label was found, 1 if error occurred, and 2 if no valid
labels were found. The presence of L2ARC header is indicated by a specific
sequence (L2ARC_DEV_HDR_MAGIC). Each unique configuration is displayed only
sequence (L2ARC_DEV_HDR_MAGIC). If there is an accounting error in the size
or the number of L2ARC log blocks
.Nm Fl l
will return 1. Each unique configuration is displayed only
once.
.It Fl ll Ar device
In addition display label space usage stats. If a valid L2ARC header was found

View File

@ -530,7 +530,9 @@ arc_stats_t arc_stats = {
{ "l2_asize", KSTAT_DATA_UINT64 },
{ "l2_hdr_size", KSTAT_DATA_UINT64 },
{ "l2_log_blk_writes", KSTAT_DATA_UINT64 },
{ "l2_log_blk_avg_size", KSTAT_DATA_UINT64 },
{ "l2_log_blk_avg_asize", KSTAT_DATA_UINT64 },
{ "l2_log_blk_asize", KSTAT_DATA_UINT64 },
{ "l2_log_blk_count", KSTAT_DATA_UINT64 },
{ "l2_data_to_meta_ratio", KSTAT_DATA_UINT64 },
{ "l2_rebuild_success", KSTAT_DATA_UINT64 },
{ "l2_rebuild_unsupported", KSTAT_DATA_UINT64 },
@ -539,9 +541,9 @@ arc_stats_t arc_stats = {
{ "l2_rebuild_cksum_lb_errors", KSTAT_DATA_UINT64 },
{ "l2_rebuild_lowmem", KSTAT_DATA_UINT64 },
{ "l2_rebuild_size", KSTAT_DATA_UINT64 },
{ "l2_rebuild_asize", KSTAT_DATA_UINT64 },
{ "l2_rebuild_bufs", KSTAT_DATA_UINT64 },
{ "l2_rebuild_bufs_precached", KSTAT_DATA_UINT64 },
{ "l2_rebuild_psize", KSTAT_DATA_UINT64 },
{ "l2_rebuild_log_blks", KSTAT_DATA_UINT64 },
{ "memory_throttle_count", KSTAT_DATA_UINT64 },
{ "memory_direct_count", KSTAT_DATA_UINT64 },
@ -895,7 +897,7 @@ static void l2arc_log_blk_fetch_abort(zio_t *zio);
/* L2ARC persistence block restoration routines. */
static void l2arc_log_blk_restore(l2arc_dev_t *dev,
const l2arc_log_blk_phys_t *lb, uint64_t lb_psize, uint64_t lb_daddr);
const l2arc_log_blk_phys_t *lb, uint64_t lb_asize, uint64_t lb_daddr);
static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le,
l2arc_dev_t *dev);
@ -7864,6 +7866,7 @@ l2arc_write_done(zio_t *zio)
l2arc_lb_abd_buf_t *abd_buf;
l2arc_lb_ptr_buf_t *lb_ptr_buf;
l2arc_dev_t *dev;
l2arc_dev_hdr_phys_t *l2dhdr;
list_t *buflist;
arc_buf_hdr_t *head, *hdr, *hdr_prev;
kmutex_t *hash_lock;
@ -7872,6 +7875,7 @@ l2arc_write_done(zio_t *zio)
cb = zio->io_private;
ASSERT3P(cb, !=, NULL);
dev = cb->l2wcb_dev;
l2dhdr = dev->l2ad_dev_hdr;
ASSERT3P(dev, !=, NULL);
head = cb->l2wcb_head;
ASSERT3P(head, !=, NULL);
@ -7975,8 +7979,18 @@ top:
zio_buf_free(abd_buf, sizeof (*abd_buf));
if (zio->io_error != 0) {
lb_ptr_buf = list_remove_head(&dev->l2ad_lbptr_list);
bytes_dropped +=
/*
* L2BLK_GET_PSIZE returns aligned size for log
* blocks.
*/
uint64_t asize =
L2BLK_GET_PSIZE((lb_ptr_buf->lb_ptr)->lbp_prop);
bytes_dropped += asize;
ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize);
ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count);
zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize,
lb_ptr_buf);
zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf);
kmem_free(lb_ptr_buf->lb_ptr,
sizeof (l2arc_log_blkptr_t));
kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t));
@ -7984,6 +7998,17 @@ top:
}
list_destroy(&cb->l2wcb_abd_list);
if (zio->io_error != 0) {
/* restore the lbps array in the header to its previous state */
lb_ptr_buf = list_head(&dev->l2ad_lbptr_list);
for (int i = 0; i < 2; i++) {
bcopy(lb_ptr_buf->lb_ptr, &l2dhdr->dh_start_lbps[i],
sizeof (l2arc_log_blkptr_t));
lb_ptr_buf = list_next(&dev->l2ad_lbptr_list,
lb_ptr_buf);
}
}
atomic_inc_64(&l2arc_writes_done);
list_remove(buflist, head);
ASSERT(!HDR_HAS_L1HDR(head));
@ -8277,21 +8302,21 @@ l2arc_sublist_lock(int list_num)
/*
* Calculates the maximum overhead of L2ARC metadata log blocks for a given
* L2ARC write size. l2arc_evict and l2arc_write_buffers need to include this
* L2ARC write size. l2arc_evict and l2arc_write_size need to include this
* overhead in processing to make sure there is enough headroom available
* when writing buffers.
*/
static inline uint64_t
l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev)
{
if (dev->l2ad_dev_hdr->dh_log_blk_ent == 0) {
if (dev->l2ad_log_entries == 0) {
return (0);
} else {
uint64_t log_entries = write_sz >> SPA_MINBLOCKSHIFT;
uint64_t log_blocks = (log_entries +
dev->l2ad_dev_hdr->dh_log_blk_ent - 1) /
dev->l2ad_dev_hdr->dh_log_blk_ent;
dev->l2ad_log_entries - 1) /
dev->l2ad_log_entries;
return (vdev_psize_to_asize(dev->l2ad_vdev,
sizeof (l2arc_log_blk_phys_t)) * log_blocks);
@ -8373,17 +8398,24 @@ retry:
lb_ptr_buf_prev = list_prev(&dev->l2ad_lbptr_list, lb_ptr_buf);
/* L2BLK_GET_PSIZE returns aligned size for log blocks */
uint64_t asize = L2BLK_GET_PSIZE(
(lb_ptr_buf->lb_ptr)->lbp_prop);
/*
* We don't worry about log blocks left behind (ie
* lbp_daddr + psize < l2ad_hand) because l2arc_write_buffers()
* lbp_payload_start < l2ad_hand) because l2arc_write_buffers()
* will never write more than l2arc_evict() evicts.
*/
if (!all && l2arc_log_blkptr_valid(dev, lb_ptr_buf->lb_ptr)) {
break;
} else {
vdev_space_update(dev->l2ad_vdev,
-L2BLK_GET_PSIZE(
(lb_ptr_buf->lb_ptr)->lbp_prop), 0, 0);
vdev_space_update(dev->l2ad_vdev, -asize, 0, 0);
ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize);
ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count);
zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize,
lb_ptr_buf);
zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf);
list_remove(&dev->l2ad_lbptr_list, lb_ptr_buf);
kmem_free(lb_ptr_buf->lb_ptr,
sizeof (l2arc_log_blkptr_t));
@ -8475,6 +8507,10 @@ out:
dev->l2ad_first = B_FALSE;
goto top;
}
ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end);
if (!dev->l2ad_first)
ASSERT3U(dev->l2ad_hand, <, dev->l2ad_evict);
}
/*
@ -8777,6 +8813,10 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
sizeof (l2arc_write_callback_t), KM_SLEEP);
cb->l2wcb_dev = dev;
cb->l2wcb_head = head;
/*
* Create a list to save allocated abd buffers
* for l2arc_log_blk_commit().
*/
list_create(&cb->l2wcb_abd_list,
sizeof (l2arc_lb_abd_buf_t),
offsetof(l2arc_lb_abd_buf_t, node));
@ -8846,6 +8886,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
return (0);
}
if (!dev->l2ad_first)
ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict);
ASSERT3U(write_asize, <=, target_sz);
ARCSTAT_BUMP(arcstat_l2_writes_sent);
ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize);
@ -9036,6 +9079,8 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
zfs_refcount_create(&adddev->l2ad_alloc);
zfs_refcount_create(&adddev->l2ad_lb_asize);
zfs_refcount_create(&adddev->l2ad_lb_count);
/*
* Add device to global list
@ -9059,7 +9104,7 @@ l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen)
uint64_t l2dhdr_asize;
spa_t *spa;
int err;
boolean_t rebuild = B_TRUE;
boolean_t l2dhdr_valid = B_TRUE;
dev = l2arc_vdev_get(vd);
ASSERT3P(dev, !=, NULL);
@ -9089,9 +9134,9 @@ l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen)
* Read the device header, if an error is returned do not rebuild L2ARC.
*/
if ((err = l2arc_dev_hdr_read(dev)) != 0)
rebuild = B_FALSE;
l2dhdr_valid = B_FALSE;
if (rebuild && l2dhdr->dh_log_blk_ent > 0) {
if (l2dhdr_valid && dev->l2ad_log_entries > 0) {
/*
* If we are onlining a cache device (vdev_reopen) that was
* still present (l2arc_vdev_present()) and rebuild is enabled,
@ -9117,12 +9162,10 @@ l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen)
* async task which will call l2arc_spa_rebuild_start.
*/
dev->l2ad_rebuild = B_TRUE;
} else if (!rebuild && spa_writeable(spa)) {
} else if (spa_writeable(spa)) {
/*
* The boolean rebuild is false if reading the device header
* returned an error. In this case create a new header. We
* zero out the memory holding the header to reset
* dh_start_lbps.
* In this case create a new header. We zero out the memory
* holding the header to reset dh_start_lbps.
*/
bzero(l2dhdr, l2dhdr_asize);
l2arc_dev_hdr_update(dev);
@ -9172,6 +9215,8 @@ l2arc_remove_vdev(vdev_t *vd)
list_destroy(&remdev->l2ad_lbptr_list);
mutex_destroy(&remdev->l2ad_mtx);
zfs_refcount_destroy(&remdev->l2ad_alloc);
zfs_refcount_destroy(&remdev->l2ad_lb_asize);
zfs_refcount_destroy(&remdev->l2ad_lb_count);
kmem_free(remdev->l2ad_dev_hdr, remdev->l2ad_dev_hdr_asize);
vmem_free(remdev, sizeof (l2arc_dev_t));
}
@ -9309,7 +9354,7 @@ l2arc_rebuild(l2arc_dev_t *dev)
{
vdev_t *vd = dev->l2ad_vdev;
spa_t *spa = vd->vdev_spa;
int i = 0, err = 0;
int err = 0;
l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
l2arc_log_blk_phys_t *this_lb, *next_lb;
zio_t *this_io = NULL, *next_io = NULL;
@ -9332,6 +9377,7 @@ l2arc_rebuild(l2arc_dev_t *dev)
/*
* Retrieve the persistent L2ARC device state.
* L2BLK_GET_PSIZE returns aligned size for log blocks.
*/
dev->l2ad_evict = MAX(l2dhdr->dh_evict, dev->l2ad_start);
dev->l2ad_hand = MAX(l2dhdr->dh_start_lbps[0].lbp_daddr +
@ -9381,11 +9427,10 @@ l2arc_rebuild(l2arc_dev_t *dev)
/*
* Now that we know that the next_lb checks out alright, we
* can start reconstruction from this log block.
* L2BLK_GET_PSIZE returns aligned size for log blocks.
*/
l2arc_log_blk_restore(dev, this_lb,
L2BLK_GET_PSIZE((&lbps[0])->lbp_prop),
lbps[0].lbp_daddr);
i++;
uint64_t asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);
l2arc_log_blk_restore(dev, this_lb, asize, lbps[0].lbp_daddr);
/*
* log block restored, include its pointer in the list of
@ -9398,9 +9443,12 @@ l2arc_rebuild(l2arc_dev_t *dev)
sizeof (l2arc_log_blkptr_t));
mutex_enter(&dev->l2ad_mtx);
list_insert_tail(&dev->l2ad_lbptr_list, lb_ptr_buf);
ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize);
ARCSTAT_BUMP(arcstat_l2_log_blk_count);
zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf);
zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf);
mutex_exit(&dev->l2ad_mtx);
vdev_space_update(vd,
L2BLK_GET_PSIZE((&lbps[0])->lbp_prop), 0, 0);
vdev_space_update(vd, asize, 0, 0);
/*
* Protection against loops of log blocks:
@ -9417,13 +9465,16 @@ l2arc_rebuild(l2arc_dev_t *dev)
* l2arc_log_blkptr_valid() but the log block should not be
* restored as it is overwritten by the payload of log block
* (0). Only log blocks (0)-(3) should be restored. We check
* whether l2ad_evict lies in between the next log block
* offset (lbps[1].lbp_daddr) and the present log block offset
* (lbps[0].lbp_daddr). If true and this isn't the first pass,
* we are looping from the beginning and we should stop.
* whether l2ad_evict lies in between the payload starting
* offset of the next log block (lbps[1].lbp_payload_start)
* and the payload starting offset of the present log block
* (lbps[0].lbp_payload_start). If true and this isn't the
* first pass, we are looping from the beginning and we should
* stop.
*/
if (l2arc_range_check_overlap(lbps[1].lbp_daddr,
lbps[0].lbp_daddr, dev->l2ad_evict) && !dev->l2ad_first)
if (l2arc_range_check_overlap(lbps[1].lbp_payload_start,
lbps[0].lbp_payload_start, dev->l2ad_evict) &&
!dev->l2ad_first)
goto out;
for (;;) {
@ -9470,14 +9521,27 @@ out:
vmem_free(next_lb, sizeof (*next_lb));
if (!l2arc_rebuild_enabled) {
zfs_dbgmsg("L2ARC rebuild disabled");
} else if (err == 0 && i > 0) {
spa_history_log_internal(spa, "L2ARC rebuild", NULL,
"disabled");
} else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) > 0) {
ARCSTAT_BUMP(arcstat_l2_rebuild_success);
zfs_dbgmsg("L2ARC successfully rebuilt, "
"restored %d blocks", i);
spa_history_log_internal(spa, "L2ARC rebuild", NULL,
"successful, restored %llu blocks",
(u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count));
} else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) == 0) {
/*
* No error but also nothing restored, meaning the lbps array
* in the device header points to invalid/non-present log
* blocks. Reset the header.
*/
spa_history_log_internal(spa, "L2ARC rebuild", NULL,
"no valid log blocks");
bzero(l2dhdr, dev->l2ad_dev_hdr_asize);
l2arc_dev_hdr_update(dev);
} else if (err != 0) {
zfs_dbgmsg("L2ARC rebuild aborted, "
"restored %d blocks", i);
spa_history_log_internal(spa, "L2ARC rebuild", NULL,
"aborted, restored %llu blocks",
(u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count));
}
if (lock_held)
@ -9527,7 +9591,7 @@ l2arc_dev_hdr_read(l2arc_dev_t *dev)
l2dhdr->dh_spa_guid != guid ||
l2dhdr->dh_vdev_guid != dev->l2ad_vdev->vdev_guid ||
l2dhdr->dh_version != L2ARC_PERSISTENT_VERSION ||
l2dhdr->dh_log_blk_ent != dev->l2ad_log_entries ||
l2dhdr->dh_log_entries != dev->l2ad_log_entries ||
l2dhdr->dh_end != dev->l2ad_end ||
!l2arc_range_check_overlap(dev->l2ad_start, dev->l2ad_end,
l2dhdr->dh_evict)) {
@ -9578,7 +9642,7 @@ l2arc_log_blk_read(l2arc_dev_t *dev,
int err = 0;
zio_cksum_t cksum;
abd_t *abd = NULL;
uint64_t psize;
uint64_t asize;
ASSERT(this_lbp != NULL && next_lbp != NULL);
ASSERT(this_lb != NULL && next_lb != NULL);
@ -9616,9 +9680,12 @@ l2arc_log_blk_read(l2arc_dev_t *dev,
goto cleanup;
}
/* Make sure the buffer checks out */
psize = L2BLK_GET_PSIZE((this_lbp)->lbp_prop);
fletcher_4_native(this_lb, psize, NULL, &cksum);
/*
* Make sure the buffer checks out.
* L2BLK_GET_PSIZE returns aligned size for log blocks.
*/
asize = L2BLK_GET_PSIZE((this_lbp)->lbp_prop);
fletcher_4_native(this_lb, asize, NULL, &cksum);
if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->lbp_cksum)) {
ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_lb_errors);
zfs_dbgmsg("L2ARC log block cksum failed, offset: %llu, "
@ -9634,11 +9701,11 @@ l2arc_log_blk_read(l2arc_dev_t *dev,
case ZIO_COMPRESS_OFF:
break;
case ZIO_COMPRESS_LZ4:
abd = abd_alloc_for_io(psize, B_TRUE);
abd_copy_from_buf_off(abd, this_lb, 0, psize);
abd = abd_alloc_for_io(asize, B_TRUE);
abd_copy_from_buf_off(abd, this_lb, 0, asize);
if ((err = zio_decompress_data(
L2BLK_GET_COMPRESS((this_lbp)->lbp_prop),
abd, this_lb, psize, sizeof (*this_lb))) != 0) {
abd, this_lb, asize, sizeof (*this_lb))) != 0) {
err = SET_ERROR(EINVAL);
goto cleanup;
}
@ -9672,10 +9739,10 @@ cleanup:
*/
static void
l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb,
uint64_t lb_psize, uint64_t lb_daddr)
uint64_t lb_asize, uint64_t lb_daddr)
{
uint64_t size = 0, psize = 0;
uint64_t log_entries = dev->l2ad_dev_hdr->dh_log_blk_ent;
uint64_t size = 0, asize = 0;
uint64_t log_entries = dev->l2ad_log_entries;
for (int i = log_entries - 1; i >= 0; i--) {
/*
@ -9692,27 +9759,28 @@ l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb,
* ^ ^
* | |
* | |
* l2arc_fill_thread l2arc_rebuild
* places new bufs here restores bufs here
* l2arc_feed_thread l2arc_rebuild
* will place new bufs here restores bufs here
*
* This also works when the restored bufs get evicted at any
* point during the rebuild.
* During l2arc_rebuild() the device is not used by
* l2arc_feed_thread() as dev->l2ad_rebuild is set to true.
*/
size += L2BLK_GET_LSIZE((&lb->lb_entries[i])->le_prop);
psize += L2BLK_GET_PSIZE((&lb->lb_entries[i])->le_prop);
asize += vdev_psize_to_asize(dev->l2ad_vdev,
L2BLK_GET_PSIZE((&lb->lb_entries[i])->le_prop));
l2arc_hdr_restore(&lb->lb_entries[i], dev);
}
/*
* Record rebuild stats:
* size Logical size of restored buffers in the L2ARC
* psize Physical size of restored buffers in the L2ARC
* asize Aligned size of restored buffers in the L2ARC
*/
ARCSTAT_INCR(arcstat_l2_rebuild_size, size);
ARCSTAT_INCR(arcstat_l2_rebuild_psize, psize);
ARCSTAT_INCR(arcstat_l2_rebuild_asize, asize);
ARCSTAT_INCR(arcstat_l2_rebuild_bufs, log_entries);
ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_size, lb_psize);
ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, psize / lb_psize);
ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, lb_asize);
ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, asize / lb_asize);
ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks);
}
@ -9800,18 +9868,20 @@ static zio_t *
l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lbp,
l2arc_log_blk_phys_t *lb)
{
uint32_t psize;
uint32_t asize;
zio_t *pio;
l2arc_read_callback_t *cb;
psize = L2BLK_GET_PSIZE((lbp)->lbp_prop);
ASSERT(psize <= sizeof (l2arc_log_blk_phys_t));
/* L2BLK_GET_PSIZE returns aligned size for log blocks */
asize = L2BLK_GET_PSIZE((lbp)->lbp_prop);
ASSERT(asize <= sizeof (l2arc_log_blk_phys_t));
cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP);
cb->l2rcb_abd = abd_get_from_buf(lb, psize);
cb->l2rcb_abd = abd_get_from_buf(lb, asize);
pio = zio_root(vd->vdev_spa, l2arc_blk_fetch_done, cb,
ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
ZIO_FLAG_DONT_RETRY);
(void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, psize,
(void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, asize,
cb->l2rcb_abd, ZIO_CHECKSUM_OFF, NULL, NULL,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE));
@ -9841,14 +9911,18 @@ l2arc_dev_hdr_update(l2arc_dev_t *dev)
abd_t *abd;
int err;
VERIFY(spa_config_held(dev->l2ad_spa, SCL_STATE_ALL, RW_READER));
l2dhdr->dh_magic = L2ARC_DEV_HDR_MAGIC;
l2dhdr->dh_version = L2ARC_PERSISTENT_VERSION;
l2dhdr->dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa);
l2dhdr->dh_vdev_guid = dev->l2ad_vdev->vdev_guid;
l2dhdr->dh_log_blk_ent = dev->l2ad_log_entries;
l2dhdr->dh_log_entries = dev->l2ad_log_entries;
l2dhdr->dh_evict = dev->l2ad_evict;
l2dhdr->dh_start = dev->l2ad_start;
l2dhdr->dh_end = dev->l2ad_end;
l2dhdr->dh_lb_asize = zfs_refcount_count(&dev->l2ad_lb_asize);
l2dhdr->dh_lb_count = zfs_refcount_count(&dev->l2ad_lb_count);
l2dhdr->dh_flags = 0;
if (dev->l2ad_first)
l2dhdr->dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST;
@ -9884,7 +9958,7 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
uint8_t *tmpbuf;
l2arc_lb_ptr_buf_t *lb_ptr_buf;
VERIFY3S(dev->l2ad_log_ent_idx, ==, l2dhdr->dh_log_blk_ent);
VERIFY3S(dev->l2ad_log_ent_idx, ==, dev->l2ad_log_entries);
tmpbuf = zio_buf_alloc(sizeof (*lb));
abd_buf = zio_buf_alloc(sizeof (*abd_buf));
@ -9896,8 +9970,14 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
lb->lb_prev_lbp = l2dhdr->dh_start_lbps[1];
lb->lb_magic = L2ARC_LOG_BLK_MAGIC;
/* try to compress the buffer */
/*
* l2arc_log_blk_commit() may be called multiple times during a single
* l2arc_write_buffers() call. Save the allocated abd buffers in a list
* so we can free them in l2arc_write_done() later on.
*/
list_insert_tail(&cb->l2wcb_abd_list, abd_buf);
/* try to compress the buffer */
psize = zio_compress_data(ZIO_COMPRESS_LZ4,
abd_buf->abd, tmpbuf, sizeof (*lb));
@ -9962,13 +10042,17 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
sizeof (l2arc_log_blkptr_t));
mutex_enter(&dev->l2ad_mtx);
list_insert_head(&dev->l2ad_lbptr_list, lb_ptr_buf);
ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize);
ARCSTAT_BUMP(arcstat_l2_log_blk_count);
zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf);
zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf);
mutex_exit(&dev->l2ad_mtx);
vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
/* bump the kstats */
ARCSTAT_INCR(arcstat_l2_write_bytes, asize);
ARCSTAT_BUMP(arcstat_l2_log_blk_writes);
ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_size, asize);
ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, asize);
ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio,
dev->l2ad_log_blk_payload_asize / asize);
@ -9985,8 +10069,9 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
boolean_t
l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp)
{
uint64_t psize = L2BLK_GET_PSIZE((lbp)->lbp_prop);
uint64_t end = lbp->lbp_daddr + psize - 1;
/* L2BLK_GET_PSIZE returns aligned size for log blocks */
uint64_t asize = L2BLK_GET_PSIZE((lbp)->lbp_prop);
uint64_t end = lbp->lbp_daddr + asize - 1;
uint64_t start = lbp->lbp_payload_start;
boolean_t evicted = B_FALSE;
@ -10017,7 +10102,7 @@ l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp)
l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, end);
return (start >= dev->l2ad_start && end <= dev->l2ad_end &&
psize > 0 && psize <= sizeof (l2arc_log_blk_phys_t) &&
asize > 0 && asize <= sizeof (l2arc_log_blk_phys_t) &&
(!evicted || dev->l2ad_first));
}
@ -10032,14 +10117,13 @@ l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr)
{
l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk;
l2arc_log_ent_phys_t *le;
l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
if (l2dhdr->dh_log_blk_ent == 0)
if (dev->l2ad_log_entries == 0)
return (B_FALSE);
int index = dev->l2ad_log_ent_idx++;
ASSERT3S(index, <, l2dhdr->dh_log_blk_ent);
ASSERT3S(index, <, dev->l2ad_log_entries);
ASSERT(HDR_HAS_L2HDR(hdr));
le = &lb->lb_entries[index];
@ -10059,7 +10143,7 @@ l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr)
dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev,
HDR_GET_PSIZE(hdr));
return (dev->l2ad_log_ent_idx == l2dhdr->dh_log_blk_ent);
return (dev->l2ad_log_ent_idx == dev->l2ad_log_entries);
}
/*

View File

@ -99,7 +99,7 @@ typeset l2_rebuild_log_blk_end=$(get_arcstat l2_rebuild_log_blks)
log_must test $l2_dh_log_blk -eq $(( $l2_rebuild_log_blk_end - $l2_rebuild_log_blk_start ))
log_must test $l2_dh_log_blk -gt 0
log_must zdb -lq $VDEV_CACHE
log_must zdb -lll $VDEV_CACHE
log_must zpool destroy -f $TESTPOOL