Support re-prioritizing asynchronous prefetches

When sequential scrubs were merged, all calls to arc_read()
(including prefetch IOs) were given ZIO_PRIORITY_ASYNC_READ.
Unfortunately, this behaves badly with an existing issue where
prefetch IOs cannot be re-prioritized after the issue. The
result is that synchronous reads end up in the same vdev_queue
as the scrub IOs and can have (in some workloads) multiple
seconds of latency.

This patch incorporates 2 changes. The first ensures that all
scrub IOs are given ZIO_PRIORITY_SCRUB to allow the vdev_queue
code to differentiate between these I/Os and user prefetches.
Second, this patch introduces zio_change_priority() to provide
the missing capability to upgrade a zio's priority.

Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #6921 
Closes #6926
This commit is contained in:
Tom Caputi 2017-12-21 12:13:06 -05:00 committed by Brian Behlendorf
parent 993669a7bf
commit a8b2e30685
11 changed files with 124 additions and 43 deletions

View File

@ -98,6 +98,7 @@ struct arc_callback {
boolean_t acb_noauth;
uint64_t acb_dsobj;
zio_t *acb_zio_dummy;
zio_t *acb_zio_head;
arc_callback_t *acb_next;
};

View File

@ -108,7 +108,7 @@ DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__evict);
DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__delete);
DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__mru);
DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__mfu);
DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__sync__wait__for__async);
DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__async__upgrade__sync);
DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__demand__hit__predictive__prefetch);
DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__hit);
DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__miss);

View File

@ -123,6 +123,7 @@ extern void vdev_queue_init(vdev_t *vd);
extern void vdev_queue_fini(vdev_t *vd);
extern zio_t *vdev_queue_io(zio_t *zio);
extern void vdev_queue_io_done(zio_t *zio);
extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority);
extern int vdev_queue_length(vdev_t *vd);
extern uint64_t vdev_queue_last_offset(vdev_t *vd);

View File

@ -586,6 +586,8 @@ extern void zio_vdev_io_bypass(zio_t *zio);
extern void zio_vdev_io_reissue(zio_t *zio);
extern void zio_vdev_io_redone(zio_t *zio);
extern void zio_change_priority(zio_t *pio, zio_priority_t priority);
extern void zio_checksum_verified(zio_t *zio);
extern int zio_worst_error(int e1, int e2);

0
io_spa- Normal file
View File

View File

@ -663,7 +663,7 @@ typedef struct arc_stats {
kstat_named_t arcstat_dnode_limit;
kstat_named_t arcstat_meta_max;
kstat_named_t arcstat_meta_min;
kstat_named_t arcstat_sync_wait_for_async;
kstat_named_t arcstat_async_upgrade_sync;
kstat_named_t arcstat_demand_hit_predictive_prefetch;
kstat_named_t arcstat_demand_hit_prescient_prefetch;
kstat_named_t arcstat_need_free;
@ -763,7 +763,7 @@ static arc_stats_t arc_stats = {
{ "arc_dnode_limit", KSTAT_DATA_UINT64 },
{ "arc_meta_max", KSTAT_DATA_UINT64 },
{ "arc_meta_min", KSTAT_DATA_UINT64 },
{ "sync_wait_for_async", KSTAT_DATA_UINT64 },
{ "async_upgrade_sync", KSTAT_DATA_UINT64 },
{ "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
{ "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 },
{ "arc_need_free", KSTAT_DATA_UINT64 },
@ -5911,32 +5911,20 @@ top:
*arc_flags |= ARC_FLAG_CACHED;
if (HDR_IO_IN_PROGRESS(hdr)) {
zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head;
ASSERT3P(head_zio, !=, NULL);
if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
priority == ZIO_PRIORITY_SYNC_READ) {
/*
* This sync read must wait for an
* in-progress async read (e.g. a predictive
* prefetch). Async reads are queued
* separately at the vdev_queue layer, so
* this is a form of priority inversion.
* Ideally, we would "inherit" the demand
* i/o's priority by moving the i/o from
* the async queue to the synchronous queue,
* but there is currently no mechanism to do
* so. Track this so that we can evaluate
* the magnitude of this potential performance
* problem.
*
* Note that if the prefetch i/o is already
* active (has been issued to the device),
* the prefetch improved performance, because
* we issued it sooner than we would have
* without the prefetch.
* This is a sync read that needs to wait for
* an in-flight async read. Request that the
* zio have its priority upgraded.
*/
DTRACE_PROBE1(arc__sync__wait__for__async,
zio_change_priority(head_zio, priority);
DTRACE_PROBE1(arc__async__upgrade__sync,
arc_buf_hdr_t *, hdr);
ARCSTAT_BUMP(arcstat_sync_wait_for_async);
ARCSTAT_BUMP(arcstat_async_upgrade_sync);
}
if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
arc_hdr_clear_flags(hdr,
@ -5966,6 +5954,7 @@ top:
spa, NULL, NULL, NULL, zio_flags);
ASSERT3P(acb->acb_done, !=, NULL);
acb->acb_zio_head = head_zio;
acb->acb_next = hdr->b_l1hdr.b_acb;
hdr->b_l1hdr.b_acb = acb;
mutex_exit(hash_lock);
@ -6182,14 +6171,17 @@ top:
vd = NULL;
}
if (priority == ZIO_PRIORITY_ASYNC_READ)
/*
* We count both async reads and scrub IOs as asynchronous so
* that both can be upgraded in the event of a cache hit while
* the read IO is still in-flight.
*/
if (priority == ZIO_PRIORITY_ASYNC_READ ||
priority == ZIO_PRIORITY_SCRUB)
arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
else
arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
if (hash_lock != NULL)
mutex_exit(hash_lock);
/*
* At this point, we have a level 1 cache miss. Try again in
* L2ARC if possible.
@ -6260,6 +6252,10 @@ top:
ZIO_FLAG_CANFAIL |
ZIO_FLAG_DONT_PROPAGATE |
ZIO_FLAG_DONT_RETRY, B_FALSE);
acb->acb_zio_head = rzio;
if (hash_lock != NULL)
mutex_exit(hash_lock);
DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
zio_t *, rzio);
@ -6276,6 +6272,8 @@ top:
goto out;
/* l2arc read error; goto zio_read() */
if (hash_lock != NULL)
mutex_enter(hash_lock);
} else {
DTRACE_PROBE1(l2arc__miss,
arc_buf_hdr_t *, hdr);
@ -6296,6 +6294,10 @@ top:
rzio = zio_read(pio, spa, bp, hdr_abd, size,
arc_read_done, hdr, priority, zio_flags, zb);
acb->acb_zio_head = rzio;
if (hash_lock != NULL)
mutex_exit(hash_lock);
if (*arc_flags & ARC_FLAG_WAIT) {
rc = zio_wait(rzio);

View File

@ -1529,7 +1529,7 @@ dsl_scan_prefetch_thread(void *arg)
/* issue the prefetch asynchronously */
(void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa,
&spic->spic_bp, dsl_scan_prefetch_cb, spic->spic_spc,
ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, &spic->spic_zb);
ZIO_PRIORITY_SCRUB, zio_flags, &flags, &spic->spic_zb);
kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
}
@ -1611,7 +1611,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
arc_buf_t *buf;
err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
if (err) {
scn->scn_phys.scn_errors++;
return (err);
@ -1639,7 +1639,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
}
err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
if (err) {
scn->scn_phys.scn_errors++;
return (err);
@ -1658,7 +1658,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
arc_buf_t *buf;
err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
if (err) {
scn->scn_phys.scn_errors++;
return (err);

View File

@ -803,6 +803,48 @@ vdev_queue_io_done(zio_t *zio)
mutex_exit(&vq->vq_lock);
}
void
vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
{
vdev_queue_t *vq = &zio->io_vd->vdev_queue;
avl_tree_t *tree;
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
if (zio->io_type == ZIO_TYPE_READ) {
if (priority != ZIO_PRIORITY_SYNC_READ &&
priority != ZIO_PRIORITY_ASYNC_READ &&
priority != ZIO_PRIORITY_SCRUB)
priority = ZIO_PRIORITY_ASYNC_READ;
} else {
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
if (priority != ZIO_PRIORITY_SYNC_WRITE &&
priority != ZIO_PRIORITY_ASYNC_WRITE)
priority = ZIO_PRIORITY_ASYNC_WRITE;
}
mutex_enter(&vq->vq_lock);
/*
* If the zio is in none of the queues we can simply change
* the priority. If the zio is waiting to be submitted we must
* remove it from the queue and re-insert it with the new priority.
* Otherwise, the zio is currently active and we cannot change its
* priority.
*/
tree = vdev_queue_class_tree(vq, zio->io_priority);
if (avl_find(tree, zio, NULL) == zio) {
avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
zio->io_priority = priority;
avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
} else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) {
zio->io_priority = priority;
}
mutex_exit(&vq->vq_lock);
}
/*
* As these two methods are only used for load calculations we're not
* concerned if we get an incorrect value on 32bit platforms due to lack of

View File

@ -539,6 +539,8 @@ zio_walk_children(zio_t *pio, zio_link_t **zl)
{
list_t *cl = &pio->io_child_list;
ASSERT(MUTEX_HELD(&pio->io_lock));
*zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl);
if (*zl == NULL)
return (NULL);
@ -573,8 +575,8 @@ zio_add_child(zio_t *pio, zio_t *cio)
zl->zl_parent = pio;
zl->zl_child = cio;
mutex_enter(&cio->io_lock);
mutex_enter(&pio->io_lock);
mutex_enter(&cio->io_lock);
ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
@ -587,8 +589,8 @@ zio_add_child(zio_t *pio, zio_t *cio)
pio->io_child_count++;
cio->io_parent_count++;
mutex_exit(&pio->io_lock);
mutex_exit(&cio->io_lock);
mutex_exit(&pio->io_lock);
}
static void
@ -597,8 +599,8 @@ zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
ASSERT(zl->zl_parent == pio);
ASSERT(zl->zl_child == cio);
mutex_enter(&cio->io_lock);
mutex_enter(&pio->io_lock);
mutex_enter(&cio->io_lock);
list_remove(&pio->io_child_list, zl);
list_remove(&cio->io_parent_list, zl);
@ -606,8 +608,8 @@ zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
pio->io_child_count--;
cio->io_parent_count--;
mutex_exit(&pio->io_lock);
mutex_exit(&cio->io_lock);
mutex_exit(&pio->io_lock);
kmem_cache_free(zio_link_cache, zl);
}
@ -1963,14 +1965,16 @@ zio_reexecute(zio_t *pio)
* cannot be affected by any side effects of reexecuting 'cio'.
*/
zio_link_t *zl = NULL;
mutex_enter(&pio->io_lock);
for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
cio_next = zio_walk_children(pio, &zl);
mutex_enter(&pio->io_lock);
for (int w = 0; w < ZIO_WAIT_TYPES; w++)
pio->io_children[cio->io_child_type][w]++;
mutex_exit(&pio->io_lock);
zio_reexecute(cio);
mutex_enter(&pio->io_lock);
}
mutex_exit(&pio->io_lock);
/*
* Now that all children have been reexecuted, execute the parent.
@ -3474,6 +3478,35 @@ zio_vdev_io_done(zio_t *zio)
return (ZIO_PIPELINE_CONTINUE);
}
/*
* This function is used to change the priority of an existing zio that is
* currently in-flight. This is used by the arc to upgrade priority in the
* event that a demand read is made for a block that is currently queued
* as a scrub or async read IO. Otherwise, the high priority read request
* would end up having to wait for the lower priority IO.
*/
void
zio_change_priority(zio_t *pio, zio_priority_t priority)
{
zio_t *cio, *cio_next;
zio_link_t *zl = NULL;
ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
if (pio->io_vd != NULL && pio->io_vd->vdev_ops->vdev_op_leaf) {
vdev_queue_change_io_priority(pio, priority);
} else {
pio->io_priority = priority;
}
mutex_enter(&pio->io_lock);
for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
cio_next = zio_walk_children(pio, &zl);
zio_change_priority(cio, priority);
}
mutex_exit(&pio->io_lock);
}
/*
* For non-raidz ZIOs, we can just copy aside the bad data read from the
* disk, and use that to finish the checksum ereport later.

0
spa_stats.io_history Normal file
View File

View File

@ -41,9 +41,9 @@ function get_prefetched_demand_reads
echo $demand_reads
}
function get_sync_wait_for_async
function get_async_upgrade_sync
{
typeset -l sync_wait=`awk '$1 == "sync_wait_for_async" \
typeset -l sync_wait=`awk '$1 == "async_upgrade_sync" \
{ print $3 }' $zfs_kstats/arcstats`
echo $sync_wait
@ -59,7 +59,7 @@ poolname=$1
interval=$2
prefetch_ios=$(get_prefetch_ios)
prefetched_demand_reads=$(get_prefetched_demand_reads)
sync_wait_for_async=$(get_sync_wait_for_async)
async_upgrade_sync=$(get_async_upgrade_sync)
while true
do
@ -73,10 +73,10 @@ do
$(( $new_prefetched_demand_reads - $prefetched_demand_reads ))
prefetched_demand_reads=$new_prefetched_demand_reads
new_sync_wait_for_async=$(get_sync_wait_for_async)
printf "%-24s\t%u\n" "sync_wait_for_async" \
$(( $new_sync_wait_for_async - $sync_wait_for_async ))
sync_wait_for_async=$new_sync_wait_for_async
new_async_upgrade_sync=$(get_async_upgrade_sync)
printf "%-24s\t%u\n" "async_upgrade_sync" \
$(( $new_async_upgrade_sync - $async_upgrade_sync ))
async_upgrade_sync=$new_async_upgrade_sync
sleep $interval
done