Add zpool status -s (slow I/Os) and -p (parseable)

This patch adds a new slow I/Os (-s) column to zpool status to show the number of VDEV slow I/Os. This is the number of I/Os that didn't complete in zio_slow_io_ms milliseconds. It also adds a new parsable (-p) flag to display exact values. NAME STATE READ WRITE CKSUM SLOW testpool ONLINE 0 0 0 - mirror-0 ONLINE 0 0 0 - loop0 ONLINE 0 0 0 20 loop1 ONLINE 0 0 0 0 Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed by: Matthew Ahrens <mahrens@delphix.com> Signed-off-by: Tony Hutter <hutter2@llnl.gov> Closes #7756 Closes #6885
2018-11-08 16:47:24 -08:00 · 2018-11-08 16:47:24 -08:00 · ad796b8a3b
parent 877d925a9e
commit ad796b8a3b
16 changed files with 321 additions and 136 deletions
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@ -365,8 +365,8 @@ get_usage(zpool_help_t idx)
 	case HELP_RESILVER:
 		return (gettext("\tresilver <pool> ...\n"));
 	case HELP_STATUS:
-		return (gettext("\tstatus [-c [script1,script2,...]] [-gLPvxD]"
-		    "[-T d|u] [pool] ... \n"
+		return (gettext("\tstatus [-c [script1,script2,...]] "
+		    "[-gLpPsvxD]  [-T d|u] [pool] ... \n"
 		    "\t    [interval [count]]\n"));
 	case HELP_UPGRADE:
 		return (gettext("\tupgrade\n"
@ -1669,10 +1669,12 @@ typedef struct status_cbdata {
 	int		cb_namewidth;
 	boolean_t	cb_allpools;
 	boolean_t	cb_verbose;
+	boolean_t	cb_literal;
 	boolean_t	cb_explain;
 	boolean_t	cb_first;
 	boolean_t	cb_dedup_stats;
 	boolean_t	cb_print_status;
+	boolean_t	cb_print_slow_ios;
 	vdev_cmd_data_list_t	*vcdl;
 } status_cbdata_t;

@ -1788,10 +1790,34 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
 	    name, state);

 	if (!isspare) {
-		zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf));
-		zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf));
-		zfs_nicenum(vs->vs_checksum_errors, cbuf, sizeof (cbuf));
-		(void) printf(" %5s %5s %5s", rbuf, wbuf, cbuf);
+		if (cb->cb_literal) {
+			printf(" %5llu %5llu %5llu",
+			    (u_longlong_t)vs->vs_read_errors,
+			    (u_longlong_t)vs->vs_write_errors,
+			    (u_longlong_t)vs->vs_checksum_errors);
+		} else {
+			zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf));
+			zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf));
+			zfs_nicenum(vs->vs_checksum_errors, cbuf,
+			    sizeof (cbuf));
+			printf(" %5s %5s %5s", rbuf, wbuf, cbuf);
+		}
+
+		if (cb->cb_print_slow_ios) {
+			if (children == 0)  {
+				/* Only leafs vdevs have slow IOs */
+				zfs_nicenum(vs->vs_slow_ios, rbuf,
+				    sizeof (rbuf));
+			} else {
+				snprintf(rbuf, sizeof (rbuf), "-");
+			}
+
+			if (cb->cb_literal)
+				printf(" %5llu", (u_longlong_t)vs->vs_slow_ios);
+			else
+				printf(" %5s", rbuf);
+		}
+
 	}

 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
@ -7175,6 +7201,9 @@ status_callback(zpool_handle_t *zhp, void *data)
 		    cbp->cb_namewidth, "NAME", "STATE", "READ", "WRITE",
 		    "CKSUM");

+		if (cbp->cb_print_slow_ios)
+			(void) printf(" %5s", gettext("SLOW"));
+
 		if (cbp->vcdl != NULL)
 			print_cmd_columns(cbp->vcdl, 0);

@ -7241,13 +7270,15 @@ status_callback(zpool_handle_t *zhp, void *data)
 }

 /*
- * zpool status [-c [script1,script2,...]] [-gLPvx] [-T d|u] [pool] ...
+ * zpool status [-c [script1,script2,...]] [-gLpPsvx] [-T d|u] [pool] ...
 *              [interval [count]]
 *
 *	-c CMD	For each vdev, run command CMD
 *	-g	Display guid for individual vdev name.
 *	-L	Follow links when resolving vdev path name.
+ *	-p	Display values in parsable (exact) format.
 *	-P	Display full path for vdev name.
+ *	-s	Display slow IOs column.
 *	-v	Display complete error logs
 *	-x	Display only pools with potential problems
 *	-D	Display dedup status (undocumented)
@ -7266,7 +7297,7 @@ zpool_do_status(int argc, char **argv)
 	char *cmd = NULL;

 	/* check options */
-	while ((c = getopt(argc, argv, "c:gLPvxDT:")) != -1) {
+	while ((c = getopt(argc, argv, "c:gLpPsvxDT:")) != -1) {
 		switch (c) {
 		case 'c':
 			if (cmd != NULL) {
@ -7298,9 +7329,15 @@ zpool_do_status(int argc, char **argv)
 		case 'L':
 			cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS;
 			break;
+		case 'p':
+			cb.cb_literal = B_TRUE;
+			break;
 		case 'P':
 			cb.cb_name_flags |= VDEV_NAME_PATH;
 			break;
+		case 's':
+			cb.cb_print_slow_ios = B_TRUE;
+			break;
 		case 'v':
 			cb.cb_verbose = B_TRUE;
 			break;
--- a/include/sys/fm/fs/zfs.h
+++ b/include/sys/fm/fs/zfs.h
@ -74,6 +74,7 @@ extern "C" {
 #define	FM_EREPORT_PAYLOAD_ZFS_VDEV_READ_ERRORS	"vdev_read_errors"
 #define	FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS "vdev_write_errors"
 #define	FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS "vdev_cksum_errors"
+#define	FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS	"vdev_delays"
 #define	FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID	"parent_guid"
 #define	FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE	"parent_type"
 #define	FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH	"parent_path"
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@ -665,6 +665,9 @@ typedef struct zpool_load_policy {
 #define	ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO	"vdev_async_agg_w_histo"
 #define	ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO	"vdev_agg_scrub_histo"

+/* Number of slow IOs */
+#define	ZPOOL_CONFIG_VDEV_SLOW_IOS		"vdev_slow_ios"
+
 /* vdev enclosure sysfs path */
 #define	ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH	"vdev_enc_sysfs_path"

@ -990,6 +993,7 @@ typedef struct vdev_stat {
 	uint64_t	vs_fragmentation;	/* device fragmentation */
 	uint64_t	vs_checkpoint_space;    /* checkpoint-consumed space */
 	uint64_t	vs_resilver_deferred;	/* resilver deferred	*/
+	uint64_t	vs_slow_ios;		/* slow IOs */
 } vdev_stat_t;

 /*
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@ -1076,9 +1076,11 @@ extern const char *spa_state_to_name(spa_t *spa);
 /* error handling */
 struct zbookmark_phys;
 extern void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb);
-extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd,
+extern int zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd,
    const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset,
    uint64_t length);
+extern boolean_t zfs_ereport_is_valid(const char *class, spa_t *spa, vdev_t *vd,
+    zio_t *zio);
 extern nvlist_t *zfs_event_create(spa_t *spa, vdev_t *vd, const char *type,
    const char *name, nvlist_t *aux);
 extern void zfs_post_remove(spa_t *spa, vdev_t *vd);
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@ -158,11 +158,6 @@ enum zio_encrypt {
 	(compress) == ZIO_COMPRESS_ON ||		\
 	(compress) == ZIO_COMPRESS_OFF)

-/*
- * Default Linux timeout for a sd device.
- */
-#define	ZIO_DELAY_MAX			(30 * MILLISEC)
-
 #define	ZIO_FAILURE_MODE_WAIT		0
 #define	ZIO_FAILURE_MODE_CONTINUE	1
 #define	ZIO_FAILURE_MODE_PANIC		2
@ -664,7 +659,7 @@ extern void zfs_ereport_finish_checksum(zio_cksum_report_t *report,
 extern void zfs_ereport_free_checksum(zio_cksum_report_t *report);

 /* If we have the good data in hand, this function can be used */
-extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
+extern int zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
    const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset,
    uint64_t length, const abd_t *good_data, const abd_t *bad_data,
    struct zio_bad_cksum *info);
--- a/man/man5/zfs-events.5
+++ b/man/man5/zfs-events.5
@ -96,8 +96,9 @@ information regarding "hung" I/O detection and configuration.
 .ad
 .RS 12n
 Issued when a completed I/O exceeds the maximum allowed time specified
-by the \fBzio_delay_max\fR module option.  This can be an indicator of
-problems with the underlying storage device.
+by the \fBzio_slow_io_ms\fR module option.  This can be an indicator of
+problems with the underlying storage device.  The number of delay events is
+ratelimited by the \fBzfs_slow_io_events_per_second\fR module parameter.
 .RE

 .sp
@ -697,10 +698,9 @@ full list of all the I/O stages.
 \fBzio_delay\fR
 .ad
 .RS 12n
-The time in ticks (HZ) required for the block layer to service the I/O.  Unlike
-\fBzio_delta\fR this does not include any vdev queuing time and is therefore
-solely a measure of the block layer performance.  On most modern Linux systems
-HZ is defined as 1000 making a tick equivalent to 1 millisecond.
+The time elapsed (in nanoseconds) waiting for the block layer to complete the
+I/O.  Unlike \fBzio_delta\fR this does not include any vdev queuing time and is
+therefore solely a measure of the block layer performance.
 .RE

 .sp
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@ -1086,7 +1086,7 @@ Default value: \fB600,000\fR.
 .ad
 .RS 12n
 Interval in milliseconds after which the deadman is triggered and an
-individual IO operation is considered to be "hung".  As long as the I/O
+individual I/O operation is considered to be "hung".  As long as the I/O
 remains "hung" the deadman will be invoked every \fBzfs_deadman_checktime_ms\fR
 milliseconds until the I/O completes.
 .sp
@ -1141,10 +1141,10 @@ Default value: \fB500,000\fR.
 .sp
 .ne 2
 .na
-\fBzfs_delays_per_second\fR (int)
+\fBzfs_low_ios_per_second\fR (int)
 .ad
 .RS 12n
-Rate limit IO delay events to this many per second.
+Rate limit delay zevents (which report slow I/Os) to this many per second.
 .sp
 Default value: 20
 .RE
@ -1655,7 +1655,7 @@ Default value: \fB50\fR.
 .ad
 .RS 12n
 We currently support block sizes from 512 bytes to 16MB.  The benefits of
-larger blocks, and thus larger IO, need to be weighed against the cost of
+larger blocks, and thus larger I/O, need to be weighed against the cost of
 COWing a giant block to modify one byte.  Additionally, very large blocks
 can have an impact on i/o latency, and also potentially on the memory
 allocator.  Therefore, we do not allow the recordsize to be set larger than
@ -2552,12 +2552,13 @@ Default value: \fB0\fR.
 .sp
 .ne 2
 .na
-\fBzio_delay_max\fR (int)
+\fBzio_slow_io_ms\fR (int)
 .ad
 .RS 12n
-A zevent will be logged if a ZIO operation takes more than N milliseconds to
-complete. Note that this is only a logging facility, not a timeout on
-operations.
+When an I/O operation takes more than \fBzio_slow_io_ms\fR milliseconds to
+complete is marked as a slow I/O.  Each slow I/O causes a delay zevent.  Slow
+I/O counters can be seen with "zpool status -s".
+
 .sp
 Default value: \fB30,000\fR.
 .RE
@ -2568,7 +2569,7 @@ Default value: \fB30,000\fR.
 \fBzio_dva_throttle_enabled\fR (int)
 .ad
 .RS 12n
-Throttle block allocations in the ZIO pipeline. This allows for
+Throttle block allocations in the I/O pipeline. This allows for
 dynamic allocation distribution when devices are imbalanced.
 When enabled, the maximum number of pending allocations per top-level vdev
 is limited by \fBzfs_vdev_queue_depth_pct\fR.
@ -2594,7 +2595,7 @@ Default value: \fB0\fR.
 .ad
 .RS 12n
 Percentage of online CPUs (or CPU cores, etc) which will run a worker thread
-for IO. These workers are responsible for IO work such as compression and
+for I/O. These workers are responsible for I/O work such as compression and
 checksum calculations. Fractional number of CPUs will be rounded down.
 .sp
 The default value of 75 was chosen to avoid using all CPUs which can result in
--- a/man/man8/zpool.8
+++ b/man/man8/zpool.8
@ -182,7 +182,7 @@
 .Nm
 .Cm status
 .Oo Fl c Ar SCRIPT Oc
-.Op Fl gLPvxD
+.Op Fl DgLpPsvx
 .Op Fl T Sy u Ns | Ns Sy d
 .Oo Ar pool Oc Ns ...
 .Op Ar interval Op Ar count
@ -2167,7 +2167,7 @@ and automatically import it.
 .Nm
 .Cm status
 .Op Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns ...
-.Op Fl gLPvxD
+.Op Fl DgLpPsvx
 .Op Fl T Sy u Ns | Ns Sy d
 .Oo Ar pool Oc Ns ...
 .Op Ar interval Op Ar count
@ -2203,6 +2203,8 @@ Display real paths for vdevs resolving all symbolic links. This can
 be used to look up the current block device name regardless of the
 .Pa /dev/disk/
 path used to open it.
+.It Fl p
+Display numbers in parsable (exact) values.
 .It Fl P
 Display full paths for vdevs instead of only the last component of
 the path. This can be used in conjunction with the
@ -2214,6 +2216,12 @@ Display a histogram of deduplication statistics, showing the allocated
 and referenced
 .Pq logically referenced in the pool
 block counts and sizes by reference count.
+.It Fl s
+Display the number of leaf VDEV slow IOs.  This is the number of IOs that
+didn't complete in \fBzio_slow_io_ms\fR milliseconds (default 30 seconds).
+This does not necessarily mean the IOs failed to complete, just took an
+unreasonably long amount of time.  This may indicate a problem with the
+underlying storage.
 .It Fl T Sy u Ns | Ns Sy d
 Display a time stamp.
 Specify
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@ -77,14 +77,14 @@ int vdev_validate_skip = B_FALSE;
 int vdev_dtl_sm_blksz = (1 << 12);

 /*
- * Rate limit delay events to this many IO delays per second.
+ * Rate limit slow IO (delay) events to this many per second.
 */
-unsigned int zfs_delays_per_second = 20;
+unsigned int zfs_slow_io_events_per_second = 20;

 /*
 * Rate limit checksum events after this many checksum errors per second.
 */
-unsigned int zfs_checksums_per_second = 20;
+unsigned int zfs_checksum_events_per_second = 20;

 /*
 * Ignore errors during scrub/resilver.  Allows to work around resilver
@ -507,8 +507,10 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
 	 * and checksum events so that we don't overwhelm ZED with thousands
 	 * of events when a disk is acting up.
 	 */
-	zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_delays_per_second, 1);
-	zfs_ratelimit_init(&vd->vdev_checksum_rl, &zfs_checksums_per_second, 1);
+	zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second,
+	    1);
+	zfs_ratelimit_init(&vd->vdev_checksum_rl,
+	    &zfs_checksum_events_per_second, 1);

 	list_link_init(&vd->vdev_config_dirty_node);
 	list_link_init(&vd->vdev_state_dirty_node);
@ -3591,6 +3593,7 @@ vdev_clear(spa_t *spa, vdev_t *vd)
 	vd->vdev_stat.vs_read_errors = 0;
 	vd->vdev_stat.vs_write_errors = 0;
 	vd->vdev_stat.vs_checksum_errors = 0;
+	vd->vdev_stat.vs_slow_ios = 0;

 	for (int c = 0; c < vd->vdev_children; c++)
 		vdev_clear(spa, vd->vdev_child[c]);
@ -4630,12 +4633,12 @@ module_param(vdev_ms_count_limit, int, 0644);
 MODULE_PARM_DESC(vdev_ms_count_limit,
 	"Practical upper limit of total metaslabs per top-level vdev");

-module_param(zfs_delays_per_second, uint, 0644);
-MODULE_PARM_DESC(zfs_delays_per_second, "Rate limit delay events to this many "
-	"IO delays per second");
+module_param(zfs_slow_io_events_per_second, uint, 0644);
+MODULE_PARM_DESC(zfs_slow_io_events_per_second,
+	"Rate limit slow IO (delay) events to this many per second");

-module_param(zfs_checksums_per_second, uint, 0644);
-	MODULE_PARM_DESC(zfs_checksums_per_second, "Rate limit checksum events "
+module_param(zfs_checksum_events_per_second, uint, 0644);
+MODULE_PARM_DESC(zfs_checksum_events_per_second, "Rate limit checksum events "
 	"to this many checksum errors per second (do not set below zed"
 	"threshold).");

--- a/module/zfs/vdev_label.c
+++ b/module/zfs/vdev_label.c
@ -347,6 +347,9 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv)
 	    vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB],
 	    ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB]));

+	/* IO delays */
+	fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SLOW_IOS, vs->vs_slow_ios);
+
 	/* Add extended stats nvlist to main nvlist */
 	fnvlist_add_nvlist(nv, ZPOOL_CONFIG_VDEV_STATS_EX, nvx);

--- a/module/zfs/zfs_fm.c
+++ b/module/zfs/zfs_fm.c
@ -140,7 +140,10 @@ zfs_is_ratelimiting_event(const char *subclass, vdev_t *vd)
 	return (rc);
 }

-static void
+/*
+ * Return B_TRUE if the event actually posted, B_FALSE if not.
+ */
+static boolean_t
 zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
    const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
    zio_t *zio, uint64_t stateoroffset, uint64_t size)
@ -150,78 +153,15 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
 	uint64_t ena;
 	char class[64];

-	/*
-	 * If we are doing a spa_tryimport() or in recovery mode,
-	 * ignore errors.
-	 */
-	if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT ||
-	    spa_load_state(spa) == SPA_LOAD_RECOVER)
-		return;
-
-	/*
-	 * If we are in the middle of opening a pool, and the previous attempt
-	 * failed, don't bother logging any new ereports - we're just going to
-	 * get the same diagnosis anyway.
-	 */
-	if (spa_load_state(spa) != SPA_LOAD_NONE &&
-	    spa->spa_last_open_failed)
-		return;
-
-	if (zio != NULL) {
-		/*
-		 * If this is not a read or write zio, ignore the error.  This
-		 * can occur if the DKIOCFLUSHWRITECACHE ioctl fails.
-		 */
-		if (zio->io_type != ZIO_TYPE_READ &&
-		    zio->io_type != ZIO_TYPE_WRITE)
-			return;
-
-		if (vd != NULL) {
-			/*
-			 * If the vdev has already been marked as failing due
-			 * to a failed probe, then ignore any subsequent I/O
-			 * errors, as the DE will automatically fault the vdev
-			 * on the first such failure.  This also catches cases
-			 * where vdev_remove_wanted is set and the device has
-			 * not yet been asynchronously placed into the REMOVED
-			 * state.
-			 */
-			if (zio->io_vd == vd && !vdev_accessible(vd, zio))
-				return;
-
-			/*
-			 * Ignore checksum errors for reads from DTL regions of
-			 * leaf vdevs.
-			 */
-			if (zio->io_type == ZIO_TYPE_READ &&
-			    zio->io_error == ECKSUM &&
-			    vd->vdev_ops->vdev_op_leaf &&
-			    vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1))
-				return;
-		}
-	}
-
-	/*
-	 * For probe failure, we want to avoid posting ereports if we've
-	 * already removed the device in the meantime.
-	 */
-	if (vd != NULL &&
-	    strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 &&
-	    (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED))
-		return;
-
-	if ((strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) &&
-	    (zio != NULL) && (!zio->io_timestamp)) {
-		/* Ignore bogus delay events */
-		return;
-	}
+	if (!zfs_ereport_is_valid(subclass, spa, vd, zio))
+		return (B_FALSE);

 	if ((ereport = fm_nvlist_create(NULL)) == NULL)
-		return;
+		return (B_FALSE);

 	if ((detector = fm_nvlist_create(NULL)) == NULL) {
 		fm_nvlist_destroy(ereport, FM_NVA_FREE);
-		return;
+		return (B_FALSE);
 	}

 	/*
@ -332,7 +272,10 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS,
 			    DATA_TYPE_UINT64, vs->vs_write_errors,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS,
-			    DATA_TYPE_UINT64, vs->vs_checksum_errors, NULL);
+			    DATA_TYPE_UINT64, vs->vs_checksum_errors,
+			    FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS,
+			    DATA_TYPE_UINT64, vs->vs_slow_ios,
+			    NULL);
 		}

 		if (pvd != NULL) {
@ -427,7 +370,7 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
 	/*
 	 * Payload for I/Os with corresponding logical information.
 	 */
-	if (zb != NULL && (zio == NULL || zio->io_logical != NULL))
+	if (zb != NULL && (zio == NULL || zio->io_logical != NULL)) {
 		fm_payload_set(ereport,
 		    FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET,
 		    DATA_TYPE_UINT64, zb->zb_objset,
@ -437,11 +380,13 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
 		    DATA_TYPE_INT64, zb->zb_level,
 		    FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
 		    DATA_TYPE_UINT64, zb->zb_blkid, NULL);
+	}

 	mutex_exit(&spa->spa_errlist_lock);

 	*ereport_out = ereport;
 	*detector_out = detector;
+	return (B_TRUE);
 }

 /* if it's <= 128 bytes, save the corruption directly */
@ -765,27 +710,111 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,
 }
 #endif

-void
+/*
+ * Make sure our event is still valid for the given zio/vdev/pool.  For example,
+ * we don't want to keep logging events for a faulted or missing vdev.
+ */
+boolean_t
+zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio)
+{
+#ifdef _KERNEL
+	/*
+	 * If we are doing a spa_tryimport() or in recovery mode,
+	 * ignore errors.
+	 */
+	if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT ||
+	    spa_load_state(spa) == SPA_LOAD_RECOVER)
+		return (B_FALSE);
+
+	/*
+	 * If we are in the middle of opening a pool, and the previous attempt
+	 * failed, don't bother logging any new ereports - we're just going to
+	 * get the same diagnosis anyway.
+	 */
+	if (spa_load_state(spa) != SPA_LOAD_NONE &&
+	    spa->spa_last_open_failed)
+		return (B_FALSE);
+
+	if (zio != NULL) {
+		/*
+		 * If this is not a read or write zio, ignore the error.  This
+		 * can occur if the DKIOCFLUSHWRITECACHE ioctl fails.
+		 */
+		if (zio->io_type != ZIO_TYPE_READ &&
+		    zio->io_type != ZIO_TYPE_WRITE)
+			return (B_FALSE);
+
+		if (vd != NULL) {
+			/*
+			 * If the vdev has already been marked as failing due
+			 * to a failed probe, then ignore any subsequent I/O
+			 * errors, as the DE will automatically fault the vdev
+			 * on the first such failure.  This also catches cases
+			 * where vdev_remove_wanted is set and the device has
+			 * not yet been asynchronously placed into the REMOVED
+			 * state.
+			 */
+			if (zio->io_vd == vd && !vdev_accessible(vd, zio))
+				return (B_FALSE);
+
+			/*
+			 * Ignore checksum errors for reads from DTL regions of
+			 * leaf vdevs.
+			 */
+			if (zio->io_type == ZIO_TYPE_READ &&
+			    zio->io_error == ECKSUM &&
+			    vd->vdev_ops->vdev_op_leaf &&
+			    vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1))
+				return (B_FALSE);
+		}
+	}
+
+	/*
+	 * For probe failure, we want to avoid posting ereports if we've
+	 * already removed the device in the meantime.
+	 */
+	if (vd != NULL &&
+	    strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 &&
+	    (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED))
+		return (B_FALSE);
+
+	/* Ignore bogus delay events (like from ioctls or unqueued IOs) */
+	if ((strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) &&
+	    (zio != NULL) && (!zio->io_timestamp)) {
+		return (B_FALSE);
+	}
+#endif
+	return (B_TRUE);
+}
+
+/*
+ * Return 0 if event was posted, EINVAL if there was a problem posting it or
+ * EBUSY if the event was rate limited.
+ */
+int
 zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd,
    const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset,
    uint64_t size)
 {
+	int rc = 0;
 #ifdef _KERNEL
 	nvlist_t *ereport = NULL;
 	nvlist_t *detector = NULL;

 	if (zfs_is_ratelimiting_event(subclass, vd))
-		return;
+		return (SET_ERROR(EBUSY));

-	zfs_ereport_start(&ereport, &detector, subclass, spa, vd,
-	    zb, zio, stateoroffset, size);
+	if (!zfs_ereport_start(&ereport, &detector, subclass, spa, vd,
+	    zb, zio, stateoroffset, size))
+		return (SET_ERROR(EINVAL));	/* couldn't post event */

 	if (ereport == NULL)
-		return;
+		return (SET_ERROR(EINVAL));

 	/* Cleanup is handled by the callback function */
-	zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
+	rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
 #endif
+	return (rc);
 }

 void
@ -795,7 +824,6 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
 {
 	zio_cksum_report_t *report;

-
 #ifdef _KERNEL
 	if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
 		return;
@ -874,30 +902,34 @@ zfs_ereport_free_checksum(zio_cksum_report_t *rpt)
 }


-void
+int
 zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
    struct zio *zio, uint64_t offset, uint64_t length,
    const abd_t *good_data, const abd_t *bad_data, zio_bad_cksum_t *zbc)
 {
+	int rc = 0;
 #ifdef _KERNEL
 	nvlist_t *ereport = NULL;
 	nvlist_t *detector = NULL;
 	zfs_ecksum_info_t *info;

-	zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM,
-	    spa, vd, zb, zio, offset, length);
+	if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
+		return (EBUSY);

-	if (ereport == NULL)
-		return;
+	if (!zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM,
+	    spa, vd, zb, zio, offset, length) || (ereport == NULL)) {
+		return (SET_ERROR(EINVAL));
+	}

 	info = annotate_ecksum(ereport, zbc, good_data, bad_data, length,
 	    B_FALSE);

 	if (info != NULL) {
-		zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
+		rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
 		kmem_free(info, sizeof (*info));
 	}
 #endif
+	return (rc);
 }

 /*
@ -1043,6 +1075,7 @@ zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate)

 #if defined(_KERNEL)
 EXPORT_SYMBOL(zfs_ereport_post);
+EXPORT_SYMBOL(zfs_ereport_is_valid);
 EXPORT_SYMBOL(zfs_ereport_post_checksum);
 EXPORT_SYMBOL(zfs_post_remove);
 EXPORT_SYMBOL(zfs_post_autoreplace);
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@ -77,7 +77,8 @@ uint64_t zio_buf_cache_allocs[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
 uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
 #endif

-int zio_delay_max = ZIO_DELAY_MAX;
+/* Mark IOs as "slow" if they take longer than 30 seconds */
+int zio_slow_io_ms = (30 * MILLISEC);

 #define	BP_SPANB(indblkshift, level) \
 	(((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
@ -4431,10 +4432,28 @@ zio_done(zio_t *zio)
 	 * 30 seconds to complete, post an error described the I/O delay.
 	 * We ignore these errors if the device is currently unavailable.
 	 */
-	if (zio->io_delay >= MSEC2NSEC(zio_delay_max)) {
-		if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd))
-			zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa,
-			    zio->io_vd, &zio->io_bookmark, zio, 0, 0);
+	if (zio->io_delay >= MSEC2NSEC(zio_slow_io_ms)) {
+		if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) {
+			/*
+			 * We want to only increment our slow IO counters if
+			 * the IO is valid (i.e. not if the drive is removed).
+			 *
+			 * zfs_ereport_post() will also do these checks, but
+			 * it can also ratelimit and have other failures, so we
+			 * need to increment the slow_io counters independent
+			 * of it.
+			 */
+			if (zfs_ereport_is_valid(FM_EREPORT_ZFS_DELAY,
+			    zio->io_spa, zio->io_vd, zio)) {
+				mutex_enter(&zio->io_vd->vdev_stat_lock);
+				zio->io_vd->vdev_stat.vs_slow_ios++;
+				mutex_exit(&zio->io_vd->vdev_stat_lock);
+
+				zfs_ereport_post(FM_EREPORT_ZFS_DELAY,
+				    zio->io_spa, zio->io_vd, &zio->io_bookmark,
+				    zio, 0, 0);
+			}
+		}
 	}

 	if (zio->io_error) {
@ -4823,8 +4842,9 @@ EXPORT_SYMBOL(zio_data_buf_alloc);
 EXPORT_SYMBOL(zio_buf_free);
 EXPORT_SYMBOL(zio_data_buf_free);

-module_param(zio_delay_max, int, 0644);
-MODULE_PARM_DESC(zio_delay_max, "Max zio millisec delay before posting event");
+module_param(zio_slow_io_ms, int, 0644);
+MODULE_PARM_DESC(zio_slow_io_ms,
+	"Max I/O completion time (milliseconds) before marking it as slow");

 module_param(zio_requeue_io_start_cut_in_line, int, 0644);
 MODULE_PARM_DESC(zio_requeue_io_start_cut_in_line, "Prioritize requeued I/O");
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@ -546,7 +546,7 @@ tags = ['functional', 'exec']
 tests = ['auto_online_001_pos', 'auto_replace_001_pos', 'auto_spare_001_pos',
    'auto_spare_002_pos', 'auto_spare_ashift', 'auto_spare_multiple',
    'auto_spare_shared', 'scrub_after_resilver', 'decrypt_fault',
-    'decompress_fault']
+    'decompress_fault','zpool_status_-s']
 tags = ['functional', 'fault']

 [tests/functional/features/async_destroy]
--- a/tests/zfs-tests/tests/functional/fault/Makefile.am
+++ b/tests/zfs-tests/tests/functional/fault/Makefile.am
@ -11,7 +11,8 @@ dist_pkgdata_SCRIPTS = \
 	auto_spare_shared.ksh \
 	decrypt_fault.ksh \
 	decompress_fault.ksh \
-	scrub_after_resilver.ksh
+	scrub_after_resilver.ksh \
+	zpool_status_-s.ksh

 dist_pkgdata_DATA = \
 	fault.cfg
--- a/tests/zfs-tests/tests/functional/fault/zpool_status_-s.ksh
+++ b/tests/zfs-tests/tests/functional/fault/zpool_status_-s.ksh
@ -0,0 +1,77 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2018 by Lawrence Livermore National Security, LLC.
+#
+
+# DESCRIPTION:
+#	Verify zpool status -s (slow IOs) works
+#
+# STRATEGY:
+#	1. Create a file
+#	2. Inject slow IOs into the pool
+#	3. Verify we can see the slow IOs with "zpool status -s".
+#	4. Verify we can see delay events.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/include/zpool_script.shlib
+
+DISK=${DISKS%% *}
+
+verify_runnable "both"
+
+log_must zpool create $TESTPOOL mirror ${DISKS}
+
+function cleanup
+{
+	log_must zinject -c all
+	log_must set_tunable64 zio_slow_io_ms $OLD_SLOW_IO
+	log_must set_tunable64 zfs_slow_io_events_per_second $OLD_SLOW_IO_EVENTS
+	log_must destroy_pool $TESTPOOL
+}
+
+log_onexit cleanup
+
+log_must zpool events -c
+
+# Mark any IOs greater than 10ms as slow IOs
+OLD_SLOW_IO=$(get_tunable zio_slow_io_ms)
+OLD_SLOW_IO_EVENTS=$(get_tunable zfs_slow_io_events_per_second)
+log_must set_tunable64 zio_slow_io_ms 10
+log_must set_tunable64 zfs_slow_io_events_per_second 1000
+
+# Create 20ms IOs
+log_must zinject -d $DISK -D20:100 $TESTPOOL
+log_must mkfile 1048576 /$TESTPOOL/testfile
+log_must zpool sync $TESTPOOL
+
+log_must zinject -c all
+SLOW_IOS=$(zpool status -sp | grep "$DISK" | awk '{print $6}')
+DELAY_EVENTS=$(zpool events | grep delay | wc -l)
+
+if [ $SLOW_IOS -gt 0 ] && [ $DELAY_EVENTS -gt 0 ] ; then
+	log_pass "Correctly saw $SLOW_IOS slow IOs and $DELAY_EVENTS delay events"
+else
+	log_fail "Only saw $SLOW_IOS slow IOs and $DELAY_EVENTS delay events"
+fi
--- a/tests/zfs-tests/tests/perf/perf.shlib
+++ b/tests/zfs-tests/tests/perf/perf.shlib
@ -410,7 +410,7 @@ function get_system_config
 		    zfs_vdev_async_write_max_active \
 		    zfs_vdev_sync_read_max_active \
 		    zfs_vdev_sync_write_max_active \
-		    zio_delay_max
+		    zio_slow_io_ms
 		do
 			if [ "$tunable" != "zfs_arc_max" ]
 			then