device-mapper-multipath/0004-libmultipath-fix-marginal-paths-queueing-errors.patch
Benjamin Marzinski 35f5570500 device-mapper-multipath-0.8.0-1
Update Source to upstream version 0.8.0
  * Previous patches 0006 & 0007 are included in this commit
Rename files
  * Previous patches 0008-0016 & 0100 are now patches 0018-0027
Add 0006-multipathd-Fix-miscounting-active-paths.patch
Add 0007-multipathd-ignore-failed-wwid-recheck.patch
  * multipathd will no longer disable paths if it is unable to
    get their wwid on a change event
Add 0008-libmutipath-continue-to-use-old-state-on-PATH_PENDIN.patch
Add 0009-multipathd-use-update_path_groups-instead-of-reload_.patch
Add 0010-multipath.conf-add-missing-options-to-man-page.patch
Add 0011-libmultipath-add-get_uid-fallback-code-for-NVMe-devi.patch
Add 0012-libmulitpath-cleanup-uid_fallback-code.patch
Add 0013-multipathd-handle-changed-wwids-by-removal-and-addit.patch
  * if a path device changes wwid, it will now be removed and re-added
    to the correct multipath device.
Add 0014-multipathd-remove-wwid_changed-path-attribute.patch
Add 0015-multipathd-ignore-disable_changed_wwids.patch
Add 0016-multipathd-Don-t-use-fallback-code-after-getting-wwi.patch
Add 0017-libmultipath-silence-dm_is_mpath-error-messages.patch
  * The above 12 patches have been submitted upstream
2019-04-05 00:20:53 -05:00

177 lines
6.5 KiB
Diff

From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Benjamin Marzinski <bmarzins@redhat.com>
Date: Mon, 28 Jan 2019 00:20:53 -0600
Subject: [PATCH] libmultipath: fix marginal paths queueing errors
The current marginal paths code tries to enqueue paths for io error
checking when multipathd receives a uevent on path failure. This can run
into a couple of problems. First, this uevent could happen before or
after multipathd actually fails the path, so simply checking nr_active
doesn't tell us if this is the last active path. Also, The code to fail
the path in enqueue_io_err_stat_by_path() doesn't ever update the path
state. This can cause the path to get failed twice, temporarily leading
to incorrect nr_active counts. Further, The point when multipathd should
care if this is the last active path is when the path has come up again,
not when it goes down. Lastly, if the path is down, it is often
impossible to open the path device, causing setup_directio_ctx() to
fail, which causes multipathd to skip io error checking and mark the
path as not marginal.
Instead, multipathd should just make sure that if the path is marginal,
it gets failed in the uevent, so as not to race with the checkerloop
thread. Then, when the path comes back up, check_path() can enqueue it,
just like it does for paths that need to get rechecked. To make it
obvious that the state PATH_IO_ERR_IN_POLLING_RECHECK and the function
hit_io_err_recheck_time() now apply to paths waiting to be enqueued for
the first time as well, I've also changed their names to
PATH_IO_ERR_WAITING_TO_CHECK and need_io_err_check(), respectively.
Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
---
libmultipath/io_err_stat.c | 55 +++++++++++++++++---------------------
libmultipath/io_err_stat.h | 2 +-
multipathd/main.c | 2 +-
3 files changed, 27 insertions(+), 32 deletions(-)
diff --git a/libmultipath/io_err_stat.c b/libmultipath/io_err_stat.c
index 416e13a..72aacf3 100644
--- a/libmultipath/io_err_stat.c
+++ b/libmultipath/io_err_stat.c
@@ -41,7 +41,7 @@
#define CONCUR_NR_EVENT 32
#define PATH_IO_ERR_IN_CHECKING -1
-#define PATH_IO_ERR_IN_POLLING_RECHECK -2
+#define PATH_IO_ERR_WAITING_TO_CHECK -2
#define io_err_stat_log(prio, fmt, args...) \
condlog(prio, "io error statistic: " fmt, ##args)
@@ -283,24 +283,6 @@ static int enqueue_io_err_stat_by_path(struct path *path)
vector_set_slot(paths->pathvec, p);
pthread_mutex_unlock(&paths->mutex);
- if (!path->io_err_disable_reinstate) {
- /*
- *fail the path in the kernel for the time of the to make
- *the test more reliable
- */
- io_err_stat_log(3, "%s: fail dm path %s before checking",
- path->mpp->alias, path->dev);
- path->io_err_disable_reinstate = 1;
- dm_fail_path(path->mpp->alias, path->dev_t);
- update_queue_mode_del_path(path->mpp);
-
- /*
- * schedule path check as soon as possible to
- * update path state to delayed state
- */
- path->tick = 1;
-
- }
io_err_stat_log(2, "%s: enqueue path %s to check",
path->mpp->alias, path->dev);
return 0;
@@ -317,7 +299,6 @@ free_ioerr_path:
int io_err_stat_handle_pathfail(struct path *path)
{
struct timespec curr_time;
- int res;
if (uatomic_read(&io_err_thread_running) == 0)
return 1;
@@ -332,8 +313,6 @@ int io_err_stat_handle_pathfail(struct path *path)
if (!path->mpp)
return 1;
- if (path->mpp->nr_active <= 1)
- return 1;
if (path->mpp->marginal_path_double_failed_time <= 0 ||
path->mpp->marginal_path_err_sample_time <= 0 ||
path->mpp->marginal_path_err_recheck_gap_time <= 0 ||
@@ -371,17 +350,33 @@ int io_err_stat_handle_pathfail(struct path *path)
}
path->io_err_pathfail_cnt++;
if (path->io_err_pathfail_cnt >= FLAKY_PATHFAIL_THRESHOLD) {
- res = enqueue_io_err_stat_by_path(path);
- if (!res)
- path->io_err_pathfail_cnt = PATH_IO_ERR_IN_CHECKING;
- else
- path->io_err_pathfail_cnt = 0;
+ path->io_err_disable_reinstate = 1;
+ path->io_err_pathfail_cnt = PATH_IO_ERR_WAITING_TO_CHECK;
+ /* enqueue path as soon as it comes up */
+ path->io_err_dis_reinstate_time = 0;
+ if (path->state != PATH_DOWN) {
+ struct config *conf;
+ int oldstate = path->state;
+ int checkint;
+
+ conf = get_multipath_config();
+ checkint = conf->checkint;
+ put_multipath_config(conf);
+ io_err_stat_log(2, "%s: mark as failed", path->dev);
+ path->mpp->stat_path_failures++;
+ path->state = PATH_DOWN;
+ path->dmstate = PSTATE_FAILED;
+ if (oldstate == PATH_UP || oldstate == PATH_GHOST)
+ update_queue_mode_del_path(path->mpp);
+ if (path->tick > checkint)
+ path->tick = checkint;
+ }
}
return 0;
}
-int hit_io_err_recheck_time(struct path *pp)
+int need_io_err_check(struct path *pp)
{
struct timespec curr_time;
int r;
@@ -392,7 +387,7 @@ int hit_io_err_recheck_time(struct path *pp)
io_err_stat_log(2, "%s: recover path early", pp->dev);
goto recover;
}
- if (pp->io_err_pathfail_cnt != PATH_IO_ERR_IN_POLLING_RECHECK)
+ if (pp->io_err_pathfail_cnt != PATH_IO_ERR_WAITING_TO_CHECK)
return 1;
if (clock_gettime(CLOCK_MONOTONIC, &curr_time) != 0 ||
(curr_time.tv_sec - pp->io_err_dis_reinstate_time) >
@@ -489,7 +484,7 @@ static int poll_io_err_stat(struct vectors *vecs, struct io_err_stat_path *pp)
} else if (path->mpp && path->mpp->nr_active > 1) {
io_err_stat_log(3, "%s: keep failing the dm path %s",
path->mpp->alias, path->dev);
- path->io_err_pathfail_cnt = PATH_IO_ERR_IN_POLLING_RECHECK;
+ path->io_err_pathfail_cnt = PATH_IO_ERR_WAITING_TO_CHECK;
path->io_err_disable_reinstate = 1;
path->io_err_dis_reinstate_time = currtime.tv_sec;
io_err_stat_log(3, "%s: disable reinstating of %s",
diff --git a/libmultipath/io_err_stat.h b/libmultipath/io_err_stat.h
index bbf31b4..53d6d7d 100644
--- a/libmultipath/io_err_stat.h
+++ b/libmultipath/io_err_stat.h
@@ -10,6 +10,6 @@ extern pthread_attr_t io_err_stat_attr;
int start_io_err_stat_thread(void *data);
void stop_io_err_stat_thread(void);
int io_err_stat_handle_pathfail(struct path *path);
-int hit_io_err_recheck_time(struct path *pp);
+int need_io_err_check(struct path *pp);
#endif /* _IO_ERR_STAT_H */
diff --git a/multipathd/main.c b/multipathd/main.c
index fe6d8ef..43830e8 100644
--- a/multipathd/main.c
+++ b/multipathd/main.c
@@ -2080,7 +2080,7 @@ check_path (struct vectors * vecs, struct path * pp, int ticks)
}
if ((newstate == PATH_UP || newstate == PATH_GHOST) &&
- pp->io_err_disable_reinstate && hit_io_err_recheck_time(pp)) {
+ pp->io_err_disable_reinstate && need_io_err_check(pp)) {
pp->state = PATH_SHAKY;
/*
* to reschedule as soon as possible,so that this path can
--
2.17.2