83 lines
3.4 KiB
Diff
83 lines
3.4 KiB
Diff
commit 28ddb87a2f4562c5d1752a778744cc56136f81c1
|
|
Author: Fabio M. Di Nitto <fdinitto@redhat.com>
|
|
Date: Sun Nov 7 17:02:05 2021 +0100
|
|
|
|
[udp] use ICMP error messages to trigger faster link down detection
|
|
|
|
this solves a possible race condition when:
|
|
|
|
- node1 is running
|
|
- node2 restarts very fast
|
|
- node1 does NOT have enough time to detect that node2 has gone
|
|
and reset the local seq numbers / buffers
|
|
- node1 will start rejecting valid packets from node2
|
|
|
|
There is still a potential minor race condition where app
|
|
can restart so fast that kernel / network don't have time
|
|
to generate an ICMP error. This will be addressed using
|
|
instance id in onwire v2 protocol, as suggested by Jan F.
|
|
|
|
Signed-off-by: Fabio M. Di Nitto <fdinitto@redhat.com>
|
|
|
|
diff --git a/libknet/transport_udp.c b/libknet/transport_udp.c
|
|
index 963340d..32dd032 100644
|
|
--- a/libknet/transport_udp.c
|
|
+++ b/libknet/transport_udp.c
|
|
@@ -364,6 +364,46 @@ static int read_errs_from_sock(knet_handle_t knet_h, int sockfd)
|
|
log_debug(knet_h, KNET_SUB_TRANSP_UDP, "Received ICMP error from %s: %s destination unknown", addr_str, strerror(sock_err->ee_errno));
|
|
} else {
|
|
log_debug(knet_h, KNET_SUB_TRANSP_UDP, "Received ICMP error from %s: %s %s", addr_str, strerror(sock_err->ee_errno), addr_remote_str);
|
|
+ if ((sock_err->ee_errno == ECONNREFUSED) || /* knet is not running on the other node */
|
|
+ (sock_err->ee_errno == ECONNABORTED) || /* local kernel closed the socket */
|
|
+ (sock_err->ee_errno == ENONET) || /* network does not exist */
|
|
+ (sock_err->ee_errno == ENETUNREACH) || /* network unreachable */
|
|
+ (sock_err->ee_errno == EHOSTUNREACH) || /* host unreachable */
|
|
+ (sock_err->ee_errno == EHOSTDOWN) || /* host down (from kernel/net/ipv4/icmp.c) */
|
|
+ (sock_err->ee_errno == ENETDOWN)) { /* network down */
|
|
+ struct knet_host *host = NULL;
|
|
+ struct knet_link *kn_link = NULL;
|
|
+ int link_idx, found = 0;
|
|
+
|
|
+ for (host = knet_h->host_head; host != NULL; host = host->next) {
|
|
+ for (link_idx = 0; link_idx < KNET_MAX_LINK; link_idx++) {
|
|
+ kn_link = &host->link[link_idx];
|
|
+ if (kn_link->outsock == sockfd) {
|
|
+ if (!cmpaddr(&remote, &kn_link->dst_addr)) {
|
|
+ found = 1;
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ if (found) {
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if ((host) && (kn_link) &&
|
|
+ (kn_link->status.connected)) {
|
|
+ log_debug(knet_h, KNET_SUB_TRANSP_UDP, "Setting down host %u link %i", host->host_id, kn_link->link_id);
|
|
+ /*
|
|
+ * setting transport_connected = 0 will trigger
|
|
+ * thread_heartbeat link_down process.
|
|
+ *
|
|
+ * the process terminates by calling into udp_transport_link_is_down
|
|
+ * below, which sets transport_connected = 1 again
|
|
+ */
|
|
+ kn_link->transport_connected = 0;
|
|
+ }
|
|
+
|
|
+ }
|
|
}
|
|
}
|
|
break;
|
|
@@ -436,5 +476,9 @@ int udp_transport_link_dyn_connect(knet_handle_t knet_h, int sockfd, struct knet
|
|
|
|
int udp_transport_link_is_down(knet_handle_t knet_h, struct knet_link *kn_link)
|
|
{
|
|
+ /*
|
|
+ * see comments about handling ICMP error messages in read_errs_from_sock
|
|
+ */
|
|
+ kn_link->transport_connected = 1;
|
|
return 0;
|
|
}
|