frr/0027-dynamic-netlink-buffer.patch

268 lines
8.4 KiB
Diff

From 2cf7651f0b1b0123dc5568ebad00ac84a9b3c348 Mon Sep 17 00:00:00 2001
From: Donald Sharp <sharpd@nvidia.com>
Date: Wed, 2 Feb 2022 13:28:42 -0500
Subject: [PATCH] zebra: Make netlink buffer reads resizeable when needed
Currently when the kernel sends netlink messages to FRR
the buffers to receive this data is of fixed length.
The kernel, with certain configurations, will send
netlink messages that are larger than this fixed length.
This leads to situations where, on startup, zebra gets
really confused about the state of the kernel. Effectively
the current algorithm is this:
read up to buffer in size
while (data to parse)
get netlink message header, look at size
parse if you can
The problem is that there is a 32k buffer we read.
We get the first message that is say 1k in size,
subtract that 1k to 31k left to parse. We then
get the next header and notice that the length
of the message is 33k. Which is obviously larger
than what we read in. FRR has no recover mechanism
nor is there a way to know, a priori, what the maximum
size the kernel will send us.
Modify FRR to look at the kernel message and see if the
buffer is large enough, if not, make it large enough to
read in the message.
This code has to be per netlink socket because of the usage
of pthreads. So add to `struct nlsock` the buffer and current
buffer length. Growing it as necessary.
Fixes: #10404
Signed-off-by: Donald Sharp <sharpd@nvidia.com>
---
zebra/kernel_netlink.c | 68 +++++++++++++++++++++++++-----------------
zebra/kernel_netlink.h | 2 +-
zebra/zebra_dplane.c | 4 +++
zebra/zebra_ns.h | 3 ++
4 files changed, 49 insertions(+), 28 deletions(-)
diff --git a/zebra/kernel_netlink.h b/zebra/kernel_netlink.h
index ae88f3372b1c..9421ea1c611a 100644
--- a/zebra/kernel_netlink.h
+++ b/zebra/kernel_netlink.h
@@ -96,7 +96,7 @@ extern const char *nl_family_to_str(uint8_t family);
extern const char *nl_rttype_to_str(uint8_t rttype);
extern int netlink_parse_info(int (*filter)(struct nlmsghdr *, ns_id_t, int),
- const struct nlsock *nl,
+ struct nlsock *nl,
const struct zebra_dplane_info *dp_info,
int count, int startup);
extern int netlink_talk_filter(struct nlmsghdr *h, ns_id_t ns, int startup);
diff --git a/zebra/zebra_ns.h b/zebra/zebra_ns.h
index 0519e1d5b33d..7a0ffbc1ee6f 100644
--- a/zebra/zebra_ns.h
+++ b/zebra/zebra_ns.h
@@ -39,6 +39,9 @@ struct nlsock {
int seq;
struct sockaddr_nl snl;
char name[64];
+
+ uint8_t *buf;
+ size_t buflen;
};
#endif
diff --git a/zebra/kernel_netlink.c b/zebra/kernel_netlink.c
index b8eaeb1..14a40a9 100644
--- a/zebra/kernel_netlink.c
+++ b/zebra/kernel_netlink.c
@@ -90,8 +90,6 @@
*/
#define NL_DEFAULT_BATCH_SEND_THRESHOLD (15 * NL_PKT_BUF_SIZE)
-#define NL_BATCH_RX_BUFSIZE NL_RCV_PKT_BUF_SIZE
-
static const struct message nlmsg_str[] = {{RTM_NEWROUTE, "RTM_NEWROUTE"},
{RTM_DELROUTE, "RTM_DELROUTE"},
{RTM_GETROUTE, "RTM_GETROUTE"},
@@ -164,8 +162,6 @@ DEFINE_MTYPE_STATIC(ZEBRA, NL_BUF, "Zebra Netlink buffers")
size_t nl_batch_tx_bufsize;
char *nl_batch_tx_buf;
-char nl_batch_rx_buf[NL_BATCH_RX_BUFSIZE];
-
_Atomic uint32_t nl_batch_bufsize = NL_DEFAULT_BATCH_BUFSIZE;
_Atomic uint32_t nl_batch_send_threshold = NL_DEFAULT_BATCH_SEND_THRESHOLD;
@@ -322,6 +318,9 @@ static int netlink_socket(struct nlsock *nl, unsigned long groups,
nl->snl = snl;
nl->sock = sock;
+ nl->buflen = NL_RCV_PKT_BUF_SIZE;
+ nl->buf = XMALLOC(MTYPE_NL_BUF, nl->buflen);
+
return ret;
}
@@ -729,19 +728,29 @@ static ssize_t netlink_send_msg(const struct nlsock *nl, void *buf,
*
* Returns -1 on error, 0 if read would block or the number of bytes received.
*/
-static int netlink_recv_msg(const struct nlsock *nl, struct msghdr msg,
- void *buf, size_t buflen)
+static int netlink_recv_msg(struct nlsock *nl, struct msghdr *msg)
{
struct iovec iov;
int status;
- iov.iov_base = buf;
- iov.iov_len = buflen;
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
+ iov.iov_base = nl->buf;
+ iov.iov_len = nl->buflen;
+ msg->msg_iov = &iov;
+ msg->msg_iovlen = 1;
do {
- status = recvmsg(nl->sock, &msg, 0);
+ int bytes;
+
+ bytes = recv(nl->sock, NULL, 0, MSG_PEEK | MSG_TRUNC);
+
+ if (bytes >= 0 && (size_t)bytes > nl->buflen) {
+ nl->buf = XREALLOC(MTYPE_NL_BUF, nl->buf, bytes);
+ nl->buflen = bytes;
+ iov.iov_base = nl->buf;
+ iov.iov_len = nl->buflen;
+ }
+
+ status = recvmsg(nl->sock, msg, 0);
} while (status == -1 && errno == EINTR);
if (status == -1) {
@@ -761,10 +770,10 @@ static int netlink_recv_msg(const struct nlsock *nl, struct msghdr msg,
return -1;
}
- if (msg.msg_namelen != sizeof(struct sockaddr_nl)) {
+ if (msg->msg_namelen != sizeof(struct sockaddr_nl)) {
flog_err(EC_ZEBRA_NETLINK_LENGTH_ERROR,
"%s sender address length error: length %d", nl->name,
- msg.msg_namelen);
+ msg->msg_namelen);
return -1;
}
@@ -873,8 +882,7 @@ static int netlink_parse_error(const struct nlsock *nl, struct nlmsghdr *h,
* the filter.
*/
int netlink_parse_info(int (*filter)(struct nlmsghdr *, ns_id_t, int),
- const struct nlsock *nl,
- const struct zebra_dplane_info *zns,
+ struct nlsock *nl, const struct zebra_dplane_info *zns,
int count, int startup)
{
int status;
@@ -883,7 +891,6 @@ int netlink_parse_info(int (*filter)(struct nlmsghdr *, ns_id_t, int),
int read_in = 0;
while (1) {
- char buf[NL_RCV_PKT_BUF_SIZE];
struct sockaddr_nl snl;
struct msghdr msg = {.msg_name = (void *)&snl,
.msg_namelen = sizeof(snl)};
@@ -892,14 +899,14 @@ int netlink_parse_info(int (*filter)(struct nlmsghdr *, ns_id_t, int),
if (count && read_in >= count)
return 0;
- status = netlink_recv_msg(nl, msg, buf, sizeof(buf));
+ status = netlink_recv_msg(nl, &msg);
if (status == -1)
return -1;
else if (status == 0)
break;
read_in++;
- for (h = (struct nlmsghdr *)buf;
+ for (h = (struct nlmsghdr *)nl->buf;
(status >= 0 && NLMSG_OK(h, (unsigned int)status));
h = NLMSG_NEXT(h, status)) {
/* Finish of reading. */
@@ -976,10 +983,10 @@ int netlink_parse_info(int (*filter)(struct nlmsghdr *, ns_id_t, int),
*/
static int
netlink_talk_info(int (*filter)(struct nlmsghdr *, ns_id_t, int startup),
- struct nlmsghdr *n, const struct zebra_dplane_info *dp_info,
+ struct nlmsghdr *n, struct zebra_dplane_info *dp_info,
int startup)
{
- const struct nlsock *nl;
+ struct nlsock *nl;
nl = &(dp_info->nls);
n->nlmsg_seq = nl->seq;
@@ -1067,12 +1074,11 @@ static int nl_batch_read_resp(struct nl_batch *bth)
* message at a time.
*/
while (true) {
- status = netlink_recv_msg(nl, msg, nl_batch_rx_buf,
- sizeof(nl_batch_rx_buf));
+ status = netlink_recv_msg(nl, &msg);
if (status == -1 || status == 0)
return status;
- h = (struct nlmsghdr *)nl_batch_rx_buf;
+ h = (struct nlmsghdr *)nl->buf;
ignore_msg = false;
seq = h->nlmsg_seq;
/*
@@ -1506,11 +1512,15 @@ void kernel_terminate(struct zebra_ns *zns, bool complete)
if (zns->netlink.sock >= 0) {
close(zns->netlink.sock);
zns->netlink.sock = -1;
+ XFREE(MTYPE_NL_BUF, zns->netlink.buf);
+ zns->netlink.buflen = 0;
}
if (zns->netlink_cmd.sock >= 0) {
close(zns->netlink_cmd.sock);
zns->netlink_cmd.sock = -1;
+ XFREE(MTYPE_NL_BUF, zns->netlink_cmd.buf);
+ zns->netlink_cmd.buflen = 0;
}
/* During zebra shutdown, we need to leave the dataplane socket
@@ -1520,6 +1530,8 @@ void kernel_terminate(struct zebra_ns *zns, bool complete)
if (zns->netlink_dplane.sock >= 0) {
close(zns->netlink_dplane.sock);
zns->netlink_dplane.sock = -1;
+ XFREE(MTYPE_NL_BUF, zns->netlink_dplane.buf);
+ zns->netlink_dplane.buflen = 0;
}
}
}
diff --git a/zebra/kernel_netlink.c b/zebra/kernel_netlink.c
index 14a40a9..2b566d4 100644
--- a/zebra/kernel_netlink.c
+++ b/zebra/kernel_netlink.c
@@ -779,7 +779,7 @@ static int netlink_recv_msg(struct nlsock *nl, struct msghdr *msg)
if (IS_ZEBRA_DEBUG_KERNEL_MSGDUMP_RECV) {
zlog_debug("%s: << netlink message dump [recv]", __func__);
- zlog_hexdump(buf, status);
+ zlog_hexdump(nl->buf, status);
}
return status;
diff --git a/zebra/kernel_netlink.c b/zebra/kernel_netlink.c
index 2b566d4..0564a6b 100644
--- a/zebra/kernel_netlink.c
+++ b/zebra/kernel_netlink.c
@@ -1060,7 +1060,7 @@ static int nl_batch_read_resp(struct nl_batch *bth)
struct sockaddr_nl snl;
struct msghdr msg = {};
int status, seq;
- const struct nlsock *nl;
+ struct nlsock *nl;
struct zebra_dplane_ctx *ctx;
bool ignore_msg;