Recreate RHEL 5.14.0-687.19.1 from CS9/upstream backports

This commit is contained in:
Andrew Lukoshko 2026-06-29 11:18:51 +00:00
parent 7f71864ff0
commit 26248cc311
384 changed files with 61990 additions and 2 deletions

View File

@ -0,0 +1,508 @@
From 23e3d86443306f1ab3a60ea10e0a8403ecbbdb27 Mon Sep 17 00:00:00 2001
From: CKI Backport Bot <cki-ci-bot+cki-gitlab-backport-bot@redhat.com>
Date: Fri, 15 May 2026 17:29:34 +0000
Subject: [PATCH] netfilter: flowtable: strictly check for maximum number of
actions
JIRA: https://redhat.atlassian.net/browse/RHEL-176922
CVE: CVE-2026-43329
commit 76522fcdbc3a02b568f5d957f7e66fc194abb893
Author: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu Mar 26 00:17:09 2026 +0100
netfilter: flowtable: strictly check for maximum number of actions
The maximum number of flowtable hardware offload actions in IPv6 is:
* ethernet mangling (4 payload actions, 2 for each ethernet address)
* SNAT (4 payload actions)
* DNAT (4 payload actions)
* Double VLAN (4 vlan actions, 2 for popping vlan, and 2 for pushing)
for QinQ.
* Redirect (1 action)
Which makes 17, while the maximum is 16. But act_ct supports for tunnels
actions too. Note that payload action operates at 32-bit word level, so
mangling an IPv6 address takes 4 payload actions.
Update flow_action_entry_next() calls to check for the maximum number of
supported actions.
While at it, rise the maximum number of actions per flow from 16 to 24
so this works fine with IPv6 setups.
Fixes: c29f74e0df7a ("netfilter: nf_flow_table: hardware offload support")
Reported-by: Hyunwoo Kim <imv4bel@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: CKI Backport Bot <cki-ci-bot+cki-gitlab-backport-bot@redhat.com>
diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c
index e59fa3be408c..0f3bccc69b57 100644
--- a/net/netfilter/nf_flow_table_offload.c
+++ b/net/netfilter/nf_flow_table_offload.c
@@ -13,6 +13,8 @@
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_tuple.h>
+#define NF_FLOW_RULE_ACTION_MAX 24
+
static struct workqueue_struct *nf_flow_offload_add_wq;
static struct workqueue_struct *nf_flow_offload_del_wq;
static struct workqueue_struct *nf_flow_offload_stats_wq;
@@ -208,7 +210,12 @@ static void flow_offload_mangle(struct flow_action_entry *entry,
static inline struct flow_action_entry *
flow_action_entry_next(struct nf_flow_rule *flow_rule)
{
- int i = flow_rule->rule->action.num_entries++;
+ int i;
+
+ if (unlikely(flow_rule->rule->action.num_entries >= NF_FLOW_RULE_ACTION_MAX))
+ return NULL;
+
+ i = flow_rule->rule->action.num_entries++;
return &flow_rule->rule->action.entries[i];
}
@@ -226,6 +233,9 @@ static int flow_offload_eth_src(struct net *net,
u32 mask, val;
u16 val16;
+ if (!entry0 || !entry1)
+ return -E2BIG;
+
this_tuple = &flow->tuplehash[dir].tuple;
switch (this_tuple->xmit_type) {
@@ -276,6 +286,9 @@ static int flow_offload_eth_dst(struct net *net,
u8 nud_state;
u16 val16;
+ if (!entry0 || !entry1)
+ return -E2BIG;
+
this_tuple = &flow->tuplehash[dir].tuple;
switch (this_tuple->xmit_type) {
@@ -317,16 +330,19 @@ static int flow_offload_eth_dst(struct net *net,
return 0;
}
-static void flow_offload_ipv4_snat(struct net *net,
- const struct flow_offload *flow,
- enum flow_offload_tuple_dir dir,
- struct nf_flow_rule *flow_rule)
+static int flow_offload_ipv4_snat(struct net *net,
+ const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
{
struct flow_action_entry *entry = flow_action_entry_next(flow_rule);
u32 mask = ~htonl(0xffffffff);
__be32 addr;
u32 offset;
+ if (!entry)
+ return -E2BIG;
+
switch (dir) {
case FLOW_OFFLOAD_DIR_ORIGINAL:
addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4.s_addr;
@@ -337,23 +353,27 @@ static void flow_offload_ipv4_snat(struct net *net,
offset = offsetof(struct iphdr, daddr);
break;
default:
- return;
+ return -EOPNOTSUPP;
}
flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP4, offset,
&addr, &mask);
+ return 0;
}
-static void flow_offload_ipv4_dnat(struct net *net,
- const struct flow_offload *flow,
- enum flow_offload_tuple_dir dir,
- struct nf_flow_rule *flow_rule)
+static int flow_offload_ipv4_dnat(struct net *net,
+ const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
{
struct flow_action_entry *entry = flow_action_entry_next(flow_rule);
u32 mask = ~htonl(0xffffffff);
__be32 addr;
u32 offset;
+ if (!entry)
+ return -E2BIG;
+
switch (dir) {
case FLOW_OFFLOAD_DIR_ORIGINAL:
addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4.s_addr;
@@ -364,14 +384,15 @@ static void flow_offload_ipv4_dnat(struct net *net,
offset = offsetof(struct iphdr, saddr);
break;
default:
- return;
+ return -EOPNOTSUPP;
}
flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP4, offset,
&addr, &mask);
+ return 0;
}
-static void flow_offload_ipv6_mangle(struct nf_flow_rule *flow_rule,
+static int flow_offload_ipv6_mangle(struct nf_flow_rule *flow_rule,
unsigned int offset,
const __be32 *addr, const __be32 *mask)
{
@@ -380,15 +401,20 @@ static void flow_offload_ipv6_mangle(struct nf_flow_rule *flow_rule,
for (i = 0; i < sizeof(struct in6_addr) / sizeof(u32); i++) {
entry = flow_action_entry_next(flow_rule);
+ if (!entry)
+ return -E2BIG;
+
flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP6,
offset + i * sizeof(u32), &addr[i], mask);
}
+
+ return 0;
}
-static void flow_offload_ipv6_snat(struct net *net,
- const struct flow_offload *flow,
- enum flow_offload_tuple_dir dir,
- struct nf_flow_rule *flow_rule)
+static int flow_offload_ipv6_snat(struct net *net,
+ const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
{
u32 mask = ~htonl(0xffffffff);
const __be32 *addr;
@@ -404,16 +430,16 @@ static void flow_offload_ipv6_snat(struct net *net,
offset = offsetof(struct ipv6hdr, daddr);
break;
default:
- return;
+ return -EOPNOTSUPP;
}
- flow_offload_ipv6_mangle(flow_rule, offset, addr, &mask);
+ return flow_offload_ipv6_mangle(flow_rule, offset, addr, &mask);
}
-static void flow_offload_ipv6_dnat(struct net *net,
- const struct flow_offload *flow,
- enum flow_offload_tuple_dir dir,
- struct nf_flow_rule *flow_rule)
+static int flow_offload_ipv6_dnat(struct net *net,
+ const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
{
u32 mask = ~htonl(0xffffffff);
const __be32 *addr;
@@ -429,10 +455,10 @@ static void flow_offload_ipv6_dnat(struct net *net,
offset = offsetof(struct ipv6hdr, saddr);
break;
default:
- return;
+ return -EOPNOTSUPP;
}
- flow_offload_ipv6_mangle(flow_rule, offset, addr, &mask);
+ return flow_offload_ipv6_mangle(flow_rule, offset, addr, &mask);
}
static int flow_offload_l4proto(const struct flow_offload *flow)
@@ -454,15 +480,18 @@ static int flow_offload_l4proto(const struct flow_offload *flow)
return type;
}
-static void flow_offload_port_snat(struct net *net,
- const struct flow_offload *flow,
- enum flow_offload_tuple_dir dir,
- struct nf_flow_rule *flow_rule)
+static int flow_offload_port_snat(struct net *net,
+ const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
{
struct flow_action_entry *entry = flow_action_entry_next(flow_rule);
u32 mask, port;
u32 offset;
+ if (!entry)
+ return -E2BIG;
+
switch (dir) {
case FLOW_OFFLOAD_DIR_ORIGINAL:
port = ntohs(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port);
@@ -477,22 +506,26 @@ static void flow_offload_port_snat(struct net *net,
mask = ~htonl(0xffff);
break;
default:
- return;
+ return -EOPNOTSUPP;
}
flow_offload_mangle(entry, flow_offload_l4proto(flow), offset,
&port, &mask);
+ return 0;
}
-static void flow_offload_port_dnat(struct net *net,
- const struct flow_offload *flow,
- enum flow_offload_tuple_dir dir,
- struct nf_flow_rule *flow_rule)
+static int flow_offload_port_dnat(struct net *net,
+ const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
{
struct flow_action_entry *entry = flow_action_entry_next(flow_rule);
u32 mask, port;
u32 offset;
+ if (!entry)
+ return -E2BIG;
+
switch (dir) {
case FLOW_OFFLOAD_DIR_ORIGINAL:
port = ntohs(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port);
@@ -507,20 +540,24 @@ static void flow_offload_port_dnat(struct net *net,
mask = ~htonl(0xffff0000);
break;
default:
- return;
+ return -EOPNOTSUPP;
}
flow_offload_mangle(entry, flow_offload_l4proto(flow), offset,
&port, &mask);
+ return 0;
}
-static void flow_offload_ipv4_checksum(struct net *net,
- const struct flow_offload *flow,
- struct nf_flow_rule *flow_rule)
+static int flow_offload_ipv4_checksum(struct net *net,
+ const struct flow_offload *flow,
+ struct nf_flow_rule *flow_rule)
{
u8 protonum = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto;
struct flow_action_entry *entry = flow_action_entry_next(flow_rule);
+ if (!entry)
+ return -E2BIG;
+
entry->id = FLOW_ACTION_CSUM;
entry->csum_flags = TCA_CSUM_UPDATE_FLAG_IPV4HDR;
@@ -532,12 +569,14 @@ static void flow_offload_ipv4_checksum(struct net *net,
entry->csum_flags |= TCA_CSUM_UPDATE_FLAG_UDP;
break;
}
+
+ return 0;
}
-static void flow_offload_redirect(struct net *net,
- const struct flow_offload *flow,
- enum flow_offload_tuple_dir dir,
- struct nf_flow_rule *flow_rule)
+static int flow_offload_redirect(struct net *net,
+ const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
{
const struct flow_offload_tuple *this_tuple, *other_tuple;
struct flow_action_entry *entry;
@@ -555,21 +594,28 @@ static void flow_offload_redirect(struct net *net,
ifindex = other_tuple->iifidx;
break;
default:
- return;
+ return -EOPNOTSUPP;
}
dev = dev_get_by_index(net, ifindex);
if (!dev)
- return;
+ return -ENODEV;
entry = flow_action_entry_next(flow_rule);
+ if (!entry) {
+ dev_put(dev);
+ return -E2BIG;
+ }
+
entry->id = FLOW_ACTION_REDIRECT;
entry->dev = dev;
+
+ return 0;
}
-static void flow_offload_encap_tunnel(const struct flow_offload *flow,
- enum flow_offload_tuple_dir dir,
- struct nf_flow_rule *flow_rule)
+static int flow_offload_encap_tunnel(const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
{
const struct flow_offload_tuple *this_tuple;
struct flow_action_entry *entry;
@@ -577,7 +623,7 @@ static void flow_offload_encap_tunnel(const struct flow_offload *flow,
this_tuple = &flow->tuplehash[dir].tuple;
if (this_tuple->xmit_type == FLOW_OFFLOAD_XMIT_DIRECT)
- return;
+ return 0;
dst = this_tuple->dst_cache;
if (dst && dst->lwtstate) {
@@ -586,15 +632,19 @@ static void flow_offload_encap_tunnel(const struct flow_offload *flow,
tun_info = lwt_tun_info(dst->lwtstate);
if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX)) {
entry = flow_action_entry_next(flow_rule);
+ if (!entry)
+ return -E2BIG;
entry->id = FLOW_ACTION_TUNNEL_ENCAP;
entry->tunnel = tun_info;
}
}
+
+ return 0;
}
-static void flow_offload_decap_tunnel(const struct flow_offload *flow,
- enum flow_offload_tuple_dir dir,
- struct nf_flow_rule *flow_rule)
+static int flow_offload_decap_tunnel(const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
{
const struct flow_offload_tuple *other_tuple;
struct flow_action_entry *entry;
@@ -602,7 +652,7 @@ static void flow_offload_decap_tunnel(const struct flow_offload *flow,
other_tuple = &flow->tuplehash[!dir].tuple;
if (other_tuple->xmit_type == FLOW_OFFLOAD_XMIT_DIRECT)
- return;
+ return 0;
dst = other_tuple->dst_cache;
if (dst && dst->lwtstate) {
@@ -611,9 +661,13 @@ static void flow_offload_decap_tunnel(const struct flow_offload *flow,
tun_info = lwt_tun_info(dst->lwtstate);
if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX)) {
entry = flow_action_entry_next(flow_rule);
+ if (!entry)
+ return -E2BIG;
entry->id = FLOW_ACTION_TUNNEL_DECAP;
}
}
+
+ return 0;
}
static int
@@ -625,8 +679,9 @@ nf_flow_rule_route_common(struct net *net, const struct flow_offload *flow,
const struct flow_offload_tuple *tuple;
int i;
- flow_offload_decap_tunnel(flow, dir, flow_rule);
- flow_offload_encap_tunnel(flow, dir, flow_rule);
+ if (flow_offload_decap_tunnel(flow, dir, flow_rule) < 0 ||
+ flow_offload_encap_tunnel(flow, dir, flow_rule) < 0)
+ return -1;
if (flow_offload_eth_src(net, flow, dir, flow_rule) < 0 ||
flow_offload_eth_dst(net, flow, dir, flow_rule) < 0)
@@ -642,6 +697,8 @@ nf_flow_rule_route_common(struct net *net, const struct flow_offload *flow,
if (tuple->encap[i].proto == htons(ETH_P_8021Q)) {
entry = flow_action_entry_next(flow_rule);
+ if (!entry)
+ return -1;
entry->id = FLOW_ACTION_VLAN_POP;
}
}
@@ -655,6 +712,8 @@ nf_flow_rule_route_common(struct net *net, const struct flow_offload *flow,
continue;
entry = flow_action_entry_next(flow_rule);
+ if (!entry)
+ return -1;
switch (other_tuple->encap[i].proto) {
case htons(ETH_P_PPP_SES):
@@ -680,18 +739,22 @@ int nf_flow_rule_route_ipv4(struct net *net, struct flow_offload *flow,
return -1;
if (test_bit(NF_FLOW_SNAT, &flow->flags)) {
- flow_offload_ipv4_snat(net, flow, dir, flow_rule);
- flow_offload_port_snat(net, flow, dir, flow_rule);
+ if (flow_offload_ipv4_snat(net, flow, dir, flow_rule) < 0 ||
+ flow_offload_port_snat(net, flow, dir, flow_rule) < 0)
+ return -1;
}
if (test_bit(NF_FLOW_DNAT, &flow->flags)) {
- flow_offload_ipv4_dnat(net, flow, dir, flow_rule);
- flow_offload_port_dnat(net, flow, dir, flow_rule);
+ if (flow_offload_ipv4_dnat(net, flow, dir, flow_rule) < 0 ||
+ flow_offload_port_dnat(net, flow, dir, flow_rule) < 0)
+ return -1;
}
if (test_bit(NF_FLOW_SNAT, &flow->flags) ||
test_bit(NF_FLOW_DNAT, &flow->flags))
- flow_offload_ipv4_checksum(net, flow, flow_rule);
+ if (flow_offload_ipv4_checksum(net, flow, flow_rule) < 0)
+ return -1;
- flow_offload_redirect(net, flow, dir, flow_rule);
+ if (flow_offload_redirect(net, flow, dir, flow_rule) < 0)
+ return -1;
return 0;
}
@@ -705,22 +768,23 @@ int nf_flow_rule_route_ipv6(struct net *net, struct flow_offload *flow,
return -1;
if (test_bit(NF_FLOW_SNAT, &flow->flags)) {
- flow_offload_ipv6_snat(net, flow, dir, flow_rule);
- flow_offload_port_snat(net, flow, dir, flow_rule);
+ if (flow_offload_ipv6_snat(net, flow, dir, flow_rule) < 0 ||
+ flow_offload_port_snat(net, flow, dir, flow_rule) < 0)
+ return -1;
}
if (test_bit(NF_FLOW_DNAT, &flow->flags)) {
- flow_offload_ipv6_dnat(net, flow, dir, flow_rule);
- flow_offload_port_dnat(net, flow, dir, flow_rule);
+ if (flow_offload_ipv6_dnat(net, flow, dir, flow_rule) < 0 ||
+ flow_offload_port_dnat(net, flow, dir, flow_rule) < 0)
+ return -1;
}
- flow_offload_redirect(net, flow, dir, flow_rule);
+ if (flow_offload_redirect(net, flow, dir, flow_rule) < 0)
+ return -1;
return 0;
}
EXPORT_SYMBOL_GPL(nf_flow_rule_route_ipv6);
-#define NF_FLOW_RULE_ACTION_MAX 16
-
static struct nf_flow_rule *
nf_flow_offload_rule_alloc(struct net *net,
const struct flow_offload_work *offload,
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,112 @@
From aed3d041ab061ec8a64f50a3edda0f4db7280025 Mon Sep 17 00:00:00 2001
From: Yussuf Khalil <dev@pp3345.net>
Date: Fri, 6 Mar 2026 12:06:35 +0000
Subject: [PATCH] drm/amd/display: Do not skip unrelated mode changes in DSC
validation
Starting with commit 17ce8a6907f7 ("drm/amd/display: Add dsc pre-validation in
atomic check"), amdgpu resets the CRTC state mode_changed flag to false when
recomputing the DSC configuration results in no timing change for a particular
stream.
However, this is incorrect in scenarios where a change in MST/DSC configuration
happens in the same KMS commit as another (unrelated) mode change. For example,
the integrated panel of a laptop may be configured differently (e.g., HDR
enabled/disabled) depending on whether external screens are attached. In this
case, plugging in external DP-MST screens may result in the mode_changed flag
being dropped incorrectly for the integrated panel if its DSC configuration
did not change during precomputation in pre_validate_dsc().
At this point, however, dm_update_crtc_state() has already created new streams
for CRTCs with DSC-independent mode changes. In turn,
amdgpu_dm_commit_streams() will never release the old stream, resulting in a
memory leak. amdgpu_dm_atomic_commit_tail() will never acquire a reference to
the new stream either, which manifests as a use-after-free when the stream gets
disabled later on:
BUG: KASAN: use-after-free in dc_stream_release+0x25/0x90 [amdgpu]
Write of size 4 at addr ffff88813d836524 by task kworker/9:9/29977
Workqueue: events drm_mode_rmfb_work_fn
Call Trace:
<TASK>
dump_stack_lvl+0x6e/0xa0
print_address_description.constprop.0+0x88/0x320
? dc_stream_release+0x25/0x90 [amdgpu]
print_report+0xfc/0x1ff
? srso_alias_return_thunk+0x5/0xfbef5
? __virt_addr_valid+0x225/0x4e0
? dc_stream_release+0x25/0x90 [amdgpu]
kasan_report+0xe1/0x180
? dc_stream_release+0x25/0x90 [amdgpu]
kasan_check_range+0x125/0x200
dc_stream_release+0x25/0x90 [amdgpu]
dc_state_destruct+0x14d/0x5c0 [amdgpu]
dc_state_release.part.0+0x4e/0x130 [amdgpu]
dm_atomic_destroy_state+0x3f/0x70 [amdgpu]
drm_atomic_state_default_clear+0x8ee/0xf30
? drm_mode_object_put.part.0+0xb1/0x130
__drm_atomic_state_free+0x15c/0x2d0
atomic_remove_fb+0x67e/0x980
Since there is no reliable way of figuring out whether a CRTC has unrelated
mode changes pending at the time of DSC validation, remember the value of the
mode_changed flag from before the point where a CRTC was marked as potentially
affected by a change in DSC configuration. Reset the mode_changed flag to this
earlier value instead in pre_validate_dsc().
Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/5004
Fixes: 17ce8a6907f7 ("drm/amd/display: Add dsc pre-validation in atomic check")
Signed-off-by: Yussuf Khalil <dev@pp3345.net>
Reviewed-by: Harry Wentland <harry.wentland@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
(cherry picked from commit cc7c7121ae082b7b82891baa7280f1ff2608f22b)
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 085cc98bd875..a9c398b1516b 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -12523,6 +12523,11 @@ static int amdgpu_dm_atomic_check(struct drm_device *dev,
}
if (dc_resource_is_dsc_encoding_supported(dc)) {
+ for_each_oldnew_crtc_in_state(state, crtc, old_crtc_state, new_crtc_state, i) {
+ dm_new_crtc_state = to_dm_crtc_state(new_crtc_state);
+ dm_new_crtc_state->mode_changed_independent_from_dsc = new_crtc_state->mode_changed;
+ }
+
for_each_oldnew_crtc_in_state(state, crtc, old_crtc_state, new_crtc_state, i) {
if (drm_atomic_crtc_needs_modeset(new_crtc_state)) {
ret = add_affected_mst_dsc_crtcs(state, crtc);
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
index 800813671748..d15812d51d72 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
@@ -984,6 +984,7 @@ struct dm_crtc_state {
bool freesync_vrr_info_changed;
+ bool mode_changed_independent_from_dsc;
bool dsc_force_changed;
bool vrr_supported;
struct mod_freesync_config freesync_config;
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
index 7be50e8c0636..5d8c4c7020b1 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
@@ -1744,9 +1744,11 @@ int pre_validate_dsc(struct drm_atomic_state *state,
int ind = find_crtc_index_in_state_by_stream(state, stream);
if (ind >= 0) {
+ struct dm_crtc_state *dm_new_crtc_state = to_dm_crtc_state(state->crtcs[ind].new_state);
+
DRM_INFO_ONCE("%s:%d MST_DSC no mode changed for stream 0x%p\n",
__func__, __LINE__, stream);
- state->crtcs[ind].new_state->mode_changed = 0;
+ dm_new_crtc_state->base.mode_changed = dm_new_crtc_state->mode_changed_independent_from_dsc;
}
}
}
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,63 @@
From 0452b6526b2f54b2413b9cb4ff1ea2ac542c99c7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 26 Mar 2026 20:26:08 +0000
Subject: [PATCH] ipv6: icmp: clear skb2->cb[] in ip6_err_gen_icmpv6_unreach()
[ Upstream commit 86ab3e55673a7a49a841838776f1ab18d23a67b5 ]
Sashiko AI-review observed:
In ip6_err_gen_icmpv6_unreach(), the skb is an outer IPv4 ICMP error packet
where its cb contains an IPv4 inet_skb_parm. When skb is cloned into skb2
and passed to icmp6_send(), it uses IP6CB(skb2).
IP6CB interprets the IPv4 inet_skb_parm as an inet6_skb_parm. The cipso
offset in inet_skb_parm.opt directly overlaps with dsthao in inet6_skb_parm
at offset 18.
If an attacker sends a forged ICMPv4 error with a CIPSO IP option, dsthao
would be a non-zero offset. Inside icmp6_send(), mip6_addr_swap() is called
and uses ipv6_find_tlv(skb, opt->dsthao, IPV6_TLV_HAO).
This would scan the inner, attacker-controlled IPv6 packet starting at that
offset, potentially returning a fake TLV without checking if the remaining
packet length can hold the full 18-byte struct ipv6_destopt_hao.
Could mip6_addr_swap() then perform a 16-byte swap that extends past the end
of the packet data into skb_shared_info?
Should the cb array also be cleared in ip6_err_gen_icmpv6_unreach() and
ip6ip6_err() to prevent this?
This patch implements the first suggestion.
I am not sure if ip6ip6_err() needs to be changed.
A separate patch would be better anyway.
Fixes: ca15a078bd90 ("sit: generate icmpv6 error when receiving icmpv4 error")
Reported-by: Ido Schimmel <idosch@nvidia.com>
Closes: https://sashiko.dev/#/patchset/20260326155138.2429480-1-edumazet%40google.com
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Oskar Kjos <oskar.kjos@hotmail.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Link: https://patch.msgid.link/20260326202608.2976021-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 8601c76f3cc9..6f053874de74 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -674,6 +674,9 @@ int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type,
if (!skb2)
return 1;
+ /* Remove debris left by IPv4 stack. */
+ memset(IP6CB(skb2), 0, sizeof(*IP6CB(skb2)));
+
skb_dst_drop(skb2);
skb_pull(skb2, nhs);
skb_reset_network_header(skb2);
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,137 @@
From dc9c57624e89fab59f90148b663d5171e0fa2416 Mon Sep 17 00:00:00 2001
From: CKI Backport Bot <cki-ci-bot+cki-gitlab-backport-bot@redhat.com>
Date: Wed, 27 May 2026 17:14:34 +0000
Subject: [PATCH] ALSA: aloop: Fix peer runtime UAF during format-change stop
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
JIRA: https://redhat.atlassian.net/browse/RHEL-179312
CVE: CVE-2026-46090
Backported from tree(s): linux
commit e5c33cdc6f402eab8abd36ecf436b22c9d3a8aff
Author: Cássio Gabriel <cassiogabrielcontato@gmail.com>
Date: Fri Apr 24 09:48:41 2026 -0300
ALSA: aloop: Fix peer runtime UAF during format-change stop
loopback_check_format() may stop the capture side when playback starts
with parameters that no longer match a running capture stream. Commit
826af7fa62e3 ("ALSA: aloop: Fix racy access at PCM trigger") moved
the peer lookup under cable->lock, but the actual snd_pcm_stop() still
runs after dropping that lock.
A concurrent close can clear the capture entry from cable->streams[] and
detach or free its runtime while the playback trigger path still holds a
stale peer substream pointer.
Keep a per-cable count of in-flight peer stops before dropping
cable->lock, and make free_cable() wait for those stops before
detaching the runtime. This preserves the existing behavior while
making the peer runtime lifetime explicit.
Reported-by: syzbot+8fa95c41eafbc9d2ff6f@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=8fa95c41eafbc9d2ff6f
Fixes: 597603d615d2 ("ALSA: introduce the snd-aloop module for the PCM loopback")
Cc: stable@vger.kernel.org
Suggested-by: Takashi Iwai <tiwai@suse.com>
Signed-off-by: Cássio Gabriel <cassiogabrielcontato@gmail.com>
Link: https://patch.msgid.link/20260424-alsa-aloop-peer-stop-uaf-v2-1-94e68101db8a@gmail.com
Signed-off-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: CKI Backport Bot <cki-ci-bot+cki-gitlab-backport-bot@redhat.com>
diff --git a/sound/drivers/aloop.c b/sound/drivers/aloop.c
index db137222d319..d2b9160a08dd 100644
--- a/sound/drivers/aloop.c
+++ b/sound/drivers/aloop.c
@@ -99,6 +99,9 @@ struct loopback_ops {
struct loopback_cable {
spinlock_t lock;
struct loopback_pcm *streams[2];
+ /* in-flight peer stops running outside cable->lock */
+ atomic_t stop_count;
+ wait_queue_head_t stop_wait;
struct snd_pcm_hardware hw;
/* flags */
unsigned int valid;
@@ -366,8 +369,11 @@ static int loopback_check_format(struct loopback_cable *cable, int stream)
return 0;
if (stream == SNDRV_PCM_STREAM_CAPTURE)
return -EIO;
- else if (cruntime->state == SNDRV_PCM_STATE_RUNNING)
+ else if (cruntime->state == SNDRV_PCM_STATE_RUNNING) {
+ /* close must not free the peer runtime below */
+ atomic_inc(&cable->stop_count);
stop_capture = true;
+ }
}
setup = get_setup(dpcm_play);
@@ -396,8 +402,11 @@ static int loopback_check_format(struct loopback_cable *cable, int stream)
}
}
- if (stop_capture)
+ if (stop_capture) {
snd_pcm_stop(dpcm_capt->substream, SNDRV_PCM_STATE_DRAINING);
+ if (atomic_dec_and_test(&cable->stop_count))
+ wake_up(&cable->stop_wait);
+ }
return 0;
}
@@ -1049,23 +1058,29 @@ static void free_cable(struct snd_pcm_substream *substream)
struct loopback *loopback = substream->private_data;
int dev = get_cable_index(substream);
struct loopback_cable *cable;
+ struct loopback_pcm *dpcm;
+ bool other_alive;
cable = loopback->cables[substream->number][dev];
if (!cable)
return;
- if (cable->streams[!substream->stream]) {
- /* other stream is still alive */
- guard(spinlock_irq)(&cable->lock);
- cable->streams[substream->stream] = NULL;
- } else {
- struct loopback_pcm *dpcm = substream->runtime->private_data;
- if (cable->ops && cable->ops->close_cable && dpcm)
- cable->ops->close_cable(dpcm);
- /* free the cable */
- loopback->cables[substream->number][dev] = NULL;
- kfree(cable);
+ scoped_guard(spinlock_irq, &cable->lock) {
+ cable->streams[substream->stream] = NULL;
+ other_alive = cable->streams[!substream->stream];
}
+
+ /* Pair with the stop_count increment in loopback_check_format(). */
+ wait_event(cable->stop_wait, !atomic_read(&cable->stop_count));
+ if (other_alive)
+ return;
+
+ dpcm = substream->runtime->private_data;
+ if (cable->ops && cable->ops->close_cable && dpcm)
+ cable->ops->close_cable(dpcm);
+ /* free the cable */
+ loopback->cables[substream->number][dev] = NULL;
+ kfree(cable);
}
static int loopback_jiffies_timer_open(struct loopback_pcm *dpcm)
@@ -1260,6 +1275,8 @@ static int loopback_open(struct snd_pcm_substream *substream)
goto unlock;
}
spin_lock_init(&cable->lock);
+ atomic_set(&cable->stop_count, 0);
+ init_waitqueue_head(&cable->stop_wait);
cable->hw = loopback_pcm_hardware;
if (loopback->timer_source)
cable->ops = &loopback_snd_timer_ops;
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,208 @@
From efd0aa1426972ae0542b15484850fdd73395262f Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Tue, 28 Apr 2026 13:24:18 -0400
Subject: [PATCH] RDMA/iwcm: Fix workqueue list corruption by removing
work_list
JIRA: https://redhat.atlassian.net/browse/RHEL-163491
commit 7874eeacfa42177565c01d5198726671acf7adf2
Author: Jacob Moroni <jmoroni@google.com>
Date: Mon Jan 12 02:00:06 2026 +0000
RDMA/iwcm: Fix workqueue list corruption by removing work_list
The commit e1168f0 ("RDMA/iwcm: Simplify cm_event_handler()")
changed the work submission logic to unconditionally call
queue_work() with the expectation that queue_work() would
have no effect if work was already pending. The problem is
that a free list of struct iwcm_work is used (for which
struct work_struct is embedded), so each call to queue_work()
is basically unique and therefore does indeed queue the work.
This causes a problem in the work handler which walks the work_list
until it's empty to process entries. This means that a single
run of the work handler could process item N+1 and release it
back to the free list while the actual workqueue entry is still
queued. It could then get reused (INIT_WORK...) and lead to
list corruption in the workqueue logic.
Fix this by just removing the work_list. The workqueue already
does this for us.
This fixes the following error that was observed when stress
testing with ucmatose on an Intel E830 in iWARP mode:
[ 151.465780] list_del corruption. next->prev should be ffff9f0915c69c08, but was ffff9f0a1116be08. (next=ffff9f0a15b11c08)
[ 151.466639] ------------[ cut here ]------------
[ 151.466986] kernel BUG at lib/list_debug.c:67!
[ 151.467349] Oops: invalid opcode: 0000 [#1] SMP NOPTI
[ 151.467753] CPU: 14 UID: 0 PID: 2306 Comm: kworker/u64:18 Not tainted 6.19.0-rc4+ #1 PREEMPT(voluntary)
[ 151.468466] Hardware name: QEMU Ubuntu 24.04 PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
[ 151.469192] Workqueue: 0x0 (iw_cm_wq)
[ 151.469478] RIP: 0010:__list_del_entry_valid_or_report+0xf0/0x100
[ 151.469942] Code: c7 58 5f 4c b2 e8 10 50 aa ff 0f 0b 48 89 ef e8 36 57 cb ff 48 8b 55 08 48 89 e9 48 89 de 48 c7 c7 a8 5f 4c b2 e8 f0 4f aa ff <0f> 0b 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 90 90 90 90 90 90
[ 151.471323] RSP: 0000:ffffb15644e7bd68 EFLAGS: 00010046
[ 151.471712] RAX: 000000000000006d RBX: ffff9f0915c69c08 RCX: 0000000000000027
[ 151.472243] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff9f0a37d9c600
[ 151.472768] RBP: ffff9f0a15b11c08 R08: 0000000000000000 R09: c0000000ffff7fff
[ 151.473294] R10: 0000000000000001 R11: ffffb15644e7bba8 R12: ffff9f092339ee68
[ 151.473817] R13: ffff9f0900059c28 R14: ffff9f092339ee78 R15: 0000000000000000
[ 151.474344] FS: 0000000000000000(0000) GS:ffff9f0a847b5000(0000) knlGS:0000000000000000
[ 151.474934] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 151.475362] CR2: 0000559e233a9088 CR3: 000000020296b004 CR4: 0000000000770ef0
[ 151.475895] PKRU: 55555554
[ 151.476118] Call Trace:
[ 151.476331] <TASK>
[ 151.476497] move_linked_works+0x49/0xa0
[ 151.476792] __pwq_activate_work.isra.46+0x2f/0xa0
[ 151.477151] pwq_dec_nr_in_flight+0x1e0/0x2f0
[ 151.477479] process_scheduled_works+0x1c8/0x410
[ 151.477823] worker_thread+0x125/0x260
[ 151.478108] ? __pfx_worker_thread+0x10/0x10
[ 151.478430] kthread+0xfe/0x240
[ 151.478671] ? __pfx_kthread+0x10/0x10
[ 151.478955] ? __pfx_kthread+0x10/0x10
[ 151.479240] ret_from_fork+0x208/0x270
[ 151.479523] ? __pfx_kthread+0x10/0x10
[ 151.479806] ret_from_fork_asm+0x1a/0x30
[ 151.480103] </TASK>
Fixes: e1168f09b331 ("RDMA/iwcm: Simplify cm_event_handler()")
Signed-off-by: Jacob Moroni <jmoroni@google.com>
Link: https://patch.msgid.link/20260112020006.1352438-1-jmoroni@google.com
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c
index 9419ab4435df..a2cf6135fcde 100644
--- a/drivers/infiniband/core/iwcm.c
+++ b/drivers/infiniband/core/iwcm.c
@@ -95,7 +95,6 @@ static struct workqueue_struct *iwcm_wq;
struct iwcm_work {
struct work_struct work;
struct iwcm_id_private *cm_id;
- struct list_head list;
struct iw_cm_event event;
struct list_head free_list;
};
@@ -179,7 +178,6 @@ static int alloc_work_entries(struct iwcm_id_private *cm_id_priv, int count)
return -ENOMEM;
}
work->cm_id = cm_id_priv;
- INIT_LIST_HEAD(&work->list);
put_work(work);
}
return 0;
@@ -214,7 +212,6 @@ static void free_cm_id(struct iwcm_id_private *cm_id_priv)
static bool iwcm_deref_id(struct iwcm_id_private *cm_id_priv)
{
if (refcount_dec_and_test(&cm_id_priv->refcount)) {
- BUG_ON(!list_empty(&cm_id_priv->work_list));
free_cm_id(cm_id_priv);
return true;
}
@@ -261,7 +258,6 @@ struct iw_cm_id *iw_create_cm_id(struct ib_device *device,
refcount_set(&cm_id_priv->refcount, 1);
init_waitqueue_head(&cm_id_priv->connect_wait);
init_completion(&cm_id_priv->destroy_comp);
- INIT_LIST_HEAD(&cm_id_priv->work_list);
INIT_LIST_HEAD(&cm_id_priv->work_free_list);
return &cm_id_priv->id;
@@ -1008,13 +1004,13 @@ static int process_event(struct iwcm_id_private *cm_id_priv,
}
/*
- * Process events on the work_list for the cm_id. If the callback
- * function requests that the cm_id be deleted, a flag is set in the
- * cm_id flags to indicate that when the last reference is
- * removed, the cm_id is to be destroyed. This is necessary to
- * distinguish between an object that will be destroyed by the app
- * thread asleep on the destroy_comp list vs. an object destroyed
- * here synchronously when the last reference is removed.
+ * Process events for the cm_id. If the callback function requests
+ * that the cm_id be deleted, a flag is set in the cm_id flags to
+ * indicate that when the last reference is removed, the cm_id is
+ * to be destroyed. This is necessary to distinguish between an
+ * object that will be destroyed by the app thread asleep on the
+ * destroy_comp list vs. an object destroyed here synchronously
+ * when the last reference is removed.
*/
static void cm_work_handler(struct work_struct *_work)
{
@@ -1025,35 +1021,26 @@ static void cm_work_handler(struct work_struct *_work)
int ret = 0;
spin_lock_irqsave(&cm_id_priv->lock, flags);
- while (!list_empty(&cm_id_priv->work_list)) {
- work = list_first_entry(&cm_id_priv->work_list,
- struct iwcm_work, list);
- list_del_init(&work->list);
- levent = work->event;
- put_work(work);
- spin_unlock_irqrestore(&cm_id_priv->lock, flags);
-
- if (!test_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags)) {
- ret = process_event(cm_id_priv, &levent);
- if (ret) {
- destroy_cm_id(&cm_id_priv->id);
- WARN_ON_ONCE(iwcm_deref_id(cm_id_priv));
- }
- } else
- pr_debug("dropping event %d\n", levent.event);
- if (iwcm_deref_id(cm_id_priv))
- return;
- spin_lock_irqsave(&cm_id_priv->lock, flags);
- }
+ levent = work->event;
+ put_work(work);
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ if (!test_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags)) {
+ ret = process_event(cm_id_priv, &levent);
+ if (ret) {
+ destroy_cm_id(&cm_id_priv->id);
+ WARN_ON_ONCE(iwcm_deref_id(cm_id_priv));
+ }
+ } else
+ pr_debug("dropping event %d\n", levent.event);
+ if (iwcm_deref_id(cm_id_priv))
+ return;
}
/*
* This function is called on interrupt context. Schedule events on
* the iwcm_wq thread to allow callback functions to downcall into
- * the CM and/or block. Events are queued to a per-CM_ID
- * work_list. If this is the first event on the work_list, the work
- * element is also queued on the iwcm_wq thread.
+ * the CM and/or block.
*
* Each event holds a reference on the cm_id. Until the last posted
* event has been delivered and processed, the cm_id cannot be
@@ -1095,7 +1082,6 @@ static int cm_event_handler(struct iw_cm_id *cm_id,
}
refcount_inc(&cm_id_priv->refcount);
- list_add_tail(&work->list, &cm_id_priv->work_list);
queue_work(iwcm_wq, &work->work);
out:
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
diff --git a/drivers/infiniband/core/iwcm.h b/drivers/infiniband/core/iwcm.h
index bf74639be128..b56fb12edece 100644
--- a/drivers/infiniband/core/iwcm.h
+++ b/drivers/infiniband/core/iwcm.h
@@ -50,7 +50,6 @@ struct iwcm_id_private {
struct ib_qp *qp;
struct completion destroy_comp;
wait_queue_head_t connect_wait;
- struct list_head work_list;
spinlock_t lock;
refcount_t refcount;
struct list_head work_free_list;
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,332 @@
From 3a8d8e4f0e85b288fc43ec65f8ffac858af16239 Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace@redhat.com>
Date: Wed, 3 Jun 2026 15:16:45 +0200
Subject: [PATCH] binder: use cred instead of task for selinux checks
JIRA: https://issues.redhat.com/browse/RHEL-179443
CVE: CVE-2026-46054
commit 52f88693378a58094c538662ba652aff0253c4fe
Author: Todd Kjos <tkjos@google.com>
Date: Tue Oct 12 09:56:13 2021 -0700
binder: use cred instead of task for selinux checks
Since binder was integrated with selinux, it has passed
'struct task_struct' associated with the binder_proc
to represent the source and target of transactions.
The conversion of task to SID was then done in the hook
implementations. It turns out that there are race conditions
which can result in an incorrect security context being used.
Fix by using the 'struct cred' saved during binder_open and pass
it to the selinux subsystem.
Cc: stable@vger.kernel.org # 5.14 (need backport for earlier stables)
Fixes: 79af73079d75 ("Add security hooks to binder and implement the hooks for SELinux.")
Suggested-by: Jann Horn <jannh@google.com>
Signed-off-by: Todd Kjos <tkjos@google.com>
Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index 4ef4e2dc47cb..3a01e1862d9e 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -2049,7 +2049,7 @@ static int binder_translate_binder(struct flat_binder_object *fp,
ret = -EINVAL;
goto done;
}
- if (security_binder_transfer_binder(proc->tsk, target_proc->tsk)) {
+ if (security_binder_transfer_binder(proc->cred, target_proc->cred)) {
ret = -EPERM;
goto done;
}
@@ -2095,7 +2095,7 @@ static int binder_translate_handle(struct flat_binder_object *fp,
proc->pid, thread->pid, fp->handle);
return -EINVAL;
}
- if (security_binder_transfer_binder(proc->tsk, target_proc->tsk)) {
+ if (security_binder_transfer_binder(proc->cred, target_proc->cred)) {
ret = -EPERM;
goto done;
}
@@ -2183,7 +2183,7 @@ static int binder_translate_fd(u32 fd, binder_size_t fd_offset,
ret = -EBADF;
goto err_fget;
}
- ret = security_binder_transfer_file(proc->tsk, target_proc->tsk, file);
+ ret = security_binder_transfer_file(proc->cred, target_proc->cred, file);
if (ret < 0) {
ret = -EPERM;
goto err_security;
@@ -2588,8 +2588,8 @@ static void binder_transaction(struct binder_proc *proc,
return_error_line = __LINE__;
goto err_invalid_target_handle;
}
- if (security_binder_transaction(proc->tsk,
- target_proc->tsk) < 0) {
+ if (security_binder_transaction(proc->cred,
+ target_proc->cred) < 0) {
return_error = BR_FAILED_REPLY;
return_error_param = -EPERM;
return_error_line = __LINE__;
@@ -4554,7 +4554,7 @@ static int binder_ioctl_set_ctx_mgr(struct file *filp,
ret = -EBUSY;
goto out;
}
- ret = security_binder_set_context_mgr(proc->tsk);
+ ret = security_binder_set_context_mgr(proc->cred);
if (ret < 0)
goto out;
if (uid_valid(context->binder_context_mgr_uid)) {
diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index fc89fae1ea60..1c2be7057bd9 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -26,13 +26,13 @@
* #undef LSM_HOOK
* };
*/
-LSM_HOOK(int, 0, binder_set_context_mgr, struct task_struct *mgr)
-LSM_HOOK(int, 0, binder_transaction, struct task_struct *from,
- struct task_struct *to)
-LSM_HOOK(int, 0, binder_transfer_binder, struct task_struct *from,
- struct task_struct *to)
-LSM_HOOK(int, 0, binder_transfer_file, struct task_struct *from,
- struct task_struct *to, struct file *file)
+LSM_HOOK(int, 0, binder_set_context_mgr, const struct cred *mgr)
+LSM_HOOK(int, 0, binder_transaction, const struct cred *from,
+ const struct cred *to)
+LSM_HOOK(int, 0, binder_transfer_binder, const struct cred *from,
+ const struct cred *to)
+LSM_HOOK(int, 0, binder_transfer_file, const struct cred *from,
+ const struct cred *to, struct file *file)
LSM_HOOK(int, 0, ptrace_access_check, struct task_struct *child,
unsigned int mode)
LSM_HOOK(int, 0, ptrace_traceme, struct task_struct *parent)
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 3f04476cc692..7577ecfc79e4 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -1330,22 +1330,22 @@
*
* @binder_set_context_mgr:
* Check whether @mgr is allowed to be the binder context manager.
- * @mgr contains the task_struct for the task being registered.
+ * @mgr contains the struct cred for the current binder process.
* Return 0 if permission is granted.
* @binder_transaction:
* Check whether @from is allowed to invoke a binder transaction call
* to @to.
- * @from contains the task_struct for the sending task.
- * @to contains the task_struct for the receiving task.
+ * @from contains the struct cred for the sending process.
+ * @to contains the struct cred for the receiving process.
* @binder_transfer_binder:
* Check whether @from is allowed to transfer a binder reference to @to.
- * @from contains the task_struct for the sending task.
- * @to contains the task_struct for the receiving task.
+ * @from contains the struct cred for the sending process.
+ * @to contains the struct cred for the receiving process.
* @binder_transfer_file:
* Check whether @from is allowed to transfer @file to @to.
- * @from contains the task_struct for the sending task.
+ * @from contains the struct cred for the sending process.
* @file contains the struct file being transferred.
- * @to contains the task_struct for the receiving task.
+ * @to contains the struct cred for the receiving process.
*
* @ptrace_access_check:
* Check permission before allowing the current process to trace the
diff --git a/include/linux/security.h b/include/linux/security.h
index 16f44e78b7e6..3d216c94fd69 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -263,13 +263,13 @@ extern int security_init(void);
extern int early_security_init(void);
/* Security operations */
-int security_binder_set_context_mgr(struct task_struct *mgr);
-int security_binder_transaction(struct task_struct *from,
- struct task_struct *to);
-int security_binder_transfer_binder(struct task_struct *from,
- struct task_struct *to);
-int security_binder_transfer_file(struct task_struct *from,
- struct task_struct *to, struct file *file);
+int security_binder_set_context_mgr(const struct cred *mgr);
+int security_binder_transaction(const struct cred *from,
+ const struct cred *to);
+int security_binder_transfer_binder(const struct cred *from,
+ const struct cred *to);
+int security_binder_transfer_file(const struct cred *from,
+ const struct cred *to, struct file *file);
int security_ptrace_access_check(struct task_struct *child, unsigned int mode);
int security_ptrace_traceme(struct task_struct *parent);
int security_capget(struct task_struct *target,
@@ -520,25 +520,25 @@ static inline int early_security_init(void)
return 0;
}
-static inline int security_binder_set_context_mgr(struct task_struct *mgr)
+static inline int security_binder_set_context_mgr(const struct cred *mgr)
{
return 0;
}
-static inline int security_binder_transaction(struct task_struct *from,
- struct task_struct *to)
+static inline int security_binder_transaction(const struct cred *from,
+ const struct cred *to)
{
return 0;
}
-static inline int security_binder_transfer_binder(struct task_struct *from,
- struct task_struct *to)
+static inline int security_binder_transfer_binder(const struct cred *from,
+ const struct cred *to)
{
return 0;
}
-static inline int security_binder_transfer_file(struct task_struct *from,
- struct task_struct *to,
+static inline int security_binder_transfer_file(const struct cred *from,
+ const struct cred *to,
struct file *file)
{
return 0;
diff --git a/security/security.c b/security/security.c
index 5660bbab9845..2092b657af9f 100644
--- a/security/security.c
+++ b/security/security.c
@@ -887,25 +887,25 @@ OUT: \
/* Security operations */
-int security_binder_set_context_mgr(struct task_struct *mgr)
+int security_binder_set_context_mgr(const struct cred *mgr)
{
return call_int_hook(binder_set_context_mgr, mgr);
}
-int security_binder_transaction(struct task_struct *from,
- struct task_struct *to)
+int security_binder_transaction(const struct cred *from,
+ const struct cred *to)
{
return call_int_hook(binder_transaction, from, to);
}
-int security_binder_transfer_binder(struct task_struct *from,
- struct task_struct *to)
+int security_binder_transfer_binder(const struct cred *from,
+ const struct cred *to)
{
return call_int_hook(binder_transfer_binder, from, to);
}
-int security_binder_transfer_file(struct task_struct *from,
- struct task_struct *to, struct file *file)
+int security_binder_transfer_file(const struct cred *from,
+ const struct cred *to, struct file *file)
{
return call_int_hook(binder_transfer_file, from, to, file);
}
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 9a69a2a4b31d..22173e8e88e2 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -245,29 +245,6 @@ static inline u32 task_sid_obj(const struct task_struct *task)
return sid;
}
-/*
- * get the security ID of a task for use with binder
- */
-static inline u32 task_sid_binder(const struct task_struct *task)
-{
- /*
- * In many case where this function is used we should be using the
- * task's subjective SID, but we can't reliably access the subjective
- * creds of a task other than our own so we must use the objective
- * creds/SID, which are safe to access. The downside is that if a task
- * is temporarily overriding it's creds it will not be reflected here;
- * however, it isn't clear that binder would handle that case well
- * anyway.
- *
- * If this ever changes and we can safely reference the subjective
- * creds/SID of another task, this function will make it easier to
- * identify the various places where we make use of the task SIDs in
- * the binder code. It is also likely that we will need to adjust
- * the main drivers/android binder code as well.
- */
- return task_sid_obj(task);
-}
-
static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dentry);
/*
@@ -2039,18 +2016,19 @@ static inline u32 open_file_to_av(struct file *file)
/* Hook functions begin here. */
-static int selinux_binder_set_context_mgr(struct task_struct *mgr)
+static int selinux_binder_set_context_mgr(const struct cred *mgr)
{
return avc_has_perm(&selinux_state,
- current_sid(), task_sid_binder(mgr), SECCLASS_BINDER,
+ current_sid(), cred_sid(mgr), SECCLASS_BINDER,
BINDER__SET_CONTEXT_MGR, NULL);
}
-static int selinux_binder_transaction(struct task_struct *from,
- struct task_struct *to)
+static int selinux_binder_transaction(const struct cred *from,
+ const struct cred *to)
{
u32 mysid = current_sid();
- u32 fromsid = task_sid_binder(from);
+ u32 fromsid = cred_sid(from);
+ u32 tosid = cred_sid(to);
int rc;
if (mysid != fromsid) {
@@ -2061,24 +2039,24 @@ static int selinux_binder_transaction(struct task_struct *from,
return rc;
}
- return avc_has_perm(&selinux_state, fromsid, task_sid_binder(to),
+ return avc_has_perm(&selinux_state, fromsid, tosid,
SECCLASS_BINDER, BINDER__CALL, NULL);
}
-static int selinux_binder_transfer_binder(struct task_struct *from,
- struct task_struct *to)
+static int selinux_binder_transfer_binder(const struct cred *from,
+ const struct cred *to)
{
return avc_has_perm(&selinux_state,
- task_sid_binder(from), task_sid_binder(to),
+ cred_sid(from), cred_sid(to),
SECCLASS_BINDER, BINDER__TRANSFER,
NULL);
}
-static int selinux_binder_transfer_file(struct task_struct *from,
- struct task_struct *to,
+static int selinux_binder_transfer_file(const struct cred *from,
+ const struct cred *to,
struct file *file)
{
- u32 sid = task_sid_binder(to);
+ u32 sid = cred_sid(to);
struct file_security_struct *fsec = selinux_file(file);
struct dentry *dentry = file->f_path.dentry;
struct inode_security_struct *isec;
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,115 @@
From 16ac6be5ae309b2c31c37898511025294dcadcc8 Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace@redhat.com>
Date: Wed, 3 Jun 2026 10:30:04 +0200
Subject: [PATCH] locks: fix TOCTOU race when granting write lease
JIRA: https://issues.redhat.com/browse/RHEL-179443
CVE: CVE-2026-46054
commit d6da19c9cace63290ccfccb1fc35151ffefc0bec
Author: Amir Goldstein <amir73il@gmail.com>
Date: Tue Aug 16 17:53:17 2022 +0300
locks: fix TOCTOU race when granting write lease
Thread A trying to acquire a write lease checks the value of i_readcount
and i_writecount in check_conflicting_open() to verify that its own fd
is the only fd referencing the file.
Thread B trying to open the file for read will call break_lease() in
do_dentry_open() before incrementing i_readcount, which leaves a small
window where thread A can acquire the write lease and then thread B
completes the open of the file for read without breaking the write lease
that was acquired by thread A.
Fix this race by incrementing i_readcount before checking for existing
leases, same as the case with i_writecount.
Use a helper put_file_access() to decrement i_readcount or i_writecount
in do_dentry_open() and __fput().
Fixes: 387e3746d01c ("locks: eliminate false positive conflicts for write lease")
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
diff --git a/fs/file_table.c b/fs/file_table.c
index cdc1dea33154..845c741dc518 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -366,12 +366,7 @@ static void __fput(struct file *file)
}
fops_put(file->f_op);
put_pid(file->f_owner.pid);
- if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
- i_readcount_dec(inode);
- if (mode & FMODE_WRITER) {
- put_write_access(inode);
- __mnt_drop_write(mnt);
- }
+ put_file_access(file);
dput(dentry);
if (unlikely(mode & FMODE_NEED_UNMOUNT))
dissolve_on_fput(mnt);
diff --git a/fs/internal.h b/fs/internal.h
index 3e8dbf777ce2..c3701d285c69 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -96,6 +96,16 @@ extern void chroot_fs_refs(const struct path *, const struct path *);
extern struct file *alloc_empty_file(int, const struct cred *);
extern struct file *alloc_empty_file_noaccount(int, const struct cred *);
+static inline void put_file_access(struct file *file)
+{
+ if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) {
+ i_readcount_dec(file->f_inode);
+ } else if (file->f_mode & FMODE_WRITER) {
+ put_write_access(file->f_inode);
+ __mnt_drop_write(file->f_path.mnt);
+ }
+}
+
/*
* super.c
*/
diff --git a/fs/open.c b/fs/open.c
index 51052202ecdc..a84909d62168 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -861,7 +861,9 @@ static int do_dentry_open(struct file *f,
return 0;
}
- if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
+ if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) {
+ i_readcount_inc(inode);
+ } else if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
error = get_write_access(inode);
if (unlikely(error))
goto cleanup_file;
@@ -901,8 +903,6 @@ static int do_dentry_open(struct file *f,
goto cleanup_all;
}
f->f_mode |= FMODE_OPENED;
- if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
- i_readcount_inc(inode);
if ((f->f_mode & FMODE_READ) &&
likely(f->f_op->read || f->f_op->read_iter))
f->f_mode |= FMODE_CAN_READ;
@@ -948,10 +948,7 @@ static int do_dentry_open(struct file *f,
if (WARN_ON_ONCE(error > 0))
error = -EINVAL;
fops_put(f->f_op);
- if (f->f_mode & FMODE_WRITER) {
- put_write_access(inode);
- __mnt_drop_write(f->f_path.mnt);
- }
+ put_file_access(f);
cleanup_file:
path_put(&f->f_path);
f->f_path.mnt = NULL;
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,122 @@
From f3196ef468589e50d09ad0edc6d2874c267418e2 Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace@redhat.com>
Date: Wed, 3 Jun 2026 09:46:36 +0200
Subject: [PATCH] fs: use a helper for opening kernel internal files
JIRA: https://issues.redhat.com/browse/RHEL-179443
CVE: CVE-2026-46054
Conflicts:
- include/linux/fs.h: context fuzz
- fs/overlayfs/util.c: previous backport introduced an
open_with_fake_path() caller that also needs to be renamed here
commit cbb0b9d4bbcfa96e7872808a63be03202536f1bc
Author: Amir Goldstein <amir73il@gmail.com>
Date: Thu Jun 15 14:22:26 2023 +0300
fs: use a helper for opening kernel internal files
cachefiles uses kernel_open_tmpfile() to open kernel internal tmpfile
without accounting for nr_files.
cachefiles uses open_with_fake_path() for the same reason without the
need for a fake path.
Fork open_with_fake_path() to kernel_file_open() which only does the
noaccount part and use it in cachefiles.
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Message-Id: <20230615112229.2143178-3-amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 6f5c59baec08..bc2bb2001318 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -560,8 +560,8 @@ static bool cachefiles_open_file(struct cachefiles_object *object,
*/
path.mnt = cache->mnt;
path.dentry = dentry;
- file = open_with_fake_path(&path, O_RDWR | O_LARGEFILE | O_DIRECT,
- d_backing_inode(dentry), cache->cache_cred);
+ file = kernel_file_open(&path, O_RDWR | O_LARGEFILE | O_DIRECT,
+ d_backing_inode(dentry), cache->cache_cred);
if (IS_ERR(file)) {
trace_cachefiles_vfs_error(object, d_backing_inode(dentry),
PTR_ERR(file),
diff --git a/fs/open.c b/fs/open.c
index a84909d62168..3eac96e10eb0 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1089,6 +1089,39 @@ struct file *dentry_create(const struct path *path, int flags, umode_t mode,
}
EXPORT_SYMBOL(dentry_create);
+/**
+ * kernel_file_open - open a file for kernel internal use
+ * @path: path of the file to open
+ * @flags: open flags
+ * @inode: the inode
+ * @cred: credentials for open
+ *
+ * Open a file for use by in-kernel consumers. The file is not accounted
+ * against nr_files and must not be installed into the file descriptor
+ * table.
+ *
+ * Return: Opened file on success, an error pointer on failure.
+ */
+struct file *kernel_file_open(const struct path *path, int flags,
+ struct inode *inode, const struct cred *cred)
+{
+ struct file *f;
+ int error;
+
+ f = alloc_empty_file_noaccount(flags, cred);
+ if (IS_ERR(f))
+ return f;
+
+ f->f_path = *path;
+ error = do_dentry_open(f, inode, NULL);
+ if (error) {
+ fput(f);
+ f = ERR_PTR(error);
+ }
+ return f;
+}
+EXPORT_SYMBOL_GPL(kernel_file_open);
+
struct file *open_with_fake_path(const struct path *path, int flags,
struct inode *inode, const struct cred *cred)
{
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 2cdbb70d2b5d..6b31b6587e4d 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -1370,7 +1370,7 @@ int ovl_ensure_verity_loaded(struct path *datapath)
* If this inode was not yet opened, the verity info hasn't been
* loaded yet, so we need to do that here to force it into memory.
*/
- filp = open_with_fake_path(datapath, O_RDONLY, inode, current_cred());
+ filp = kernel_file_open(datapath, O_RDONLY, inode, current_cred());
if (IS_ERR(filp))
return PTR_ERR(filp);
fput(filp);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index be94651061c1..363cdadb04ba 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1833,6 +1833,8 @@ static inline int vfs_whiteout(struct mnt_idmap *idmap,
struct file *vfs_tmpfile_open(struct mnt_idmap *idmap,
const struct path *parentpath,
umode_t mode, int open_flag, const struct cred *cred);
+struct file *kernel_file_open(const struct path *path, int flags,
+ struct inode *inode, const struct cred *cred);
int vfs_mkobj(struct dentry *, umode_t,
int (*f)(struct dentry *, umode_t, void *),
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,124 @@
From f226635eb71a0c5f680f89a64ec6332e5b2f8ee7 Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace@redhat.com>
Date: Wed, 3 Jun 2026 15:22:00 +0200
Subject: [PATCH] fs: move kmem_cache_zalloc() into alloc_empty_file*() helpers
JIRA: https://issues.redhat.com/browse/RHEL-179443
CVE: CVE-2026-46054
Conflicts:
- only context fuzz
commit 8a05a8c31d06c5d0d67b273a4a00f87269adde82
Author: Amir Goldstein <amir73il@gmail.com>
Date: Thu Jun 15 14:22:27 2023 +0300
fs: move kmem_cache_zalloc() into alloc_empty_file*() helpers
Use a common helper init_file() instead of __alloc_file() for
alloc_empty_file*() helpers and improrve the documentation.
This is needed for a follow up patch that allocates a backing_file
container.
Suggested-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Message-Id: <20230615112229.2143178-4-amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
diff --git a/fs/file_table.c b/fs/file_table.c
index 845c741dc518..9fee3de138d6 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -131,20 +131,15 @@ static int __init init_fs_stat_sysctls(void)
fs_initcall(init_fs_stat_sysctls);
#endif
-static struct file *__alloc_file(int flags, const struct cred *cred)
+static int init_file(struct file *f, int flags, const struct cred *cred)
{
- struct file *f;
int error;
- f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
- if (unlikely(!f))
- return ERR_PTR(-ENOMEM);
-
f->f_cred = get_cred(cred);
error = security_file_alloc(f);
if (unlikely(error)) {
file_free_rcu(&f->f_u.fu_rcuhead);
- return ERR_PTR(error);
+ return error;
}
atomic_long_set(&f->f_count, 1);
@@ -155,7 +150,7 @@ static struct file *__alloc_file(int flags, const struct cred *cred)
f->f_mode = OPEN_FMODE(flags);
/* f->f_version: 0 */
- return f;
+ return 0;
}
/* Find an unused file structure and return a pointer to it.
@@ -172,6 +167,7 @@ struct file *alloc_empty_file(int flags, const struct cred *cred)
{
static long old_max;
struct file *f;
+ int error;
/*
* Privileged users can go above max_files
@@ -185,9 +181,15 @@ struct file *alloc_empty_file(int flags, const struct cred *cred)
goto over;
}
- f = __alloc_file(flags, cred);
- if (!IS_ERR(f))
- percpu_counter_inc(&nr_files);
+ f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
+ if (unlikely(!f))
+ return ERR_PTR(-ENOMEM);
+
+ error = init_file(f, flags, cred);
+ if (unlikely(error))
+ return ERR_PTR(error);
+
+ percpu_counter_inc(&nr_files);
return f;
@@ -203,14 +205,23 @@ struct file *alloc_empty_file(int flags, const struct cred *cred)
/*
* Variant of alloc_empty_file() that doesn't check and modify nr_files.
*
- * Should not be used unless there's a very good reason to do so.
+ * This is only for kernel internal use, and the allocate file must not be
+ * installed into file tables or such.
*/
struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred)
{
- struct file *f = __alloc_file(flags, cred);
+ struct file *f;
+ int error;
+
+ f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
+ if (unlikely(!f))
+ return ERR_PTR(-ENOMEM);
+
+ error = init_file(f, flags, cred);
+ if (unlikely(error))
+ return ERR_PTR(error);
- if (!IS_ERR(f))
- f->f_mode |= FMODE_NOACCOUNT;
+ f->f_mode |= FMODE_NOACCOUNT;
return f;
}
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,253 @@
From 9e62303890ee2ae993e8c78bb442176d2467e927 Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace@redhat.com>
Date: Wed, 3 Jun 2026 09:49:05 +0200
Subject: [PATCH] fs: use backing_file container for internal files with "fake"
f_path
JIRA: https://issues.redhat.com/browse/RHEL-179443
CVE: CVE-2026-46054
commit 62d53c4a1dfe347bd87ede46ffad38c9a3870338
Author: Amir Goldstein <amir73il@gmail.com>
Date: Thu Jun 15 14:22:28 2023 +0300
fs: use backing_file container for internal files with "fake" f_path
Overlayfs uses open_with_fake_path() to allocate internal kernel files,
with a "fake" path - whose f_path is not on the same fs as f_inode.
Allocate a container struct backing_file for those internal files, that
is used to hold the "fake" ovl path along with the real path.
backing_file_real_path() can be used to access the stored real path.
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Message-Id: <20230615112229.2143178-5-amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
diff --git a/fs/file_table.c b/fs/file_table.c
index 9fee3de138d6..3f02d00c1396 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -44,18 +44,40 @@ static struct kmem_cache *filp_cachep __read_mostly;
static struct percpu_counter nr_files __cacheline_aligned_in_smp;
+/* Container for backing file with optional real path */
+struct backing_file {
+ struct file file;
+ struct path real_path;
+};
+
+static inline struct backing_file *backing_file(struct file *f)
+{
+ return container_of(f, struct backing_file, file);
+}
+
+struct path *backing_file_real_path(struct file *f)
+{
+ return &backing_file(f)->real_path;
+}
+EXPORT_SYMBOL_GPL(backing_file_real_path);
+
static void file_free_rcu(struct rcu_head *head)
{
struct file *f = container_of(head, struct file, f_u.fu_rcuhead);
put_cred(f->f_cred);
- kmem_cache_free(filp_cachep, f);
+ if (unlikely(f->f_mode & FMODE_BACKING))
+ kfree(backing_file(f));
+ else
+ kmem_cache_free(filp_cachep, f);
}
static inline void file_free(struct file *f)
{
security_file_free(f);
- if (!(f->f_mode & FMODE_NOACCOUNT))
+ if (unlikely(f->f_mode & FMODE_BACKING))
+ path_put(backing_file_real_path(f));
+ if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
percpu_counter_dec(&nr_files);
call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
}
@@ -226,6 +248,30 @@ struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred)
return f;
}
+/*
+ * Variant of alloc_empty_file() that allocates a backing_file container
+ * and doesn't check and modify nr_files.
+ *
+ * This is only for kernel internal use, and the allocate file must not be
+ * installed into file tables or such.
+ */
+struct file *alloc_empty_backing_file(int flags, const struct cred *cred)
+{
+ struct backing_file *ff;
+ int error;
+
+ ff = kzalloc(sizeof(struct backing_file), GFP_KERNEL);
+ if (unlikely(!ff))
+ return ERR_PTR(-ENOMEM);
+
+ error = init_file(&ff->file, flags, cred);
+ if (unlikely(error))
+ return ERR_PTR(error);
+
+ ff->file.f_mode |= FMODE_BACKING | FMODE_NOACCOUNT;
+ return &ff->file;
+}
+
/**
* file_init_path - initialize a 'struct file' based on path
*
diff --git a/fs/internal.h b/fs/internal.h
index c3701d285c69..62b558fa6395 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -93,8 +93,9 @@ extern void chroot_fs_refs(const struct path *, const struct path *);
/*
* file_table.c
*/
-extern struct file *alloc_empty_file(int, const struct cred *);
-extern struct file *alloc_empty_file_noaccount(int, const struct cred *);
+struct file *alloc_empty_file(int flags, const struct cred *cred);
+struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred);
+struct file *alloc_empty_backing_file(int flags, const struct cred *cred);
static inline void put_file_access(struct file *file)
{
diff --git a/fs/open.c b/fs/open.c
index 3eac96e10eb0..e2419242456e 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1122,23 +1122,44 @@ struct file *kernel_file_open(const struct path *path, int flags,
}
EXPORT_SYMBOL_GPL(kernel_file_open);
-struct file *open_with_fake_path(const struct path *path, int flags,
- struct inode *inode, const struct cred *cred)
+/**
+ * backing_file_open - open a backing file for kernel internal use
+ * @path: path of the file to open
+ * @flags: open flags
+ * @path: path of the backing file
+ * @cred: credentials for open
+ *
+ * Open a backing file for a stackable filesystem (e.g., overlayfs).
+ * @path may be on the stackable filesystem and backing inode on the
+ * underlying filesystem. In this case, we want to be able to return
+ * the @real_path of the backing inode. This is done by embedding the
+ * returned file into a container structure that also stores the path of
+ * the backing inode on the underlying filesystem, which can be
+ * retrieved using backing_file_real_path().
+ */
+struct file *backing_file_open(const struct path *path, int flags,
+ const struct path *real_path,
+ const struct cred *cred)
{
- struct file *f = alloc_empty_file_noaccount(flags, cred);
- if (!IS_ERR(f)) {
- int error;
+ struct file *f;
+ int error;
- f->f_path = *path;
- error = do_dentry_open(f, inode, NULL);
- if (error) {
- fput(f);
- f = ERR_PTR(error);
- }
+ f = alloc_empty_backing_file(flags, cred);
+ if (IS_ERR(f))
+ return f;
+
+ f->f_path = *path;
+ path_get(real_path);
+ *backing_file_real_path(f) = *real_path;
+ error = do_dentry_open(f, d_inode(real_path->dentry), NULL);
+ if (error) {
+ fput(f);
+ f = ERR_PTR(error);
}
+
return f;
}
-EXPORT_SYMBOL(open_with_fake_path);
+EXPORT_SYMBOL_GPL(backing_file_open);
#define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE))
#define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC)
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 5db89c8de140..99f2ae8e3864 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -65,8 +65,8 @@ static struct file *ovl_open_realfile(const struct file *file,
if (!inode_owner_or_capable(real_idmap, realinode))
flags &= ~O_NOATIME;
- realfile = open_with_fake_path(&file->f_path, flags, realinode,
- current_cred());
+ realfile = backing_file_open(&file->f_path, flags, realpath,
+ current_cred());
}
revert_creds(old_cred);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 363cdadb04ba..48ec31b9d230 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -167,6 +167,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
/* Supports IOCB_HAS_METADATA */
#define FMODE_HAS_METADATA ((__force fmode_t)0x800000)
+/* File is embedded in backing_file object */
+#define FMODE_BACKING ((__force fmode_t)0x2000000)
+
/* File was opened by fanotify and shouldn't generate fanotify events */
#define FMODE_NONOTIFY ((__force fmode_t)0x4000000)
@@ -2579,11 +2582,31 @@ static inline struct file *file_open_root_mnt(struct vfsmount *mnt,
return file_open_root(&(struct path){.mnt = mnt, .dentry = mnt->mnt_root},
name, flags, mode);
}
-extern struct file * dentry_open(const struct path *, int, const struct cred *);
-extern struct file *dentry_create(const struct path *path, int flags,
- umode_t mode, const struct cred *cred);
-extern struct file * open_with_fake_path(const struct path *, int,
- struct inode*, const struct cred *);
+struct file *dentry_open(const struct path *path, int flags,
+ const struct cred *creds);
+struct file *dentry_create(const struct path *path, int flags, umode_t mode,
+ const struct cred *cred);
+struct file *backing_file_open(const struct path *path, int flags,
+ const struct path *real_path,
+ const struct cred *cred);
+struct path *backing_file_real_path(struct file *f);
+
+/*
+ * file_real_path - get the path corresponding to f_inode
+ *
+ * When opening a backing file for a stackable filesystem (e.g.,
+ * overlayfs) f_path may be on the stackable filesystem and f_inode on
+ * the underlying filesystem. When the path associated with f_inode is
+ * needed, this helper should be used instead of accessing f_path
+ * directly.
+*/
+static inline const struct path *file_real_path(struct file *f)
+{
+ if (unlikely(f->f_mode & FMODE_BACKING))
+ return backing_file_real_path(f);
+ return &f->f_path;
+}
+
static inline struct file *file_clone_open(struct file *file)
{
return dentry_open(&file->f_path, file->f_flags, file->f_cred);
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,75 @@
From c030fdbbb0542056bdd257409b781cee8d8b8c39 Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace@redhat.com>
Date: Wed, 3 Jun 2026 09:49:52 +0200
Subject: [PATCH] ovl: enable fsnotify events on underlying real files
JIRA: https://issues.redhat.com/browse/RHEL-179443
CVE: CVE-2026-46054
commit bc2473c90fca55bf95b2ab6af1dacee26a4f92f6
Author: Amir Goldstein <amir73il@gmail.com>
Date: Thu Jun 15 14:22:29 2023 +0300
ovl: enable fsnotify events on underlying real files
Overlayfs creates the real underlying files with fake f_path, whose
f_inode is on the underlying fs and f_path on overlayfs.
Those real files were open with FMODE_NONOTIFY, because fsnotify code was
not prapared to handle fsnotify hooks on files with fake path correctly
and fanotify would report unexpected event->fd with fake overlayfs path,
when the underlying fs was being watched.
Teach fsnotify to handle events on the real files, and do not set real
files to FMODE_NONOTIFY to allow operations on real file (e.g. open,
access, modify, close) to generate async and permission events.
Because fsnotify does not have notifications on address space
operations, we do not need to worry about ->vm_file not reporting
events to a watched overlayfs when users are accessing a mapped
overlayfs file.
Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Message-Id: <20230615112229.2143178-6-amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 99f2ae8e3864..cd0770bb3020 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -38,8 +38,8 @@ static char ovl_whatisit(struct inode *inode, struct inode *realinode)
return 'm';
}
-/* No atime modification nor notify on underlying */
-#define OVL_OPEN_FLAGS (O_NOATIME | FMODE_NONOTIFY)
+/* No atime modification on underlying */
+#define OVL_OPEN_FLAGS (O_NOATIME)
static struct file *ovl_open_realfile(const struct file *file,
const struct path *realpath)
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index bb8467cd11ae..ed48e4f1e755 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -91,11 +91,13 @@ static inline void fsnotify_dentry(struct dentry *dentry, __u32 mask)
static inline int fsnotify_file(struct file *file, __u32 mask)
{
- const struct path *path = &file->f_path;
+ const struct path *path;
if (file->f_mode & FMODE_NONOTIFY)
return 0;
+ /* Overlayfs internal files have fake f_path */
+ path = file_real_path(file);
return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH);
}
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,82 @@
From 119b885de3aa505ccc26df2a8a074555e04be774 Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace@redhat.com>
Date: Wed, 3 Jun 2026 20:28:16 +0200
Subject: [PATCH] fs: move cleanup from init_file() into its callers
JIRA: https://issues.redhat.com/browse/RHEL-179443
Conflicts:
- fs/file_table.c: slightly different argument to file_free_rcu()
downstream, replacement kept the same
commit dff745c1221a402b4921d54f292288373cff500c
Author: Amir Goldstein <amir73il@gmail.com>
Date: Sat Jul 1 20:11:34 2023 +0300
fs: move cleanup from init_file() into its callers
The use of file_free_rcu() in init_file() to free the struct that was
allocated by the caller was hacky and we got what we deserved.
Let init_file() and its callers take care of cleaning up each after
their own allocated resources on error.
Fixes: 62d53c4a1dfe ("fs: use backing_file container for internal files with "fake" f_path") # mainline only
Reported-and-tested-by: syzbot+ada42aab05cf51b00e98@syzkaller.appspotmail.com
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Message-Id: <20230701171134.239409-1-amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
diff --git a/fs/file_table.c b/fs/file_table.c
index 3f02d00c1396..b0a8c2608530 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -160,7 +160,7 @@ static int init_file(struct file *f, int flags, const struct cred *cred)
f->f_cred = get_cred(cred);
error = security_file_alloc(f);
if (unlikely(error)) {
- file_free_rcu(&f->f_u.fu_rcuhead);
+ put_cred(f->f_cred);
return error;
}
@@ -208,8 +208,10 @@ struct file *alloc_empty_file(int flags, const struct cred *cred)
return ERR_PTR(-ENOMEM);
error = init_file(f, flags, cred);
- if (unlikely(error))
+ if (unlikely(error)) {
+ kmem_cache_free(filp_cachep, f);
return ERR_PTR(error);
+ }
percpu_counter_inc(&nr_files);
@@ -240,8 +242,10 @@ struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred)
return ERR_PTR(-ENOMEM);
error = init_file(f, flags, cred);
- if (unlikely(error))
+ if (unlikely(error)) {
+ kmem_cache_free(filp_cachep, f);
return ERR_PTR(error);
+ }
f->f_mode |= FMODE_NOACCOUNT;
@@ -265,8 +269,10 @@ struct file *alloc_empty_backing_file(int flags, const struct cred *cred)
return ERR_PTR(-ENOMEM);
error = init_file(&ff->file, flags, cred);
- if (unlikely(error))
+ if (unlikely(error)) {
+ kfree(ff);
return ERR_PTR(error);
+ }
ff->file.f_mode |= FMODE_BACKING | FMODE_NOACCOUNT;
return &ff->file;
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,119 @@
From f792642489a3af9cec81ca366edffd79fe1d1359 Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace@redhat.com>
Date: Wed, 3 Jun 2026 15:16:52 +0200
Subject: [PATCH] lsm: constify the 'file' parameter in
security_binder_transfer_file()
JIRA: https://issues.redhat.com/browse/RHEL-179443
CVE: CVE-2026-46054
commit 8e4672d6f902d5c4db1e87e8aa9f530149d85bc6
Author: Khadija Kamran <kamrankhadijadj@gmail.com>
Date: Sat Aug 12 20:31:08 2023 +0500
lsm: constify the 'file' parameter in security_binder_transfer_file()
SELinux registers the implementation for the "binder_transfer_file"
hook. Looking at the function implementation we observe that the
parameter "file" is not changing.
Mark the "file" parameter of LSM hook security_binder_transfer_file() as
"const" since it will not be changing in the LSM hook.
Signed-off-by: Khadija Kamran <kamrankhadijadj@gmail.com>
[PM: subject line whitespace fix]
Signed-off-by: Paul Moore <paul@paul-moore.com>
Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 1c2be7057bd9..b6fbb446bab7 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -32,7 +32,7 @@ LSM_HOOK(int, 0, binder_transaction, const struct cred *from,
LSM_HOOK(int, 0, binder_transfer_binder, const struct cred *from,
const struct cred *to)
LSM_HOOK(int, 0, binder_transfer_file, const struct cred *from,
- const struct cred *to, struct file *file)
+ const struct cred *to, const struct file *file)
LSM_HOOK(int, 0, ptrace_access_check, struct task_struct *child,
unsigned int mode)
LSM_HOOK(int, 0, ptrace_traceme, struct task_struct *parent)
diff --git a/include/linux/security.h b/include/linux/security.h
index 3d216c94fd69..d2888c127859 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -269,7 +269,7 @@ int security_binder_transaction(const struct cred *from,
int security_binder_transfer_binder(const struct cred *from,
const struct cred *to);
int security_binder_transfer_file(const struct cred *from,
- const struct cred *to, struct file *file);
+ const struct cred *to, const struct file *file);
int security_ptrace_access_check(struct task_struct *child, unsigned int mode);
int security_ptrace_traceme(struct task_struct *parent);
int security_capget(struct task_struct *target,
@@ -539,7 +539,7 @@ static inline int security_binder_transfer_binder(const struct cred *from,
static inline int security_binder_transfer_file(const struct cred *from,
const struct cred *to,
- struct file *file)
+ const struct file *file)
{
return 0;
}
diff --git a/security/security.c b/security/security.c
index 2092b657af9f..b59af216324f 100644
--- a/security/security.c
+++ b/security/security.c
@@ -905,7 +905,7 @@ int security_binder_transfer_binder(const struct cred *from,
}
int security_binder_transfer_file(const struct cred *from,
- const struct cred *to, struct file *file)
+ const struct cred *to, const struct file *file)
{
return call_int_hook(binder_transfer_file, from, to, file);
}
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 22173e8e88e2..deacc9a63fae 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1703,7 +1703,7 @@ static inline int file_path_has_perm(const struct cred *cred,
}
#ifdef CONFIG_BPF_SYSCALL
-static int bpf_fd_pass(struct file *file, u32 sid);
+static int bpf_fd_pass(const struct file *file, u32 sid);
#endif
/* Check whether a task can use an open file descriptor to
@@ -1976,7 +1976,7 @@ static inline u32 file_mask_to_av(int mode, int mask)
}
/* Convert a Linux file to an access vector. */
-static inline u32 file_to_av(struct file *file)
+static inline u32 file_to_av(const struct file *file)
{
u32 av = 0;
@@ -2054,7 +2054,7 @@ static int selinux_binder_transfer_binder(const struct cred *from,
static int selinux_binder_transfer_file(const struct cred *from,
const struct cred *to,
- struct file *file)
+ const struct file *file)
{
u32 sid = cred_sid(to);
struct file_security_struct *fsec = selinux_file(file);
@@ -6885,7 +6885,7 @@ static u32 bpf_map_fmode_to_av(fmode_t fmode)
* access the bpf object and that's why we have to add this additional check in
* selinux_file_receive and selinux_binder_transfer_files.
*/
-static int bpf_fd_pass(struct file *file, u32 sid)
+static int bpf_fd_pass(const struct file *file, u32 sid)
{
struct bpf_security_struct *bpfsec;
struct bpf_prog *prog;
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,73 @@
From 7adb9b7eb6c32cc5b7cea983ed187cf3f4122cf5 Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace@redhat.com>
Date: Wed, 3 Jun 2026 10:49:48 +0200
Subject: [PATCH] cachefiles: use kiocb_{start,end}_write() helpers
JIRA: https://issues.redhat.com/browse/RHEL-179443
CVE: CVE-2026-46054
commit e6fa4c728fb671765291cca3a905986612c06b6e
Author: Amir Goldstein <amir73il@gmail.com>
Date: Thu Aug 17 17:13:37 2023 +0300
cachefiles: use kiocb_{start,end}_write() helpers
Use helpers instead of the open coded dance to silence lockdep warnings.
Suggested-by: Jan Kara <jack@suse.cz>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Message-Id: <20230817141337.1025891-8-amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c
index 175a25fcade8..009d23cd435b 100644
--- a/fs/cachefiles/io.c
+++ b/fs/cachefiles/io.c
@@ -259,9 +259,7 @@ static void cachefiles_write_complete(struct kiocb *iocb, long ret)
_enter("%ld", ret);
- /* Tell lockdep we inherited freeze protection from submission thread */
- __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
- __sb_end_write(inode->i_sb, SB_FREEZE_WRITE);
+ kiocb_end_write(iocb);
if (ret < 0)
trace_cachefiles_io_error(object, inode, ret,
@@ -286,7 +284,6 @@ int __cachefiles_write(struct cachefiles_object *object,
{
struct cachefiles_cache *cache;
struct cachefiles_kiocb *ki;
- struct inode *inode;
unsigned int old_nofs;
ssize_t ret;
size_t len = iov_iter_count(iter);
@@ -322,19 +319,12 @@ int __cachefiles_write(struct cachefiles_object *object,
ki->iocb.ki_complete = cachefiles_write_complete;
atomic_long_add(ki->b_writing, &cache->b_writing);
- /* Open-code file_start_write here to grab freeze protection, which
- * will be released by another thread in aio_complete_rw(). Fool
- * lockdep by telling it the lock got released so that it doesn't
- * complain about the held lock when we return to userspace.
- */
- inode = file_inode(file);
- __sb_start_write(inode->i_sb, SB_FREEZE_WRITE);
- __sb_writers_release(inode->i_sb, SB_FREEZE_WRITE);
+ kiocb_start_write(&ki->iocb);
get_file(ki->iocb.ki_filp);
cachefiles_grab_object(object, cachefiles_obj_get_ioreq);
- trace_cachefiles_write(object, inode, ki->iocb.ki_pos, len);
+ trace_cachefiles_write(object, file_inode(file), ki->iocb.ki_pos, len);
old_nofs = memalloc_nofs_save();
ret = cachefiles_inject_write_error();
if (ret == 0)
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,188 @@
From fb0111b77bf45d069d7722682a1a1d202fbf5f7d Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace@redhat.com>
Date: Wed, 3 Jun 2026 08:59:20 +0200
Subject: [PATCH] fs: Fix kernel-doc warnings
JIRA: https://issues.redhat.com/browse/RHEL-179443
CVE: CVE-2026-46054
Conflicts:
- fs/fs_context.c: one docstring fix was already applied
commit 35931eb3945b8d38c31f8e956aee3cf31c52121b
Author: Matthew Wilcox (Oracle) <willy@infradead.org>
Date: Fri Aug 18 21:08:24 2023 +0100
fs: Fix kernel-doc warnings
These have a variety of causes and a corresponding variety of solutions.
Signed-off-by: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Message-Id: <20230818200824.2720007-1-willy@infradead.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
diff --git a/fs/file.c b/fs/file.c
index 236e11a38c18..70962ee8c68b 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -679,7 +679,7 @@ EXPORT_SYMBOL(close_fd); /* for ksys_close() */
/**
* last_fd - return last valid index into fd table
- * @cur_fds: files struct
+ * @fdt: File descriptor table.
*
* Context: Either rcu read lock or files_lock must be held.
*
@@ -734,6 +734,7 @@ static inline void __range_close(struct files_struct *cur_fds, unsigned int fd,
*
* @fd: starting file descriptor to close
* @max_fd: last file descriptor to close
+ * @flags: CLOSE_RANGE flags.
*
* This closes a range of file descriptors. All file descriptors
* from @fd up to and including @max_fd are closed.
diff --git a/fs/fs_context.c b/fs/fs_context.c
index 648d2ee9e5fc..3473e63e8399 100644
--- a/fs/fs_context.c
+++ b/fs/fs_context.c
@@ -162,6 +162,10 @@ EXPORT_SYMBOL(vfs_parse_fs_param);
/**
* vfs_parse_fs_string - Convenience function to just parse a string.
+ * @fc: Filesystem context.
+ * @key: Parameter name.
+ * @value: Default value.
+ * @v_size: Maximum number of bytes in the value.
*/
int vfs_parse_fs_string(struct fs_context *fc, const char *key,
const char *value, size_t v_size)
@@ -357,7 +361,7 @@ void fc_drop_locked(struct fs_context *fc)
static void legacy_fs_context_free(struct fs_context *fc);
/**
- * vfs_dup_fc_config: Duplicate a filesystem context.
+ * vfs_dup_fs_context - Duplicate a filesystem context.
* @src_fc: The context to copy.
*/
struct fs_context *vfs_dup_fs_context(struct fs_context *src_fc)
@@ -403,7 +407,9 @@ EXPORT_SYMBOL(vfs_dup_fs_context);
/**
* logfc - Log a message to a filesystem context
- * @fc: The filesystem context to log to.
+ * @log: The filesystem context to log to, or NULL to use printk.
+ * @prefix: A string to prefix the output with, or NULL.
+ * @level: 'w' for a warning, 'e' for an error. Anything else is a notice.
* @fmt: The format of the buffer.
*/
void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, ...)
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 088462ee5a81..64776891120c 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -109,9 +109,6 @@ static int ioctl_fibmap(struct file *filp, int __user *p)
* Returns 0 on success, -errno on error, 1 if this was the last
* extent that will fit in user array.
*/
-#define SET_UNKNOWN_FLAGS (FIEMAP_EXTENT_DELALLOC)
-#define SET_NO_UNMOUNTED_IO_FLAGS (FIEMAP_EXTENT_DATA_ENCRYPTED)
-#define SET_NOT_ALIGNED_FLAGS (FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE)
int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical,
u64 phys, u64 len, u32 flags)
{
@@ -127,6 +124,10 @@ int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical,
if (fieinfo->fi_extents_mapped >= fieinfo->fi_extents_max)
return 1;
+#define SET_UNKNOWN_FLAGS (FIEMAP_EXTENT_DELALLOC)
+#define SET_NO_UNMOUNTED_IO_FLAGS (FIEMAP_EXTENT_DATA_ENCRYPTED)
+#define SET_NOT_ALIGNED_FLAGS (FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE)
+
if (flags & SET_UNKNOWN_FLAGS)
flags |= FIEMAP_EXTENT_UNKNOWN;
if (flags & SET_NO_UNMOUNTED_IO_FLAGS)
@@ -913,6 +914,9 @@ SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
#ifdef CONFIG_COMPAT
/**
* compat_ptr_ioctl - generic implementation of .compat_ioctl file operation
+ * @file: The file to operate on.
+ * @cmd: The ioctl command number.
+ * @arg: The argument to the ioctl.
*
* This is not normally called as a function, but instead set in struct
* file_operations as
diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c
index 5d826274570c..c429c42a6867 100644
--- a/fs/kernel_read_file.c
+++ b/fs/kernel_read_file.c
@@ -8,16 +8,16 @@
/**
* kernel_read_file() - read file contents into a kernel buffer
*
- * @file file to read from
- * @offset where to start reading from (see below).
- * @buf pointer to a "void *" buffer for reading into (if
+ * @file: file to read from
+ * @offset: where to start reading from (see below).
+ * @buf: pointer to a "void *" buffer for reading into (if
* *@buf is NULL, a buffer will be allocated, and
* @buf_size will be ignored)
- * @buf_size size of buf, if already allocated. If @buf not
+ * @buf_size: size of buf, if already allocated. If @buf not
* allocated, this is the largest size to allocate.
- * @file_size if non-NULL, the full size of @file will be
+ * @file_size: if non-NULL, the full size of @file will be
* written here.
- * @id the kernel_read_file_id identifying the type of
+ * @id: the kernel_read_file_id identifying the type of
* file contents being read (for LSMs to examine)
*
* @offset must be 0 unless both @buf and @file_size are non-NULL
diff --git a/fs/namei.c b/fs/namei.c
index 0a4b15d9a010..23c73afe57d3 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -644,6 +644,8 @@ static bool nd_alloc_stack(struct nameidata *nd)
/**
* path_connected - Verify that a dentry is below mnt.mnt_root
+ * @mnt: The mountpoint to check.
+ * @dentry: The dentry to check.
*
* Rename can sometimes move a file or directory outside of a bind
* mount, path_connected allows those cases to be detected.
@@ -1083,6 +1085,7 @@ fs_initcall(init_fs_namei_sysctls);
/**
* may_follow_link - Check symlink following for unsafe situations
* @nd: nameidata pathwalk data
+ * @inode: Used for idmapping.
*
* In the case of the sysctl_protected_symlinks sysctl being enabled,
* CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
diff --git a/fs/open.c b/fs/open.c
index e2419242456e..ef2cc51d468c 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1126,7 +1126,7 @@ EXPORT_SYMBOL_GPL(kernel_file_open);
* backing_file_open - open a backing file for kernel internal use
* @path: path of the file to open
* @flags: open flags
- * @path: path of the backing file
+ * @real_path: path of the backing file
* @cred: credentials for open
*
* Open a backing file for a stackable filesystem (e.g., overlayfs).
@@ -1534,7 +1534,7 @@ SYSCALL_DEFINE1(close, unsigned int, fd)
}
/**
- * close_range() - Close all file descriptors in a given range.
+ * sys_close_range() - Close all file descriptors in a given range.
*
* @fd: starting file descriptor to close
* @max_fd: last file descriptor to close
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,300 @@
From 484050dcbd77701cd8a295037aaa6365e0dc2f4e Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace@redhat.com>
Date: Wed, 3 Jun 2026 09:57:52 +0200
Subject: [PATCH] fs: rename __mnt_{want,drop}_write*() helpers
JIRA: https://issues.redhat.com/browse/RHEL-179443
CVE: CVE-2026-46054
Conflicts:
- fs/inode.c: context fuzz
- fs/internal.h: dropped one hunk that modifies a comment in
sb_start_ro_state_change(), which is not present downstream
- fs/namespace.c: needed to rename also exports because of downstream
commit fb4415394e59 ("fs: export mnt_{get,put}_write_access() to modules")
- fs/overlayfs/util.c: previous backport introduced __mnt_...()
callers that also need to be renamed here
commit 3e15dcf77b23b8e9b9b7f3c0d4def8fe9c12c534
Author: Amir Goldstein <amir73il@gmail.com>
Date: Fri Sep 8 16:28:59 2023 +0300
fs: rename __mnt_{want,drop}_write*() helpers
Before exporting these helpers to modules, make their names more
meaningful.
The names mnt_{get,put)_write_access*() were chosen, because they rhyme
with the inode {get,put)_write_access() helpers, which have a very close
meaning for the inode object.
Suggested-by: Christian Brauner <brauner@kernel.org>
Link: https://lore.kernel.org/r/20230817-anfechtbar-ruhelosigkeit-8c6cca8443fc@brauner/
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Message-Id: <20230908132900.2983519-2-amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
diff --git a/fs/inode.c b/fs/inode.c
index 2bc233f8db22..fc484773431b 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1980,7 +1980,7 @@ void touch_atime(const struct path *path)
if (!sb_start_write_trylock(inode->i_sb))
return;
- if (__mnt_want_write(mnt) != 0)
+ if (mnt_get_write_access(mnt) != 0)
goto skip_update;
/*
* File systems can error out when updating inodes if they need to
@@ -1993,7 +1993,7 @@ void touch_atime(const struct path *path)
*/
now = current_time(inode);
update_time(inode, &now, S_ATIME);
- __mnt_drop_write(mnt);
+ mnt_put_write_access(mnt);
skip_update:
sb_end_write(inode->i_sb);
}
@@ -2110,9 +2110,9 @@ static int __file_update_time(struct file *file, struct timespec64 *now,
struct inode *inode = file_inode(file);
/* try to update time settings */
- if (!__mnt_want_write_file(file)) {
+ if (!mnt_get_write_access_file(file)) {
ret = update_time(inode, now, sync_mode);
- __mnt_drop_write_file(file);
+ mnt_put_write_access_file(file);
}
return ret;
diff --git a/fs/internal.h b/fs/internal.h
index 62b558fa6395..85bf69115cf4 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -76,8 +76,8 @@ extern int sb_prepare_remount_readonly(struct super_block *);
extern void __init mnt_init(void);
-extern int __mnt_want_write_file(struct file *);
-extern void __mnt_drop_write_file(struct file *);
+int mnt_get_write_access_file(struct file *file);
+void mnt_put_write_access_file(struct file *file);
extern void dissolve_on_fput(struct vfsmount *);
@@ -103,7 +103,7 @@ static inline void put_file_access(struct file *file)
i_readcount_dec(file->f_inode);
} else if (file->f_mode & FMODE_WRITER) {
put_write_access(file->f_inode);
- __mnt_drop_write(file->f_path.mnt);
+ mnt_put_write_access(file->f_path.mnt);
}
}
diff --git a/fs/namespace.c b/fs/namespace.c
index d032d84d66ed..218f1b77bf56 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -333,16 +333,16 @@ static int mnt_is_readonly(struct vfsmount *mnt)
* can determine when writes are able to occur to a filesystem.
*/
/**
- * __mnt_want_write - get write access to a mount without freeze protection
+ * mnt_get_write_access - get write access to a mount without freeze protection
* @m: the mount on which to take a write
*
* This tells the low-level filesystem that a write is about to be performed to
* it, and makes sure that writes are allowed (mnt it read-write) before
* returning success. This operation does not protect against filesystem being
- * frozen. When the write operation is finished, __mnt_drop_write() must be
+ * frozen. When the write operation is finished, mnt_put_write_access() must be
* called. This is effectively a refcount.
*/
-int __mnt_want_write(struct vfsmount *m)
+int mnt_get_write_access(struct vfsmount *m)
{
struct mount *mnt = real_mount(m);
int ret = 0;
@@ -371,7 +371,7 @@ int __mnt_want_write(struct vfsmount *m)
return ret;
}
-EXPORT_SYMBOL_GPL(__mnt_want_write);
+EXPORT_SYMBOL_GPL(mnt_get_write_access);
/**
* mnt_want_write - get write access to a mount
@@ -387,7 +387,7 @@ int mnt_want_write(struct vfsmount *m)
int ret;
sb_start_write(m->mnt_sb);
- ret = __mnt_want_write(m);
+ ret = mnt_get_write_access(m);
if (ret)
sb_end_write(m->mnt_sb);
return ret;
@@ -395,15 +395,15 @@ int mnt_want_write(struct vfsmount *m)
EXPORT_SYMBOL_GPL(mnt_want_write);
/**
- * __mnt_want_write_file - get write access to a file's mount
+ * mnt_get_write_access_file - get write access to a file's mount
* @file: the file who's mount on which to take a write
*
- * This is like __mnt_want_write, but if the file is already open for writing it
+ * This is like mnt_get_write_access, but if @file is already open for write it
* skips incrementing mnt_writers (since the open file already has a reference)
* and instead only does the check for emergency r/o remounts. This must be
- * paired with __mnt_drop_write_file.
+ * paired with mnt_put_write_access_file.
*/
-int __mnt_want_write_file(struct file *file)
+int mnt_get_write_access_file(struct file *file)
{
if (file->f_mode & FMODE_WRITER) {
/*
@@ -414,7 +414,7 @@ int __mnt_want_write_file(struct file *file)
return -EROFS;
return 0;
}
- return __mnt_want_write(file->f_path.mnt);
+ return mnt_get_write_access(file->f_path.mnt);
}
/**
@@ -431,7 +431,7 @@ int mnt_want_write_file(struct file *file)
int ret;
sb_start_write(file_inode(file)->i_sb);
- ret = __mnt_want_write_file(file);
+ ret = mnt_get_write_access_file(file);
if (ret)
sb_end_write(file_inode(file)->i_sb);
return ret;
@@ -439,20 +439,20 @@ int mnt_want_write_file(struct file *file)
EXPORT_SYMBOL_GPL(mnt_want_write_file);
/**
- * __mnt_drop_write - give up write access to a mount
+ * mnt_put_write_access - give up write access to a mount
* @mnt: the mount on which to give up write access
*
* Tells the low-level filesystem that we are done
* performing writes to it. Must be matched with
- * __mnt_want_write() call above.
+ * mnt_get_write_access() call above.
*/
-void __mnt_drop_write(struct vfsmount *mnt)
+void mnt_put_write_access(struct vfsmount *mnt)
{
preempt_disable();
mnt_dec_writers(real_mount(mnt));
preempt_enable();
}
-EXPORT_SYMBOL_GPL(__mnt_drop_write);
+EXPORT_SYMBOL_GPL(mnt_put_write_access);
/**
* mnt_drop_write - give up write access to a mount
@@ -464,20 +464,20 @@ EXPORT_SYMBOL_GPL(__mnt_drop_write);
*/
void mnt_drop_write(struct vfsmount *mnt)
{
- __mnt_drop_write(mnt);
+ mnt_put_write_access(mnt);
sb_end_write(mnt->mnt_sb);
}
EXPORT_SYMBOL_GPL(mnt_drop_write);
-void __mnt_drop_write_file(struct file *file)
+void mnt_put_write_access_file(struct file *file)
{
if (!(file->f_mode & FMODE_WRITER))
- __mnt_drop_write(file->f_path.mnt);
+ mnt_put_write_access(file->f_path.mnt);
}
void mnt_drop_write_file(struct file *file)
{
- __mnt_drop_write_file(file);
+ mnt_put_write_access_file(file);
sb_end_write(file_inode(file)->i_sb);
}
EXPORT_SYMBOL(mnt_drop_write_file);
diff --git a/fs/open.c b/fs/open.c
index ef2cc51d468c..b75b1ab6305b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -867,7 +867,7 @@ static int do_dentry_open(struct file *f,
error = get_write_access(inode);
if (unlikely(error))
goto cleanup_file;
- error = __mnt_want_write(f->f_path.mnt);
+ error = mnt_get_write_access(f->f_path.mnt);
if (unlikely(error)) {
put_write_access(inode);
goto cleanup_file;
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 6b31b6587e4d..81ef76c77cab 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -21,7 +21,7 @@
int ovl_get_write_access(struct dentry *dentry)
{
struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
- return __mnt_want_write(ovl_upper_mnt(ofs));
+ return mnt_get_write_access(ovl_upper_mnt(ofs));
}
/* Get write access to upper sb - may block if upper sb is frozen */
@@ -40,7 +40,7 @@ int ovl_want_write(struct dentry *dentry)
void ovl_put_write_access(struct dentry *dentry)
{
struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
- __mnt_drop_write(ovl_upper_mnt(ofs));
+ mnt_put_write_access(ovl_upper_mnt(ofs));
}
void ovl_end_write(struct dentry *dentry)
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 37dc2a161f73..0f214b0a0992 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -92,8 +92,8 @@ extern bool __mnt_is_readonly(struct vfsmount *mnt);
extern bool mnt_may_suid(struct vfsmount *mnt);
extern struct vfsmount *clone_private_mount(const struct path *path);
-extern int __mnt_want_write(struct vfsmount *);
-extern void __mnt_drop_write(struct vfsmount *);
+int mnt_get_write_access(struct vfsmount *mnt);
+void mnt_put_write_access(struct vfsmount *mnt);
extern struct vfsmount *fc_mount(struct fs_context *fc);
extern struct vfsmount *fc_mount_longterm(struct fs_context *fc);
diff --git a/kernel/acct.c b/kernel/acct.c
index bbea312b9d76..b3e00389d42d 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -235,7 +235,7 @@ static int acct_on(struct filename *pathname)
filp_close(file, NULL);
return PTR_ERR(internal);
}
- err = __mnt_want_write(internal);
+ err = mnt_get_write_access(internal);
if (err) {
mntput(internal);
kfree(acct);
@@ -260,7 +260,7 @@ static int acct_on(struct filename *pathname)
old = xchg(&ns->bacct, &acct->pin);
mutex_unlock(&acct->lock);
pin_kill(old);
- __mnt_drop_write(mnt);
+ mnt_put_write_access(mnt);
mntput(mnt);
return 0;
}
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,109 @@
From fed3cc66e15fba2f8b48257cfbe82ab6eac11a39 Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace@redhat.com>
Date: Wed, 3 Jun 2026 10:35:52 +0200
Subject: [PATCH] fs: get mnt_writers count for an open backing file's real
path
JIRA: https://issues.redhat.com/browse/RHEL-179443
CVE: CVE-2026-46054
commit 83bc1d294130cc471a89ce10770daa281a93fcb0
Author: Amir Goldstein <amir73il@gmail.com>
Date: Mon Oct 9 18:37:10 2023 +0300
fs: get mnt_writers count for an open backing file's real path
A writeable mapped backing file can perform writes to the real inode.
Therefore, the real path mount must be kept writable so long as the
writable map exists.
This may not be strictly needed for ovelrayfs private upper mount,
but it is correct to take the mnt_writers count in the vfs helper.
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20231009153712.1566422-2-amir73il@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
diff --git a/fs/internal.h b/fs/internal.h
index 85bf69115cf4..29369382249d 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -97,13 +97,20 @@ struct file *alloc_empty_file(int flags, const struct cred *cred);
struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred);
struct file *alloc_empty_backing_file(int flags, const struct cred *cred);
+static inline void file_put_write_access(struct file *file)
+{
+ put_write_access(file->f_inode);
+ mnt_put_write_access(file->f_path.mnt);
+ if (unlikely(file->f_mode & FMODE_BACKING))
+ mnt_put_write_access(backing_file_real_path(file)->mnt);
+}
+
static inline void put_file_access(struct file *file)
{
if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) {
i_readcount_dec(file->f_inode);
} else if (file->f_mode & FMODE_WRITER) {
- put_write_access(file->f_inode);
- mnt_put_write_access(file->f_path.mnt);
+ file_put_write_access(file);
}
}
diff --git a/fs/open.c b/fs/open.c
index b75b1ab6305b..64e4bbd1f28c 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -842,6 +842,30 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
return ksys_fchown(fd, user, group);
}
+static inline int file_get_write_access(struct file *f)
+{
+ int error;
+
+ error = get_write_access(f->f_inode);
+ if (unlikely(error))
+ return error;
+ error = mnt_get_write_access(f->f_path.mnt);
+ if (unlikely(error))
+ goto cleanup_inode;
+ if (unlikely(f->f_mode & FMODE_BACKING)) {
+ error = mnt_get_write_access(backing_file_real_path(f)->mnt);
+ if (unlikely(error))
+ goto cleanup_mnt;
+ }
+ return 0;
+
+cleanup_mnt:
+ mnt_put_write_access(f->f_path.mnt);
+cleanup_inode:
+ put_write_access(f->f_inode);
+ return error;
+}
+
static int do_dentry_open(struct file *f,
struct inode *inode,
int (*open)(struct inode *, struct file *))
@@ -864,14 +888,9 @@ static int do_dentry_open(struct file *f,
if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) {
i_readcount_inc(inode);
} else if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
- error = get_write_access(inode);
+ error = file_get_write_access(f);
if (unlikely(error))
goto cleanup_file;
- error = mnt_get_write_access(f->f_path.mnt);
- if (unlikely(error)) {
- put_write_access(inode);
- goto cleanup_file;
- }
f->f_mode |= FMODE_WRITER;
}
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,153 @@
From f3d228ade9542a4fac0e0d4d2721e2a94f2d6d98 Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace@redhat.com>
Date: Wed, 3 Jun 2026 10:36:39 +0200
Subject: [PATCH] fs: create helper file_user_path() for user displayed mapped
file path
JIRA: https://issues.redhat.com/browse/RHEL-179443
CVE: CVE-2026-46054
commit 08582d678fcf11fc86188f0b92239d3d49667d8e
Author: Amir Goldstein <amir73il@gmail.com>
Date: Mon Oct 9 18:37:11 2023 +0300
fs: create helper file_user_path() for user displayed mapped file path
Overlayfs uses backing files with "fake" overlayfs f_path and "real"
underlying f_inode, in order to use underlying inode aops for mapped
files and to display the overlayfs path in /proc/<pid>/maps.
In preparation for storing the overlayfs "fake" path instead of the
underlying "real" path in struct backing_file, define a noop helper
file_user_path() that returns f_path for now.
Use the new helper in procfs and kernel logs whenever a path of a
mapped file is displayed to users.
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20231009153712.1566422-3-amir73il@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
diff --git a/arch/arc/kernel/troubleshoot.c b/arch/arc/kernel/troubleshoot.c
index 7654c2e42dc0..134c48374ecd 100644
--- a/arch/arc/kernel/troubleshoot.c
+++ b/arch/arc/kernel/troubleshoot.c
@@ -90,10 +90,12 @@ static void show_faulting_vma(unsigned long address)
*/
if (vma) {
char buf[ARC_PATH_MAX];
- char *nm = "?";
+ char *nm = "anon";
if (vma->vm_file) {
- nm = file_path(vma->vm_file, buf, ARC_PATH_MAX-1);
+ /* XXX: can we use %pD below and get rid of buf? */
+ nm = d_path(file_user_path(vma->vm_file), buf,
+ ARC_PATH_MAX-1);
if (IS_ERR(nm))
nm = "?";
}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index dbb251465954..cb79b5f1d459 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2190,7 +2190,7 @@ static int map_files_get_link(struct dentry *dentry, struct path *path)
rc = -ENOENT;
vma = find_exact_vma(mm, vm_start, vm_end);
if (vma && vma->vm_file) {
- *path = vma->vm_file->f_path;
+ *path = *file_user_path(vma->vm_file);
path_get(path);
rc = 0;
}
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 13452b32e2bd..b7e06be41224 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -59,7 +59,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
if (file) {
seq_pad(m, ' ');
- seq_file_path(m, file, "");
+ seq_path(m, file_user_path(file), "");
}
seq_putc(m, '\n');
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index e396e52ca096..6180dc935136 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -295,7 +295,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
if (anon_name)
seq_printf(m, "[anon_shmem:%s]", anon_name->name);
else
- seq_file_path(m, file, "\n");
+ seq_path(m, file_user_path(file), "\n");
goto done;
}
@@ -1952,7 +1952,7 @@ static int show_numa_map(struct seq_file *m, void *v)
if (file) {
seq_puts(m, " file=");
- seq_file_path(m, file, "\n\t= ");
+ seq_path(m, file_user_path(file), "\n\t= ");
} else if (vma_is_initial_heap(vma)) {
seq_puts(m, " heap");
} else if (vma_is_initial_stack(vma)) {
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 4d52623e1bff..a3822c149f12 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -162,7 +162,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
if (file) {
seq_pad(m, ' ');
- seq_file_path(m, file, "");
+ seq_path(m, file_user_path(file), "");
} else if (mm && vma_is_initial_stack(vma)) {
seq_pad(m, ' ');
seq_puts(m, "[stack]");
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 48ec31b9d230..4ee6ed9e2634 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2607,6 +2607,20 @@ static inline const struct path *file_real_path(struct file *f)
return &f->f_path;
}
+/*
+ * file_user_path - get the path to display for memory mapped file
+ *
+ * When mmapping a file on a stackable filesystem (e.g., overlayfs), the file
+ * stored in ->vm_file is a backing file whose f_inode is on the underlying
+ * filesystem. When the mapped file path is displayed to user (e.g. via
+ * /proc/<pid>/maps), this helper should be used to get the path to display
+ * to the user, which is the path of the fd that user has requested to map.
+ */
+static inline const struct path *file_user_path(struct file *f)
+{
+ return &f->f_path;
+}
+
static inline struct file *file_clone_open(struct file *file)
{
return dentry_open(&file->f_path, file->f_flags, file->f_cred);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 0fda3619c425..6d89bc793c96 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -405,7 +405,7 @@ static int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
vmstart = vma->vm_start;
}
if (file) {
- ret = trace_seq_path(s, &file->f_path);
+ ret = trace_seq_path(s, file_user_path(file));
if (ret)
trace_seq_printf(s, "[+0x%lx]",
ip - vmstart);
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,240 @@
From da5c3bc8093871a2c6c90187b645cc29cc0230ef Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace@redhat.com>
Date: Wed, 3 Jun 2026 10:37:33 +0200
Subject: [PATCH] fs: store real path instead of fake path in backing file
f_path
JIRA: https://issues.redhat.com/browse/RHEL-179443
CVE: CVE-2026-46054
Conflicts:
- fs/internal.h: context fuzz
commit def3ae83da02f87005210fa3d448c5dd37ba4105
Author: Amir Goldstein <amir73il@gmail.com>
Date: Mon Oct 9 18:37:12 2023 +0300
fs: store real path instead of fake path in backing file f_path
A backing file struct stores two path's, one "real" path that is referring
to f_inode and one "fake" path, which should be displayed to users in
/proc/<pid>/maps.
There is a lot more potential code that needs to know the "real" path, then
code that needs to know the "fake" path.
Instead of code having to request the "real" path with file_real_path(),
store the "real" path in f_path and require code that needs to know the
"fake" path request it with file_user_path().
Replace the file_real_path() helper with a simple const accessor f_path().
After this change, file_dentry() is not expected to observe any files
with overlayfs f_path and real f_inode, so the call to ->d_real() should
not be needed. Leave the ->d_real() call for now and add an assertion
in ovl_d_real() to catch if we made wrong assumptions.
Suggested-by: Miklos Szeredi <miklos@szeredi.hu>
Link: https://lore.kernel.org/r/CAJfpegtt48eXhhjDFA1ojcHPNKj3Go6joryCPtEFAKpocyBsnw@mail.gmail.com/
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20231009153712.1566422-4-amir73il@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
diff --git a/fs/file_table.c b/fs/file_table.c
index b0a8c2608530..e5c7b9705109 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -44,10 +44,10 @@ static struct kmem_cache *filp_cachep __read_mostly;
static struct percpu_counter nr_files __cacheline_aligned_in_smp;
-/* Container for backing file with optional real path */
+/* Container for backing file with optional user path */
struct backing_file {
struct file file;
- struct path real_path;
+ struct path user_path;
};
static inline struct backing_file *backing_file(struct file *f)
@@ -55,11 +55,11 @@ static inline struct backing_file *backing_file(struct file *f)
return container_of(f, struct backing_file, file);
}
-struct path *backing_file_real_path(struct file *f)
+struct path *backing_file_user_path(struct file *f)
{
- return &backing_file(f)->real_path;
+ return &backing_file(f)->user_path;
}
-EXPORT_SYMBOL_GPL(backing_file_real_path);
+EXPORT_SYMBOL_GPL(backing_file_user_path);
static void file_free_rcu(struct rcu_head *head)
{
@@ -76,7 +76,7 @@ static inline void file_free(struct file *f)
{
security_file_free(f);
if (unlikely(f->f_mode & FMODE_BACKING))
- path_put(backing_file_real_path(f));
+ path_put(backing_file_user_path(f));
if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
percpu_counter_dec(&nr_files);
call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
diff --git a/fs/internal.h b/fs/internal.h
index 29369382249d..bd0934d0521b 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -102,7 +102,7 @@ static inline void file_put_write_access(struct file *file)
put_write_access(file->f_inode);
mnt_put_write_access(file->f_path.mnt);
if (unlikely(file->f_mode & FMODE_BACKING))
- mnt_put_write_access(backing_file_real_path(file)->mnt);
+ mnt_put_write_access(backing_file_user_path(file)->mnt);
}
static inline void put_file_access(struct file *file)
diff --git a/fs/open.c b/fs/open.c
index 64e4bbd1f28c..45547548a0e5 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -853,7 +853,7 @@ static inline int file_get_write_access(struct file *f)
if (unlikely(error))
goto cleanup_inode;
if (unlikely(f->f_mode & FMODE_BACKING)) {
- error = mnt_get_write_access(backing_file_real_path(f)->mnt);
+ error = mnt_get_write_access(backing_file_user_path(f)->mnt);
if (unlikely(error))
goto cleanup_mnt;
}
@@ -1143,20 +1143,19 @@ EXPORT_SYMBOL_GPL(kernel_file_open);
/**
* backing_file_open - open a backing file for kernel internal use
- * @path: path of the file to open
+ * @user_path: path that the user reuqested to open
* @flags: open flags
* @real_path: path of the backing file
* @cred: credentials for open
*
* Open a backing file for a stackable filesystem (e.g., overlayfs).
- * @path may be on the stackable filesystem and backing inode on the
- * underlying filesystem. In this case, we want to be able to return
- * the @real_path of the backing inode. This is done by embedding the
- * returned file into a container structure that also stores the path of
- * the backing inode on the underlying filesystem, which can be
- * retrieved using backing_file_real_path().
+ * @user_path may be on the stackable filesystem and @real_path on the
+ * underlying filesystem. In this case, we want to be able to return the
+ * @user_path of the stackable filesystem. This is done by embedding the
+ * returned file into a container structure that also stores the stacked
+ * file's path, which can be retrieved using backing_file_user_path().
*/
-struct file *backing_file_open(const struct path *path, int flags,
+struct file *backing_file_open(const struct path *user_path, int flags,
const struct path *real_path,
const struct cred *cred)
{
@@ -1167,9 +1166,9 @@ struct file *backing_file_open(const struct path *path, int flags,
if (IS_ERR(f))
return f;
- f->f_path = *path;
- path_get(real_path);
- *backing_file_real_path(f) = *real_path;
+ path_get(user_path);
+ *backing_file_user_path(f) = *user_path;
+ f->f_path = *real_path;
error = do_dentry_open(f, d_inode(real_path->dentry), NULL);
if (error) {
fput(f);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index c49b1e7575d3..0779c8290ec4 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -34,14 +34,22 @@ static struct dentry *ovl_d_real(struct dentry *dentry,
struct dentry *real = NULL, *lower;
int err;
- /* It's an overlay file */
+ /*
+ * vfs is only expected to call d_real() with NULL from d_real_inode()
+ * and with overlay inode from file_dentry() on an overlay file.
+ *
+ * TODO: remove @inode argument from d_real() API, remove code in this
+ * function that deals with non-NULL @inode and remove d_real() call
+ * from file_dentry().
+ */
if (inode && d_inode(dentry) == inode)
return dentry;
+ else if (inode)
+ goto bug;
if (!d_is_reg(dentry)) {
- if (!inode || inode == d_inode(dentry))
- return dentry;
- goto bug;
+ /* d_real_inode() is only relevant for regular files */
+ return dentry;
}
real = ovl_dentry_upper(dentry);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4ee6ed9e2634..1927fcdad989 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2586,26 +2586,10 @@ struct file *dentry_open(const struct path *path, int flags,
const struct cred *creds);
struct file *dentry_create(const struct path *path, int flags, umode_t mode,
const struct cred *cred);
-struct file *backing_file_open(const struct path *path, int flags,
+struct file *backing_file_open(const struct path *user_path, int flags,
const struct path *real_path,
const struct cred *cred);
-struct path *backing_file_real_path(struct file *f);
-
-/*
- * file_real_path - get the path corresponding to f_inode
- *
- * When opening a backing file for a stackable filesystem (e.g.,
- * overlayfs) f_path may be on the stackable filesystem and f_inode on
- * the underlying filesystem. When the path associated with f_inode is
- * needed, this helper should be used instead of accessing f_path
- * directly.
-*/
-static inline const struct path *file_real_path(struct file *f)
-{
- if (unlikely(f->f_mode & FMODE_BACKING))
- return backing_file_real_path(f);
- return &f->f_path;
-}
+struct path *backing_file_user_path(struct file *f);
/*
* file_user_path - get the path to display for memory mapped file
@@ -2618,6 +2602,8 @@ static inline const struct path *file_real_path(struct file *f)
*/
static inline const struct path *file_user_path(struct file *f)
{
+ if (unlikely(f->f_mode & FMODE_BACKING))
+ return backing_file_user_path(f);
return &f->f_path;
}
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index ed48e4f1e755..bcb6609b54b3 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -96,8 +96,7 @@ static inline int fsnotify_file(struct file *file, __u32 mask)
if (file->f_mode & FMODE_NONOTIFY)
return 0;
- /* Overlayfs internal files have fake f_path */
- path = file_real_path(file);
+ path = &file->f_path;
return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH);
}
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,243 @@
From ecbb1b105f3616d4342dba0f840dcf9265dfbea0 Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace@redhat.com>
Date: Wed, 3 Jun 2026 10:42:36 +0200
Subject: [PATCH] fs: prepare for stackable filesystems backing file helpers
JIRA: https://issues.redhat.com/browse/RHEL-179443
CVE: CVE-2026-46054
commit f91a704f7161c2cf0fcd41fa9fbec4355b813fff
Author: Amir Goldstein <amir73il@gmail.com>
Date: Mon Oct 2 17:19:46 2023 +0300
fs: prepare for stackable filesystems backing file helpers
In preparation for factoring out some backing file io helpers from
overlayfs, move backing_file_open() into a new file fs/backing-file.c
and header.
Add a MAINTAINERS entry for stackable filesystems and add a Kconfig
FS_STACK which stackable filesystems need to select.
For now, the backing_file struct, the backing_file alloc/free functions
and the backing_file_real_path() accessor remain internal to file_table.c.
We may change that in the future.
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
diff --git a/MAINTAINERS b/MAINTAINERS
index f5dcab467670..3a29f2d3a2b1 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7475,6 +7475,15 @@ F: fs/mnt_idmapping.c
F: include/linux/mnt_idmapping.*
F: tools/testing/selftests/mount_setattr/
+FILESYSTEMS [STACKABLE]
+M: Miklos Szeredi <miklos@szeredi.hu>
+M: Amir Goldstein <amir73il@gmail.com>
+L: linux-fsdevel@vger.kernel.org
+L: linux-unionfs@vger.kernel.org
+S: Maintained
+F: fs/backing-file.c
+F: include/linux/backing-file.h
+
FINTEK F75375S HARDWARE MONITOR AND FAN CONTROLLER DRIVER
M: Riku Voipio <riku.voipio@iki.fi>
L: linux-hwmon@vger.kernel.org
diff --git a/fs/Kconfig b/fs/Kconfig
index 5378e55f87d3..a9c6fa9cff1f 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -18,6 +18,10 @@ config VALIDATE_FS_PARSER
config FS_IOMAP
bool
+# Stackable filesystems
+config FS_STACK
+ bool
+
config BUFFER_HEAD
bool
diff --git a/fs/Makefile b/fs/Makefile
index 0da17ff145c6..716c9fe04dec 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -41,6 +41,7 @@ obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o
obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o
obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o
+obj-$(CONFIG_FS_STACK) += backing-file.o
obj-$(CONFIG_FS_MBCACHE) += mbcache.o
obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o
obj-$(CONFIG_NFS_COMMON) += nfs_common/
diff --git a/fs/backing-file.c b/fs/backing-file.c
new file mode 100644
index 000000000000..04b33036f709
--- /dev/null
+++ b/fs/backing-file.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Common helpers for stackable filesystems and backing files.
+ *
+ * Copyright (C) 2023 CTERA Networks.
+ */
+
+#include <linux/fs.h>
+#include <linux/backing-file.h>
+
+#include "internal.h"
+
+/**
+ * backing_file_open - open a backing file for kernel internal use
+ * @user_path: path that the user reuqested to open
+ * @flags: open flags
+ * @real_path: path of the backing file
+ * @cred: credentials for open
+ *
+ * Open a backing file for a stackable filesystem (e.g., overlayfs).
+ * @user_path may be on the stackable filesystem and @real_path on the
+ * underlying filesystem. In this case, we want to be able to return the
+ * @user_path of the stackable filesystem. This is done by embedding the
+ * returned file into a container structure that also stores the stacked
+ * file's path, which can be retrieved using backing_file_user_path().
+ */
+struct file *backing_file_open(const struct path *user_path, int flags,
+ const struct path *real_path,
+ const struct cred *cred)
+{
+ struct file *f;
+ int error;
+
+ f = alloc_empty_backing_file(flags, cred);
+ if (IS_ERR(f))
+ return f;
+
+ path_get(user_path);
+ *backing_file_user_path(f) = *user_path;
+ error = vfs_open(real_path, f);
+ if (error) {
+ fput(f);
+ f = ERR_PTR(error);
+ }
+
+ return f;
+}
+EXPORT_SYMBOL_GPL(backing_file_open);
diff --git a/fs/open.c b/fs/open.c
index 45547548a0e5..4260d61560d4 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1141,44 +1141,6 @@ struct file *kernel_file_open(const struct path *path, int flags,
}
EXPORT_SYMBOL_GPL(kernel_file_open);
-/**
- * backing_file_open - open a backing file for kernel internal use
- * @user_path: path that the user reuqested to open
- * @flags: open flags
- * @real_path: path of the backing file
- * @cred: credentials for open
- *
- * Open a backing file for a stackable filesystem (e.g., overlayfs).
- * @user_path may be on the stackable filesystem and @real_path on the
- * underlying filesystem. In this case, we want to be able to return the
- * @user_path of the stackable filesystem. This is done by embedding the
- * returned file into a container structure that also stores the stacked
- * file's path, which can be retrieved using backing_file_user_path().
- */
-struct file *backing_file_open(const struct path *user_path, int flags,
- const struct path *real_path,
- const struct cred *cred)
-{
- struct file *f;
- int error;
-
- f = alloc_empty_backing_file(flags, cred);
- if (IS_ERR(f))
- return f;
-
- path_get(user_path);
- *backing_file_user_path(f) = *user_path;
- f->f_path = *real_path;
- error = do_dentry_open(f, d_inode(real_path->dentry), NULL);
- if (error) {
- fput(f);
- f = ERR_PTR(error);
- }
-
- return f;
-}
-EXPORT_SYMBOL_GPL(backing_file_open);
-
#define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE))
#define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC)
diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig
index 6708e54b0e30..148d9567b5c3 100644
--- a/fs/overlayfs/Kconfig
+++ b/fs/overlayfs/Kconfig
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config OVERLAY_FS
tristate "Overlay filesystem support"
+ select FS_STACK
select EXPORTFS
help
An overlay filesystem combines two filesystems - an 'upper' filesystem
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index cd0770bb3020..634a96a65bfd 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -13,6 +13,7 @@
#include <linux/security.h>
#include <linux/mm.h>
#include <linux/fs.h>
+#include <linux/backing-file.h>
#include "overlayfs.h"
#include "../internal.h" /* for sb_init_dio_done_wq */
diff --git a/include/linux/backing-file.h b/include/linux/backing-file.h
new file mode 100644
index 000000000000..55c9e804f780
--- /dev/null
+++ b/include/linux/backing-file.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Common helpers for stackable filesystems and backing files.
+ *
+ * Copyright (C) 2023 CTERA Networks.
+ */
+
+#ifndef _LINUX_BACKING_FILE_H
+#define _LINUX_BACKING_FILE_H
+
+#include <linux/file.h>
+
+struct file *backing_file_open(const struct path *user_path, int flags,
+ const struct path *real_path,
+ const struct cred *cred);
+
+#endif /* _LINUX_BACKING_FILE_H */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 1927fcdad989..5f3ca25c77e5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2586,9 +2586,6 @@ struct file *dentry_open(const struct path *path, int flags,
const struct cred *creds);
struct file *dentry_create(const struct path *path, int flags, umode_t mode,
const struct cred *cred);
-struct file *backing_file_open(const struct path *user_path, int flags,
- const struct path *real_path,
- const struct cred *cred);
struct path *backing_file_user_path(struct file *f);
/*
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,600 @@
From 37577a7bb4d8c217f23a2b47e8fbe66640e0588d Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace@redhat.com>
Date: Wed, 3 Jun 2026 10:51:36 +0200
Subject: [PATCH] fs: factor out backing_file_{read,write}_iter() helpers
JIRA: https://issues.redhat.com/browse/RHEL-179443
CVE: CVE-2026-46054
Conflicts:
- fs/overlayfs/file.c & fs/backing-file.c: carried over downstream
logic of calling *_start_write()/*_end_write() because of not
backported commits: 269aed7014b3 ("fs: move file_start_write() into
vfs_iter_write()") and 6ae654392bb5 ("fs: move kiocb_start_write()
into vfs_iocb_iter_write()") - those commits were skipped in this
MR, because vfs_iter_write() appears in K-A-B-I stable lists and
changing the convention could break partner modules
commit a6293b3e285cd0d7692141d7981a5f144f0e2f0b
Author: Amir Goldstein <amir73il@gmail.com>
Date: Wed Nov 22 17:48:52 2023 +0200
fs: factor out backing_file_{read,write}_iter() helpers
Overlayfs submits files io to backing files on other filesystems.
Factor out some common helpers to perform io to backing files, into
fs/backing-file.c.
Suggested-by: Miklos Szeredi <miklos@szeredi.hu>
Link: https://lore.kernel.org/r/CAJfpeguhmZbjP3JLqtUy0AdWaHOkAPWeP827BBWwRFEAUgnUcQ@mail.gmail.com
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
diff --git a/fs/backing-file.c b/fs/backing-file.c
index 04b33036f709..6d915a45e288 100644
--- a/fs/backing-file.c
+++ b/fs/backing-file.c
@@ -2,6 +2,9 @@
/*
* Common helpers for stackable filesystems and backing files.
*
+ * Forked from fs/overlayfs/file.c.
+ *
+ * Copyright (C) 2017 Red Hat, Inc.
* Copyright (C) 2023 CTERA Networks.
*/
@@ -46,3 +49,213 @@ struct file *backing_file_open(const struct path *user_path, int flags,
return f;
}
EXPORT_SYMBOL_GPL(backing_file_open);
+
+struct backing_aio {
+ struct kiocb iocb;
+ refcount_t ref;
+ struct kiocb *orig_iocb;
+ /* used for aio completion */
+ void (*end_write)(struct file *);
+ struct work_struct work;
+ long res;
+};
+
+static struct kmem_cache *backing_aio_cachep;
+
+#define BACKING_IOCB_MASK \
+ (IOCB_NOWAIT | IOCB_HIPRI | IOCB_DSYNC | IOCB_SYNC | IOCB_APPEND)
+
+static rwf_t iocb_to_rw_flags(int flags)
+{
+ return (__force rwf_t)(flags & BACKING_IOCB_MASK);
+}
+
+static void backing_aio_put(struct backing_aio *aio)
+{
+ if (refcount_dec_and_test(&aio->ref)) {
+ fput(aio->iocb.ki_filp);
+ kmem_cache_free(backing_aio_cachep, aio);
+ }
+}
+
+static void backing_aio_cleanup(struct backing_aio *aio, long res)
+{
+ struct kiocb *iocb = &aio->iocb;
+ struct kiocb *orig_iocb = aio->orig_iocb;
+
+ if (iocb->ki_flags & IOCB_WRITE)
+ kiocb_end_write(iocb);
+
+ if (aio->end_write)
+ aio->end_write(orig_iocb->ki_filp);
+
+ orig_iocb->ki_pos = iocb->ki_pos;
+ backing_aio_put(aio);
+}
+
+static void backing_aio_rw_complete(struct kiocb *iocb, long res)
+{
+ struct backing_aio *aio = container_of(iocb, struct backing_aio, iocb);
+ struct kiocb *orig_iocb = aio->orig_iocb;
+
+ backing_aio_cleanup(aio, res);
+ orig_iocb->ki_complete(orig_iocb, res);
+}
+
+static void backing_aio_complete_work(struct work_struct *work)
+{
+ struct backing_aio *aio = container_of(work, struct backing_aio, work);
+
+ backing_aio_rw_complete(&aio->iocb, aio->res);
+}
+
+static void backing_aio_queue_completion(struct kiocb *iocb, long res)
+{
+ struct backing_aio *aio = container_of(iocb, struct backing_aio, iocb);
+
+ /*
+ * Punt to a work queue to serialize updates of mtime/size.
+ */
+ aio->res = res;
+ INIT_WORK(&aio->work, backing_aio_complete_work);
+ queue_work(file_inode(aio->orig_iocb->ki_filp)->i_sb->s_dio_done_wq,
+ &aio->work);
+}
+
+static int backing_aio_init_wq(struct kiocb *iocb)
+{
+ struct super_block *sb = file_inode(iocb->ki_filp)->i_sb;
+
+ if (sb->s_dio_done_wq)
+ return 0;
+
+ return sb_init_dio_done_wq(sb);
+}
+
+
+ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter,
+ struct kiocb *iocb, int flags,
+ struct backing_file_ctx *ctx)
+{
+ struct backing_aio *aio = NULL;
+ const struct cred *old_cred;
+ ssize_t ret;
+
+ if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)))
+ return -EIO;
+
+ if (!iov_iter_count(iter))
+ return 0;
+
+ if (iocb->ki_flags & IOCB_DIRECT &&
+ !(file->f_mode & FMODE_CAN_ODIRECT))
+ return -EINVAL;
+
+ old_cred = override_creds(ctx->cred);
+ if (is_sync_kiocb(iocb)) {
+ rwf_t rwf = iocb_to_rw_flags(flags);
+
+ ret = vfs_iter_read(file, iter, &iocb->ki_pos, rwf);
+ } else {
+ ret = -ENOMEM;
+ aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL);
+ if (!aio)
+ goto out;
+
+ aio->orig_iocb = iocb;
+ kiocb_clone(&aio->iocb, iocb, get_file(file));
+ aio->iocb.ki_complete = backing_aio_rw_complete;
+ refcount_set(&aio->ref, 2);
+ ret = vfs_iocb_iter_read(file, &aio->iocb, iter);
+ backing_aio_put(aio);
+ if (ret != -EIOCBQUEUED)
+ backing_aio_cleanup(aio, ret);
+ }
+out:
+ revert_creds(old_cred);
+
+ if (ctx->accessed)
+ ctx->accessed(ctx->user_file);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(backing_file_read_iter);
+
+ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter,
+ struct kiocb *iocb, int flags,
+ struct backing_file_ctx *ctx)
+{
+ const struct cred *old_cred;
+ ssize_t ret;
+
+ if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)))
+ return -EIO;
+
+ if (!iov_iter_count(iter))
+ return 0;
+
+ ret = file_remove_privs(ctx->user_file);
+ if (ret)
+ return ret;
+
+ if (iocb->ki_flags & IOCB_DIRECT &&
+ !(file->f_mode & FMODE_CAN_ODIRECT))
+ return -EINVAL;
+
+ /*
+ * Stacked filesystems don't support deferred completions, don't copy
+ * this property in case it is set by the issuer.
+ */
+ flags &= ~IOCB_DIO_CALLER_COMP;
+
+ old_cred = override_creds(ctx->cred);
+ if (is_sync_kiocb(iocb)) {
+ rwf_t rwf = iocb_to_rw_flags(flags);
+
+ file_start_write(file);
+ ret = vfs_iter_write(file, iter, &iocb->ki_pos, rwf);
+ file_end_write(file);
+ if (ctx->end_write)
+ ctx->end_write(ctx->user_file);
+ } else {
+ struct backing_aio *aio;
+
+ ret = backing_aio_init_wq(iocb);
+ if (ret)
+ goto out;
+
+ ret = -ENOMEM;
+ aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL);
+ if (!aio)
+ goto out;
+
+ aio->orig_iocb = iocb;
+ aio->end_write = ctx->end_write;
+ kiocb_clone(&aio->iocb, iocb, get_file(file));
+ aio->iocb.ki_flags = flags;
+ aio->iocb.ki_complete = backing_aio_queue_completion;
+ refcount_set(&aio->ref, 2);
+ kiocb_start_write(&aio->iocb);
+ ret = vfs_iocb_iter_write(file, &aio->iocb, iter);
+ backing_aio_put(aio);
+ if (ret != -EIOCBQUEUED)
+ backing_aio_cleanup(aio, ret);
+ }
+out:
+ revert_creds(old_cred);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(backing_file_write_iter);
+
+static int __init backing_aio_init(void)
+{
+ backing_aio_cachep = kmem_cache_create("backing_aio",
+ sizeof(struct backing_aio),
+ 0, SLAB_HWCACHE_ALIGN, NULL);
+ if (!backing_aio_cachep)
+ return -ENOMEM;
+
+ return 0;
+}
+fs_initcall(backing_aio_init);
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 634a96a65bfd..3eee9f45971e 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -16,19 +16,6 @@
#include <linux/backing-file.h>
#include "overlayfs.h"
-#include "../internal.h" /* for sb_init_dio_done_wq */
-
-struct ovl_aio_req {
- struct kiocb iocb;
- refcount_t ref;
- struct kiocb *orig_iocb;
- /* used for aio completion */
- struct work_struct work;
- long res;
-};
-
-static struct kmem_cache *ovl_aio_request_cachep;
-
static char ovl_whatisit(struct inode *inode, struct inode *realinode)
{
if (realinode != ovl_inode_upper(inode))
@@ -271,83 +258,16 @@ static void ovl_file_accessed(struct file *file)
touch_atime(&file->f_path);
}
-#define OVL_IOCB_MASK \
- (IOCB_NOWAIT | IOCB_HIPRI | IOCB_DSYNC | IOCB_SYNC | IOCB_APPEND)
-
-static rwf_t iocb_to_rw_flags(int flags)
-{
- return (__force rwf_t)(flags & OVL_IOCB_MASK);
-}
-
-static inline void ovl_aio_put(struct ovl_aio_req *aio_req)
-{
- if (refcount_dec_and_test(&aio_req->ref)) {
- fput(aio_req->iocb.ki_filp);
- kmem_cache_free(ovl_aio_request_cachep, aio_req);
- }
-}
-
-static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req)
-{
- struct kiocb *iocb = &aio_req->iocb;
- struct kiocb *orig_iocb = aio_req->orig_iocb;
-
- if (iocb->ki_flags & IOCB_WRITE) {
- kiocb_end_write(iocb);
- ovl_file_modified(orig_iocb->ki_filp);
- }
-
- orig_iocb->ki_pos = iocb->ki_pos;
- ovl_aio_put(aio_req);
-}
-
-static void ovl_aio_rw_complete(struct kiocb *iocb, long res)
-{
- struct ovl_aio_req *aio_req = container_of(iocb,
- struct ovl_aio_req, iocb);
- struct kiocb *orig_iocb = aio_req->orig_iocb;
-
- ovl_aio_cleanup_handler(aio_req);
- orig_iocb->ki_complete(orig_iocb, res);
-}
-
-static void ovl_aio_complete_work(struct work_struct *work)
-{
- struct ovl_aio_req *aio_req = container_of(work,
- struct ovl_aio_req, work);
-
- ovl_aio_rw_complete(&aio_req->iocb, aio_req->res);
-}
-
-static void ovl_aio_queue_completion(struct kiocb *iocb, long res)
-{
- struct ovl_aio_req *aio_req = container_of(iocb,
- struct ovl_aio_req, iocb);
- struct kiocb *orig_iocb = aio_req->orig_iocb;
-
- /*
- * Punt to a work queue to serialize updates of mtime/size.
- */
- aio_req->res = res;
- INIT_WORK(&aio_req->work, ovl_aio_complete_work);
- queue_work(file_inode(orig_iocb->ki_filp)->i_sb->s_dio_done_wq,
- &aio_req->work);
-}
-
-static int ovl_init_aio_done_wq(struct super_block *sb)
-{
- if (sb->s_dio_done_wq)
- return 0;
-
- return sb_init_dio_done_wq(sb);
-}
-
static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
struct fd real;
- const struct cred *old_cred;
ssize_t ret;
+ struct backing_file_ctx ctx = {
+ .cred = ovl_creds(file_inode(file)->i_sb),
+ .user_file = file,
+ .accessed = ovl_file_accessed,
+ };
if (!iov_iter_count(iter))
return 0;
@@ -356,37 +276,8 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
if (ret)
return ret;
- ret = -EINVAL;
- if (iocb->ki_flags & IOCB_DIRECT &&
- !(real.file->f_mode & FMODE_CAN_ODIRECT))
- goto out_fdput;
-
- old_cred = ovl_override_creds(file_inode(file)->i_sb);
- if (is_sync_kiocb(iocb)) {
- rwf_t rwf = iocb_to_rw_flags(iocb->ki_flags);
-
- ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, rwf);
- } else {
- struct ovl_aio_req *aio_req;
-
- ret = -ENOMEM;
- aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL);
- if (!aio_req)
- goto out;
-
- aio_req->orig_iocb = iocb;
- kiocb_clone(&aio_req->iocb, iocb, get_file(real.file));
- aio_req->iocb.ki_complete = ovl_aio_rw_complete;
- refcount_set(&aio_req->ref, 2);
- ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter);
- ovl_aio_put(aio_req);
- if (ret != -EIOCBQUEUED)
- ovl_aio_cleanup_handler(aio_req);
- }
-out:
- revert_creds(old_cred);
- ovl_file_accessed(file);
-out_fdput:
+ ret = backing_file_read_iter(real.file, iter, iocb, iocb->ki_flags,
+ &ctx);
fdput(real);
return ret;
@@ -397,9 +288,13 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct fd real;
- const struct cred *old_cred;
ssize_t ret;
int ifl = iocb->ki_flags;
+ struct backing_file_ctx ctx = {
+ .cred = ovl_creds(inode->i_sb),
+ .user_file = file,
+ .end_write = ovl_file_modified,
+ };
if (!iov_iter_count(iter))
return 0;
@@ -407,19 +302,11 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
inode_lock(inode);
/* Update mode */
ovl_copyattr(inode);
- ret = file_remove_privs(file);
- if (ret)
- goto out_unlock;
ret = ovl_real_fdget(file, &real);
if (ret)
goto out_unlock;
- ret = -EINVAL;
- if (iocb->ki_flags & IOCB_DIRECT &&
- !(real.file->f_mode & FMODE_CAN_ODIRECT))
- goto out_fdput;
-
if (!ovl_should_sync(OVL_FS(inode->i_sb)))
ifl &= ~(IOCB_DSYNC | IOCB_SYNC);
@@ -428,42 +315,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
* this property in case it is set by the issuer.
*/
ifl &= ~IOCB_DIO_CALLER_COMP;
-
- old_cred = ovl_override_creds(file_inode(file)->i_sb);
- if (is_sync_kiocb(iocb)) {
- rwf_t rwf = iocb_to_rw_flags(ifl);
-
- file_start_write(real.file);
- ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, rwf);
- file_end_write(real.file);
- /* Update size */
- ovl_file_modified(file);
- } else {
- struct ovl_aio_req *aio_req;
-
- ret = ovl_init_aio_done_wq(inode->i_sb);
- if (ret)
- goto out;
-
- ret = -ENOMEM;
- aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL);
- if (!aio_req)
- goto out;
-
- aio_req->orig_iocb = iocb;
- kiocb_clone(&aio_req->iocb, iocb, get_file(real.file));
- aio_req->iocb.ki_flags = ifl;
- aio_req->iocb.ki_complete = ovl_aio_queue_completion;
- refcount_set(&aio_req->ref, 2);
- kiocb_start_write(&aio_req->iocb);
- ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter);
- ovl_aio_put(aio_req);
- if (ret != -EIOCBQUEUED)
- ovl_aio_cleanup_handler(aio_req);
- }
-out:
- revert_creds(old_cred);
-out_fdput:
+ ret = backing_file_write_iter(real.file, iter, iocb, ifl, &ctx);
fdput(real);
out_unlock:
@@ -775,19 +627,3 @@ const struct file_operations ovl_file_operations = {
.copy_file_range = ovl_copy_file_range,
.remap_file_range = ovl_remap_file_range,
};
-
-int __init ovl_aio_request_cache_init(void)
-{
- ovl_aio_request_cachep = kmem_cache_create("ovl_aio_req",
- sizeof(struct ovl_aio_req),
- 0, SLAB_HWCACHE_ALIGN, NULL);
- if (!ovl_aio_request_cachep)
- return -ENOMEM;
-
- return 0;
-}
-
-void ovl_aio_request_cache_destroy(void)
-{
- kmem_cache_destroy(ovl_aio_request_cachep);
-}
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index a4b94a74b854..8b31bc3ee7a0 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -417,6 +417,12 @@ int ovl_want_write(struct dentry *dentry);
void ovl_drop_write(struct dentry *dentry);
struct dentry *ovl_workdir(struct dentry *dentry);
const struct cred *ovl_override_creds(struct super_block *sb);
+
+static inline const struct cred *ovl_creds(struct super_block *sb)
+{
+ return OVL_FS(sb)->creator_cred;
+}
+
int ovl_can_decode_fh(struct super_block *sb);
struct dentry *ovl_indexdir(struct super_block *sb);
bool ovl_index_all(struct super_block *sb);
@@ -835,8 +841,6 @@ struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir,
/* file.c */
extern const struct file_operations ovl_file_operations;
-int __init ovl_aio_request_cache_init(void);
-void ovl_aio_request_cache_destroy(void);
int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa);
int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa);
int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 0779c8290ec4..37387fd98e34 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -1522,14 +1522,10 @@ static int __init ovl_init(void)
if (ovl_inode_cachep == NULL)
return -ENOMEM;
- err = ovl_aio_request_cache_init();
- if (!err) {
- err = register_filesystem(&ovl_fs_type);
- if (!err)
- return 0;
+ err = register_filesystem(&ovl_fs_type);
+ if (!err)
+ return 0;
- ovl_aio_request_cache_destroy();
- }
kmem_cache_destroy(ovl_inode_cachep);
return err;
@@ -1545,7 +1541,6 @@ static void __exit ovl_exit(void)
*/
rcu_barrier();
kmem_cache_destroy(ovl_inode_cachep);
- ovl_aio_request_cache_destroy();
}
module_init(ovl_init);
diff --git a/include/linux/backing-file.h b/include/linux/backing-file.h
index 55c9e804f780..0648d548a418 100644
--- a/include/linux/backing-file.h
+++ b/include/linux/backing-file.h
@@ -9,9 +9,24 @@
#define _LINUX_BACKING_FILE_H
#include <linux/file.h>
+#include <linux/uio.h>
+#include <linux/fs.h>
+
+struct backing_file_ctx {
+ const struct cred *cred;
+ struct file *user_file;
+ void (*accessed)(struct file *);
+ void (*end_write)(struct file *);
+};
struct file *backing_file_open(const struct path *user_path, int flags,
const struct path *real_path,
const struct cred *cred);
+ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter,
+ struct kiocb *iocb, int flags,
+ struct backing_file_ctx *ctx);
+ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter,
+ struct kiocb *iocb, int flags,
+ struct backing_file_ctx *ctx);
#endif /* _LINUX_BACKING_FILE_H */
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,189 @@
From 5c28f6ec073077ce1239652c7a74555904eb0577 Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace@redhat.com>
Date: Wed, 3 Jun 2026 10:51:41 +0200
Subject: [PATCH] fs: factor out backing_file_splice_{read,write}() helpers
JIRA: https://issues.redhat.com/browse/RHEL-179443
CVE: CVE-2026-46054
commit 9b7e9e2f5d5c3d079ec46bc71b114012e362ea6e
Author: Amir Goldstein <amir73il@gmail.com>
Date: Fri Oct 13 12:13:12 2023 +0300
fs: factor out backing_file_splice_{read,write}() helpers
There is not much in those helpers, but it makes sense to have them
logically next to the backing_file_{read,write}_iter() helpers as they
may grow more common logic in the future.
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
diff --git a/fs/backing-file.c b/fs/backing-file.c
index 6d915a45e288..5cc411566ce0 100644
--- a/fs/backing-file.c
+++ b/fs/backing-file.c
@@ -10,6 +10,7 @@
#include <linux/fs.h>
#include <linux/backing-file.h>
+#include <linux/splice.h>
#include "internal.h"
@@ -248,6 +249,56 @@ ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter,
}
EXPORT_SYMBOL_GPL(backing_file_write_iter);
+ssize_t backing_file_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags,
+ struct backing_file_ctx *ctx)
+{
+ const struct cred *old_cred;
+ ssize_t ret;
+
+ if (WARN_ON_ONCE(!(in->f_mode & FMODE_BACKING)))
+ return -EIO;
+
+ old_cred = override_creds(ctx->cred);
+ ret = vfs_splice_read(in, ppos, pipe, len, flags);
+ revert_creds(old_cred);
+
+ if (ctx->accessed)
+ ctx->accessed(ctx->user_file);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(backing_file_splice_read);
+
+ssize_t backing_file_splice_write(struct pipe_inode_info *pipe,
+ struct file *out, loff_t *ppos, size_t len,
+ unsigned int flags,
+ struct backing_file_ctx *ctx)
+{
+ const struct cred *old_cred;
+ ssize_t ret;
+
+ if (WARN_ON_ONCE(!(out->f_mode & FMODE_BACKING)))
+ return -EIO;
+
+ ret = file_remove_privs(ctx->user_file);
+ if (ret)
+ return ret;
+
+ old_cred = override_creds(ctx->cred);
+ file_start_write(out);
+ ret = iter_file_splice_write(pipe, out, ppos, len, flags);
+ file_end_write(out);
+ revert_creds(old_cred);
+
+ if (ctx->end_write)
+ ctx->end_write(ctx->user_file);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(backing_file_splice_write);
+
static int __init backing_aio_init(void)
{
backing_aio_cachep = kmem_cache_create("backing_aio",
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 3eee9f45971e..165a92b25c0a 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -9,7 +9,6 @@
#include <linux/xattr.h>
#include <linux/uio.h>
#include <linux/uaccess.h>
-#include <linux/splice.h>
#include <linux/security.h>
#include <linux/mm.h>
#include <linux/fs.h>
@@ -328,20 +327,21 @@ static ssize_t ovl_splice_read(struct file *in, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len,
unsigned int flags)
{
- const struct cred *old_cred;
struct fd real;
ssize_t ret;
+ struct backing_file_ctx ctx = {
+ .cred = ovl_creds(file_inode(in)->i_sb),
+ .user_file = in,
+ .accessed = ovl_file_accessed,
+ };
ret = ovl_real_fdget(in, &real);
if (ret)
return ret;
- old_cred = ovl_override_creds(file_inode(in)->i_sb);
- ret = vfs_splice_read(real.file, ppos, pipe, len, flags);
- revert_creds(old_cred);
- ovl_file_accessed(in);
-
+ ret = backing_file_splice_read(real.file, ppos, pipe, len, flags, &ctx);
fdput(real);
+
return ret;
}
@@ -357,30 +357,23 @@ static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out,
loff_t *ppos, size_t len, unsigned int flags)
{
struct fd real;
- const struct cred *old_cred;
struct inode *inode = file_inode(out);
ssize_t ret;
+ struct backing_file_ctx ctx = {
+ .cred = ovl_creds(inode->i_sb),
+ .user_file = out,
+ .end_write = ovl_file_modified,
+ };
inode_lock(inode);
/* Update mode */
ovl_copyattr(inode);
- ret = file_remove_privs(out);
- if (ret)
- goto out_unlock;
ret = ovl_real_fdget(out, &real);
if (ret)
goto out_unlock;
- old_cred = ovl_override_creds(inode->i_sb);
- file_start_write(real.file);
-
- ret = iter_file_splice_write(pipe, real.file, ppos, len, flags);
-
- file_end_write(real.file);
- /* Update size */
- ovl_file_modified(out);
- revert_creds(old_cred);
+ ret = backing_file_splice_write(pipe, real.file, ppos, len, flags, &ctx);
fdput(real);
out_unlock:
diff --git a/include/linux/backing-file.h b/include/linux/backing-file.h
index 0648d548a418..0546d5b1c9f5 100644
--- a/include/linux/backing-file.h
+++ b/include/linux/backing-file.h
@@ -28,5 +28,13 @@ ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter,
ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter,
struct kiocb *iocb, int flags,
struct backing_file_ctx *ctx);
+ssize_t backing_file_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags,
+ struct backing_file_ctx *ctx);
+ssize_t backing_file_splice_write(struct pipe_inode_info *pipe,
+ struct file *out, loff_t *ppos, size_t len,
+ unsigned int flags,
+ struct backing_file_ctx *ctx);
#endif /* _LINUX_BACKING_FILE_H */
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,124 @@
From 5176a4370a2e9f1ebe16e502bf93897820461b7f Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace@redhat.com>
Date: Wed, 3 Jun 2026 10:51:45 +0200
Subject: [PATCH] fs: factor out backing_file_mmap() helper
JIRA: https://issues.redhat.com/browse/RHEL-179443
CVE: CVE-2026-46054
commit f567377e406c032fff0799bde4fdf4a977529b84
Author: Amir Goldstein <amir73il@gmail.com>
Date: Fri Oct 13 12:49:37 2023 +0300
fs: factor out backing_file_mmap() helper
Assert that the file object is allocated in a backing_file container
so that file_user_path() could be used to display the user path and
not the backing file's path in /proc/<pid>/maps.
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
diff --git a/fs/backing-file.c b/fs/backing-file.c
index 5cc411566ce0..6ea14b6214c1 100644
--- a/fs/backing-file.c
+++ b/fs/backing-file.c
@@ -11,6 +11,7 @@
#include <linux/fs.h>
#include <linux/backing-file.h>
#include <linux/splice.h>
+#include <linux/mm.h>
#include "internal.h"
@@ -299,6 +300,32 @@ ssize_t backing_file_splice_write(struct pipe_inode_info *pipe,
}
EXPORT_SYMBOL_GPL(backing_file_splice_write);
+int backing_file_mmap(struct file *file, struct vm_area_struct *vma,
+ struct backing_file_ctx *ctx)
+{
+ const struct cred *old_cred;
+ int ret;
+
+ if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)) ||
+ WARN_ON_ONCE(ctx->user_file != vma->vm_file))
+ return -EIO;
+
+ if (!file->f_op->mmap)
+ return -ENODEV;
+
+ vma_set_file(vma, file);
+
+ old_cred = override_creds(ctx->cred);
+ ret = call_mmap(vma->vm_file, vma);
+ revert_creds(old_cred);
+
+ if (ctx->accessed)
+ ctx->accessed(ctx->user_file);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(backing_file_mmap);
+
static int __init backing_aio_init(void)
{
backing_aio_cachep = kmem_cache_create("backing_aio",
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 165a92b25c0a..d85385f37ba6 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -10,7 +10,6 @@
#include <linux/uio.h>
#include <linux/uaccess.h>
#include <linux/security.h>
-#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/backing-file.h>
#include "overlayfs.h"
@@ -411,23 +410,13 @@ static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync)
static int ovl_mmap(struct file *file, struct vm_area_struct *vma)
{
struct file *realfile = file->private_data;
- const struct cred *old_cred;
- int ret;
-
- if (!realfile->f_op->mmap)
- return -ENODEV;
-
- if (WARN_ON(file != vma->vm_file))
- return -EIO;
-
- vma_set_file(vma, realfile);
-
- old_cred = ovl_override_creds(file_inode(file)->i_sb);
- ret = call_mmap(vma->vm_file, vma);
- revert_creds(old_cred);
- ovl_file_accessed(file);
+ struct backing_file_ctx ctx = {
+ .cred = ovl_creds(file_inode(file)->i_sb),
+ .user_file = file,
+ .accessed = ovl_file_accessed,
+ };
- return ret;
+ return backing_file_mmap(realfile, vma, &ctx);
}
static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
diff --git a/include/linux/backing-file.h b/include/linux/backing-file.h
index 0546d5b1c9f5..3f1fe1774f1b 100644
--- a/include/linux/backing-file.h
+++ b/include/linux/backing-file.h
@@ -36,5 +36,7 @@ ssize_t backing_file_splice_write(struct pipe_inode_info *pipe,
struct file *out, loff_t *ppos, size_t len,
unsigned int flags,
struct backing_file_ctx *ctx);
+int backing_file_mmap(struct file *file, struct vm_area_struct *vma,
+ struct backing_file_ctx *ctx);
#endif /* _LINUX_BACKING_FILE_H */
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,158 @@
From f034201051121a87f98c1651368c3883f633182f Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace@redhat.com>
Date: Wed, 3 Jun 2026 11:47:18 +0200
Subject: [PATCH] lsm: add helper for blob allocations
JIRA: https://issues.redhat.com/browse/RHEL-179443
CVE: CVE-2026-46054
Conflicts:
- security/security.c: context fuzz + dropped hunks changing functions
not present downstream
commit 09001284eebfc1b684e81d1db0f006787d35f3e1
Author: Casey Schaufler <casey@schaufler-ca.com>
Date: Wed Jul 10 14:32:27 2024 -0700
lsm: add helper for blob allocations
Create a helper function lsm_blob_alloc() for general use in the hook
specific functions that allocate LSM blobs. Change the hook specific
functions to use this helper. This reduces the code size by a small
amount and will make adding new instances of infrastructure managed
security blobs easier.
Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
Reviewed-by: John Johansen <john.johansen@canonical.com>
[PM: subject tweak]
Signed-off-by: Paul Moore <paul@paul-moore.com>
Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
diff --git a/security/security.c b/security/security.c
index b59af216324f..1e63f23a504a 100644
--- a/security/security.c
+++ b/security/security.c
@@ -645,27 +645,42 @@ int unregister_blocking_lsm_notifier(struct notifier_block *nb)
EXPORT_SYMBOL(unregister_blocking_lsm_notifier);
/**
- * lsm_cred_alloc - allocate a composite cred blob
- * @cred: the cred that needs a blob
+ * lsm_blob_alloc - allocate a composite blob
+ * @dest: the destination for the blob
+ * @size: the size of the blob
* @gfp: allocation type
*
- * Allocate the cred blob for all the modules
+ * Allocate a blob for all the modules
*
* Returns 0, or -ENOMEM if memory can't be allocated.
*/
-static int lsm_cred_alloc(struct cred *cred, gfp_t gfp)
+static int lsm_blob_alloc(void **dest, size_t size, gfp_t gfp)
{
- if (blob_sizes.lbs_cred == 0) {
- cred->security = NULL;
+ if (size == 0) {
+ *dest = NULL;
return 0;
}
- cred->security = kzalloc(blob_sizes.lbs_cred, gfp);
- if (cred->security == NULL)
+ *dest = kzalloc(size, gfp);
+ if (*dest == NULL)
return -ENOMEM;
return 0;
}
+/**
+ * lsm_cred_alloc - allocate a composite cred blob
+ * @cred: the cred that needs a blob
+ * @gfp: allocation type
+ *
+ * Allocate the cred blob for all the modules
+ *
+ * Returns 0, or -ENOMEM if memory can't be allocated.
+ */
+static int lsm_cred_alloc(struct cred *cred, gfp_t gfp)
+{
+ return lsm_blob_alloc(&cred->security, blob_sizes.lbs_cred, gfp);
+}
+
/**
* lsm_early_cred - during initialization allocate a composite cred blob
* @cred: the cred that needs a blob
@@ -732,15 +747,7 @@ static int lsm_inode_alloc(struct inode *inode)
*/
static int lsm_task_alloc(struct task_struct *task)
{
- if (blob_sizes.lbs_task == 0) {
- task->security = NULL;
- return 0;
- }
-
- task->security = kzalloc(blob_sizes.lbs_task, GFP_KERNEL);
- if (task->security == NULL)
- return -ENOMEM;
- return 0;
+ return lsm_blob_alloc(&task->security, blob_sizes.lbs_task, GFP_KERNEL);
}
/**
@@ -753,15 +760,7 @@ static int lsm_task_alloc(struct task_struct *task)
*/
static int lsm_ipc_alloc(struct kern_ipc_perm *kip)
{
- if (blob_sizes.lbs_ipc == 0) {
- kip->security = NULL;
- return 0;
- }
-
- kip->security = kzalloc(blob_sizes.lbs_ipc, GFP_KERNEL);
- if (kip->security == NULL)
- return -ENOMEM;
- return 0;
+ return lsm_blob_alloc(&kip->security, blob_sizes.lbs_ipc, GFP_KERNEL);
}
/**
@@ -774,15 +773,8 @@ static int lsm_ipc_alloc(struct kern_ipc_perm *kip)
*/
static int lsm_msg_msg_alloc(struct msg_msg *mp)
{
- if (blob_sizes.lbs_msg_msg == 0) {
- mp->security = NULL;
- return 0;
- }
-
- mp->security = kzalloc(blob_sizes.lbs_msg_msg, GFP_KERNEL);
- if (mp->security == NULL)
- return -ENOMEM;
- return 0;
+ return lsm_blob_alloc(&mp->security, blob_sizes.lbs_msg_msg,
+ GFP_KERNEL);
}
/**
@@ -809,15 +801,8 @@ static void __init lsm_early_task(struct task_struct *task)
*/
static int lsm_superblock_alloc(struct super_block *sb)
{
- if (blob_sizes.lbs_superblock == 0) {
- sb->s_security = NULL;
- return 0;
- }
-
- sb->s_security = kzalloc(blob_sizes.lbs_superblock, GFP_KERNEL);
- if (sb->s_security == NULL)
- return -ENOMEM;
- return 0;
+ return lsm_blob_alloc(&sb->s_security, blob_sizes.lbs_superblock,
+ GFP_KERNEL);
}
/*
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,74 @@
From c884ff1e458df0e5d801f19b4e847a4673d7471b Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace@redhat.com>
Date: Wed, 3 Jun 2026 11:48:02 +0200
Subject: [PATCH] ovl: Fix nested backing file paths
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
JIRA: https://issues.redhat.com/browse/RHEL-179443
CVE: CVE-2026-46054
commit 924577e4f6ca473de1528953a0e13505fae61d7b
Author: André Almeida <andrealmeid@igalia.com>
Date: Tue Apr 29 15:38:50 2025 -0300
ovl: Fix nested backing file paths
When the lowerdir of an overlayfs is a merged directory of another
overlayfs, ovl_open_realfile() will fail to open the real file and point
to a lower dentry copy, without the proper parent path. After this,
d_path() will then display the path incorrectly as if the file is placed
in the root directory.
This bug can be triggered with the following setup:
mkdir -p ovl-A/lower ovl-A/upper ovl-A/merge ovl-A/work
mkdir -p ovl-B/upper ovl-B/merge ovl-B/work
cp /bin/cat ovl-A/lower/
mount -t overlay overlay -o \
lowerdir=ovl-A/lower,upperdir=ovl-A/upper,workdir=ovl-A/work \
ovl-A/merge
mount -t overlay overlay -o \
lowerdir=ovl-A/merge,upperdir=ovl-B/upper,workdir=ovl-B/work \
ovl-B/merge
ovl-A/merge/cat /proc/self/maps | grep --color cat
ovl-B/merge/cat /proc/self/maps | grep --color cat
The first cat will correctly show `/ovl-A/merge/cat`, while the second
one shows just `/cat`.
To fix that, uses file_user_path() inside of backing_file_open() to get
the correct file path for the dentry.
Co-developed-by: John Schoenick <johns@valvesoftware.com>
Signed-off-by: John Schoenick <johns@valvesoftware.com>
Signed-off-by: André Almeida <andrealmeid@igalia.com>
Fixes: def3ae83da02 ("fs: store real path instead of fake path in backing file f_path")
Cc: <stable@vger.kernel.org> # v6.7
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index d85385f37ba6..3bf52eace698 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -51,8 +51,8 @@ static struct file *ovl_open_realfile(const struct file *file,
if (!inode_owner_or_capable(real_idmap, realinode))
flags &= ~O_NOATIME;
- realfile = backing_file_open(&file->f_path, flags, realpath,
- current_cred());
+ realfile = backing_file_open(file_user_path((struct file *) file),
+ flags, realpath, current_cred());
}
revert_creds(old_cred);
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,105 @@
From 3869325e0bf98aba624155c8355abe6e5db6e674 Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace@redhat.com>
Date: Wed, 3 Jun 2026 11:18:45 +0200
Subject: [PATCH] fs: constify file ptr in backing_file accessor helpers
JIRA: https://issues.redhat.com/browse/RHEL-179443
CVE: CVE-2026-46054
Conflicts:
- context fuzz and dropped changes to functions not present downstream
commit 4e301d858af17ae2ce56886296e5458c5a08219a
Author: Amir Goldstein <amir73il@gmail.com>
Date: Sat Jun 7 13:53:03 2025 +0200
fs: constify file ptr in backing_file accessor helpers
Add internal helper backing_file_set_user_path() for the only
two cases that need to modify backing_file fields.
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/20250607115304.2521155-2-amir73il@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
diff --git a/fs/backing-file.c b/fs/backing-file.c
index 6ea14b6214c1..840b45366557 100644
--- a/fs/backing-file.c
+++ b/fs/backing-file.c
@@ -41,7 +41,7 @@ struct file *backing_file_open(const struct path *user_path, int flags,
return f;
path_get(user_path);
- *backing_file_user_path(f) = *user_path;
+ backing_file_set_user_path(f, user_path);
error = vfs_open(real_path, f);
if (error) {
fput(f);
diff --git a/fs/file_table.c b/fs/file_table.c
index e5c7b9705109..fa8f4d34efa5 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -50,17 +50,20 @@ struct backing_file {
struct path user_path;
};
-static inline struct backing_file *backing_file(struct file *f)
-{
- return container_of(f, struct backing_file, file);
-}
+#define backing_file(f) container_of(f, struct backing_file, file)
-struct path *backing_file_user_path(struct file *f)
+struct path *backing_file_user_path(const struct file *f)
{
return &backing_file(f)->user_path;
}
EXPORT_SYMBOL_GPL(backing_file_user_path);
+void backing_file_set_user_path(struct file *f, const struct path *path)
+{
+ backing_file(f)->user_path = *path;
+}
+EXPORT_SYMBOL_GPL(backing_file_set_user_path);
+
static void file_free_rcu(struct rcu_head *head)
{
struct file *f = container_of(head, struct file, f_u.fu_rcuhead);
diff --git a/fs/internal.h b/fs/internal.h
index bd0934d0521b..78fcdad80e53 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -96,6 +96,7 @@ extern void chroot_fs_refs(const struct path *, const struct path *);
struct file *alloc_empty_file(int flags, const struct cred *cred);
struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred);
struct file *alloc_empty_backing_file(int flags, const struct cred *cred);
+void backing_file_set_user_path(struct file *f, const struct path *path);
static inline void file_put_write_access(struct file *file)
{
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5f3ca25c77e5..7ed9232f579d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2586,7 +2586,7 @@ struct file *dentry_open(const struct path *path, int flags,
const struct cred *creds);
struct file *dentry_create(const struct path *path, int flags, umode_t mode,
const struct cred *cred);
-struct path *backing_file_user_path(struct file *f);
+struct path *backing_file_user_path(const struct file *f);
/*
* file_user_path - get the path to display for memory mapped file
@@ -2597,7 +2597,7 @@ struct path *backing_file_user_path(struct file *f);
* /proc/<pid>/maps), this helper should be used to get the path to display
* to the user, which is the path of the fd that user has requested to map.
*/
-static inline const struct path *file_user_path(struct file *f)
+static inline const struct path *file_user_path(const struct file *f)
{
if (unlikely(f->f_mode & FMODE_BACKING))
return backing_file_user_path(f);
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,38 @@
From c3f8db29db9e7b9bb68b107e932315f147046ac5 Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace@redhat.com>
Date: Wed, 3 Jun 2026 11:48:18 +0200
Subject: [PATCH] ovl: remove unneeded non-const conversion
JIRA: https://issues.redhat.com/browse/RHEL-179443
CVE: CVE-2026-46054
commit 3ec2529eca6f175f4e3e87c4534010e044839b38
Author: Amir Goldstein <amir73il@gmail.com>
Date: Sat Jun 7 13:53:04 2025 +0200
ovl: remove unneeded non-const conversion
file_user_path() now takes a const file ptr.
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/20250607115304.2521155-3-amir73il@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 3bf52eace698..c8e45f503db3 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -51,7 +51,7 @@ static struct file *ovl_open_realfile(const struct file *file,
if (!inode_owner_or_capable(real_idmap, realinode))
flags &= ~O_NOATIME;
- realfile = backing_file_open(file_user_path((struct file *) file),
+ realfile = backing_file_open(file_user_path(file),
flags, realpath, current_cred());
}
revert_creds(old_cred);
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,48 @@
From 99a9c81094c622efebec7695e551baffdac3f89b Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace@redhat.com>
Date: Wed, 3 Jun 2026 20:29:45 +0200
Subject: [PATCH] ovl: remove redundant IOCB_DIO_CALLER_COMP clearing
JIRA: https://issues.redhat.com/browse/RHEL-179443
Conflicts:
- just context fuzz
commit 7933a585d70ee496fa341b50b8b0a95b131867ff
Author: Seong-Gwang Heo <heo@mykernel.net>
Date: Thu Oct 9 13:41:48 2025 +0800
ovl: remove redundant IOCB_DIO_CALLER_COMP clearing
The backing_file_write_iter() function, which is called
immediately after this code, already contains identical
logic to clear the IOCB_DIO_CALLER_COMP flag along with
the same explanatory comment. There is no need to duplicate
this operation in the overlayfs code.
Signed-off-by: Seong-Gwang Heo <heo@mykernel.net>
Fixes: a6293b3e285c ("fs: factor out backing_file_{read,write}_iter() helpers")
Acked-by: Miklos Szeredi <mszeredi@redhat.com>
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index c8e45f503db3..3d8539909f74 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -308,11 +308,6 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
if (!ovl_should_sync(OVL_FS(inode->i_sb)))
ifl &= ~(IOCB_DSYNC | IOCB_SYNC);
- /*
- * Overlayfs doesn't support deferred completions, don't copy
- * this property in case it is set by the issuer.
- */
- ifl &= ~IOCB_DIO_CALLER_COMP;
ret = backing_file_write_iter(real.file, iter, iocb, ifl, &ctx);
fdput(real);
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,89 @@
From 627a3d264cb85311131aef67a0ff2397999d5394 Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace@redhat.com>
Date: Wed, 3 Jun 2026 20:29:57 +0200
Subject: [PATCH] perf/core: Fix MMAP event path names with backing files
JIRA: https://issues.redhat.com/browse/RHEL-179443
commit 8818f507a9391019a3ec7c57b1a32e4b386e48a5
Author: Adrian Hunter <adrian.hunter@intel.com>
Date: Mon Oct 13 10:22:43 2025 +0300
perf/core: Fix MMAP event path names with backing files
Some file systems like FUSE-based ones or overlayfs may record the backing
file in struct vm_area_struct vm_file, instead of the user file that the
user mmapped.
Since commit def3ae83da02f ("fs: store real path instead of fake path in
backing file f_path"), file_path() no longer returns the user file path
when applied to a backing file. There is an existing helper
file_user_path() for that situation.
Use file_user_path() instead of file_path() to get the path for MMAP
and MMAP2 events.
Example:
Setup:
# cd /root
# mkdir test ; cd test ; mkdir lower upper work merged
# cp `which cat` lower
# mount -t overlay overlay -olowerdir=lower,upperdir=upper,workdir=work merged
# perf record -e intel_pt//u -- /root/test/merged/cat /proc/self/maps
...
55b0ba399000-55b0ba434000 r-xp 00018000 00:1a 3419 /root/test/merged/cat
...
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.060 MB perf.data ]
#
Before:
File name is wrong (/cat), so decoding fails:
# perf script --no-itrace --show-mmap-events
cat 367 [016] 100.491492: PERF_RECORD_MMAP2 367/367: [0x55b0ba399000(0x9b000) @ 0x18000 00:02 3419 489959280]: r-xp /cat
...
# perf script --itrace=e | wc -l
Warning:
19 instruction trace errors
19
#
After:
File name is correct (/root/test/merged/cat), so decoding is ok:
# perf script --no-itrace --show-mmap-events
cat 364 [016] 72.153006: PERF_RECORD_MMAP2 364/364: [0x55ce4003d000(0x9b000) @ 0x18000 00:02 3419 3132534314]: r-xp /root/test/merged/cat
# perf script --itrace=e
# perf script --itrace=e | wc -l
0
#
Fixes: def3ae83da02f ("fs: store real path instead of fake path in backing file f_path")
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Amir Goldstein <amir73il@gmail.com>
Cc: stable@vger.kernel.org
Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0d3bd850fee7..5065087dd236 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8953,7 +8953,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
* need to add enough zero bytes after the string to handle
* the 64bit alignment we do later.
*/
- name = file_path(file, buf, PATH_MAX - sizeof(u64));
+ name = d_path(file_user_path(file), buf, PATH_MAX - sizeof(u64));
if (IS_ERR(name)) {
name = "//toolong";
goto cpy_name;
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,85 @@
From 644c8e296eb8628ed6ccaff5609bce2c4b591c8a Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace@redhat.com>
Date: Wed, 3 Jun 2026 11:00:17 +0200
Subject: [PATCH] fs: prepare for adding LSM blob to backing_file
JIRA: https://issues.redhat.com/browse/RHEL-179443
CVE: CVE-2026-46054
Conflicts:
- fs/file_table.c: adjusted the body and call site of
backing_file_free() to downstream state (sme intermediate commits
not backported)
commit 880bd496ec72a6dcb00cb70c430ef752ba242ae7
Author: Amir Goldstein <amir73il@gmail.com>
Date: Mon Mar 30 10:27:51 2026 +0200
fs: prepare for adding LSM blob to backing_file
In preparation to adding LSM blob to backing_file struct, factor out
helpers init_backing_file() and backing_file_free().
Cc: stable@vger.kernel.org
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-unionfs@vger.kernel.org
Cc: linux-erofs@lists.ozlabs.org
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Serge Hallyn <serge@hallyn.com>
[PM: use the term "LSM blob", fix comment style to match file]
Signed-off-by: Paul Moore <paul@paul-moore.com>
Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
diff --git a/fs/file_table.c b/fs/file_table.c
index fa8f4d34efa5..34e3863c95b0 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -75,11 +75,16 @@ static void file_free_rcu(struct rcu_head *head)
kmem_cache_free(filp_cachep, f);
}
+static inline void backing_file_free(struct backing_file *ff)
+{
+ path_put(&ff->user_path);
+}
+
static inline void file_free(struct file *f)
{
security_file_free(f);
if (unlikely(f->f_mode & FMODE_BACKING))
- path_put(backing_file_user_path(f));
+ backing_file_free(backing_file(f));
if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
percpu_counter_dec(&nr_files);
call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
@@ -255,6 +260,12 @@ struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred)
return f;
}
+static int init_backing_file(struct backing_file *ff)
+{
+ memset(&ff->user_path, 0, sizeof(ff->user_path));
+ return 0;
+}
+
/*
* Variant of alloc_empty_file() that allocates a backing_file container
* and doesn't check and modify nr_files.
@@ -277,7 +288,14 @@ struct file *alloc_empty_backing_file(int flags, const struct cred *cred)
return ERR_PTR(error);
}
+ /* The f_mode flags must be set before fput(). */
ff->file.f_mode |= FMODE_BACKING | FMODE_NOACCOUNT;
+ error = init_backing_file(ff);
+ if (unlikely(error)) {
+ fput(&ff->file);
+ return ERR_PTR(error);
+ }
+
return &ff->file;
}
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,535 @@
From 807cf6dc47e9871fa1f77369e69fe8666cb0a1d4 Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace@redhat.com>
Date: Wed, 3 Jun 2026 11:54:46 +0200
Subject: [PATCH] lsm: add backing_file LSM hooks
JIRA: https://issues.redhat.com/browse/RHEL-179443
CVE: CVE-2026-46054
Conflicts:
- fs/backing-file.c:
- adjust backing_file_mmap() to downstream state (missing scoped
guards, ctx->user_file instead of missing user_file variable)
- backing_tmpfile_open() not present downstream
- fs/erofs/ishare.c: hunk dropped, file not present downstream
- fs/file_table.c: context fuzz + put security_backing_file_free() in the
right place
- fs/fuse/passthrough.c: hunk dropped, file not present downstream
- fs/overlayfs/dir.c: hunk dropped, ovl_create_tmpfile() not present downstream
- fs/overlayfs/file.c: adjust to different indentation
- include/linux/backing-file.h: backing_tmpfile_open() not present downstream
- include/linux/lsm_hooks.h: adjust to downstream's definition of struct lsm_blob_sizes
- security/lsm.h: hunk dropped, file not present downstream
- security/lsm_init.c: hunk dropped, file not present downstream
- security/security.c: misc conflicts, port changes to stuff that was
already in lsm.h/lsm_init.c upstream
commit 6af36aeb147a06dea47c49859cd6ca5659aeb987
Author: Paul Moore <paul@paul-moore.com>
Date: Fri Dec 19 13:18:22 2025 -0500
lsm: add backing_file LSM hooks
Stacked filesystems such as overlayfs do not currently provide the
necessary mechanisms for LSMs to properly enforce access controls on the
mmap() and mprotect() operations. In order to resolve this gap, a LSM
security blob is being added to the backing_file struct and the following
new LSM hooks are being created:
security_backing_file_alloc()
security_backing_file_free()
security_mmap_backing_file()
The first two hooks are to manage the lifecycle of the LSM security blob
in the backing_file struct, while the third provides a new mmap() access
control point for the underlying backing file. It is also expected that
LSMs will likely want to update their security_file_mprotect() callback
to address issues with their mprotect() controls, but that does not
require a change to the security_file_mprotect() LSM hook.
There are a three other small changes to support these new LSM hooks:
* Pass the user file associated with a backing file down to
alloc_empty_backing_file() so it can be included in the
security_backing_file_alloc() hook.
* Add getter and setter functions for the backing_file struct LSM blob
as the backing_file struct remains private to fs/file_table.c.
* Constify the file struct field in the LSM common_audit_data struct to
better support LSMs that need to pass a const file struct pointer into
the common LSM audit code.
Thanks to Arnd Bergmann for identifying the missing EXPORT_SYMBOL_GPL()
and supplying a fixup.
Cc: stable@vger.kernel.org
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-unionfs@vger.kernel.org
Cc: linux-erofs@lists.ozlabs.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Serge Hallyn <serge@hallyn.com>
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Paul Moore <paul@paul-moore.com>
Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
diff --git a/fs/backing-file.c b/fs/backing-file.c
index 840b45366557..e6f4fe27b58b 100644
--- a/fs/backing-file.c
+++ b/fs/backing-file.c
@@ -12,6 +12,7 @@
#include <linux/backing-file.h>
#include <linux/splice.h>
#include <linux/mm.h>
+#include <linux/security.h>
#include "internal.h"
@@ -29,14 +30,15 @@
* returned file into a container structure that also stores the stacked
* file's path, which can be retrieved using backing_file_user_path().
*/
-struct file *backing_file_open(const struct path *user_path, int flags,
+struct file *backing_file_open(const struct file *user_file, int flags,
const struct path *real_path,
const struct cred *cred)
{
+ const struct path *user_path = &user_file->f_path;
struct file *f;
int error;
- f = alloc_empty_backing_file(flags, cred);
+ f = alloc_empty_backing_file(flags, cred, user_file);
if (IS_ERR(f))
return f;
@@ -316,6 +318,11 @@ int backing_file_mmap(struct file *file, struct vm_area_struct *vma,
vma_set_file(vma, file);
old_cred = override_creds(ctx->cred);
+ ret = security_mmap_backing_file(vma, file, ctx->user_file);
+ if (ret) {
+ revert_creds(old_cred);
+ return ret;
+ }
ret = call_mmap(vma->vm_file, vma);
revert_creds(old_cred);
diff --git a/fs/file_table.c b/fs/file_table.c
index 34e3863c95b0..fc04eb48d550 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -48,6 +48,9 @@ static struct percpu_counter nr_files __cacheline_aligned_in_smp;
struct backing_file {
struct file file;
struct path user_path;
+#ifdef CONFIG_SECURITY
+ void *security;
+#endif
};
#define backing_file(f) container_of(f, struct backing_file, file)
@@ -64,6 +67,18 @@ void backing_file_set_user_path(struct file *f, const struct path *path)
}
EXPORT_SYMBOL_GPL(backing_file_set_user_path);
+#ifdef CONFIG_SECURITY
+void *backing_file_security(const struct file *f)
+{
+ return backing_file(f)->security;
+}
+
+void backing_file_set_security(struct file *f, void *security)
+{
+ backing_file(f)->security = security;
+}
+#endif /* CONFIG_SECURITY */
+
static void file_free_rcu(struct rcu_head *head)
{
struct file *f = container_of(head, struct file, f_u.fu_rcuhead);
@@ -77,6 +92,7 @@ static void file_free_rcu(struct rcu_head *head)
static inline void backing_file_free(struct backing_file *ff)
{
+ security_backing_file_free(&ff->file);
path_put(&ff->user_path);
}
@@ -260,10 +276,12 @@ struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred)
return f;
}
-static int init_backing_file(struct backing_file *ff)
+static int init_backing_file(struct backing_file *ff,
+ const struct file *user_file)
{
memset(&ff->user_path, 0, sizeof(ff->user_path));
- return 0;
+ backing_file_set_security(&ff->file, NULL);
+ return security_backing_file_alloc(&ff->file, user_file);
}
/*
@@ -273,7 +291,8 @@ static int init_backing_file(struct backing_file *ff)
* This is only for kernel internal use, and the allocate file must not be
* installed into file tables or such.
*/
-struct file *alloc_empty_backing_file(int flags, const struct cred *cred)
+struct file *alloc_empty_backing_file(int flags, const struct cred *cred,
+ const struct file *user_file)
{
struct backing_file *ff;
int error;
@@ -290,7 +309,7 @@ struct file *alloc_empty_backing_file(int flags, const struct cred *cred)
/* The f_mode flags must be set before fput(). */
ff->file.f_mode |= FMODE_BACKING | FMODE_NOACCOUNT;
- error = init_backing_file(ff);
+ error = init_backing_file(ff, user_file);
if (unlikely(error)) {
fput(&ff->file);
return ERR_PTR(error);
diff --git a/fs/internal.h b/fs/internal.h
index 78fcdad80e53..f48f5fa349c9 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -95,7 +95,8 @@ extern void chroot_fs_refs(const struct path *, const struct path *);
*/
struct file *alloc_empty_file(int flags, const struct cred *cred);
struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred);
-struct file *alloc_empty_backing_file(int flags, const struct cred *cred);
+struct file *alloc_empty_backing_file(int flags, const struct cred *cred,
+ const struct file *user_file);
void backing_file_set_user_path(struct file *f, const struct path *path);
static inline void file_put_write_access(struct file *file)
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 3d8539909f74..a5bc5dfc930f 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -51,7 +51,7 @@ static struct file *ovl_open_realfile(const struct file *file,
if (!inode_owner_or_capable(real_idmap, realinode))
flags &= ~O_NOATIME;
- realfile = backing_file_open(file_user_path(file),
+ realfile = backing_file_open(file,
flags, realpath, current_cred());
}
revert_creds(old_cred);
diff --git a/include/linux/backing-file.h b/include/linux/backing-file.h
index 3f1fe1774f1b..103b6992b80a 100644
--- a/include/linux/backing-file.h
+++ b/include/linux/backing-file.h
@@ -19,7 +19,7 @@ struct backing_file_ctx {
void (*end_write)(struct file *);
};
-struct file *backing_file_open(const struct path *user_path, int flags,
+struct file *backing_file_open(const struct file *user_file, int flags,
const struct path *real_path,
const struct cred *cred);
ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7ed9232f579d..a94f20ba2bf6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2588,6 +2588,19 @@ struct file *dentry_create(const struct path *path, int flags, umode_t mode,
const struct cred *cred);
struct path *backing_file_user_path(const struct file *f);
+#ifdef CONFIG_SECURITY
+void *backing_file_security(const struct file *f);
+void backing_file_set_security(struct file *f, void *security);
+#else
+static inline void *backing_file_security(const struct file *f)
+{
+ return NULL;
+}
+static inline void backing_file_set_security(struct file *f, void *security)
+{
+}
+#endif /* CONFIG_SECURITY */
+
/*
* file_user_path - get the path to display for memory mapped file
*
diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h
index 97a8b21eb033..c0a2839253fa 100644
--- a/include/linux/lsm_audit.h
+++ b/include/linux/lsm_audit.h
@@ -93,7 +93,7 @@ struct common_audit_data {
#endif
char *kmod_name;
struct lsm_ioctlop_audit *op;
- struct file *file;
+ const struct file *file;
struct lsm_ibpkey_audit *ibpkey;
struct lsm_ibendport_audit *ibendport;
int reason;
diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index b6fbb446bab7..304da2a90ba7 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -168,6 +168,9 @@ LSM_HOOK(int, 0, kernfs_init_security, struct kernfs_node *kn_dir,
LSM_HOOK(int, 0, file_permission, struct file *file, int mask)
LSM_HOOK(int, 0, file_alloc_security, struct file *file)
LSM_HOOK(void, LSM_RET_VOID, file_free_security, struct file *file)
+LSM_HOOK(int, 0, backing_file_alloc, struct file *backing_file,
+ const struct file *user_file)
+LSM_HOOK(void, LSM_RET_VOID, backing_file_free, struct file *backing_file)
LSM_HOOK(int, 0, file_ioctl, struct file *file, unsigned int cmd,
unsigned long arg)
LSM_HOOK(int, 0, file_ioctl_compat, struct file *file, unsigned int cmd,
@@ -175,6 +178,8 @@ LSM_HOOK(int, 0, file_ioctl_compat, struct file *file, unsigned int cmd,
LSM_HOOK(int, 0, mmap_addr, unsigned long addr)
LSM_HOOK(int, 0, mmap_file, struct file *file, unsigned long reqprot,
unsigned long prot, unsigned long flags)
+LSM_HOOK(int, 0, mmap_backing_file, struct vm_area_struct *vma,
+ struct file *backing_file, struct file *user_file)
LSM_HOOK(int, 0, file_mprotect, struct vm_area_struct *vma,
unsigned long reqprot, unsigned long prot)
LSM_HOOK(int, 0, file_lock, struct file *file, unsigned int cmd)
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 7577ecfc79e4..a16571929f7b 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -1637,6 +1637,7 @@ struct security_hook_list {
struct lsm_blob_sizes {
int lbs_cred;
int lbs_file;
+ int lbs_backing_file;
int lbs_inode;
int lbs_superblock;
int lbs_ipc;
diff --git a/include/linux/security.h b/include/linux/security.h
index d2888c127859..db02db9f623a 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -387,11 +387,17 @@ int security_kernfs_init_security(struct kernfs_node *kn_dir,
int security_file_permission(struct file *file, int mask);
int security_file_alloc(struct file *file);
void security_file_free(struct file *file);
+int security_backing_file_alloc(struct file *backing_file,
+ const struct file *user_file);
+void security_backing_file_free(struct file *backing_file);
int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
int security_file_ioctl_compat(struct file *file, unsigned int cmd,
unsigned long arg);
int security_mmap_file(struct file *file, unsigned long prot,
unsigned long flags);
+int security_mmap_backing_file(struct vm_area_struct *vma,
+ struct file *backing_file,
+ struct file *user_file);
int security_mmap_addr(unsigned long addr);
int security_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot,
unsigned long prot);
@@ -976,6 +982,15 @@ static inline int security_file_alloc(struct file *file)
static inline void security_file_free(struct file *file)
{ }
+static inline int security_backing_file_alloc(struct file *backing_file,
+ const struct file *user_file)
+{
+ return 0;
+}
+
+static inline void security_backing_file_free(struct file *backing_file)
+{ }
+
static inline int security_file_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
@@ -995,6 +1010,13 @@ static inline int security_mmap_file(struct file *file, unsigned long prot,
return 0;
}
+static inline int security_mmap_backing_file(struct vm_area_struct *vma,
+ struct file *backing_file,
+ struct file *user_file)
+{
+ return 0;
+}
+
static inline int security_mmap_addr(unsigned long addr)
{
return cap_mmap_addr(addr);
diff --git a/security/security.c b/security/security.c
index 1e63f23a504a..27a309ab0b97 100644
--- a/security/security.c
+++ b/security/security.c
@@ -89,6 +89,7 @@ const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = {
static BLOCKING_NOTIFIER_HEAD(blocking_lsm_notifier_chain);
static struct kmem_cache *lsm_file_cache;
+static struct kmem_cache *lsm_backing_file_cache;
static struct kmem_cache *lsm_inode_cache;
char *lsm_names;
@@ -260,6 +261,8 @@ static void __init lsm_set_blob_sizes(struct lsm_blob_sizes *needed)
lsm_set_blob_size(&needed->lbs_cred, &blob_sizes.lbs_cred);
lsm_set_blob_size(&needed->lbs_file, &blob_sizes.lbs_file);
+ lsm_set_blob_size(&needed->lbs_backing_file,
+ &blob_sizes.lbs_backing_file);
/*
* The inode blob gets an rcu_head in addition to
* what the modules might need.
@@ -447,14 +450,15 @@ static void __init ordered_lsm_init(void)
report_lsm_order();
- init_debug("cred blob size = %d\n", blob_sizes.lbs_cred);
- init_debug("file blob size = %d\n", blob_sizes.lbs_file);
- init_debug("inode blob size = %d\n", blob_sizes.lbs_inode);
- init_debug("ipc blob size = %d\n", blob_sizes.lbs_ipc);
- init_debug("msg_msg blob size = %d\n", blob_sizes.lbs_msg_msg);
- init_debug("superblock blob size = %d\n", blob_sizes.lbs_superblock);
- init_debug("task blob size = %d\n", blob_sizes.lbs_task);
- init_debug("xattr slots = %d\n", blob_sizes.lbs_xattr_count);
+ init_debug("cred blob size = %d\n", blob_sizes.lbs_cred);
+ init_debug("file blob size = %d\n", blob_sizes.lbs_file);
+ init_debug("backing_file blob size = %d\n", blob_sizes.lbs_backing_file);
+ init_debug("inode blob size = %d\n", blob_sizes.lbs_inode);
+ init_debug("ipc blob size = %d\n", blob_sizes.lbs_ipc);
+ init_debug("msg_msg blob size = %d\n", blob_sizes.lbs_msg_msg);
+ init_debug("superblock blob size = %d\n", blob_sizes.lbs_superblock);
+ init_debug("task blob size = %d\n", blob_sizes.lbs_task);
+ init_debug("xattr slots = %d\n", blob_sizes.lbs_xattr_count);
/*
* Create any kmem_caches needed for blobs
@@ -463,6 +467,11 @@ static void __init ordered_lsm_init(void)
lsm_file_cache = kmem_cache_create("lsm_file_cache",
blob_sizes.lbs_file, 0,
SLAB_PANIC, NULL);
+ if (blob_sizes.lbs_backing_file)
+ lsm_backing_file_cache = kmem_cache_create(
+ "lsm_backing_file_cache",
+ blob_sizes.lbs_backing_file,
+ 0, SLAB_PANIC, NULL);
if (blob_sizes.lbs_inode)
lsm_inode_cache = kmem_cache_create("lsm_inode_cache",
blob_sizes.lbs_inode, 0,
@@ -644,6 +653,30 @@ int unregister_blocking_lsm_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL(unregister_blocking_lsm_notifier);
+/**
+ * lsm_backing_file_alloc - allocate a composite backing file blob
+ * @backing_file: the backing file
+ *
+ * Allocate the backing file blob for all the modules.
+ *
+ * Returns 0, or -ENOMEM if memory can't be allocated.
+ */
+static int lsm_backing_file_alloc(struct file *backing_file)
+{
+ void *blob;
+
+ if (!lsm_backing_file_cache) {
+ backing_file_set_security(backing_file, NULL);
+ return 0;
+ }
+
+ blob = kmem_cache_zalloc(lsm_backing_file_cache, GFP_KERNEL);
+ backing_file_set_security(backing_file, blob);
+ if (!blob)
+ return -ENOMEM;
+ return 0;
+}
+
/**
* lsm_blob_alloc - allocate a composite blob
* @dest: the destination for the blob
@@ -1689,6 +1722,57 @@ void security_file_free(struct file *file)
}
}
+/**
+ * security_backing_file_alloc() - Allocate and setup a backing file blob
+ * @backing_file: the backing file
+ * @user_file: the associated user visible file
+ *
+ * Allocate a backing file LSM blob and perform any necessary initialization of
+ * the LSM blob. There will be some operations where the LSM will not have
+ * access to @user_file after this point, so any important state associated
+ * with @user_file that is important to the LSM should be captured in the
+ * backing file's LSM blob.
+ *
+ * LSM's should avoid taking a reference to @user_file in this hook as it will
+ * result in problems later when the system attempts to drop/put the file
+ * references due to a circular dependency.
+ *
+ * Return: Return 0 if the hook is successful, negative values otherwise.
+ */
+int security_backing_file_alloc(struct file *backing_file,
+ const struct file *user_file)
+{
+ int rc;
+
+ rc = lsm_backing_file_alloc(backing_file);
+ if (rc)
+ return rc;
+ rc = call_int_hook(backing_file_alloc, backing_file, user_file);
+ if (unlikely(rc))
+ security_backing_file_free(backing_file);
+
+ return rc;
+}
+
+/**
+ * security_backing_file_free() - Free a backing file blob
+ * @backing_file: the backing file
+ *
+ * Free any LSM state associate with a backing file's LSM blob, including the
+ * blob itself.
+ */
+void security_backing_file_free(struct file *backing_file)
+{
+ void *blob = backing_file_security(backing_file);
+
+ call_void_hook(backing_file_free, backing_file);
+
+ if (blob) {
+ backing_file_set_security(backing_file, NULL);
+ kmem_cache_free(lsm_backing_file_cache, blob);
+ }
+}
+
int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
return call_int_hook(file_ioctl, file, cmd, arg);
@@ -1757,6 +1841,32 @@ int security_mmap_file(struct file *file, unsigned long prot,
return ima_file_mmap(file, prot);
}
+/**
+ * security_mmap_backing_file - Check if mmap'ing a backing file is allowed
+ * @vma: the vm_area_struct for the mmap'd region
+ * @backing_file: the backing file being mmap'd
+ * @user_file: the user file being mmap'd
+ *
+ * Check permissions for a mmap operation on a stacked filesystem. This hook
+ * is called after the security_mmap_file() and is responsible for authorizing
+ * the mmap on @backing_file. It is important to note that the mmap operation
+ * on @user_file has already been authorized and the @vma->vm_file has been
+ * set to @backing_file.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
+int security_mmap_backing_file(struct vm_area_struct *vma,
+ struct file *backing_file,
+ struct file *user_file)
+{
+ /* recommended by the stackable filesystem devs */
+ if (WARN_ON_ONCE(!(backing_file->f_mode & FMODE_BACKING)))
+ return -EIO;
+
+ return call_int_hook(mmap_backing_file, vma, backing_file, user_file);
+}
+EXPORT_SYMBOL_GPL(security_mmap_backing_file);
+
int security_mmap_addr(unsigned long addr)
{
return call_int_hook(mmap_addr, addr);
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,444 @@
From df2d263e4dcf6739be4c4b51d7e1f6c1d3316200 Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace@redhat.com>
Date: Wed, 3 Jun 2026 13:01:38 +0200
Subject: [PATCH] selinux: fix overlayfs mmap() and mprotect() access checks
JIRA: https://issues.redhat.com/browse/RHEL-179443
CVE: CVE-2026-46054
Conflicts:
- security/selinux/hooks.c:
- context fuzz
- preserve passing &selinux_state to avc_has_perm()
- preserve honoring the checkreqprot setting (in case of mmap
backing file check it is ignored, but that's the best we can do -
at worst some access would be denied on overlayfs in extremely
exotic use cases)
- security/selinux/include/objsec.h: context fuzz
commit 82544d36b1729153c8aeb179e84750f0c085d3b1
Author: Paul Moore <paul@paul-moore.com>
Date: Thu Jan 1 17:19:18 2026 -0500
selinux: fix overlayfs mmap() and mprotect() access checks
The existing SELinux security model for overlayfs is to allow access if
the current task is able to access the top level file (the "user" file)
and the mounter's credentials are sufficient to access the lower
level file (the "backing" file). Unfortunately, the current code does
not properly enforce these access controls for both mmap() and mprotect()
operations on overlayfs filesystems.
This patch makes use of the newly created security_mmap_backing_file()
LSM hook to provide the missing backing file enforcement for mmap()
operations, and leverages the backing file API and new LSM blob to
provide the necessary information to properly enforce the mprotect()
access controls.
Cc: stable@vger.kernel.org
Acked-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index deacc9a63fae..fb5b9cb027d0 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1706,50 +1706,76 @@ static inline int file_path_has_perm(const struct cred *cred,
static int bpf_fd_pass(const struct file *file, u32 sid);
#endif
-/* Check whether a task can use an open file descriptor to
- access an inode in a given way. Check access to the
- descriptor itself, and then use dentry_has_perm to
- check a particular permission to the file.
- Access to the descriptor is implicitly granted if it
- has the same SID as the process. If av is zero, then
- access to the file is not checked, e.g. for cases
- where only the descriptor is affected like seek. */
-static int file_has_perm(const struct cred *cred,
- struct file *file,
- u32 av)
+static int __file_has_perm(const struct cred *cred, const struct file *file,
+ u32 av, bool bf_user_file)
+
{
- struct file_security_struct *fsec = selinux_file(file);
- struct inode *inode = file_inode(file);
struct common_audit_data ad;
- u32 sid = cred_sid(cred);
+ struct inode *inode;
+ u32 ssid = cred_sid(cred);
+ u32 tsid_fd;
int rc;
- ad.type = LSM_AUDIT_DATA_FILE;
- ad.u.file = file;
+ if (bf_user_file) {
+ struct backing_file_security_struct *bfsec;
+ const struct path *path;
- if (sid != fsec->sid) {
+ if (WARN_ON(!(file->f_mode & FMODE_BACKING)))
+ return -EIO;
+
+ bfsec = selinux_backing_file(file);
+ path = backing_file_user_path(file);
+ tsid_fd = bfsec->uf_sid;
+ inode = d_inode(path->dentry);
+
+ ad.type = LSM_AUDIT_DATA_PATH;
+ ad.u.path = *path;
+ } else {
+ struct file_security_struct *fsec = selinux_file(file);
+
+ tsid_fd = fsec->sid;
+ inode = file_inode(file);
+
+ ad.type = LSM_AUDIT_DATA_FILE;
+ ad.u.file = file;
+ }
+
+ if (ssid != tsid_fd) {
rc = avc_has_perm(&selinux_state,
- sid, fsec->sid,
+ ssid, tsid_fd,
SECCLASS_FD,
FD__USE,
&ad);
if (rc)
- goto out;
+ return rc;
}
#ifdef CONFIG_BPF_SYSCALL
- rc = bpf_fd_pass(file, cred_sid(cred));
+ /* regardless of backing vs user file, use the underlying file here */
+ rc = bpf_fd_pass(file, ssid);
if (rc)
return rc;
#endif
/* av is zero if only checking access to the descriptor. */
- rc = 0;
if (av)
- rc = inode_has_perm(cred, inode, av, &ad);
+ return inode_has_perm(cred, inode, av, &ad);
-out:
- return rc;
+ return 0;
+}
+
+/* Check whether a task can use an open file descriptor to
+ access an inode in a given way. Check access to the
+ descriptor itself, and then use dentry_has_perm to
+ check a particular permission to the file.
+ Access to the descriptor is implicitly granted if it
+ has the same SID as the process. If av is zero, then
+ access to the file is not checked, e.g. for cases
+ where only the descriptor is affected like seek. */
+static inline int file_has_perm(const struct cred *cred,
+ const struct file *file, u32 av)
+{
+ return __file_has_perm(cred, file, av, false);
}
/*
@@ -3646,6 +3672,17 @@ static int selinux_file_alloc_security(struct file *file)
return 0;
}
+static int selinux_backing_file_alloc(struct file *backing_file,
+ const struct file *user_file)
+{
+ struct backing_file_security_struct *bfsec;
+
+ bfsec = selinux_backing_file(backing_file);
+ bfsec->uf_sid = selinux_file(user_file)->sid;
+
+ return 0;
+}
+
/*
* Check whether a task has the ioctl permission and cmd
* operation to an inode.
@@ -3759,43 +3796,56 @@ static int selinux_file_ioctl_compat(struct file *file, unsigned int cmd,
static int default_noexec __ro_after_init;
-static int file_map_prot_check(struct file *file, unsigned long prot, int shared)
+static int __file_map_prot_check(const struct cred *cred,
+ const struct file *file, unsigned long prot,
+ bool shared, bool bf_user_file)
{
- const struct cred *cred = current_cred();
- u32 sid = cred_sid(cred);
- int rc = 0;
+ struct inode *inode = NULL;
+ bool prot_exec = prot & PROT_EXEC;
+ bool prot_write = prot & PROT_WRITE;
+
+ if (file) {
+ if (bf_user_file)
+ inode = d_inode(backing_file_user_path(file)->dentry);
+ else
+ inode = file_inode(file);
+ }
+
+ if (default_noexec && prot_exec &&
+ (!file || IS_PRIVATE(inode) || (!shared && prot_write))) {
+ int rc;
+ u32 sid = cred_sid(cred);
- if (default_noexec &&
- (prot & PROT_EXEC) && (!file || IS_PRIVATE(file_inode(file)) ||
- (!shared && (prot & PROT_WRITE)))) {
/*
- * We are making executable an anonymous mapping or a
- * private file mapping that will also be writable.
- * This has an additional check.
+ * We are making executable an anonymous mapping or a private
+ * file mapping that will also be writable.
*/
rc = avc_has_perm(&selinux_state,
- sid, sid, SECCLASS_PROCESS,
- PROCESS__EXECMEM, NULL);
+ sid, sid, SECCLASS_PROCESS, PROCESS__EXECMEM,
+ NULL);
if (rc)
- goto error;
+ return rc;
}
if (file) {
- /* read access is always possible with a mapping */
+ /* "read" always possible, "write" only if shared */
u32 av = FILE__READ;
-
- /* write access only matters if the mapping is shared */
- if (shared && (prot & PROT_WRITE))
+ if (shared && prot_write)
av |= FILE__WRITE;
-
- if (prot & PROT_EXEC)
+ if (prot_exec)
av |= FILE__EXECUTE;
- return file_has_perm(cred, file, av);
+ return __file_has_perm(cred, file, av, bf_user_file);
}
-error:
- return rc;
+ return 0;
+}
+
+static inline int file_map_prot_check(const struct cred *cred,
+ const struct file *file,
+ unsigned long prot, bool shared)
+{
+ return __file_map_prot_check(cred, file, prot, shared, false);
}
static int selinux_mmap_addr(unsigned long addr)
@@ -3812,17 +3862,17 @@ static int selinux_mmap_addr(unsigned long addr)
return rc;
}
-static int selinux_mmap_file(struct file *file, unsigned long reqprot,
- unsigned long prot, unsigned long flags)
+static int selinux_mmap_file_common(const struct cred *cred, struct file *file,
+ unsigned long reqprot, unsigned long prot,
+ bool shared)
{
- struct common_audit_data ad;
- int rc;
-
if (file) {
+ int rc;
+ struct common_audit_data ad;
+
ad.type = LSM_AUDIT_DATA_FILE;
ad.u.file = file;
- rc = inode_has_perm(current_cred(), file_inode(file),
- FILE__MAP, &ad);
+ rc = inode_has_perm(cred, file_inode(file), FILE__MAP, &ad);
if (rc)
return rc;
}
@@ -3830,23 +3880,68 @@ static int selinux_mmap_file(struct file *file, unsigned long reqprot,
if (checkreqprot_get(&selinux_state))
prot = reqprot;
- return file_map_prot_check(file, prot,
- (flags & MAP_TYPE) == MAP_SHARED);
+ return file_map_prot_check(cred, file, prot, shared);
+}
+
+static int selinux_mmap_file(struct file *file, unsigned long reqprot,
+ unsigned long prot, unsigned long flags)
+{
+ return selinux_mmap_file_common(current_cred(), file, reqprot, prot,
+ (flags & MAP_TYPE) == MAP_SHARED);
+}
+
+/**
+ * selinux_mmap_backing_file - Check mmap permissions on a backing file
+ * @vma: memory region
+ * @backing_file: stacked filesystem backing file
+ * @user_file: user visible file
+ *
+ * This is called after selinux_mmap_file() on stacked filesystems, and it
+ * is this function's responsibility to verify access to @backing_file and
+ * setup the SELinux state for possible later use in the mprotect() code path.
+ *
+ * By the time this function is called, mmap() access to @user_file has already
+ * been authorized and @vma->vm_file has been set to point to @backing_file.
+ *
+ * Return zero on success, negative values otherwise.
+ */
+static int selinux_mmap_backing_file(struct vm_area_struct *vma,
+ struct file *backing_file,
+ struct file *user_file __always_unused)
+{
+ unsigned long prot = 0;
+
+ /* translate vma->vm_flags perms into PROT perms */
+ if (vma->vm_flags & VM_READ)
+ prot |= PROT_READ;
+ if (vma->vm_flags & VM_WRITE)
+ prot |= PROT_WRITE;
+ if (vma->vm_flags & VM_EXEC)
+ prot |= PROT_EXEC;
+
+ return selinux_mmap_file_common(backing_file->f_cred, backing_file,
+ prot, prot, vma->vm_flags & VM_SHARED);
}
static int selinux_file_mprotect(struct vm_area_struct *vma,
unsigned long reqprot,
unsigned long prot)
{
+ int rc;
const struct cred *cred = current_cred();
u32 sid = cred_sid(cred);
+ const struct file *file = vma->vm_file;
+ bool backing_file;
+ bool shared = vma->vm_flags & VM_SHARED;
+
+ /* check if we need to trigger the "backing files are awful" mode */
+ backing_file = file && (file->f_mode & FMODE_BACKING);
if (checkreqprot_get(&selinux_state))
prot = reqprot;
if (default_noexec &&
(prot & PROT_EXEC) && !(vma->vm_flags & VM_EXEC)) {
- int rc = 0;
/*
* We don't use the vma_is_initial_heap() helper as it has
* a history of problems and is currently broken on systems
@@ -3861,12 +3956,16 @@ static int selinux_file_mprotect(struct vm_area_struct *vma,
rc = avc_has_perm(&selinux_state,
sid, sid, SECCLASS_PROCESS,
PROCESS__EXECHEAP, NULL);
- } else if (!vma->vm_file && (vma_is_initial_stack(vma) ||
+ if (rc)
+ return rc;
+ } else if (!file && (vma_is_initial_stack(vma) ||
vma_is_stack_for_current(vma))) {
rc = avc_has_perm(&selinux_state,
sid, sid, SECCLASS_PROCESS,
PROCESS__EXECSTACK, NULL);
- } else if (vma->vm_file && vma->anon_vma) {
+ if (rc)
+ return rc;
+ } else if (file && vma->anon_vma) {
/*
* We are making executable a file mapping that has
* had some COW done. Since pages might have been
@@ -3874,13 +3973,29 @@ static int selinux_file_mprotect(struct vm_area_struct *vma,
* modified content. This typically should only
* occur for text relocations.
*/
- rc = file_has_perm(cred, vma->vm_file, FILE__EXECMOD);
+ rc = __file_has_perm(cred, file, FILE__EXECMOD,
+ backing_file);
+ if (rc)
+ return rc;
+ if (backing_file) {
+ rc = file_has_perm(file->f_cred, file,
+ FILE__EXECMOD);
+ if (rc)
+ return rc;
+ }
}
+ }
+
+ rc = __file_map_prot_check(cred, file, prot, shared, backing_file);
+ if (rc)
+ return rc;
+ if (backing_file) {
+ rc = file_map_prot_check(file->f_cred, file, prot, shared);
if (rc)
return rc;
}
- return file_map_prot_check(vma->vm_file, prot, vma->vm_flags&VM_SHARED);
+ return 0;
}
static int selinux_file_lock(struct file *file, unsigned int cmd)
@@ -7007,6 +7122,7 @@ static void selinux_bpf_token_free(struct bpf_token *token)
struct lsm_blob_sizes selinux_blob_sizes __ro_after_init = {
.lbs_cred = sizeof(struct task_security_struct),
.lbs_file = sizeof(struct file_security_struct),
+ .lbs_backing_file = sizeof(struct backing_file_security_struct),
.lbs_inode = sizeof(struct inode_security_struct),
.lbs_ipc = sizeof(struct ipc_security_struct),
.lbs_msg_msg = sizeof(struct msg_security_struct),
@@ -7216,9 +7332,11 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = {
LSM_HOOK_INIT(file_permission, selinux_file_permission),
LSM_HOOK_INIT(file_alloc_security, selinux_file_alloc_security),
+ LSM_HOOK_INIT(backing_file_alloc, selinux_backing_file_alloc),
LSM_HOOK_INIT(file_ioctl, selinux_file_ioctl),
LSM_HOOK_INIT(file_ioctl_compat, selinux_file_ioctl_compat),
LSM_HOOK_INIT(mmap_file, selinux_mmap_file),
+ LSM_HOOK_INIT(mmap_backing_file, selinux_mmap_backing_file),
LSM_HOOK_INIT(mmap_addr, selinux_mmap_addr),
LSM_HOOK_INIT(file_mprotect, selinux_file_mprotect),
LSM_HOOK_INIT(file_lock, selinux_file_lock),
diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h
index 2953132408bf..b1c5a2877f7e 100644
--- a/security/selinux/include/objsec.h
+++ b/security/selinux/include/objsec.h
@@ -60,6 +60,10 @@ struct file_security_struct {
u32 pseqno; /* Policy seqno at the time of file open */
};
+struct backing_file_security_struct {
+ u32 uf_sid; /* associated user file fsec->sid */
+};
+
struct superblock_security_struct {
u32 sid; /* SID of file system superblock */
u32 def_sid; /* default SID for labeling */
@@ -158,6 +162,13 @@ static inline struct file_security_struct *selinux_file(const struct file *file)
return file->f_security + selinux_blob_sizes.lbs_file;
}
+static inline struct backing_file_security_struct *
+selinux_backing_file(const struct file *backing_file)
+{
+ void *blob = backing_file_security(backing_file);
+ return blob + selinux_blob_sizes.lbs_backing_file;
+}
+
static inline struct inode_security_struct *selinux_inode(
const struct inode *inode)
{
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,130 @@
From ec2a2e4b876c7faed3de5e85406180810cc8539a Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace@redhat.com>
Date: Tue, 16 Jun 2026 10:06:13 +0200
Subject: [PATCH] selinux: RHEL-only hotfix for execmem regression
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
JIRA: https://redhat.atlassian.net/browse/RHEL-185118
Upstream Status: RHEL9-only
As discovered by image-mode/CoreOS testing, the fixes for CVE-2026-46054
caused a regression that results in unexpected execmem denials in
specific scenarios involving overlayfs (or another stacked filesystem).
Specifically in case of image mode / CoreOS there is often (always?) an
overlayfs filesystem mounted during early boot (before SELinux policy is
loaded), which means that overlayfs captures the kernels SELinux
context as part of the mounter credentials, which are later used by
overlayfs+SELinux to verify that file accesses through the overlay mount
dont give the mounter a way to access underlying files it otherwise
wouldnt have access to. This verification would normally pass, as the
policy grants the kernel context almost unrestricted access to the
filesystem. However, the new checks added to fix CVE-2026-46054
erroneously include the execmem check for the mounter and in the policy
kernel_t doesnt have the execmem permission, so mmapping an overlay
file with MAP_PRIVATE and PROT_WRITE|PROT_EXEC would now result in a
SELinux denial.
Fix this by passing a boolean through the helper functions that allows
to distinguish the direct permission check from the mounter check and
skipping the execmem check in the mounter case.
This is a transient RHEL-only fix to allow the CVE fix to go through
without breaking image mode/CoreOS deployments. Once an optimal solution
is figured out and applied upstream, this commit will be reverted and
replaced with the upstream fix (at least in Y-streams). I expect the
upstream solution to be functionally equivalent, though probably
cosmetically different.
Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index fb5b9cb027d0..b31b06d4440b 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3798,7 +3798,7 @@ static int default_noexec __ro_after_init;
static int __file_map_prot_check(const struct cred *cred,
const struct file *file, unsigned long prot,
- bool shared, bool bf_user_file)
+ bool shared, bool mounter, bool bf_user_file)
{
struct inode *inode = NULL;
bool prot_exec = prot & PROT_EXEC;
@@ -3812,7 +3812,7 @@ static int __file_map_prot_check(const struct cred *cred,
}
if (default_noexec && prot_exec &&
- (!file || IS_PRIVATE(inode) || (!shared && prot_write))) {
+ (!file || IS_PRIVATE(inode) || (!shared && prot_write)) && !mounter) {
int rc;
u32 sid = cred_sid(cred);
@@ -3843,9 +3843,9 @@ static int __file_map_prot_check(const struct cred *cred,
static inline int file_map_prot_check(const struct cred *cred,
const struct file *file,
- unsigned long prot, bool shared)
+ unsigned long prot, bool shared, bool mounter)
{
- return __file_map_prot_check(cred, file, prot, shared, false);
+ return __file_map_prot_check(cred, file, prot, shared, mounter, false);
}
static int selinux_mmap_addr(unsigned long addr)
@@ -3864,7 +3864,7 @@ static int selinux_mmap_addr(unsigned long addr)
static int selinux_mmap_file_common(const struct cred *cred, struct file *file,
unsigned long reqprot, unsigned long prot,
- bool shared)
+ bool shared, bool mounter)
{
if (file) {
int rc;
@@ -3880,14 +3880,15 @@ static int selinux_mmap_file_common(const struct cred *cred, struct file *file,
if (checkreqprot_get(&selinux_state))
prot = reqprot;
- return file_map_prot_check(cred, file, prot, shared);
+ return file_map_prot_check(cred, file, prot, shared, mounter);
}
static int selinux_mmap_file(struct file *file, unsigned long reqprot,
unsigned long prot, unsigned long flags)
{
return selinux_mmap_file_common(current_cred(), file, reqprot, prot,
- (flags & MAP_TYPE) == MAP_SHARED);
+ (flags & MAP_TYPE) == MAP_SHARED,
+ false);
}
/**
@@ -3920,7 +3921,8 @@ static int selinux_mmap_backing_file(struct vm_area_struct *vma,
prot |= PROT_EXEC;
return selinux_mmap_file_common(backing_file->f_cred, backing_file,
- prot, prot, vma->vm_flags & VM_SHARED);
+ prot, prot, vma->vm_flags & VM_SHARED,
+ true);
}
static int selinux_file_mprotect(struct vm_area_struct *vma,
@@ -3986,11 +3988,11 @@ static int selinux_file_mprotect(struct vm_area_struct *vma,
}
}
- rc = __file_map_prot_check(cred, file, prot, shared, backing_file);
+ rc = __file_map_prot_check(cred, file, prot, shared, false, backing_file);
if (rc)
return rc;
if (backing_file) {
- rc = file_map_prot_check(file->f_cred, file, prot, shared);
+ rc = file_map_prot_check(file->f_cred, file, prot, shared, true);
if (rc)
return rc;
}
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,323 @@
From 0ba00ffd79bdb243b0067aa95fe5846b6c1ecbe7 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:41:58 -0400
Subject: [PATCH] net/mlx5: HWS, Fix matcher action template attach
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit 36ef2575e78d1a3c699dc3f1c9dee9be742c9bdd
Author: Vlad Dogaru <vdogaru@nvidia.com>
Date: Thu Apr 10 22:17:31 2025 +0300
net/mlx5: HWS, Fix matcher action template attach
The procedure of attaching an action template to an existing matcher had
a few issues:
1. Attaching accidentally overran the `at` array in bwc_matcher, which
would result in memory corruption. This bug wasn't triggered, but it
is possible to trigger it by attaching action templates beyond the
initial buffer size of 8. Fix this by converting to a dynamically
sized buffer and reallocating if needed.
2. Similarly, the `at` array inside the native matcher was never
reallocated. Fix this the same as above.
3. The bwc layer treated any error in action template attach as a signal
that the matcher should be rehashed to account for a larger number of
action STEs. In reality, there are other unrelated errors that can
arise and they should be propagated upstack. Fix this by adding a
`need_rehash` output parameter that's orthogonal to error codes.
Fixes: 2111bb970c78 ("net/mlx5: HWS, added backward-compatible API handling")
Signed-off-by: Vlad Dogaru <vdogaru@nvidia.com>
Reviewed-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Michal Kubiak <michal.kubiak@intel.com>
Link: https://patch.msgid.link/1744312662-356571-2-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c
index 19dce1ba512d..32de8bfc7644 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c
@@ -90,13 +90,19 @@ int mlx5hws_bwc_matcher_create_simple(struct mlx5hws_bwc_matcher *bwc_matcher,
bwc_matcher->priority = priority;
bwc_matcher->size_log = MLX5HWS_BWC_MATCHER_INIT_SIZE_LOG;
+ bwc_matcher->size_of_at_array = MLX5HWS_BWC_MATCHER_ATTACH_AT_NUM;
+ bwc_matcher->at = kcalloc(bwc_matcher->size_of_at_array,
+ sizeof(*bwc_matcher->at), GFP_KERNEL);
+ if (!bwc_matcher->at)
+ goto free_bwc_matcher_rules;
+
/* create dummy action template */
bwc_matcher->at[0] =
mlx5hws_action_template_create(action_types ?
action_types : init_action_types);
if (!bwc_matcher->at[0]) {
mlx5hws_err(table->ctx, "BWC matcher: failed creating action template\n");
- goto free_bwc_matcher_rules;
+ goto free_bwc_matcher_at_array;
}
bwc_matcher->num_of_at = 1;
@@ -126,6 +132,8 @@ int mlx5hws_bwc_matcher_create_simple(struct mlx5hws_bwc_matcher *bwc_matcher,
mlx5hws_match_template_destroy(bwc_matcher->mt);
free_at:
mlx5hws_action_template_destroy(bwc_matcher->at[0]);
+free_bwc_matcher_at_array:
+ kfree(bwc_matcher->at);
free_bwc_matcher_rules:
kfree(bwc_matcher->rules);
err:
@@ -192,6 +200,7 @@ int mlx5hws_bwc_matcher_destroy_simple(struct mlx5hws_bwc_matcher *bwc_matcher)
for (i = 0; i < bwc_matcher->num_of_at; i++)
mlx5hws_action_template_destroy(bwc_matcher->at[i]);
+ kfree(bwc_matcher->at);
mlx5hws_match_template_destroy(bwc_matcher->mt);
kfree(bwc_matcher->rules);
@@ -520,6 +529,23 @@ hws_bwc_matcher_extend_at(struct mlx5hws_bwc_matcher *bwc_matcher,
struct mlx5hws_rule_action rule_actions[])
{
enum mlx5hws_action_type action_types[MLX5HWS_BWC_MAX_ACTS];
+ void *p;
+
+ if (unlikely(bwc_matcher->num_of_at >= bwc_matcher->size_of_at_array)) {
+ if (bwc_matcher->size_of_at_array >= MLX5HWS_MATCHER_MAX_AT)
+ return -ENOMEM;
+ bwc_matcher->size_of_at_array *= 2;
+ p = krealloc(bwc_matcher->at,
+ bwc_matcher->size_of_at_array *
+ sizeof(*bwc_matcher->at),
+ __GFP_ZERO | GFP_KERNEL);
+ if (!p) {
+ bwc_matcher->size_of_at_array /= 2;
+ return -ENOMEM;
+ }
+
+ bwc_matcher->at = p;
+ }
hws_bwc_rule_actions_to_action_types(rule_actions, action_types);
@@ -777,6 +803,7 @@ int mlx5hws_bwc_rule_create_simple(struct mlx5hws_bwc_rule *bwc_rule,
struct mlx5hws_rule_attr rule_attr;
struct mutex *queue_lock; /* Protect the queue */
u32 num_of_rules;
+ bool need_rehash;
int ret = 0;
int at_idx;
@@ -803,10 +830,14 @@ int mlx5hws_bwc_rule_create_simple(struct mlx5hws_bwc_rule *bwc_rule,
at_idx = bwc_matcher->num_of_at - 1;
ret = mlx5hws_matcher_attach_at(bwc_matcher->matcher,
- bwc_matcher->at[at_idx]);
+ bwc_matcher->at[at_idx],
+ &need_rehash);
if (unlikely(ret)) {
- /* Action template attach failed, possibly due to
- * requiring more action STEs.
+ hws_bwc_unlock_all_queues(ctx);
+ return ret;
+ }
+ if (unlikely(need_rehash)) {
+ /* The new action template requires more action STEs.
* Need to attempt creating new matcher with all
* the action templates, including the new one.
*/
@@ -942,6 +973,7 @@ hws_bwc_rule_action_update(struct mlx5hws_bwc_rule *bwc_rule,
struct mlx5hws_context *ctx = bwc_matcher->matcher->tbl->ctx;
struct mlx5hws_rule_attr rule_attr;
struct mutex *queue_lock; /* Protect the queue */
+ bool need_rehash;
int at_idx, ret;
u16 idx;
@@ -973,12 +1005,17 @@ hws_bwc_rule_action_update(struct mlx5hws_bwc_rule *bwc_rule,
at_idx = bwc_matcher->num_of_at - 1;
ret = mlx5hws_matcher_attach_at(bwc_matcher->matcher,
- bwc_matcher->at[at_idx]);
+ bwc_matcher->at[at_idx],
+ &need_rehash);
if (unlikely(ret)) {
- /* Action template attach failed, possibly due to
- * requiring more action STEs.
- * Need to attempt creating new matcher with all
- * the action templates, including the new one.
+ hws_bwc_unlock_all_queues(ctx);
+ return ret;
+ }
+ if (unlikely(need_rehash)) {
+ /* The new action template requires more action
+ * STEs. Need to attempt creating new matcher
+ * with all the action templates, including the
+ * new one.
*/
ret = hws_bwc_matcher_rehash_at(bwc_matcher);
if (unlikely(ret)) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h
index 47f7ed141553..bb0cf4b922ce 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h
@@ -10,9 +10,7 @@
#define MLX5HWS_BWC_MATCHER_REHASH_BURST_TH 32
/* Max number of AT attach operations for the same matcher.
- * When the limit is reached, next attempt to attach new AT
- * will result in creation of a new matcher and moving all
- * the rules to this matcher.
+ * When the limit is reached, a larger buffer is allocated for the ATs.
*/
#define MLX5HWS_BWC_MATCHER_ATTACH_AT_NUM 8
@@ -23,10 +21,11 @@
struct mlx5hws_bwc_matcher {
struct mlx5hws_matcher *matcher;
struct mlx5hws_match_template *mt;
- struct mlx5hws_action_template *at[MLX5HWS_BWC_MATCHER_ATTACH_AT_NUM];
- u32 priority;
+ struct mlx5hws_action_template **at;
u8 num_of_at;
+ u8 size_of_at_array;
u8 size_log;
+ u32 priority;
atomic_t num_of_rules;
struct list_head *rules;
};
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c
index b61864b32053..37a4497048a6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c
@@ -905,18 +905,48 @@ static int hws_matcher_uninit(struct mlx5hws_matcher *matcher)
return 0;
}
+static int hws_matcher_grow_at_array(struct mlx5hws_matcher *matcher)
+{
+ void *p;
+
+ if (matcher->size_of_at_array >= MLX5HWS_MATCHER_MAX_AT)
+ return -ENOMEM;
+
+ matcher->size_of_at_array *= 2;
+ p = krealloc(matcher->at,
+ matcher->size_of_at_array * sizeof(*matcher->at),
+ __GFP_ZERO | GFP_KERNEL);
+ if (!p) {
+ matcher->size_of_at_array /= 2;
+ return -ENOMEM;
+ }
+
+ matcher->at = p;
+
+ return 0;
+}
+
int mlx5hws_matcher_attach_at(struct mlx5hws_matcher *matcher,
- struct mlx5hws_action_template *at)
+ struct mlx5hws_action_template *at,
+ bool *need_rehash)
{
bool is_jumbo = mlx5hws_matcher_mt_is_jumbo(matcher->mt);
struct mlx5hws_context *ctx = matcher->tbl->ctx;
u32 required_stes;
int ret;
- if (!matcher->attr.max_num_of_at_attach) {
- mlx5hws_dbg(ctx, "Num of current at (%d) exceed allowed value\n",
- matcher->num_of_at);
- return -EOPNOTSUPP;
+ *need_rehash = false;
+
+ if (unlikely(matcher->num_of_at >= matcher->size_of_at_array)) {
+ ret = hws_matcher_grow_at_array(matcher);
+ if (ret)
+ return ret;
+
+ if (matcher->col_matcher) {
+ ret = hws_matcher_grow_at_array(matcher->col_matcher);
+ if (ret)
+ return ret;
+ }
}
ret = hws_matcher_check_and_process_at(matcher, at);
@@ -927,12 +957,11 @@ int mlx5hws_matcher_attach_at(struct mlx5hws_matcher *matcher,
if (matcher->action_ste.max_stes < required_stes) {
mlx5hws_dbg(ctx, "Required STEs [%d] exceeds initial action template STE [%d]\n",
required_stes, matcher->action_ste.max_stes);
- return -ENOMEM;
+ *need_rehash = true;
}
matcher->at[matcher->num_of_at] = *at;
matcher->num_of_at += 1;
- matcher->attr.max_num_of_at_attach -= 1;
if (matcher->col_matcher)
matcher->col_matcher->num_of_at = matcher->num_of_at;
@@ -960,8 +989,9 @@ hws_matcher_set_templates(struct mlx5hws_matcher *matcher,
if (!matcher->mt)
return -ENOMEM;
- matcher->at = kvcalloc(num_of_at + matcher->attr.max_num_of_at_attach,
- sizeof(*matcher->at),
+ matcher->size_of_at_array =
+ num_of_at + matcher->attr.max_num_of_at_attach;
+ matcher->at = kvcalloc(matcher->size_of_at_array, sizeof(*matcher->at),
GFP_KERNEL);
if (!matcher->at) {
mlx5hws_err(ctx, "Failed to allocate action template array\n");
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h
index 020de70270c5..20b32012c418 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h
@@ -23,6 +23,9 @@
*/
#define MLX5HWS_MATCHER_ACTION_RTC_UPDATE_MULT 1
+/* Maximum number of action templates that can be attached to a matcher. */
+#define MLX5HWS_MATCHER_MAX_AT 128
+
enum mlx5hws_matcher_offset {
MLX5HWS_MATCHER_OFFSET_TAG_DW1 = 12,
MLX5HWS_MATCHER_OFFSET_TAG_DW0 = 13,
@@ -72,6 +75,7 @@ struct mlx5hws_matcher {
struct mlx5hws_match_template *mt;
struct mlx5hws_action_template *at;
u8 num_of_at;
+ u8 size_of_at_array;
u8 num_of_mt;
/* enum mlx5hws_matcher_flags */
u8 flags;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h
index 5121951f2778..8ed8a715a2eb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h
@@ -399,11 +399,14 @@ int mlx5hws_matcher_destroy(struct mlx5hws_matcher *matcher);
*
* @matcher: Matcher to attach the action template to.
* @at: Action template to be attached to the matcher.
+ * @need_rehash: Output parameter that tells callers if the matcher needs to be
+ * rehashed.
*
* Return: Zero on success, non-zero otherwise.
*/
int mlx5hws_matcher_attach_at(struct mlx5hws_matcher *matcher,
- struct mlx5hws_action_template *at);
+ struct mlx5hws_action_template *at,
+ bool *need_rehash);
/**
* mlx5hws_matcher_resize_set_target - Link two matchers and enable moving rules.
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,178 @@
From f6b3ae9e3b84ce0d94c00565a0321eb0a1502cee Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:41:58 -0400
Subject: [PATCH] net/mlx5: HWS, Remove unused element array
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit b2ae16214ffeda3e1c25223eebe19f85b0876181
Author: Vlad Dogaru <vdogaru@nvidia.com>
Date: Thu Apr 10 22:17:32 2025 +0300
net/mlx5: HWS, Remove unused element array
Remove the array of elements wrapped in a struct because in reality only
the first element was ever used.
Signed-off-by: Vlad Dogaru <vdogaru@nvidia.com>
Reviewed-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Michal Kubiak <michal.kubiak@intel.com>
Link: https://patch.msgid.link/1744312662-356571-3-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c
index 50a81d360bb2..35ed9bee06a6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c
@@ -293,7 +293,7 @@ static int hws_pool_create_resource_on_index(struct mlx5hws_pool *pool,
}
static struct mlx5hws_pool_elements *
-hws_pool_element_create_new_elem(struct mlx5hws_pool *pool, u32 order, int idx)
+hws_pool_element_create_new_elem(struct mlx5hws_pool *pool, u32 order)
{
struct mlx5hws_pool_elements *elem;
u32 alloc_size;
@@ -311,21 +311,21 @@ hws_pool_element_create_new_elem(struct mlx5hws_pool *pool, u32 order, int idx)
elem->bitmap = hws_pool_create_and_init_bitmap(alloc_size - order);
if (!elem->bitmap) {
mlx5hws_err(pool->ctx,
- "Failed to create bitmap type: %d: size %d index: %d\n",
- pool->type, alloc_size, idx);
+ "Failed to create bitmap type: %d: size %d\n",
+ pool->type, alloc_size);
goto free_elem;
}
elem->log_size = alloc_size - order;
}
- if (hws_pool_create_resource_on_index(pool, alloc_size, idx)) {
- mlx5hws_err(pool->ctx, "Failed to create resource type: %d: size %d index: %d\n",
- pool->type, alloc_size, idx);
+ if (hws_pool_create_resource_on_index(pool, alloc_size, 0)) {
+ mlx5hws_err(pool->ctx, "Failed to create resource type: %d: size %d\n",
+ pool->type, alloc_size);
goto free_db;
}
- pool->db.element_manager->elements[idx] = elem;
+ pool->db.element = elem;
return elem;
@@ -359,9 +359,9 @@ hws_pool_onesize_element_get_mem_chunk(struct mlx5hws_pool *pool, u32 order,
{
struct mlx5hws_pool_elements *elem;
- elem = pool->db.element_manager->elements[0];
+ elem = pool->db.element;
if (!elem)
- elem = hws_pool_element_create_new_elem(pool, order, 0);
+ elem = hws_pool_element_create_new_elem(pool, order);
if (!elem)
goto err_no_elem;
@@ -451,16 +451,14 @@ static int hws_pool_general_element_db_init(struct mlx5hws_pool *pool)
return 0;
}
-static void hws_onesize_element_db_destroy_element(struct mlx5hws_pool *pool,
- struct mlx5hws_pool_elements *elem,
- struct mlx5hws_pool_chunk *chunk)
+static void
+hws_onesize_element_db_destroy_element(struct mlx5hws_pool *pool,
+ struct mlx5hws_pool_elements *elem)
{
- if (unlikely(!pool->resource[chunk->resource_idx]))
- pr_warn("HWS: invalid resource with index %d\n", chunk->resource_idx);
-
- hws_pool_resource_free(pool, chunk->resource_idx);
+ hws_pool_resource_free(pool, 0);
+ bitmap_free(elem->bitmap);
kfree(elem);
- pool->db.element_manager->elements[chunk->resource_idx] = NULL;
+ pool->db.element = NULL;
}
static void hws_onesize_element_db_put_chunk(struct mlx5hws_pool *pool,
@@ -471,7 +469,7 @@ static void hws_onesize_element_db_put_chunk(struct mlx5hws_pool *pool,
if (unlikely(chunk->resource_idx))
pr_warn("HWS: invalid resource with index %d\n", chunk->resource_idx);
- elem = pool->db.element_manager->elements[chunk->resource_idx];
+ elem = pool->db.element;
if (!elem) {
mlx5hws_err(pool->ctx, "No such element (%d)\n", chunk->resource_idx);
return;
@@ -483,7 +481,7 @@ static void hws_onesize_element_db_put_chunk(struct mlx5hws_pool *pool,
if (pool->flags & MLX5HWS_POOL_FLAGS_RELEASE_FREE_RESOURCE &&
!elem->num_of_elements)
- hws_onesize_element_db_destroy_element(pool, elem, chunk);
+ hws_onesize_element_db_destroy_element(pool, elem);
}
static int hws_onesize_element_db_get_chunk(struct mlx5hws_pool *pool,
@@ -504,18 +502,13 @@ static int hws_onesize_element_db_get_chunk(struct mlx5hws_pool *pool,
static void hws_onesize_element_db_uninit(struct mlx5hws_pool *pool)
{
- struct mlx5hws_pool_elements *elem;
- int i;
+ struct mlx5hws_pool_elements *elem = pool->db.element;
- for (i = 0; i < MLX5HWS_POOL_RESOURCE_ARR_SZ; i++) {
- elem = pool->db.element_manager->elements[i];
- if (elem) {
- bitmap_free(elem->bitmap);
- kfree(elem);
- pool->db.element_manager->elements[i] = NULL;
- }
+ if (elem) {
+ bitmap_free(elem->bitmap);
+ kfree(elem);
+ pool->db.element = NULL;
}
- kfree(pool->db.element_manager);
}
/* This memory management works as the following:
@@ -526,10 +519,6 @@ static void hws_onesize_element_db_uninit(struct mlx5hws_pool *pool)
*/
static int hws_pool_onesize_element_db_init(struct mlx5hws_pool *pool)
{
- pool->db.element_manager = kzalloc(sizeof(*pool->db.element_manager), GFP_KERNEL);
- if (!pool->db.element_manager)
- return -ENOMEM;
-
pool->p_db_uninit = &hws_onesize_element_db_uninit;
pool->p_get_chunk = &hws_onesize_element_db_get_chunk;
pool->p_put_chunk = &hws_onesize_element_db_put_chunk;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h
index 621298b352b2..f4258f83fdbf 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h
@@ -87,14 +87,10 @@ struct mlx5hws_pool_elements {
bool is_full;
};
-struct mlx5hws_element_manager {
- struct mlx5hws_pool_elements *elements[MLX5HWS_POOL_RESOURCE_ARR_SZ];
-};
-
struct mlx5hws_pool_db {
enum mlx5hws_db_type type;
union {
- struct mlx5hws_element_manager *element_manager;
+ struct mlx5hws_pool_elements *element;
struct mlx5hws_buddy_manager *buddy_manager;
};
};
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,700 @@
From 33eec33671fbe0444075b87f15b803b77059993f Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:41:58 -0400
Subject: [PATCH] net/mlx5: HWS, Make pool single resource
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit 38956bea7349ce75c1519b57c27cd97580b4c822
Author: Vlad Dogaru <vdogaru@nvidia.com>
Date: Thu Apr 10 22:17:33 2025 +0300
net/mlx5: HWS, Make pool single resource
The pool implementation claimed to support multiple resources, but this
does not really make sense in context. Callers always allocate a single
STC or STE chunk of exactly the size provided.
The code that handled multiple resources was unused (and likely buggy)
due to the combination of flags passed by callers.
Simplify the pool by having it handle a single resource. As a result of
this simplification, chunks no longer contain a resource offset (there
is now only one resource per pool), and the get_base_id functions no
longer take a chunk parameter.
Signed-off-by: Vlad Dogaru <vdogaru@nvidia.com>
Reviewed-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Michal Kubiak <michal.kubiak@intel.com>
Link: https://patch.msgid.link/1744312662-356571-4-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c
index b5332c54d4fb..781ba8c4f733 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c
@@ -238,6 +238,7 @@ hws_action_fixup_stc_attr(struct mlx5hws_context *ctx,
enum mlx5hws_table_type table_type,
bool is_mirror)
{
+ struct mlx5hws_pool *pool;
bool use_fixup = false;
u32 fw_tbl_type;
u32 base_id;
@@ -253,13 +254,11 @@ hws_action_fixup_stc_attr(struct mlx5hws_context *ctx,
use_fixup = true;
break;
}
+ pool = stc_attr->ste_table.ste_pool;
if (!is_mirror)
- base_id = mlx5hws_pool_chunk_get_base_id(stc_attr->ste_table.ste_pool,
- &stc_attr->ste_table.ste);
+ base_id = mlx5hws_pool_get_base_id(pool);
else
- base_id =
- mlx5hws_pool_chunk_get_base_mirror_id(stc_attr->ste_table.ste_pool,
- &stc_attr->ste_table.ste);
+ base_id = mlx5hws_pool_get_base_mirror_id(pool);
*fixup_stc_attr = *stc_attr;
fixup_stc_attr->ste_table.ste_obj_id = base_id;
@@ -337,7 +336,7 @@ __must_hold(&ctx->ctrl_lock)
if (!mlx5hws_context_cap_dynamic_reparse(ctx))
stc_attr->reparse_mode = MLX5_IFC_STC_REPARSE_IGNORE;
- obj_0_id = mlx5hws_pool_chunk_get_base_id(stc_pool, stc);
+ obj_0_id = mlx5hws_pool_get_base_id(stc_pool);
/* According to table/action limitation change the stc_attr */
use_fixup = hws_action_fixup_stc_attr(ctx, stc_attr, &fixup_stc_attr, table_type, false);
@@ -353,7 +352,7 @@ __must_hold(&ctx->ctrl_lock)
if (table_type == MLX5HWS_TABLE_TYPE_FDB) {
u32 obj_1_id;
- obj_1_id = mlx5hws_pool_chunk_get_base_mirror_id(stc_pool, stc);
+ obj_1_id = mlx5hws_pool_get_base_mirror_id(stc_pool);
use_fixup = hws_action_fixup_stc_attr(ctx, stc_attr,
&fixup_stc_attr,
@@ -393,11 +392,11 @@ __must_hold(&ctx->ctrl_lock)
stc_attr.action_type = MLX5_IFC_STC_ACTION_TYPE_DROP;
stc_attr.action_offset = MLX5HWS_ACTION_OFFSET_HIT;
stc_attr.stc_offset = stc->offset;
- obj_id = mlx5hws_pool_chunk_get_base_id(stc_pool, stc);
+ obj_id = mlx5hws_pool_get_base_id(stc_pool);
mlx5hws_cmd_stc_modify(ctx->mdev, obj_id, &stc_attr);
if (table_type == MLX5HWS_TABLE_TYPE_FDB) {
- obj_id = mlx5hws_pool_chunk_get_base_mirror_id(stc_pool, stc);
+ obj_id = mlx5hws_pool_get_base_mirror_id(stc_pool);
mlx5hws_cmd_stc_modify(ctx->mdev, obj_id, &stc_attr);
}
@@ -1581,7 +1580,6 @@ hws_action_create_dest_match_range_table(struct mlx5hws_context *ctx,
u32 miss_ft_id)
{
struct mlx5hws_cmd_rtc_create_attr rtc_attr = {0};
- struct mlx5hws_action_default_stc *default_stc;
struct mlx5hws_matcher_action_ste *table_ste;
struct mlx5hws_pool_attr pool_attr = {0};
struct mlx5hws_pool *ste_pool, *stc_pool;
@@ -1629,7 +1627,7 @@ hws_action_create_dest_match_range_table(struct mlx5hws_context *ctx,
rtc_attr.fw_gen_wqe = true;
rtc_attr.is_scnd_range = true;
- obj_id = mlx5hws_pool_chunk_get_base_id(ste_pool, ste);
+ obj_id = mlx5hws_pool_get_base_id(ste_pool);
rtc_attr.pd = ctx->pd_num;
rtc_attr.ste_base = obj_id;
@@ -1639,8 +1637,7 @@ hws_action_create_dest_match_range_table(struct mlx5hws_context *ctx,
/* STC is a single resource (obj_id), use any STC for the ID */
stc_pool = ctx->stc_pool;
- default_stc = ctx->common_res.default_stc;
- obj_id = mlx5hws_pool_chunk_get_base_id(stc_pool, &default_stc->default_hit);
+ obj_id = mlx5hws_pool_get_base_id(stc_pool);
rtc_attr.stc_base = obj_id;
ret = mlx5hws_cmd_rtc_create(ctx->mdev, &rtc_attr, rtc_0_id);
@@ -1650,11 +1647,11 @@ hws_action_create_dest_match_range_table(struct mlx5hws_context *ctx,
}
/* Create mirror RTC */
- obj_id = mlx5hws_pool_chunk_get_base_mirror_id(ste_pool, ste);
+ obj_id = mlx5hws_pool_get_base_mirror_id(ste_pool);
rtc_attr.ste_base = obj_id;
rtc_attr.table_type = mlx5hws_table_get_res_fw_ft_type(MLX5HWS_TABLE_TYPE_FDB, true);
- obj_id = mlx5hws_pool_chunk_get_base_mirror_id(stc_pool, &default_stc->default_hit);
+ obj_id = mlx5hws_pool_get_base_mirror_id(stc_pool);
rtc_attr.stc_base = obj_id;
ret = mlx5hws_cmd_rtc_create(ctx->mdev, &rtc_attr, rtc_1_id);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c
index 696275fd0ce2..3491408c5d84 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c
@@ -118,7 +118,6 @@ static int hws_debug_dump_matcher(struct seq_file *f, struct mlx5hws_matcher *ma
{
enum mlx5hws_table_type tbl_type = matcher->tbl->type;
struct mlx5hws_cmd_ft_query_attr ft_attr = {0};
- struct mlx5hws_pool_chunk *ste;
struct mlx5hws_pool *ste_pool;
u64 icm_addr_0 = 0;
u64 icm_addr_1 = 0;
@@ -134,12 +133,11 @@ static int hws_debug_dump_matcher(struct seq_file *f, struct mlx5hws_matcher *ma
matcher->end_ft_id,
matcher->col_matcher ? HWS_PTR_TO_ID(matcher->col_matcher) : 0);
- ste = &matcher->match_ste.ste;
ste_pool = matcher->match_ste.pool;
if (ste_pool) {
- ste_0_id = mlx5hws_pool_chunk_get_base_id(ste_pool, ste);
+ ste_0_id = mlx5hws_pool_get_base_id(ste_pool);
if (tbl_type == MLX5HWS_TABLE_TYPE_FDB)
- ste_1_id = mlx5hws_pool_chunk_get_base_mirror_id(ste_pool, ste);
+ ste_1_id = mlx5hws_pool_get_base_mirror_id(ste_pool);
}
seq_printf(f, ",%d,%d,%d,%d",
@@ -148,12 +146,11 @@ static int hws_debug_dump_matcher(struct seq_file *f, struct mlx5hws_matcher *ma
matcher->match_ste.rtc_1_id,
(int)ste_1_id);
- ste = &matcher->action_ste.ste;
ste_pool = matcher->action_ste.pool;
if (ste_pool) {
- ste_0_id = mlx5hws_pool_chunk_get_base_id(ste_pool, ste);
+ ste_0_id = mlx5hws_pool_get_base_id(ste_pool);
if (tbl_type == MLX5HWS_TABLE_TYPE_FDB)
- ste_1_id = mlx5hws_pool_chunk_get_base_mirror_id(ste_pool, ste);
+ ste_1_id = mlx5hws_pool_get_base_mirror_id(ste_pool);
else
ste_1_id = -1;
} else {
@@ -387,14 +384,17 @@ static int hws_debug_dump_context_stc(struct seq_file *f, struct mlx5hws_context
if (!stc_pool)
return 0;
- if (stc_pool->resource[0]) {
- ret = hws_debug_dump_context_stc_resource(f, ctx, stc_pool->resource[0]);
+ if (stc_pool->resource) {
+ ret = hws_debug_dump_context_stc_resource(f, ctx,
+ stc_pool->resource);
if (ret)
return ret;
}
- if (stc_pool->mirror_resource[0]) {
- ret = hws_debug_dump_context_stc_resource(f, ctx, stc_pool->mirror_resource[0]);
+ if (stc_pool->mirror_resource) {
+ struct mlx5hws_pool_resource *res = stc_pool->mirror_resource;
+
+ ret = hws_debug_dump_context_stc_resource(f, ctx, res);
if (ret)
return ret;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c
index 37a4497048a6..59b14db427b4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c
@@ -223,7 +223,6 @@ static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher,
struct mlx5hws_cmd_rtc_create_attr rtc_attr = {0};
struct mlx5hws_match_template *mt = matcher->mt;
struct mlx5hws_context *ctx = matcher->tbl->ctx;
- struct mlx5hws_action_default_stc *default_stc;
struct mlx5hws_matcher_action_ste *action_ste;
struct mlx5hws_table *tbl = matcher->tbl;
struct mlx5hws_pool *ste_pool, *stc_pool;
@@ -305,7 +304,7 @@ static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher,
return -EINVAL;
}
- obj_id = mlx5hws_pool_chunk_get_base_id(ste_pool, ste);
+ obj_id = mlx5hws_pool_get_base_id(ste_pool);
rtc_attr.pd = ctx->pd_num;
rtc_attr.ste_base = obj_id;
@@ -316,8 +315,7 @@ static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher,
/* STC is a single resource (obj_id), use any STC for the ID */
stc_pool = ctx->stc_pool;
- default_stc = ctx->common_res.default_stc;
- obj_id = mlx5hws_pool_chunk_get_base_id(stc_pool, &default_stc->default_hit);
+ obj_id = mlx5hws_pool_get_base_id(stc_pool);
rtc_attr.stc_base = obj_id;
ret = mlx5hws_cmd_rtc_create(ctx->mdev, &rtc_attr, rtc_0_id);
@@ -328,11 +326,11 @@ static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher,
}
if (tbl->type == MLX5HWS_TABLE_TYPE_FDB) {
- obj_id = mlx5hws_pool_chunk_get_base_mirror_id(ste_pool, ste);
+ obj_id = mlx5hws_pool_get_base_mirror_id(ste_pool);
rtc_attr.ste_base = obj_id;
rtc_attr.table_type = mlx5hws_table_get_res_fw_ft_type(tbl->type, true);
- obj_id = mlx5hws_pool_chunk_get_base_mirror_id(stc_pool, &default_stc->default_hit);
+ obj_id = mlx5hws_pool_get_base_mirror_id(stc_pool);
rtc_attr.stc_base = obj_id;
hws_matcher_set_rtc_attr_sz(matcher, &rtc_attr, rtc_type, true);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c
index 35ed9bee06a6..0de03e17624c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c
@@ -20,15 +20,14 @@ static void hws_pool_free_one_resource(struct mlx5hws_pool_resource *resource)
kfree(resource);
}
-static void hws_pool_resource_free(struct mlx5hws_pool *pool,
- int resource_idx)
+static void hws_pool_resource_free(struct mlx5hws_pool *pool)
{
- hws_pool_free_one_resource(pool->resource[resource_idx]);
- pool->resource[resource_idx] = NULL;
+ hws_pool_free_one_resource(pool->resource);
+ pool->resource = NULL;
if (pool->tbl_type == MLX5HWS_TABLE_TYPE_FDB) {
- hws_pool_free_one_resource(pool->mirror_resource[resource_idx]);
- pool->mirror_resource[resource_idx] = NULL;
+ hws_pool_free_one_resource(pool->mirror_resource);
+ pool->mirror_resource = NULL;
}
}
@@ -78,7 +77,7 @@ hws_pool_create_one_resource(struct mlx5hws_pool *pool, u32 log_range,
}
static int
-hws_pool_resource_alloc(struct mlx5hws_pool *pool, u32 log_range, int idx)
+hws_pool_resource_alloc(struct mlx5hws_pool *pool, u32 log_range)
{
struct mlx5hws_pool_resource *resource;
u32 fw_ft_type, opt_log_range;
@@ -91,7 +90,7 @@ hws_pool_resource_alloc(struct mlx5hws_pool *pool, u32 log_range, int idx)
return -EINVAL;
}
- pool->resource[idx] = resource;
+ pool->resource = resource;
if (pool->tbl_type == MLX5HWS_TABLE_TYPE_FDB) {
struct mlx5hws_pool_resource *mirror_resource;
@@ -102,10 +101,10 @@ hws_pool_resource_alloc(struct mlx5hws_pool *pool, u32 log_range, int idx)
if (!mirror_resource) {
mlx5hws_err(pool->ctx, "Failed allocating mirrored resource\n");
hws_pool_free_one_resource(resource);
- pool->resource[idx] = NULL;
+ pool->resource = NULL;
return -EINVAL;
}
- pool->mirror_resource[idx] = mirror_resource;
+ pool->mirror_resource = mirror_resource;
}
return 0;
@@ -129,9 +128,9 @@ static void hws_pool_buddy_db_put_chunk(struct mlx5hws_pool *pool,
{
struct mlx5hws_buddy_mem *buddy;
- buddy = pool->db.buddy_manager->buddies[chunk->resource_idx];
+ buddy = pool->db.buddy;
if (!buddy) {
- mlx5hws_err(pool->ctx, "No such buddy (%d)\n", chunk->resource_idx);
+ mlx5hws_err(pool->ctx, "Bad buddy state\n");
return;
}
@@ -139,86 +138,50 @@ static void hws_pool_buddy_db_put_chunk(struct mlx5hws_pool *pool,
}
static struct mlx5hws_buddy_mem *
-hws_pool_buddy_get_next_buddy(struct mlx5hws_pool *pool, int idx,
- u32 order, bool *is_new_buddy)
+hws_pool_buddy_get_buddy(struct mlx5hws_pool *pool, u32 order)
{
static struct mlx5hws_buddy_mem *buddy;
u32 new_buddy_size;
- buddy = pool->db.buddy_manager->buddies[idx];
+ buddy = pool->db.buddy;
if (buddy)
return buddy;
new_buddy_size = max(pool->alloc_log_sz, order);
- *is_new_buddy = true;
buddy = mlx5hws_buddy_create(new_buddy_size);
if (!buddy) {
- mlx5hws_err(pool->ctx, "Failed to create buddy order: %d index: %d\n",
- new_buddy_size, idx);
+ mlx5hws_err(pool->ctx, "Failed to create buddy order: %d\n",
+ new_buddy_size);
return NULL;
}
- if (hws_pool_resource_alloc(pool, new_buddy_size, idx) != 0) {
- mlx5hws_err(pool->ctx, "Failed to create resource type: %d: size %d index: %d\n",
- pool->type, new_buddy_size, idx);
+ if (hws_pool_resource_alloc(pool, new_buddy_size) != 0) {
+ mlx5hws_err(pool->ctx, "Failed to create resource type: %d: size %d\n",
+ pool->type, new_buddy_size);
mlx5hws_buddy_cleanup(buddy);
return NULL;
}
- pool->db.buddy_manager->buddies[idx] = buddy;
+ pool->db.buddy = buddy;
return buddy;
}
static int hws_pool_buddy_get_mem_chunk(struct mlx5hws_pool *pool,
int order,
- u32 *buddy_idx,
int *seg)
{
struct mlx5hws_buddy_mem *buddy;
- bool new_mem = false;
- int ret = 0;
- int i;
-
- *seg = -1;
-
- /* Find the next free place from the buddy array */
- while (*seg < 0) {
- for (i = 0; i < MLX5HWS_POOL_RESOURCE_ARR_SZ; i++) {
- buddy = hws_pool_buddy_get_next_buddy(pool, i,
- order,
- &new_mem);
- if (!buddy) {
- ret = -ENOMEM;
- goto out;
- }
-
- *seg = mlx5hws_buddy_alloc_mem(buddy, order);
- if (*seg >= 0)
- goto found;
-
- if (pool->flags & MLX5HWS_POOL_FLAGS_ONE_RESOURCE) {
- mlx5hws_err(pool->ctx,
- "Fail to allocate seg for one resource pool\n");
- ret = -ENOMEM;
- goto out;
- }
-
- if (new_mem) {
- /* We have new memory pool, should be place for us */
- mlx5hws_err(pool->ctx,
- "No memory for order: %d with buddy no: %d\n",
- order, i);
- ret = -ENOMEM;
- goto out;
- }
- }
- }
-found:
- *buddy_idx = i;
-out:
- return ret;
+ buddy = hws_pool_buddy_get_buddy(pool, order);
+ if (!buddy)
+ return -ENOMEM;
+
+ *seg = mlx5hws_buddy_alloc_mem(buddy, order);
+ if (*seg >= 0)
+ return 0;
+
+ return -ENOMEM;
}
static int hws_pool_buddy_db_get_chunk(struct mlx5hws_pool *pool,
@@ -226,9 +189,7 @@ static int hws_pool_buddy_db_get_chunk(struct mlx5hws_pool *pool,
{
int ret = 0;
- /* Go over the buddies and find next free slot */
ret = hws_pool_buddy_get_mem_chunk(pool, chunk->order,
- &chunk->resource_idx,
&chunk->offset);
if (ret)
mlx5hws_err(pool->ctx, "Failed to get free slot for chunk with order: %d\n",
@@ -240,33 +201,21 @@ static int hws_pool_buddy_db_get_chunk(struct mlx5hws_pool *pool,
static void hws_pool_buddy_db_uninit(struct mlx5hws_pool *pool)
{
struct mlx5hws_buddy_mem *buddy;
- int i;
-
- for (i = 0; i < MLX5HWS_POOL_RESOURCE_ARR_SZ; i++) {
- buddy = pool->db.buddy_manager->buddies[i];
- if (buddy) {
- mlx5hws_buddy_cleanup(buddy);
- kfree(buddy);
- pool->db.buddy_manager->buddies[i] = NULL;
- }
- }
- kfree(pool->db.buddy_manager);
+ buddy = pool->db.buddy;
+ if (buddy) {
+ mlx5hws_buddy_cleanup(buddy);
+ kfree(buddy);
+ pool->db.buddy = NULL;
+ }
}
static int hws_pool_buddy_db_init(struct mlx5hws_pool *pool, u32 log_range)
{
- pool->db.buddy_manager = kzalloc(sizeof(*pool->db.buddy_manager), GFP_KERNEL);
- if (!pool->db.buddy_manager)
- return -ENOMEM;
-
if (pool->flags & MLX5HWS_POOL_FLAGS_ALLOC_MEM_ON_CREATE) {
- bool new_buddy;
-
- if (!hws_pool_buddy_get_next_buddy(pool, 0, log_range, &new_buddy)) {
+ if (!hws_pool_buddy_get_buddy(pool, log_range)) {
mlx5hws_err(pool->ctx,
"Failed allocating memory on create log_sz: %d\n", log_range);
- kfree(pool->db.buddy_manager);
return -ENOMEM;
}
}
@@ -278,14 +227,13 @@ static int hws_pool_buddy_db_init(struct mlx5hws_pool *pool, u32 log_range)
return 0;
}
-static int hws_pool_create_resource_on_index(struct mlx5hws_pool *pool,
- u32 alloc_size, int idx)
+static int hws_pool_create_resource(struct mlx5hws_pool *pool, u32 alloc_size)
{
- int ret = hws_pool_resource_alloc(pool, alloc_size, idx);
+ int ret = hws_pool_resource_alloc(pool, alloc_size);
if (ret) {
- mlx5hws_err(pool->ctx, "Failed to create resource type: %d: size %d index: %d\n",
- pool->type, alloc_size, idx);
+ mlx5hws_err(pool->ctx, "Failed to create resource type: %d: size %d\n",
+ pool->type, alloc_size);
return ret;
}
@@ -319,7 +267,7 @@ hws_pool_element_create_new_elem(struct mlx5hws_pool *pool, u32 order)
elem->log_size = alloc_size - order;
}
- if (hws_pool_create_resource_on_index(pool, alloc_size, 0)) {
+ if (hws_pool_create_resource(pool, alloc_size)) {
mlx5hws_err(pool->ctx, "Failed to create resource type: %d: size %d\n",
pool->type, alloc_size);
goto free_db;
@@ -355,7 +303,7 @@ static int hws_pool_element_find_seg(struct mlx5hws_pool_elements *elem, int *se
static int
hws_pool_onesize_element_get_mem_chunk(struct mlx5hws_pool *pool, u32 order,
- u32 *idx, int *seg)
+ int *seg)
{
struct mlx5hws_pool_elements *elem;
@@ -370,7 +318,6 @@ hws_pool_onesize_element_get_mem_chunk(struct mlx5hws_pool *pool, u32 order,
return -ENOMEM;
}
- *idx = 0;
elem->num_of_elements++;
return 0;
@@ -379,21 +326,17 @@ hws_pool_onesize_element_get_mem_chunk(struct mlx5hws_pool *pool, u32 order,
return -ENOMEM;
}
-static int
-hws_pool_general_element_get_mem_chunk(struct mlx5hws_pool *pool, u32 order,
- u32 *idx, int *seg)
+static int hws_pool_general_element_get_mem_chunk(struct mlx5hws_pool *pool,
+ u32 order, int *seg)
{
- int ret, i;
-
- for (i = 0; i < MLX5HWS_POOL_RESOURCE_ARR_SZ; i++) {
- if (!pool->resource[i]) {
- ret = hws_pool_create_resource_on_index(pool, order, i);
- if (ret)
- goto err_no_res;
- *idx = i;
- *seg = 0; /* One memory slot in that element */
- return 0;
- }
+ int ret;
+
+ if (!pool->resource) {
+ ret = hws_pool_create_resource(pool, order);
+ if (ret)
+ goto err_no_res;
+ *seg = 0; /* One memory slot in that element */
+ return 0;
}
mlx5hws_err(pool->ctx, "No more resources (last request order: %d)\n", order);
@@ -409,9 +352,7 @@ static int hws_pool_general_element_db_get_chunk(struct mlx5hws_pool *pool,
{
int ret;
- /* Go over all memory elements and find/allocate free slot */
ret = hws_pool_general_element_get_mem_chunk(pool, chunk->order,
- &chunk->resource_idx,
&chunk->offset);
if (ret)
mlx5hws_err(pool->ctx, "Failed to get free slot for chunk with order: %d\n",
@@ -423,11 +364,8 @@ static int hws_pool_general_element_db_get_chunk(struct mlx5hws_pool *pool,
static void hws_pool_general_element_db_put_chunk(struct mlx5hws_pool *pool,
struct mlx5hws_pool_chunk *chunk)
{
- if (unlikely(!pool->resource[chunk->resource_idx]))
- pr_warn("HWS: invalid resource with index %d\n", chunk->resource_idx);
-
if (pool->flags & MLX5HWS_POOL_FLAGS_RELEASE_FREE_RESOURCE)
- hws_pool_resource_free(pool, chunk->resource_idx);
+ hws_pool_resource_free(pool);
}
static void hws_pool_general_element_db_uninit(struct mlx5hws_pool *pool)
@@ -455,7 +393,7 @@ static void
hws_onesize_element_db_destroy_element(struct mlx5hws_pool *pool,
struct mlx5hws_pool_elements *elem)
{
- hws_pool_resource_free(pool, 0);
+ hws_pool_resource_free(pool);
bitmap_free(elem->bitmap);
kfree(elem);
pool->db.element = NULL;
@@ -466,12 +404,9 @@ static void hws_onesize_element_db_put_chunk(struct mlx5hws_pool *pool,
{
struct mlx5hws_pool_elements *elem;
- if (unlikely(chunk->resource_idx))
- pr_warn("HWS: invalid resource with index %d\n", chunk->resource_idx);
-
elem = pool->db.element;
if (!elem) {
- mlx5hws_err(pool->ctx, "No such element (%d)\n", chunk->resource_idx);
+ mlx5hws_err(pool->ctx, "Pool element was not allocated\n");
return;
}
@@ -489,9 +424,7 @@ static int hws_onesize_element_db_get_chunk(struct mlx5hws_pool *pool,
{
int ret = 0;
- /* Go over all memory elements and find/allocate free slot */
ret = hws_pool_onesize_element_get_mem_chunk(pool, chunk->order,
- &chunk->resource_idx,
&chunk->offset);
if (ret)
mlx5hws_err(pool->ctx, "Failed to get free slot for chunk with order: %d\n",
@@ -614,13 +547,10 @@ mlx5hws_pool_create(struct mlx5hws_context *ctx, struct mlx5hws_pool_attr *pool_
int mlx5hws_pool_destroy(struct mlx5hws_pool *pool)
{
- int i;
-
mutex_destroy(&pool->lock);
- for (i = 0; i < MLX5HWS_POOL_RESOURCE_ARR_SZ; i++)
- if (pool->resource[i])
- hws_pool_resource_free(pool, i);
+ if (pool->resource)
+ hws_pool_resource_free(pool);
hws_pool_db_unint(pool);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h
index f4258f83fdbf..112a61cd2997 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h
@@ -6,16 +6,12 @@
#define MLX5HWS_POOL_STC_LOG_SZ 15
-#define MLX5HWS_POOL_RESOURCE_ARR_SZ 100
-
enum mlx5hws_pool_type {
MLX5HWS_POOL_TYPE_STE,
MLX5HWS_POOL_TYPE_STC,
};
struct mlx5hws_pool_chunk {
- u32 resource_idx;
- /* Internal offset, relative to base index */
int offset;
int order;
};
@@ -72,14 +68,10 @@ enum mlx5hws_db_type {
MLX5HWS_POOL_DB_TYPE_GENERAL_SIZE,
/* One resource only, all the elements are with same one size */
MLX5HWS_POOL_DB_TYPE_ONE_SIZE_RESOURCE,
- /* Many resources, the memory allocated with buddy mechanism */
+ /* Entries are managed using a buddy mechanism. */
MLX5HWS_POOL_DB_TYPE_BUDDY,
};
-struct mlx5hws_buddy_manager {
- struct mlx5hws_buddy_mem *buddies[MLX5HWS_POOL_RESOURCE_ARR_SZ];
-};
-
struct mlx5hws_pool_elements {
u32 num_of_elements;
unsigned long *bitmap;
@@ -91,7 +83,7 @@ struct mlx5hws_pool_db {
enum mlx5hws_db_type type;
union {
struct mlx5hws_pool_elements *element;
- struct mlx5hws_buddy_manager *buddy_manager;
+ struct mlx5hws_buddy_mem *buddy;
};
};
@@ -109,8 +101,8 @@ struct mlx5hws_pool {
size_t alloc_log_sz;
enum mlx5hws_table_type tbl_type;
enum mlx5hws_pool_optimize opt_type;
- struct mlx5hws_pool_resource *resource[MLX5HWS_POOL_RESOURCE_ARR_SZ];
- struct mlx5hws_pool_resource *mirror_resource[MLX5HWS_POOL_RESOURCE_ARR_SZ];
+ struct mlx5hws_pool_resource *resource;
+ struct mlx5hws_pool_resource *mirror_resource;
/* DB */
struct mlx5hws_pool_db db;
/* Functions */
@@ -131,17 +123,13 @@ int mlx5hws_pool_chunk_alloc(struct mlx5hws_pool *pool,
void mlx5hws_pool_chunk_free(struct mlx5hws_pool *pool,
struct mlx5hws_pool_chunk *chunk);
-static inline u32
-mlx5hws_pool_chunk_get_base_id(struct mlx5hws_pool *pool,
- struct mlx5hws_pool_chunk *chunk)
+static inline u32 mlx5hws_pool_get_base_id(struct mlx5hws_pool *pool)
{
- return pool->resource[chunk->resource_idx]->base_id;
+ return pool->resource->base_id;
}
-static inline u32
-mlx5hws_pool_chunk_get_base_mirror_id(struct mlx5hws_pool *pool,
- struct mlx5hws_pool_chunk *chunk)
+static inline u32 mlx5hws_pool_get_base_mirror_id(struct mlx5hws_pool *pool)
{
- return pool->mirror_resource[chunk->resource_idx]->base_id;
+ return pool->mirror_resource->base_id;
}
#endif /* MLX5HWS_POOL_H_ */
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,760 @@
From ac067697d1ab53a4dceeec03736f9b8bf2363665 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:41:59 -0400
Subject: [PATCH] net/mlx5: HWS, Refactor pool implementation
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit d171ce3d988868bed9dc3c9eeb8428f87dd9ac85
Author: Vlad Dogaru <vdogaru@nvidia.com>
Date: Thu Apr 10 22:17:34 2025 +0300
net/mlx5: HWS, Refactor pool implementation
Refactor the pool implementation to remove unused flags and clarify its
usage. A pool represents a single range of STEs or STCs which are
allocated at pool creation time.
Pools are used under three patterns:
1. STCs are allocated one at a time from a global pool using a bitmap
based implementation.
2. Action STEs are allocated in power-of-two blocks using a buddy
algorithm.
3. Match STEs do not use allocation, since insertion into these tables
is based on hashes or direct addressing. In such cases we use a pool
only to create the STE range.
Signed-off-by: Vlad Dogaru <vdogaru@nvidia.com>
Reviewed-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Michal Kubiak <michal.kubiak@intel.com>
Link: https://patch.msgid.link/1744312662-356571-5-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c
index 781ba8c4f733..39904b337b81 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c
@@ -1602,7 +1602,6 @@ hws_action_create_dest_match_range_table(struct mlx5hws_context *ctx,
pool_attr.table_type = MLX5HWS_TABLE_TYPE_FDB;
pool_attr.pool_type = MLX5HWS_POOL_TYPE_STE;
- pool_attr.flags = MLX5HWS_POOL_FLAGS_FOR_STE_ACTION_POOL;
pool_attr.alloc_log_sz = 1;
table_ste->pool = mlx5hws_pool_create(ctx, &pool_attr);
if (!table_ste->pool) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.c
index 9cda2774fd64..b7cb736b74d7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.c
@@ -34,7 +34,6 @@ static int hws_context_pools_init(struct mlx5hws_context *ctx)
/* Create an STC pool per FT type */
pool_attr.pool_type = MLX5HWS_POOL_TYPE_STC;
- pool_attr.flags = MLX5HWS_POOL_FLAGS_FOR_STC_POOL;
max_log_sz = min(MLX5HWS_POOL_STC_LOG_SZ, ctx->caps->stc_alloc_log_max);
pool_attr.alloc_log_sz = max(max_log_sz, ctx->caps->stc_alloc_log_gran);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c
index 59b14db427b4..95d31fd6c976 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c
@@ -265,14 +265,6 @@ static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher,
rtc_attr.match_definer_0 = ctx->caps->linear_match_definer;
}
}
-
- /* Match pool requires implicit allocation */
- ret = mlx5hws_pool_chunk_alloc(ste_pool, ste);
- if (ret) {
- mlx5hws_err(ctx, "Failed to allocate STE for %s RTC",
- hws_matcher_rtc_type_to_str(rtc_type));
- return ret;
- }
break;
case HWS_MATCHER_RTC_TYPE_STE_ARRAY:
@@ -357,23 +349,17 @@ static void hws_matcher_destroy_rtc(struct mlx5hws_matcher *matcher,
{
struct mlx5hws_matcher_action_ste *action_ste;
struct mlx5hws_table *tbl = matcher->tbl;
- struct mlx5hws_pool_chunk *ste;
- struct mlx5hws_pool *ste_pool;
u32 rtc_0_id, rtc_1_id;
switch (rtc_type) {
case HWS_MATCHER_RTC_TYPE_MATCH:
rtc_0_id = matcher->match_ste.rtc_0_id;
rtc_1_id = matcher->match_ste.rtc_1_id;
- ste_pool = matcher->match_ste.pool;
- ste = &matcher->match_ste.ste;
break;
case HWS_MATCHER_RTC_TYPE_STE_ARRAY:
action_ste = &matcher->action_ste;
rtc_0_id = action_ste->rtc_0_id;
rtc_1_id = action_ste->rtc_1_id;
- ste_pool = action_ste->pool;
- ste = &action_ste->ste;
break;
default:
return;
@@ -383,8 +369,6 @@ static void hws_matcher_destroy_rtc(struct mlx5hws_matcher *matcher,
mlx5hws_cmd_rtc_destroy(matcher->tbl->ctx->mdev, rtc_1_id);
mlx5hws_cmd_rtc_destroy(matcher->tbl->ctx->mdev, rtc_0_id);
- if (rtc_type == HWS_MATCHER_RTC_TYPE_MATCH)
- mlx5hws_pool_chunk_free(ste_pool, ste);
}
static int
@@ -557,7 +541,7 @@ static int hws_matcher_bind_at(struct mlx5hws_matcher *matcher)
/* Allocate action STE mempool */
pool_attr.table_type = tbl->type;
pool_attr.pool_type = MLX5HWS_POOL_TYPE_STE;
- pool_attr.flags = MLX5HWS_POOL_FLAGS_FOR_STE_ACTION_POOL;
+ pool_attr.flags = MLX5HWS_POOL_FLAG_BUDDY;
/* Pool size is similar to action RTC size */
pool_attr.alloc_log_sz = ilog2(roundup_pow_of_two(action_ste->max_stes)) +
matcher->attr.table.sz_row_log +
@@ -636,7 +620,6 @@ static int hws_matcher_bind_mt(struct mlx5hws_matcher *matcher)
/* Create an STE pool per matcher*/
pool_attr.table_type = matcher->tbl->type;
pool_attr.pool_type = MLX5HWS_POOL_TYPE_STE;
- pool_attr.flags = MLX5HWS_POOL_FLAGS_FOR_MATCHER_STE_POOL;
pool_attr.alloc_log_sz = matcher->attr.table.sz_col_log +
matcher->attr.table.sz_row_log;
hws_matcher_set_pool_attr(&pool_attr, matcher);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c
index 0de03e17624c..270b333faab3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c
@@ -60,10 +60,8 @@ hws_pool_create_one_resource(struct mlx5hws_pool *pool, u32 log_range,
ret = -EINVAL;
}
- if (ret) {
- mlx5hws_err(pool->ctx, "Failed to allocate resource objects\n");
+ if (ret)
goto free_resource;
- }
resource->pool = pool;
resource->range = 1 << log_range;
@@ -76,17 +74,17 @@ hws_pool_create_one_resource(struct mlx5hws_pool *pool, u32 log_range,
return NULL;
}
-static int
-hws_pool_resource_alloc(struct mlx5hws_pool *pool, u32 log_range)
+static int hws_pool_resource_alloc(struct mlx5hws_pool *pool)
{
struct mlx5hws_pool_resource *resource;
u32 fw_ft_type, opt_log_range;
fw_ft_type = mlx5hws_table_get_res_fw_ft_type(pool->tbl_type, false);
- opt_log_range = pool->opt_type == MLX5HWS_POOL_OPTIMIZE_ORIG ? 0 : log_range;
+ opt_log_range = pool->opt_type == MLX5HWS_POOL_OPTIMIZE_ORIG ?
+ 0 : pool->alloc_log_sz;
resource = hws_pool_create_one_resource(pool, opt_log_range, fw_ft_type);
if (!resource) {
- mlx5hws_err(pool->ctx, "Failed allocating resource\n");
+ mlx5hws_err(pool->ctx, "Failed to allocate resource\n");
return -EINVAL;
}
@@ -96,10 +94,11 @@ hws_pool_resource_alloc(struct mlx5hws_pool *pool, u32 log_range)
struct mlx5hws_pool_resource *mirror_resource;
fw_ft_type = mlx5hws_table_get_res_fw_ft_type(pool->tbl_type, true);
- opt_log_range = pool->opt_type == MLX5HWS_POOL_OPTIMIZE_MIRROR ? 0 : log_range;
+ opt_log_range = pool->opt_type == MLX5HWS_POOL_OPTIMIZE_MIRROR ?
+ 0 : pool->alloc_log_sz;
mirror_resource = hws_pool_create_one_resource(pool, opt_log_range, fw_ft_type);
if (!mirror_resource) {
- mlx5hws_err(pool->ctx, "Failed allocating mirrored resource\n");
+ mlx5hws_err(pool->ctx, "Failed to allocate mirrored resource\n");
hws_pool_free_one_resource(resource);
pool->resource = NULL;
return -EINVAL;
@@ -110,92 +109,58 @@ hws_pool_resource_alloc(struct mlx5hws_pool *pool, u32 log_range)
return 0;
}
-static unsigned long *hws_pool_create_and_init_bitmap(u32 log_range)
-{
- unsigned long *cur_bmp;
-
- cur_bmp = bitmap_zalloc(1 << log_range, GFP_KERNEL);
- if (!cur_bmp)
- return NULL;
-
- bitmap_fill(cur_bmp, 1 << log_range);
-
- return cur_bmp;
-}
-
-static void hws_pool_buddy_db_put_chunk(struct mlx5hws_pool *pool,
- struct mlx5hws_pool_chunk *chunk)
+static int hws_pool_buddy_init(struct mlx5hws_pool *pool)
{
struct mlx5hws_buddy_mem *buddy;
- buddy = pool->db.buddy;
+ buddy = mlx5hws_buddy_create(pool->alloc_log_sz);
if (!buddy) {
- mlx5hws_err(pool->ctx, "Bad buddy state\n");
- return;
- }
-
- mlx5hws_buddy_free_mem(buddy, chunk->offset, chunk->order);
-}
-
-static struct mlx5hws_buddy_mem *
-hws_pool_buddy_get_buddy(struct mlx5hws_pool *pool, u32 order)
-{
- static struct mlx5hws_buddy_mem *buddy;
- u32 new_buddy_size;
-
- buddy = pool->db.buddy;
- if (buddy)
- return buddy;
-
- new_buddy_size = max(pool->alloc_log_sz, order);
- buddy = mlx5hws_buddy_create(new_buddy_size);
- if (!buddy) {
- mlx5hws_err(pool->ctx, "Failed to create buddy order: %d\n",
- new_buddy_size);
- return NULL;
+ mlx5hws_err(pool->ctx, "Failed to create buddy order: %zu\n",
+ pool->alloc_log_sz);
+ return -ENOMEM;
}
- if (hws_pool_resource_alloc(pool, new_buddy_size) != 0) {
- mlx5hws_err(pool->ctx, "Failed to create resource type: %d: size %d\n",
- pool->type, new_buddy_size);
+ if (hws_pool_resource_alloc(pool) != 0) {
+ mlx5hws_err(pool->ctx, "Failed to create resource type: %d size %zu\n",
+ pool->type, pool->alloc_log_sz);
mlx5hws_buddy_cleanup(buddy);
- return NULL;
+ return -ENOMEM;
}
pool->db.buddy = buddy;
- return buddy;
+ return 0;
}
-static int hws_pool_buddy_get_mem_chunk(struct mlx5hws_pool *pool,
- int order,
- int *seg)
+static int hws_pool_buddy_db_get_chunk(struct mlx5hws_pool *pool,
+ struct mlx5hws_pool_chunk *chunk)
{
- struct mlx5hws_buddy_mem *buddy;
+ struct mlx5hws_buddy_mem *buddy = pool->db.buddy;
- buddy = hws_pool_buddy_get_buddy(pool, order);
- if (!buddy)
- return -ENOMEM;
+ if (!buddy) {
+ mlx5hws_err(pool->ctx, "Bad buddy state\n");
+ return -EINVAL;
+ }
- *seg = mlx5hws_buddy_alloc_mem(buddy, order);
- if (*seg >= 0)
+ chunk->offset = mlx5hws_buddy_alloc_mem(buddy, chunk->order);
+ if (chunk->offset >= 0)
return 0;
return -ENOMEM;
}
-static int hws_pool_buddy_db_get_chunk(struct mlx5hws_pool *pool,
- struct mlx5hws_pool_chunk *chunk)
+static void hws_pool_buddy_db_put_chunk(struct mlx5hws_pool *pool,
+ struct mlx5hws_pool_chunk *chunk)
{
- int ret = 0;
+ struct mlx5hws_buddy_mem *buddy;
- ret = hws_pool_buddy_get_mem_chunk(pool, chunk->order,
- &chunk->offset);
- if (ret)
- mlx5hws_err(pool->ctx, "Failed to get free slot for chunk with order: %d\n",
- chunk->order);
+ buddy = pool->db.buddy;
+ if (!buddy) {
+ mlx5hws_err(pool->ctx, "Bad buddy state\n");
+ return;
+ }
- return ret;
+ mlx5hws_buddy_free_mem(buddy, chunk->offset, chunk->order);
}
static void hws_pool_buddy_db_uninit(struct mlx5hws_pool *pool)
@@ -210,15 +175,13 @@ static void hws_pool_buddy_db_uninit(struct mlx5hws_pool *pool)
}
}
-static int hws_pool_buddy_db_init(struct mlx5hws_pool *pool, u32 log_range)
+static int hws_pool_buddy_db_init(struct mlx5hws_pool *pool)
{
- if (pool->flags & MLX5HWS_POOL_FLAGS_ALLOC_MEM_ON_CREATE) {
- if (!hws_pool_buddy_get_buddy(pool, log_range)) {
- mlx5hws_err(pool->ctx,
- "Failed allocating memory on create log_sz: %d\n", log_range);
- return -ENOMEM;
- }
- }
+ int ret;
+
+ ret = hws_pool_buddy_init(pool);
+ if (ret)
+ return ret;
pool->p_db_uninit = &hws_pool_buddy_db_uninit;
pool->p_get_chunk = &hws_pool_buddy_db_get_chunk;
@@ -227,234 +190,105 @@ static int hws_pool_buddy_db_init(struct mlx5hws_pool *pool, u32 log_range)
return 0;
}
-static int hws_pool_create_resource(struct mlx5hws_pool *pool, u32 alloc_size)
-{
- int ret = hws_pool_resource_alloc(pool, alloc_size);
-
- if (ret) {
- mlx5hws_err(pool->ctx, "Failed to create resource type: %d: size %d\n",
- pool->type, alloc_size);
- return ret;
- }
-
- return 0;
-}
-
-static struct mlx5hws_pool_elements *
-hws_pool_element_create_new_elem(struct mlx5hws_pool *pool, u32 order)
+static unsigned long *hws_pool_create_and_init_bitmap(u32 log_range)
{
- struct mlx5hws_pool_elements *elem;
- u32 alloc_size;
-
- alloc_size = pool->alloc_log_sz;
+ unsigned long *bitmap;
- elem = kzalloc(sizeof(*elem), GFP_KERNEL);
- if (!elem)
+ bitmap = bitmap_zalloc(1 << log_range, GFP_KERNEL);
+ if (!bitmap)
return NULL;
- /* Sharing the same resource, also means that all the elements are with size 1 */
- if ((pool->flags & MLX5HWS_POOL_FLAGS_FIXED_SIZE_OBJECTS) &&
- !(pool->flags & MLX5HWS_POOL_FLAGS_RESOURCE_PER_CHUNK)) {
- /* Currently all chunks in size 1 */
- elem->bitmap = hws_pool_create_and_init_bitmap(alloc_size - order);
- if (!elem->bitmap) {
- mlx5hws_err(pool->ctx,
- "Failed to create bitmap type: %d: size %d\n",
- pool->type, alloc_size);
- goto free_elem;
- }
-
- elem->log_size = alloc_size - order;
- }
-
- if (hws_pool_create_resource(pool, alloc_size)) {
- mlx5hws_err(pool->ctx, "Failed to create resource type: %d: size %d\n",
- pool->type, alloc_size);
- goto free_db;
- }
-
- pool->db.element = elem;
+ bitmap_fill(bitmap, 1 << log_range);
- return elem;
-
-free_db:
- bitmap_free(elem->bitmap);
-free_elem:
- kfree(elem);
- return NULL;
+ return bitmap;
}
-static int hws_pool_element_find_seg(struct mlx5hws_pool_elements *elem, int *seg)
+static int hws_pool_bitmap_init(struct mlx5hws_pool *pool)
{
- unsigned int segment, size;
+ unsigned long *bitmap;
- size = 1 << elem->log_size;
-
- segment = find_first_bit(elem->bitmap, size);
- if (segment >= size) {
- elem->is_full = true;
+ bitmap = hws_pool_create_and_init_bitmap(pool->alloc_log_sz);
+ if (!bitmap) {
+ mlx5hws_err(pool->ctx, "Failed to create bitmap order: %zu\n",
+ pool->alloc_log_sz);
return -ENOMEM;
}
- bitmap_clear(elem->bitmap, segment, 1);
- *seg = segment;
- return 0;
-}
-
-static int
-hws_pool_onesize_element_get_mem_chunk(struct mlx5hws_pool *pool, u32 order,
- int *seg)
-{
- struct mlx5hws_pool_elements *elem;
-
- elem = pool->db.element;
- if (!elem)
- elem = hws_pool_element_create_new_elem(pool, order);
- if (!elem)
- goto err_no_elem;
-
- if (hws_pool_element_find_seg(elem, seg) != 0) {
- mlx5hws_err(pool->ctx, "No more resources (last request order: %d)\n", order);
+ if (hws_pool_resource_alloc(pool) != 0) {
+ mlx5hws_err(pool->ctx, "Failed to create resource type: %d: size %zu\n",
+ pool->type, pool->alloc_log_sz);
+ bitmap_free(bitmap);
return -ENOMEM;
}
- elem->num_of_elements++;
- return 0;
+ pool->db.bitmap = bitmap;
-err_no_elem:
- mlx5hws_err(pool->ctx, "Failed to allocate element for order: %d\n", order);
- return -ENOMEM;
+ return 0;
}
-static int hws_pool_general_element_get_mem_chunk(struct mlx5hws_pool *pool,
- u32 order, int *seg)
+static int hws_pool_bitmap_db_get_chunk(struct mlx5hws_pool *pool,
+ struct mlx5hws_pool_chunk *chunk)
{
- int ret;
+ unsigned long *bitmap, size;
- if (!pool->resource) {
- ret = hws_pool_create_resource(pool, order);
- if (ret)
- goto err_no_res;
- *seg = 0; /* One memory slot in that element */
- return 0;
+ if (chunk->order != 0) {
+ mlx5hws_err(pool->ctx, "Pool only supports order 0 allocs\n");
+ return -EINVAL;
}
- mlx5hws_err(pool->ctx, "No more resources (last request order: %d)\n", order);
- return -ENOMEM;
-
-err_no_res:
- mlx5hws_err(pool->ctx, "Failed to allocate element for order: %d\n", order);
- return -ENOMEM;
-}
-
-static int hws_pool_general_element_db_get_chunk(struct mlx5hws_pool *pool,
- struct mlx5hws_pool_chunk *chunk)
-{
- int ret;
-
- ret = hws_pool_general_element_get_mem_chunk(pool, chunk->order,
- &chunk->offset);
- if (ret)
- mlx5hws_err(pool->ctx, "Failed to get free slot for chunk with order: %d\n",
- chunk->order);
-
- return ret;
-}
+ bitmap = pool->db.bitmap;
+ if (!bitmap) {
+ mlx5hws_err(pool->ctx, "Bad bitmap state\n");
+ return -EINVAL;
+ }
-static void hws_pool_general_element_db_put_chunk(struct mlx5hws_pool *pool,
- struct mlx5hws_pool_chunk *chunk)
-{
- if (pool->flags & MLX5HWS_POOL_FLAGS_RELEASE_FREE_RESOURCE)
- hws_pool_resource_free(pool);
-}
+ size = 1 << pool->alloc_log_sz;
-static void hws_pool_general_element_db_uninit(struct mlx5hws_pool *pool)
-{
- (void)pool;
-}
+ chunk->offset = find_first_bit(bitmap, size);
+ if (chunk->offset >= size)
+ return -ENOMEM;
-/* This memory management works as the following:
- * - At start doesn't allocate no mem at all.
- * - When new request for chunk arrived:
- * allocate resource and give it.
- * - When free that chunk:
- * the resource is freed.
- */
-static int hws_pool_general_element_db_init(struct mlx5hws_pool *pool)
-{
- pool->p_db_uninit = &hws_pool_general_element_db_uninit;
- pool->p_get_chunk = &hws_pool_general_element_db_get_chunk;
- pool->p_put_chunk = &hws_pool_general_element_db_put_chunk;
+ bitmap_clear(bitmap, chunk->offset, 1);
return 0;
}
-static void
-hws_onesize_element_db_destroy_element(struct mlx5hws_pool *pool,
- struct mlx5hws_pool_elements *elem)
-{
- hws_pool_resource_free(pool);
- bitmap_free(elem->bitmap);
- kfree(elem);
- pool->db.element = NULL;
-}
-
-static void hws_onesize_element_db_put_chunk(struct mlx5hws_pool *pool,
- struct mlx5hws_pool_chunk *chunk)
+static void hws_pool_bitmap_db_put_chunk(struct mlx5hws_pool *pool,
+ struct mlx5hws_pool_chunk *chunk)
{
- struct mlx5hws_pool_elements *elem;
+ unsigned long *bitmap;
- elem = pool->db.element;
- if (!elem) {
- mlx5hws_err(pool->ctx, "Pool element was not allocated\n");
+ bitmap = pool->db.bitmap;
+ if (!bitmap) {
+ mlx5hws_err(pool->ctx, "Bad bitmap state\n");
return;
}
- bitmap_set(elem->bitmap, chunk->offset, 1);
- elem->is_full = false;
- elem->num_of_elements--;
-
- if (pool->flags & MLX5HWS_POOL_FLAGS_RELEASE_FREE_RESOURCE &&
- !elem->num_of_elements)
- hws_onesize_element_db_destroy_element(pool, elem);
+ bitmap_set(bitmap, chunk->offset, 1);
}
-static int hws_onesize_element_db_get_chunk(struct mlx5hws_pool *pool,
- struct mlx5hws_pool_chunk *chunk)
+static void hws_pool_bitmap_db_uninit(struct mlx5hws_pool *pool)
{
- int ret = 0;
-
- ret = hws_pool_onesize_element_get_mem_chunk(pool, chunk->order,
- &chunk->offset);
- if (ret)
- mlx5hws_err(pool->ctx, "Failed to get free slot for chunk with order: %d\n",
- chunk->order);
+ unsigned long *bitmap;
- return ret;
+ bitmap = pool->db.bitmap;
+ if (bitmap) {
+ bitmap_free(bitmap);
+ pool->db.bitmap = NULL;
+ }
}
-static void hws_onesize_element_db_uninit(struct mlx5hws_pool *pool)
+static int hws_pool_bitmap_db_init(struct mlx5hws_pool *pool)
{
- struct mlx5hws_pool_elements *elem = pool->db.element;
+ int ret;
- if (elem) {
- bitmap_free(elem->bitmap);
- kfree(elem);
- pool->db.element = NULL;
- }
-}
+ ret = hws_pool_bitmap_init(pool);
+ if (ret)
+ return ret;
-/* This memory management works as the following:
- * - At start doesn't allocate no mem at all.
- * - When new request for chunk arrived:
- * aloocate the first and only slot of memory/resource
- * when it ended return error.
- */
-static int hws_pool_onesize_element_db_init(struct mlx5hws_pool *pool)
-{
- pool->p_db_uninit = &hws_onesize_element_db_uninit;
- pool->p_get_chunk = &hws_onesize_element_db_get_chunk;
- pool->p_put_chunk = &hws_onesize_element_db_put_chunk;
+ pool->p_db_uninit = &hws_pool_bitmap_db_uninit;
+ pool->p_get_chunk = &hws_pool_bitmap_db_get_chunk;
+ pool->p_put_chunk = &hws_pool_bitmap_db_put_chunk;
return 0;
}
@@ -464,15 +298,14 @@ static int hws_pool_db_init(struct mlx5hws_pool *pool,
{
int ret;
- if (db_type == MLX5HWS_POOL_DB_TYPE_GENERAL_SIZE)
- ret = hws_pool_general_element_db_init(pool);
- else if (db_type == MLX5HWS_POOL_DB_TYPE_ONE_SIZE_RESOURCE)
- ret = hws_pool_onesize_element_db_init(pool);
+ if (db_type == MLX5HWS_POOL_DB_TYPE_BITMAP)
+ ret = hws_pool_bitmap_db_init(pool);
else
- ret = hws_pool_buddy_db_init(pool, pool->alloc_log_sz);
+ ret = hws_pool_buddy_db_init(pool);
if (ret) {
- mlx5hws_err(pool->ctx, "Failed to init general db : %d (ret: %d)\n", db_type, ret);
+ mlx5hws_err(pool->ctx, "Failed to init pool type: %d (ret: %d)\n",
+ db_type, ret);
return ret;
}
@@ -521,15 +354,10 @@ mlx5hws_pool_create(struct mlx5hws_context *ctx, struct mlx5hws_pool_attr *pool_
pool->tbl_type = pool_attr->table_type;
pool->opt_type = pool_attr->opt_type;
- /* Support general db */
- if (pool->flags == (MLX5HWS_POOL_FLAGS_RELEASE_FREE_RESOURCE |
- MLX5HWS_POOL_FLAGS_RESOURCE_PER_CHUNK))
- res_db_type = MLX5HWS_POOL_DB_TYPE_GENERAL_SIZE;
- else if (pool->flags == (MLX5HWS_POOL_FLAGS_ONE_RESOURCE |
- MLX5HWS_POOL_FLAGS_FIXED_SIZE_OBJECTS))
- res_db_type = MLX5HWS_POOL_DB_TYPE_ONE_SIZE_RESOURCE;
- else
+ if (pool->flags & MLX5HWS_POOL_FLAG_BUDDY)
res_db_type = MLX5HWS_POOL_DB_TYPE_BUDDY;
+ else
+ res_db_type = MLX5HWS_POOL_DB_TYPE_BITMAP;
pool->alloc_log_sz = pool_attr->alloc_log_sz;
@@ -545,7 +373,7 @@ mlx5hws_pool_create(struct mlx5hws_context *ctx, struct mlx5hws_pool_attr *pool_
return NULL;
}
-int mlx5hws_pool_destroy(struct mlx5hws_pool *pool)
+void mlx5hws_pool_destroy(struct mlx5hws_pool *pool)
{
mutex_destroy(&pool->lock);
@@ -555,5 +383,4 @@ int mlx5hws_pool_destroy(struct mlx5hws_pool *pool)
hws_pool_db_unint(pool);
kfree(pool);
- return 0;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h
index 112a61cd2997..9a781a87f097 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h
@@ -23,29 +23,10 @@ struct mlx5hws_pool_resource {
};
enum mlx5hws_pool_flags {
- /* Only a one resource in that pool */
- MLX5HWS_POOL_FLAGS_ONE_RESOURCE = 1 << 0,
- MLX5HWS_POOL_FLAGS_RELEASE_FREE_RESOURCE = 1 << 1,
- /* No sharing resources between chunks */
- MLX5HWS_POOL_FLAGS_RESOURCE_PER_CHUNK = 1 << 2,
- /* All objects are in the same size */
- MLX5HWS_POOL_FLAGS_FIXED_SIZE_OBJECTS = 1 << 3,
- /* Managed by buddy allocator */
- MLX5HWS_POOL_FLAGS_BUDDY_MANAGED = 1 << 4,
- /* Allocate pool_type memory on pool creation */
- MLX5HWS_POOL_FLAGS_ALLOC_MEM_ON_CREATE = 1 << 5,
-
- /* These values should be used by the caller */
- MLX5HWS_POOL_FLAGS_FOR_STC_POOL =
- MLX5HWS_POOL_FLAGS_ONE_RESOURCE |
- MLX5HWS_POOL_FLAGS_FIXED_SIZE_OBJECTS,
- MLX5HWS_POOL_FLAGS_FOR_MATCHER_STE_POOL =
- MLX5HWS_POOL_FLAGS_RELEASE_FREE_RESOURCE |
- MLX5HWS_POOL_FLAGS_RESOURCE_PER_CHUNK,
- MLX5HWS_POOL_FLAGS_FOR_STE_ACTION_POOL =
- MLX5HWS_POOL_FLAGS_ONE_RESOURCE |
- MLX5HWS_POOL_FLAGS_BUDDY_MANAGED |
- MLX5HWS_POOL_FLAGS_ALLOC_MEM_ON_CREATE,
+ /* Managed by a buddy allocator. If this is not set only allocations of
+ * order 0 are supported.
+ */
+ MLX5HWS_POOL_FLAG_BUDDY = BIT(0),
};
enum mlx5hws_pool_optimize {
@@ -64,25 +45,16 @@ struct mlx5hws_pool_attr {
};
enum mlx5hws_db_type {
- /* Uses for allocating chunk of big memory, each element has its own resource in the FW*/
- MLX5HWS_POOL_DB_TYPE_GENERAL_SIZE,
- /* One resource only, all the elements are with same one size */
- MLX5HWS_POOL_DB_TYPE_ONE_SIZE_RESOURCE,
+ /* Uses a bitmap, supports only allocations of order 0. */
+ MLX5HWS_POOL_DB_TYPE_BITMAP,
/* Entries are managed using a buddy mechanism. */
MLX5HWS_POOL_DB_TYPE_BUDDY,
};
-struct mlx5hws_pool_elements {
- u32 num_of_elements;
- unsigned long *bitmap;
- u32 log_size;
- bool is_full;
-};
-
struct mlx5hws_pool_db {
enum mlx5hws_db_type type;
union {
- struct mlx5hws_pool_elements *element;
+ unsigned long *bitmap;
struct mlx5hws_buddy_mem *buddy;
};
};
@@ -103,7 +75,6 @@ struct mlx5hws_pool {
enum mlx5hws_pool_optimize opt_type;
struct mlx5hws_pool_resource *resource;
struct mlx5hws_pool_resource *mirror_resource;
- /* DB */
struct mlx5hws_pool_db db;
/* Functions */
mlx5hws_pool_unint_db p_db_uninit;
@@ -115,7 +86,7 @@ struct mlx5hws_pool *
mlx5hws_pool_create(struct mlx5hws_context *ctx,
struct mlx5hws_pool_attr *pool_attr);
-int mlx5hws_pool_destroy(struct mlx5hws_pool *pool);
+void mlx5hws_pool_destroy(struct mlx5hws_pool *pool);
int mlx5hws_pool_chunk_alloc(struct mlx5hws_pool *pool,
struct mlx5hws_pool_chunk *chunk);
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,265 @@
From 333144760a660c248b241cf555a88ed2447c29b1 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:41:59 -0400
Subject: [PATCH] net/mlx5: HWS, Cleanup after pool refactoring
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit 43a2038c6d8a810e8e70f0e7fcb965f431c92bfb
Author: Vlad Dogaru <vdogaru@nvidia.com>
Date: Thu Apr 10 22:17:35 2025 +0300
net/mlx5: HWS, Cleanup after pool refactoring
Remove members which are now no longer used. In fact, many of the
`struct mlx5hws_pool_chunk` were not even written to beyond being
initialized, but they were used in various internals.
Also cleanup some local variables which made more sense when the API was
thicker.
Signed-off-by: Vlad Dogaru <vdogaru@nvidia.com>
Reviewed-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Michal Kubiak <michal.kubiak@intel.com>
Link: https://patch.msgid.link/1744312662-356571-6-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c
index 39904b337b81..161ad720b339 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c
@@ -1583,7 +1583,6 @@ hws_action_create_dest_match_range_table(struct mlx5hws_context *ctx,
struct mlx5hws_matcher_action_ste *table_ste;
struct mlx5hws_pool_attr pool_attr = {0};
struct mlx5hws_pool *ste_pool, *stc_pool;
- struct mlx5hws_pool_chunk *ste;
u32 *rtc_0_id, *rtc_1_id;
u32 obj_id;
int ret;
@@ -1613,8 +1612,6 @@ hws_action_create_dest_match_range_table(struct mlx5hws_context *ctx,
rtc_0_id = &table_ste->rtc_0_id;
rtc_1_id = &table_ste->rtc_1_id;
ste_pool = table_ste->pool;
- ste = &table_ste->ste;
- ste->order = 1;
rtc_attr.log_size = 0;
rtc_attr.log_depth = 0;
@@ -1630,7 +1627,6 @@ hws_action_create_dest_match_range_table(struct mlx5hws_context *ctx,
rtc_attr.pd = ctx->pd_num;
rtc_attr.ste_base = obj_id;
- rtc_attr.ste_offset = ste->offset;
rtc_attr.reparse_mode = mlx5hws_context_get_reparse_mode(ctx);
rtc_attr.table_type = mlx5hws_table_get_res_fw_ft_type(MLX5HWS_TABLE_TYPE_FDB, false);
@@ -1833,7 +1829,6 @@ mlx5hws_action_create_dest_match_range(struct mlx5hws_context *ctx,
stc_attr.action_offset = MLX5HWS_ACTION_OFFSET_HIT;
stc_attr.action_type = MLX5_IFC_STC_ACTION_TYPE_JUMP_TO_STE_TABLE;
stc_attr.reparse_mode = MLX5_IFC_STC_REPARSE_IGNORE;
- stc_attr.ste_table.ste = table_ste->ste;
stc_attr.ste_table.ste_pool = table_ste->pool;
stc_attr.ste_table.match_definer_id = ctx->caps->trivial_match_definer;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c
index e8f98c109b99..9c83753e4592 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c
@@ -406,7 +406,6 @@ int mlx5hws_cmd_rtc_create(struct mlx5_core_dev *mdev,
MLX5_SET(rtc, attr, match_definer_1, rtc_attr->match_definer_1);
MLX5_SET(rtc, attr, stc_id, rtc_attr->stc_base);
MLX5_SET(rtc, attr, ste_table_base_id, rtc_attr->ste_base);
- MLX5_SET(rtc, attr, ste_table_offset, rtc_attr->ste_offset);
MLX5_SET(rtc, attr, miss_flow_table_id, rtc_attr->miss_ft_id);
MLX5_SET(rtc, attr, reparse_mode, rtc_attr->reparse_mode);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h
index 51d9e0291ac1..fa6bff210266 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h
@@ -70,7 +70,6 @@ struct mlx5hws_cmd_rtc_create_attr {
u32 pd;
u32 stc_base;
u32 ste_base;
- u32 ste_offset;
u32 miss_ft_id;
bool fw_gen_wqe;
u8 update_index_mode;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c
index 95d31fd6c976..3028e0387e3f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c
@@ -197,22 +197,15 @@ static int hws_matcher_disconnect(struct mlx5hws_matcher *matcher)
static void hws_matcher_set_rtc_attr_sz(struct mlx5hws_matcher *matcher,
struct mlx5hws_cmd_rtc_create_attr *rtc_attr,
- enum mlx5hws_matcher_rtc_type rtc_type,
bool is_mirror)
{
- struct mlx5hws_pool_chunk *ste = &matcher->action_ste.ste;
enum mlx5hws_matcher_flow_src flow_src = matcher->attr.optimize_flow_src;
- bool is_match_rtc = rtc_type == HWS_MATCHER_RTC_TYPE_MATCH;
if ((flow_src == MLX5HWS_MATCHER_FLOW_SRC_VPORT && !is_mirror) ||
(flow_src == MLX5HWS_MATCHER_FLOW_SRC_WIRE && is_mirror)) {
/* Optimize FDB RTC */
rtc_attr->log_size = 0;
rtc_attr->log_depth = 0;
- } else {
- /* Keep original values */
- rtc_attr->log_size = is_match_rtc ? matcher->attr.table.sz_row_log : ste->order;
- rtc_attr->log_depth = is_match_rtc ? matcher->attr.table.sz_col_log : 0;
}
}
@@ -225,8 +218,7 @@ static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher,
struct mlx5hws_context *ctx = matcher->tbl->ctx;
struct mlx5hws_matcher_action_ste *action_ste;
struct mlx5hws_table *tbl = matcher->tbl;
- struct mlx5hws_pool *ste_pool, *stc_pool;
- struct mlx5hws_pool_chunk *ste;
+ struct mlx5hws_pool *ste_pool;
u32 *rtc_0_id, *rtc_1_id;
u32 obj_id;
int ret;
@@ -236,8 +228,6 @@ static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher,
rtc_0_id = &matcher->match_ste.rtc_0_id;
rtc_1_id = &matcher->match_ste.rtc_1_id;
ste_pool = matcher->match_ste.pool;
- ste = &matcher->match_ste.ste;
- ste->order = attr->table.sz_col_log + attr->table.sz_row_log;
rtc_attr.log_size = attr->table.sz_row_log;
rtc_attr.log_depth = attr->table.sz_col_log;
@@ -273,16 +263,15 @@ static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher,
rtc_0_id = &action_ste->rtc_0_id;
rtc_1_id = &action_ste->rtc_1_id;
ste_pool = action_ste->pool;
- ste = &action_ste->ste;
/* Action RTC size calculation:
* log((max number of rules in matcher) *
* (max number of action STEs per rule) *
* (2 to support writing new STEs for update rule))
*/
- ste->order = ilog2(roundup_pow_of_two(action_ste->max_stes)) +
- attr->table.sz_row_log +
- MLX5HWS_MATCHER_ACTION_RTC_UPDATE_MULT;
- rtc_attr.log_size = ste->order;
+ rtc_attr.log_size =
+ ilog2(roundup_pow_of_two(action_ste->max_stes)) +
+ attr->table.sz_row_log +
+ MLX5HWS_MATCHER_ACTION_RTC_UPDATE_MULT;
rtc_attr.log_depth = 0;
rtc_attr.update_index_mode = MLX5_IFC_RTC_STE_UPDATE_MODE_BY_OFFSET;
/* The action STEs use the default always hit definer */
@@ -300,21 +289,19 @@ static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher,
rtc_attr.pd = ctx->pd_num;
rtc_attr.ste_base = obj_id;
- rtc_attr.ste_offset = ste->offset;
rtc_attr.reparse_mode = mlx5hws_context_get_reparse_mode(ctx);
rtc_attr.table_type = mlx5hws_table_get_res_fw_ft_type(tbl->type, false);
- hws_matcher_set_rtc_attr_sz(matcher, &rtc_attr, rtc_type, false);
+ hws_matcher_set_rtc_attr_sz(matcher, &rtc_attr, false);
/* STC is a single resource (obj_id), use any STC for the ID */
- stc_pool = ctx->stc_pool;
- obj_id = mlx5hws_pool_get_base_id(stc_pool);
+ obj_id = mlx5hws_pool_get_base_id(ctx->stc_pool);
rtc_attr.stc_base = obj_id;
ret = mlx5hws_cmd_rtc_create(ctx->mdev, &rtc_attr, rtc_0_id);
if (ret) {
mlx5hws_err(ctx, "Failed to create matcher RTC of type %s",
hws_matcher_rtc_type_to_str(rtc_type));
- goto free_ste;
+ return ret;
}
if (tbl->type == MLX5HWS_TABLE_TYPE_FDB) {
@@ -322,9 +309,9 @@ static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher,
rtc_attr.ste_base = obj_id;
rtc_attr.table_type = mlx5hws_table_get_res_fw_ft_type(tbl->type, true);
- obj_id = mlx5hws_pool_get_base_mirror_id(stc_pool);
+ obj_id = mlx5hws_pool_get_base_mirror_id(ctx->stc_pool);
rtc_attr.stc_base = obj_id;
- hws_matcher_set_rtc_attr_sz(matcher, &rtc_attr, rtc_type, true);
+ hws_matcher_set_rtc_attr_sz(matcher, &rtc_attr, true);
ret = mlx5hws_cmd_rtc_create(ctx->mdev, &rtc_attr, rtc_1_id);
if (ret) {
@@ -338,16 +325,12 @@ static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher,
destroy_rtc_0:
mlx5hws_cmd_rtc_destroy(ctx->mdev, *rtc_0_id);
-free_ste:
- if (rtc_type == HWS_MATCHER_RTC_TYPE_MATCH)
- mlx5hws_pool_chunk_free(ste_pool, ste);
return ret;
}
static void hws_matcher_destroy_rtc(struct mlx5hws_matcher *matcher,
enum mlx5hws_matcher_rtc_type rtc_type)
{
- struct mlx5hws_matcher_action_ste *action_ste;
struct mlx5hws_table *tbl = matcher->tbl;
u32 rtc_0_id, rtc_1_id;
@@ -357,18 +340,17 @@ static void hws_matcher_destroy_rtc(struct mlx5hws_matcher *matcher,
rtc_1_id = matcher->match_ste.rtc_1_id;
break;
case HWS_MATCHER_RTC_TYPE_STE_ARRAY:
- action_ste = &matcher->action_ste;
- rtc_0_id = action_ste->rtc_0_id;
- rtc_1_id = action_ste->rtc_1_id;
+ rtc_0_id = matcher->action_ste.rtc_0_id;
+ rtc_1_id = matcher->action_ste.rtc_1_id;
break;
default:
return;
}
if (tbl->type == MLX5HWS_TABLE_TYPE_FDB)
- mlx5hws_cmd_rtc_destroy(matcher->tbl->ctx->mdev, rtc_1_id);
+ mlx5hws_cmd_rtc_destroy(tbl->ctx->mdev, rtc_1_id);
- mlx5hws_cmd_rtc_destroy(matcher->tbl->ctx->mdev, rtc_0_id);
+ mlx5hws_cmd_rtc_destroy(tbl->ctx->mdev, rtc_0_id);
}
static int
@@ -564,7 +546,6 @@ static int hws_matcher_bind_at(struct mlx5hws_matcher *matcher)
stc_attr.action_offset = MLX5HWS_ACTION_OFFSET_HIT;
stc_attr.action_type = MLX5_IFC_STC_ACTION_TYPE_JUMP_TO_STE_TABLE;
stc_attr.reparse_mode = MLX5_IFC_STC_REPARSE_IGNORE;
- stc_attr.ste_table.ste = action_ste->ste;
stc_attr.ste_table.ste_pool = action_ste->pool;
stc_attr.ste_table.match_definer_id = ctx->caps->trivial_match_definer;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h
index 20b32012c418..0450b6175ac9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h
@@ -45,14 +45,12 @@ struct mlx5hws_match_template {
};
struct mlx5hws_matcher_match_ste {
- struct mlx5hws_pool_chunk ste;
u32 rtc_0_id;
u32 rtc_1_id;
struct mlx5hws_pool *pool;
};
struct mlx5hws_matcher_action_ste {
- struct mlx5hws_pool_chunk ste;
struct mlx5hws_pool_chunk stc;
u32 rtc_0_id;
u32 rtc_1_id;
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,108 @@
From 2cd06eab502130ff9491b0f14378269d658826c8 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:41:59 -0400
Subject: [PATCH] net/mlx5: HWS, Add fullness tracking to pool
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit 04562694766514f00e7086d3d4884db5f3a22d4e
Author: Vlad Dogaru <vdogaru@nvidia.com>
Date: Thu Apr 10 22:17:36 2025 +0300
net/mlx5: HWS, Add fullness tracking to pool
Future users will need to query whether a pool is empty.
Signed-off-by: Vlad Dogaru <vdogaru@nvidia.com>
Reviewed-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Michal Kubiak <michal.kubiak@intel.com>
Link: https://patch.msgid.link/1744312662-356571-7-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c
index 270b333faab3..26d85fe3c417 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c
@@ -324,6 +324,8 @@ int mlx5hws_pool_chunk_alloc(struct mlx5hws_pool *pool,
mutex_lock(&pool->lock);
ret = pool->p_get_chunk(pool, chunk);
+ if (ret == 0)
+ pool->available_elems -= 1 << chunk->order;
mutex_unlock(&pool->lock);
return ret;
@@ -334,6 +336,7 @@ void mlx5hws_pool_chunk_free(struct mlx5hws_pool *pool,
{
mutex_lock(&pool->lock);
pool->p_put_chunk(pool, chunk);
+ pool->available_elems += 1 << chunk->order;
mutex_unlock(&pool->lock);
}
@@ -360,6 +363,7 @@ mlx5hws_pool_create(struct mlx5hws_context *ctx, struct mlx5hws_pool_attr *pool_
res_db_type = MLX5HWS_POOL_DB_TYPE_BITMAP;
pool->alloc_log_sz = pool_attr->alloc_log_sz;
+ pool->available_elems = 1 << pool_attr->alloc_log_sz;
if (hws_pool_db_init(pool, res_db_type))
goto free_pool;
@@ -377,6 +381,9 @@ void mlx5hws_pool_destroy(struct mlx5hws_pool *pool)
{
mutex_destroy(&pool->lock);
+ if (pool->available_elems != 1 << pool->alloc_log_sz)
+ mlx5hws_err(pool->ctx, "Attempting to destroy non-empty pool\n");
+
if (pool->resource)
hws_pool_resource_free(pool);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h
index 9a781a87f097..c82760d53e1a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h
@@ -71,6 +71,7 @@ struct mlx5hws_pool {
enum mlx5hws_pool_flags flags;
struct mutex lock; /* protect the pool */
size_t alloc_log_sz;
+ size_t available_elems;
enum mlx5hws_table_type tbl_type;
enum mlx5hws_pool_optimize opt_type;
struct mlx5hws_pool_resource *resource;
@@ -103,4 +104,28 @@ static inline u32 mlx5hws_pool_get_base_mirror_id(struct mlx5hws_pool *pool)
{
return pool->mirror_resource->base_id;
}
+
+static inline bool
+mlx5hws_pool_empty(struct mlx5hws_pool *pool)
+{
+ bool ret;
+
+ mutex_lock(&pool->lock);
+ ret = pool->available_elems == 0;
+ mutex_unlock(&pool->lock);
+
+ return ret;
+}
+
+static inline bool
+mlx5hws_pool_full(struct mlx5hws_pool *pool)
+{
+ bool ret;
+
+ mutex_lock(&pool->lock);
+ ret = pool->available_elems == (1 << pool->alloc_log_sz);
+ mutex_unlock(&pool->lock);
+
+ return ret;
+}
#endif /* MLX5HWS_POOL_H_ */
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,53 @@
From 83591e87d75f1fbe1bad278c3b590cc83e85c276 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:41:59 -0400
Subject: [PATCH] net/mlx5: HWS, Fix pool size optimization
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit a68334f9750f41fc36990840090ef9dbee1e2c7e
Author: Vlad Dogaru <vdogaru@nvidia.com>
Date: Thu Apr 10 22:17:37 2025 +0300
net/mlx5: HWS, Fix pool size optimization
The optimization to create a size-one STE range for the unused direction
was broken. The hardware prevents us from creating RTCs over unallocated
STE space, so the only reason this has worked so far is because the
optimization was never used.
Signed-off-by: Vlad Dogaru <vdogaru@nvidia.com>
Reviewed-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Michal Kubiak <michal.kubiak@intel.com>
Link: https://patch.msgid.link/1744312662-356571-8-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c
index 26d85fe3c417..7e37d6e9eb83 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c
@@ -80,7 +80,7 @@ static int hws_pool_resource_alloc(struct mlx5hws_pool *pool)
u32 fw_ft_type, opt_log_range;
fw_ft_type = mlx5hws_table_get_res_fw_ft_type(pool->tbl_type, false);
- opt_log_range = pool->opt_type == MLX5HWS_POOL_OPTIMIZE_ORIG ?
+ opt_log_range = pool->opt_type == MLX5HWS_POOL_OPTIMIZE_MIRROR ?
0 : pool->alloc_log_sz;
resource = hws_pool_create_one_resource(pool, opt_log_range, fw_ft_type);
if (!resource) {
@@ -94,7 +94,7 @@ static int hws_pool_resource_alloc(struct mlx5hws_pool *pool)
struct mlx5hws_pool_resource *mirror_resource;
fw_ft_type = mlx5hws_table_get_res_fw_ft_type(pool->tbl_type, true);
- opt_log_range = pool->opt_type == MLX5HWS_POOL_OPTIMIZE_MIRROR ?
+ opt_log_range = pool->opt_type == MLX5HWS_POOL_OPTIMIZE_ORIG ?
0 : pool->alloc_log_sz;
mirror_resource = hws_pool_create_one_resource(pool, opt_log_range, fw_ft_type);
if (!mirror_resource) {
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,585 @@
From be613df0e750dc94c718ad4c944fa5542870a95c Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:42:00 -0400
Subject: [PATCH] net/mlx5: HWS, Implement action STE pool
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit 983d01b2ce0ac688bb42489f33a29a02274366d5
Author: Vlad Dogaru <vdogaru@nvidia.com>
Date: Thu Apr 10 22:17:38 2025 +0300
net/mlx5: HWS, Implement action STE pool
Implement a per-queue pool of action STEs that match STEs can link to,
regardless of matcher.
The code relies on hints to optimize whether a given rule is added to
rx-only, tx-only or both. Correspondingly, action STEs need to be added
to different RTC for ingress or egress paths. For rx-and-tx rules, the
current rule implementation dictates that the offsets for a given rule
must be the same in both RTCs.
To avoid wasting STEs, each action STE pool element holds 3 pools:
rx-only, tx-only, and rx-and-tx, corresponding to the possible values of
the pool optimization enum. The implementation then chooses at rule
creation / update which of these elements to allocate from.
Each element holds multiple action STE tables, which wrap an RTC, an STE
range, the logic to buddy-allocate offsets from the range, and an STC
that allows match STEs to point to this table. When allocating offsets
from an element, we iterate through available action STE tables and, if
needed, create a new table.
Similar to the previous implementation, this iteration does not free any
resources. This is implemented in a subsequent patch.
Signed-off-by: Vlad Dogaru <vdogaru@nvidia.com>
Reviewed-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Michal Kubiak <michal.kubiak@intel.com>
Link: https://patch.msgid.link/1744312662-356571-9-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 568bbe5f83f5..d292e6a9e22c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -154,7 +154,8 @@ mlx5_core-$(CONFIG_MLX5_HW_STEERING) += steering/hws/cmd.o \
steering/hws/vport.o \
steering/hws/bwc_complex.o \
steering/hws/fs_hws_pools.o \
- steering/hws/fs_hws.o
+ steering/hws/fs_hws.o \
+ steering/hws/action_ste_pool.o
#
# SF device
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.c
new file mode 100644
index 000000000000..cb6ad8411631
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.c
@@ -0,0 +1,387 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2025 NVIDIA Corporation & Affiliates */
+
+#include "internal.h"
+
+static const char *
+hws_pool_opt_to_str(enum mlx5hws_pool_optimize opt)
+{
+ switch (opt) {
+ case MLX5HWS_POOL_OPTIMIZE_NONE:
+ return "rx-and-tx";
+ case MLX5HWS_POOL_OPTIMIZE_ORIG:
+ return "rx-only";
+ case MLX5HWS_POOL_OPTIMIZE_MIRROR:
+ return "tx-only";
+ default:
+ return "unknown";
+ }
+}
+
+static int
+hws_action_ste_table_create_pool(struct mlx5hws_context *ctx,
+ struct mlx5hws_action_ste_table *action_tbl,
+ enum mlx5hws_pool_optimize opt, size_t log_sz)
+{
+ struct mlx5hws_pool_attr pool_attr = { 0 };
+
+ pool_attr.pool_type = MLX5HWS_POOL_TYPE_STE;
+ pool_attr.table_type = MLX5HWS_TABLE_TYPE_FDB;
+ pool_attr.flags = MLX5HWS_POOL_FLAG_BUDDY;
+ pool_attr.opt_type = opt;
+ pool_attr.alloc_log_sz = log_sz;
+
+ action_tbl->pool = mlx5hws_pool_create(ctx, &pool_attr);
+ if (!action_tbl->pool) {
+ mlx5hws_err(ctx, "Failed to allocate STE pool\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int hws_action_ste_table_create_single_rtc(
+ struct mlx5hws_context *ctx,
+ struct mlx5hws_action_ste_table *action_tbl,
+ enum mlx5hws_pool_optimize opt, size_t log_sz, bool tx)
+{
+ struct mlx5hws_cmd_rtc_create_attr rtc_attr = { 0 };
+ u32 *rtc_id;
+
+ rtc_attr.log_depth = 0;
+ rtc_attr.update_index_mode = MLX5_IFC_RTC_STE_UPDATE_MODE_BY_OFFSET;
+ /* Action STEs use the default always hit definer. */
+ rtc_attr.match_definer_0 = ctx->caps->trivial_match_definer;
+ rtc_attr.is_frst_jumbo = false;
+ rtc_attr.miss_ft_id = 0;
+ rtc_attr.pd = ctx->pd_num;
+ rtc_attr.reparse_mode = mlx5hws_context_get_reparse_mode(ctx);
+
+ if (tx) {
+ rtc_attr.table_type = FS_FT_FDB_TX;
+ rtc_attr.ste_base =
+ mlx5hws_pool_get_base_mirror_id(action_tbl->pool);
+ rtc_attr.stc_base =
+ mlx5hws_pool_get_base_mirror_id(ctx->stc_pool);
+ rtc_attr.log_size =
+ opt == MLX5HWS_POOL_OPTIMIZE_ORIG ? 0 : log_sz;
+ rtc_id = &action_tbl->rtc_1_id;
+ } else {
+ rtc_attr.table_type = FS_FT_FDB_RX;
+ rtc_attr.ste_base = mlx5hws_pool_get_base_id(action_tbl->pool);
+ rtc_attr.stc_base = mlx5hws_pool_get_base_id(ctx->stc_pool);
+ rtc_attr.log_size =
+ opt == MLX5HWS_POOL_OPTIMIZE_MIRROR ? 0 : log_sz;
+ rtc_id = &action_tbl->rtc_0_id;
+ }
+
+ return mlx5hws_cmd_rtc_create(ctx->mdev, &rtc_attr, rtc_id);
+}
+
+static int
+hws_action_ste_table_create_rtcs(struct mlx5hws_context *ctx,
+ struct mlx5hws_action_ste_table *action_tbl,
+ enum mlx5hws_pool_optimize opt, size_t log_sz)
+{
+ int err;
+
+ err = hws_action_ste_table_create_single_rtc(ctx, action_tbl, opt,
+ log_sz, false);
+ if (err)
+ return err;
+
+ err = hws_action_ste_table_create_single_rtc(ctx, action_tbl, opt,
+ log_sz, true);
+ if (err) {
+ mlx5hws_cmd_rtc_destroy(ctx->mdev, action_tbl->rtc_0_id);
+ return err;
+ }
+
+ return 0;
+}
+
+static void
+hws_action_ste_table_destroy_rtcs(struct mlx5hws_action_ste_table *action_tbl)
+{
+ mlx5hws_cmd_rtc_destroy(action_tbl->pool->ctx->mdev,
+ action_tbl->rtc_1_id);
+ mlx5hws_cmd_rtc_destroy(action_tbl->pool->ctx->mdev,
+ action_tbl->rtc_0_id);
+}
+
+static int
+hws_action_ste_table_create_stc(struct mlx5hws_context *ctx,
+ struct mlx5hws_action_ste_table *action_tbl)
+{
+ struct mlx5hws_cmd_stc_modify_attr stc_attr = { 0 };
+
+ stc_attr.action_offset = MLX5HWS_ACTION_OFFSET_HIT;
+ stc_attr.action_type = MLX5_IFC_STC_ACTION_TYPE_JUMP_TO_STE_TABLE;
+ stc_attr.reparse_mode = MLX5_IFC_STC_REPARSE_IGNORE;
+ stc_attr.ste_table.ste_pool = action_tbl->pool;
+ stc_attr.ste_table.match_definer_id = ctx->caps->trivial_match_definer;
+
+ return mlx5hws_action_alloc_single_stc(ctx, &stc_attr,
+ MLX5HWS_TABLE_TYPE_FDB,
+ &action_tbl->stc);
+}
+
+static struct mlx5hws_action_ste_table *
+hws_action_ste_table_alloc(struct mlx5hws_action_ste_pool_element *parent_elem)
+{
+ enum mlx5hws_pool_optimize opt = parent_elem->opt;
+ struct mlx5hws_context *ctx = parent_elem->ctx;
+ struct mlx5hws_action_ste_table *action_tbl;
+ size_t log_sz;
+ int err;
+
+ log_sz = min(parent_elem->log_sz ?
+ parent_elem->log_sz +
+ MLX5HWS_ACTION_STE_TABLE_STEP_LOG_SZ :
+ MLX5HWS_ACTION_STE_TABLE_INIT_LOG_SZ,
+ MLX5HWS_ACTION_STE_TABLE_MAX_LOG_SZ);
+
+ action_tbl = kzalloc(sizeof(*action_tbl), GFP_KERNEL);
+ if (!action_tbl)
+ return ERR_PTR(-ENOMEM);
+
+ err = hws_action_ste_table_create_pool(ctx, action_tbl, opt, log_sz);
+ if (err)
+ goto free_tbl;
+
+ err = hws_action_ste_table_create_rtcs(ctx, action_tbl, opt, log_sz);
+ if (err)
+ goto destroy_pool;
+
+ err = hws_action_ste_table_create_stc(ctx, action_tbl);
+ if (err)
+ goto destroy_rtcs;
+
+ action_tbl->parent_elem = parent_elem;
+ INIT_LIST_HEAD(&action_tbl->list_node);
+ list_add(&action_tbl->list_node, &parent_elem->available);
+ parent_elem->log_sz = log_sz;
+
+ mlx5hws_dbg(ctx,
+ "Allocated %s action STE table log_sz %zu; STEs (%d, %d); RTCs (%d, %d); STC %d\n",
+ hws_pool_opt_to_str(opt), log_sz,
+ mlx5hws_pool_get_base_id(action_tbl->pool),
+ mlx5hws_pool_get_base_mirror_id(action_tbl->pool),
+ action_tbl->rtc_0_id, action_tbl->rtc_1_id,
+ action_tbl->stc.offset);
+
+ return action_tbl;
+
+destroy_rtcs:
+ hws_action_ste_table_destroy_rtcs(action_tbl);
+destroy_pool:
+ mlx5hws_pool_destroy(action_tbl->pool);
+free_tbl:
+ kfree(action_tbl);
+
+ return ERR_PTR(err);
+}
+
+static void
+hws_action_ste_table_destroy(struct mlx5hws_action_ste_table *action_tbl)
+{
+ struct mlx5hws_context *ctx = action_tbl->parent_elem->ctx;
+
+ mlx5hws_dbg(ctx,
+ "Destroying %s action STE table: STEs (%d, %d); RTCs (%d, %d); STC %d\n",
+ hws_pool_opt_to_str(action_tbl->parent_elem->opt),
+ mlx5hws_pool_get_base_id(action_tbl->pool),
+ mlx5hws_pool_get_base_mirror_id(action_tbl->pool),
+ action_tbl->rtc_0_id, action_tbl->rtc_1_id,
+ action_tbl->stc.offset);
+
+ mlx5hws_action_free_single_stc(ctx, MLX5HWS_TABLE_TYPE_FDB,
+ &action_tbl->stc);
+ hws_action_ste_table_destroy_rtcs(action_tbl);
+ mlx5hws_pool_destroy(action_tbl->pool);
+
+ list_del(&action_tbl->list_node);
+ kfree(action_tbl);
+}
+
+static int
+hws_action_ste_pool_element_init(struct mlx5hws_context *ctx,
+ struct mlx5hws_action_ste_pool_element *elem,
+ enum mlx5hws_pool_optimize opt)
+{
+ elem->ctx = ctx;
+ elem->opt = opt;
+ INIT_LIST_HEAD(&elem->available);
+ INIT_LIST_HEAD(&elem->full);
+
+ return 0;
+}
+
+static void hws_action_ste_pool_element_destroy(
+ struct mlx5hws_action_ste_pool_element *elem)
+{
+ struct mlx5hws_action_ste_table *action_tbl, *p;
+
+ /* This should be empty, but attempt to free its elements anyway. */
+ list_for_each_entry_safe(action_tbl, p, &elem->full, list_node)
+ hws_action_ste_table_destroy(action_tbl);
+
+ list_for_each_entry_safe(action_tbl, p, &elem->available, list_node)
+ hws_action_ste_table_destroy(action_tbl);
+}
+
+static int hws_action_ste_pool_init(struct mlx5hws_context *ctx,
+ struct mlx5hws_action_ste_pool *pool)
+{
+ enum mlx5hws_pool_optimize opt;
+ int err;
+
+ /* Rules which are added for both RX and TX must use the same action STE
+ * indices for both. If we were to use a single table, then RX-only and
+ * TX-only rules would waste the unused entries. Thus, we use separate
+ * table sets for the three cases.
+ */
+ for (opt = MLX5HWS_POOL_OPTIMIZE_NONE; opt < MLX5HWS_POOL_OPTIMIZE_MAX;
+ opt++) {
+ err = hws_action_ste_pool_element_init(ctx, &pool->elems[opt],
+ opt);
+ if (err)
+ goto destroy_elems;
+ }
+
+ return 0;
+
+destroy_elems:
+ while (opt-- > MLX5HWS_POOL_OPTIMIZE_NONE)
+ hws_action_ste_pool_element_destroy(&pool->elems[opt]);
+
+ return err;
+}
+
+static void hws_action_ste_pool_destroy(struct mlx5hws_action_ste_pool *pool)
+{
+ int opt;
+
+ for (opt = MLX5HWS_POOL_OPTIMIZE_MAX - 1;
+ opt >= MLX5HWS_POOL_OPTIMIZE_NONE; opt--)
+ hws_action_ste_pool_element_destroy(&pool->elems[opt]);
+}
+
+int mlx5hws_action_ste_pool_init(struct mlx5hws_context *ctx)
+{
+ struct mlx5hws_action_ste_pool *pool;
+ size_t queues = ctx->queues;
+ int i, err;
+
+ pool = kcalloc(queues, sizeof(*pool), GFP_KERNEL);
+ if (!pool)
+ return -ENOMEM;
+
+ for (i = 0; i < queues; i++) {
+ err = hws_action_ste_pool_init(ctx, &pool[i]);
+ if (err)
+ goto free_pool;
+ }
+
+ ctx->action_ste_pool = pool;
+
+ return 0;
+
+free_pool:
+ while (i--)
+ hws_action_ste_pool_destroy(&pool[i]);
+ kfree(pool);
+
+ return err;
+}
+
+void mlx5hws_action_ste_pool_uninit(struct mlx5hws_context *ctx)
+{
+ size_t queues = ctx->queues;
+ int i;
+
+ for (i = 0; i < queues; i++)
+ hws_action_ste_pool_destroy(&ctx->action_ste_pool[i]);
+
+ kfree(ctx->action_ste_pool);
+}
+
+static struct mlx5hws_action_ste_pool_element *
+hws_action_ste_choose_elem(struct mlx5hws_action_ste_pool *pool,
+ bool skip_rx, bool skip_tx)
+{
+ if (skip_rx)
+ return &pool->elems[MLX5HWS_POOL_OPTIMIZE_MIRROR];
+
+ if (skip_tx)
+ return &pool->elems[MLX5HWS_POOL_OPTIMIZE_ORIG];
+
+ return &pool->elems[MLX5HWS_POOL_OPTIMIZE_NONE];
+}
+
+static int
+hws_action_ste_table_chunk_alloc(struct mlx5hws_action_ste_table *action_tbl,
+ struct mlx5hws_action_ste_chunk *chunk)
+{
+ int err;
+
+ err = mlx5hws_pool_chunk_alloc(action_tbl->pool, &chunk->ste);
+ if (err)
+ return err;
+
+ chunk->action_tbl = action_tbl;
+
+ return 0;
+}
+
+int mlx5hws_action_ste_chunk_alloc(struct mlx5hws_action_ste_pool *pool,
+ bool skip_rx, bool skip_tx,
+ struct mlx5hws_action_ste_chunk *chunk)
+{
+ struct mlx5hws_action_ste_pool_element *elem;
+ struct mlx5hws_action_ste_table *action_tbl;
+ bool found;
+ int err;
+
+ if (skip_rx && skip_tx)
+ return -EINVAL;
+
+ elem = hws_action_ste_choose_elem(pool, skip_rx, skip_tx);
+
+ mlx5hws_dbg(elem->ctx,
+ "Allocating action STEs skip_rx %d skip_tx %d order %d\n",
+ skip_rx, skip_tx, chunk->ste.order);
+
+ found = false;
+ list_for_each_entry(action_tbl, &elem->available, list_node) {
+ if (!hws_action_ste_table_chunk_alloc(action_tbl, chunk)) {
+ found = true;
+ break;
+ }
+ }
+
+ if (!found) {
+ action_tbl = hws_action_ste_table_alloc(elem);
+ if (IS_ERR(action_tbl))
+ return PTR_ERR(action_tbl);
+
+ err = hws_action_ste_table_chunk_alloc(action_tbl, chunk);
+ if (err)
+ return err;
+ }
+
+ if (mlx5hws_pool_empty(action_tbl->pool))
+ list_move(&action_tbl->list_node, &elem->full);
+
+ return 0;
+}
+
+void mlx5hws_action_ste_chunk_free(struct mlx5hws_action_ste_chunk *chunk)
+{
+ mlx5hws_dbg(chunk->action_tbl->pool->ctx,
+ "Freeing action STEs offset %d order %d\n",
+ chunk->ste.offset, chunk->ste.order);
+ mlx5hws_pool_chunk_free(chunk->action_tbl->pool, &chunk->ste);
+ list_move(&chunk->action_tbl->list_node,
+ &chunk->action_tbl->parent_elem->available);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.h
new file mode 100644
index 000000000000..2de660a63223
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2025 NVIDIA Corporation & Affiliates */
+
+#ifndef ACTION_STE_POOL_H_
+#define ACTION_STE_POOL_H_
+
+#define MLX5HWS_ACTION_STE_TABLE_INIT_LOG_SZ 10
+#define MLX5HWS_ACTION_STE_TABLE_STEP_LOG_SZ 1
+#define MLX5HWS_ACTION_STE_TABLE_MAX_LOG_SZ 20
+
+struct mlx5hws_action_ste_pool_element;
+
+struct mlx5hws_action_ste_table {
+ struct mlx5hws_action_ste_pool_element *parent_elem;
+ /* Wraps the RTC and STE range for this given action. */
+ struct mlx5hws_pool *pool;
+ /* Match STEs use this STC to jump to this pool's RTC. */
+ struct mlx5hws_pool_chunk stc;
+ u32 rtc_0_id;
+ u32 rtc_1_id;
+ struct list_head list_node;
+};
+
+struct mlx5hws_action_ste_pool_element {
+ struct mlx5hws_context *ctx;
+ size_t log_sz; /* Size of the largest table so far. */
+ enum mlx5hws_pool_optimize opt;
+ struct list_head available;
+ struct list_head full;
+};
+
+/* Central repository of action STEs. The context contains one of these pools
+ * per queue.
+ */
+struct mlx5hws_action_ste_pool {
+ struct mlx5hws_action_ste_pool_element elems[MLX5HWS_POOL_OPTIMIZE_MAX];
+};
+
+/* A chunk of STEs and the table it was allocated from. Used by rules. */
+struct mlx5hws_action_ste_chunk {
+ struct mlx5hws_action_ste_table *action_tbl;
+ struct mlx5hws_pool_chunk ste;
+};
+
+int mlx5hws_action_ste_pool_init(struct mlx5hws_context *ctx);
+
+void mlx5hws_action_ste_pool_uninit(struct mlx5hws_context *ctx);
+
+/* Callers are expected to fill chunk->ste.order. On success, this function
+ * populates chunk->tbl and chunk->ste.offset.
+ */
+int mlx5hws_action_ste_chunk_alloc(struct mlx5hws_action_ste_pool *pool,
+ bool skip_rx, bool skip_tx,
+ struct mlx5hws_action_ste_chunk *chunk);
+
+void mlx5hws_action_ste_chunk_free(struct mlx5hws_action_ste_chunk *chunk);
+
+#endif /* ACTION_STE_POOL_H_ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.c
index b7cb736b74d7..428dae869706 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.c
@@ -158,10 +158,16 @@ static int hws_context_init_hws(struct mlx5hws_context *ctx,
if (ret)
goto pools_uninit;
+ ret = mlx5hws_action_ste_pool_init(ctx);
+ if (ret)
+ goto close_queues;
+
INIT_LIST_HEAD(&ctx->tbl_list);
return 0;
+close_queues:
+ mlx5hws_send_queues_close(ctx);
pools_uninit:
hws_context_pools_uninit(ctx);
uninit_pd:
@@ -174,6 +180,7 @@ static void hws_context_uninit_hws(struct mlx5hws_context *ctx)
if (!(ctx->flags & MLX5HWS_CONTEXT_FLAG_HWS_SUPPORT))
return;
+ mlx5hws_action_ste_pool_uninit(ctx);
mlx5hws_send_queues_close(ctx);
hws_context_pools_uninit(ctx);
hws_context_uninit_pd(ctx);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h
index 38c3647444ad..e987e93bbc6e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h
@@ -39,6 +39,7 @@ struct mlx5hws_context {
struct mlx5hws_cmd_query_caps *caps;
u32 pd_num;
struct mlx5hws_pool *stc_pool;
+ struct mlx5hws_action_ste_pool *action_ste_pool; /* One per queue */
struct mlx5hws_context_common_res common_res;
struct mlx5hws_pattern_cache *pattern_cache;
struct mlx5hws_definer_cache *definer_cache;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/internal.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/internal.h
index 30ccd635b505..21279d503117 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/internal.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/internal.h
@@ -17,6 +17,7 @@
#include "context.h"
#include "table.h"
#include "send.h"
+#include "action_ste_pool.h"
#include "rule.h"
#include "cmd.h"
#include "action.h"
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h
index c82760d53e1a..33e33d5f1fb3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h
@@ -33,6 +33,7 @@ enum mlx5hws_pool_optimize {
MLX5HWS_POOL_OPTIMIZE_NONE = 0x0,
MLX5HWS_POOL_OPTIMIZE_ORIG = 0x1,
MLX5HWS_POOL_OPTIMIZE_MIRROR = 0x2,
+ MLX5HWS_POOL_OPTIMIZE_MAX = 0x3,
};
struct mlx5hws_pool_attr {
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,190 @@
From 95ecc26ff257b1b713163c5a13570543b03678f2 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:42:00 -0400
Subject: [PATCH] net/mlx5: HWS, Use the new action STE pool
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit 593a9470a8565a59a07b577d6bcb3c199f232d4a
Author: Vlad Dogaru <vdogaru@nvidia.com>
Date: Thu Apr 10 22:17:39 2025 +0300
net/mlx5: HWS, Use the new action STE pool
Use the central action STE pool when creating / updating rules.
Signed-off-by: Vlad Dogaru <vdogaru@nvidia.com>
Reviewed-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Michal Kubiak <michal.kubiak@intel.com>
Link: https://patch.msgid.link/1744312662-356571-10-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c
index a27a2d5ffc7b..5b758467ed03 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c
@@ -195,44 +195,30 @@ hws_rule_load_delete_info(struct mlx5hws_rule *rule,
}
}
-static int hws_rule_alloc_action_ste(struct mlx5hws_rule *rule)
+static int mlx5hws_rule_alloc_action_ste(struct mlx5hws_rule *rule,
+ u16 queue_id, bool skip_rx,
+ bool skip_tx)
{
struct mlx5hws_matcher *matcher = rule->matcher;
- struct mlx5hws_matcher_action_ste *action_ste;
- struct mlx5hws_pool_chunk ste = {0};
- int ret;
-
- action_ste = &matcher->action_ste;
- ste.order = ilog2(roundup_pow_of_two(action_ste->max_stes));
- ret = mlx5hws_pool_chunk_alloc(action_ste->pool, &ste);
- if (unlikely(ret)) {
- mlx5hws_err(matcher->tbl->ctx,
- "Failed to allocate STE for rule actions");
- return ret;
- }
-
- rule->action_ste.pool = matcher->action_ste.pool;
- rule->action_ste.num_stes = matcher->action_ste.max_stes;
- rule->action_ste.index = ste.offset;
+ struct mlx5hws_context *ctx = matcher->tbl->ctx;
- return 0;
+ rule->action_ste.ste.order =
+ ilog2(roundup_pow_of_two(matcher->action_ste.max_stes));
+ return mlx5hws_action_ste_chunk_alloc(&ctx->action_ste_pool[queue_id],
+ skip_rx, skip_tx,
+ &rule->action_ste);
}
-void mlx5hws_rule_free_action_ste(struct mlx5hws_rule_action_ste_info *action_ste)
+void mlx5hws_rule_free_action_ste(struct mlx5hws_action_ste_chunk *action_ste)
{
- struct mlx5hws_pool_chunk ste = {0};
-
- if (!action_ste->num_stes)
+ if (!action_ste->action_tbl)
return;
- ste.order = ilog2(roundup_pow_of_two(action_ste->num_stes));
- ste.offset = action_ste->index;
-
/* This release is safe only when the rule match STE was deleted
* (when the rule is being deleted) or replaced with the new STE that
* isn't pointing to old action STEs (when the rule is being updated).
*/
- mlx5hws_pool_chunk_free(action_ste->pool, &ste);
+ mlx5hws_action_ste_chunk_free(action_ste);
}
static void hws_rule_create_init(struct mlx5hws_rule *rule,
@@ -250,22 +236,15 @@ static void hws_rule_create_init(struct mlx5hws_rule *rule,
rule->rtc_0 = 0;
rule->rtc_1 = 0;
- rule->action_ste.pool = NULL;
- rule->action_ste.num_stes = 0;
- rule->action_ste.index = -1;
-
rule->status = MLX5HWS_RULE_STATUS_CREATING;
} else {
rule->status = MLX5HWS_RULE_STATUS_UPDATING;
+ /* Save the old action STE info so we can free it after writing
+ * new action STEs and a corresponding match STE.
+ */
+ rule->old_action_ste = rule->action_ste;
}
- /* Initialize the old action STE info - shallow-copy action_ste.
- * In create flow this will set old_action_ste fields to initial values.
- * In update flow this will save the existing action STE info,
- * so that we will later use it to free old STEs.
- */
- rule->old_action_ste = rule->action_ste;
-
rule->pending_wqes = 0;
/* Init default send STE attributes */
@@ -277,7 +256,6 @@ static void hws_rule_create_init(struct mlx5hws_rule *rule,
/* Init default action apply */
apply->tbl_type = tbl->type;
apply->common_res = &ctx->common_res;
- apply->jump_to_action_stc = matcher->action_ste.stc.offset;
apply->require_dep = 0;
}
@@ -353,17 +331,24 @@ static int hws_rule_create_hws(struct mlx5hws_rule *rule,
if (action_stes) {
/* Allocate action STEs for rules that need more than match STE */
- ret = hws_rule_alloc_action_ste(rule);
+ ret = mlx5hws_rule_alloc_action_ste(rule, attr->queue_id,
+ !!ste_attr.rtc_0,
+ !!ste_attr.rtc_1);
if (ret) {
mlx5hws_err(ctx, "Failed to allocate action memory %d", ret);
mlx5hws_send_abort_new_dep_wqe(queue);
return ret;
}
+ apply.jump_to_action_stc =
+ rule->action_ste.action_tbl->stc.offset;
/* Skip RX/TX based on the dep_wqe init */
- ste_attr.rtc_0 = dep_wqe->rtc_0 ? matcher->action_ste.rtc_0_id : 0;
- ste_attr.rtc_1 = dep_wqe->rtc_1 ? matcher->action_ste.rtc_1_id : 0;
+ ste_attr.rtc_0 = dep_wqe->rtc_0 ?
+ rule->action_ste.action_tbl->rtc_0_id : 0;
+ ste_attr.rtc_1 = dep_wqe->rtc_1 ?
+ rule->action_ste.action_tbl->rtc_1_id : 0;
/* Action STEs are written to a specific index last to first */
- ste_attr.direct_index = rule->action_ste.index + action_stes;
+ ste_attr.direct_index =
+ rule->action_ste.ste.offset + action_stes;
apply.next_direct_idx = ste_attr.direct_index;
} else {
apply.next_direct_idx = 0;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.h
index b5ee94ac449b..1c47a9c11572 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.h
@@ -43,12 +43,6 @@ struct mlx5hws_rule_match_tag {
};
};
-struct mlx5hws_rule_action_ste_info {
- struct mlx5hws_pool *pool;
- int index; /* STE array index */
- u8 num_stes;
-};
-
struct mlx5hws_rule_resize_info {
u32 rtc_0;
u32 rtc_1;
@@ -64,8 +58,8 @@ struct mlx5hws_rule {
struct mlx5hws_rule_match_tag tag;
struct mlx5hws_rule_resize_info *resize_info;
};
- struct mlx5hws_rule_action_ste_info action_ste;
- struct mlx5hws_rule_action_ste_info old_action_ste;
+ struct mlx5hws_action_ste_chunk action_ste;
+ struct mlx5hws_action_ste_chunk old_action_ste;
u32 rtc_0; /* The RTC into which the STE was inserted */
u32 rtc_1; /* The RTC into which the STE was inserted */
u8 status; /* enum mlx5hws_rule_status */
@@ -75,7 +69,7 @@ struct mlx5hws_rule {
*/
};
-void mlx5hws_rule_free_action_ste(struct mlx5hws_rule_action_ste_info *action_ste);
+void mlx5hws_rule_free_action_ste(struct mlx5hws_action_ste_chunk *action_ste);
int mlx5hws_rule_move_hws_remove(struct mlx5hws_rule *rule,
void *queue, void *user_data);
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,875 @@
From 0e63b341ab3882d9bf6aacf824f70c2b41ef65e7 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:42:00 -0400
Subject: [PATCH] net/mlx5: HWS, Cleanup matcher action STE table
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit 22174f16f1218fc98e374b3653decae54aa481f8
Author: Vlad Dogaru <vdogaru@nvidia.com>
Date: Thu Apr 10 22:17:40 2025 +0300
net/mlx5: HWS, Cleanup matcher action STE table
Remove the matcher action STE implementation now that the code uses
per-queue action STE pools. This also allows simplifying matcher code
because it is now only handling a single type of RTC/STE.
The matcher resize data is also going away. Matchers were saving old
action STE data because the rules still used it, but now that data lives
in the action STE pool and is no longer coupled to a matcher.
Furthermore, matchers no longer need to rehash a due to action template
addition. If a new action template needs more action STEs, we simply
update the matcher's num_of_action_stes and future rules will allocate
the correct number. Existing rules are unaffected by such an operation
and can continue to use their existing action STEs.
The range action was using the matcher action STE implementation, but
there was no reason to do this other than the container fitting the
purpose. Extract that information to a separate structure.
Finally, stop dumping per-matcher information about action RTCs,
because they no longer exist. A later patch in this series will add
support for dumping action STE pools.
Signed-off-by: Vlad Dogaru <vdogaru@nvidia.com>
Reviewed-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Michal Kubiak <michal.kubiak@intel.com>
Link: https://patch.msgid.link/1744312662-356571-11-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c
index 161ad720b339..bef4d25c1a2a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c
@@ -1574,13 +1574,13 @@ hws_action_create_dest_match_range_definer(struct mlx5hws_context *ctx)
return definer;
}
-static struct mlx5hws_matcher_action_ste *
+static struct mlx5hws_range_action_table *
hws_action_create_dest_match_range_table(struct mlx5hws_context *ctx,
struct mlx5hws_definer *definer,
u32 miss_ft_id)
{
struct mlx5hws_cmd_rtc_create_attr rtc_attr = {0};
- struct mlx5hws_matcher_action_ste *table_ste;
+ struct mlx5hws_range_action_table *table_ste;
struct mlx5hws_pool_attr pool_attr = {0};
struct mlx5hws_pool *ste_pool, *stc_pool;
u32 *rtc_0_id, *rtc_1_id;
@@ -1669,9 +1669,9 @@ hws_action_create_dest_match_range_table(struct mlx5hws_context *ctx,
return NULL;
}
-static void
-hws_action_destroy_dest_match_range_table(struct mlx5hws_context *ctx,
- struct mlx5hws_matcher_action_ste *table_ste)
+static void hws_action_destroy_dest_match_range_table(
+ struct mlx5hws_context *ctx,
+ struct mlx5hws_range_action_table *table_ste)
{
mutex_lock(&ctx->ctrl_lock);
@@ -1683,12 +1683,11 @@ hws_action_destroy_dest_match_range_table(struct mlx5hws_context *ctx,
mutex_unlock(&ctx->ctrl_lock);
}
-static int
-hws_action_create_dest_match_range_fill_table(struct mlx5hws_context *ctx,
- struct mlx5hws_matcher_action_ste *table_ste,
- struct mlx5hws_action *hit_ft_action,
- struct mlx5hws_definer *range_definer,
- u32 min, u32 max)
+static int hws_action_create_dest_match_range_fill_table(
+ struct mlx5hws_context *ctx,
+ struct mlx5hws_range_action_table *table_ste,
+ struct mlx5hws_action *hit_ft_action,
+ struct mlx5hws_definer *range_definer, u32 min, u32 max)
{
struct mlx5hws_wqe_gta_data_seg_ste match_wqe_data = {0};
struct mlx5hws_wqe_gta_data_seg_ste range_wqe_data = {0};
@@ -1784,7 +1783,7 @@ mlx5hws_action_create_dest_match_range(struct mlx5hws_context *ctx,
u32 min, u32 max, u32 flags)
{
struct mlx5hws_cmd_stc_modify_attr stc_attr = {0};
- struct mlx5hws_matcher_action_ste *table_ste;
+ struct mlx5hws_range_action_table *table_ste;
struct mlx5hws_action *hit_ft_action;
struct mlx5hws_definer *definer;
struct mlx5hws_action *action;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.h
index 64b76075f7f8..25fa0d4c9221 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.h
@@ -118,6 +118,12 @@ struct mlx5hws_action_template {
u8 only_term;
};
+struct mlx5hws_range_action_table {
+ struct mlx5hws_pool *pool;
+ u32 rtc_0_id;
+ u32 rtc_1_id;
+};
+
struct mlx5hws_action {
u8 type;
u8 flags;
@@ -186,7 +192,7 @@ struct mlx5hws_action {
size_t size;
} remove_header;
struct {
- struct mlx5hws_matcher_action_ste *table_ste;
+ struct mlx5hws_range_action_table *table_ste;
struct mlx5hws_action *hit_ft_action;
struct mlx5hws_definer *definer;
} range;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c
index 32de8bfc7644..510bfbbe5991 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c
@@ -478,21 +478,9 @@ hws_bwc_matcher_size_maxed_out(struct mlx5hws_bwc_matcher *bwc_matcher)
struct mlx5hws_cmd_query_caps *caps = bwc_matcher->matcher->tbl->ctx->caps;
/* check the match RTC size */
- if ((bwc_matcher->size_log +
- MLX5HWS_MATCHER_ASSURED_MAIN_TBL_DEPTH +
- MLX5HWS_BWC_MATCHER_SIZE_LOG_STEP) >
- (caps->ste_alloc_log_max - 1))
- return true;
-
- /* check the action RTC size */
- if ((bwc_matcher->size_log +
- MLX5HWS_BWC_MATCHER_SIZE_LOG_STEP +
- ilog2(roundup_pow_of_two(bwc_matcher->matcher->action_ste.max_stes)) +
- MLX5HWS_MATCHER_ACTION_RTC_UPDATE_MULT) >
- (caps->ste_alloc_log_max - 1))
- return true;
-
- return false;
+ return (bwc_matcher->size_log + MLX5HWS_MATCHER_ASSURED_MAIN_TBL_DEPTH +
+ MLX5HWS_BWC_MATCHER_SIZE_LOG_STEP) >
+ (caps->ste_alloc_log_max - 1);
}
static bool
@@ -779,19 +767,6 @@ hws_bwc_matcher_rehash_size(struct mlx5hws_bwc_matcher *bwc_matcher)
return hws_bwc_matcher_move(bwc_matcher);
}
-static int
-hws_bwc_matcher_rehash_at(struct mlx5hws_bwc_matcher *bwc_matcher)
-{
- /* Rehash by action template doesn't require any additional checking.
- * The bwc_matcher already contains the new action template.
- * Just do the usual rehash:
- * - create new matcher
- * - move all the rules to the new matcher
- * - destroy the old matcher
- */
- return hws_bwc_matcher_move(bwc_matcher);
-}
-
int mlx5hws_bwc_rule_create_simple(struct mlx5hws_bwc_rule *bwc_rule,
u32 *match_param,
struct mlx5hws_rule_action rule_actions[],
@@ -803,7 +778,6 @@ int mlx5hws_bwc_rule_create_simple(struct mlx5hws_bwc_rule *bwc_rule,
struct mlx5hws_rule_attr rule_attr;
struct mutex *queue_lock; /* Protect the queue */
u32 num_of_rules;
- bool need_rehash;
int ret = 0;
int at_idx;
@@ -830,30 +804,11 @@ int mlx5hws_bwc_rule_create_simple(struct mlx5hws_bwc_rule *bwc_rule,
at_idx = bwc_matcher->num_of_at - 1;
ret = mlx5hws_matcher_attach_at(bwc_matcher->matcher,
- bwc_matcher->at[at_idx],
- &need_rehash);
+ bwc_matcher->at[at_idx]);
if (unlikely(ret)) {
hws_bwc_unlock_all_queues(ctx);
return ret;
}
- if (unlikely(need_rehash)) {
- /* The new action template requires more action STEs.
- * Need to attempt creating new matcher with all
- * the action templates, including the new one.
- */
- ret = hws_bwc_matcher_rehash_at(bwc_matcher);
- if (unlikely(ret)) {
- mlx5hws_action_template_destroy(bwc_matcher->at[at_idx]);
- bwc_matcher->at[at_idx] = NULL;
- bwc_matcher->num_of_at--;
-
- hws_bwc_unlock_all_queues(ctx);
-
- mlx5hws_err(ctx,
- "BWC rule insertion: rehash AT failed (%d)\n", ret);
- return ret;
- }
- }
hws_bwc_unlock_all_queues(ctx);
mutex_lock(queue_lock);
@@ -973,7 +928,6 @@ hws_bwc_rule_action_update(struct mlx5hws_bwc_rule *bwc_rule,
struct mlx5hws_context *ctx = bwc_matcher->matcher->tbl->ctx;
struct mlx5hws_rule_attr rule_attr;
struct mutex *queue_lock; /* Protect the queue */
- bool need_rehash;
int at_idx, ret;
u16 idx;
@@ -1005,32 +959,11 @@ hws_bwc_rule_action_update(struct mlx5hws_bwc_rule *bwc_rule,
at_idx = bwc_matcher->num_of_at - 1;
ret = mlx5hws_matcher_attach_at(bwc_matcher->matcher,
- bwc_matcher->at[at_idx],
- &need_rehash);
+ bwc_matcher->at[at_idx]);
if (unlikely(ret)) {
hws_bwc_unlock_all_queues(ctx);
return ret;
}
- if (unlikely(need_rehash)) {
- /* The new action template requires more action
- * STEs. Need to attempt creating new matcher
- * with all the action templates, including the
- * new one.
- */
- ret = hws_bwc_matcher_rehash_at(bwc_matcher);
- if (unlikely(ret)) {
- mlx5hws_action_template_destroy(bwc_matcher->at[at_idx]);
- bwc_matcher->at[at_idx] = NULL;
- bwc_matcher->num_of_at--;
-
- hws_bwc_unlock_all_queues(ctx);
-
- mlx5hws_err(ctx,
- "BWC rule update: rehash AT failed (%d)\n",
- ret);
- return ret;
- }
- }
}
hws_bwc_unlock_all_queues(ctx);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c
index 3491408c5d84..38f75dec9cfc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c
@@ -146,18 +146,6 @@ static int hws_debug_dump_matcher(struct seq_file *f, struct mlx5hws_matcher *ma
matcher->match_ste.rtc_1_id,
(int)ste_1_id);
- ste_pool = matcher->action_ste.pool;
- if (ste_pool) {
- ste_0_id = mlx5hws_pool_get_base_id(ste_pool);
- if (tbl_type == MLX5HWS_TABLE_TYPE_FDB)
- ste_1_id = mlx5hws_pool_get_base_mirror_id(ste_pool);
- else
- ste_1_id = -1;
- } else {
- ste_0_id = -1;
- ste_1_id = -1;
- }
-
ft_attr.type = matcher->tbl->fw_ft_type;
ret = mlx5hws_cmd_flow_table_query(matcher->tbl->ctx->mdev,
matcher->end_ft_id,
@@ -167,10 +155,7 @@ static int hws_debug_dump_matcher(struct seq_file *f, struct mlx5hws_matcher *ma
if (ret)
return ret;
- seq_printf(f, ",%d,%d,%d,%d,%d,0x%llx,0x%llx\n",
- matcher->action_ste.rtc_0_id, (int)ste_0_id,
- matcher->action_ste.rtc_1_id, (int)ste_1_id,
- 0,
+ seq_printf(f, ",-1,-1,-1,-1,0,0x%llx,0x%llx\n",
mlx5hws_debug_icm_to_idx(icm_addr_0),
mlx5hws_debug_icm_to_idx(icm_addr_1));
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c
index 3028e0387e3f..716502732d3d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c
@@ -3,25 +3,6 @@
#include "internal.h"
-enum mlx5hws_matcher_rtc_type {
- HWS_MATCHER_RTC_TYPE_MATCH,
- HWS_MATCHER_RTC_TYPE_STE_ARRAY,
- HWS_MATCHER_RTC_TYPE_MAX,
-};
-
-static const char * const mlx5hws_matcher_rtc_type_str[] = {
- [HWS_MATCHER_RTC_TYPE_MATCH] = "MATCH",
- [HWS_MATCHER_RTC_TYPE_STE_ARRAY] = "STE_ARRAY",
- [HWS_MATCHER_RTC_TYPE_MAX] = "UNKNOWN",
-};
-
-static const char *hws_matcher_rtc_type_to_str(enum mlx5hws_matcher_rtc_type rtc_type)
-{
- if (rtc_type > HWS_MATCHER_RTC_TYPE_MAX)
- rtc_type = HWS_MATCHER_RTC_TYPE_MAX;
- return mlx5hws_matcher_rtc_type_str[rtc_type];
-}
-
static bool hws_matcher_requires_col_tbl(u8 log_num_of_rules)
{
/* Collision table concatenation is done only for large rule tables */
@@ -209,83 +190,52 @@ static void hws_matcher_set_rtc_attr_sz(struct mlx5hws_matcher *matcher,
}
}
-static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher,
- enum mlx5hws_matcher_rtc_type rtc_type)
+static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher)
{
struct mlx5hws_matcher_attr *attr = &matcher->attr;
struct mlx5hws_cmd_rtc_create_attr rtc_attr = {0};
struct mlx5hws_match_template *mt = matcher->mt;
struct mlx5hws_context *ctx = matcher->tbl->ctx;
- struct mlx5hws_matcher_action_ste *action_ste;
struct mlx5hws_table *tbl = matcher->tbl;
- struct mlx5hws_pool *ste_pool;
- u32 *rtc_0_id, *rtc_1_id;
u32 obj_id;
int ret;
- switch (rtc_type) {
- case HWS_MATCHER_RTC_TYPE_MATCH:
- rtc_0_id = &matcher->match_ste.rtc_0_id;
- rtc_1_id = &matcher->match_ste.rtc_1_id;
- ste_pool = matcher->match_ste.pool;
-
- rtc_attr.log_size = attr->table.sz_row_log;
- rtc_attr.log_depth = attr->table.sz_col_log;
- rtc_attr.is_frst_jumbo = mlx5hws_matcher_mt_is_jumbo(mt);
- rtc_attr.is_scnd_range = 0;
- rtc_attr.miss_ft_id = matcher->end_ft_id;
-
- if (attr->insert_mode == MLX5HWS_MATCHER_INSERT_BY_HASH) {
- /* The usual Hash Table */
- rtc_attr.update_index_mode = MLX5_IFC_RTC_STE_UPDATE_MODE_BY_HASH;
-
- /* The first mt is used since all share the same definer */
- rtc_attr.match_definer_0 = mlx5hws_definer_get_id(mt->definer);
- } else if (attr->insert_mode == MLX5HWS_MATCHER_INSERT_BY_INDEX) {
- rtc_attr.update_index_mode = MLX5_IFC_RTC_STE_UPDATE_MODE_BY_OFFSET;
- rtc_attr.num_hash_definer = 1;
-
- if (attr->distribute_mode == MLX5HWS_MATCHER_DISTRIBUTE_BY_HASH) {
- /* Hash Split Table */
- rtc_attr.access_index_mode = MLX5_IFC_RTC_STE_ACCESS_MODE_BY_HASH;
- rtc_attr.match_definer_0 = mlx5hws_definer_get_id(mt->definer);
- } else if (attr->distribute_mode == MLX5HWS_MATCHER_DISTRIBUTE_BY_LINEAR) {
- /* Linear Lookup Table */
- rtc_attr.access_index_mode = MLX5_IFC_RTC_STE_ACCESS_MODE_LINEAR;
- rtc_attr.match_definer_0 = ctx->caps->linear_match_definer;
- }
+ rtc_attr.log_size = attr->table.sz_row_log;
+ rtc_attr.log_depth = attr->table.sz_col_log;
+ rtc_attr.is_frst_jumbo = mlx5hws_matcher_mt_is_jumbo(mt);
+ rtc_attr.is_scnd_range = 0;
+ rtc_attr.miss_ft_id = matcher->end_ft_id;
+
+ if (attr->insert_mode == MLX5HWS_MATCHER_INSERT_BY_HASH) {
+ /* The usual Hash Table */
+ rtc_attr.update_index_mode =
+ MLX5_IFC_RTC_STE_UPDATE_MODE_BY_HASH;
+
+ /* The first mt is used since all share the same definer */
+ rtc_attr.match_definer_0 = mlx5hws_definer_get_id(mt->definer);
+ } else if (attr->insert_mode == MLX5HWS_MATCHER_INSERT_BY_INDEX) {
+ rtc_attr.update_index_mode =
+ MLX5_IFC_RTC_STE_UPDATE_MODE_BY_OFFSET;
+ rtc_attr.num_hash_definer = 1;
+
+ if (attr->distribute_mode ==
+ MLX5HWS_MATCHER_DISTRIBUTE_BY_HASH) {
+ /* Hash Split Table */
+ rtc_attr.access_index_mode =
+ MLX5_IFC_RTC_STE_ACCESS_MODE_BY_HASH;
+ rtc_attr.match_definer_0 =
+ mlx5hws_definer_get_id(mt->definer);
+ } else if (attr->distribute_mode ==
+ MLX5HWS_MATCHER_DISTRIBUTE_BY_LINEAR) {
+ /* Linear Lookup Table */
+ rtc_attr.access_index_mode =
+ MLX5_IFC_RTC_STE_ACCESS_MODE_LINEAR;
+ rtc_attr.match_definer_0 =
+ ctx->caps->linear_match_definer;
}
- break;
-
- case HWS_MATCHER_RTC_TYPE_STE_ARRAY:
- action_ste = &matcher->action_ste;
-
- rtc_0_id = &action_ste->rtc_0_id;
- rtc_1_id = &action_ste->rtc_1_id;
- ste_pool = action_ste->pool;
- /* Action RTC size calculation:
- * log((max number of rules in matcher) *
- * (max number of action STEs per rule) *
- * (2 to support writing new STEs for update rule))
- */
- rtc_attr.log_size =
- ilog2(roundup_pow_of_two(action_ste->max_stes)) +
- attr->table.sz_row_log +
- MLX5HWS_MATCHER_ACTION_RTC_UPDATE_MULT;
- rtc_attr.log_depth = 0;
- rtc_attr.update_index_mode = MLX5_IFC_RTC_STE_UPDATE_MODE_BY_OFFSET;
- /* The action STEs use the default always hit definer */
- rtc_attr.match_definer_0 = ctx->caps->trivial_match_definer;
- rtc_attr.is_frst_jumbo = false;
- rtc_attr.miss_ft_id = 0;
- break;
-
- default:
- mlx5hws_err(ctx, "HWS Invalid RTC type\n");
- return -EINVAL;
}
- obj_id = mlx5hws_pool_get_base_id(ste_pool);
+ obj_id = mlx5hws_pool_get_base_id(matcher->match_ste.pool);
rtc_attr.pd = ctx->pd_num;
rtc_attr.ste_base = obj_id;
@@ -297,15 +247,16 @@ static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher,
obj_id = mlx5hws_pool_get_base_id(ctx->stc_pool);
rtc_attr.stc_base = obj_id;
- ret = mlx5hws_cmd_rtc_create(ctx->mdev, &rtc_attr, rtc_0_id);
+ ret = mlx5hws_cmd_rtc_create(ctx->mdev, &rtc_attr,
+ &matcher->match_ste.rtc_0_id);
if (ret) {
- mlx5hws_err(ctx, "Failed to create matcher RTC of type %s",
- hws_matcher_rtc_type_to_str(rtc_type));
+ mlx5hws_err(ctx, "Failed to create matcher RTC\n");
return ret;
}
if (tbl->type == MLX5HWS_TABLE_TYPE_FDB) {
- obj_id = mlx5hws_pool_get_base_mirror_id(ste_pool);
+ obj_id = mlx5hws_pool_get_base_mirror_id(
+ matcher->match_ste.pool);
rtc_attr.ste_base = obj_id;
rtc_attr.table_type = mlx5hws_table_get_res_fw_ft_type(tbl->type, true);
@@ -313,10 +264,10 @@ static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher,
rtc_attr.stc_base = obj_id;
hws_matcher_set_rtc_attr_sz(matcher, &rtc_attr, true);
- ret = mlx5hws_cmd_rtc_create(ctx->mdev, &rtc_attr, rtc_1_id);
+ ret = mlx5hws_cmd_rtc_create(ctx->mdev, &rtc_attr,
+ &matcher->match_ste.rtc_1_id);
if (ret) {
- mlx5hws_err(ctx, "Failed to create peer matcher RTC of type %s",
- hws_matcher_rtc_type_to_str(rtc_type));
+ mlx5hws_err(ctx, "Failed to create mirror matcher RTC\n");
goto destroy_rtc_0;
}
}
@@ -324,33 +275,18 @@ static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher,
return 0;
destroy_rtc_0:
- mlx5hws_cmd_rtc_destroy(ctx->mdev, *rtc_0_id);
+ mlx5hws_cmd_rtc_destroy(ctx->mdev, matcher->match_ste.rtc_0_id);
return ret;
}
-static void hws_matcher_destroy_rtc(struct mlx5hws_matcher *matcher,
- enum mlx5hws_matcher_rtc_type rtc_type)
+static void hws_matcher_destroy_rtc(struct mlx5hws_matcher *matcher)
{
- struct mlx5hws_table *tbl = matcher->tbl;
- u32 rtc_0_id, rtc_1_id;
-
- switch (rtc_type) {
- case HWS_MATCHER_RTC_TYPE_MATCH:
- rtc_0_id = matcher->match_ste.rtc_0_id;
- rtc_1_id = matcher->match_ste.rtc_1_id;
- break;
- case HWS_MATCHER_RTC_TYPE_STE_ARRAY:
- rtc_0_id = matcher->action_ste.rtc_0_id;
- rtc_1_id = matcher->action_ste.rtc_1_id;
- break;
- default:
- return;
- }
+ struct mlx5_core_dev *mdev = matcher->tbl->ctx->mdev;
- if (tbl->type == MLX5HWS_TABLE_TYPE_FDB)
- mlx5hws_cmd_rtc_destroy(tbl->ctx->mdev, rtc_1_id);
+ if (matcher->tbl->type == MLX5HWS_TABLE_TYPE_FDB)
+ mlx5hws_cmd_rtc_destroy(mdev, matcher->match_ste.rtc_1_id);
- mlx5hws_cmd_rtc_destroy(tbl->ctx->mdev, rtc_0_id);
+ mlx5hws_cmd_rtc_destroy(mdev, matcher->match_ste.rtc_0_id);
}
static int
@@ -418,85 +354,17 @@ static int hws_matcher_check_and_process_at(struct mlx5hws_matcher *matcher,
return 0;
}
-static int hws_matcher_resize_init(struct mlx5hws_matcher *src_matcher)
-{
- struct mlx5hws_matcher_resize_data *resize_data;
-
- resize_data = kzalloc(sizeof(*resize_data), GFP_KERNEL);
- if (!resize_data)
- return -ENOMEM;
-
- resize_data->max_stes = src_matcher->action_ste.max_stes;
-
- resize_data->stc = src_matcher->action_ste.stc;
- resize_data->rtc_0_id = src_matcher->action_ste.rtc_0_id;
- resize_data->rtc_1_id = src_matcher->action_ste.rtc_1_id;
- resize_data->pool = src_matcher->action_ste.max_stes ?
- src_matcher->action_ste.pool : NULL;
-
- /* Place the new resized matcher on the dst matcher's list */
- list_add(&resize_data->list_node, &src_matcher->resize_dst->resize_data);
-
- /* Move all the previous resized matchers to the dst matcher's list */
- while (!list_empty(&src_matcher->resize_data)) {
- resize_data = list_first_entry(&src_matcher->resize_data,
- struct mlx5hws_matcher_resize_data,
- list_node);
- list_del_init(&resize_data->list_node);
- list_add(&resize_data->list_node, &src_matcher->resize_dst->resize_data);
- }
-
- return 0;
-}
-
-static void hws_matcher_resize_uninit(struct mlx5hws_matcher *matcher)
-{
- struct mlx5hws_matcher_resize_data *resize_data;
-
- if (!mlx5hws_matcher_is_resizable(matcher))
- return;
-
- while (!list_empty(&matcher->resize_data)) {
- resize_data = list_first_entry(&matcher->resize_data,
- struct mlx5hws_matcher_resize_data,
- list_node);
- list_del_init(&resize_data->list_node);
-
- if (resize_data->max_stes) {
- mlx5hws_action_free_single_stc(matcher->tbl->ctx,
- matcher->tbl->type,
- &resize_data->stc);
-
- if (matcher->tbl->type == MLX5HWS_TABLE_TYPE_FDB)
- mlx5hws_cmd_rtc_destroy(matcher->tbl->ctx->mdev,
- resize_data->rtc_1_id);
-
- mlx5hws_cmd_rtc_destroy(matcher->tbl->ctx->mdev,
- resize_data->rtc_0_id);
-
- if (resize_data->pool)
- mlx5hws_pool_destroy(resize_data->pool);
- }
-
- kfree(resize_data);
- }
-}
-
static int hws_matcher_bind_at(struct mlx5hws_matcher *matcher)
{
bool is_jumbo = mlx5hws_matcher_mt_is_jumbo(matcher->mt);
- struct mlx5hws_cmd_stc_modify_attr stc_attr = {0};
- struct mlx5hws_matcher_action_ste *action_ste;
- struct mlx5hws_table *tbl = matcher->tbl;
- struct mlx5hws_pool_attr pool_attr = {0};
- struct mlx5hws_context *ctx = tbl->ctx;
- u32 required_stes;
- u8 max_stes = 0;
+ struct mlx5hws_context *ctx = matcher->tbl->ctx;
+ u8 required_stes, max_stes;
int i, ret;
if (matcher->flags & MLX5HWS_MATCHER_FLAGS_COLLISION)
return 0;
+ max_stes = 0;
for (i = 0; i < matcher->num_of_at; i++) {
struct mlx5hws_action_template *at = &matcher->at[i];
@@ -512,74 +380,9 @@ static int hws_matcher_bind_at(struct mlx5hws_matcher *matcher)
/* Future: Optimize reparse */
}
- /* There are no additional STEs required for matcher */
- if (!max_stes)
- return 0;
-
- matcher->action_ste.max_stes = max_stes;
-
- action_ste = &matcher->action_ste;
-
- /* Allocate action STE mempool */
- pool_attr.table_type = tbl->type;
- pool_attr.pool_type = MLX5HWS_POOL_TYPE_STE;
- pool_attr.flags = MLX5HWS_POOL_FLAG_BUDDY;
- /* Pool size is similar to action RTC size */
- pool_attr.alloc_log_sz = ilog2(roundup_pow_of_two(action_ste->max_stes)) +
- matcher->attr.table.sz_row_log +
- MLX5HWS_MATCHER_ACTION_RTC_UPDATE_MULT;
- hws_matcher_set_pool_attr(&pool_attr, matcher);
- action_ste->pool = mlx5hws_pool_create(ctx, &pool_attr);
- if (!action_ste->pool) {
- mlx5hws_err(ctx, "Failed to create action ste pool\n");
- return -EINVAL;
- }
-
- /* Allocate action RTC */
- ret = hws_matcher_create_rtc(matcher, HWS_MATCHER_RTC_TYPE_STE_ARRAY);
- if (ret) {
- mlx5hws_err(ctx, "Failed to create action RTC\n");
- goto free_ste_pool;
- }
-
- /* Allocate STC for jumps to STE */
- stc_attr.action_offset = MLX5HWS_ACTION_OFFSET_HIT;
- stc_attr.action_type = MLX5_IFC_STC_ACTION_TYPE_JUMP_TO_STE_TABLE;
- stc_attr.reparse_mode = MLX5_IFC_STC_REPARSE_IGNORE;
- stc_attr.ste_table.ste_pool = action_ste->pool;
- stc_attr.ste_table.match_definer_id = ctx->caps->trivial_match_definer;
-
- ret = mlx5hws_action_alloc_single_stc(ctx, &stc_attr, tbl->type,
- &action_ste->stc);
- if (ret) {
- mlx5hws_err(ctx, "Failed to create action jump to table STC\n");
- goto free_rtc;
- }
+ matcher->num_of_action_stes = max_stes;
return 0;
-
-free_rtc:
- hws_matcher_destroy_rtc(matcher, HWS_MATCHER_RTC_TYPE_STE_ARRAY);
-free_ste_pool:
- mlx5hws_pool_destroy(action_ste->pool);
- return ret;
-}
-
-static void hws_matcher_unbind_at(struct mlx5hws_matcher *matcher)
-{
- struct mlx5hws_matcher_action_ste *action_ste;
- struct mlx5hws_table *tbl = matcher->tbl;
-
- action_ste = &matcher->action_ste;
-
- if (!action_ste->max_stes ||
- matcher->flags & MLX5HWS_MATCHER_FLAGS_COLLISION ||
- mlx5hws_matcher_is_in_resize(matcher))
- return;
-
- mlx5hws_action_free_single_stc(tbl->ctx, tbl->type, &action_ste->stc);
- hws_matcher_destroy_rtc(matcher, HWS_MATCHER_RTC_TYPE_STE_ARRAY);
- mlx5hws_pool_destroy(action_ste->pool);
}
static int hws_matcher_bind_mt(struct mlx5hws_matcher *matcher)
@@ -723,10 +526,10 @@ static int hws_matcher_create_and_connect(struct mlx5hws_matcher *matcher)
/* Create matcher end flow table anchor */
ret = hws_matcher_create_end_ft(matcher);
if (ret)
- goto unbind_at;
+ goto unbind_mt;
/* Allocate the RTC for the new matcher */
- ret = hws_matcher_create_rtc(matcher, HWS_MATCHER_RTC_TYPE_MATCH);
+ ret = hws_matcher_create_rtc(matcher);
if (ret)
goto destroy_end_ft;
@@ -738,11 +541,9 @@ static int hws_matcher_create_and_connect(struct mlx5hws_matcher *matcher)
return 0;
destroy_rtc:
- hws_matcher_destroy_rtc(matcher, HWS_MATCHER_RTC_TYPE_MATCH);
+ hws_matcher_destroy_rtc(matcher);
destroy_end_ft:
hws_matcher_destroy_end_ft(matcher);
-unbind_at:
- hws_matcher_unbind_at(matcher);
unbind_mt:
hws_matcher_unbind_mt(matcher);
return ret;
@@ -750,11 +551,9 @@ static int hws_matcher_create_and_connect(struct mlx5hws_matcher *matcher)
static void hws_matcher_destroy_and_disconnect(struct mlx5hws_matcher *matcher)
{
- hws_matcher_resize_uninit(matcher);
hws_matcher_disconnect(matcher);
- hws_matcher_destroy_rtc(matcher, HWS_MATCHER_RTC_TYPE_MATCH);
+ hws_matcher_destroy_rtc(matcher);
hws_matcher_destroy_end_ft(matcher);
- hws_matcher_unbind_at(matcher);
hws_matcher_unbind_mt(matcher);
}
@@ -776,8 +575,6 @@ hws_matcher_create_col_matcher(struct mlx5hws_matcher *matcher)
if (!col_matcher)
return -ENOMEM;
- INIT_LIST_HEAD(&col_matcher->resize_data);
-
col_matcher->tbl = matcher->tbl;
col_matcher->mt = matcher->mt;
col_matcher->at = matcher->at;
@@ -831,8 +628,6 @@ static int hws_matcher_init(struct mlx5hws_matcher *matcher)
struct mlx5hws_context *ctx = matcher->tbl->ctx;
int ret;
- INIT_LIST_HEAD(&matcher->resize_data);
-
mutex_lock(&ctx->ctrl_lock);
/* Allocate matcher resource and connect to the packet pipe */
@@ -889,16 +684,12 @@ static int hws_matcher_grow_at_array(struct mlx5hws_matcher *matcher)
}
int mlx5hws_matcher_attach_at(struct mlx5hws_matcher *matcher,
- struct mlx5hws_action_template *at,
- bool *need_rehash)
+ struct mlx5hws_action_template *at)
{
bool is_jumbo = mlx5hws_matcher_mt_is_jumbo(matcher->mt);
- struct mlx5hws_context *ctx = matcher->tbl->ctx;
u32 required_stes;
int ret;
- *need_rehash = false;
-
if (unlikely(matcher->num_of_at >= matcher->size_of_at_array)) {
ret = hws_matcher_grow_at_array(matcher);
if (ret)
@@ -916,11 +707,8 @@ int mlx5hws_matcher_attach_at(struct mlx5hws_matcher *matcher,
return ret;
required_stes = at->num_of_action_stes - (!is_jumbo || at->only_term);
- if (matcher->action_ste.max_stes < required_stes) {
- mlx5hws_dbg(ctx, "Required STEs [%d] exceeds initial action template STE [%d]\n",
- required_stes, matcher->action_ste.max_stes);
- *need_rehash = true;
- }
+ if (matcher->num_of_action_stes < required_stes)
+ matcher->num_of_action_stes = required_stes;
matcher->at[matcher->num_of_at] = *at;
matcher->num_of_at += 1;
@@ -1102,7 +890,7 @@ static int hws_matcher_resize_precheck(struct mlx5hws_matcher *src_matcher,
return -EINVAL;
}
- if (src_matcher->action_ste.max_stes > dst_matcher->action_ste.max_stes) {
+ if (src_matcher->num_of_action_stes > dst_matcher->num_of_action_stes) {
mlx5hws_err(ctx, "Src/dst matcher max STEs mismatch\n");
return -EINVAL;
}
@@ -1131,10 +919,6 @@ int mlx5hws_matcher_resize_set_target(struct mlx5hws_matcher *src_matcher,
src_matcher->resize_dst = dst_matcher;
- ret = hws_matcher_resize_init(src_matcher);
- if (ret)
- src_matcher->resize_dst = NULL;
-
out:
mutex_unlock(&src_matcher->tbl->ctx->ctrl_lock);
return ret;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h
index 0450b6175ac9..bad1fa8f77fd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h
@@ -50,23 +50,6 @@ struct mlx5hws_matcher_match_ste {
struct mlx5hws_pool *pool;
};
-struct mlx5hws_matcher_action_ste {
- struct mlx5hws_pool_chunk stc;
- u32 rtc_0_id;
- u32 rtc_1_id;
- struct mlx5hws_pool *pool;
- u8 max_stes;
-};
-
-struct mlx5hws_matcher_resize_data {
- struct mlx5hws_pool_chunk stc;
- u32 rtc_0_id;
- u32 rtc_1_id;
- struct mlx5hws_pool *pool;
- u8 max_stes;
- struct list_head list_node;
-};
-
struct mlx5hws_matcher {
struct mlx5hws_table *tbl;
struct mlx5hws_matcher_attr attr;
@@ -75,15 +58,14 @@ struct mlx5hws_matcher {
u8 num_of_at;
u8 size_of_at_array;
u8 num_of_mt;
+ u8 num_of_action_stes;
/* enum mlx5hws_matcher_flags */
u8 flags;
u32 end_ft_id;
struct mlx5hws_matcher *col_matcher;
struct mlx5hws_matcher *resize_dst;
struct mlx5hws_matcher_match_ste match_ste;
- struct mlx5hws_matcher_action_ste action_ste;
struct list_head list_node;
- struct list_head resize_data;
};
static inline bool
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h
index 8ed8a715a2eb..5121951f2778 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h
@@ -399,14 +399,11 @@ int mlx5hws_matcher_destroy(struct mlx5hws_matcher *matcher);
*
* @matcher: Matcher to attach the action template to.
* @at: Action template to be attached to the matcher.
- * @need_rehash: Output parameter that tells callers if the matcher needs to be
- * rehashed.
*
* Return: Zero on success, non-zero otherwise.
*/
int mlx5hws_matcher_attach_at(struct mlx5hws_matcher *matcher,
- struct mlx5hws_action_template *at,
- bool *need_rehash);
+ struct mlx5hws_action_template *at);
/**
* mlx5hws_matcher_resize_set_target - Link two matchers and enable moving rules.
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c
index 5b758467ed03..9e6f35d68445 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c
@@ -203,7 +203,7 @@ static int mlx5hws_rule_alloc_action_ste(struct mlx5hws_rule *rule,
struct mlx5hws_context *ctx = matcher->tbl->ctx;
rule->action_ste.ste.order =
- ilog2(roundup_pow_of_two(matcher->action_ste.max_stes));
+ ilog2(roundup_pow_of_two(matcher->num_of_action_stes));
return mlx5hws_action_ste_chunk_alloc(&ctx->action_ste_pool[queue_id],
skip_rx, skip_tx,
&rule->action_ste);
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,254 @@
From b2936ce02b8545dd8b6b4bc1a135ba7d19d63488 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:42:00 -0400
Subject: [PATCH] net/mlx5: HWS, Free unused action STE tables
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit 864531ca2072c55ff00ba9dfd8c15cf0f576051b
Author: Vlad Dogaru <vdogaru@nvidia.com>
Date: Thu Apr 10 22:17:41 2025 +0300
net/mlx5: HWS, Free unused action STE tables
Periodically check for unused action STE tables and free their
associated resources. In order to do this safely, add a per-queue lock
to synchronize the garbage collect work with regular operations on
steering rules.
Signed-off-by: Vlad Dogaru <vdogaru@nvidia.com>
Reviewed-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Michal Kubiak <michal.kubiak@intel.com>
Link: https://patch.msgid.link/1744312662-356571-12-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.c
index cb6ad8411631..5766a9c82f96 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.c
@@ -159,6 +159,7 @@ hws_action_ste_table_alloc(struct mlx5hws_action_ste_pool_element *parent_elem)
action_tbl->parent_elem = parent_elem;
INIT_LIST_HEAD(&action_tbl->list_node);
+ action_tbl->last_used = jiffies;
list_add(&action_tbl->list_node, &parent_elem->available);
parent_elem->log_sz = log_sz;
@@ -236,6 +237,8 @@ static int hws_action_ste_pool_init(struct mlx5hws_context *ctx,
enum mlx5hws_pool_optimize opt;
int err;
+ mutex_init(&pool->lock);
+
/* Rules which are added for both RX and TX must use the same action STE
* indices for both. If we were to use a single table, then RX-only and
* TX-only rules would waste the unused entries. Thus, we use separate
@@ -247,6 +250,7 @@ static int hws_action_ste_pool_init(struct mlx5hws_context *ctx,
opt);
if (err)
goto destroy_elems;
+ pool->elems[opt].parent_pool = pool;
}
return 0;
@@ -267,6 +271,58 @@ static void hws_action_ste_pool_destroy(struct mlx5hws_action_ste_pool *pool)
hws_action_ste_pool_element_destroy(&pool->elems[opt]);
}
+static void hws_action_ste_pool_element_collect_stale(
+ struct mlx5hws_action_ste_pool_element *elem, struct list_head *cleanup)
+{
+ struct mlx5hws_action_ste_table *action_tbl, *p;
+ unsigned long expire_time, now;
+
+ expire_time = secs_to_jiffies(MLX5HWS_ACTION_STE_POOL_EXPIRE_SECONDS);
+ now = jiffies;
+
+ list_for_each_entry_safe(action_tbl, p, &elem->available, list_node) {
+ if (mlx5hws_pool_full(action_tbl->pool) &&
+ time_before(action_tbl->last_used + expire_time, now))
+ list_move(&action_tbl->list_node, cleanup);
+ }
+}
+
+static void hws_action_ste_table_cleanup_list(struct list_head *cleanup)
+{
+ struct mlx5hws_action_ste_table *action_tbl, *p;
+
+ list_for_each_entry_safe(action_tbl, p, cleanup, list_node)
+ hws_action_ste_table_destroy(action_tbl);
+}
+
+static void hws_action_ste_pool_cleanup(struct work_struct *work)
+{
+ enum mlx5hws_pool_optimize opt;
+ struct mlx5hws_context *ctx;
+ LIST_HEAD(cleanup);
+ int i;
+
+ ctx = container_of(work, struct mlx5hws_context,
+ action_ste_cleanup.work);
+
+ for (i = 0; i < ctx->queues; i++) {
+ struct mlx5hws_action_ste_pool *p = &ctx->action_ste_pool[i];
+
+ mutex_lock(&p->lock);
+ for (opt = MLX5HWS_POOL_OPTIMIZE_NONE;
+ opt < MLX5HWS_POOL_OPTIMIZE_MAX; opt++)
+ hws_action_ste_pool_element_collect_stale(
+ &p->elems[opt], &cleanup);
+ mutex_unlock(&p->lock);
+ }
+
+ hws_action_ste_table_cleanup_list(&cleanup);
+
+ schedule_delayed_work(&ctx->action_ste_cleanup,
+ secs_to_jiffies(
+ MLX5HWS_ACTION_STE_POOL_CLEANUP_SECONDS));
+}
+
int mlx5hws_action_ste_pool_init(struct mlx5hws_context *ctx)
{
struct mlx5hws_action_ste_pool *pool;
@@ -285,6 +341,12 @@ int mlx5hws_action_ste_pool_init(struct mlx5hws_context *ctx)
ctx->action_ste_pool = pool;
+ INIT_DELAYED_WORK(&ctx->action_ste_cleanup,
+ hws_action_ste_pool_cleanup);
+ schedule_delayed_work(
+ &ctx->action_ste_cleanup,
+ secs_to_jiffies(MLX5HWS_ACTION_STE_POOL_CLEANUP_SECONDS));
+
return 0;
free_pool:
@@ -300,6 +362,8 @@ void mlx5hws_action_ste_pool_uninit(struct mlx5hws_context *ctx)
size_t queues = ctx->queues;
int i;
+ cancel_delayed_work_sync(&ctx->action_ste_cleanup);
+
for (i = 0; i < queues; i++)
hws_action_ste_pool_destroy(&ctx->action_ste_pool[i]);
@@ -330,6 +394,7 @@ hws_action_ste_table_chunk_alloc(struct mlx5hws_action_ste_table *action_tbl,
return err;
chunk->action_tbl = action_tbl;
+ action_tbl->last_used = jiffies;
return 0;
}
@@ -346,6 +411,8 @@ int mlx5hws_action_ste_chunk_alloc(struct mlx5hws_action_ste_pool *pool,
if (skip_rx && skip_tx)
return -EINVAL;
+ mutex_lock(&pool->lock);
+
elem = hws_action_ste_choose_elem(pool, skip_rx, skip_tx);
mlx5hws_dbg(elem->ctx,
@@ -362,26 +429,39 @@ int mlx5hws_action_ste_chunk_alloc(struct mlx5hws_action_ste_pool *pool,
if (!found) {
action_tbl = hws_action_ste_table_alloc(elem);
- if (IS_ERR(action_tbl))
- return PTR_ERR(action_tbl);
+ if (IS_ERR(action_tbl)) {
+ err = PTR_ERR(action_tbl);
+ goto out;
+ }
err = hws_action_ste_table_chunk_alloc(action_tbl, chunk);
if (err)
- return err;
+ goto out;
}
if (mlx5hws_pool_empty(action_tbl->pool))
list_move(&action_tbl->list_node, &elem->full);
- return 0;
+ err = 0;
+
+out:
+ mutex_unlock(&pool->lock);
+
+ return err;
}
void mlx5hws_action_ste_chunk_free(struct mlx5hws_action_ste_chunk *chunk)
{
+ struct mutex *lock = &chunk->action_tbl->parent_elem->parent_pool->lock;
+
mlx5hws_dbg(chunk->action_tbl->pool->ctx,
"Freeing action STEs offset %d order %d\n",
chunk->ste.offset, chunk->ste.order);
+
+ mutex_lock(lock);
mlx5hws_pool_chunk_free(chunk->action_tbl->pool, &chunk->ste);
+ chunk->action_tbl->last_used = jiffies;
list_move(&chunk->action_tbl->list_node,
&chunk->action_tbl->parent_elem->available);
+ mutex_unlock(lock);
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.h
index 2de660a63223..a8ba97359e31 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.h
@@ -8,6 +8,9 @@
#define MLX5HWS_ACTION_STE_TABLE_STEP_LOG_SZ 1
#define MLX5HWS_ACTION_STE_TABLE_MAX_LOG_SZ 20
+#define MLX5HWS_ACTION_STE_POOL_CLEANUP_SECONDS 300
+#define MLX5HWS_ACTION_STE_POOL_EXPIRE_SECONDS 300
+
struct mlx5hws_action_ste_pool_element;
struct mlx5hws_action_ste_table {
@@ -19,10 +22,12 @@ struct mlx5hws_action_ste_table {
u32 rtc_0_id;
u32 rtc_1_id;
struct list_head list_node;
+ unsigned long last_used;
};
struct mlx5hws_action_ste_pool_element {
struct mlx5hws_context *ctx;
+ struct mlx5hws_action_ste_pool *parent_pool;
size_t log_sz; /* Size of the largest table so far. */
enum mlx5hws_pool_optimize opt;
struct list_head available;
@@ -33,6 +38,12 @@ struct mlx5hws_action_ste_pool_element {
* per queue.
*/
struct mlx5hws_action_ste_pool {
+ /* Protects the entire pool. We have one pool per queue and only one
+ * operation can be active per rule at a given time. Thus this lock
+ * protects solely against concurrent garbage collection and we expect
+ * very little contention.
+ */
+ struct mutex lock;
struct mlx5hws_action_ste_pool_element elems[MLX5HWS_POOL_OPTIMIZE_MAX];
};
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h
index e987e93bbc6e..3f8938c73dc0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h
@@ -40,6 +40,7 @@ struct mlx5hws_context {
u32 pd_num;
struct mlx5hws_pool *stc_pool;
struct mlx5hws_action_ste_pool *action_ste_pool; /* One per queue */
+ struct delayed_work action_ste_cleanup;
struct mlx5hws_context_common_res common_res;
struct mlx5hws_pattern_cache *pattern_cache;
struct mlx5hws_definer_cache *definer_cache;
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,99 @@
From d1985c5a5885ee6fa478036997488534d181983c Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:42:01 -0400
Subject: [PATCH] net/mlx5: HWS, Export action STE tables to debugfs
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit 3db55f8cc8d329a97e06fb44347b64a0ca44e780
Author: Vlad Dogaru <vdogaru@nvidia.com>
Date: Thu Apr 10 22:17:42 2025 +0300
net/mlx5: HWS, Export action STE tables to debugfs
Introduce a new type of dump object and dump all action STE tables,
along with information on their RTCs and STEs.
Signed-off-by: Vlad Dogaru <vdogaru@nvidia.com>
Reviewed-by: Hamdan Agbariya <hamdani@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Michal Kubiak <michal.kubiak@intel.com>
Link: https://patch.msgid.link/1744312662-356571-13-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c
index 38f75dec9cfc..91568d6c1dac 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c
@@ -387,10 +387,41 @@ static int hws_debug_dump_context_stc(struct seq_file *f, struct mlx5hws_context
return 0;
}
+static void
+hws_debug_dump_action_ste_table(struct seq_file *f,
+ struct mlx5hws_action_ste_table *action_tbl)
+{
+ int ste_0_id = mlx5hws_pool_get_base_id(action_tbl->pool);
+ int ste_1_id = mlx5hws_pool_get_base_mirror_id(action_tbl->pool);
+
+ seq_printf(f, "%d,0x%llx,%d,%d,%d,%d\n",
+ MLX5HWS_DEBUG_RES_TYPE_ACTION_STE_TABLE,
+ HWS_PTR_TO_ID(action_tbl),
+ action_tbl->rtc_0_id, ste_0_id,
+ action_tbl->rtc_1_id, ste_1_id);
+}
+
+static void hws_debug_dump_action_ste_pool(struct seq_file *f,
+ struct mlx5hws_action_ste_pool *pool)
+{
+ struct mlx5hws_action_ste_table *action_tbl;
+ enum mlx5hws_pool_optimize opt;
+
+ mutex_lock(&pool->lock);
+ for (opt = MLX5HWS_POOL_OPTIMIZE_NONE; opt < MLX5HWS_POOL_OPTIMIZE_MAX;
+ opt++) {
+ list_for_each_entry(action_tbl, &pool->elems[opt].available,
+ list_node) {
+ hws_debug_dump_action_ste_table(f, action_tbl);
+ }
+ }
+ mutex_unlock(&pool->lock);
+}
+
static int hws_debug_dump_context(struct seq_file *f, struct mlx5hws_context *ctx)
{
struct mlx5hws_table *tbl;
- int ret;
+ int ret, i;
ret = hws_debug_dump_context_info(f, ctx);
if (ret)
@@ -410,6 +441,9 @@ static int hws_debug_dump_context(struct seq_file *f, struct mlx5hws_context *ct
return ret;
}
+ for (i = 0; i < ctx->queues; i++)
+ hws_debug_dump_action_ste_pool(f, &ctx->action_ste_pool[i]);
+
return 0;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.h
index e44e7ae28f93..89c396f9f266 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.h
@@ -26,6 +26,8 @@ enum mlx5hws_debug_res_type {
MLX5HWS_DEBUG_RES_TYPE_MATCHER_TEMPLATE_HASH_DEFINER = 4205,
MLX5HWS_DEBUG_RES_TYPE_MATCHER_TEMPLATE_RANGE_DEFINER = 4206,
MLX5HWS_DEBUG_RES_TYPE_MATCHER_TEMPLATE_COMPARE_MATCH_DEFINER = 4207,
+
+ MLX5HWS_DEBUG_RES_TYPE_ACTION_STE_TABLE = 4300,
};
static inline u64
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,100 @@
From e0fb28731ba130ec0f45f724aa5c27a9d49f363d Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:42:01 -0400
Subject: [PATCH] net/mlx5e: ethtool: Fix formatting of
ptp_rq0_csum_complete_tail_slow
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit cfba1d1b61ae3f32e4bc06e9860711a4488d98b7
Author: Kees Cook <kees@kernel.org>
Date: Tue Apr 15 19:01:14 2025 -0700
net/mlx5e: ethtool: Fix formatting of ptp_rq0_csum_complete_tail_slow
The new GCC 15 warning -Wunterminated-string-initialization reports:
In file included from drivers/net/ethernet/mellanox/mlx5/core/en.h:55,
from drivers/net/ethernet/mellanox/mlx5/core/en_stats.c:34:
drivers/net/ethernet/mellanox/mlx5/core/en_stats.h:57:46: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (33 chars into 32 available) [-Wunterminated-string-initialization]
57 | #define MLX5E_DECLARE_PTP_RQ_STAT(type, fld) "ptp_rq%d_"#fld, offsetof(type, fld)
| ^~~~~~~~~~~
drivers/net/ethernet/mellanox/mlx5/core/en_stats.c:2279:11: note: in expansion of macro 'MLX5E_DECLARE_PTP_RQ_STAT'
2279 | { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, csum_complete_tail_slow) },
| ^~~~~~~~~~~~~~~~~~~~~~~~~
This stat string is being used in ethtool_sprintf(), so it must be a
valid NUL-terminated string. Currently the string lacks the final NUL
byte (as GCC warns), but by absolute luck, the next byte in memory is a
space (decimal 32) followed by a NUL. "format" is immediately followed
by little-endian size_t:
struct counter_desc {
char format[32]; /* 0 32 */
size_t offset; /* 32 8 */
};
The "offset" member is populated by the stats member offset:
#define MLX5E_DECLARE_PTP_RQ_STAT(type, fld) "ptp_rq%d_"#fld, offsetof(type, fld)
which for this struct mlx5e_rq_stats member, csum_complete_tail_slow, is
32, or space, and then the rest of the "offset" bytes are NULs.
struct mlx5e_rq_stats {
...
u64 csum_complete_tail_slow; /* 32 8 */
The use of vsnprintf(), within ethtool_sprintf(), reads past the end of
"format" and sees the format string as "ptp_rq%d_csum_complete_tail_slow ",
with %d getting resolved by MLX5E_PTP_CHANNEL_IX (value 0):
ethtool_sprintf(data, ptp_rq_stats_desc[i].format,
MLX5E_PTP_CHANNEL_IX);
With an output result of "ptp_rq0_csum_complete_tail_slow", which gets
precisely truncated to 31 characters with a trailing NUL.
So, instead of accidentally getting this correct due to the NUL bytes
at the end of the size_t that happens to follow the format string, just
make the string initializer 1 byte shorter by replacing "%d" with "0",
since MLX5E_PTP_CHANNEL_IX is already hard-coded. This results in no
initializer truncation and no need to call sprintf().
Signed-off-by: Kees Cook <kees@kernel.org>
Reviewed-by: Dragos Tatulea <dtatulea@nvidia.com>
Link: https://patch.msgid.link/20250416020109.work.297-kees@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
index 1c121b435016..19664fa7f217 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
@@ -2424,8 +2424,7 @@ static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(ptp)
}
if (priv->rx_ptp_opened) {
for (i = 0; i < NUM_PTP_RQ_STATS; i++)
- ethtool_sprintf(data, ptp_rq_stats_desc[i].format,
- MLX5E_PTP_CHANNEL_IX);
+ ethtool_puts(data, ptp_rq_stats_desc[i].format);
}
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
index 8de6fcbd3a03..def5dea1463d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
@@ -54,7 +54,7 @@
#define MLX5E_DECLARE_PTP_TX_STAT(type, fld) "ptp_tx%d_"#fld, offsetof(type, fld)
#define MLX5E_DECLARE_PTP_CH_STAT(type, fld) "ptp_ch_"#fld, offsetof(type, fld)
#define MLX5E_DECLARE_PTP_CQ_STAT(type, fld) "ptp_cq%d_"#fld, offsetof(type, fld)
-#define MLX5E_DECLARE_PTP_RQ_STAT(type, fld) "ptp_rq%d_"#fld, offsetof(type, fld)
+#define MLX5E_DECLARE_PTP_RQ_STAT(type, fld) "ptp_rq0_"#fld, offsetof(type, fld)
#define MLX5E_DECLARE_QOS_TX_STAT(type, fld) "qos_tx%d_"#fld, offsetof(type, fld)
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,58 @@
From 3a54d4daa0cfe0a44c2f8cdc68d5b0c8b277a990 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:42:01 -0400
Subject: [PATCH] net/mlx5: Fix spelling mistakes in mlx5_core_dbg message and
comments
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit 1e36473215297708dbe144c65b9f242c6e604520
Author: Colin Ian King <colin.i.king@gmail.com>
Date: Fri Apr 18 14:57:03 2025 +0100
net/mlx5: Fix spelling mistakes in mlx5_core_dbg message and comments
There is a spelling mistake in a mlx5_core_dbg and two spelling mistakes
in comment blocks. Fix them.
Signed-off-by: Colin Ian King <colin.i.king@gmail.com>
Acked-by: Mark Bloch <mbloch@nvidia.com>
Link: https://patch.msgid.link/20250418135703.542722-1-colin.i.king@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
index 2c5f850c31f6..40024cfa3099 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
@@ -148,7 +148,7 @@ int mlx5_set_msix_vec_count(struct mlx5_core_dev *dev, int function_id,
* Free the IRQ and other resources such as rmap from the system.
* BUT doesn't free or remove reference from mlx5.
* This function is very important for the shutdown flow, where we need to
- * cleanup system resoruces but keep mlx5 objects alive,
+ * cleanup system resources but keep mlx5 objects alive,
* see mlx5_irq_table_free_irqs().
*/
static void mlx5_system_free_irq(struct mlx5_irq *irq)
@@ -588,7 +588,7 @@ static void irq_pool_free(struct mlx5_irq_pool *pool)
struct mlx5_irq *irq;
unsigned long index;
- /* There are cases in which we are destrying the irq_table before
+ /* There are cases in which we are destroying the irq_table before
* freeing all the IRQs, fast teardown for example. Hence, free the irqs
* which might not have been freed.
*/
@@ -617,7 +617,7 @@ static int irq_pools_init(struct mlx5_core_dev *dev, int sf_vec, int pcif_vec,
if (!mlx5_sf_max_functions(dev))
return 0;
if (sf_vec < MLX5_IRQ_VEC_COMP_BASE_SF) {
- mlx5_core_dbg(dev, "Not enught IRQs for SFs. SF may run at lower performance\n");
+ mlx5_core_dbg(dev, "Not enough IRQs for SFs. SF may run at lower performance\n");
return 0;
}
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,136 @@
From 988625f598a4722c53b34743d5ddef5d48a46a20 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:42:02 -0400
Subject: [PATCH] net/mlx5: HWS, Fix IP version decision
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit 5f2f8d8b6800e4fc760c2eccec9b2bd2cacf80cf
Author: Vlad Dogaru <vdogaru@nvidia.com>
Date: Tue Apr 22 12:25:38 2025 +0300
net/mlx5: HWS, Fix IP version decision
Unify the check for IP version when creating a definer. A given matcher
is deemed to match on IPv6 if any of the higher order (>31) bits of
source or destination address mask are set.
A single packet cannot mix IP versions between source and destination
addresses, so it makes no sense that they would be decided on
independently.
Signed-off-by: Vlad Dogaru <vdogaru@nvidia.com>
Reviewed-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Link: https://patch.msgid.link/20250422092540.182091-2-mbloch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c
index c8cc0c8115f5..5257e706dde2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c
@@ -509,9 +509,9 @@ static int
hws_definer_conv_outer(struct mlx5hws_definer_conv_data *cd,
u32 *match_param)
{
- bool is_s_ipv6, is_d_ipv6, smac_set, dmac_set;
struct mlx5hws_definer_fc *fc = cd->fc;
struct mlx5hws_definer_fc *curr_fc;
+ bool is_ipv6, smac_set, dmac_set;
u32 *s_ipv6, *d_ipv6;
if (HWS_IS_FLD_SET_SZ(match_param, outer_headers.l4_type, 0x2) ||
@@ -570,10 +570,10 @@ hws_definer_conv_outer(struct mlx5hws_definer_conv_data *cd,
outer_headers.dst_ipv4_dst_ipv6.ipv6_layout);
/* Assume IPv6 is used if ipv6 bits are set */
- is_s_ipv6 = s_ipv6[0] || s_ipv6[1] || s_ipv6[2];
- is_d_ipv6 = d_ipv6[0] || d_ipv6[1] || d_ipv6[2];
+ is_ipv6 = s_ipv6[0] || s_ipv6[1] || s_ipv6[2] ||
+ d_ipv6[0] || d_ipv6[1] || d_ipv6[2];
- if (is_s_ipv6) {
+ if (is_ipv6) {
/* Handle IPv6 source address */
HWS_SET_HDR(fc, match_param, IPV6_SRC_127_96_O,
outer_headers.src_ipv4_src_ipv6.ipv6_simple_layout.ipv6_127_96,
@@ -587,13 +587,6 @@ hws_definer_conv_outer(struct mlx5hws_definer_conv_data *cd,
HWS_SET_HDR(fc, match_param, IPV6_SRC_31_0_O,
outer_headers.src_ipv4_src_ipv6.ipv6_simple_layout.ipv6_31_0,
ipv6_src_outer.ipv6_address_31_0);
- } else {
- /* Handle IPv4 source address */
- HWS_SET_HDR(fc, match_param, IPV4_SRC_O,
- outer_headers.src_ipv4_src_ipv6.ipv6_simple_layout.ipv6_31_0,
- ipv4_src_dest_outer.source_address);
- }
- if (is_d_ipv6) {
/* Handle IPv6 destination address */
HWS_SET_HDR(fc, match_param, IPV6_DST_127_96_O,
outer_headers.dst_ipv4_dst_ipv6.ipv6_simple_layout.ipv6_127_96,
@@ -608,6 +601,10 @@ hws_definer_conv_outer(struct mlx5hws_definer_conv_data *cd,
outer_headers.dst_ipv4_dst_ipv6.ipv6_simple_layout.ipv6_31_0,
ipv6_dst_outer.ipv6_address_31_0);
} else {
+ /* Handle IPv4 source address */
+ HWS_SET_HDR(fc, match_param, IPV4_SRC_O,
+ outer_headers.src_ipv4_src_ipv6.ipv6_simple_layout.ipv6_31_0,
+ ipv4_src_dest_outer.source_address);
/* Handle IPv4 destination address */
HWS_SET_HDR(fc, match_param, IPV4_DST_O,
outer_headers.dst_ipv4_dst_ipv6.ipv6_simple_layout.ipv6_31_0,
@@ -665,9 +662,9 @@ static int
hws_definer_conv_inner(struct mlx5hws_definer_conv_data *cd,
u32 *match_param)
{
- bool is_s_ipv6, is_d_ipv6, smac_set, dmac_set;
struct mlx5hws_definer_fc *fc = cd->fc;
struct mlx5hws_definer_fc *curr_fc;
+ bool is_ipv6, smac_set, dmac_set;
u32 *s_ipv6, *d_ipv6;
if (HWS_IS_FLD_SET_SZ(match_param, inner_headers.l4_type, 0x2) ||
@@ -728,10 +725,10 @@ hws_definer_conv_inner(struct mlx5hws_definer_conv_data *cd,
inner_headers.dst_ipv4_dst_ipv6.ipv6_layout);
/* Assume IPv6 is used if ipv6 bits are set */
- is_s_ipv6 = s_ipv6[0] || s_ipv6[1] || s_ipv6[2];
- is_d_ipv6 = d_ipv6[0] || d_ipv6[1] || d_ipv6[2];
+ is_ipv6 = s_ipv6[0] || s_ipv6[1] || s_ipv6[2] ||
+ d_ipv6[0] || d_ipv6[1] || d_ipv6[2];
- if (is_s_ipv6) {
+ if (is_ipv6) {
/* Handle IPv6 source address */
HWS_SET_HDR(fc, match_param, IPV6_SRC_127_96_I,
inner_headers.src_ipv4_src_ipv6.ipv6_simple_layout.ipv6_127_96,
@@ -745,13 +742,6 @@ hws_definer_conv_inner(struct mlx5hws_definer_conv_data *cd,
HWS_SET_HDR(fc, match_param, IPV6_SRC_31_0_I,
inner_headers.src_ipv4_src_ipv6.ipv6_simple_layout.ipv6_31_0,
ipv6_src_inner.ipv6_address_31_0);
- } else {
- /* Handle IPv4 source address */
- HWS_SET_HDR(fc, match_param, IPV4_SRC_I,
- inner_headers.src_ipv4_src_ipv6.ipv6_simple_layout.ipv6_31_0,
- ipv4_src_dest_inner.source_address);
- }
- if (is_d_ipv6) {
/* Handle IPv6 destination address */
HWS_SET_HDR(fc, match_param, IPV6_DST_127_96_I,
inner_headers.dst_ipv4_dst_ipv6.ipv6_simple_layout.ipv6_127_96,
@@ -766,6 +756,10 @@ hws_definer_conv_inner(struct mlx5hws_definer_conv_data *cd,
inner_headers.dst_ipv4_dst_ipv6.ipv6_simple_layout.ipv6_31_0,
ipv6_dst_inner.ipv6_address_31_0);
} else {
+ /* Handle IPv4 source address */
+ HWS_SET_HDR(fc, match_param, IPV4_SRC_I,
+ inner_headers.src_ipv4_src_ipv6.ipv6_simple_layout.ipv6_31_0,
+ ipv4_src_dest_inner.source_address);
/* Handle IPv4 destination address */
HWS_SET_HDR(fc, match_param, IPV4_DST_I,
inner_headers.dst_ipv4_dst_ipv6.ipv6_simple_layout.ipv6_31_0,
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,127 @@
From 024b08ee6e9f4a7d00dcdbde0e76f34bebc32c27 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:42:02 -0400
Subject: [PATCH] net/mlx5: HWS, Harden IP version definer checks
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit 6991a975e416154576b0f5f06256aec13e23b0a7
Author: Vlad Dogaru <vdogaru@nvidia.com>
Date: Tue Apr 22 12:25:39 2025 +0300
net/mlx5: HWS, Harden IP version definer checks
Replicate some sanity checks that firmware does, since hardware steering
does not go through firmware.
When creating a definer, disallow matching on IP addresses without also
matching on IP version. The latter can be satisfied by matching either
on the version field in the IP header, or on the ethertype field.
Also refuse to match IPv4 IHL alongside IPv6.
Signed-off-by: Vlad Dogaru <vdogaru@nvidia.com>
Reviewed-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Link: https://patch.msgid.link/20250422092540.182091-3-mbloch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c
index 5257e706dde2..1061a46811ac 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c
@@ -509,9 +509,9 @@ static int
hws_definer_conv_outer(struct mlx5hws_definer_conv_data *cd,
u32 *match_param)
{
+ bool is_ipv6, smac_set, dmac_set, ip_addr_set, ip_ver_set;
struct mlx5hws_definer_fc *fc = cd->fc;
struct mlx5hws_definer_fc *curr_fc;
- bool is_ipv6, smac_set, dmac_set;
u32 *s_ipv6, *d_ipv6;
if (HWS_IS_FLD_SET_SZ(match_param, outer_headers.l4_type, 0x2) ||
@@ -521,6 +521,20 @@ hws_definer_conv_outer(struct mlx5hws_definer_conv_data *cd,
return -EINVAL;
}
+ ip_addr_set = HWS_IS_FLD_SET_SZ(match_param,
+ outer_headers.src_ipv4_src_ipv6,
+ 0x80) ||
+ HWS_IS_FLD_SET_SZ(match_param,
+ outer_headers.dst_ipv4_dst_ipv6, 0x80);
+ ip_ver_set = HWS_IS_FLD_SET(match_param, outer_headers.ip_version) ||
+ HWS_IS_FLD_SET(match_param, outer_headers.ethertype);
+
+ if (ip_addr_set && !ip_ver_set) {
+ mlx5hws_err(cd->ctx,
+ "Unsupported match on IP address without version or ethertype\n");
+ return -EINVAL;
+ }
+
/* L2 Check ethertype */
HWS_SET_HDR(fc, match_param, ETH_TYPE_O,
outer_headers.ethertype,
@@ -573,6 +587,12 @@ hws_definer_conv_outer(struct mlx5hws_definer_conv_data *cd,
is_ipv6 = s_ipv6[0] || s_ipv6[1] || s_ipv6[2] ||
d_ipv6[0] || d_ipv6[1] || d_ipv6[2];
+ /* IHL is an IPv4-specific field. */
+ if (is_ipv6 && HWS_IS_FLD_SET(match_param, outer_headers.ipv4_ihl)) {
+ mlx5hws_err(cd->ctx, "Unsupported match on IPv6 address and IPv4 IHL\n");
+ return -EINVAL;
+ }
+
if (is_ipv6) {
/* Handle IPv6 source address */
HWS_SET_HDR(fc, match_param, IPV6_SRC_127_96_O,
@@ -662,9 +682,9 @@ static int
hws_definer_conv_inner(struct mlx5hws_definer_conv_data *cd,
u32 *match_param)
{
+ bool is_ipv6, smac_set, dmac_set, ip_addr_set, ip_ver_set;
struct mlx5hws_definer_fc *fc = cd->fc;
struct mlx5hws_definer_fc *curr_fc;
- bool is_ipv6, smac_set, dmac_set;
u32 *s_ipv6, *d_ipv6;
if (HWS_IS_FLD_SET_SZ(match_param, inner_headers.l4_type, 0x2) ||
@@ -674,6 +694,20 @@ hws_definer_conv_inner(struct mlx5hws_definer_conv_data *cd,
return -EINVAL;
}
+ ip_addr_set = HWS_IS_FLD_SET_SZ(match_param,
+ inner_headers.src_ipv4_src_ipv6,
+ 0x80) ||
+ HWS_IS_FLD_SET_SZ(match_param,
+ inner_headers.dst_ipv4_dst_ipv6, 0x80);
+ ip_ver_set = HWS_IS_FLD_SET(match_param, inner_headers.ip_version) ||
+ HWS_IS_FLD_SET(match_param, inner_headers.ethertype);
+
+ if (ip_addr_set && !ip_ver_set) {
+ mlx5hws_err(cd->ctx,
+ "Unsupported match on IP address without version or ethertype\n");
+ return -EINVAL;
+ }
+
/* L2 Check ethertype */
HWS_SET_HDR(fc, match_param, ETH_TYPE_I,
inner_headers.ethertype,
@@ -728,6 +762,12 @@ hws_definer_conv_inner(struct mlx5hws_definer_conv_data *cd,
is_ipv6 = s_ipv6[0] || s_ipv6[1] || s_ipv6[2] ||
d_ipv6[0] || d_ipv6[1] || d_ipv6[2];
+ /* IHL is an IPv4-specific field. */
+ if (is_ipv6 && HWS_IS_FLD_SET(match_param, inner_headers.ipv4_ihl)) {
+ mlx5hws_err(cd->ctx, "Unsupported match on IPv6 address and IPv4 IHL\n");
+ return -EINVAL;
+ }
+
if (is_ipv6) {
/* Handle IPv6 source address */
HWS_SET_HDR(fc, match_param, IPV6_SRC_127_96_I,
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,256 @@
From c0c4826b9a633587cd14595358b62c40a3672204 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:42:02 -0400
Subject: [PATCH] net/mlx5: HWS, Disallow matcher IP version mixing
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit f41f3edf0b15d7ce0b0f71c00a6125e8d7ca735f
Author: Vlad Dogaru <vdogaru@nvidia.com>
Date: Tue Apr 22 12:25:40 2025 +0300
net/mlx5: HWS, Disallow matcher IP version mixing
Signal clearly to the user, via an error, that mixing IPv4 and IPv6
rules in the same matcher is not supported. Previously such cases
silently failed by adding a rule that did not work correctly.
Rules can specify an IP version by one of two fields: IP version or
ethertype. At matcher creation, store whether the template matches on
any of these two fields. If yes, inspect each rule for its corresponding
match value and store the IP version inside the matcher to guard against
inconsistencies with subsequent rules.
Furthermore, also check rules for internal consistency, i.e. verify that
the ethertype and IP version match values do not contradict each other.
The logic applies to inner and outer headers independently, to account
for tunneling.
Rules that do not match on IP addresses are not affected.
Signed-off-by: Vlad Dogaru <vdogaru@nvidia.com>
Reviewed-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Link: https://patch.msgid.link/20250422092540.182091-4-mbloch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c
index 716502732d3d..5b0c1623499b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c
@@ -385,6 +385,30 @@ static int hws_matcher_bind_at(struct mlx5hws_matcher *matcher)
return 0;
}
+static void hws_matcher_set_ip_version_match(struct mlx5hws_matcher *matcher)
+{
+ int i;
+
+ for (i = 0; i < matcher->mt->fc_sz; i++) {
+ switch (matcher->mt->fc[i].fname) {
+ case MLX5HWS_DEFINER_FNAME_ETH_TYPE_O:
+ matcher->matches_outer_ethertype = 1;
+ break;
+ case MLX5HWS_DEFINER_FNAME_ETH_L3_TYPE_O:
+ matcher->matches_outer_ip_version = 1;
+ break;
+ case MLX5HWS_DEFINER_FNAME_ETH_TYPE_I:
+ matcher->matches_inner_ethertype = 1;
+ break;
+ case MLX5HWS_DEFINER_FNAME_ETH_L3_TYPE_I:
+ matcher->matches_inner_ip_version = 1;
+ break;
+ default:
+ break;
+ }
+ }
+}
+
static int hws_matcher_bind_mt(struct mlx5hws_matcher *matcher)
{
struct mlx5hws_context *ctx = matcher->tbl->ctx;
@@ -401,6 +425,8 @@ static int hws_matcher_bind_mt(struct mlx5hws_matcher *matcher)
}
}
+ hws_matcher_set_ip_version_match(matcher);
+
/* Create an STE pool per matcher*/
pool_attr.table_type = matcher->tbl->type;
pool_attr.pool_type = MLX5HWS_POOL_TYPE_STE;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h
index bad1fa8f77fd..8e95158a66b5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h
@@ -50,6 +50,12 @@ struct mlx5hws_matcher_match_ste {
struct mlx5hws_pool *pool;
};
+enum {
+ MLX5HWS_MATCHER_IPV_UNSET = 0,
+ MLX5HWS_MATCHER_IPV_4 = 1,
+ MLX5HWS_MATCHER_IPV_6 = 2,
+};
+
struct mlx5hws_matcher {
struct mlx5hws_table *tbl;
struct mlx5hws_matcher_attr attr;
@@ -61,6 +67,12 @@ struct mlx5hws_matcher {
u8 num_of_action_stes;
/* enum mlx5hws_matcher_flags */
u8 flags;
+ u8 matches_outer_ethertype:1;
+ u8 matches_outer_ip_version:1;
+ u8 matches_inner_ethertype:1;
+ u8 matches_inner_ip_version:1;
+ u8 outer_ip_version:2;
+ u8 inner_ip_version:2;
u32 end_ft_id;
struct mlx5hws_matcher *col_matcher;
struct mlx5hws_matcher *resize_dst;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c
index 9e6f35d68445..5342a4cc7194 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c
@@ -655,6 +655,124 @@ int mlx5hws_rule_move_hws_add(struct mlx5hws_rule *rule,
return 0;
}
+static u8 hws_rule_ethertype_to_matcher_ipv(u32 ethertype)
+{
+ switch (ethertype) {
+ case ETH_P_IP:
+ return MLX5HWS_MATCHER_IPV_4;
+ case ETH_P_IPV6:
+ return MLX5HWS_MATCHER_IPV_6;
+ default:
+ return MLX5HWS_MATCHER_IPV_UNSET;
+ }
+}
+
+static u8 hws_rule_ip_version_to_matcher_ipv(u32 ip_version)
+{
+ switch (ip_version) {
+ case 4:
+ return MLX5HWS_MATCHER_IPV_4;
+ case 6:
+ return MLX5HWS_MATCHER_IPV_6;
+ default:
+ return MLX5HWS_MATCHER_IPV_UNSET;
+ }
+}
+
+static int hws_rule_check_outer_ip_version(struct mlx5hws_matcher *matcher,
+ u32 *match_param)
+{
+ struct mlx5hws_context *ctx = matcher->tbl->ctx;
+ u8 outer_ipv_ether = MLX5HWS_MATCHER_IPV_UNSET;
+ u8 outer_ipv_ip = MLX5HWS_MATCHER_IPV_UNSET;
+ u8 outer_ipv, ver;
+
+ if (matcher->matches_outer_ethertype) {
+ ver = MLX5_GET(fte_match_param, match_param,
+ outer_headers.ethertype);
+ outer_ipv_ether = hws_rule_ethertype_to_matcher_ipv(ver);
+ }
+ if (matcher->matches_outer_ip_version) {
+ ver = MLX5_GET(fte_match_param, match_param,
+ outer_headers.ip_version);
+ outer_ipv_ip = hws_rule_ip_version_to_matcher_ipv(ver);
+ }
+
+ if (outer_ipv_ether != MLX5HWS_MATCHER_IPV_UNSET &&
+ outer_ipv_ip != MLX5HWS_MATCHER_IPV_UNSET &&
+ outer_ipv_ether != outer_ipv_ip) {
+ mlx5hws_err(ctx, "Rule matches on inconsistent outer ethertype and ip version\n");
+ return -EINVAL;
+ }
+
+ outer_ipv = outer_ipv_ether != MLX5HWS_MATCHER_IPV_UNSET ?
+ outer_ipv_ether : outer_ipv_ip;
+ if (outer_ipv != MLX5HWS_MATCHER_IPV_UNSET &&
+ matcher->outer_ip_version != MLX5HWS_MATCHER_IPV_UNSET &&
+ outer_ipv != matcher->outer_ip_version) {
+ mlx5hws_err(ctx, "Matcher and rule disagree on outer IP version\n");
+ return -EINVAL;
+ }
+ matcher->outer_ip_version = outer_ipv;
+
+ return 0;
+}
+
+static int hws_rule_check_inner_ip_version(struct mlx5hws_matcher *matcher,
+ u32 *match_param)
+{
+ struct mlx5hws_context *ctx = matcher->tbl->ctx;
+ u8 inner_ipv_ether = MLX5HWS_MATCHER_IPV_UNSET;
+ u8 inner_ipv_ip = MLX5HWS_MATCHER_IPV_UNSET;
+ u8 inner_ipv, ver;
+
+ if (matcher->matches_inner_ethertype) {
+ ver = MLX5_GET(fte_match_param, match_param,
+ inner_headers.ethertype);
+ inner_ipv_ether = hws_rule_ethertype_to_matcher_ipv(ver);
+ }
+ if (matcher->matches_inner_ip_version) {
+ ver = MLX5_GET(fte_match_param, match_param,
+ inner_headers.ip_version);
+ inner_ipv_ip = hws_rule_ip_version_to_matcher_ipv(ver);
+ }
+
+ if (inner_ipv_ether != MLX5HWS_MATCHER_IPV_UNSET &&
+ inner_ipv_ip != MLX5HWS_MATCHER_IPV_UNSET &&
+ inner_ipv_ether != inner_ipv_ip) {
+ mlx5hws_err(ctx, "Rule matches on inconsistent inner ethertype and ip version\n");
+ return -EINVAL;
+ }
+
+ inner_ipv = inner_ipv_ether != MLX5HWS_MATCHER_IPV_UNSET ?
+ inner_ipv_ether : inner_ipv_ip;
+ if (inner_ipv != MLX5HWS_MATCHER_IPV_UNSET &&
+ matcher->inner_ip_version != MLX5HWS_MATCHER_IPV_UNSET &&
+ inner_ipv != matcher->inner_ip_version) {
+ mlx5hws_err(ctx, "Matcher and rule disagree on inner IP version\n");
+ return -EINVAL;
+ }
+ matcher->inner_ip_version = inner_ipv;
+
+ return 0;
+}
+
+static int hws_rule_check_ip_version(struct mlx5hws_matcher *matcher,
+ u32 *match_param)
+{
+ int ret;
+
+ ret = hws_rule_check_outer_ip_version(matcher, match_param);
+ if (unlikely(ret))
+ return ret;
+
+ ret = hws_rule_check_inner_ip_version(matcher, match_param);
+ if (unlikely(ret))
+ return ret;
+
+ return 0;
+}
+
int mlx5hws_rule_create(struct mlx5hws_matcher *matcher,
u8 mt_idx,
u32 *match_param,
@@ -665,6 +783,10 @@ int mlx5hws_rule_create(struct mlx5hws_matcher *matcher,
{
int ret;
+ ret = hws_rule_check_ip_version(matcher, match_param);
+ if (unlikely(ret))
+ return ret;
+
rule_handle->matcher = matcher;
ret = hws_rule_enqueue_precheck_create(rule_handle, attr);
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,142 @@
From 85e0a9a7588dbdecc6ff8e2facde4a75b8ff4299 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:42:02 -0400
Subject: [PATCH] RDMA/mlx5: Fix error flow upon firmware failure for RQ
destruction
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit 5d2ea5aebbb2f3ebde4403f9c55b2b057e5dd2d6
Author: Patrisious Haddad <phaddad@nvidia.com>
Date: Mon Apr 28 14:34:07 2025 +0300
RDMA/mlx5: Fix error flow upon firmware failure for RQ destruction
Upon RQ destruction if the firmware command fails which is the
last resource to be destroyed some SW resources were already cleaned
regardless of the failure.
Now properly rollback the object to its original state upon such failure.
In order to avoid a use-after free in case someone tries to destroy the
object again, which results in the following kernel trace:
refcount_t: underflow; use-after-free.
WARNING: CPU: 0 PID: 37589 at lib/refcount.c:28 refcount_warn_saturate+0xf4/0x148
Modules linked in: rdma_ucm(OE) rdma_cm(OE) iw_cm(OE) ib_ipoib(OE) ib_cm(OE) ib_umad(OE) mlx5_ib(OE) rfkill mlx5_core(OE) mlxdevm(OE) ib_uverbs(OE) ib_core(OE) psample mlxfw(OE) mlx_compat(OE) macsec tls pci_hyperv_intf sunrpc vfat fat virtio_net net_failover failover fuse loop nfnetlink vsock_loopback vmw_vsock_virtio_transport_common vmw_vsock_vmci_transport vmw_vmci vsock xfs crct10dif_ce ghash_ce sha2_ce sha256_arm64 sha1_ce virtio_console virtio_gpu virtio_blk virtio_dma_buf virtio_mmio dm_mirror dm_region_hash dm_log dm_mod xpmem(OE)
CPU: 0 UID: 0 PID: 37589 Comm: python3 Kdump: loaded Tainted: G OE ------- --- 6.12.0-54.el10.aarch64 #1
Tainted: [O]=OOT_MODULE, [E]=UNSIGNED_MODULE
Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015
pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
pc : refcount_warn_saturate+0xf4/0x148
lr : refcount_warn_saturate+0xf4/0x148
sp : ffff80008b81b7e0
x29: ffff80008b81b7e0 x28: ffff000133d51600 x27: 0000000000000001
x26: 0000000000000000 x25: 00000000ffffffea x24: ffff00010ae80f00
x23: ffff00010ae80f80 x22: ffff0000c66e5d08 x21: 0000000000000000
x20: ffff0000c66e0000 x19: ffff00010ae80340 x18: 0000000000000006
x17: 0000000000000000 x16: 0000000000000020 x15: ffff80008b81b37f
x14: 0000000000000000 x13: 2e656572662d7265 x12: ffff80008283ef78
x11: ffff80008257efd0 x10: ffff80008283efd0 x9 : ffff80008021ed90
x8 : 0000000000000001 x7 : 00000000000bffe8 x6 : c0000000ffff7fff
x5 : ffff0001fb8e3408 x4 : 0000000000000000 x3 : ffff800179993000
x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffff000133d51600
Call trace:
refcount_warn_saturate+0xf4/0x148
mlx5_core_put_rsc+0x88/0xa0 [mlx5_ib]
mlx5_core_destroy_rq_tracked+0x64/0x98 [mlx5_ib]
mlx5_ib_destroy_wq+0x34/0x80 [mlx5_ib]
ib_destroy_wq_user+0x30/0xc0 [ib_core]
uverbs_free_wq+0x28/0x58 [ib_uverbs]
destroy_hw_idr_uobject+0x34/0x78 [ib_uverbs]
uverbs_destroy_uobject+0x48/0x240 [ib_uverbs]
__uverbs_cleanup_ufile+0xd4/0x1a8 [ib_uverbs]
uverbs_destroy_ufile_hw+0x48/0x120 [ib_uverbs]
ib_uverbs_close+0x2c/0x100 [ib_uverbs]
__fput+0xd8/0x2f0
__fput_sync+0x50/0x70
__arm64_sys_close+0x40/0x90
invoke_syscall.constprop.0+0x74/0xd0
do_el0_svc+0x48/0xe8
el0_svc+0x44/0x1d0
el0t_64_sync_handler+0x120/0x130
el0t_64_sync+0x1a4/0x1a8
Fixes: e2013b212f9f ("net/mlx5_core: Add RQ and SQ event handling")
Signed-off-by: Patrisious Haddad <phaddad@nvidia.com>
Link: https://patch.msgid.link/3181433ccdd695c63560eeeb3f0c990961732101.1745839855.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/infiniband/hw/mlx5/qpc.c b/drivers/infiniband/hw/mlx5/qpc.c
index d3dcc272200a..146d03ae40bd 100644
--- a/drivers/infiniband/hw/mlx5/qpc.c
+++ b/drivers/infiniband/hw/mlx5/qpc.c
@@ -21,8 +21,10 @@ mlx5_get_rsc(struct mlx5_qp_table *table, u32 rsn)
spin_lock_irqsave(&table->lock, flags);
common = radix_tree_lookup(&table->tree, rsn);
- if (common)
+ if (common && !common->invalid)
refcount_inc(&common->refcount);
+ else
+ common = NULL;
spin_unlock_irqrestore(&table->lock, flags);
@@ -178,6 +180,18 @@ static int create_resource_common(struct mlx5_ib_dev *dev,
return 0;
}
+static void modify_resource_common_state(struct mlx5_ib_dev *dev,
+ struct mlx5_core_qp *qp,
+ bool invalid)
+{
+ struct mlx5_qp_table *table = &dev->qp_table;
+ unsigned long flags;
+
+ spin_lock_irqsave(&table->lock, flags);
+ qp->common.invalid = invalid;
+ spin_unlock_irqrestore(&table->lock, flags);
+}
+
static void destroy_resource_common(struct mlx5_ib_dev *dev,
struct mlx5_core_qp *qp)
{
@@ -609,8 +623,20 @@ int mlx5_core_create_rq_tracked(struct mlx5_ib_dev *dev, u32 *in, int inlen,
int mlx5_core_destroy_rq_tracked(struct mlx5_ib_dev *dev,
struct mlx5_core_qp *rq)
{
+ int ret;
+
+ /* The rq destruction can be called again in case it fails, hence we
+ * mark the common resource as invalid and only once FW destruction
+ * is completed successfully we actually destroy the resources.
+ */
+ modify_resource_common_state(dev, rq, true);
+ ret = destroy_rq_tracked(dev, rq->qpn, rq->uid);
+ if (ret) {
+ modify_resource_common_state(dev, rq, false);
+ return ret;
+ }
destroy_resource_common(dev, rq);
- return destroy_rq_tracked(dev, rq->qpn, rq->uid);
+ return 0;
}
static void destroy_sq_tracked(struct mlx5_ib_dev *dev, u32 sqn, u16 uid)
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 04705078dfab..df76aece6be9 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -398,6 +398,7 @@ struct mlx5_core_rsc_common {
enum mlx5_res_type res;
refcount_t refcount;
struct completion free;
+ bool invalid;
};
struct mlx5_uars_page {
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,78 @@
From caaa5c0c5b3a539eefe31c4fe578b881ba6512bf Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:42:03 -0400
Subject: [PATCH] net/mlx5: support software TX timestamp
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit 2451d3fb388f29d87d1abd3d2952d5ce36109816
Author: Stanislav Fomichev <sdf@fomichev.me>
Date: Thu May 8 16:51:09 2025 -0700
net/mlx5: support software TX timestamp
Having a software timestamp (along with existing hardware one) is
useful to trace how the packets flow through the stack.
mlx5e_tx_skb_update_hwts_flags is called from tx paths
to setup HW timestamp; extend it to add software one as well.
Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
Signed-off-by: Stanislav Fomichev <stfomichev@gmail.com>
Reviewed-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Acked-by: Martin KaFai Lau <martin.lau@kernel.org>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/20250508235109.585096-1-stfomichev@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 8578f03783bc..e6c9338ddae8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -1686,6 +1686,7 @@ int mlx5e_ethtool_get_ts_info(struct mlx5e_priv *priv,
return 0;
info->so_timestamping = SOF_TIMESTAMPING_TX_HARDWARE |
+ SOF_TIMESTAMPING_TX_SOFTWARE |
SOF_TIMESTAMPING_RX_HARDWARE |
SOF_TIMESTAMPING_RAW_HARDWARE;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index 4fd853d19e31..55a8629f0792 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -337,10 +337,11 @@ static void mlx5e_sq_calc_wqe_attr(struct sk_buff *skb, const struct mlx5e_tx_at
};
}
-static void mlx5e_tx_skb_update_hwts_flags(struct sk_buff *skb)
+static void mlx5e_tx_skb_update_ts_flags(struct sk_buff *skb)
{
if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP))
skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;
+ skb_tx_timestamp(skb);
}
static void mlx5e_tx_check_stop(struct mlx5e_txqsq *sq)
@@ -392,7 +393,7 @@ mlx5e_txwqe_complete(struct mlx5e_txqsq *sq, struct sk_buff *skb,
cseg->opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | attr->opcode);
cseg->qpn_ds = cpu_to_be32((sq->sqn << 8) | wqe_attr->ds_cnt);
- mlx5e_tx_skb_update_hwts_flags(skb);
+ mlx5e_tx_skb_update_ts_flags(skb);
sq->pc += wi->num_wqebbs;
@@ -625,7 +626,7 @@ mlx5e_sq_xmit_mpwqe(struct mlx5e_txqsq *sq, struct sk_buff *skb,
mlx5e_dma_push(sq, txd.dma_addr, txd.len, MLX5E_DMA_MAP_SINGLE);
mlx5e_skb_fifo_push(&sq->db.skb_fifo, skb);
mlx5e_tx_mpwqe_add_dseg(sq, &txd);
- mlx5e_tx_skb_update_hwts_flags(skb);
+ mlx5e_tx_skb_update_ts_flags(skb);
if (unlikely(mlx5e_tx_mpwqe_is_full(&sq->mpwqe))) {
/* Might stop the queue and affect the retval of __netdev_tx_sent_queue. */
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,77 @@
From 61618987cfeb590d9694abd9fbcdb68f8845d29b Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:42:03 -0400
Subject: [PATCH] net/mlx5: HWS, expose function mlx5hws_table_ft_set_next_ft
in header
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit d2338a27fcee9158d0378d759152b8e0a5933c88
Author: Yevgeny Kliteynik <kliteyn@nvidia.com>
Date: Sun May 11 22:38:01 2025 +0300
net/mlx5: HWS, expose function mlx5hws_table_ft_set_next_ft in header
In preparation for complex matcher support, make function
mlx5hws_table_ft_set_next_ft() non-static and expose it in header.
Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Reviewed-by: Vlad Dogaru <vdogaru@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1746992290-568936-2-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c
index ab1297531232..568f691733f3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c
@@ -342,10 +342,10 @@ int mlx5hws_table_ft_set_next_rtc(struct mlx5hws_context *ctx,
return mlx5hws_cmd_flow_table_modify(ctx->mdev, &ft_attr, ft_id);
}
-static int hws_table_ft_set_next_ft(struct mlx5hws_context *ctx,
- u32 ft_id,
- u32 fw_ft_type,
- u32 next_ft_id)
+int mlx5hws_table_ft_set_next_ft(struct mlx5hws_context *ctx,
+ u32 ft_id,
+ u32 fw_ft_type,
+ u32 next_ft_id)
{
struct mlx5hws_cmd_ft_modify_attr ft_attr = {0};
@@ -389,10 +389,10 @@ int mlx5hws_table_connect_to_miss_table(struct mlx5hws_table *src_tbl,
if (dst_tbl) {
if (list_empty(&dst_tbl->matchers_list)) {
/* Connect src_tbl last_ft to dst_tbl start anchor */
- ret = hws_table_ft_set_next_ft(src_tbl->ctx,
- last_ft_id,
- src_tbl->fw_ft_type,
- dst_tbl->ft_id);
+ ret = mlx5hws_table_ft_set_next_ft(src_tbl->ctx,
+ last_ft_id,
+ src_tbl->fw_ft_type,
+ dst_tbl->ft_id);
if (ret)
return ret;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.h
index dd50420eec9e..0400cce0c317 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.h
@@ -65,4 +65,9 @@ int mlx5hws_table_ft_set_next_rtc(struct mlx5hws_context *ctx,
u32 rtc_0_id,
u32 rtc_1_id);
+int mlx5hws_table_ft_set_next_ft(struct mlx5hws_context *ctx,
+ u32 ft_id,
+ u32 fw_ft_type,
+ u32 next_ft_id);
+
#endif /* MLX5HWS_TABLE_H_ */
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,263 @@
From f8db8d6e3362f5fac65193f3ece0fffb4ad20588 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:42:03 -0400
Subject: [PATCH] net/mlx5: HWS, add definer function to get field name str
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit fed5f4831281593a4bda2f8ef6912fdbcad6e670
Author: Yevgeny Kliteynik <kliteyn@nvidia.com>
Date: Sun May 11 22:38:02 2025 +0300
net/mlx5: HWS, add definer function to get field name str
In preparation for complex matcher support, add function for
converting definer fname to str, which will be used in following
patches.
Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Reviewed-by: Vlad Dogaru <vdogaru@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1746992290-568936-3-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c
index 1061a46811ac..5cc0dc002ac1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c
@@ -158,6 +158,218 @@ struct mlx5hws_definer_conv_data {
u32 match_flags;
};
+#define HWS_DEFINER_ENTRY(name)[MLX5HWS_DEFINER_FNAME_##name] = #name
+
+static const char * const hws_definer_fname_to_str[] = {
+ HWS_DEFINER_ENTRY(ETH_SMAC_47_16_O),
+ HWS_DEFINER_ENTRY(ETH_SMAC_47_16_I),
+ HWS_DEFINER_ENTRY(ETH_SMAC_15_0_O),
+ HWS_DEFINER_ENTRY(ETH_SMAC_15_0_I),
+ HWS_DEFINER_ENTRY(ETH_DMAC_47_16_O),
+ HWS_DEFINER_ENTRY(ETH_DMAC_47_16_I),
+ HWS_DEFINER_ENTRY(ETH_DMAC_15_0_O),
+ HWS_DEFINER_ENTRY(ETH_DMAC_15_0_I),
+ HWS_DEFINER_ENTRY(ETH_TYPE_O),
+ HWS_DEFINER_ENTRY(ETH_TYPE_I),
+ HWS_DEFINER_ENTRY(ETH_L3_TYPE_O),
+ HWS_DEFINER_ENTRY(ETH_L3_TYPE_I),
+ HWS_DEFINER_ENTRY(VLAN_TYPE_O),
+ HWS_DEFINER_ENTRY(VLAN_TYPE_I),
+ HWS_DEFINER_ENTRY(VLAN_FIRST_PRIO_O),
+ HWS_DEFINER_ENTRY(VLAN_FIRST_PRIO_I),
+ HWS_DEFINER_ENTRY(VLAN_CFI_O),
+ HWS_DEFINER_ENTRY(VLAN_CFI_I),
+ HWS_DEFINER_ENTRY(VLAN_ID_O),
+ HWS_DEFINER_ENTRY(VLAN_ID_I),
+ HWS_DEFINER_ENTRY(VLAN_SECOND_TYPE_O),
+ HWS_DEFINER_ENTRY(VLAN_SECOND_TYPE_I),
+ HWS_DEFINER_ENTRY(VLAN_SECOND_PRIO_O),
+ HWS_DEFINER_ENTRY(VLAN_SECOND_PRIO_I),
+ HWS_DEFINER_ENTRY(VLAN_SECOND_CFI_O),
+ HWS_DEFINER_ENTRY(VLAN_SECOND_CFI_I),
+ HWS_DEFINER_ENTRY(VLAN_SECOND_ID_O),
+ HWS_DEFINER_ENTRY(VLAN_SECOND_ID_I),
+ HWS_DEFINER_ENTRY(IPV4_IHL_O),
+ HWS_DEFINER_ENTRY(IPV4_IHL_I),
+ HWS_DEFINER_ENTRY(IP_DSCP_O),
+ HWS_DEFINER_ENTRY(IP_DSCP_I),
+ HWS_DEFINER_ENTRY(IP_ECN_O),
+ HWS_DEFINER_ENTRY(IP_ECN_I),
+ HWS_DEFINER_ENTRY(IP_TTL_O),
+ HWS_DEFINER_ENTRY(IP_TTL_I),
+ HWS_DEFINER_ENTRY(IPV4_DST_O),
+ HWS_DEFINER_ENTRY(IPV4_DST_I),
+ HWS_DEFINER_ENTRY(IPV4_SRC_O),
+ HWS_DEFINER_ENTRY(IPV4_SRC_I),
+ HWS_DEFINER_ENTRY(IP_VERSION_O),
+ HWS_DEFINER_ENTRY(IP_VERSION_I),
+ HWS_DEFINER_ENTRY(IP_FRAG_O),
+ HWS_DEFINER_ENTRY(IP_FRAG_I),
+ HWS_DEFINER_ENTRY(IP_LEN_O),
+ HWS_DEFINER_ENTRY(IP_LEN_I),
+ HWS_DEFINER_ENTRY(IP_TOS_O),
+ HWS_DEFINER_ENTRY(IP_TOS_I),
+ HWS_DEFINER_ENTRY(IPV6_FLOW_LABEL_O),
+ HWS_DEFINER_ENTRY(IPV6_FLOW_LABEL_I),
+ HWS_DEFINER_ENTRY(IPV6_DST_127_96_O),
+ HWS_DEFINER_ENTRY(IPV6_DST_95_64_O),
+ HWS_DEFINER_ENTRY(IPV6_DST_63_32_O),
+ HWS_DEFINER_ENTRY(IPV6_DST_31_0_O),
+ HWS_DEFINER_ENTRY(IPV6_DST_127_96_I),
+ HWS_DEFINER_ENTRY(IPV6_DST_95_64_I),
+ HWS_DEFINER_ENTRY(IPV6_DST_63_32_I),
+ HWS_DEFINER_ENTRY(IPV6_DST_31_0_I),
+ HWS_DEFINER_ENTRY(IPV6_SRC_127_96_O),
+ HWS_DEFINER_ENTRY(IPV6_SRC_95_64_O),
+ HWS_DEFINER_ENTRY(IPV6_SRC_63_32_O),
+ HWS_DEFINER_ENTRY(IPV6_SRC_31_0_O),
+ HWS_DEFINER_ENTRY(IPV6_SRC_127_96_I),
+ HWS_DEFINER_ENTRY(IPV6_SRC_95_64_I),
+ HWS_DEFINER_ENTRY(IPV6_SRC_63_32_I),
+ HWS_DEFINER_ENTRY(IPV6_SRC_31_0_I),
+ HWS_DEFINER_ENTRY(IP_PROTOCOL_O),
+ HWS_DEFINER_ENTRY(IP_PROTOCOL_I),
+ HWS_DEFINER_ENTRY(L4_SPORT_O),
+ HWS_DEFINER_ENTRY(L4_SPORT_I),
+ HWS_DEFINER_ENTRY(L4_DPORT_O),
+ HWS_DEFINER_ENTRY(L4_DPORT_I),
+ HWS_DEFINER_ENTRY(TCP_FLAGS_I),
+ HWS_DEFINER_ENTRY(TCP_FLAGS_O),
+ HWS_DEFINER_ENTRY(TCP_SEQ_NUM),
+ HWS_DEFINER_ENTRY(TCP_ACK_NUM),
+ HWS_DEFINER_ENTRY(GTP_TEID),
+ HWS_DEFINER_ENTRY(GTP_MSG_TYPE),
+ HWS_DEFINER_ENTRY(GTP_EXT_FLAG),
+ HWS_DEFINER_ENTRY(GTP_NEXT_EXT_HDR),
+ HWS_DEFINER_ENTRY(GTP_EXT_HDR_PDU),
+ HWS_DEFINER_ENTRY(GTP_EXT_HDR_QFI),
+ HWS_DEFINER_ENTRY(GTPU_DW0),
+ HWS_DEFINER_ENTRY(GTPU_FIRST_EXT_DW0),
+ HWS_DEFINER_ENTRY(GTPU_DW2),
+ HWS_DEFINER_ENTRY(FLEX_PARSER_0),
+ HWS_DEFINER_ENTRY(FLEX_PARSER_1),
+ HWS_DEFINER_ENTRY(FLEX_PARSER_2),
+ HWS_DEFINER_ENTRY(FLEX_PARSER_3),
+ HWS_DEFINER_ENTRY(FLEX_PARSER_4),
+ HWS_DEFINER_ENTRY(FLEX_PARSER_5),
+ HWS_DEFINER_ENTRY(FLEX_PARSER_6),
+ HWS_DEFINER_ENTRY(FLEX_PARSER_7),
+ HWS_DEFINER_ENTRY(VPORT_REG_C_0),
+ HWS_DEFINER_ENTRY(VXLAN_FLAGS),
+ HWS_DEFINER_ENTRY(VXLAN_VNI),
+ HWS_DEFINER_ENTRY(VXLAN_GPE_FLAGS),
+ HWS_DEFINER_ENTRY(VXLAN_GPE_RSVD0),
+ HWS_DEFINER_ENTRY(VXLAN_GPE_PROTO),
+ HWS_DEFINER_ENTRY(VXLAN_GPE_VNI),
+ HWS_DEFINER_ENTRY(VXLAN_GPE_RSVD1),
+ HWS_DEFINER_ENTRY(GENEVE_OPT_LEN),
+ HWS_DEFINER_ENTRY(GENEVE_OAM),
+ HWS_DEFINER_ENTRY(GENEVE_PROTO),
+ HWS_DEFINER_ENTRY(GENEVE_VNI),
+ HWS_DEFINER_ENTRY(SOURCE_QP),
+ HWS_DEFINER_ENTRY(SOURCE_GVMI),
+ HWS_DEFINER_ENTRY(REG_0),
+ HWS_DEFINER_ENTRY(REG_1),
+ HWS_DEFINER_ENTRY(REG_2),
+ HWS_DEFINER_ENTRY(REG_3),
+ HWS_DEFINER_ENTRY(REG_4),
+ HWS_DEFINER_ENTRY(REG_5),
+ HWS_DEFINER_ENTRY(REG_6),
+ HWS_DEFINER_ENTRY(REG_7),
+ HWS_DEFINER_ENTRY(REG_8),
+ HWS_DEFINER_ENTRY(REG_9),
+ HWS_DEFINER_ENTRY(REG_10),
+ HWS_DEFINER_ENTRY(REG_11),
+ HWS_DEFINER_ENTRY(REG_A),
+ HWS_DEFINER_ENTRY(REG_B),
+ HWS_DEFINER_ENTRY(GRE_KEY_PRESENT),
+ HWS_DEFINER_ENTRY(GRE_C),
+ HWS_DEFINER_ENTRY(GRE_K),
+ HWS_DEFINER_ENTRY(GRE_S),
+ HWS_DEFINER_ENTRY(GRE_PROTOCOL),
+ HWS_DEFINER_ENTRY(GRE_OPT_KEY),
+ HWS_DEFINER_ENTRY(GRE_OPT_SEQ),
+ HWS_DEFINER_ENTRY(GRE_OPT_CHECKSUM),
+ HWS_DEFINER_ENTRY(INTEGRITY_O),
+ HWS_DEFINER_ENTRY(INTEGRITY_I),
+ HWS_DEFINER_ENTRY(ICMP_DW1),
+ HWS_DEFINER_ENTRY(ICMP_DW2),
+ HWS_DEFINER_ENTRY(ICMP_DW3),
+ HWS_DEFINER_ENTRY(IPSEC_SPI),
+ HWS_DEFINER_ENTRY(IPSEC_SEQUENCE_NUMBER),
+ HWS_DEFINER_ENTRY(IPSEC_SYNDROME),
+ HWS_DEFINER_ENTRY(MPLS0_O),
+ HWS_DEFINER_ENTRY(MPLS1_O),
+ HWS_DEFINER_ENTRY(MPLS2_O),
+ HWS_DEFINER_ENTRY(MPLS3_O),
+ HWS_DEFINER_ENTRY(MPLS4_O),
+ HWS_DEFINER_ENTRY(MPLS0_I),
+ HWS_DEFINER_ENTRY(MPLS1_I),
+ HWS_DEFINER_ENTRY(MPLS2_I),
+ HWS_DEFINER_ENTRY(MPLS3_I),
+ HWS_DEFINER_ENTRY(MPLS4_I),
+ HWS_DEFINER_ENTRY(FLEX_PARSER0_OK),
+ HWS_DEFINER_ENTRY(FLEX_PARSER1_OK),
+ HWS_DEFINER_ENTRY(FLEX_PARSER2_OK),
+ HWS_DEFINER_ENTRY(FLEX_PARSER3_OK),
+ HWS_DEFINER_ENTRY(FLEX_PARSER4_OK),
+ HWS_DEFINER_ENTRY(FLEX_PARSER5_OK),
+ HWS_DEFINER_ENTRY(FLEX_PARSER6_OK),
+ HWS_DEFINER_ENTRY(FLEX_PARSER7_OK),
+ HWS_DEFINER_ENTRY(OKS2_MPLS0_O),
+ HWS_DEFINER_ENTRY(OKS2_MPLS1_O),
+ HWS_DEFINER_ENTRY(OKS2_MPLS2_O),
+ HWS_DEFINER_ENTRY(OKS2_MPLS3_O),
+ HWS_DEFINER_ENTRY(OKS2_MPLS4_O),
+ HWS_DEFINER_ENTRY(OKS2_MPLS0_I),
+ HWS_DEFINER_ENTRY(OKS2_MPLS1_I),
+ HWS_DEFINER_ENTRY(OKS2_MPLS2_I),
+ HWS_DEFINER_ENTRY(OKS2_MPLS3_I),
+ HWS_DEFINER_ENTRY(OKS2_MPLS4_I),
+ HWS_DEFINER_ENTRY(GENEVE_OPT_OK_0),
+ HWS_DEFINER_ENTRY(GENEVE_OPT_OK_1),
+ HWS_DEFINER_ENTRY(GENEVE_OPT_OK_2),
+ HWS_DEFINER_ENTRY(GENEVE_OPT_OK_3),
+ HWS_DEFINER_ENTRY(GENEVE_OPT_OK_4),
+ HWS_DEFINER_ENTRY(GENEVE_OPT_OK_5),
+ HWS_DEFINER_ENTRY(GENEVE_OPT_OK_6),
+ HWS_DEFINER_ENTRY(GENEVE_OPT_OK_7),
+ HWS_DEFINER_ENTRY(GENEVE_OPT_DW_0),
+ HWS_DEFINER_ENTRY(GENEVE_OPT_DW_1),
+ HWS_DEFINER_ENTRY(GENEVE_OPT_DW_2),
+ HWS_DEFINER_ENTRY(GENEVE_OPT_DW_3),
+ HWS_DEFINER_ENTRY(GENEVE_OPT_DW_4),
+ HWS_DEFINER_ENTRY(GENEVE_OPT_DW_5),
+ HWS_DEFINER_ENTRY(GENEVE_OPT_DW_6),
+ HWS_DEFINER_ENTRY(GENEVE_OPT_DW_7),
+ HWS_DEFINER_ENTRY(IB_L4_OPCODE),
+ HWS_DEFINER_ENTRY(IB_L4_QPN),
+ HWS_DEFINER_ENTRY(IB_L4_A),
+ HWS_DEFINER_ENTRY(RANDOM_NUM),
+ HWS_DEFINER_ENTRY(PTYPE_L2_O),
+ HWS_DEFINER_ENTRY(PTYPE_L2_I),
+ HWS_DEFINER_ENTRY(PTYPE_L3_O),
+ HWS_DEFINER_ENTRY(PTYPE_L3_I),
+ HWS_DEFINER_ENTRY(PTYPE_L4_O),
+ HWS_DEFINER_ENTRY(PTYPE_L4_I),
+ HWS_DEFINER_ENTRY(PTYPE_L4_EXT_O),
+ HWS_DEFINER_ENTRY(PTYPE_L4_EXT_I),
+ HWS_DEFINER_ENTRY(PTYPE_FRAG_O),
+ HWS_DEFINER_ENTRY(PTYPE_FRAG_I),
+ HWS_DEFINER_ENTRY(TNL_HDR_0),
+ HWS_DEFINER_ENTRY(TNL_HDR_1),
+ HWS_DEFINER_ENTRY(TNL_HDR_2),
+ HWS_DEFINER_ENTRY(TNL_HDR_3),
+ [MLX5HWS_DEFINER_FNAME_MAX] = "DEFINER_FNAME_UNKNOWN",
+};
+
+const char *mlx5hws_definer_fname_to_str(enum mlx5hws_definer_fname fname)
+{
+ if (fname > MLX5HWS_DEFINER_FNAME_MAX)
+ fname = MLX5HWS_DEFINER_FNAME_MAX;
+ return hws_definer_fname_to_str[fname];
+}
+
static void
hws_definer_ones_set(struct mlx5hws_definer_fc *fc,
void *match_param,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.h
index 5c1a2086efba..62da55389331 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.h
@@ -831,4 +831,6 @@ mlx5hws_definer_conv_match_params_to_compressed_fc(struct mlx5hws_context *ctx,
u32 *match_param,
int *fc_sz);
+const char *mlx5hws_definer_fname_to_str(enum mlx5hws_definer_fname fname);
+
#endif /* HWS_DEFINER_H_ */
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,120 @@
From 7287dfe961f39c52d8e7fbc0a719036457df2b41 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:42:04 -0400
Subject: [PATCH] net/mlx5: HWS, expose polling function in header file
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit 3c739d1624e3c3186a0a0248e91851a085f6e45b
Author: Yevgeny Kliteynik <kliteyn@nvidia.com>
Date: Sun May 11 22:38:03 2025 +0300
net/mlx5: HWS, expose polling function in header file
In preparation for complex matcher, expose the function that is
polling queue for completion (mlx5hws_bwc_queue_poll) in header
file, so that it will be used by complex matcher code.
Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Reviewed-by: Vlad Dogaru <vdogaru@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1746992290-568936-4-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c
index 510bfbbe5991..27b6420678d8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c
@@ -223,10 +223,10 @@ int mlx5hws_bwc_matcher_destroy(struct mlx5hws_bwc_matcher *bwc_matcher)
return 0;
}
-static int hws_bwc_queue_poll(struct mlx5hws_context *ctx,
- u16 queue_id,
- u32 *pending_rules,
- bool drain)
+int mlx5hws_bwc_queue_poll(struct mlx5hws_context *ctx,
+ u16 queue_id,
+ u32 *pending_rules,
+ bool drain)
{
unsigned long timeout = jiffies +
secs_to_jiffies(MLX5HWS_BWC_POLLING_TIMEOUT);
@@ -361,7 +361,8 @@ hws_bwc_rule_destroy_hws_sync(struct mlx5hws_bwc_rule *bwc_rule,
if (unlikely(ret))
return ret;
- ret = hws_bwc_queue_poll(ctx, rule_attr->queue_id, &expected_completions, true);
+ ret = mlx5hws_bwc_queue_poll(ctx, rule_attr->queue_id,
+ &expected_completions, true);
if (unlikely(ret))
return ret;
@@ -442,9 +443,8 @@ hws_bwc_rule_create_sync(struct mlx5hws_bwc_rule *bwc_rule,
if (unlikely(ret))
return ret;
- ret = hws_bwc_queue_poll(ctx, rule_attr->queue_id, &expected_completions, true);
-
- return ret;
+ return mlx5hws_bwc_queue_poll(ctx, rule_attr->queue_id,
+ &expected_completions, true);
}
static int
@@ -465,7 +465,8 @@ hws_bwc_rule_update_sync(struct mlx5hws_bwc_rule *bwc_rule,
if (unlikely(ret))
return ret;
- ret = hws_bwc_queue_poll(ctx, rule_attr->queue_id, &expected_completions, true);
+ ret = mlx5hws_bwc_queue_poll(ctx, rule_attr->queue_id,
+ &expected_completions, true);
if (unlikely(ret))
mlx5hws_err(ctx, "Failed updating BWC rule (%d)\n", ret);
@@ -651,8 +652,10 @@ static int hws_bwc_matcher_move_all_simple(struct mlx5hws_bwc_matcher *bwc_match
&bwc_matcher->rules[i]) ?
NULL : list_next_entry(bwc_rules[i], list_node);
- ret = hws_bwc_queue_poll(ctx, rule_attr.queue_id,
- &pending_rules[i], false);
+ ret = mlx5hws_bwc_queue_poll(ctx,
+ rule_attr.queue_id,
+ &pending_rules[i],
+ false);
if (unlikely(ret)) {
mlx5hws_err(ctx,
"Moving BWC rule failed during rehash (%d)\n",
@@ -669,8 +672,8 @@ static int hws_bwc_matcher_move_all_simple(struct mlx5hws_bwc_matcher *bwc_match
u16 queue_id = mlx5hws_bwc_get_queue_id(ctx, i);
mlx5hws_send_engine_flush_queue(&ctx->send_queue[queue_id]);
- ret = hws_bwc_queue_poll(ctx, queue_id,
- &pending_rules[i], true);
+ ret = mlx5hws_bwc_queue_poll(ctx, queue_id,
+ &pending_rules[i], true);
if (unlikely(ret)) {
mlx5hws_err(ctx,
"Moving BWC rule failed during rehash (%d)\n", ret);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h
index bb0cf4b922ce..a2aa2d5da694 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h
@@ -64,6 +64,11 @@ void mlx5hws_bwc_rule_fill_attr(struct mlx5hws_bwc_matcher *bwc_matcher,
u32 flow_source,
struct mlx5hws_rule_attr *rule_attr);
+int mlx5hws_bwc_queue_poll(struct mlx5hws_context *ctx,
+ u16 queue_id,
+ u32 *pending_rules,
+ bool drain);
+
static inline u16 mlx5hws_bwc_queues(struct mlx5hws_context *ctx)
{
/* Besides the control queue, half of the queues are
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,414 @@
From e52f06951f1709e7bd3b78b3c4932d5fc69d10eb Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:42:04 -0400
Subject: [PATCH] net/mlx5: HWS, introduce isolated matchers
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit b816743a182f532faaeaa9aaed147ff09513e375
Author: Yevgeny Kliteynik <kliteyn@nvidia.com>
Date: Sun May 11 22:38:04 2025 +0300
net/mlx5: HWS, introduce isolated matchers
In preparation for complex matcher support, introduce the isolated
matcher.
Isolated matcher is a matcher that has its own isolated table.
It is used as the second half of the complex matcher: when the rule
is split into two parts (complex rule), then matching on the first
part will send the packet to the isolated matcher that will try to
match on the second part. In case of miss, the packet goes back to
the matcher's end flow table.
Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Reviewed-by: Vlad Dogaru <vdogaru@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1746992290-568936-5-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c
index 5b0c1623499b..ce28ee1c0e41 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c
@@ -23,19 +23,199 @@ static void hws_matcher_destroy_end_ft(struct mlx5hws_matcher *matcher)
mlx5hws_table_destroy_default_ft(matcher->tbl, matcher->end_ft_id);
}
+int mlx5hws_matcher_update_end_ft_isolated(struct mlx5hws_table *tbl,
+ u32 miss_ft_id)
+{
+ struct mlx5hws_matcher *tmp_matcher;
+
+ if (list_empty(&tbl->matchers_list))
+ return -EINVAL;
+
+ /* Update isolated_matcher_end_ft_id attribute for all
+ * the matchers in isolated table.
+ */
+ list_for_each_entry(tmp_matcher, &tbl->matchers_list, list_node)
+ tmp_matcher->attr.isolated_matcher_end_ft_id = miss_ft_id;
+
+ tmp_matcher = list_last_entry(&tbl->matchers_list,
+ struct mlx5hws_matcher,
+ list_node);
+
+ return mlx5hws_table_ft_set_next_ft(tbl->ctx,
+ tmp_matcher->end_ft_id,
+ tbl->fw_ft_type,
+ miss_ft_id);
+}
+
+static int hws_matcher_connect_end_ft_isolated(struct mlx5hws_matcher *matcher)
+{
+ struct mlx5hws_table *tbl = matcher->tbl;
+ u32 end_ft_id;
+ int ret;
+
+ /* Reset end_ft next RTCs */
+ ret = mlx5hws_table_ft_set_next_rtc(tbl->ctx,
+ matcher->end_ft_id,
+ matcher->tbl->fw_ft_type,
+ 0, 0);
+ if (ret) {
+ mlx5hws_err(tbl->ctx, "Isolated matcher: failed to reset FT's next RTCs\n");
+ return ret;
+ }
+
+ /* Connect isolated matcher's end_ft to the complex matcher's end FT */
+ end_ft_id = matcher->attr.isolated_matcher_end_ft_id;
+ ret = mlx5hws_table_ft_set_next_ft(tbl->ctx,
+ matcher->end_ft_id,
+ matcher->tbl->fw_ft_type,
+ end_ft_id);
+
+ if (ret) {
+ mlx5hws_err(tbl->ctx, "Isolated matcher: failed to set FT's miss_ft_id\n");
+ return ret;
+ }
+
+ return 0;
+}
+
+static int hws_matcher_create_end_ft_isolated(struct mlx5hws_matcher *matcher)
+{
+ struct mlx5hws_table *tbl = matcher->tbl;
+ int ret;
+
+ ret = mlx5hws_table_create_default_ft(tbl->ctx->mdev,
+ tbl,
+ &matcher->end_ft_id);
+ if (ret) {
+ mlx5hws_err(tbl->ctx, "Isolated matcher: failed to create end flow table\n");
+ return ret;
+ }
+
+ ret = hws_matcher_connect_end_ft_isolated(matcher);
+ if (ret) {
+ mlx5hws_err(tbl->ctx, "Isolated matcher: failed to connect end FT\n");
+ goto destroy_default_ft;
+ }
+
+ return 0;
+
+destroy_default_ft:
+ mlx5hws_table_destroy_default_ft(tbl, matcher->end_ft_id);
+ return ret;
+}
+
static int hws_matcher_create_end_ft(struct mlx5hws_matcher *matcher)
{
struct mlx5hws_table *tbl = matcher->tbl;
int ret;
- ret = mlx5hws_table_create_default_ft(tbl->ctx->mdev, tbl, &matcher->end_ft_id);
+ if (mlx5hws_matcher_is_isolated(matcher))
+ ret = hws_matcher_create_end_ft_isolated(matcher);
+ else
+ ret = mlx5hws_table_create_default_ft(tbl->ctx->mdev, tbl,
+ &matcher->end_ft_id);
+
if (ret) {
mlx5hws_err(tbl->ctx, "Failed to create matcher end flow table\n");
return ret;
}
+
+ return 0;
+}
+
+static int hws_matcher_connect_isolated_first(struct mlx5hws_matcher *matcher)
+{
+ struct mlx5hws_table *tbl = matcher->tbl;
+ struct mlx5hws_context *ctx = tbl->ctx;
+ int ret;
+
+ /* Isolated matcher's end_ft is already pointing to the end_ft
+ * of the complex matcher - it was set at creation of end_ft,
+ * so no need to connect it.
+ * We still need to connect the isolated table's start FT to
+ * this matcher's RTC.
+ */
+ ret = mlx5hws_table_ft_set_next_rtc(ctx,
+ tbl->ft_id,
+ tbl->fw_ft_type,
+ matcher->match_ste.rtc_0_id,
+ matcher->match_ste.rtc_1_id);
+ if (ret) {
+ mlx5hws_err(ctx, "Isolated matcher: failed to connect start FT to match RTC\n");
+ return ret;
+ }
+
+ /* Reset table's FT default miss (drop refcount) */
+ ret = mlx5hws_table_ft_set_default_next_ft(tbl, tbl->ft_id);
+ if (ret) {
+ mlx5hws_err(ctx, "Isolated matcher: failed to reset table ft default miss\n");
+ return ret;
+ }
+
+ list_add(&matcher->list_node, &tbl->matchers_list);
+
+ return ret;
+}
+
+static int hws_matcher_connect_isolated_last(struct mlx5hws_matcher *matcher)
+{
+ struct mlx5hws_table *tbl = matcher->tbl;
+ struct mlx5hws_context *ctx = tbl->ctx;
+ struct mlx5hws_matcher *last;
+ int ret;
+
+ last = list_last_entry(&tbl->matchers_list,
+ struct mlx5hws_matcher,
+ list_node);
+
+ /* New matcher's end_ft is already pointing to the end_ft of
+ * the complex matcher.
+ * Connect previous matcher's end_ft to this new matcher RTC.
+ */
+ ret = mlx5hws_table_ft_set_next_rtc(ctx,
+ last->end_ft_id,
+ tbl->fw_ft_type,
+ matcher->match_ste.rtc_0_id,
+ matcher->match_ste.rtc_1_id);
+ if (ret) {
+ mlx5hws_err(ctx,
+ "Isolated matcher: failed to connect matcher end_ft to new match RTC\n");
+ return ret;
+ }
+
+ /* Reset prev matcher FT default miss (drop refcount) */
+ ret = mlx5hws_table_ft_set_default_next_ft(tbl, last->end_ft_id);
+ if (ret) {
+ mlx5hws_err(ctx, "Isolated matcher: failed to reset matcher ft default miss\n");
+ return ret;
+ }
+
+ /* Insert after the last matcher */
+ list_add(&matcher->list_node, &last->list_node);
+
return 0;
}
+static int hws_matcher_connect_isolated(struct mlx5hws_matcher *matcher)
+{
+ /* Isolated matcher is expected to be the only one in its table.
+ * However, it can have a collision matcher, and it can go through
+ * rehash process, in which case we will temporary have both old and
+ * new matchers in the isolated table.
+ * Check if this is the first matcher in the isolated table.
+ */
+ if (list_empty(&matcher->tbl->matchers_list))
+ return hws_matcher_connect_isolated_first(matcher);
+
+ /* If this wasn't the first matcher, then we have 3 possible cases:
+ * - this is a collision matcher for the first matcher
+ * - this is a new rehash dest matcher
+ * - this is a collision matcher for the new rehash dest matcher
+ * The logic to add new matcher is the same for all these cases.
+ */
+ return hws_matcher_connect_isolated_last(matcher);
+}
+
static int hws_matcher_connect(struct mlx5hws_matcher *matcher)
{
struct mlx5hws_table *tbl = matcher->tbl;
@@ -45,6 +225,9 @@ static int hws_matcher_connect(struct mlx5hws_matcher *matcher)
struct mlx5hws_matcher *tmp_matcher;
int ret;
+ if (mlx5hws_matcher_is_isolated(matcher))
+ return hws_matcher_connect_isolated(matcher);
+
/* Find location in matcher list */
if (list_empty(&tbl->matchers_list)) {
list_add(&matcher->list_node, &tbl->matchers_list);
@@ -121,6 +304,92 @@ static int hws_matcher_connect(struct mlx5hws_matcher *matcher)
return ret;
}
+static int hws_matcher_disconnect_isolated(struct mlx5hws_matcher *matcher)
+{
+ struct mlx5hws_matcher *first, *last, *prev, *next;
+ struct mlx5hws_table *tbl = matcher->tbl;
+ struct mlx5hws_context *ctx = tbl->ctx;
+ u32 end_ft_id;
+ int ret;
+
+ first = list_first_entry(&tbl->matchers_list,
+ struct mlx5hws_matcher,
+ list_node);
+ last = list_last_entry(&tbl->matchers_list,
+ struct mlx5hws_matcher,
+ list_node);
+ prev = list_prev_entry(matcher, list_node);
+ next = list_next_entry(matcher, list_node);
+
+ list_del_init(&matcher->list_node);
+
+ if (first == last) {
+ /* This was the only matcher in the list.
+ * Reset isolated table FT next RTCs and connect it
+ * to the whole complex matcher end FT instead.
+ */
+ ret = mlx5hws_table_ft_set_next_rtc(ctx,
+ tbl->ft_id,
+ tbl->fw_ft_type,
+ 0, 0);
+ if (ret) {
+ mlx5hws_err(tbl->ctx, "Isolated matcher: failed to reset FT's next RTCs\n");
+ return ret;
+ }
+
+ end_ft_id = matcher->attr.isolated_matcher_end_ft_id;
+ ret = mlx5hws_table_ft_set_next_ft(tbl->ctx,
+ tbl->ft_id,
+ tbl->fw_ft_type,
+ end_ft_id);
+ if (ret) {
+ mlx5hws_err(tbl->ctx, "Isolated matcher: failed to set FT's miss_ft_id\n");
+ return ret;
+ }
+
+ return 0;
+ }
+
+ /* At this point we know that there are more matchers in the list */
+
+ if (matcher == first) {
+ /* We've disconnected the first matcher.
+ * Now update isolated table default FT.
+ */
+ if (!next)
+ return -EINVAL;
+ return mlx5hws_table_ft_set_next_rtc(ctx,
+ tbl->ft_id,
+ tbl->fw_ft_type,
+ next->match_ste.rtc_0_id,
+ next->match_ste.rtc_1_id);
+ }
+
+ if (matcher == last) {
+ /* If we've disconnected the last matcher - update prev
+ * matcher's end_ft to point to the complex matcher end_ft.
+ */
+ if (!prev)
+ return -EINVAL;
+ return hws_matcher_connect_end_ft_isolated(prev);
+ }
+
+ /* This wasn't the first or the last matcher, which means that it has
+ * both prev and next matchers. Note that this only happens if we're
+ * disconnecting collision matcher of the old matcher during rehash.
+ */
+ if (!prev || !next ||
+ !(matcher->flags & MLX5HWS_MATCHER_FLAGS_COLLISION))
+ return -EINVAL;
+
+ /* Update prev end FT to point to next match RTC */
+ return mlx5hws_table_ft_set_next_rtc(ctx,
+ prev->end_ft_id,
+ tbl->fw_ft_type,
+ next->match_ste.rtc_0_id,
+ next->match_ste.rtc_1_id);
+}
+
static int hws_matcher_disconnect(struct mlx5hws_matcher *matcher)
{
struct mlx5hws_matcher *next = NULL, *prev = NULL;
@@ -128,6 +397,9 @@ static int hws_matcher_disconnect(struct mlx5hws_matcher *matcher)
u32 prev_ft_id = tbl->ft_id;
int ret;
+ if (mlx5hws_matcher_is_isolated(matcher))
+ return hws_matcher_disconnect_isolated(matcher);
+
if (!list_is_first(&matcher->list_node, &tbl->matchers_list)) {
prev = list_prev_entry(matcher, list_node);
prev_ft_id = prev->end_ft_id;
@@ -531,6 +803,8 @@ hws_matcher_process_attr(struct mlx5hws_cmd_query_caps *caps,
attr->table.sz_col_log = hws_matcher_rules_to_tbl_depth(attr->rule.num_log);
matcher->flags |= attr->resizable ? MLX5HWS_MATCHER_FLAGS_RESIZABLE : 0;
+ matcher->flags |= attr->isolated_matcher_end_ft_id ?
+ MLX5HWS_MATCHER_FLAGS_ISOLATED : 0;
return hws_matcher_check_attr_sz(caps, matcher);
}
@@ -617,6 +891,8 @@ hws_matcher_create_col_matcher(struct mlx5hws_matcher *matcher)
col_matcher->attr.table.sz_row_log -= MLX5HWS_MATCHER_ASSURED_ROW_RATIO;
col_matcher->attr.max_num_of_at_attach = matcher->attr.max_num_of_at_attach;
+ col_matcher->attr.isolated_matcher_end_ft_id =
+ matcher->attr.isolated_matcher_end_ft_id;
ret = hws_matcher_process_attr(ctx->caps, col_matcher);
if (ret)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h
index 8e95158a66b5..32e83cddcd60 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h
@@ -34,6 +34,7 @@ enum mlx5hws_matcher_offset {
enum mlx5hws_matcher_flags {
MLX5HWS_MATCHER_FLAGS_COLLISION = 1 << 2,
MLX5HWS_MATCHER_FLAGS_RESIZABLE = 1 << 3,
+ MLX5HWS_MATCHER_FLAGS_ISOLATED = 1 << 4,
};
struct mlx5hws_match_template {
@@ -96,9 +97,17 @@ static inline bool mlx5hws_matcher_is_in_resize(struct mlx5hws_matcher *matcher)
return !!matcher->resize_dst;
}
+static inline bool mlx5hws_matcher_is_isolated(struct mlx5hws_matcher *matcher)
+{
+ return !!(matcher->flags & MLX5HWS_MATCHER_FLAGS_ISOLATED);
+}
+
static inline bool mlx5hws_matcher_is_insert_by_idx(struct mlx5hws_matcher *matcher)
{
return matcher->attr.insert_mode == MLX5HWS_MATCHER_INSERT_BY_INDEX;
}
+int mlx5hws_matcher_update_end_ft_isolated(struct mlx5hws_table *tbl,
+ u32 miss_ft_id);
+
#endif /* HWS_MATCHER_H_ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h
index 5121951f2778..fbd63369da10 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h
@@ -119,6 +119,8 @@ struct mlx5hws_matcher_attr {
};
/* Optional AT attach configuration - Max number of additional AT */
u8 max_num_of_at_attach;
+ /* Optional end FT (miss FT ID) for match RTC (for isolated matcher) */
+ u32 isolated_matcher_end_ft_id;
};
struct mlx5hws_rule_attr {
--
2.50.1 (Apple Git-155)

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,93 @@
From bf97c3c319b7e24f5e432eae209ee48da86864e3 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:42:04 -0400
Subject: [PATCH] net/mlx5: HWS, force rehash when rule insertion failed
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit 9d4024edce1063b616fa8bf7b2363290503cc322
Author: Yevgeny Kliteynik <kliteyn@nvidia.com>
Date: Sun May 11 22:38:06 2025 +0300
net/mlx5: HWS, force rehash when rule insertion failed
Rules are inserted into hash table in accordance with their hash index.
When a certain number of rules is reached, the table is rehashed:
a bigger new table is allocated and all the rules are moved there.
But sometimes a new rule can't be inserted into the hash table
because its index is full, even though the number of rules in the
table is well below the threshold. The hash function is not perfect,
so such cases are not rare. When that happens, we want to do the same
rehash, in order to increase the table size and lower the probability
for such cases.
This patch fixes the usecase where rule insertion was failing, but
rehash couldn't be initiated due to low number of rules: it adds flag
that denotes that rehash is required, even if the number of rules in
the table is below the rehash threshold.
Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Reviewed-by: Vlad Dogaru <vdogaru@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1746992290-568936-7-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c
index d70db6948dbb..dce2605fc99b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c
@@ -169,6 +169,7 @@ mlx5hws_bwc_matcher_create(struct mlx5hws_table *table,
return NULL;
atomic_set(&bwc_matcher->num_of_rules, 0);
+ atomic_set(&bwc_matcher->rehash_required, false);
/* Check if the required match params can be all matched
* in single STE, otherwise complex matcher is needed.
@@ -769,9 +770,9 @@ hws_bwc_matcher_rehash_size(struct mlx5hws_bwc_matcher *bwc_matcher)
/* It is possible that other rule has already performed rehash.
* Need to check again if we really need rehash.
- * If the reason for rehash was size, but not any more - skip rehash.
*/
- if (!hws_bwc_matcher_rehash_size_needed(bwc_matcher,
+ if (!atomic_read(&bwc_matcher->rehash_required) &&
+ !hws_bwc_matcher_rehash_size_needed(bwc_matcher,
atomic_read(&bwc_matcher->num_of_rules)))
return 0;
@@ -782,6 +783,8 @@ hws_bwc_matcher_rehash_size(struct mlx5hws_bwc_matcher *bwc_matcher)
* - destroy the old matcher
*/
+ atomic_set(&bwc_matcher->rehash_required, false);
+
ret = hws_bwc_matcher_extend_size(bwc_matcher);
if (ret)
return ret;
@@ -875,6 +878,7 @@ int mlx5hws_bwc_rule_create_simple(struct mlx5hws_bwc_rule *bwc_rule,
* Try rehash by size and insert rule again - last chance.
*/
+ atomic_set(&bwc_matcher->rehash_required, true);
mutex_unlock(queue_lock);
hws_bwc_lock_all_queues(ctx);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h
index cf2b65146317..d21fc247a510 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h
@@ -30,6 +30,7 @@ struct mlx5hws_bwc_matcher {
u8 size_log;
u32 priority;
atomic_t num_of_rules;
+ atomic_t rehash_required;
struct list_head *rules;
};
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,99 @@
From cfa1d3ecb42da569e4f38ae06c426e91a98f92ea Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:42:05 -0400
Subject: [PATCH] net/mlx5: HWS, fix counting of rules in the matcher
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit 4c56b5cbc323a10ebb6595500fb78fd8a4762efd
Author: Yevgeny Kliteynik <kliteyn@nvidia.com>
Date: Sun May 11 22:38:07 2025 +0300
net/mlx5: HWS, fix counting of rules in the matcher
Currently the counter that counts number of rules in a matcher is
increased only when rule insertion is completed. In a multi-threaded
usecase this can lead to a scenario that many rules can be in process
of insertion in the same matcher, while none of them has completed
the insertion and the rule counter is not updated. This results in
a rule insertion failure for many of them at first attempt, which
leads to all of them requiring rehash and requiring locking of all
the queue locks.
This patch fixes the case by increasing the rule counter in the
beginning of insertion process and decreasing in case of any failure.
Signed-off-by: Vlad Dogaru <vdogaru@nvidia.com>
Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1746992290-568936-8-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c
index dce2605fc99b..7d991a61eeb3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c
@@ -341,16 +341,12 @@ static void hws_bwc_rule_list_add(struct mlx5hws_bwc_rule *bwc_rule, u16 idx)
{
struct mlx5hws_bwc_matcher *bwc_matcher = bwc_rule->bwc_matcher;
- atomic_inc(&bwc_matcher->num_of_rules);
bwc_rule->bwc_queue_idx = idx;
list_add(&bwc_rule->list_node, &bwc_matcher->rules[idx]);
}
static void hws_bwc_rule_list_remove(struct mlx5hws_bwc_rule *bwc_rule)
{
- struct mlx5hws_bwc_matcher *bwc_matcher = bwc_rule->bwc_matcher;
-
- atomic_dec(&bwc_matcher->num_of_rules);
list_del_init(&bwc_rule->list_node);
}
@@ -404,6 +400,7 @@ int mlx5hws_bwc_rule_destroy_simple(struct mlx5hws_bwc_rule *bwc_rule)
mutex_lock(queue_lock);
ret = hws_bwc_rule_destroy_hws_sync(bwc_rule, &attr);
+ atomic_dec(&bwc_matcher->num_of_rules);
hws_bwc_rule_list_remove(bwc_rule);
mutex_unlock(queue_lock);
@@ -840,7 +837,7 @@ int mlx5hws_bwc_rule_create_simple(struct mlx5hws_bwc_rule *bwc_rule,
}
/* check if number of rules require rehash */
- num_of_rules = atomic_read(&bwc_matcher->num_of_rules);
+ num_of_rules = atomic_inc_return(&bwc_matcher->num_of_rules);
if (unlikely(hws_bwc_matcher_rehash_size_needed(bwc_matcher, num_of_rules))) {
mutex_unlock(queue_lock);
@@ -854,6 +851,7 @@ int mlx5hws_bwc_rule_create_simple(struct mlx5hws_bwc_rule *bwc_rule,
bwc_matcher->size_log - MLX5HWS_BWC_MATCHER_SIZE_LOG_STEP,
bwc_matcher->size_log,
ret);
+ atomic_dec(&bwc_matcher->num_of_rules);
return ret;
}
@@ -887,6 +885,7 @@ int mlx5hws_bwc_rule_create_simple(struct mlx5hws_bwc_rule *bwc_rule,
if (ret) {
mlx5hws_err(ctx, "BWC rule insertion: rehash failed (%d)\n", ret);
+ atomic_dec(&bwc_matcher->num_of_rules);
return ret;
}
@@ -902,6 +901,7 @@ int mlx5hws_bwc_rule_create_simple(struct mlx5hws_bwc_rule *bwc_rule,
if (unlikely(ret)) {
mutex_unlock(queue_lock);
mlx5hws_err(ctx, "BWC rule insertion failed (%d)\n", ret);
+ atomic_dec(&bwc_matcher->num_of_rules);
return ret;
}
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,171 @@
From cb05a4cab576c3226584ec674529506113f984f5 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:42:05 -0400
Subject: [PATCH] net/mlx5: HWS, fix redundant extension of action templates
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit 041861b40f599311214a52075140db8be29fd27f
Author: Yevgeny Kliteynik <kliteyn@nvidia.com>
Date: Sun May 11 22:38:08 2025 +0300
net/mlx5: HWS, fix redundant extension of action templates
When a rule is inserted into a matcher, we search for the suitable
action template. If such template is not found, action template array
is extended with the new template. However, when several threads are
performing this in parallel, there is a race - we can end up with
extending the action templates array with the same template.
This patch is doing the following:
- refactor the code to find action template index in rule create and
update, have the common code in an auxiliary function
- after locking all the queues, check again if the action template
array still needs to be extended
Signed-off-by: Vlad Dogaru <vdogaru@nvidia.com>
Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1746992290-568936-9-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c
index 7d991a61eeb3..456fac895f5e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c
@@ -789,6 +789,53 @@ hws_bwc_matcher_rehash_size(struct mlx5hws_bwc_matcher *bwc_matcher)
return hws_bwc_matcher_move(bwc_matcher);
}
+static int hws_bwc_rule_get_at_idx(struct mlx5hws_bwc_rule *bwc_rule,
+ struct mlx5hws_rule_action rule_actions[],
+ u16 bwc_queue_idx)
+{
+ struct mlx5hws_bwc_matcher *bwc_matcher = bwc_rule->bwc_matcher;
+ struct mlx5hws_context *ctx = bwc_matcher->matcher->tbl->ctx;
+ struct mutex *queue_lock; /* Protect the queue */
+ int at_idx, ret;
+
+ /* check if rehash needed due to missing action template */
+ at_idx = hws_bwc_matcher_find_at(bwc_matcher, rule_actions);
+ if (likely(at_idx >= 0))
+ return at_idx;
+
+ /* we need to extend BWC matcher action templates array */
+ queue_lock = hws_bwc_get_queue_lock(ctx, bwc_queue_idx);
+ mutex_unlock(queue_lock);
+ hws_bwc_lock_all_queues(ctx);
+
+ /* check again - perhaps other thread already did extend_at */
+ at_idx = hws_bwc_matcher_find_at(bwc_matcher, rule_actions);
+ if (at_idx >= 0)
+ goto out;
+
+ ret = hws_bwc_matcher_extend_at(bwc_matcher, rule_actions);
+ if (unlikely(ret)) {
+ mlx5hws_err(ctx, "BWC rule: failed extending AT (%d)", ret);
+ at_idx = -EINVAL;
+ goto out;
+ }
+
+ /* action templates array was extended, we need the last idx */
+ at_idx = bwc_matcher->num_of_at - 1;
+ ret = mlx5hws_matcher_attach_at(bwc_matcher->matcher,
+ bwc_matcher->at[at_idx]);
+ if (unlikely(ret)) {
+ mlx5hws_err(ctx, "BWC rule: failed attaching new AT (%d)", ret);
+ at_idx = -EINVAL;
+ goto out;
+ }
+
+out:
+ hws_bwc_unlock_all_queues(ctx);
+ mutex_lock(queue_lock);
+ return at_idx;
+}
+
int mlx5hws_bwc_rule_create_simple(struct mlx5hws_bwc_rule *bwc_rule,
u32 *match_param,
struct mlx5hws_rule_action rule_actions[],
@@ -809,31 +856,12 @@ int mlx5hws_bwc_rule_create_simple(struct mlx5hws_bwc_rule *bwc_rule,
mutex_lock(queue_lock);
- /* check if rehash needed due to missing action template */
- at_idx = hws_bwc_matcher_find_at(bwc_matcher, rule_actions);
+ at_idx = hws_bwc_rule_get_at_idx(bwc_rule, rule_actions, bwc_queue_idx);
if (unlikely(at_idx < 0)) {
- /* we need to extend BWC matcher action templates array */
mutex_unlock(queue_lock);
- hws_bwc_lock_all_queues(ctx);
-
- ret = hws_bwc_matcher_extend_at(bwc_matcher, rule_actions);
- if (unlikely(ret)) {
- hws_bwc_unlock_all_queues(ctx);
- return ret;
- }
-
- /* action templates array was extended, we need the last idx */
- at_idx = bwc_matcher->num_of_at - 1;
-
- ret = mlx5hws_matcher_attach_at(bwc_matcher->matcher,
- bwc_matcher->at[at_idx]);
- if (unlikely(ret)) {
- hws_bwc_unlock_all_queues(ctx);
- return ret;
- }
-
- hws_bwc_unlock_all_queues(ctx);
- mutex_lock(queue_lock);
+ mlx5hws_err(ctx, "BWC rule create: failed getting AT (%d)",
+ ret);
+ return -EINVAL;
}
/* check if number of rules require rehash */
@@ -971,36 +999,11 @@ hws_bwc_rule_action_update(struct mlx5hws_bwc_rule *bwc_rule,
mutex_lock(queue_lock);
- /* check if rehash needed due to missing action template */
- at_idx = hws_bwc_matcher_find_at(bwc_matcher, rule_actions);
+ at_idx = hws_bwc_rule_get_at_idx(bwc_rule, rule_actions, idx);
if (unlikely(at_idx < 0)) {
- /* we need to extend BWC matcher action templates array */
mutex_unlock(queue_lock);
- hws_bwc_lock_all_queues(ctx);
-
- /* check again - perhaps other thread already did extend_at */
- at_idx = hws_bwc_matcher_find_at(bwc_matcher, rule_actions);
- if (likely(at_idx < 0)) {
- ret = hws_bwc_matcher_extend_at(bwc_matcher, rule_actions);
- if (unlikely(ret)) {
- hws_bwc_unlock_all_queues(ctx);
- mlx5hws_err(ctx, "BWC rule update: failed extending AT (%d)", ret);
- return -EINVAL;
- }
-
- /* action templates array was extended, we need the last idx */
- at_idx = bwc_matcher->num_of_at - 1;
-
- ret = mlx5hws_matcher_attach_at(bwc_matcher->matcher,
- bwc_matcher->at[at_idx]);
- if (unlikely(ret)) {
- hws_bwc_unlock_all_queues(ctx);
- return ret;
- }
- }
-
- hws_bwc_unlock_all_queues(ctx);
- mutex_lock(queue_lock);
+ mlx5hws_err(ctx, "BWC rule update: failed getting AT\n");
+ return -EINVAL;
}
ret = hws_bwc_rule_update_sync(bwc_rule,
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,209 @@
From a380ec59fea7f1801a94b324bd7f688f2d3be0dd Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:42:05 -0400
Subject: [PATCH] net/mlx5: HWS, rework rehash loop
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit ef94799a87415790d4297cf06075f99b70c420cd
Author: Yevgeny Kliteynik <kliteyn@nvidia.com>
Date: Sun May 11 22:38:09 2025 +0300
net/mlx5: HWS, rework rehash loop
Reworking the rehash loop - simplifying the code and making it less
error prone:
- Instead of doing round-robin on all the queues with batch of rules in
each cycle, just go over all the queues and move all the rules that
belong to this queue.
- If at some stage of moving the rule we get a failure (which should
not happen), this can't be rolled back. So instead of aborting
rehash and leaving the matcher in a broken state, allow the loop
to continue: attempt to move the rest of the rules and delete the
old matcher. A rule that failed to move to a new matcher will loose
its match STE once the rehash is completed and the old matcher is
deleted, so the rule won't match any traffic any more. This rule's
packets will fall back to the steering pipeline w/o HW offload.
Rehash procedure will return an error, which will cause the rule
insertion to fail for the rule that started this whole rehash.
Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Reviewed-by: Vlad Dogaru <vdogaru@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1746992290-568936-10-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c
index 456fac895f5e..9e057f808ea5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c
@@ -610,95 +610,69 @@ hws_bwc_matcher_find_at(struct mlx5hws_bwc_matcher *bwc_matcher,
static int hws_bwc_matcher_move_all_simple(struct mlx5hws_bwc_matcher *bwc_matcher)
{
+ bool move_error = false, poll_error = false, drain_error = false;
struct mlx5hws_context *ctx = bwc_matcher->matcher->tbl->ctx;
+ struct mlx5hws_matcher *matcher = bwc_matcher->matcher;
u16 bwc_queues = mlx5hws_bwc_queues(ctx);
- struct mlx5hws_bwc_rule **bwc_rules;
struct mlx5hws_rule_attr rule_attr;
- u32 *pending_rules;
- int i, j, ret = 0;
- bool all_done;
- u16 burst_th;
+ struct mlx5hws_bwc_rule *bwc_rule;
+ struct mlx5hws_send_engine *queue;
+ struct list_head *rules_list;
+ u32 pending_rules;
+ int i, ret = 0;
mlx5hws_bwc_rule_fill_attr(bwc_matcher, 0, 0, &rule_attr);
- pending_rules = kcalloc(bwc_queues, sizeof(*pending_rules), GFP_KERNEL);
- if (!pending_rules)
- return -ENOMEM;
-
- bwc_rules = kcalloc(bwc_queues, sizeof(*bwc_rules), GFP_KERNEL);
- if (!bwc_rules) {
- ret = -ENOMEM;
- goto free_pending_rules;
- }
-
for (i = 0; i < bwc_queues; i++) {
if (list_empty(&bwc_matcher->rules[i]))
- bwc_rules[i] = NULL;
- else
- bwc_rules[i] = list_first_entry(&bwc_matcher->rules[i],
- struct mlx5hws_bwc_rule,
- list_node);
- }
+ continue;
- do {
- all_done = true;
+ pending_rules = 0;
+ rule_attr.queue_id = mlx5hws_bwc_get_queue_id(ctx, i);
+ rules_list = &bwc_matcher->rules[i];
- for (i = 0; i < bwc_queues; i++) {
- rule_attr.queue_id = mlx5hws_bwc_get_queue_id(ctx, i);
- burst_th = hws_bwc_get_burst_th(ctx, rule_attr.queue_id);
-
- for (j = 0; j < burst_th && bwc_rules[i]; j++) {
- rule_attr.burst = !!((j + 1) % burst_th);
- ret = mlx5hws_matcher_resize_rule_move(bwc_matcher->matcher,
- bwc_rules[i]->rule,
- &rule_attr);
- if (unlikely(ret)) {
- mlx5hws_err(ctx,
- "Moving BWC rule failed during rehash (%d)\n",
- ret);
- goto free_bwc_rules;
- }
+ list_for_each_entry(bwc_rule, rules_list, list_node) {
+ ret = mlx5hws_matcher_resize_rule_move(matcher,
+ bwc_rule->rule,
+ &rule_attr);
+ if (unlikely(ret && !move_error)) {
+ mlx5hws_err(ctx,
+ "Moving BWC rule: move failed (%d), attempting to move rest of the rules\n",
+ ret);
+ move_error = true;
+ }
- all_done = false;
- pending_rules[i]++;
- bwc_rules[i] = list_is_last(&bwc_rules[i]->list_node,
- &bwc_matcher->rules[i]) ?
- NULL : list_next_entry(bwc_rules[i], list_node);
-
- ret = mlx5hws_bwc_queue_poll(ctx,
- rule_attr.queue_id,
- &pending_rules[i],
- false);
- if (unlikely(ret)) {
- mlx5hws_err(ctx,
- "Moving BWC rule failed during rehash (%d)\n",
- ret);
- goto free_bwc_rules;
- }
+ pending_rules++;
+ ret = mlx5hws_bwc_queue_poll(ctx,
+ rule_attr.queue_id,
+ &pending_rules,
+ false);
+ if (unlikely(ret && !poll_error)) {
+ mlx5hws_err(ctx,
+ "Moving BWC rule: poll failed (%d), attempting to move rest of the rules\n",
+ ret);
+ poll_error = true;
}
}
- } while (!all_done);
-
- /* drain all the bwc queues */
- for (i = 0; i < bwc_queues; i++) {
- if (pending_rules[i]) {
- u16 queue_id = mlx5hws_bwc_get_queue_id(ctx, i);
- mlx5hws_send_engine_flush_queue(&ctx->send_queue[queue_id]);
- ret = mlx5hws_bwc_queue_poll(ctx, queue_id,
- &pending_rules[i], true);
- if (unlikely(ret)) {
+ if (pending_rules) {
+ queue = &ctx->send_queue[rule_attr.queue_id];
+ mlx5hws_send_engine_flush_queue(queue);
+ ret = mlx5hws_bwc_queue_poll(ctx,
+ rule_attr.queue_id,
+ &pending_rules,
+ true);
+ if (unlikely(ret && !drain_error)) {
mlx5hws_err(ctx,
- "Moving BWC rule failed during rehash (%d)\n", ret);
- goto free_bwc_rules;
+ "Moving BWC rule: drain failed (%d), attempting to move rest of the rules\n",
+ ret);
+ drain_error = true;
}
}
}
-free_bwc_rules:
- kfree(bwc_rules);
-free_pending_rules:
- kfree(pending_rules);
+ if (move_error || poll_error || drain_error)
+ ret = -EINVAL;
return ret;
}
@@ -742,15 +716,18 @@ static int hws_bwc_matcher_move(struct mlx5hws_bwc_matcher *bwc_matcher)
}
ret = hws_bwc_matcher_move_all(bwc_matcher);
- if (ret) {
- mlx5hws_err(ctx, "Rehash error: moving rules failed\n");
- return -ENOMEM;
- }
+ if (ret)
+ mlx5hws_err(ctx, "Rehash error: moving rules failed, attempting to remove the old matcher\n");
+
+ /* Error during rehash can't be rolled back.
+ * The best option here is to allow the rehash to complete and remove
+ * the old matcher - can't leave the matcher in the 'in_resize' state.
+ */
bwc_matcher->matcher = new_matcher;
mlx5hws_matcher_destroy(old_matcher);
- return 0;
+ return ret;
}
static int
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,191 @@
From 965036aa649e8bc7524cb8c517e5701242864867 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:42:05 -0400
Subject: [PATCH] net/mlx5: HWS, dump bad completion details
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit 578b856b5e72b7b8cd2390a0e525e240d3e80c92
Author: Yevgeny Kliteynik <kliteyn@nvidia.com>
Date: Sun May 11 22:38:10 2025 +0300
net/mlx5: HWS, dump bad completion details
Failing to insert/delete a rule should not happen. If it does happen,
it would be good to know at which stage it happened and what was the
failure. This patch adds printing of bad CQE details.
Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Reviewed-by: Vlad Dogaru <vdogaru@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1746992290-568936-11-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c
index cb6abc4ab7df..c4b22be19a9b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c
@@ -344,18 +344,133 @@ hws_send_engine_update_rule_resize(struct mlx5hws_send_engine *queue,
}
}
+static void hws_send_engine_dump_error_cqe(struct mlx5hws_send_engine *queue,
+ struct mlx5hws_send_ring_priv *priv,
+ struct mlx5_cqe64 *cqe)
+{
+ u8 wqe_opcode = cqe ? be32_to_cpu(cqe->sop_drop_qpn) >> 24 : 0;
+ struct mlx5hws_context *ctx = priv->rule->matcher->tbl->ctx;
+ u32 opcode = cqe ? get_cqe_opcode(cqe) : 0;
+ struct mlx5hws_rule *rule = priv->rule;
+
+ /* If something bad happens and lots of rules are failing, we don't
+ * want to pollute dmesg. Print only the first bad cqe per engine,
+ * the one that started the avalanche.
+ */
+ if (queue->error_cqe_printed)
+ return;
+
+ queue->error_cqe_printed = true;
+
+ if (mlx5hws_rule_move_in_progress(rule))
+ mlx5hws_err(ctx,
+ "--- rule 0x%08llx: error completion moving rule: phase %s, wqes left %d\n",
+ HWS_PTR_TO_ID(rule),
+ rule->resize_info->state ==
+ MLX5HWS_RULE_RESIZE_STATE_WRITING ? "WRITING" :
+ rule->resize_info->state ==
+ MLX5HWS_RULE_RESIZE_STATE_DELETING ? "DELETING" :
+ "UNKNOWN",
+ rule->pending_wqes);
+ else
+ mlx5hws_err(ctx,
+ "--- rule 0x%08llx: error completion %s (%d), wqes left %d\n",
+ HWS_PTR_TO_ID(rule),
+ rule->status ==
+ MLX5HWS_RULE_STATUS_CREATING ? "CREATING" :
+ rule->status ==
+ MLX5HWS_RULE_STATUS_DELETING ? "DELETING" :
+ rule->status ==
+ MLX5HWS_RULE_STATUS_FAILING ? "FAILING" :
+ rule->status ==
+ MLX5HWS_RULE_STATUS_UPDATING ? "UPDATING" : "NA",
+ rule->status,
+ rule->pending_wqes);
+
+ mlx5hws_err(ctx, " rule 0x%08llx: matcher 0x%llx %s\n",
+ HWS_PTR_TO_ID(rule),
+ HWS_PTR_TO_ID(rule->matcher),
+ (rule->matcher->flags & MLX5HWS_MATCHER_FLAGS_ISOLATED) ?
+ "(isolated)" : "");
+
+ if (!cqe) {
+ mlx5hws_err(ctx, " rule 0x%08llx: no CQE\n",
+ HWS_PTR_TO_ID(rule));
+ return;
+ }
+
+ mlx5hws_err(ctx, " rule 0x%08llx: cqe->opcode = %d %s\n",
+ HWS_PTR_TO_ID(rule), opcode,
+ opcode == MLX5_CQE_REQ ? "(MLX5_CQE_REQ)" :
+ opcode == MLX5_CQE_REQ_ERR ? "(MLX5_CQE_REQ_ERR)" : " ");
+
+ if (opcode == MLX5_CQE_REQ_ERR) {
+ struct mlx5_err_cqe *err_cqe = (struct mlx5_err_cqe *)cqe;
+
+ mlx5hws_err(ctx,
+ " rule 0x%08llx: |--- hw_error_syndrome = 0x%x\n",
+ HWS_PTR_TO_ID(rule),
+ err_cqe->rsvd1[16]);
+ mlx5hws_err(ctx,
+ " rule 0x%08llx: |--- hw_syndrome_type = 0x%x\n",
+ HWS_PTR_TO_ID(rule),
+ err_cqe->rsvd1[17] >> 4);
+ mlx5hws_err(ctx,
+ " rule 0x%08llx: |--- vendor_err_synd = 0x%x\n",
+ HWS_PTR_TO_ID(rule),
+ err_cqe->vendor_err_synd);
+ mlx5hws_err(ctx,
+ " rule 0x%08llx: |--- syndrome = 0x%x\n",
+ HWS_PTR_TO_ID(rule),
+ err_cqe->syndrome);
+ }
+
+ mlx5hws_err(ctx,
+ " rule 0x%08llx: cqe->byte_cnt = 0x%08x\n",
+ HWS_PTR_TO_ID(rule), be32_to_cpu(cqe->byte_cnt));
+ mlx5hws_err(ctx,
+ " rule 0x%08llx: |-- UPDATE STATUS = %s\n",
+ HWS_PTR_TO_ID(rule),
+ (be32_to_cpu(cqe->byte_cnt) & 0x80000000) ?
+ "FAILURE" : "SUCCESS");
+ mlx5hws_err(ctx,
+ " rule 0x%08llx: |------- SYNDROME = %s\n",
+ HWS_PTR_TO_ID(rule),
+ ((be32_to_cpu(cqe->byte_cnt) & 0x00000003) == 1) ?
+ "SET_FLOW_FAIL" :
+ ((be32_to_cpu(cqe->byte_cnt) & 0x00000003) == 2) ?
+ "DISABLE_FLOW_FAIL" : "UNKNOWN");
+ mlx5hws_err(ctx,
+ " rule 0x%08llx: cqe->sop_drop_qpn = 0x%08x\n",
+ HWS_PTR_TO_ID(rule), be32_to_cpu(cqe->sop_drop_qpn));
+ mlx5hws_err(ctx,
+ " rule 0x%08llx: |-send wqe opcode = 0x%02x %s\n",
+ HWS_PTR_TO_ID(rule), wqe_opcode,
+ wqe_opcode == MLX5HWS_WQE_OPCODE_TBL_ACCESS ?
+ "(MLX5HWS_WQE_OPCODE_TBL_ACCESS)" : "(UNKNOWN)");
+ mlx5hws_err(ctx,
+ " rule 0x%08llx: |------------ qpn = 0x%06x\n",
+ HWS_PTR_TO_ID(rule),
+ be32_to_cpu(cqe->sop_drop_qpn) & 0xffffff);
+}
+
static void hws_send_engine_update_rule(struct mlx5hws_send_engine *queue,
struct mlx5hws_send_ring_priv *priv,
u16 wqe_cnt,
- enum mlx5hws_flow_op_status *status)
+ enum mlx5hws_flow_op_status *status,
+ struct mlx5_cqe64 *cqe)
{
priv->rule->pending_wqes--;
- if (*status == MLX5HWS_FLOW_OP_ERROR) {
+ if (unlikely(*status == MLX5HWS_FLOW_OP_ERROR)) {
if (priv->retry_id) {
+ /* If there is a retry_id, then it's not an error yet,
+ * retry to insert this rule in the collision RTC.
+ */
hws_send_engine_retry_post_send(queue, priv, wqe_cnt);
return;
}
+ hws_send_engine_dump_error_cqe(queue, priv, cqe);
/* Some part of the rule failed */
priv->rule->status = MLX5HWS_RULE_STATUS_FAILING;
*priv->used_id = 0;
@@ -420,7 +535,8 @@ static void hws_send_engine_update(struct mlx5hws_send_engine *queue,
if (priv->user_data) {
if (priv->rule) {
- hws_send_engine_update_rule(queue, priv, wqe_cnt, &status);
+ hws_send_engine_update_rule(queue, priv, wqe_cnt,
+ &status, cqe);
/* Completion is provided on the last rule WQE */
if (priv->rule->pending_wqes)
return;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.h
index f833092235c1..3fb8e99309b2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.h
@@ -140,6 +140,7 @@ struct mlx5hws_send_engine {
u16 used_entries;
u16 num_entries;
bool err;
+ bool error_cqe_printed;
struct mutex lock; /* Protects the send engine */
};
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,40 @@
From 5447a8c66fd43b005c3f33fe8b63145af1ce5893 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:42:06 -0400
Subject: [PATCH] net/mlx5: Use to_delayed_work()
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit ee39bae6c141876f5b4c001f6b12b4f8ffb4cd08
Author: Chen Ni <nichen@iscas.ac.cn>
Date: Wed May 14 15:24:19 2025 +0800
net/mlx5: Use to_delayed_work()
Use to_delayed_work() instead of open-coding it.
Signed-off-by: Chen Ni <nichen@iscas.ac.cn>
Acked-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/20250514072419.2707578-1-nichen@iscas.ac.cn
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index e53dbdc0a7a1..b1aeea7c4a91 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -927,8 +927,7 @@ static void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool force
static void cb_timeout_handler(struct work_struct *work)
{
- struct delayed_work *dwork = container_of(work, struct delayed_work,
- work);
+ struct delayed_work *dwork = to_delayed_work(work);
struct mlx5_cmd_work_ent *ent = container_of(dwork,
struct mlx5_cmd_work_ent,
cb_timeout_work);
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,196 @@
From d9f5ece10ab6345c8de1a61e674bcdef45d0ce56 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:42:06 -0400
Subject: [PATCH] net/mlx5: SWS, fix reformat id error handling
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit ca7690dae1269f454572c163ed5271feed060af5
Author: Vlad Dogaru <vdogaru@nvidia.com>
Date: Tue May 20 21:46:39 2025 +0300
net/mlx5: SWS, fix reformat id error handling
The firmware reformat id is a u32 and can't safely be returned as an
int. Because the functions also need a way to signal error, prefer to
return the id as an output parameter and keep the return code only for
success/error.
While we're at it, also extract some duplicate code to fetch the
reformat id from a more generic struct pkt_reformat.
Signed-off-by: Vlad Dogaru <vdogaru@nvidia.com>
Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1747766802-958178-2-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
index a47c29571f64..1af76da8b132 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
@@ -527,7 +527,7 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev,
struct mlx5_flow_rule *dst;
void *in_flow_context, *vlan;
void *in_match_value;
- int reformat_id = 0;
+ u32 reformat_id = 0;
unsigned int inlen;
int dst_cnt_size;
u32 *in, action;
@@ -580,23 +580,21 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev,
MLX5_SET(flow_context, in_flow_context, action, action);
if (!extended_dest && fte->act_dests.action.pkt_reformat) {
- struct mlx5_pkt_reformat *pkt_reformat = fte->act_dests.action.pkt_reformat;
-
- if (pkt_reformat->owner == MLX5_FLOW_RESOURCE_OWNER_SW) {
- reformat_id = mlx5_fs_dr_action_get_pkt_reformat_id(pkt_reformat);
- if (reformat_id < 0) {
- mlx5_core_err(dev,
- "Unsupported SW-owned pkt_reformat type (%d) in FW-owned table\n",
- pkt_reformat->reformat_type);
- err = reformat_id;
- goto err_out;
- }
- } else {
- reformat_id = fte->act_dests.action.pkt_reformat->id;
+ struct mlx5_pkt_reformat *pkt_reformat =
+ fte->act_dests.action.pkt_reformat;
+
+ err = mlx5_fs_get_packet_reformat_id(pkt_reformat,
+ &reformat_id);
+ if (err) {
+ mlx5_core_err(dev,
+ "Unsupported pkt_reformat type (%d)\n",
+ pkt_reformat->reformat_type);
+ goto err_out;
}
}
- MLX5_SET(flow_context, in_flow_context, packet_reformat_id, (u32)reformat_id);
+ MLX5_SET(flow_context, in_flow_context, packet_reformat_id,
+ reformat_id);
if (fte->act_dests.action.modify_hdr) {
if (fte->act_dests.action.modify_hdr->owner == MLX5_FLOW_RESOURCE_OWNER_SW) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index a22ecf141518..c7ce9fc797c4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -1830,14 +1830,33 @@ static int create_auto_flow_group(struct mlx5_flow_table *ft,
return err;
}
+int mlx5_fs_get_packet_reformat_id(struct mlx5_pkt_reformat *pkt_reformat,
+ u32 *id)
+{
+ switch (pkt_reformat->owner) {
+ case MLX5_FLOW_RESOURCE_OWNER_FW:
+ *id = pkt_reformat->id;
+ return 0;
+ case MLX5_FLOW_RESOURCE_OWNER_SW:
+ return mlx5_fs_dr_action_get_pkt_reformat_id(pkt_reformat, id);
+ default:
+ return -EINVAL;
+ }
+}
+
static bool mlx5_pkt_reformat_cmp(struct mlx5_pkt_reformat *p1,
struct mlx5_pkt_reformat *p2)
{
- return p1->owner == p2->owner &&
- (p1->owner == MLX5_FLOW_RESOURCE_OWNER_FW ?
- p1->id == p2->id :
- mlx5_fs_dr_action_get_pkt_reformat_id(p1) ==
- mlx5_fs_dr_action_get_pkt_reformat_id(p2));
+ int err1, err2;
+ u32 id1, id2;
+
+ if (p1->owner != p2->owner)
+ return false;
+
+ err1 = mlx5_fs_get_packet_reformat_id(p1, &id1);
+ err2 = mlx5_fs_get_packet_reformat_id(p2, &id2);
+
+ return !err1 && !err2 && id1 == id2;
}
static bool mlx5_flow_dests_cmp(struct mlx5_flow_destination *d1,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
index 1f523fb761f6..a41d3491d2af 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -387,6 +387,9 @@ u32 mlx5_fs_get_capabilities(struct mlx5_core_dev *dev, enum mlx5_flow_namespace
struct mlx5_flow_root_namespace *find_root(struct fs_node *node);
+int mlx5_fs_get_packet_reformat_id(struct mlx5_pkt_reformat *pkt_reformat,
+ u32 *id);
+
#define fs_get_obj(v, _node) {v = container_of((_node), typeof(*v), node); }
#define fs_list_for_each_entry(pos, root) \
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/fs_dr.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/fs_dr.c
index 8007d3f523c9..f367997ab61e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/fs_dr.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/fs_dr.c
@@ -833,15 +833,21 @@ static u32 mlx5_cmd_dr_get_capabilities(struct mlx5_flow_root_namespace *ns,
return steering_caps;
}
-int mlx5_fs_dr_action_get_pkt_reformat_id(struct mlx5_pkt_reformat *pkt_reformat)
+int
+mlx5_fs_dr_action_get_pkt_reformat_id(struct mlx5_pkt_reformat *pkt_reformat,
+ u32 *reformat_id)
{
+ struct mlx5dr_action *dr_action;
+
switch (pkt_reformat->reformat_type) {
case MLX5_REFORMAT_TYPE_L2_TO_VXLAN:
case MLX5_REFORMAT_TYPE_L2_TO_NVGRE:
case MLX5_REFORMAT_TYPE_L2_TO_L2_TUNNEL:
case MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL:
case MLX5_REFORMAT_TYPE_INSERT_HDR:
- return mlx5dr_action_get_pkt_reformat_id(pkt_reformat->fs_dr_action.dr_action);
+ dr_action = pkt_reformat->fs_dr_action.dr_action;
+ *reformat_id = mlx5dr_action_get_pkt_reformat_id(dr_action);
+ return 0;
}
return -EOPNOTSUPP;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/fs_dr.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/fs_dr.h
index 99a3b2eff6b8..f869f2daefbf 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/fs_dr.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/fs_dr.h
@@ -38,7 +38,9 @@ struct mlx5_fs_dr_table {
bool mlx5_fs_dr_is_supported(struct mlx5_core_dev *dev);
-int mlx5_fs_dr_action_get_pkt_reformat_id(struct mlx5_pkt_reformat *pkt_reformat);
+int
+mlx5_fs_dr_action_get_pkt_reformat_id(struct mlx5_pkt_reformat *pkt_reformat,
+ u32 *reformat_id);
const struct mlx5_flow_cmds *mlx5_fs_cmd_get_dr_cmds(void);
@@ -49,9 +51,11 @@ static inline const struct mlx5_flow_cmds *mlx5_fs_cmd_get_dr_cmds(void)
return NULL;
}
-static inline u32 mlx5_fs_dr_action_get_pkt_reformat_id(struct mlx5_pkt_reformat *pkt_reformat)
+static inline int
+mlx5_fs_dr_action_get_pkt_reformat_id(struct mlx5_pkt_reformat *pkt_reformat,
+ u32 *reformat_id)
{
- return 0;
+ return -EOPNOTSUPP;
}
static inline bool mlx5_fs_dr_is_supported(struct mlx5_core_dev *dev)
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,246 @@
From a2158eb7d1e077c0cf3a4b1bad863e6bd081d31d Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:42:06 -0400
Subject: [PATCH] net/mlx5: HWS, register reformat actions with fw
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit b206d9ec19dfc2db706883ff6b46b259831a033d
Author: Vlad Dogaru <vdogaru@nvidia.com>
Date: Tue May 20 21:46:40 2025 +0300
net/mlx5: HWS, register reformat actions with fw
Hardware steering handles actions differently from firmware, but for
termination rules that use encapsulation the firmware needs to be aware
of the action.
Fix this by registering reformat actions with the firmware the first
time this is needed. To do this, add a third possible owner for an
action, and also a lock to protect against registration of the same
action from different threads.
Signed-off-by: Vlad Dogaru <vdogaru@nvidia.com>
Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1747766802-958178-3-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index c7ce9fc797c4..c330b64a506b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -1839,6 +1839,8 @@ int mlx5_fs_get_packet_reformat_id(struct mlx5_pkt_reformat *pkt_reformat,
return 0;
case MLX5_FLOW_RESOURCE_OWNER_SW:
return mlx5_fs_dr_action_get_pkt_reformat_id(pkt_reformat, id);
+ case MLX5_FLOW_RESOURCE_OWNER_HWS:
+ return mlx5_fs_hws_action_get_pkt_reformat_id(pkt_reformat, id);
default:
return -EINVAL;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
index a41d3491d2af..e6a95b310b55 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -58,6 +58,7 @@ struct mlx5_flow_definer {
enum mlx5_flow_resource_owner {
MLX5_FLOW_RESOURCE_OWNER_FW,
MLX5_FLOW_RESOURCE_OWNER_SW,
+ MLX5_FLOW_RESOURCE_OWNER_HWS,
};
struct mlx5_modify_hdr {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c
index bef4d25c1a2a..aa47a7af6f50 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c
@@ -72,6 +72,11 @@ enum mlx5hws_action_type mlx5hws_action_get_type(struct mlx5hws_action *action)
return action->type;
}
+struct mlx5_core_dev *mlx5hws_action_get_dev(struct mlx5hws_action *action)
+{
+ return action->ctx->mdev;
+}
+
static int hws_action_get_shared_stc_nic(struct mlx5hws_context *ctx,
enum mlx5hws_context_shared_stc_type stc_type,
u8 tbl_type)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c
index 1b787cd66e6f..9d1c0e4b224a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c
@@ -1081,13 +1081,8 @@ static int mlx5_cmd_hws_create_fte(struct mlx5_flow_root_namespace *ns,
struct mlx5hws_bwc_rule *rule;
int err = 0;
- if (mlx5_fs_cmd_is_fw_term_table(ft)) {
- /* Packet reformat on terminamtion table not supported yet */
- if (fte->act_dests.action.action &
- MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT)
- return -EOPNOTSUPP;
+ if (mlx5_fs_cmd_is_fw_term_table(ft))
return mlx5_fs_cmd_get_fw_cmds()->create_fte(ns, ft, group, fte);
- }
err = mlx5_fs_fte_get_hws_actions(ns, ft, group, fte, &ractions);
if (err)
@@ -1362,7 +1357,7 @@ mlx5_cmd_hws_packet_reformat_alloc(struct mlx5_flow_root_namespace *ns,
pkt_reformat->fs_hws_action.pr_data = pr_data;
}
- pkt_reformat->owner = MLX5_FLOW_RESOURCE_OWNER_SW;
+ pkt_reformat->owner = MLX5_FLOW_RESOURCE_OWNER_HWS;
pkt_reformat->fs_hws_action.hws_action = hws_action;
return 0;
@@ -1380,6 +1375,15 @@ static void mlx5_cmd_hws_packet_reformat_dealloc(struct mlx5_flow_root_namespace
struct mlx5_fs_hws_pr *pr_data;
struct mlx5_fs_pool *pr_pool;
+ if (pkt_reformat->fs_hws_action.fw_reformat_id != 0) {
+ struct mlx5_pkt_reformat fw_pkt_reformat = { 0 };
+
+ fw_pkt_reformat.id = pkt_reformat->fs_hws_action.fw_reformat_id;
+ mlx5_fs_cmd_get_fw_cmds()->
+ packet_reformat_dealloc(ns, &fw_pkt_reformat);
+ pkt_reformat->fs_hws_action.fw_reformat_id = 0;
+ }
+
if (pkt_reformat->reformat_type == MLX5_REFORMAT_TYPE_REMOVE_HDR)
return;
@@ -1499,6 +1503,7 @@ static int mlx5_cmd_hws_modify_header_alloc(struct mlx5_flow_root_namespace *ns,
err = -ENOMEM;
goto release_mh;
}
+ mutex_init(&modify_hdr->fs_hws_action.lock);
modify_hdr->fs_hws_action.mh_data = mh_data;
modify_hdr->fs_hws_action.fs_pool = pool;
modify_hdr->owner = MLX5_FLOW_RESOURCE_OWNER_SW;
@@ -1532,6 +1537,58 @@ static void mlx5_cmd_hws_modify_header_dealloc(struct mlx5_flow_root_namespace *
modify_hdr->fs_hws_action.mh_data = NULL;
}
+int
+mlx5_fs_hws_action_get_pkt_reformat_id(struct mlx5_pkt_reformat *pkt_reformat,
+ u32 *reformat_id)
+{
+ enum mlx5_flow_namespace_type ns_type = pkt_reformat->ns_type;
+ struct mutex *lock = &pkt_reformat->fs_hws_action.lock;
+ u32 *id = &pkt_reformat->fs_hws_action.fw_reformat_id;
+ struct mlx5_pkt_reformat fw_pkt_reformat = { 0 };
+ struct mlx5_pkt_reformat_params params = { 0 };
+ struct mlx5_flow_root_namespace *ns;
+ struct mlx5_core_dev *dev;
+ int ret;
+
+ mutex_lock(lock);
+
+ if (*id != 0) {
+ *reformat_id = *id;
+ ret = 0;
+ goto unlock;
+ }
+
+ dev = mlx5hws_action_get_dev(pkt_reformat->fs_hws_action.hws_action);
+ if (!dev) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ ns = mlx5_get_root_namespace(dev, ns_type);
+ if (!ns) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ params.type = pkt_reformat->reformat_type;
+ params.size = pkt_reformat->fs_hws_action.pr_data->data_size;
+ params.data = pkt_reformat->fs_hws_action.pr_data->data;
+
+ ret = mlx5_fs_cmd_get_fw_cmds()->
+ packet_reformat_alloc(ns, &params, ns_type, &fw_pkt_reformat);
+ if (ret)
+ goto unlock;
+
+ *id = fw_pkt_reformat.id;
+ *reformat_id = *id;
+ ret = 0;
+
+unlock:
+ mutex_unlock(lock);
+
+ return ret;
+}
+
static int mlx5_cmd_hws_create_match_definer(struct mlx5_flow_root_namespace *ns,
u16 format_id, u32 *match_mask)
{
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.h
index 8b56298288da..b92d55b2d147 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.h
@@ -41,6 +41,11 @@ struct mlx5_fs_hws_action {
struct mlx5_fs_pool *fs_pool;
struct mlx5_fs_hws_pr *pr_data;
struct mlx5_fs_hws_mh *mh_data;
+ u32 fw_reformat_id;
+ /* Protect `fw_reformat_id` against being initialized from multiple
+ * threads.
+ */
+ struct mutex lock;
};
struct mlx5_fs_hws_matcher {
@@ -84,12 +89,23 @@ void mlx5_fs_put_hws_action(struct mlx5_fs_hws_data *fs_hws_data);
#ifdef CONFIG_MLX5_HW_STEERING
+int
+mlx5_fs_hws_action_get_pkt_reformat_id(struct mlx5_pkt_reformat *pkt_reformat,
+ u32 *reformat_id);
+
bool mlx5_fs_hws_is_supported(struct mlx5_core_dev *dev);
const struct mlx5_flow_cmds *mlx5_fs_cmd_get_hws_cmds(void);
#else
+static inline int
+mlx5_fs_hws_action_get_pkt_reformat_id(struct mlx5_pkt_reformat *pkt_reformat,
+ u32 *reformat_id)
+{
+ return -EOPNOTSUPP;
+}
+
static inline bool mlx5_fs_hws_is_supported(struct mlx5_core_dev *dev)
{
return false;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h
index fbd63369da10..9bbadc4d8a0b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h
@@ -503,6 +503,15 @@ int mlx5hws_rule_action_update(struct mlx5hws_rule *rule,
enum mlx5hws_action_type
mlx5hws_action_get_type(struct mlx5hws_action *action);
+/**
+ * mlx5hws_action_get_dev - Get mlx5 core device.
+ *
+ * @action: The action to get the device from.
+ *
+ * Return: mlx5 core device.
+ */
+struct mlx5_core_dev *mlx5hws_action_get_dev(struct mlx5hws_action *action);
+
/**
* mlx5hws_action_create_dest_drop - Create a direct rule drop action.
*
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,220 @@
From 80492ad30af1ed97b996b34f6e6daac2f98ef98d Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:42:07 -0400
Subject: [PATCH] net/mlx5: HWS, fix typo - 'nope' to 'nop'
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit 0b6e452caf03da63aeb2e84475771d6fb6d6cd99
Author: Yevgeny Kliteynik <kliteyn@nvidia.com>
Date: Tue May 20 21:46:41 2025 +0300
net/mlx5: HWS, fix typo - 'nope' to 'nop'
Fix typo - rename 'nope_locations' to 'nop_locations', which describes
the locations of 'nop' actions. To shorten the lines, this renaming
also required some refactoring.
Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1747766802-958178-4-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c
index aa47a7af6f50..64d115feef2c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c
@@ -1207,16 +1207,16 @@ hws_action_create_modify_header_hws(struct mlx5hws_action *action,
for (i = 0; i < num_of_patterns; i++) {
size_t new_num_actions;
size_t cur_num_actions;
- u32 nope_location;
+ u32 nop_locations;
cur_num_actions = pattern[i].sz / MLX5HWS_MODIFY_ACTION_SIZE;
- mlx5hws_pat_calc_nope(pattern[i].data, cur_num_actions,
- pat_max_sz / MLX5HWS_MODIFY_ACTION_SIZE,
- &new_num_actions, &nope_location,
- &new_pattern[i * pat_max_sz]);
+ mlx5hws_pat_calc_nop(pattern[i].data, cur_num_actions,
+ pat_max_sz / MLX5HWS_MODIFY_ACTION_SIZE,
+ &new_num_actions, &nop_locations,
+ &new_pattern[i * pat_max_sz]);
- action[i].modify_header.nope_locations = nope_location;
+ action[i].modify_header.nop_locations = nop_locations;
action[i].modify_header.num_of_actions = new_num_actions;
max_mh_actions = max(max_mh_actions, new_num_actions);
@@ -1263,7 +1263,7 @@ hws_action_create_modify_header_hws(struct mlx5hws_action *action,
MLX5_GET(set_action_in, pattern[i].data, action_type);
} else {
/* Multiple modify actions require a pattern */
- if (unlikely(action[i].modify_header.nope_locations)) {
+ if (unlikely(action[i].modify_header.nop_locations)) {
size_t pattern_sz;
pattern_sz = action[i].modify_header.num_of_actions *
@@ -2105,12 +2105,12 @@ static void hws_action_modify_write(struct mlx5hws_send_engine *queue,
u32 arg_idx,
u8 *arg_data,
u16 num_of_actions,
- u32 nope_locations)
+ u32 nop_locations)
{
u8 *new_arg_data = NULL;
int i, j;
- if (unlikely(nope_locations)) {
+ if (unlikely(nop_locations)) {
new_arg_data = kcalloc(num_of_actions,
MLX5HWS_MODIFY_ACTION_SIZE, GFP_KERNEL);
if (unlikely(!new_arg_data))
@@ -2118,7 +2118,7 @@ static void hws_action_modify_write(struct mlx5hws_send_engine *queue,
for (i = 0, j = 0; i < num_of_actions; i++, j++) {
memcpy(&new_arg_data[j], arg_data, MLX5HWS_MODIFY_ACTION_SIZE);
- if (BIT(i) & nope_locations)
+ if (BIT(i) & nop_locations)
j++;
}
}
@@ -2215,6 +2215,7 @@ hws_action_setter_modify_header(struct mlx5hws_actions_apply_data *apply,
struct mlx5hws_action *action;
u32 arg_sz, arg_idx;
u8 *single_action;
+ u8 max_actions;
__be32 stc_idx;
rule_action = &apply->rule_action[setter->idx_double];
@@ -2242,21 +2243,23 @@ hws_action_setter_modify_header(struct mlx5hws_actions_apply_data *apply,
apply->wqe_data[MLX5HWS_ACTION_OFFSET_DW7] =
*(__be32 *)MLX5_ADDR_OF(set_action_in, single_action, data);
- } else {
- /* Argument offset multiple with number of args per these actions */
- arg_sz = mlx5hws_arg_get_arg_size(action->modify_header.max_num_of_actions);
- arg_idx = rule_action->modify_header.offset * arg_sz;
-
- apply->wqe_data[MLX5HWS_ACTION_OFFSET_DW7] = htonl(arg_idx);
-
- if (!(action->flags & MLX5HWS_ACTION_FLAG_SHARED)) {
- apply->require_dep = 1;
- hws_action_modify_write(apply->queue,
- action->modify_header.arg_id + arg_idx,
- rule_action->modify_header.data,
- action->modify_header.num_of_actions,
- action->modify_header.nope_locations);
- }
+ return;
+ }
+
+ /* Argument offset multiple with number of args per these actions */
+ max_actions = action->modify_header.max_num_of_actions;
+ arg_sz = mlx5hws_arg_get_arg_size(max_actions);
+ arg_idx = rule_action->modify_header.offset * arg_sz;
+
+ apply->wqe_data[MLX5HWS_ACTION_OFFSET_DW7] = htonl(arg_idx);
+
+ if (!(action->flags & MLX5HWS_ACTION_FLAG_SHARED)) {
+ apply->require_dep = 1;
+ hws_action_modify_write(apply->queue,
+ action->modify_header.arg_id + arg_idx,
+ rule_action->modify_header.data,
+ action->modify_header.num_of_actions,
+ action->modify_header.nop_locations);
}
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.h
index 25fa0d4c9221..55a079fdd08f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.h
@@ -136,7 +136,7 @@ struct mlx5hws_action {
u32 pat_id;
u32 arg_id;
__be64 single_action;
- u32 nope_locations;
+ u32 nop_locations;
u8 num_of_patterns;
u8 single_action_type;
u8 num_of_actions;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.c
index f51ed24526b9..78de19c074a7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.c
@@ -522,9 +522,9 @@ bool mlx5hws_pat_verify_actions(struct mlx5hws_context *ctx, __be64 pattern[], s
return true;
}
-void mlx5hws_pat_calc_nope(__be64 *pattern, size_t num_actions,
- size_t max_actions, size_t *new_size,
- u32 *nope_location, __be64 *new_pat)
+void mlx5hws_pat_calc_nop(__be64 *pattern, size_t num_actions,
+ size_t max_actions, size_t *new_size,
+ u32 *nop_locations, __be64 *new_pat)
{
u16 prev_src_field = 0, prev_dst_field = 0;
u16 src_field, dst_field;
@@ -532,7 +532,7 @@ void mlx5hws_pat_calc_nope(__be64 *pattern, size_t num_actions,
size_t i, j;
*new_size = num_actions;
- *nope_location = 0;
+ *nop_locations = 0;
if (num_actions == 1)
return;
@@ -546,18 +546,18 @@ void mlx5hws_pat_calc_nope(__be64 *pattern, size_t num_actions,
if (action_type == MLX5_ACTION_TYPE_COPY &&
(prev_src_field == src_field ||
prev_dst_field == dst_field)) {
- /* need Nope */
+ /* need Nop */
*new_size += 1;
- *nope_location |= BIT(i);
+ *nop_locations |= BIT(i);
memset(&new_pat[j], 0, MLX5HWS_MODIFY_ACTION_SIZE);
MLX5_SET(set_action_in, &new_pat[j],
action_type,
MLX5_MODIFICATION_TYPE_NOP);
j++;
} else if (prev_src_field == src_field) {
- /* need Nope*/
+ /* need Nop */
*new_size += 1;
- *nope_location |= BIT(i);
+ *nop_locations |= BIT(i);
MLX5_SET(set_action_in, &new_pat[j],
action_type,
MLX5_MODIFICATION_TYPE_NOP);
@@ -568,7 +568,7 @@ void mlx5hws_pat_calc_nope(__be64 *pattern, size_t num_actions,
/* check if no more space */
if (j > max_actions) {
*new_size = num_actions;
- *nope_location = 0;
+ *nop_locations = 0;
return;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.h
index 8ddb51980044..91bd2572a341 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.h
@@ -96,6 +96,7 @@ int mlx5hws_arg_write_inline_arg_data(struct mlx5hws_context *ctx,
u8 *arg_data,
size_t data_size);
-void mlx5hws_pat_calc_nope(__be64 *pattern, size_t num_actions, size_t max_actions,
- size_t *new_size, u32 *nope_location, __be64 *new_pat);
+void mlx5hws_pat_calc_nop(__be64 *pattern, size_t num_actions,
+ size_t max_actions, size_t *new_size,
+ u32 *nop_locations, __be64 *new_pat);
#endif /* MLX5HWS_PAT_ARG_H_ */
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,228 @@
From 49dd65e6ee03b3423cd69b885fdab5ae5ec32303 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:42:07 -0400
Subject: [PATCH] net/mlx5: HWS, handle modify header actions dependency
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit 01e035fd0380b285d72725adb5a45f1d73549db8
Author: Yevgeny Kliteynik <kliteyn@nvidia.com>
Date: Tue May 20 21:46:42 2025 +0300
net/mlx5: HWS, handle modify header actions dependency
Having adjacent accelerated modify header actions (so-called
pattern-argument actions) may result in inconsistent outcome.
These inconsistencies can take the form of writes to the same
field or a read coupled with a write to the same field. The
solution is to detect such dependencies and insert nops between
the offending actions.
The existing implementation had a few issues, which pretty much
required a complete rewrite of the code that handles these
dependencies.
In the new implementation we're doing the following:
* Checking any two adjacent actions for conflicts (not just
odd-even pairs).
* Marking 'set' and 'add' action fields as destination, rather
than source, for the purposes of checking for conflicts.
* Checking all types of actions ('add', 'set', 'copy') for
dependencies.
* Managing offsets of the args in the buffer - copy the action
args to the right place in the buffer.
* Checking that after inserting nops we're still within the number
of supported actions - return an error otherwise.
Signed-off-by: Vlad Dogaru <vdogaru@nvidia.com>
Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1747766802-958178-5-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c
index 64d115feef2c..fb62f3bc4bd4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c
@@ -1190,14 +1190,15 @@ hws_action_create_modify_header_hws(struct mlx5hws_action *action,
struct mlx5hws_action_mh_pattern *pattern,
u32 log_bulk_size)
{
+ u16 num_actions, max_mh_actions = 0, hw_max_actions;
struct mlx5hws_context *ctx = action->ctx;
- u16 num_actions, max_mh_actions = 0;
int i, ret, size_in_bytes;
u32 pat_id, arg_id = 0;
__be64 *new_pattern;
size_t pat_max_sz;
pat_max_sz = MLX5HWS_ARG_CHUNK_SIZE_MAX * MLX5HWS_ARG_DATA_SIZE;
+ hw_max_actions = pat_max_sz / MLX5HWS_MODIFY_ACTION_SIZE;
size_in_bytes = pat_max_sz * sizeof(__be64);
new_pattern = kcalloc(num_of_patterns, size_in_bytes, GFP_KERNEL);
if (!new_pattern)
@@ -1211,10 +1212,14 @@ hws_action_create_modify_header_hws(struct mlx5hws_action *action,
cur_num_actions = pattern[i].sz / MLX5HWS_MODIFY_ACTION_SIZE;
- mlx5hws_pat_calc_nop(pattern[i].data, cur_num_actions,
- pat_max_sz / MLX5HWS_MODIFY_ACTION_SIZE,
- &new_num_actions, &nop_locations,
- &new_pattern[i * pat_max_sz]);
+ ret = mlx5hws_pat_calc_nop(pattern[i].data, cur_num_actions,
+ hw_max_actions, &new_num_actions,
+ &nop_locations,
+ &new_pattern[i * pat_max_sz]);
+ if (ret) {
+ mlx5hws_err(ctx, "Too many actions after nop insertion\n");
+ goto free_new_pat;
+ }
action[i].modify_header.nop_locations = nop_locations;
action[i].modify_header.num_of_actions = new_num_actions;
@@ -2116,10 +2121,12 @@ static void hws_action_modify_write(struct mlx5hws_send_engine *queue,
if (unlikely(!new_arg_data))
return;
- for (i = 0, j = 0; i < num_of_actions; i++, j++) {
- memcpy(&new_arg_data[j], arg_data, MLX5HWS_MODIFY_ACTION_SIZE);
+ for (i = 0, j = 0; j < num_of_actions; i++, j++) {
if (BIT(i) & nop_locations)
j++;
+ memcpy(&new_arg_data[j * MLX5HWS_MODIFY_ACTION_SIZE],
+ &arg_data[i * MLX5HWS_MODIFY_ACTION_SIZE],
+ MLX5HWS_MODIFY_ACTION_SIZE);
}
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.c
index 78de19c074a7..51e4c551e0ef 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.c
@@ -490,8 +490,8 @@ hws_action_modify_get_target_fields(u8 action_type, __be64 *pattern,
switch (action_type) {
case MLX5_ACTION_TYPE_SET:
case MLX5_ACTION_TYPE_ADD:
- *src_field = MLX5_GET(set_action_in, pattern, field);
- *dst_field = INVALID_FIELD;
+ *src_field = INVALID_FIELD;
+ *dst_field = MLX5_GET(set_action_in, pattern, field);
break;
case MLX5_ACTION_TYPE_COPY:
*src_field = MLX5_GET(copy_action_in, pattern, src_field);
@@ -522,57 +522,59 @@ bool mlx5hws_pat_verify_actions(struct mlx5hws_context *ctx, __be64 pattern[], s
return true;
}
-void mlx5hws_pat_calc_nop(__be64 *pattern, size_t num_actions,
- size_t max_actions, size_t *new_size,
- u32 *nop_locations, __be64 *new_pat)
+int mlx5hws_pat_calc_nop(__be64 *pattern, size_t num_actions,
+ size_t max_actions, size_t *new_size,
+ u32 *nop_locations, __be64 *new_pat)
{
- u16 prev_src_field = 0, prev_dst_field = 0;
+ u16 prev_src_field = INVALID_FIELD, prev_dst_field = INVALID_FIELD;
u16 src_field, dst_field;
u8 action_type;
+ bool dependent;
size_t i, j;
*new_size = num_actions;
*nop_locations = 0;
if (num_actions == 1)
- return;
+ return 0;
for (i = 0, j = 0; i < num_actions; i++, j++) {
- action_type = MLX5_GET(set_action_in, &pattern[i], action_type);
+ if (j >= max_actions)
+ return -EINVAL;
+ action_type = MLX5_GET(set_action_in, &pattern[i], action_type);
hws_action_modify_get_target_fields(action_type, &pattern[i],
&src_field, &dst_field);
- if (i % 2) {
- if (action_type == MLX5_ACTION_TYPE_COPY &&
- (prev_src_field == src_field ||
- prev_dst_field == dst_field)) {
- /* need Nop */
- *new_size += 1;
- *nop_locations |= BIT(i);
- memset(&new_pat[j], 0, MLX5HWS_MODIFY_ACTION_SIZE);
- MLX5_SET(set_action_in, &new_pat[j],
- action_type,
- MLX5_MODIFICATION_TYPE_NOP);
- j++;
- } else if (prev_src_field == src_field) {
- /* need Nop */
- *new_size += 1;
- *nop_locations |= BIT(i);
- MLX5_SET(set_action_in, &new_pat[j],
- action_type,
- MLX5_MODIFICATION_TYPE_NOP);
- j++;
- }
- }
- memcpy(&new_pat[j], &pattern[i], MLX5HWS_MODIFY_ACTION_SIZE);
- /* check if no more space */
- if (j > max_actions) {
- *new_size = num_actions;
- *nop_locations = 0;
- return;
+
+ /* For every action, look at it and the previous one. The two
+ * actions are dependent if:
+ */
+ dependent =
+ (i > 0) &&
+ /* At least one of the actions is a write and */
+ (dst_field != INVALID_FIELD ||
+ prev_dst_field != INVALID_FIELD) &&
+ /* One reads from the other's source */
+ (dst_field == prev_src_field ||
+ src_field == prev_dst_field ||
+ /* Or both write to the same destination */
+ dst_field == prev_dst_field);
+
+ if (dependent) {
+ *new_size += 1;
+ *nop_locations |= BIT(i);
+ memset(&new_pat[j], 0, MLX5HWS_MODIFY_ACTION_SIZE);
+ MLX5_SET(set_action_in, &new_pat[j], action_type,
+ MLX5_MODIFICATION_TYPE_NOP);
+ j++;
+ if (j >= max_actions)
+ return -EINVAL;
}
+ memcpy(&new_pat[j], &pattern[i], MLX5HWS_MODIFY_ACTION_SIZE);
prev_src_field = src_field;
prev_dst_field = dst_field;
}
+
+ return 0;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.h
index 91bd2572a341..7fbd8dc7aa18 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.h
@@ -96,7 +96,7 @@ int mlx5hws_arg_write_inline_arg_data(struct mlx5hws_context *ctx,
u8 *arg_data,
size_t data_size);
-void mlx5hws_pat_calc_nop(__be64 *pattern, size_t num_actions,
- size_t max_actions, size_t *new_size,
- u32 *nop_locations, __be64 *new_pat);
+int mlx5hws_pat_calc_nop(__be64 *pattern, size_t num_actions,
+ size_t max_actions, size_t *new_size,
+ u32 *nop_locations, __be64 *new_pat);
#endif /* MLX5HWS_PAT_ARG_H_ */
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,64 @@
From 07d72a58ed409c2ba3a0099febcaf5e9186274e4 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:42:07 -0400
Subject: [PATCH] net/mlx5_core: Add error handling
inmlx5_query_nic_vport_qkey_viol_cntr()
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit f0b50730bdd8f2734e548de541e845c0d40dceb6
Author: Wentao Liang <vulab@iscas.ac.cn>
Date: Wed May 21 21:36:20 2025 +0800
net/mlx5_core: Add error handling inmlx5_query_nic_vport_qkey_viol_cntr()
The function mlx5_query_nic_vport_qkey_viol_cntr() calls the function
mlx5_query_nic_vport_context() but does not check its return value. This
could lead to undefined behavior if the query fails. A proper
implementation can be found in mlx5_nic_vport_query_local_lb().
Add error handling for mlx5_query_nic_vport_context(). If it fails, free
the out buffer via kvfree() and return error code.
Fixes: 9efa75254593 ("net/mlx5_core: Introduce access functions to query vport RoCE fields")
Cc: stable@vger.kernel.org # v4.5
Signed-off-by: Wentao Liang <vulab@iscas.ac.cn>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/20250521133620.912-1-vulab@iscas.ac.cn
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
index d10d4c396040..a3c57bb8b521 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
@@ -519,19 +519,22 @@ int mlx5_query_nic_vport_qkey_viol_cntr(struct mlx5_core_dev *mdev,
{
u32 *out;
int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out);
+ int err;
out = kvzalloc(outlen, GFP_KERNEL);
if (!out)
return -ENOMEM;
- mlx5_query_nic_vport_context(mdev, 0, out);
+ err = mlx5_query_nic_vport_context(mdev, 0, out);
+ if (err)
+ goto out;
*qkey_viol_cntr = MLX5_GET(query_nic_vport_context_out, out,
nic_vport_context.qkey_violation_counter);
-
+out:
kvfree(out);
- return 0;
+ return err;
}
EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_qkey_viol_cntr);
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,41 @@
From 8c0cb6c3c0865ac0421987596b2b3d90ec2f2510 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:42:08 -0400
Subject: [PATCH] net/mlx5e: Allow setting MAC address of representors
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit f95633adc177416ac21f16db9ce1e75c74db805a
Author: Mark Bloch <mbloch@nvidia.com>
Date: Thu May 22 10:13:56 2025 +0300
net/mlx5e: Allow setting MAC address of representors
A representor netdev does not correspond to real hardware that needs to
be updated when setting the MAC address. The default eth_mac_addr() is
sufficient for simply updating the netdev's MAC address with validation.
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Dragos Tatulea <dtatulea@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/1747898036-1121904-1-git-send-email-tariqt@nvidia.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 58cd153ccc61..2640cace0f76 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -803,6 +803,7 @@ static const struct net_device_ops mlx5e_netdev_ops_rep = {
.ndo_stop = mlx5e_rep_close,
.ndo_start_xmit = mlx5e_xmit,
.ndo_setup_tc = mlx5e_rep_setup_tc,
+ .ndo_set_mac_address = eth_mac_addr,
.ndo_get_stats64 = mlx5e_rep_get_stats,
.ndo_has_offload_stats = mlx5e_rep_has_offload_stats,
.ndo_get_offload_stats = mlx5e_rep_get_offload_stats,
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,63 @@
From dd1d14ed8fcc14c857f5bcd97ad842a291d390e6 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:42:08 -0400
Subject: [PATCH] net/mlx5: Add error handling in
mlx5_query_nic_vport_node_guid()
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit c6bb8a21cdad8c975a3a646b9e5c8df01ad29783
Author: Wentao Liang <vulab@iscas.ac.cn>
Date: Sun May 25 00:34:25 2025 +0800
net/mlx5: Add error handling in mlx5_query_nic_vport_node_guid()
The function mlx5_query_nic_vport_node_guid() calls the function
mlx5_query_nic_vport_context() but does not check its return value.
A proper implementation can be found in mlx5_nic_vport_query_local_lb().
Add error handling for mlx5_query_nic_vport_context(). If it fails, free
the out buffer via kvfree() and return error code.
Fixes: 9efa75254593 ("net/mlx5_core: Introduce access functions to query vport RoCE fields")
Cc: stable@vger.kernel.org # v4.5
Signed-off-by: Wentao Liang <vulab@iscas.ac.cn>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/20250524163425.1695-1-vulab@iscas.ac.cn
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
index a3c57bb8b521..da5c24fc7b30 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
@@ -465,19 +465,22 @@ int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid)
{
u32 *out;
int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out);
+ int err;
out = kvzalloc(outlen, GFP_KERNEL);
if (!out)
return -ENOMEM;
- mlx5_query_nic_vport_context(mdev, 0, out);
+ err = mlx5_query_nic_vport_context(mdev, 0, out);
+ if (err)
+ goto out;
*node_guid = MLX5_GET64(query_nic_vport_context_out, out,
nic_vport_context.node_guid);
-
+out:
kvfree(out);
- return 0;
+ return err;
}
EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_node_guid);
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,43 @@
From e2f7050c9a027152fc3001519b825cfd333320c0 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:42:08 -0400
Subject: [PATCH] net/mlx5: HWS, Fix an error code in
mlx5hws_bwc_rule_create_complex()
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit a540ee75945a96f606c6ac955bfed5410d318f7d
Author: Dan Carpenter <dan.carpenter@linaro.org>
Date: Fri May 23 19:00:12 2025 +0300
net/mlx5: HWS, Fix an error code in mlx5hws_bwc_rule_create_complex()
This was intended to be negative -ENOMEM but the '-' character was left
off accidentally. This typo doesn't affect runtime because the caller
treats all non-zero returns the same.
Fixes: 17e0accac577 ("net/mlx5: HWS, support complex matchers")
Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
Reviewed-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/aDCbjNcquNC68Hyj@stanley.mountain
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc_complex.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc_complex.c
index 5d30c5b094fc..70768953a4f6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc_complex.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc_complex.c
@@ -1188,7 +1188,7 @@ int mlx5hws_bwc_rule_create_complex(struct mlx5hws_bwc_rule *bwc_rule,
GFP_KERNEL);
if (unlikely(!match_buf_2)) {
mlx5hws_err(ctx, "Complex rule: failed allocating match_buf\n");
- ret = ENOMEM;
+ ret = -ENOMEM;
goto hash_node_put;
}
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,45 @@
From 15be556a20263870bb8b7e0974d3452f4d4b3616 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:42:08 -0400
Subject: [PATCH] net/mlx5: Ensure fw pages are always allocated on same NUMA
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit f37258133c1e95e61db532e14067e28b4881bf24
Author: Moshe Shemesh <moshe@nvidia.com>
Date: Tue Jun 10 18:15:06 2025 +0300
net/mlx5: Ensure fw pages are always allocated on same NUMA
When firmware asks the driver to allocate more pages, using event of
give_pages, the driver should always allocate it from same NUMA, the
original device NUMA. Current code uses dev_to_node() which can result
in different NUMA as it is changed by other driver flows, such as
mlx5_dma_zalloc_coherent_node(). Instead, use saved numa node for
allocating firmware pages.
Fixes: 311c7c71c9bb ("net/mlx5e: Allocate DMA coherent memory on reader NUMA node")
Signed-off-by: Moshe Shemesh <moshe@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Link: https://patch.msgid.link/20250610151514.1094735-2-mbloch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c
index 972e8e9df585..9bc9bd83c232 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c
@@ -291,7 +291,7 @@ static void free_4k(struct mlx5_core_dev *dev, u64 addr, u32 function)
static int alloc_system_page(struct mlx5_core_dev *dev, u32 function)
{
struct device *device = mlx5_core_dma_dev(dev);
- int nid = dev_to_node(device);
+ int nid = dev->priv.numa_node;
struct page *page;
u64 zero_addr = 1;
u64 addr;
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,64 @@
From 92a46cdb282adb03ef6c38e4e163528e9a19f3d7 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:42:09 -0400
Subject: [PATCH] net/mlx5: Fix return value when searching for existing flow
group
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit 8ec40e3f1f72bf8f8accf18020d487caa99f46a4
Author: Patrisious Haddad <phaddad@nvidia.com>
Date: Tue Jun 10 18:15:08 2025 +0300
net/mlx5: Fix return value when searching for existing flow group
When attempting to add a rule to an existing flow group, if a matching
flow group exists but is not active, the error code returned should be
EAGAIN, so that the rule can be added to the matching flow group once
it is active, rather than ENOENT, which indicates that no matching
flow group was found.
Fixes: bd71b08ec2ee ("net/mlx5: Support multiple updates of steering rules in parallel")
Signed-off-by: Gavi Teitz <gavi@nvidia.com>
Signed-off-by: Roi Dayan <roid@nvidia.com>
Signed-off-by: Patrisious Haddad <phaddad@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Link: https://patch.msgid.link/20250610151514.1094735-4-mbloch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index c330b64a506b..5f0f546fa126 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -2228,6 +2228,7 @@ try_add_to_existing_fg(struct mlx5_flow_table *ft,
struct mlx5_flow_handle *rule;
struct match_list *iter;
bool take_write = false;
+ bool try_again = false;
struct fs_fte *fte;
u64 version = 0;
int err;
@@ -2292,6 +2293,7 @@ try_add_to_existing_fg(struct mlx5_flow_table *ft,
nested_down_write_ref_node(&g->node, FS_LOCK_PARENT);
if (!g->node.active) {
+ try_again = true;
up_write_ref_node(&g->node, false);
continue;
}
@@ -2313,7 +2315,8 @@ try_add_to_existing_fg(struct mlx5_flow_table *ft,
tree_put_node(&fte->node, false);
return rule;
}
- rule = ERR_PTR(-ENOENT);
+ err = try_again ? -EAGAIN : -ENOENT;
+ rule = ERR_PTR(err);
out:
kmem_cache_free(steering->ftes_cache, fte);
return rule;
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,51 @@
From 3d6361f488b417481996a88647f6d958c09986d9 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:42:09 -0400
Subject: [PATCH] net/mlx5: HWS, Init mutex on the correct path
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit a002602676cdae0c9996adb75b9310559b718a93
Author: Vlad Dogaru <vdogaru@nvidia.com>
Date: Tue Jun 10 18:15:09 2025 +0300
net/mlx5: HWS, Init mutex on the correct path
The newly introduced mutex is only used for reformat actions, but it was
initialized for modify header instead.
The struct that contains the mutex is zero-initialized and an all-zero
mutex is valid, so the issue only shows up with CONFIG_DEBUG_MUTEXES.
Fixes: b206d9ec19df ("net/mlx5: HWS, register reformat actions with fw")
Signed-off-by: Vlad Dogaru <vdogaru@nvidia.com>
Reviewed-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Link: https://patch.msgid.link/20250610151514.1094735-5-mbloch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c
index 9d1c0e4b224a..372e2be90706 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c
@@ -1357,6 +1357,7 @@ mlx5_cmd_hws_packet_reformat_alloc(struct mlx5_flow_root_namespace *ns,
pkt_reformat->fs_hws_action.pr_data = pr_data;
}
+ mutex_init(&pkt_reformat->fs_hws_action.lock);
pkt_reformat->owner = MLX5_FLOW_RESOURCE_OWNER_HWS;
pkt_reformat->fs_hws_action.hws_action = hws_action;
return 0;
@@ -1503,7 +1504,6 @@ static int mlx5_cmd_hws_modify_header_alloc(struct mlx5_flow_root_namespace *ns,
err = -ENOMEM;
goto release_mh;
}
- mutex_init(&modify_hdr->fs_hws_action.lock);
modify_hdr->fs_hws_action.mh_data = mh_data;
modify_hdr->fs_hws_action.fs_pool = pool;
modify_hdr->owner = MLX5_FLOW_RESOURCE_OWNER_SW;
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,40 @@
From ba815e2e82f00d7211164de9c0409a7e53173ba8 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:42:10 -0400
Subject: [PATCH] net/mlx5: HWS, fix missing ip_version handling in definer
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit b5e3c76f35ee7e814c2469c73406c5bbf110d89c
Author: Yevgeny Kliteynik <kliteyn@nvidia.com>
Date: Tue Jun 10 18:15:10 2025 +0300
net/mlx5: HWS, fix missing ip_version handling in definer
Fix missing field handling in definer - outer IP version.
Fixes: 74a778b4a63f ("net/mlx5: HWS, added definers handling")
Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Link: https://patch.msgid.link/20250610151514.1094735-6-mbloch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c
index 5cc0dc002ac1..d45e1145d197 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c
@@ -785,6 +785,9 @@ hws_definer_conv_outer(struct mlx5hws_definer_conv_data *cd,
HWS_SET_HDR(fc, match_param, IP_PROTOCOL_O,
outer_headers.ip_protocol,
eth_l3_outer.protocol_next_header);
+ HWS_SET_HDR(fc, match_param, IP_VERSION_O,
+ outer_headers.ip_version,
+ eth_l3_outer.ip_version);
HWS_SET_HDR(fc, match_param, IP_TTL_O,
outer_headers.ttl_hoplimit,
eth_l3_outer.time_to_live_hop_limit);
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,98 @@
From cb2e3bb95bee469a17238ac30011a055ff897886 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:42:10 -0400
Subject: [PATCH] net/mlx5: HWS, make sure the uplink is the last destination
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit b8335829518ec5988294280e37d735799209d70d
Author: Vlad Dogaru <vdogaru@nvidia.com>
Date: Tue Jun 10 18:15:11 2025 +0300
net/mlx5: HWS, make sure the uplink is the last destination
When there are more than one destinations, we create a FW flow
table and provide it with all the destinations. FW requires to
have wire as the last destination in the list (if it exists),
otherwise the operation fails with FW syndrome.
This patch fixes the destination array action creation: if it
contains a wire destination, it is moved to the end.
Fixes: 504e536d9010 ("net/mlx5: HWS, added actions handling")
Signed-off-by: Vlad Dogaru <vdogaru@nvidia.com>
Reviewed-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Link: https://patch.msgid.link/20250610151514.1094735-7-mbloch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c
index fb62f3bc4bd4..447ea3f8722c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c
@@ -1370,8 +1370,8 @@ mlx5hws_action_create_dest_array(struct mlx5hws_context *ctx,
struct mlx5hws_cmd_set_fte_attr fte_attr = {0};
struct mlx5hws_cmd_forward_tbl *fw_island;
struct mlx5hws_action *action;
- u32 i /*, packet_reformat_id*/;
- int ret;
+ int ret, last_dest_idx = -1;
+ u32 i;
if (num_dest <= 1) {
mlx5hws_err(ctx, "Action must have multiple dests\n");
@@ -1401,11 +1401,8 @@ mlx5hws_action_create_dest_array(struct mlx5hws_context *ctx,
dest_list[i].destination_id = dests[i].dest->dest_obj.obj_id;
fte_attr.action_flags |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
fte_attr.ignore_flow_level = ignore_flow_level;
- /* ToDo: In SW steering we have a handling of 'go to WIRE'
- * destination here by upper layer setting 'is_wire_ft' flag
- * if the destination is wire.
- * This is because uplink should be last dest in the list.
- */
+ if (dests[i].is_wire_ft)
+ last_dest_idx = i;
break;
case MLX5HWS_ACTION_TYP_VPORT:
dest_list[i].destination_type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
@@ -1429,6 +1426,9 @@ mlx5hws_action_create_dest_array(struct mlx5hws_context *ctx,
}
}
+ if (last_dest_idx != -1)
+ swap(dest_list[last_dest_idx], dest_list[num_dest - 1]);
+
fte_attr.dests_num = num_dest;
fte_attr.dests = dest_list;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c
index 372e2be90706..bf4643d0ce17 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c
@@ -966,6 +966,9 @@ static int mlx5_fs_fte_get_hws_actions(struct mlx5_flow_root_namespace *ns,
switch (attr->type) {
case MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE:
dest_action = mlx5_fs_get_dest_action_ft(fs_ctx, dst);
+ if (dst->dest_attr.ft->flags &
+ MLX5_FLOW_TABLE_UPLINK_VPORT)
+ dest_actions[num_dest_actions].is_wire_ft = true;
break;
case MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM:
dest_action = mlx5_fs_get_dest_action_table_num(fs_ctx,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h
index 9bbadc4d8a0b..d8ac6c196211 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h
@@ -213,6 +213,7 @@ struct mlx5hws_action_dest_attr {
struct mlx5hws_action *dest;
/* Optional reformat action */
struct mlx5hws_action *reformat;
+ bool is_wire_ft;
};
/**
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,81 @@
From 7af56f3f1bdf8cc9ba3e4a85456cedb7a69a8b86 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:42:10 -0400
Subject: [PATCH] net/mlx5e: Fix leak of Geneve TLV option object
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit aa9c44b842096c553871bc68a8cebc7861fa192b
Author: Jianbo Liu <jianbol@nvidia.com>
Date: Tue Jun 10 18:15:13 2025 +0300
net/mlx5e: Fix leak of Geneve TLV option object
Previously, a unique tunnel id was added for the matching on TC
non-zero chains, to support inner header rewrite with goto action.
Later, it was used to support VF tunnel offload for vxlan, then for
Geneve and GRE. To support VF tunnel, a temporary mlx5_flow_spec is
used to parse tunnel options. For Geneve, if there is TLV option, a
object is created, or refcnt is added if already exists. But the
temporary mlx5_flow_spec is directly freed after parsing, which causes
the leak because no information regarding the object is saved in
flow's mlx5_flow_spec, which is used to free the object when deleting
the flow.
To fix the leak, call mlx5_geneve_tlv_option_del() before free the
temporary spec if it has TLV object.
Fixes: 521933cdc4aa ("net/mlx5e: Support Geneve and GRE with VF tunnel offload")
Signed-off-by: Jianbo Liu <jianbol@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Alex Lazar <alazar@nvidia.com>
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Link: https://patch.msgid.link/20250610151514.1094735-9-mbloch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index f1d908f61134..fef418e1ed1a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -2028,9 +2028,8 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
return err;
}
-static bool mlx5_flow_has_geneve_opt(struct mlx5e_tc_flow *flow)
+static bool mlx5_flow_has_geneve_opt(struct mlx5_flow_spec *spec)
{
- struct mlx5_flow_spec *spec = &flow->attr->parse_attr->spec;
void *headers_v = MLX5_ADDR_OF(fte_match_param,
spec->match_value,
misc_parameters_3);
@@ -2069,7 +2068,7 @@ static void mlx5e_tc_del_fdb_flow(struct mlx5e_priv *priv,
}
complete_all(&flow->del_hw_done);
- if (mlx5_flow_has_geneve_opt(flow))
+ if (mlx5_flow_has_geneve_opt(&attr->parse_attr->spec))
mlx5_geneve_tlv_option_del(priv->mdev->geneve);
if (flow->decap_route)
@@ -2574,12 +2573,13 @@ static int parse_tunnel_attr(struct mlx5e_priv *priv,
err = mlx5e_tc_tun_parse(filter_dev, priv, tmp_spec, f, match_level);
if (err) {
- kvfree(tmp_spec);
NL_SET_ERR_MSG_MOD(extack, "Failed to parse tunnel attributes");
netdev_warn(priv->netdev, "Failed to parse tunnel attributes");
- return err;
+ } else {
+ err = mlx5e_tc_set_attr_rx_tun(flow, tmp_spec);
}
- err = mlx5e_tc_set_attr_rx_tun(flow, tmp_spec);
+ if (mlx5_flow_has_geneve_opt(tmp_spec))
+ mlx5_geneve_tlv_option_del(priv->mdev->geneve);
kvfree(tmp_spec);
if (err)
return err;
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,78 @@
From c4f4fb210193420862f49e61b6a6865512ee9636 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:42:10 -0400
Subject: [PATCH] net/mlx5: HWS, Add error checking to
hws_bwc_rule_complex_hash_node_get()
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit 1619bdf4389c829f16af5c7d5b4fa5f1673614d7
Author: Dan Carpenter <dan.carpenter@linaro.org>
Date: Wed Jun 11 16:14:32 2025 +0300
net/mlx5: HWS, Add error checking to hws_bwc_rule_complex_hash_node_get()
Check for if ida_alloc() or rhashtable_lookup_get_insert_fast() fails.
Fixes: 17e0accac577 ("net/mlx5: HWS, support complex matchers")
Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
Reviewed-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Link: https://patch.msgid.link/aEmBONjyiF6z5yCV@stanley.mountain
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc_complex.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc_complex.c
index 70768953a4f6..ca7501c57468 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc_complex.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc_complex.c
@@ -1070,7 +1070,7 @@ hws_bwc_rule_complex_hash_node_get(struct mlx5hws_bwc_rule *bwc_rule,
struct mlx5hws_bwc_matcher *bwc_matcher = bwc_rule->bwc_matcher;
struct mlx5hws_bwc_complex_rule_hash_node *node, *old_node;
struct rhashtable *refcount_hash;
- int i;
+ int ret, i;
bwc_rule->complex_hash_node = NULL;
@@ -1078,7 +1078,11 @@ hws_bwc_rule_complex_hash_node_get(struct mlx5hws_bwc_rule *bwc_rule,
if (unlikely(!node))
return -ENOMEM;
- node->tag = ida_alloc(&bwc_matcher->complex->metadata_ida, GFP_KERNEL);
+ ret = ida_alloc(&bwc_matcher->complex->metadata_ida, GFP_KERNEL);
+ if (ret < 0)
+ goto err_free_node;
+ node->tag = ret;
+
refcount_set(&node->refcount, 1);
/* Clear match buffer - turn off all the unrelated fields
@@ -1094,6 +1098,11 @@ hws_bwc_rule_complex_hash_node_get(struct mlx5hws_bwc_rule *bwc_rule,
old_node = rhashtable_lookup_get_insert_fast(refcount_hash,
&node->hash_node,
hws_refcount_hash);
+ if (IS_ERR(old_node)) {
+ ret = PTR_ERR(old_node);
+ goto err_free_ida;
+ }
+
if (old_node) {
/* Rule with the same tag already exists - update refcount */
refcount_inc(&old_node->refcount);
@@ -1112,6 +1121,12 @@ hws_bwc_rule_complex_hash_node_get(struct mlx5hws_bwc_rule *bwc_rule,
bwc_rule->complex_hash_node = node;
return 0;
+
+err_free_ida:
+ ida_free(&bwc_matcher->complex->metadata_ida, node->tag);
+err_free_node:
+ kfree(node);
+ return ret;
}
static void
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,101 @@
From ad54bc890c4892fa93557d437d96dd787a30b98b Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:42:11 -0400
Subject: [PATCH] net/mlx5e: Fix race between DIM disable and net_dim()
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit eb41a264a3a576dc040ee37c3d9d6b7e2d9be968
Author: Carolina Jubran <cjubran@nvidia.com>
Date: Thu Jul 10 16:53:43 2025 +0300
net/mlx5e: Fix race between DIM disable and net_dim()
There's a race between disabling DIM and NAPI callbacks using the dim
pointer on the RQ or SQ.
If NAPI checks the DIM state bit and sees it still set, it assumes
`rq->dim` or `sq->dim` is valid. But if DIM gets disabled right after
that check, the pointer might already be set to NULL, leading to a NULL
pointer dereference in net_dim().
Fix this by calling `synchronize_net()` before freeing the DIM context.
This ensures all in-progress NAPI callbacks are finished before the
pointer is cleared.
Kernel log:
BUG: kernel NULL pointer dereference, address: 0000000000000000
...
RIP: 0010:net_dim+0x23/0x190
...
Call Trace:
<TASK>
? __die+0x20/0x60
? page_fault_oops+0x150/0x3e0
? common_interrupt+0xf/0xa0
? sysvec_call_function_single+0xb/0x90
? exc_page_fault+0x74/0x130
? asm_exc_page_fault+0x22/0x30
? net_dim+0x23/0x190
? mlx5e_poll_ico_cq+0x41/0x6f0 [mlx5_core]
? sysvec_apic_timer_interrupt+0xb/0x90
mlx5e_handle_rx_dim+0x92/0xd0 [mlx5_core]
mlx5e_napi_poll+0x2cd/0xac0 [mlx5_core]
? mlx5e_poll_ico_cq+0xe5/0x6f0 [mlx5_core]
busy_poll_stop+0xa2/0x200
? mlx5e_napi_poll+0x1d9/0xac0 [mlx5_core]
? mlx5e_trigger_irq+0x130/0x130 [mlx5_core]
__napi_busy_loop+0x345/0x3b0
? sysvec_call_function_single+0xb/0x90
? asm_sysvec_call_function_single+0x16/0x20
? sysvec_apic_timer_interrupt+0xb/0x90
? pcpu_free_area+0x1e4/0x2e0
napi_busy_loop+0x11/0x20
xsk_recvmsg+0x10c/0x130
sock_recvmsg+0x44/0x70
__sys_recvfrom+0xbc/0x130
? __schedule+0x398/0x890
__x64_sys_recvfrom+0x20/0x30
do_syscall_64+0x4c/0x100
entry_SYSCALL_64_after_hwframe+0x4b/0x53
...
---[ end trace 0000000000000000 ]---
...
---[ end Kernel panic - not syncing: Fatal exception in interrupt ]---
Fixes: 445a25f6e1a2 ("net/mlx5e: Support updating coalescing configuration without resetting channels")
Signed-off-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Link: https://patch.msgid.link/1752155624-24095-3-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c
index 298bb74ec5e9..d1d629697e28 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c
@@ -113,7 +113,7 @@ int mlx5e_dim_rx_change(struct mlx5e_rq *rq, bool enable)
__set_bit(MLX5E_RQ_STATE_DIM, &rq->state);
} else {
__clear_bit(MLX5E_RQ_STATE_DIM, &rq->state);
-
+ synchronize_net();
mlx5e_dim_disable(rq->dim);
rq->dim = NULL;
}
@@ -140,7 +140,7 @@ int mlx5e_dim_tx_change(struct mlx5e_txqsq *sq, bool enable)
__set_bit(MLX5E_SQ_STATE_DIM, &sq->state);
} else {
__clear_bit(MLX5E_SQ_STATE_DIM, &sq->state);
-
+ synchronize_net();
mlx5e_dim_disable(sq->dim);
sq->dim = NULL;
}
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,116 @@
From bd88819cc929d1193d943400f4977a8f573b5d15 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:42:11 -0400
Subject: [PATCH] net/mlx5e: Add new prio for promiscuous mode
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit 4c9fce56fa702059bbc5ab737265b68f79cbaac4
Author: Jianbo Liu <jianbol@nvidia.com>
Date: Thu Jul 10 16:53:44 2025 +0300
net/mlx5e: Add new prio for promiscuous mode
An optimization for promiscuous mode adds a high-priority steering
table with a single catch-all rule to steer all traffic directly to
the TTC table.
However, a gap exists between the creation of this table and the
insertion of the catch-all rule. Packets arriving in this brief window
would miss as no rule was inserted yet, unnecessarily incrementing the
'rx_steer_missed_packets' counter and dropped.
This patch resolves the issue by introducing a new prio for this
table, placing it between MLX5E_TC_PRIO and MLX5E_NIC_PRIO. By doing
so, packets arriving during the window now fall through to the next
prio (at MLX5E_NIC_PRIO) instead of being dropped.
Fixes: 1c46d7409f30 ("net/mlx5e: Optimize promiscuous mode")
Signed-off-by: Jianbo Liu <jianbol@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Link: https://patch.msgid.link/1752155624-24095-4-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h
index b5c3a2a9d2a5..9560fcba643f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h
@@ -18,7 +18,8 @@ enum {
enum {
MLX5E_TC_PRIO = 0,
- MLX5E_NIC_PRIO
+ MLX5E_PROMISC_PRIO,
+ MLX5E_NIC_PRIO,
};
struct mlx5e_flow_table {
@@ -68,9 +69,13 @@ struct mlx5e_l2_table {
MLX5_HASH_FIELD_SEL_DST_IP |\
MLX5_HASH_FIELD_SEL_IPSEC_SPI)
-/* NIC prio FTS */
+/* NIC promisc FT level */
enum {
MLX5E_PROMISC_FT_LEVEL,
+};
+
+/* NIC prio FTS */
+enum {
MLX5E_VLAN_FT_LEVEL,
MLX5E_L2_FT_LEVEL,
MLX5E_TTC_FT_LEVEL,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
index 05058710d2c7..537e732085b2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
@@ -776,7 +776,7 @@ static int mlx5e_create_promisc_table(struct mlx5e_flow_steering *fs)
ft_attr.max_fte = MLX5E_PROMISC_TABLE_SIZE;
ft_attr.autogroup.max_num_groups = 1;
ft_attr.level = MLX5E_PROMISC_FT_LEVEL;
- ft_attr.prio = MLX5E_NIC_PRIO;
+ ft_attr.prio = MLX5E_PROMISC_PRIO;
ft->t = mlx5_create_auto_grouped_flow_table(fs->ns, &ft_attr);
if (IS_ERR(ft->t)) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 5f0f546fa126..b29e67466701 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -113,13 +113,16 @@
#define ETHTOOL_PRIO_NUM_LEVELS 1
#define ETHTOOL_NUM_PRIOS 11
#define ETHTOOL_MIN_LEVEL (KERNEL_MIN_LEVEL + ETHTOOL_NUM_PRIOS)
-/* Promiscuous, Vlan, mac, ttc, inner ttc, {UDP/ANY/aRFS/accel/{esp, esp_err}}, IPsec policy,
+/* Vlan, mac, ttc, inner ttc, {UDP/ANY/aRFS/accel/{esp, esp_err}}, IPsec policy,
* {IPsec RoCE MPV,Alias table},IPsec RoCE policy
*/
-#define KERNEL_NIC_PRIO_NUM_LEVELS 11
+#define KERNEL_NIC_PRIO_NUM_LEVELS 10
#define KERNEL_NIC_NUM_PRIOS 1
-/* One more level for tc */
-#define KERNEL_MIN_LEVEL (KERNEL_NIC_PRIO_NUM_LEVELS + 1)
+/* One more level for tc, and one more for promisc */
+#define KERNEL_MIN_LEVEL (KERNEL_NIC_PRIO_NUM_LEVELS + 2)
+
+#define KERNEL_NIC_PROMISC_NUM_PRIOS 1
+#define KERNEL_NIC_PROMISC_NUM_LEVELS 1
#define KERNEL_NIC_TC_NUM_PRIOS 1
#define KERNEL_NIC_TC_NUM_LEVELS 3
@@ -187,6 +190,8 @@ static struct init_tree_node {
ADD_NS(MLX5_FLOW_TABLE_MISS_ACTION_DEF,
ADD_MULTIPLE_PRIO(KERNEL_NIC_TC_NUM_PRIOS,
KERNEL_NIC_TC_NUM_LEVELS),
+ ADD_MULTIPLE_PRIO(KERNEL_NIC_PROMISC_NUM_PRIOS,
+ KERNEL_NIC_PROMISC_NUM_LEVELS),
ADD_MULTIPLE_PRIO(KERNEL_NIC_NUM_PRIOS,
KERNEL_NIC_PRIO_NUM_LEVELS))),
ADD_PRIO(0, BY_PASS_MIN_LEVEL, 0, FS_CHAINING_CAPS,
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,86 @@
From 07084a10593c81a6071d6f045927f1c2f52ac5b3 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:42:11 -0400
Subject: [PATCH] net/mlx5: Correctly set gso_size when LRO is used
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit 531d0d32de3e1b6b77a87bd37de0c2c6e17b496a
Author: Christoph Paasch <cpaasch@openai.com>
Date: Tue Jul 15 13:20:53 2025 -0700
net/mlx5: Correctly set gso_size when LRO is used
gso_size is expected by the networking stack to be the size of the
payload (thus, not including ethernet/IP/TCP-headers). However, cqe_bcnt
is the full sized frame (including the headers). Dividing cqe_bcnt by
lro_num_seg will then give incorrect results.
For example, running a bpftrace higher up in the TCP-stack
(tcp_event_data_recv), we commonly have gso_size set to 1450 or 1451 even
though in reality the payload was only 1448 bytes.
This can have unintended consequences:
- In tcp_measure_rcv_mss() len will be for example 1450, but. rcv_mss
will be 1448 (because tp->advmss is 1448). Thus, we will always
recompute scaling_ratio each time an LRO-packet is received.
- In tcp_gro_receive(), it will interfere with the decision whether or
not to flush and thus potentially result in less gro'ed packets.
So, we need to discount the protocol headers from cqe_bcnt so we can
actually divide the payload by lro_num_seg to get the real gso_size.
v2:
- Use "(unsigned char *)tcp + tcp->doff * 4 - skb->data)" to compute header-len
(Tariq Toukan <tariqt@nvidia.com>)
- Improve commit-message (Gal Pressman <gal@nvidia.com>)
Fixes: e586b3b0baee ("net/mlx5: Ethernet Datapath files")
Signed-off-by: Christoph Paasch <cpaasch@openai.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Gal Pressman <gal@nvidia.com>
Link: https://patch.msgid.link/20250715-cpaasch-pf-925-investigate-incorrect-gso_size-on-cx-7-nic-v2-1-e06c3475f3ac@openai.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 12ca0a3e8514..382679838113 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -1156,8 +1156,9 @@ static void mlx5e_lro_update_tcp_hdr(struct mlx5_cqe64 *cqe, struct tcphdr *tcp)
}
}
-static void mlx5e_lro_update_hdr(struct sk_buff *skb, struct mlx5_cqe64 *cqe,
- u32 cqe_bcnt)
+static unsigned int mlx5e_lro_update_hdr(struct sk_buff *skb,
+ struct mlx5_cqe64 *cqe,
+ u32 cqe_bcnt)
{
struct ethhdr *eth = (struct ethhdr *)(skb->data);
struct tcphdr *tcp;
@@ -1207,6 +1208,8 @@ static void mlx5e_lro_update_hdr(struct sk_buff *skb, struct mlx5_cqe64 *cqe,
tcp->check = tcp_v6_check(payload_len, &ipv6->saddr,
&ipv6->daddr, check);
}
+
+ return (unsigned int)((unsigned char *)tcp + tcp->doff * 4 - skb->data);
}
static void *mlx5e_shampo_get_packet_hd(struct mlx5e_rq *rq, u16 header_index)
@@ -1563,8 +1566,9 @@ static inline void mlx5e_build_rx_skb(struct mlx5_cqe64 *cqe,
mlx5e_macsec_offload_handle_rx_skb(netdev, skb, cqe);
if (lro_num_seg > 1) {
- mlx5e_lro_update_hdr(skb, cqe, cqe_bcnt);
- skb_shinfo(skb)->gso_size = DIV_ROUND_UP(cqe_bcnt, lro_num_seg);
+ unsigned int hdrlen = mlx5e_lro_update_hdr(skb, cqe, cqe_bcnt);
+
+ skb_shinfo(skb)->gso_size = DIV_ROUND_UP(cqe_bcnt - hdrlen, lro_num_seg);
/* Subtract one since we already counted this as one
* "regular" packet in mlx5e_complete_rx_cqe()
*/
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,49 @@
From 0e6b8d1b57695231dc16f426ca329c7fc415ff43 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:42:12 -0400
Subject: [PATCH] net/mlx5: Fix memory leak in cmd_exec()
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit 3afa3ae3db52e3c216d77bd5907a5a86833806cc
Author: Chiara Meiohas <cmeiohas@nvidia.com>
Date: Thu Jul 17 15:06:09 2025 +0300
net/mlx5: Fix memory leak in cmd_exec()
If cmd_exec() is called with callback and mlx5_cmd_invoke() returns an
error, resources allocated in cmd_exec() will not be freed.
Fix the code to release the resources if mlx5_cmd_invoke() returns an
error.
Fixes: f086470122d5 ("net/mlx5: cmdif, Return value improvements")
Reported-by: Alex Tereshkin <atereshkin@nvidia.com>
Signed-off-by: Chiara Meiohas <cmeiohas@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Vlad Dumitrescu <vdumitrescu@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/1752753970-261832-2-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index b1aeea7c4a91..e395ef5f356e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -1947,8 +1947,8 @@ static int cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out,
err = mlx5_cmd_invoke(dev, inb, outb, out, out_size, callback, context,
pages_queue, token, force_polling);
- if (callback)
- return err;
+ if (callback && !err)
+ return 0;
if (err > 0) /* Failed in FW, command didn't execute */
err = deliv_status_to_err(err);
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,247 @@
From 52b9ece6153bacd7511923119dae3562495686df Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:42:12 -0400
Subject: [PATCH] net/mlx5: E-Switch, Fix peer miss rules to use peer eswitch
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit 5b4c56ad4da0aa00b258ab50b1f5775b7d3108c7
Author: Shahar Shitrit <shshitrit@nvidia.com>
Date: Thu Jul 17 15:06:10 2025 +0300
net/mlx5: E-Switch, Fix peer miss rules to use peer eswitch
In the original design, it is assumed local and peer eswitches have the
same number of vfs. However, in new firmware, local and peer eswitches
can have different number of vfs configured by mlxconfig. In such
configuration, it is incorrect to derive the number of vfs from the
local device's eswitch.
Fix this by updating the peer miss rules add and delete functions to use
the peer device's eswitch and vf count instead of the local device's
information, ensuring correct behavior regardless of vf configuration
differences.
Fixes: ac004b832128 ("net/mlx5e: E-Switch, Add peer miss rules")
Signed-off-by: Shahar Shitrit <shshitrit@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/1752753970-261832-3-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 0e3a977d5332..bee906661282 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -1182,19 +1182,19 @@ static void esw_set_peer_miss_rule_source_port(struct mlx5_eswitch *esw,
static int esw_add_fdb_peer_miss_rules(struct mlx5_eswitch *esw,
struct mlx5_core_dev *peer_dev)
{
+ struct mlx5_eswitch *peer_esw = peer_dev->priv.eswitch;
struct mlx5_flow_destination dest = {};
struct mlx5_flow_act flow_act = {0};
struct mlx5_flow_handle **flows;
- /* total vports is the same for both e-switches */
- int nvports = esw->total_vports;
struct mlx5_flow_handle *flow;
+ struct mlx5_vport *peer_vport;
struct mlx5_flow_spec *spec;
- struct mlx5_vport *vport;
int err, pfindex;
unsigned long i;
void *misc;
- if (!MLX5_VPORT_MANAGER(esw->dev) && !mlx5_core_is_ecpf_esw_manager(esw->dev))
+ if (!MLX5_VPORT_MANAGER(peer_dev) &&
+ !mlx5_core_is_ecpf_esw_manager(peer_dev))
return 0;
spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
@@ -1203,7 +1203,7 @@ static int esw_add_fdb_peer_miss_rules(struct mlx5_eswitch *esw,
peer_miss_rules_setup(esw, peer_dev, spec, &dest);
- flows = kvcalloc(nvports, sizeof(*flows), GFP_KERNEL);
+ flows = kvcalloc(peer_esw->total_vports, sizeof(*flows), GFP_KERNEL);
if (!flows) {
err = -ENOMEM;
goto alloc_flows_err;
@@ -1213,10 +1213,10 @@ static int esw_add_fdb_peer_miss_rules(struct mlx5_eswitch *esw,
misc = MLX5_ADDR_OF(fte_match_param, spec->match_value,
misc_parameters);
- if (mlx5_core_is_ecpf_esw_manager(esw->dev)) {
- vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_PF);
- esw_set_peer_miss_rule_source_port(esw, peer_dev->priv.eswitch,
- spec, MLX5_VPORT_PF);
+ if (mlx5_core_is_ecpf_esw_manager(peer_dev)) {
+ peer_vport = mlx5_eswitch_get_vport(peer_esw, MLX5_VPORT_PF);
+ esw_set_peer_miss_rule_source_port(esw, peer_esw, spec,
+ MLX5_VPORT_PF);
flow = mlx5_add_flow_rules(mlx5_eswitch_get_slow_fdb(esw),
spec, &flow_act, &dest, 1);
@@ -1224,11 +1224,11 @@ static int esw_add_fdb_peer_miss_rules(struct mlx5_eswitch *esw,
err = PTR_ERR(flow);
goto add_pf_flow_err;
}
- flows[vport->index] = flow;
+ flows[peer_vport->index] = flow;
}
- if (mlx5_ecpf_vport_exists(esw->dev)) {
- vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_ECPF);
+ if (mlx5_ecpf_vport_exists(peer_dev)) {
+ peer_vport = mlx5_eswitch_get_vport(peer_esw, MLX5_VPORT_ECPF);
MLX5_SET(fte_match_set_misc, misc, source_port, MLX5_VPORT_ECPF);
flow = mlx5_add_flow_rules(mlx5_eswitch_get_slow_fdb(esw),
spec, &flow_act, &dest, 1);
@@ -1236,13 +1236,14 @@ static int esw_add_fdb_peer_miss_rules(struct mlx5_eswitch *esw,
err = PTR_ERR(flow);
goto add_ecpf_flow_err;
}
- flows[vport->index] = flow;
+ flows[peer_vport->index] = flow;
}
- mlx5_esw_for_each_vf_vport(esw, i, vport, mlx5_core_max_vfs(esw->dev)) {
+ mlx5_esw_for_each_vf_vport(peer_esw, i, peer_vport,
+ mlx5_core_max_vfs(peer_dev)) {
esw_set_peer_miss_rule_source_port(esw,
- peer_dev->priv.eswitch,
- spec, vport->vport);
+ peer_esw,
+ spec, peer_vport->vport);
flow = mlx5_add_flow_rules(mlx5_eswitch_get_slow_fdb(esw),
spec, &flow_act, &dest, 1);
@@ -1250,22 +1251,22 @@ static int esw_add_fdb_peer_miss_rules(struct mlx5_eswitch *esw,
err = PTR_ERR(flow);
goto add_vf_flow_err;
}
- flows[vport->index] = flow;
+ flows[peer_vport->index] = flow;
}
- if (mlx5_core_ec_sriov_enabled(esw->dev)) {
- mlx5_esw_for_each_ec_vf_vport(esw, i, vport, mlx5_core_max_ec_vfs(esw->dev)) {
- if (i >= mlx5_core_max_ec_vfs(peer_dev))
- break;
- esw_set_peer_miss_rule_source_port(esw, peer_dev->priv.eswitch,
- spec, vport->vport);
+ if (mlx5_core_ec_sriov_enabled(peer_dev)) {
+ mlx5_esw_for_each_ec_vf_vport(peer_esw, i, peer_vport,
+ mlx5_core_max_ec_vfs(peer_dev)) {
+ esw_set_peer_miss_rule_source_port(esw, peer_esw,
+ spec,
+ peer_vport->vport);
flow = mlx5_add_flow_rules(esw->fdb_table.offloads.slow_fdb,
spec, &flow_act, &dest, 1);
if (IS_ERR(flow)) {
err = PTR_ERR(flow);
goto add_ec_vf_flow_err;
}
- flows[vport->index] = flow;
+ flows[peer_vport->index] = flow;
}
}
@@ -1282,25 +1283,27 @@ static int esw_add_fdb_peer_miss_rules(struct mlx5_eswitch *esw,
return 0;
add_ec_vf_flow_err:
- mlx5_esw_for_each_ec_vf_vport(esw, i, vport, mlx5_core_max_ec_vfs(esw->dev)) {
- if (!flows[vport->index])
+ mlx5_esw_for_each_ec_vf_vport(peer_esw, i, peer_vport,
+ mlx5_core_max_ec_vfs(peer_dev)) {
+ if (!flows[peer_vport->index])
continue;
- mlx5_del_flow_rules(flows[vport->index]);
+ mlx5_del_flow_rules(flows[peer_vport->index]);
}
add_vf_flow_err:
- mlx5_esw_for_each_vf_vport(esw, i, vport, mlx5_core_max_vfs(esw->dev)) {
- if (!flows[vport->index])
+ mlx5_esw_for_each_vf_vport(peer_esw, i, peer_vport,
+ mlx5_core_max_vfs(peer_dev)) {
+ if (!flows[peer_vport->index])
continue;
- mlx5_del_flow_rules(flows[vport->index]);
+ mlx5_del_flow_rules(flows[peer_vport->index]);
}
- if (mlx5_ecpf_vport_exists(esw->dev)) {
- vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_ECPF);
- mlx5_del_flow_rules(flows[vport->index]);
+ if (mlx5_ecpf_vport_exists(peer_dev)) {
+ peer_vport = mlx5_eswitch_get_vport(peer_esw, MLX5_VPORT_ECPF);
+ mlx5_del_flow_rules(flows[peer_vport->index]);
}
add_ecpf_flow_err:
- if (mlx5_core_is_ecpf_esw_manager(esw->dev)) {
- vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_PF);
- mlx5_del_flow_rules(flows[vport->index]);
+ if (mlx5_core_is_ecpf_esw_manager(peer_dev)) {
+ peer_vport = mlx5_eswitch_get_vport(peer_esw, MLX5_VPORT_PF);
+ mlx5_del_flow_rules(flows[peer_vport->index]);
}
add_pf_flow_err:
esw_warn(esw->dev, "FDB: Failed to add peer miss flow rule err %d\n", err);
@@ -1313,37 +1316,34 @@ static int esw_add_fdb_peer_miss_rules(struct mlx5_eswitch *esw,
static void esw_del_fdb_peer_miss_rules(struct mlx5_eswitch *esw,
struct mlx5_core_dev *peer_dev)
{
+ struct mlx5_eswitch *peer_esw = peer_dev->priv.eswitch;
u16 peer_index = mlx5_get_dev_index(peer_dev);
struct mlx5_flow_handle **flows;
- struct mlx5_vport *vport;
+ struct mlx5_vport *peer_vport;
unsigned long i;
flows = esw->fdb_table.offloads.peer_miss_rules[peer_index];
if (!flows)
return;
- if (mlx5_core_ec_sriov_enabled(esw->dev)) {
- mlx5_esw_for_each_ec_vf_vport(esw, i, vport, mlx5_core_max_ec_vfs(esw->dev)) {
- /* The flow for a particular vport could be NULL if the other ECPF
- * has fewer or no VFs enabled
- */
- if (!flows[vport->index])
- continue;
- mlx5_del_flow_rules(flows[vport->index]);
- }
+ if (mlx5_core_ec_sriov_enabled(peer_dev)) {
+ mlx5_esw_for_each_ec_vf_vport(peer_esw, i, peer_vport,
+ mlx5_core_max_ec_vfs(peer_dev))
+ mlx5_del_flow_rules(flows[peer_vport->index]);
}
- mlx5_esw_for_each_vf_vport(esw, i, vport, mlx5_core_max_vfs(esw->dev))
- mlx5_del_flow_rules(flows[vport->index]);
+ mlx5_esw_for_each_vf_vport(peer_esw, i, peer_vport,
+ mlx5_core_max_vfs(peer_dev))
+ mlx5_del_flow_rules(flows[peer_vport->index]);
- if (mlx5_ecpf_vport_exists(esw->dev)) {
- vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_ECPF);
- mlx5_del_flow_rules(flows[vport->index]);
+ if (mlx5_ecpf_vport_exists(peer_dev)) {
+ peer_vport = mlx5_eswitch_get_vport(peer_esw, MLX5_VPORT_ECPF);
+ mlx5_del_flow_rules(flows[peer_vport->index]);
}
- if (mlx5_core_is_ecpf_esw_manager(esw->dev)) {
- vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_PF);
- mlx5_del_flow_rules(flows[vport->index]);
+ if (mlx5_core_is_ecpf_esw_manager(peer_dev)) {
+ peer_vport = mlx5_eswitch_get_vport(peer_esw, MLX5_VPORT_PF);
+ mlx5_del_flow_rules(flows[peer_vport->index]);
}
kvfree(flows);
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,70 @@
From dc192079bc2276b1a7c8163940448e003bf856c7 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:43:25 -0400
Subject: [PATCH] RDMA/mlx5: convert timeouts to secs_to_jiffies()
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit 16b82367aa28bd31795e720548421b58824108e1
Author: Easwar Hariharan <easwar.hariharan@linux.microsoft.com>
Date: Wed Feb 19 21:36:40 2025 +0000
RDMA/mlx5: convert timeouts to secs_to_jiffies()
Commit b35108a51cf7 ("jiffies: Define secs_to_jiffies()") introduced
secs_to_jiffies(). As the value here is a multiple of 1000, use
secs_to_jiffies() instead of msecs_to_jiffies to avoid the multiplication.
This is converted using scripts/coccinelle/misc/secs_to_jiffies.cocci with
the following Coccinelle rules:
@depends on patch@
expression E;
@@
-msecs_to_jiffies(E * 1000)
+secs_to_jiffies(E)
-msecs_to_jiffies(E * MSEC_PER_SEC)
+secs_to_jiffies(E)
Link: https://patch.msgid.link/r/20250219-rdma-secs-to-jiffies-v1-2-b506746561a9@linux.microsoft.com
Signed-off-by: Easwar Hariharan <eahariha@linux.microsoft.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 247f7248a0c0..5a7b234bdfd9 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -525,7 +525,7 @@ static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
ent->fill_to_high_water = false;
if (ent->pending)
queue_delayed_work(ent->dev->cache.wq, &ent->dwork,
- msecs_to_jiffies(1000));
+ secs_to_jiffies(1));
else
mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
}
@@ -576,7 +576,7 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
"add keys command failed, err %d\n",
err);
queue_delayed_work(cache->wq, &ent->dwork,
- msecs_to_jiffies(1000));
+ secs_to_jiffies(1));
}
}
} else if (ent->mkeys_queue.ci > 2 * ent->limit) {
@@ -2080,7 +2080,7 @@ static int mlx5r_handle_mkey_cleanup(struct mlx5_ib_mr *mr)
ent->in_use--;
if (ent->is_tmp && !ent->tmp_cleanup_scheduled) {
mod_delayed_work(ent->dev->cache.wq, &ent->dwork,
- msecs_to_jiffies(30 * 1000));
+ secs_to_jiffies(30));
ent->tmp_cleanup_scheduled = true;
}
spin_unlock_irq(&ent->mkeys_queue.lock);
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,103 @@
From a146abefb34ccd688e9923429cd51b9b7b1a3442 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:43:26 -0400
Subject: [PATCH] RDMA/mlx5: Remove the redundant MLX5_IB_STAGE_UAR stage
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit 972db388d40ded1a5ef8ce09d92ef1e2b9e40f07
Author: Yishai Hadas <yishaih@nvidia.com>
Date: Tue May 13 14:02:40 2025 +0300
RDMA/mlx5: Remove the redundant MLX5_IB_STAGE_UAR stage
The MLX5_IB_STAGE_UAR stage in the RDMA driver is redundant and should
be removed.
Responsibility for initializing the device's UAR pointer
(mdev->priv.uar) lies with mlx5_core, which already sets it during the
mlx5_load() process.
At present, the RDMA UAR stage overwrites this pointer, which was
correctly initialized by mlx5_core, creating the risk of inconsistency.
Ownership and management of the UAR pointer should remain exclusively
within mlx5_core.
In the current upstream code, we luckily receive the same pointer, since
mlx5_get_uars_page() still finds available BF registers for that UAR,
allowing it to be shared.
However, future changes in mlx5_core may expose this flaw.
For instance, if mlx5_alloc_bfreg() is invoked twice before the RDMA UAR
stage runs, the RDMA driver may overwrite the UAR allocated by
mlx5_core.
This could lead to real bugs. For example, if mlx5_ib is unloaded
(rmmod), it might free the UAR, leaving mlx5_core with a dangling
reference to an invalid UAR.
Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Reviewed-by: Fan Li <fanl@nvidia.com>
Link: https://patch.msgid.link/feaa84ec6f20468b4935c439923e9266122a93d0.1747134130.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 64f1e0fafd46..6a75c5a2f6c8 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -4422,17 +4422,6 @@ static void mlx5_ib_stage_cong_debugfs_cleanup(struct mlx5_ib_dev *dev)
mlx5_core_native_port_num(dev->mdev) - 1);
}
-static int mlx5_ib_stage_uar_init(struct mlx5_ib_dev *dev)
-{
- dev->mdev->priv.uar = mlx5_get_uars_page(dev->mdev);
- return PTR_ERR_OR_ZERO(dev->mdev->priv.uar);
-}
-
-static void mlx5_ib_stage_uar_cleanup(struct mlx5_ib_dev *dev)
-{
- mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar);
-}
-
static int mlx5_ib_stage_bfrag_init(struct mlx5_ib_dev *dev)
{
int err;
@@ -4661,9 +4650,6 @@ static const struct mlx5_ib_profile pf_profile = {
STAGE_CREATE(MLX5_IB_STAGE_CONG_DEBUGFS,
mlx5_ib_stage_cong_debugfs_init,
mlx5_ib_stage_cong_debugfs_cleanup),
- STAGE_CREATE(MLX5_IB_STAGE_UAR,
- mlx5_ib_stage_uar_init,
- mlx5_ib_stage_uar_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_BFREG,
mlx5_ib_stage_bfrag_init,
mlx5_ib_stage_bfrag_cleanup),
@@ -4721,9 +4707,6 @@ const struct mlx5_ib_profile raw_eth_profile = {
STAGE_CREATE(MLX5_IB_STAGE_CONG_DEBUGFS,
mlx5_ib_stage_cong_debugfs_init,
mlx5_ib_stage_cong_debugfs_cleanup),
- STAGE_CREATE(MLX5_IB_STAGE_UAR,
- mlx5_ib_stage_uar_init,
- mlx5_ib_stage_uar_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_BFREG,
mlx5_ib_stage_bfrag_init,
mlx5_ib_stage_bfrag_cleanup),
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index c84ef94bb9fc..54ca6e010bd4 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -1002,7 +1002,6 @@ enum mlx5_ib_stages {
MLX5_IB_STAGE_ODP,
MLX5_IB_STAGE_COUNTERS,
MLX5_IB_STAGE_CONG_DEBUGFS,
- MLX5_IB_STAGE_UAR,
MLX5_IB_STAGE_BFREG,
MLX5_IB_STAGE_PRE_IB_REG_UMR,
MLX5_IB_STAGE_WHITELIST_UID,
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,60 @@
From 1e3839f66a3dc069dc1f0b26277180792852653c Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:43:26 -0400
Subject: [PATCH] RDMA/mlx5: Add support for 200Gbps per lane speeds
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit d00d16bcbc2553a3ac9acccf2d6444cda5502adf
Author: Patrisious Haddad <phaddad@nvidia.com>
Date: Tue May 13 14:03:41 2025 +0300
RDMA/mlx5: Add support for 200Gbps per lane speeds
Add support for 200Gbps per lane speeds speed when querying PTYS and
report it back correctly when needed.
Signed-off-by: Patrisious Haddad <phaddad@nvidia.com>
Reviewed-by: Maor Gottlieb <maorg@nvidia.com>
Link: https://patch.msgid.link/b842d2f523e9b82e221378c444ebd5860d612959.1747134197.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 6a75c5a2f6c8..d5f44c83b667 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -485,6 +485,10 @@ static int translate_eth_ext_proto_oper(u32 eth_proto_oper, u16 *active_speed,
*active_width = IB_WIDTH_2X;
*active_speed = IB_SPEED_NDR;
break;
+ case MLX5E_PROT_MASK(MLX5E_200GAUI_1_200GBASE_CR1_KR1):
+ *active_width = IB_WIDTH_1X;
+ *active_speed = IB_SPEED_XDR;
+ break;
case MLX5E_PROT_MASK(MLX5E_400GAUI_8_400GBASE_CR8):
*active_width = IB_WIDTH_8X;
*active_speed = IB_SPEED_HDR;
@@ -493,10 +497,18 @@ static int translate_eth_ext_proto_oper(u32 eth_proto_oper, u16 *active_speed,
*active_width = IB_WIDTH_4X;
*active_speed = IB_SPEED_NDR;
break;
+ case MLX5E_PROT_MASK(MLX5E_400GAUI_2_400GBASE_CR2_KR2):
+ *active_width = IB_WIDTH_2X;
+ *active_speed = IB_SPEED_XDR;
+ break;
case MLX5E_PROT_MASK(MLX5E_800GAUI_8_800GBASE_CR8_KR8):
*active_width = IB_WIDTH_8X;
*active_speed = IB_SPEED_NDR;
break;
+ case MLX5E_PROT_MASK(MLX5E_800GAUI_4_800GBASE_CR4_KR4):
+ *active_width = IB_WIDTH_4X;
+ *active_speed = IB_SPEED_XDR;
+ break;
default:
return -EINVAL;
}
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,109 @@
From 08a08451012e72442d1813506229d85d3f785535 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:43:26 -0400
Subject: [PATCH] RDMA/mlx5: Avoid flexible array warning
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit e91fb8b9d0edec86a1ef26490bc80af96210863d
Author: Leon Romanovsky <leon@kernel.org>
Date: Wed May 21 14:34:58 2025 +0300
RDMA/mlx5: Avoid flexible array warning
The following warning is reported by sparse tool:
drivers/infiniband/hw/mlx5/fs.c:1664:26: warning: array of flexible
structures
Avoid it by simply splitting array into two separate structs.
Link: https://patch.msgid.link/7b891b96a9fc053d01284c184d25ae98d35db2d4.1747827041.git.leon@kernel.org
Reviewed-by: Zhu Yanjun <yanjun.zhu@linux.dev>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/infiniband/hw/mlx5/fs.c b/drivers/infiniband/hw/mlx5/fs.c
index 0ff9f18a71e8..680627f1de33 100644
--- a/drivers/infiniband/hw/mlx5/fs.c
+++ b/drivers/infiniband/hw/mlx5/fs.c
@@ -1645,11 +1645,6 @@ static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev,
return _create_flow_rule(dev, ft_prio, flow_attr, dst, 0, NULL);
}
-enum {
- LEFTOVERS_MC,
- LEFTOVERS_UC,
-};
-
static struct mlx5_ib_flow_handler *create_leftovers_rule(struct mlx5_ib_dev *dev,
struct mlx5_ib_flow_prio *ft_prio,
struct ib_flow_attr *flow_attr,
@@ -1659,43 +1654,32 @@ static struct mlx5_ib_flow_handler *create_leftovers_rule(struct mlx5_ib_dev *de
struct mlx5_ib_flow_handler *handler = NULL;
static struct {
- struct ib_flow_attr flow_attr;
struct ib_flow_spec_eth eth_flow;
- } leftovers_specs[] = {
- [LEFTOVERS_MC] = {
- .flow_attr = {
- .num_of_specs = 1,
- .size = sizeof(leftovers_specs[0])
- },
- .eth_flow = {
- .type = IB_FLOW_SPEC_ETH,
- .size = sizeof(struct ib_flow_spec_eth),
- .mask = {.dst_mac = {0x1} },
- .val = {.dst_mac = {0x1} }
- }
- },
- [LEFTOVERS_UC] = {
- .flow_attr = {
- .num_of_specs = 1,
- .size = sizeof(leftovers_specs[0])
- },
- .eth_flow = {
- .type = IB_FLOW_SPEC_ETH,
- .size = sizeof(struct ib_flow_spec_eth),
- .mask = {.dst_mac = {0x1} },
- .val = {.dst_mac = {} }
- }
- }
- };
+ struct ib_flow_attr flow_attr;
+ } leftovers_wc = { .flow_attr = { .num_of_specs = 1,
+ .size = sizeof(leftovers_wc) },
+ .eth_flow = {
+ .type = IB_FLOW_SPEC_ETH,
+ .size = sizeof(struct ib_flow_spec_eth),
+ .mask = { .dst_mac = { 0x1 } },
+ .val = { .dst_mac = { 0x1 } } } };
- handler = create_flow_rule(dev, ft_prio,
- &leftovers_specs[LEFTOVERS_MC].flow_attr,
- dst);
+ static struct {
+ struct ib_flow_spec_eth eth_flow;
+ struct ib_flow_attr flow_attr;
+ } leftovers_uc = { .flow_attr = { .num_of_specs = 1,
+ .size = sizeof(leftovers_uc) },
+ .eth_flow = {
+ .type = IB_FLOW_SPEC_ETH,
+ .size = sizeof(struct ib_flow_spec_eth),
+ .mask = { .dst_mac = { 0x1 } },
+ .val = { .dst_mac = {} } } };
+
+ handler = create_flow_rule(dev, ft_prio, &leftovers_wc.flow_attr, dst);
if (!IS_ERR(handler) &&
flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT) {
handler_ucast = create_flow_rule(dev, ft_prio,
- &leftovers_specs[LEFTOVERS_UC].flow_attr,
- dst);
+ &leftovers_uc.flow_attr, dst);
if (IS_ERR(handler_ucast)) {
mlx5_del_flow_rules(handler->rule);
ft_prio->refcount--;
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,103 @@
From 06a22c7cc0922e8dbc79ec9dad2cec1493943213 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:43:26 -0400
Subject: [PATCH] RDMA/mlx5: Initialize obj_event->obj_sub_list before
xa_insert
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit 8edab8a72d67742f87e9dc2e2b0cdfddda5dc29a
Author: Mark Zhang <markzhang@nvidia.com>
Date: Tue Jun 17 11:13:55 2025 +0300
RDMA/mlx5: Initialize obj_event->obj_sub_list before xa_insert
The obj_event may be loaded immediately after inserted, then if the
list_head is not initialized then we may get a poisonous pointer. This
fixes the crash below:
mlx5_core 0000:03:00.0: MLX5E: StrdRq(1) RqSz(8) StrdSz(2048) RxCqeCmprss(0 enhanced)
mlx5_core.sf mlx5_core.sf.4: firmware version: 32.38.3056
mlx5_core 0000:03:00.0 en3f0pf0sf2002: renamed from eth0
mlx5_core.sf mlx5_core.sf.4: Rate limit: 127 rates are supported, range: 0Mbps to 195312Mbps
IPv6: ADDRCONF(NETDEV_CHANGE): en3f0pf0sf2002: link becomes ready
Unable to handle kernel NULL pointer dereference at virtual address 0000000000000060
Mem abort info:
ESR = 0x96000006
EC = 0x25: DABT (current EL), IL = 32 bits
SET = 0, FnV = 0
EA = 0, S1PTW = 0
Data abort info:
ISV = 0, ISS = 0x00000006
CM = 0, WnR = 0
user pgtable: 4k pages, 48-bit VAs, pgdp=00000007760fb000
[0000000000000060] pgd=000000076f6d7003, p4d=000000076f6d7003, pud=0000000777841003, pmd=0000000000000000
Internal error: Oops: 96000006 [#1] SMP
Modules linked in: ipmb_host(OE) act_mirred(E) cls_flower(E) sch_ingress(E) mptcp_diag(E) udp_diag(E) raw_diag(E) unix_diag(E) tcp_diag(E) inet_diag(E) binfmt_misc(E) bonding(OE) rdma_ucm(OE) rdma_cm(OE) iw_cm(OE) ib_ipoib(OE) ib_cm(OE) isofs(E) cdrom(E) mst_pciconf(OE) ib_umad(OE) mlx5_ib(OE) ipmb_dev_int(OE) mlx5_core(OE) kpatch_15237886(OEK) mlxdevm(OE) auxiliary(OE) ib_uverbs(OE) ib_core(OE) psample(E) mlxfw(OE) tls(E) sunrpc(E) vfat(E) fat(E) crct10dif_ce(E) ghash_ce(E) sha1_ce(E) sbsa_gwdt(E) virtio_console(E) ext4(E) mbcache(E) jbd2(E) xfs(E) libcrc32c(E) mmc_block(E) virtio_net(E) net_failover(E) failover(E) sha2_ce(E) sha256_arm64(E) nvme(OE) nvme_core(OE) gpio_mlxbf3(OE) mlx_compat(OE) mlxbf_pmc(OE) i2c_mlxbf(OE) sdhci_of_dwcmshc(OE) pinctrl_mlxbf3(OE) mlxbf_pka(OE) gpio_generic(E) i2c_core(E) mmc_core(E) mlxbf_gige(OE) vitesse(E) pwr_mlxbf(OE) mlxbf_tmfifo(OE) micrel(E) mlxbf_bootctl(OE) virtio_ring(E) virtio(E) ipmi_devintf(E) ipmi_msghandler(E)
[last unloaded: mst_pci]
CPU: 11 PID: 20913 Comm: rte-worker-11 Kdump: loaded Tainted: G OE K 5.10.134-13.1.an8.aarch64 #1
Hardware name: https://www.mellanox.com BlueField-3 SmartNIC Main Card/BlueField-3 SmartNIC Main Card, BIOS 4.2.2.12968 Oct 26 2023
pstate: a0400089 (NzCv daIf +PAN -UAO -TCO BTYPE=--)
pc : dispatch_event_fd+0x68/0x300 [mlx5_ib]
lr : devx_event_notifier+0xcc/0x228 [mlx5_ib]
sp : ffff80001005bcf0
x29: ffff80001005bcf0 x28: 0000000000000001
x27: ffff244e0740a1d8 x26: ffff244e0740a1d0
x25: ffffda56beff5ae0 x24: ffffda56bf911618
x23: ffff244e0596a480 x22: ffff244e0596a480
x21: ffff244d8312ad90 x20: ffff244e0596a480
x19: fffffffffffffff0 x18: 0000000000000000
x17: 0000000000000000 x16: ffffda56be66d620
x15: 0000000000000000 x14: 0000000000000000
x13: 0000000000000000 x12: 0000000000000000
x11: 0000000000000040 x10: ffffda56bfcafb50
x9 : ffffda5655c25f2c x8 : 0000000000000010
x7 : 0000000000000000 x6 : ffff24545a2e24b8
x5 : 0000000000000003 x4 : ffff80001005bd28
x3 : 0000000000000000 x2 : 0000000000000000
x1 : ffff244e0596a480 x0 : ffff244d8312ad90
Call trace:
dispatch_event_fd+0x68/0x300 [mlx5_ib]
devx_event_notifier+0xcc/0x228 [mlx5_ib]
atomic_notifier_call_chain+0x58/0x80
mlx5_eq_async_int+0x148/0x2b0 [mlx5_core]
atomic_notifier_call_chain+0x58/0x80
irq_int_handler+0x20/0x30 [mlx5_core]
__handle_irq_event_percpu+0x60/0x220
handle_irq_event_percpu+0x3c/0x90
handle_irq_event+0x58/0x158
handle_fasteoi_irq+0xfc/0x188
generic_handle_irq+0x34/0x48
...
Fixes: 759738537142 ("IB/mlx5: Enable subscription for device events over DEVX")
Link: https://patch.msgid.link/r/3ce7f20e0d1a03dc7de6e57494ec4b8eaf1f05c2.1750147949.git.leon@kernel.org
Signed-off-by: Mark Zhang <markzhang@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index 6485ce3208ce..fae11535b1a0 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -1958,6 +1958,7 @@ subscribe_event_xa_alloc(struct mlx5_devx_event_table *devx_event_table,
/* Level1 is valid for future use, no need to free */
return -ENOMEM;
+ INIT_LIST_HEAD(&obj_event->obj_sub_list);
err = xa_insert(&event->object_ids,
key_level2,
obj_event,
@@ -1966,7 +1967,6 @@ subscribe_event_xa_alloc(struct mlx5_devx_event_table *devx_event_table,
kfree(obj_event);
return err;
}
- INIT_LIST_HEAD(&obj_event->obj_sub_list);
}
return 0;
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,48 @@
From 1607e340f1d5df6e422bdd53e34444bf73986864 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:43:26 -0400
Subject: [PATCH] RDMA/mlx5: Fix HW counters query for non-representor devices
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit 3cc1dbfddf88dc5ecce0a75185061403b1f7352d
Author: Patrisious Haddad <phaddad@nvidia.com>
Date: Mon Jun 16 12:14:52 2025 +0300
RDMA/mlx5: Fix HW counters query for non-representor devices
To get the device HW counters, a non-representor switchdev device
should use the mlx5_ib_query_q_counters() function and query all of
the available counters. While a representor device in switchdev mode
should use the mlx5_ib_query_q_counters_vport() function and query only
the Q_Counters without the PPCNT counters and congestion control counters,
since they aren't relevant for a representor device.
Currently a non-representor switchdev device skips querying the PPCNT
counters and congestion control counters, leaving them unupdated.
Fix that by properly querying those counters for non-representor devices.
Fixes: d22467a71ebe ("RDMA/mlx5: Expand switchdev Q-counters to expose representor statistics")
Signed-off-by: Patrisious Haddad <phaddad@nvidia.com>
Reviewed-by: Maher Sanalla <msanalla@nvidia.com>
Link: https://patch.msgid.link/56bf8af4ca8c58e3fb9f7e47b1dca2009eeeed81.1750064969.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/infiniband/hw/mlx5/counters.c b/drivers/infiniband/hw/mlx5/counters.c
index b847084dcd99..943e9eb2ad20 100644
--- a/drivers/infiniband/hw/mlx5/counters.c
+++ b/drivers/infiniband/hw/mlx5/counters.c
@@ -398,7 +398,7 @@ static int do_get_hw_stats(struct ib_device *ibdev,
return ret;
/* We don't expose device counters over Vports */
- if (is_mdev_switchdev_mode(dev->mdev) && port_num != 0)
+ if (is_mdev_switchdev_mode(dev->mdev) && dev->is_rep && port_num != 0)
goto done;
if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) {
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,40 @@
From daaee9898eeb1ee187247fd1ed5452440eba9edc Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:43:26 -0400
Subject: [PATCH] RDMA/mlx5: Fix CC counters query for MPV
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit acd245b1e33fc4b9d0f2e3372021d632f7ee0652
Author: Patrisious Haddad <phaddad@nvidia.com>
Date: Mon Jun 16 12:14:53 2025 +0300
RDMA/mlx5: Fix CC counters query for MPV
In case, CC counters are querying for the second port use the correct
core device for the query instead of always using the master core device.
Fixes: aac4492ef23a ("IB/mlx5: Update counter implementation for dual port RoCE")
Signed-off-by: Patrisious Haddad <phaddad@nvidia.com>
Reviewed-by: Michael Guralnik <michaelgur@nvidia.com>
Link: https://patch.msgid.link/9cace74dcf106116118bebfa9146d40d4166c6b0.1750064969.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/infiniband/hw/mlx5/counters.c b/drivers/infiniband/hw/mlx5/counters.c
index 943e9eb2ad20..a506fafd2b15 100644
--- a/drivers/infiniband/hw/mlx5/counters.c
+++ b/drivers/infiniband/hw/mlx5/counters.c
@@ -418,7 +418,7 @@ static int do_get_hw_stats(struct ib_device *ibdev,
*/
goto done;
}
- ret = mlx5_lag_query_cong_counters(dev->mdev,
+ ret = mlx5_lag_query_cong_counters(mdev,
stats->value +
cnts->num_q_counters,
cnts->num_cong_counters,
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,89 @@
From b58e4be62a28810484d1d1203db45cc311c6d48f Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 17:43:26 -0400
Subject: [PATCH] RDMA/mlx5: Fix vport loopback for MPV device
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit a9a9e68954f29b1e197663f76289db4879fd51bb
Author: Patrisious Haddad <phaddad@nvidia.com>
Date: Mon Jun 16 12:14:54 2025 +0300
RDMA/mlx5: Fix vport loopback for MPV device
Always enable vport loopback for both MPV devices on driver start.
Previously in some cases related to MPV RoCE, packets weren't correctly
executing loopback check at vport in FW, since it was disabled.
Due to complexity of identifying such cases for MPV always enable vport
loopback for both GVMIs when binding the slave to the master port.
Fixes: 0042f9e458a5 ("RDMA/mlx5: Enable vport loopback when user context or QP mandate")
Signed-off-by: Patrisious Haddad <phaddad@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Link: https://patch.msgid.link/d4298f5ebb2197459e9e7221c51ecd6a34699847.1750064969.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index d5f44c83b667..f463b8e7cfca 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1791,6 +1791,33 @@ static void deallocate_uars(struct mlx5_ib_dev *dev,
context->devx_uid);
}
+static int mlx5_ib_enable_lb_mp(struct mlx5_core_dev *master,
+ struct mlx5_core_dev *slave)
+{
+ int err;
+
+ err = mlx5_nic_vport_update_local_lb(master, true);
+ if (err)
+ return err;
+
+ err = mlx5_nic_vport_update_local_lb(slave, true);
+ if (err)
+ goto out;
+
+ return 0;
+
+out:
+ mlx5_nic_vport_update_local_lb(master, false);
+ return err;
+}
+
+static void mlx5_ib_disable_lb_mp(struct mlx5_core_dev *master,
+ struct mlx5_core_dev *slave)
+{
+ mlx5_nic_vport_update_local_lb(slave, false);
+ mlx5_nic_vport_update_local_lb(master, false);
+}
+
int mlx5_ib_enable_lb(struct mlx5_ib_dev *dev, bool td, bool qp)
{
int err = 0;
@@ -3495,6 +3522,8 @@ static void mlx5_ib_unbind_slave_port(struct mlx5_ib_dev *ibdev,
lockdep_assert_held(&mlx5_ib_multiport_mutex);
+ mlx5_ib_disable_lb_mp(ibdev->mdev, mpi->mdev);
+
mlx5_core_mp_event_replay(ibdev->mdev,
MLX5_DRIVER_EVENT_AFFILIATION_REMOVED,
NULL);
@@ -3590,6 +3619,10 @@ static bool mlx5_ib_bind_slave_port(struct mlx5_ib_dev *ibdev,
MLX5_DRIVER_EVENT_AFFILIATION_DONE,
&key);
+ err = mlx5_ib_enable_lb_mp(ibdev->mdev, mpi->mdev);
+ if (err)
+ goto unbind;
+
return true;
unbind:
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,132 @@
From c49aebd76637b284358e3705c4642286f996b607 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 18:03:17 -0400
Subject: [PATCH] net/mlx5: Expose serial numbers in devlink info
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit 18667214b955ef89f208d451820c39a5dfd77f27
Author: Jiri Pirko <jiri@resnulli.us>
Date: Tue Jun 10 04:51:28 2025 +0200
net/mlx5: Expose serial numbers in devlink info
Devlink info allows to expose serial number and board serial number
Get the values from PCI VPD and expose it.
$ devlink dev info
pci/0000:08:00.0:
driver mlx5_core
serial_number e4397f872caeed218000846daa7d2f49
board.serial_number MT2314XZ00YA
versions:
fixed:
fw.psid MT_0000000894
running:
fw.version 28.41.1000
fw 28.41.1000
stored:
fw.version 28.41.1000
fw 28.41.1000
auxiliary/mlx5_core.eth.0:
driver mlx5_core.eth
pci/0000:08:00.1:
driver mlx5_core
serial_number e4397f872caeed218000846daa7d2f49
board.serial_number MT2314XZ00YA
versions:
fixed:
fw.psid MT_0000000894
running:
fw.version 28.41.1000
fw 28.41.1000
stored:
fw.version 28.41.1000
fw 28.41.1000
auxiliary/mlx5_core.eth.1:
driver mlx5_core.eth
Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Reviewed-by: Kalesh AP <kalesh-anakkur.purayil@broadcom.com>
Acked-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/20250610025128.109232-1-jiri@resnulli.us
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
index 3b27da79ba94..4b536b384fc0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
@@ -35,6 +35,55 @@ static u16 mlx5_fw_ver_subminor(u32 version)
return version & 0xffff;
}
+static int mlx5_devlink_serial_numbers_put(struct mlx5_core_dev *dev,
+ struct devlink_info_req *req,
+ struct netlink_ext_ack *extack)
+{
+ struct pci_dev *pdev = dev->pdev;
+ unsigned int vpd_size, kw_len;
+ char *str, *end;
+ u8 *vpd_data;
+ int err = 0;
+ int start;
+
+ vpd_data = pci_vpd_alloc(pdev, &vpd_size);
+ if (IS_ERR(vpd_data))
+ return 0;
+
+ start = pci_vpd_find_ro_info_keyword(vpd_data, vpd_size,
+ PCI_VPD_RO_KEYWORD_SERIALNO, &kw_len);
+ if (start >= 0) {
+ str = kstrndup(vpd_data + start, kw_len, GFP_KERNEL);
+ if (!str) {
+ err = -ENOMEM;
+ goto end;
+ }
+ end = strchrnul(str, ' ');
+ *end = '\0';
+ err = devlink_info_board_serial_number_put(req, str);
+ kfree(str);
+ if (err)
+ goto end;
+ }
+
+ start = pci_vpd_find_ro_info_keyword(vpd_data, vpd_size, "V3", &kw_len);
+ if (start >= 0) {
+ str = kstrndup(vpd_data + start, kw_len, GFP_KERNEL);
+ if (!str) {
+ err = -ENOMEM;
+ goto end;
+ }
+ err = devlink_info_serial_number_put(req, str);
+ kfree(str);
+ if (err)
+ goto end;
+ }
+
+end:
+ kfree(vpd_data);
+ return err;
+}
+
#define DEVLINK_FW_STRING_LEN 32
static int
@@ -49,6 +98,10 @@ mlx5_devlink_info_get(struct devlink *devlink, struct devlink_info_req *req,
if (!mlx5_core_is_pf(dev))
return 0;
+ err = mlx5_devlink_serial_numbers_put(dev, req, extack);
+ if (err)
+ return err;
+
err = devlink_info_version_fixed_put(req, "fw.psid", dev->board_id);
if (err)
return err;
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,242 @@
From 9440e62748742ac2e252b1559c6575de37da4e90 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 18:03:17 -0400
Subject: [PATCH] net/mlx5e: SHAMPO: Reorganize mlx5_rq_shampo_alloc
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit af4312c4c9c11da84b13b3aa8f472ab287cf1f0b
Author: Saeed Mahameed <saeedm@nvidia.com>
Date: Mon Jun 16 17:14:33 2025 +0300
net/mlx5e: SHAMPO: Reorganize mlx5_rq_shampo_alloc
Drop redundant SHAMPO structure alloc/free functions.
Gather together function calls pertaining to header split info, pass
header per WQE (hd_per_wqe) as parameter to those function to avoid use
before initialization future mistakes.
Allocate HW GRO related info outside of the header related info scope.
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Reviewed-by: Dragos Tatulea <dtatulea@nvidia.com>
Signed-off-by: Cosmin Ratiu <cratiu@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Link: https://patch.msgid.link/20250616141441.1243044-5-mbloch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 5b0d03b3efe8..211ea429ea89 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -638,7 +638,6 @@ struct mlx5e_shampo_hd {
struct mlx5e_frag_page *pages;
u32 hd_per_wq;
u16 hd_per_wqe;
- u16 pages_per_wq;
unsigned long *bitmap;
u16 pi;
u16 ci;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 9bd166f489e7..a074f1eac3f4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -330,47 +330,6 @@ static inline void mlx5e_build_umr_wqe(struct mlx5e_rq *rq,
ucseg->mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_FREE);
}
-static int mlx5e_rq_shampo_hd_alloc(struct mlx5e_rq *rq, int node)
-{
- rq->mpwqe.shampo = kvzalloc_node(sizeof(*rq->mpwqe.shampo),
- GFP_KERNEL, node);
- if (!rq->mpwqe.shampo)
- return -ENOMEM;
- return 0;
-}
-
-static void mlx5e_rq_shampo_hd_free(struct mlx5e_rq *rq)
-{
- kvfree(rq->mpwqe.shampo);
-}
-
-static int mlx5e_rq_shampo_hd_info_alloc(struct mlx5e_rq *rq, int node)
-{
- struct mlx5e_shampo_hd *shampo = rq->mpwqe.shampo;
-
- shampo->bitmap = bitmap_zalloc_node(shampo->hd_per_wq, GFP_KERNEL,
- node);
- shampo->pages = kvzalloc_node(array_size(shampo->hd_per_wq,
- sizeof(*shampo->pages)),
- GFP_KERNEL, node);
- if (!shampo->bitmap || !shampo->pages)
- goto err_nomem;
-
- return 0;
-
-err_nomem:
- bitmap_free(shampo->bitmap);
- kvfree(shampo->pages);
-
- return -ENOMEM;
-}
-
-static void mlx5e_rq_shampo_hd_info_free(struct mlx5e_rq *rq)
-{
- bitmap_free(rq->mpwqe.shampo->bitmap);
- kvfree(rq->mpwqe.shampo->pages);
-}
-
static int mlx5e_rq_alloc_mpwqe_info(struct mlx5e_rq *rq, int node)
{
int wq_sz = mlx5_wq_ll_get_size(&rq->mpwqe.wq);
@@ -583,19 +542,18 @@ static int mlx5e_create_rq_umr_mkey(struct mlx5_core_dev *mdev, struct mlx5e_rq
}
static int mlx5e_create_rq_hd_umr_mkey(struct mlx5_core_dev *mdev,
- struct mlx5e_rq *rq)
+ u16 hd_per_wq, u32 *umr_mkey)
{
u32 max_ksm_size = BIT(MLX5_CAP_GEN(mdev, log_max_klm_list_size));
- if (max_ksm_size < rq->mpwqe.shampo->hd_per_wq) {
+ if (max_ksm_size < hd_per_wq) {
mlx5_core_err(mdev, "max ksm list size 0x%x is smaller than shampo header buffer list size 0x%x\n",
- max_ksm_size, rq->mpwqe.shampo->hd_per_wq);
+ max_ksm_size, hd_per_wq);
return -EINVAL;
}
-
- return mlx5e_create_umr_ksm_mkey(mdev, rq->mpwqe.shampo->hd_per_wq,
+ return mlx5e_create_umr_ksm_mkey(mdev, hd_per_wq,
MLX5E_SHAMPO_LOG_HEADER_ENTRY_SIZE,
- &rq->mpwqe.shampo->mkey);
+ umr_mkey);
}
static void mlx5e_init_frags_partition(struct mlx5e_rq *rq)
@@ -757,6 +715,35 @@ static int mlx5e_init_rxq_rq(struct mlx5e_channel *c, struct mlx5e_params *param
xdp_frag_size);
}
+static int mlx5e_rq_shampo_hd_info_alloc(struct mlx5e_rq *rq, u16 hd_per_wq,
+ int node)
+{
+ struct mlx5e_shampo_hd *shampo = rq->mpwqe.shampo;
+
+ shampo->hd_per_wq = hd_per_wq;
+
+ shampo->bitmap = bitmap_zalloc_node(hd_per_wq, GFP_KERNEL, node);
+ shampo->pages = kvzalloc_node(array_size(hd_per_wq,
+ sizeof(*shampo->pages)),
+ GFP_KERNEL, node);
+ if (!shampo->bitmap || !shampo->pages)
+ goto err_nomem;
+
+ return 0;
+
+err_nomem:
+ kvfree(shampo->pages);
+ bitmap_free(shampo->bitmap);
+
+ return -ENOMEM;
+}
+
+static void mlx5e_rq_shampo_hd_info_free(struct mlx5e_rq *rq)
+{
+ kvfree(rq->mpwqe.shampo->pages);
+ bitmap_free(rq->mpwqe.shampo->bitmap);
+}
+
static int mlx5_rq_shampo_alloc(struct mlx5_core_dev *mdev,
struct mlx5e_params *params,
struct mlx5e_rq_param *rqp,
@@ -764,42 +751,52 @@ static int mlx5_rq_shampo_alloc(struct mlx5_core_dev *mdev,
u32 *pool_size,
int node)
{
+ void *wqc = MLX5_ADDR_OF(rqc, rqp->rqc, wq);
+ u16 hd_per_wq;
+ int wq_size;
int err;
if (!test_bit(MLX5E_RQ_STATE_SHAMPO, &rq->state))
return 0;
- err = mlx5e_rq_shampo_hd_alloc(rq, node);
- if (err)
- goto out;
- rq->mpwqe.shampo->hd_per_wq =
- mlx5e_shampo_hd_per_wq(mdev, params, rqp);
- err = mlx5e_create_rq_hd_umr_mkey(mdev, rq);
+
+ rq->mpwqe.shampo = kvzalloc_node(sizeof(*rq->mpwqe.shampo),
+ GFP_KERNEL, node);
+ if (!rq->mpwqe.shampo)
+ return -ENOMEM;
+
+ /* split headers data structures */
+ hd_per_wq = mlx5e_shampo_hd_per_wq(mdev, params, rqp);
+ err = mlx5e_rq_shampo_hd_info_alloc(rq, hd_per_wq, node);
if (err)
- goto err_shampo_hd;
- err = mlx5e_rq_shampo_hd_info_alloc(rq, node);
+ goto err_shampo_hd_info_alloc;
+
+ err = mlx5e_create_rq_hd_umr_mkey(mdev, hd_per_wq,
+ &rq->mpwqe.shampo->mkey);
if (err)
- goto err_shampo_info;
+ goto err_umr_mkey;
+
+ rq->mpwqe.shampo->key = cpu_to_be32(rq->mpwqe.shampo->mkey);
+ rq->mpwqe.shampo->hd_per_wqe =
+ mlx5e_shampo_hd_per_wqe(mdev, params, rqp);
+ wq_size = BIT(MLX5_GET(wq, wqc, log_wq_sz));
+ *pool_size += (rq->mpwqe.shampo->hd_per_wqe * wq_size) /
+ MLX5E_SHAMPO_WQ_HEADER_PER_PAGE;
+
+ /* gro only data structures */
rq->hw_gro_data = kvzalloc_node(sizeof(*rq->hw_gro_data), GFP_KERNEL, node);
if (!rq->hw_gro_data) {
err = -ENOMEM;
goto err_hw_gro_data;
}
- rq->mpwqe.shampo->key =
- cpu_to_be32(rq->mpwqe.shampo->mkey);
- rq->mpwqe.shampo->hd_per_wqe =
- mlx5e_shampo_hd_per_wqe(mdev, params, rqp);
- rq->mpwqe.shampo->pages_per_wq =
- rq->mpwqe.shampo->hd_per_wq / MLX5E_SHAMPO_WQ_HEADER_PER_PAGE;
- *pool_size += rq->mpwqe.shampo->pages_per_wq;
+
return 0;
err_hw_gro_data:
- mlx5e_rq_shampo_hd_info_free(rq);
-err_shampo_info:
mlx5_core_destroy_mkey(mdev, rq->mpwqe.shampo->mkey);
-err_shampo_hd:
- mlx5e_rq_shampo_hd_free(rq);
-out:
+err_umr_mkey:
+ mlx5e_rq_shampo_hd_info_free(rq);
+err_shampo_hd_info_alloc:
+ kvfree(rq->mpwqe.shampo);
return err;
}
@@ -811,7 +808,7 @@ static void mlx5e_rq_free_shampo(struct mlx5e_rq *rq)
kvfree(rq->hw_gro_data);
mlx5e_rq_shampo_hd_info_free(rq);
mlx5_core_destroy_mkey(rq->mdev, rq->mpwqe.shampo->mkey);
- mlx5e_rq_shampo_hd_free(rq);
+ kvfree(rq->mpwqe.shampo);
}
static int mlx5e_alloc_rq(struct mlx5e_params *params,
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,113 @@
From 675c166094cf502d5b037ae4a692e7c5c6f6f9fa Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 18:03:18 -0400
Subject: [PATCH] net/mlx5e: SHAMPO: Remove redundant params
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit 16142defd304d5a8e591781efe24da498ccfa51f
Author: Saeed Mahameed <saeedm@nvidia.com>
Date: Mon Jun 16 17:14:34 2025 +0300
net/mlx5e: SHAMPO: Remove redundant params
Two SHAMPO params are static and always the same, remove them from the
global mlx5e_params struct.
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Reviewed-by: Dragos Tatulea <dtatulea@nvidia.com>
Signed-off-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Link: https://patch.msgid.link/20250616141441.1243044-6-mbloch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 211ea429ea89..581eef34f512 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -278,10 +278,6 @@ enum packet_merge {
struct mlx5e_packet_merge_param {
enum packet_merge type;
u32 timeout;
- struct {
- u8 match_criteria_type;
- u8 alignment_granularity;
- } shampo;
};
struct mlx5e_params {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
index 58ec5e44aa7a..fc945bce933a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
@@ -901,6 +901,7 @@ int mlx5e_build_rq_param(struct mlx5_core_dev *mdev,
{
void *rqc = param->rqc;
void *wq = MLX5_ADDR_OF(rqc, rqc, wq);
+ u32 lro_timeout;
int ndsegs = 1;
int err;
@@ -926,22 +927,25 @@ int mlx5e_build_rq_param(struct mlx5_core_dev *mdev,
MLX5_SET(wq, wq, log_wqe_stride_size,
log_wqe_stride_size - MLX5_MPWQE_LOG_STRIDE_SZ_BASE);
MLX5_SET(wq, wq, log_wq_sz, mlx5e_mpwqe_get_log_rq_size(mdev, params, xsk));
- if (params->packet_merge.type == MLX5E_PACKET_MERGE_SHAMPO) {
- MLX5_SET(wq, wq, shampo_enable, true);
- MLX5_SET(wq, wq, log_reservation_size,
- mlx5e_shampo_get_log_rsrv_size(mdev, params));
- MLX5_SET(wq, wq,
- log_max_num_of_packets_per_reservation,
- mlx5e_shampo_get_log_pkt_per_rsrv(mdev, params));
- MLX5_SET(wq, wq, log_headers_entry_size,
- mlx5e_shampo_get_log_hd_entry_size(mdev, params));
- MLX5_SET(rqc, rqc, reservation_timeout,
- mlx5e_choose_lro_timeout(mdev, MLX5E_DEFAULT_SHAMPO_TIMEOUT));
- MLX5_SET(rqc, rqc, shampo_match_criteria_type,
- params->packet_merge.shampo.match_criteria_type);
- MLX5_SET(rqc, rqc, shampo_no_match_alignment_granularity,
- params->packet_merge.shampo.alignment_granularity);
- }
+ if (params->packet_merge.type != MLX5E_PACKET_MERGE_SHAMPO)
+ break;
+
+ MLX5_SET(wq, wq, shampo_enable, true);
+ MLX5_SET(wq, wq, log_reservation_size,
+ mlx5e_shampo_get_log_rsrv_size(mdev, params));
+ MLX5_SET(wq, wq,
+ log_max_num_of_packets_per_reservation,
+ mlx5e_shampo_get_log_pkt_per_rsrv(mdev, params));
+ MLX5_SET(wq, wq, log_headers_entry_size,
+ mlx5e_shampo_get_log_hd_entry_size(mdev, params));
+ lro_timeout =
+ mlx5e_choose_lro_timeout(mdev,
+ MLX5E_DEFAULT_SHAMPO_TIMEOUT);
+ MLX5_SET(rqc, rqc, reservation_timeout, lro_timeout);
+ MLX5_SET(rqc, rqc, shampo_match_criteria_type,
+ MLX5_RQC_SHAMPO_MATCH_CRITERIA_TYPE_EXTENDED);
+ MLX5_SET(rqc, rqc, shampo_no_match_alignment_granularity,
+ MLX5_RQC_SHAMPO_NO_MATCH_ALIGNMENT_GRANULARITY_STRIDE);
break;
}
default: /* MLX5_WQ_TYPE_CYCLIC */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index a074f1eac3f4..4809fc9e3522 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -4026,10 +4026,6 @@ static int set_feature_hw_gro(struct net_device *netdev, bool enable)
if (enable) {
new_params.packet_merge.type = MLX5E_PACKET_MERGE_SHAMPO;
- new_params.packet_merge.shampo.match_criteria_type =
- MLX5_RQC_SHAMPO_MATCH_CRITERIA_TYPE_EXTENDED;
- new_params.packet_merge.shampo.alignment_granularity =
- MLX5_RQC_SHAMPO_NO_MATCH_ALIGNMENT_GRANULARITY_STRIDE;
} else if (new_params.packet_merge.type == MLX5E_PACKET_MERGE_SHAMPO) {
new_params.packet_merge.type = MLX5E_PACKET_MERGE_NONE;
} else {
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,68 @@
From ce0ae8e829ab57ae2acd173f5053cf63c391a4e1 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 18:03:18 -0400
Subject: [PATCH] net/mlx5e: SHAMPO: Improve hw gro capability checking
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit d2760abdedde635b055a214b3a45dce3e4ecbfce
Author: Saeed Mahameed <saeedm@nvidia.com>
Date: Mon Jun 16 17:14:35 2025 +0300
net/mlx5e: SHAMPO: Improve hw gro capability checking
Add missing HW capabilities, declare the feature in
netdev->vlan_features, similar to other features in mlx5e_build_nic_netdev.
No functional change here as all by default disabled features are
explicitly disabled at the bottom of the function.
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Reviewed-by: Dragos Tatulea <dtatulea@nvidia.com>
Signed-off-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Link: https://patch.msgid.link/20250616141441.1243044-7-mbloch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 4809fc9e3522..e552dcf8f13a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -77,7 +77,8 @@
static bool mlx5e_hw_gro_supported(struct mlx5_core_dev *mdev)
{
- if (!MLX5_CAP_GEN(mdev, shampo))
+ if (!MLX5_CAP_GEN(mdev, shampo) ||
+ !MLX5_CAP_SHAMPO(mdev, shampo_header_split_data_merge))
return false;
/* Our HW-GRO implementation relies on "KSM Mkey" for
@@ -5489,17 +5490,17 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev)
MLX5E_MPWRQ_UMR_MODE_ALIGNED))
netdev->vlan_features |= NETIF_F_LRO;
+ if (mlx5e_hw_gro_supported(mdev) &&
+ mlx5e_check_fragmented_striding_rq_cap(mdev, PAGE_SHIFT,
+ MLX5E_MPWRQ_UMR_MODE_ALIGNED))
+ netdev->vlan_features |= NETIF_F_GRO_HW;
+
netdev->hw_features = netdev->vlan_features;
netdev->hw_features |= NETIF_F_HW_VLAN_CTAG_TX;
netdev->hw_features |= NETIF_F_HW_VLAN_CTAG_RX;
netdev->hw_features |= NETIF_F_HW_VLAN_CTAG_FILTER;
netdev->hw_features |= NETIF_F_HW_VLAN_STAG_TX;
- if (mlx5e_hw_gro_supported(mdev) &&
- mlx5e_check_fragmented_striding_rq_cap(mdev, PAGE_SHIFT,
- MLX5E_MPWRQ_UMR_MODE_ALIGNED))
- netdev->hw_features |= NETIF_F_GRO_HW;
-
if (mlx5e_tunnel_any_tx_proto_supported(mdev)) {
netdev->hw_enc_features |= NETIF_F_HW_CSUM;
netdev->hw_enc_features |= NETIF_F_TSO;
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,304 @@
From 65f74d0b4614133bc8ed318ed25a4532182f50fc Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 18:03:18 -0400
Subject: [PATCH] net/mlx5e: SHAMPO: Separate pool for headers
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit e225d9bd93ed0bb84014f5f8e241e8e456533e30
Author: Saeed Mahameed <saeedm@nvidia.com>
Date: Mon Jun 16 17:14:36 2025 +0300
net/mlx5e: SHAMPO: Separate pool for headers
Allow allocating a separate page pool for headers when SHAMPO is on.
This will be useful for adding support to zc page pool, which has to be
different from the headers page pool.
For now, the pools are the same.
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Reviewed-by: Dragos Tatulea <dtatulea@nvidia.com>
Signed-off-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Link: https://patch.msgid.link/20250616141441.1243044-8-mbloch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 581eef34f512..c329de1d4f0a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -716,7 +716,11 @@ struct mlx5e_rq {
struct bpf_prog __rcu *xdp_prog;
struct mlx5e_xdpsq *xdpsq;
DECLARE_BITMAP(flags, 8);
+
+ /* page pools */
struct page_pool *page_pool;
+ struct page_pool *hd_page_pool;
+
struct mlx5e_xdp_buff mxbuf;
/* AF_XDP zero-copy */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index e552dcf8f13a..59e845367cfd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -40,6 +40,7 @@
#include <linux/if_bridge.h>
#include <linux/filter.h>
#include <net/netdev_queues.h>
+#include <net/netdev_rx_queue.h>
#include <net/page_pool/types.h>
#include <net/pkt_sched.h>
#include <net/xdp_sock_drv.h>
@@ -745,6 +746,11 @@ static void mlx5e_rq_shampo_hd_info_free(struct mlx5e_rq *rq)
bitmap_free(rq->mpwqe.shampo->bitmap);
}
+static bool mlx5_rq_needs_separate_hd_pool(struct mlx5e_rq *rq)
+{
+ return false;
+}
+
static int mlx5_rq_shampo_alloc(struct mlx5_core_dev *mdev,
struct mlx5e_params *params,
struct mlx5e_rq_param *rqp,
@@ -753,6 +759,7 @@ static int mlx5_rq_shampo_alloc(struct mlx5_core_dev *mdev,
int node)
{
void *wqc = MLX5_ADDR_OF(rqc, rqp->rqc, wq);
+ u32 hd_pool_size;
u16 hd_per_wq;
int wq_size;
int err;
@@ -780,8 +787,34 @@ static int mlx5_rq_shampo_alloc(struct mlx5_core_dev *mdev,
rq->mpwqe.shampo->hd_per_wqe =
mlx5e_shampo_hd_per_wqe(mdev, params, rqp);
wq_size = BIT(MLX5_GET(wq, wqc, log_wq_sz));
- *pool_size += (rq->mpwqe.shampo->hd_per_wqe * wq_size) /
- MLX5E_SHAMPO_WQ_HEADER_PER_PAGE;
+ hd_pool_size = (rq->mpwqe.shampo->hd_per_wqe * wq_size) /
+ MLX5E_SHAMPO_WQ_HEADER_PER_PAGE;
+
+ if (mlx5_rq_needs_separate_hd_pool(rq)) {
+ /* Separate page pool for shampo headers */
+ struct page_pool_params pp_params = { };
+
+ pp_params.order = 0;
+ pp_params.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV;
+ pp_params.pool_size = hd_pool_size;
+ pp_params.nid = node;
+ pp_params.dev = rq->pdev;
+ pp_params.napi = rq->cq.napi;
+ pp_params.netdev = rq->netdev;
+ pp_params.dma_dir = rq->buff.map_dir;
+ pp_params.max_len = PAGE_SIZE;
+
+ rq->hd_page_pool = page_pool_create(&pp_params);
+ if (IS_ERR(rq->hd_page_pool)) {
+ err = PTR_ERR(rq->hd_page_pool);
+ rq->hd_page_pool = NULL;
+ goto err_hds_page_pool;
+ }
+ } else {
+ /* Common page pool, reserve space for headers. */
+ *pool_size += hd_pool_size;
+ rq->hd_page_pool = NULL;
+ }
/* gro only data structures */
rq->hw_gro_data = kvzalloc_node(sizeof(*rq->hw_gro_data), GFP_KERNEL, node);
@@ -793,6 +826,8 @@ static int mlx5_rq_shampo_alloc(struct mlx5_core_dev *mdev,
return 0;
err_hw_gro_data:
+ page_pool_destroy(rq->hd_page_pool);
+err_hds_page_pool:
mlx5_core_destroy_mkey(mdev, rq->mpwqe.shampo->mkey);
err_umr_mkey:
mlx5e_rq_shampo_hd_info_free(rq);
@@ -807,6 +842,8 @@ static void mlx5e_rq_free_shampo(struct mlx5e_rq *rq)
return;
kvfree(rq->hw_gro_data);
+ if (rq->hd_page_pool != rq->page_pool)
+ page_pool_destroy(rq->hd_page_pool);
mlx5e_rq_shampo_hd_info_free(rq);
mlx5_core_destroy_mkey(rq->mdev, rq->mpwqe.shampo->mkey);
kvfree(rq->mpwqe.shampo);
@@ -938,6 +975,8 @@ static int mlx5e_alloc_rq(struct mlx5e_params *params,
rq->page_pool = NULL;
goto err_free_by_rq_type;
}
+ if (!rq->hd_page_pool)
+ rq->hd_page_pool = rq->page_pool;
if (xdp_rxq_info_is_reg(&rq->xdp_rxq))
err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
MEM_TYPE_PAGE_POOL, rq->page_pool);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 382679838113..36a4780332d7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -273,12 +273,12 @@ static inline u32 mlx5e_decompress_cqes_start(struct mlx5e_rq *rq,
#define MLX5E_PAGECNT_BIAS_MAX (PAGE_SIZE / 64)
-static int mlx5e_page_alloc_fragmented(struct mlx5e_rq *rq,
+static int mlx5e_page_alloc_fragmented(struct page_pool *pool,
struct mlx5e_frag_page *frag_page)
{
struct page *page;
- page = page_pool_dev_alloc_pages(rq->page_pool);
+ page = page_pool_dev_alloc_pages(pool);
if (unlikely(!page))
return -ENOMEM;
@@ -292,14 +292,14 @@ static int mlx5e_page_alloc_fragmented(struct mlx5e_rq *rq,
return 0;
}
-static void mlx5e_page_release_fragmented(struct mlx5e_rq *rq,
+static void mlx5e_page_release_fragmented(struct page_pool *pool,
struct mlx5e_frag_page *frag_page)
{
u16 drain_count = MLX5E_PAGECNT_BIAS_MAX - frag_page->frags;
struct page *page = frag_page->page;
if (page_pool_unref_page(page, drain_count) == 0)
- page_pool_put_unrefed_page(rq->page_pool, page, -1, true);
+ page_pool_put_unrefed_page(pool, page, -1, true);
}
static inline int mlx5e_get_rx_frag(struct mlx5e_rq *rq,
@@ -313,7 +313,8 @@ static inline int mlx5e_get_rx_frag(struct mlx5e_rq *rq,
* offset) should just use the new one without replenishing again
* by themselves.
*/
- err = mlx5e_page_alloc_fragmented(rq, frag->frag_page);
+ err = mlx5e_page_alloc_fragmented(rq->page_pool,
+ frag->frag_page);
return err;
}
@@ -332,7 +333,7 @@ static inline void mlx5e_put_rx_frag(struct mlx5e_rq *rq,
struct mlx5e_wqe_frag_info *frag)
{
if (mlx5e_frag_can_release(frag))
- mlx5e_page_release_fragmented(rq, frag->frag_page);
+ mlx5e_page_release_fragmented(rq->page_pool, frag->frag_page);
}
static inline struct mlx5e_wqe_frag_info *get_frag(struct mlx5e_rq *rq, u16 ix)
@@ -586,7 +587,8 @@ mlx5e_free_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi)
struct mlx5e_frag_page *frag_page;
frag_page = &wi->alloc_units.frag_pages[i];
- mlx5e_page_release_fragmented(rq, frag_page);
+ mlx5e_page_release_fragmented(rq->page_pool,
+ frag_page);
}
}
}
@@ -681,11 +683,10 @@ static int mlx5e_build_shampo_hd_umr(struct mlx5e_rq *rq,
struct mlx5e_frag_page *frag_page = mlx5e_shampo_hd_to_frag_page(rq, index);
u64 addr;
- err = mlx5e_page_alloc_fragmented(rq, frag_page);
+ err = mlx5e_page_alloc_fragmented(rq->hd_page_pool, frag_page);
if (unlikely(err))
goto err_unmap;
-
addr = page_pool_get_dma_addr(frag_page->page);
for (int j = 0; j < MLX5E_SHAMPO_WQ_HEADER_PER_PAGE; j++) {
@@ -717,7 +718,8 @@ static int mlx5e_build_shampo_hd_umr(struct mlx5e_rq *rq,
if (!header_offset) {
struct mlx5e_frag_page *frag_page = mlx5e_shampo_hd_to_frag_page(rq, index);
- mlx5e_page_release_fragmented(rq, frag_page);
+ mlx5e_page_release_fragmented(rq->hd_page_pool,
+ frag_page);
}
}
@@ -793,7 +795,7 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
for (i = 0; i < rq->mpwqe.pages_per_wqe; i++, frag_page++) {
dma_addr_t addr;
- err = mlx5e_page_alloc_fragmented(rq, frag_page);
+ err = mlx5e_page_alloc_fragmented(rq->page_pool, frag_page);
if (unlikely(err))
goto err_unmap;
addr = page_pool_get_dma_addr(frag_page->page);
@@ -838,7 +840,7 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
err_unmap:
while (--i >= 0) {
frag_page--;
- mlx5e_page_release_fragmented(rq, frag_page);
+ mlx5e_page_release_fragmented(rq->page_pool, frag_page);
}
bitmap_fill(wi->skip_release_bitmap, rq->mpwqe.pages_per_wqe);
@@ -857,7 +859,7 @@ mlx5e_free_rx_shampo_hd_entry(struct mlx5e_rq *rq, u16 header_index)
if (((header_index + 1) & (MLX5E_SHAMPO_WQ_HEADER_PER_PAGE - 1)) == 0) {
struct mlx5e_frag_page *frag_page = mlx5e_shampo_hd_to_frag_page(rq, header_index);
- mlx5e_page_release_fragmented(rq, frag_page);
+ mlx5e_page_release_fragmented(rq->hd_page_pool, frag_page);
}
clear_bit(header_index, shampo->bitmap);
}
@@ -1102,6 +1104,8 @@ INDIRECT_CALLABLE_SCOPE bool mlx5e_post_rx_mpwqes(struct mlx5e_rq *rq)
if (rq->page_pool)
page_pool_nid_changed(rq->page_pool, numa_mem_id());
+ if (rq->hd_page_pool)
+ page_pool_nid_changed(rq->hd_page_pool, numa_mem_id());
head = rq->mpwqe.actual_wq_head;
i = missing;
@@ -2010,7 +2014,8 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w
if (prog) {
/* area for bpf_xdp_[store|load]_bytes */
net_prefetchw(page_address(frag_page->page) + frag_offset);
- if (unlikely(mlx5e_page_alloc_fragmented(rq, &wi->linear_page))) {
+ if (unlikely(mlx5e_page_alloc_fragmented(rq->page_pool,
+ &wi->linear_page))) {
rq->stats->buff_alloc_err++;
return NULL;
}
@@ -2074,7 +2079,8 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w
wi->linear_page.frags++;
}
- mlx5e_page_release_fragmented(rq, &wi->linear_page);
+ mlx5e_page_release_fragmented(rq->page_pool,
+ &wi->linear_page);
return NULL; /* page/packet was consumed by XDP */
}
@@ -2083,13 +2089,14 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w
mxbuf->xdp.data - mxbuf->xdp.data_hard_start, 0,
mxbuf->xdp.data - mxbuf->xdp.data_meta);
if (unlikely(!skb)) {
- mlx5e_page_release_fragmented(rq, &wi->linear_page);
+ mlx5e_page_release_fragmented(rq->page_pool,
+ &wi->linear_page);
return NULL;
}
skb_mark_for_recycle(skb);
wi->linear_page.frags++;
- mlx5e_page_release_fragmented(rq, &wi->linear_page);
+ mlx5e_page_release_fragmented(rq->page_pool, &wi->linear_page);
if (xdp_buff_has_frags(&mxbuf->xdp)) {
struct mlx5e_frag_page *pagep;
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,150 @@
From d17c81e1bebf992831b10750cc1ee7c6fdc04339 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 18:03:18 -0400
Subject: [PATCH] net/mlx5e: Implement queue mgmt ops and single channel swap
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit b2588ea40ec9472688289c1a644627c0f4a1f33f
Author: Saeed Mahameed <saeedm@nvidia.com>
Date: Mon Jun 16 17:14:39 2025 +0300
net/mlx5e: Implement queue mgmt ops and single channel swap
The bulk of the work is done in mlx5e_queue_mem_alloc, where we allocate
and create the new channel resources, similar to
mlx5e_safe_switch_params, but here we do it for a single channel using
existing params, sort of a clone channel.
To swap the old channel with the new one, we deactivate and close the
old channel then replace it with the new one, since the swap procedure
doesn't fail in mlx5, we do it all in one place (mlx5e_queue_start).
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Reviewed-by: Dragos Tatulea <dtatulea@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Acked-by: Mina Almasry <almasrymina@google.com>
Link: https://patch.msgid.link/20250616141441.1243044-11-mbloch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 59e845367cfd..9330d90c1f03 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -5479,6 +5479,103 @@ static const struct netdev_stat_ops mlx5e_stat_ops = {
.get_base_stats = mlx5e_get_base_stats,
};
+struct mlx5_qmgmt_data {
+ struct mlx5e_channel *c;
+ struct mlx5e_channel_param cparam;
+};
+
+static int mlx5e_queue_mem_alloc(struct net_device *dev, void *newq,
+ int queue_index)
+{
+ struct mlx5_qmgmt_data *new = (struct mlx5_qmgmt_data *)newq;
+ struct mlx5e_priv *priv = netdev_priv(dev);
+ struct mlx5e_channels *chs = &priv->channels;
+ struct mlx5e_params params = chs->params;
+ struct mlx5_core_dev *mdev;
+ int err;
+
+ mutex_lock(&priv->state_lock);
+ if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) {
+ err = -ENODEV;
+ goto unlock;
+ }
+
+ if (queue_index >= chs->num) {
+ err = -ERANGE;
+ goto unlock;
+ }
+
+ if (MLX5E_GET_PFLAG(&chs->params, MLX5E_PFLAG_TX_PORT_TS) ||
+ chs->params.ptp_rx ||
+ chs->params.xdp_prog ||
+ priv->htb) {
+ netdev_err(priv->netdev,
+ "Cloning channels with Port/rx PTP, XDP or HTB is not supported\n");
+ err = -EOPNOTSUPP;
+ goto unlock;
+ }
+
+ mdev = mlx5_sd_ch_ix_get_dev(priv->mdev, queue_index);
+ err = mlx5e_build_channel_param(mdev, &params, &new->cparam);
+ if (err)
+ goto unlock;
+
+ err = mlx5e_open_channel(priv, queue_index, &params, NULL, &new->c);
+unlock:
+ mutex_unlock(&priv->state_lock);
+ return err;
+}
+
+static void mlx5e_queue_mem_free(struct net_device *dev, void *mem)
+{
+ struct mlx5_qmgmt_data *data = (struct mlx5_qmgmt_data *)mem;
+
+ /* not supposed to happen since mlx5e_queue_start never fails
+ * but this is how this should be implemented just in case
+ */
+ if (data->c)
+ mlx5e_close_channel(data->c);
+}
+
+static int mlx5e_queue_stop(struct net_device *dev, void *oldq, int queue_index)
+{
+ /* In mlx5 a txq cannot be simply stopped in isolation, only restarted.
+ * mlx5e_queue_start does not fail, we stop the old queue there.
+ * TODO: Improve this.
+ */
+ return 0;
+}
+
+static int mlx5e_queue_start(struct net_device *dev, void *newq,
+ int queue_index)
+{
+ struct mlx5_qmgmt_data *new = (struct mlx5_qmgmt_data *)newq;
+ struct mlx5e_priv *priv = netdev_priv(dev);
+ struct mlx5e_channel *old;
+
+ mutex_lock(&priv->state_lock);
+
+ /* stop and close the old */
+ old = priv->channels.c[queue_index];
+ mlx5e_deactivate_priv_channels(priv);
+ /* close old before activating new, to avoid napi conflict */
+ mlx5e_close_channel(old);
+
+ /* start the new */
+ priv->channels.c[queue_index] = new->c;
+ mlx5e_activate_priv_channels(priv);
+ mutex_unlock(&priv->state_lock);
+ return 0;
+}
+
+static const struct netdev_queue_mgmt_ops mlx5e_queue_mgmt_ops = {
+ .ndo_queue_mem_size = sizeof(struct mlx5_qmgmt_data),
+ .ndo_queue_mem_alloc = mlx5e_queue_mem_alloc,
+ .ndo_queue_mem_free = mlx5e_queue_mem_free,
+ .ndo_queue_start = mlx5e_queue_start,
+ .ndo_queue_stop = mlx5e_queue_stop,
+};
+
static void mlx5e_build_nic_netdev(struct net_device *netdev)
{
struct mlx5e_priv *priv = netdev_priv(netdev);
@@ -5489,6 +5586,7 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev)
SET_NETDEV_DEV(netdev, mdev->device);
netdev->netdev_ops = &mlx5e_netdev_ops;
+ netdev->queue_mgmt_ops = &mlx5e_queue_mgmt_ops;
netdev->xdp_metadata_ops = &mlx5e_xdp_metadata_ops;
netdev->xsk_tx_metadata_ops = &mlx5e_xsk_tx_metadata_ops;
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,141 @@
From ccdbf67ee58fe08fc65b7fa79731868afdf21c63 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Wed, 22 Apr 2026 09:42:15 -0400
Subject: [PATCH] net/mlx5e: Support ethtool tcp-data-split settings
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
Conflicts:
Add "#include <linux/ethtool_netlink.h>" to avoid build failure.
commit 46bcce5dfd330c233e59cd5efd7eb43f049b0a82
Author: Saeed Mahameed <saeedm@nvidia.com>
Date: Mon Jun 16 17:14:40 2025 +0300
net/mlx5e: Support ethtool tcp-data-split settings
In mlx5, tcp header-data split requires HW GRO to be on.
Enabling it fails when HW GRO is off.
mlx5e_fix_features now keeps HW GRO on when tcp data split is enabled.
Finally, when tcp data split is disabled, features are updated to maybe
remove the forced HW GRO.
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Cosmin Ratiu <cratiu@nvidia.com>
Reviewed-by: Dragos Tatulea <dtatulea@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Link: https://patch.msgid.link/20250616141441.1243044-12-mbloch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index e6c9338ddae8..ff0b9ab2daa0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -32,6 +32,7 @@
#include <linux/dim.h>
#include <linux/ethtool_netlink.h>
+#include <net/netdev_queues.h>
#include "en.h"
#include "en/channels.h"
@@ -365,11 +366,6 @@ void mlx5e_ethtool_get_ringparam(struct mlx5e_priv *priv,
param->tx_max_pending = 1 << MLX5E_PARAMS_MAXIMUM_LOG_SQ_SIZE;
param->rx_pending = 1 << priv->channels.params.log_rq_mtu_frames;
param->tx_pending = 1 << priv->channels.params.log_sq_size;
-
- kernel_param->tcp_data_split =
- (priv->channels.params.packet_merge.type == MLX5E_PACKET_MERGE_SHAMPO) ?
- ETHTOOL_TCP_DATA_SPLIT_ENABLED :
- ETHTOOL_TCP_DATA_SPLIT_DISABLED;
}
static void mlx5e_get_ringparam(struct net_device *dev,
@@ -382,6 +378,27 @@ static void mlx5e_get_ringparam(struct net_device *dev,
mlx5e_ethtool_get_ringparam(priv, param, kernel_param);
}
+static bool mlx5e_ethtool_set_tcp_data_split(struct mlx5e_priv *priv,
+ u8 tcp_data_split,
+ struct netlink_ext_ack *extack)
+{
+ struct net_device *dev = priv->netdev;
+
+ if (tcp_data_split == ETHTOOL_TCP_DATA_SPLIT_ENABLED &&
+ !(dev->features & NETIF_F_GRO_HW)) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "TCP-data-split is not supported when GRO HW is disabled");
+ return false;
+ }
+
+ /* Might need to disable HW-GRO if it was kept on due to hds. */
+ if (tcp_data_split == ETHTOOL_TCP_DATA_SPLIT_DISABLED &&
+ dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED)
+ netdev_update_features(priv->netdev);
+
+ return true;
+}
+
int mlx5e_ethtool_set_ringparam(struct mlx5e_priv *priv,
struct ethtool_ringparam *param,
struct netlink_ext_ack *extack)
@@ -440,6 +457,11 @@ static int mlx5e_set_ringparam(struct net_device *dev,
{
struct mlx5e_priv *priv = netdev_priv(dev);
+ if (!mlx5e_ethtool_set_tcp_data_split(priv,
+ kernel_param->tcp_data_split,
+ extack))
+ return -EINVAL;
+
return mlx5e_ethtool_set_ringparam(priv, param, extack);
}
@@ -2645,6 +2667,7 @@ const struct ethtool_ops mlx5e_ethtool_ops = {
ETHTOOL_COALESCE_USE_ADAPTIVE |
ETHTOOL_COALESCE_USE_CQE,
.supported_input_xfrm = RXH_XFRM_SYM_OR_XOR,
+ .supported_ring_params = ETHTOOL_RING_USE_TCP_DATA_SPLIT,
.get_drvinfo = mlx5e_get_drvinfo,
.get_link = ethtool_op_get_link,
.get_link_ext_state = mlx5e_get_link_ext_state,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 9330d90c1f03..4bbf10174fe8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -39,6 +39,7 @@
#include <linux/debugfs.h>
#include <linux/if_bridge.h>
#include <linux/filter.h>
+#include <linux/ethtool_netlink.h>
#include <net/netdev_queues.h>
#include <net/netdev_rx_queue.h>
#include <net/page_pool/types.h>
@@ -4392,6 +4393,7 @@ static netdev_features_t mlx5e_fix_uplink_rep_features(struct net_device *netdev
static netdev_features_t mlx5e_fix_features(struct net_device *netdev,
netdev_features_t features)
{
+ struct netdev_config *cfg = netdev->cfg_pending;
struct mlx5e_priv *priv = netdev_priv(netdev);
struct mlx5e_vlan_table *vlan;
struct mlx5e_params *params;
@@ -4458,6 +4460,13 @@ static netdev_features_t mlx5e_fix_features(struct net_device *netdev,
}
}
+ /* The header-data split ring param requires HW GRO to stay enabled. */
+ if (cfg && cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED &&
+ !(features & NETIF_F_GRO_HW)) {
+ netdev_warn(netdev, "Keeping HW-GRO enabled, TCP header-data split depends on it\n");
+ features |= NETIF_F_GRO_HW;
+ }
+
if (mlx5e_is_uplink_rep(priv)) {
features = mlx5e_fix_uplink_rep_features(netdev, features);
netdev->netns_immutable = true;
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,103 @@
From d243df0a0b8c463819ec69b746ed501da46f66e6 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 18:03:18 -0400
Subject: [PATCH] net/mlx5: fs, add multiple prios to RDMA TRANSPORT steering
domain
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit 52931f55159ea5c27ad4fe66fc0cb8ad75ab795b
Author: Patrisious Haddad <phaddad@nvidia.com>
Date: Tue Jun 17 11:19:15 2025 +0300
net/mlx5: fs, add multiple prios to RDMA TRANSPORT steering domain
RDMA TRANSPORT domains were initially limited to a single priority.
This change allows the domains to have multiple priorities, making
it possible to add several rules and control the order in which
they're evaluated.
Signed-off-by: Patrisious Haddad <phaddad@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Link: https://patch.msgid.link/b299cbb4c8678a33da6e6b6988b5bf6145c54b88.1750148083.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index b29e67466701..2a855e50be95 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -3250,34 +3250,48 @@ static int
init_rdma_transport_rx_root_ns_one(struct mlx5_flow_steering *steering,
int vport_idx)
{
+ struct mlx5_flow_root_namespace *root_ns;
struct fs_prio *prio;
+ int i;
steering->rdma_transport_rx_root_ns[vport_idx] =
create_root_ns(steering, FS_FT_RDMA_TRANSPORT_RX);
if (!steering->rdma_transport_rx_root_ns[vport_idx])
return -ENOMEM;
- /* create 1 prio*/
- prio = fs_create_prio(&steering->rdma_transport_rx_root_ns[vport_idx]->ns,
- MLX5_RDMA_TRANSPORT_BYPASS_PRIO, 1);
- return PTR_ERR_OR_ZERO(prio);
+ root_ns = steering->rdma_transport_rx_root_ns[vport_idx];
+
+ for (i = 0; i < MLX5_RDMA_TRANSPORT_BYPASS_PRIO; i++) {
+ prio = fs_create_prio(&root_ns->ns, i, 1);
+ if (IS_ERR(prio))
+ return PTR_ERR(prio);
+ }
+ set_prio_attrs(root_ns);
+ return 0;
}
static int
init_rdma_transport_tx_root_ns_one(struct mlx5_flow_steering *steering,
int vport_idx)
{
+ struct mlx5_flow_root_namespace *root_ns;
struct fs_prio *prio;
+ int i;
steering->rdma_transport_tx_root_ns[vport_idx] =
create_root_ns(steering, FS_FT_RDMA_TRANSPORT_TX);
if (!steering->rdma_transport_tx_root_ns[vport_idx])
return -ENOMEM;
- /* create 1 prio*/
- prio = fs_create_prio(&steering->rdma_transport_tx_root_ns[vport_idx]->ns,
- MLX5_RDMA_TRANSPORT_BYPASS_PRIO, 1);
- return PTR_ERR_OR_ZERO(prio);
+ root_ns = steering->rdma_transport_tx_root_ns[vport_idx];
+
+ for (i = 0; i < MLX5_RDMA_TRANSPORT_BYPASS_PRIO; i++) {
+ prio = fs_create_prio(&root_ns->ns, i, 1);
+ if (IS_ERR(prio))
+ return PTR_ERR(prio);
+ }
+ set_prio_attrs(root_ns);
+ return 0;
}
static int init_rdma_transport_rx_root_ns(struct mlx5_flow_steering *steering)
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index fb5f98fcc726..6ac76a0c3827 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -40,7 +40,7 @@
#define MLX5_SET_CFG(p, f, v) MLX5_SET(create_flow_group_in, p, f, v)
-#define MLX5_RDMA_TRANSPORT_BYPASS_PRIO 0
+#define MLX5_RDMA_TRANSPORT_BYPASS_PRIO 16
#define MLX5_FS_MAX_POOL_SIZE BIT(30)
enum mlx5_flow_destination_type {
--
2.50.1 (Apple Git-155)

View File

@ -0,0 +1,75 @@
From a8ad8f1ee332d30aea6afc35f434dc416e5b574a Mon Sep 17 00:00:00 2001
From: Kamal Heib <kheib@redhat.com>
Date: Thu, 16 Apr 2026 18:03:18 -0400
Subject: [PATCH] net/mlx5: Small refactor for general object capabilities
JIRA: https://redhat.atlassian.net/browse/RHEL-169055
commit ebf8d47121b6ef3f38425a343a72f37c60fd6dbc
Author: Dragos Tatulea <dtatulea@nvidia.com>
Date: Thu Jun 19 14:37:17 2025 +0300
net/mlx5: Small refactor for general object capabilities
Make enum for capability bits of general object types depend on
the type definitions themselves.
Make sure that capabilities in the [64,127] bit range are
properly calculated (type id - 64).
Signed-off-by: Dragos Tatulea <dtatulea@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Link: https://patch.msgid.link/20250619113721.60201-2-mbloch@nvidia.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Kamal Heib <kheib@redhat.com>
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 9521159b0857..4077f0921039 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -12500,17 +12500,6 @@ struct mlx5_ifc_affiliated_event_header_bits {
u8 obj_id[0x20];
};
-enum {
- MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_ENCRYPTION_KEY = BIT_ULL(0xc),
- MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_IPSEC = BIT_ULL(0x13),
- MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_SAMPLER = BIT_ULL(0x20),
- MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_FLOW_METER_ASO = BIT_ULL(0x24),
-};
-
-enum {
- MLX5_HCA_CAP_2_GENERAL_OBJECT_TYPES_RDMA_CTRL = BIT_ULL(0x13),
-};
-
enum {
MLX5_GENERAL_OBJECT_TYPES_ENCRYPTION_KEY = 0xc,
MLX5_GENERAL_OBJECT_TYPES_IPSEC = 0x13,
@@ -12522,6 +12511,22 @@ enum {
MLX5_GENERAL_OBJECT_TYPES_FLOW_TABLE_ALIAS = 0xff15,
};
+enum {
+ MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_ENCRYPTION_KEY =
+ BIT_ULL(MLX5_GENERAL_OBJECT_TYPES_ENCRYPTION_KEY),
+ MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_IPSEC =
+ BIT_ULL(MLX5_GENERAL_OBJECT_TYPES_IPSEC),
+ MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_SAMPLER =
+ BIT_ULL(MLX5_GENERAL_OBJECT_TYPES_SAMPLER),
+ MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_FLOW_METER_ASO =
+ BIT_ULL(MLX5_GENERAL_OBJECT_TYPES_FLOW_METER_ASO),
+};
+
+enum {
+ MLX5_HCA_CAP_2_GENERAL_OBJECT_TYPES_RDMA_CTRL =
+ BIT_ULL(MLX5_GENERAL_OBJECT_TYPES_RDMA_CTRL - 0x40),
+};
+
enum {
MLX5_IPSEC_OBJECT_ICV_LEN_16B,
};
--
2.50.1 (Apple Git-155)

Some files were not shown because too many files have changed in this diff Show More