diff --git a/COPYING-5.14.0-687.17.1.el9 b/COPYING-5.14.0-687.19.1.el9 similarity index 100% rename from COPYING-5.14.0-687.17.1.el9 rename to COPYING-5.14.0-687.19.1.el9 diff --git a/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/counters.rst b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/counters.rst index 43d72c8b71..cc498895f9 100644 --- a/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/counters.rst +++ b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/counters.rst @@ -1341,3 +1341,40 @@ Device Counters - The number of times the device owned queue had not enough buffers allocated. - Error + + * - `pci_bw_inbound_high` + - The number of times the device crossed the high inbound PCIe bandwidth + threshold. To be compared to pci_bw_inbound_low to check if the device + is in a congested state. + If pci_bw_inbound_high == pci_bw_inbound_low then the device is not congested. + If pci_bw_inbound_high > pci_bw_inbound_low then the device is congested. + - Informative + + * - `pci_bw_inbound_low` + - The number of times the device crossed the low inbound PCIe bandwidth + threshold. To be compared to pci_bw_inbound_high to check if the device + is in a congested state. + If pci_bw_inbound_high == pci_bw_inbound_low then the device is not congested. + If pci_bw_inbound_high > pci_bw_inbound_low then the device is congested. + - Informative + + * - `pci_bw_outbound_high` + - The number of times the device crossed the high outbound PCIe bandwidth + threshold. To be compared to pci_bw_outbound_low to check if the device + is in a congested state. + If pci_bw_outbound_high == pci_bw_outbound_low then the device is not congested. + If pci_bw_outbound_high > pci_bw_outbound_low then the device is congested. + - Informative + + * - `pci_bw_outbound_low` + - The number of times the device crossed the low outbound PCIe bandwidth + threshold.
To be compared to pci_bw_outbound_high to check if the device + is in a congested state. + If pci_bw_outbound_high == pci_bw_outbound_low then the device is not congested. + If pci_bw_outbound_high > pci_bw_outbound_low then the device is congested. + - Informative + + * - `pci_bw_stale_event` + - The number of times the device fired a PCIe congestion event but on query + there was no change in state. + - Informative diff --git a/Documentation/networking/devlink/mlx5.rst b/Documentation/networking/devlink/mlx5.rst index 7febe0aecd..0e5f9c76e5 100644 --- a/Documentation/networking/devlink/mlx5.rst +++ b/Documentation/networking/devlink/mlx5.rst @@ -15,23 +15,62 @@ Parameters * - Name - Mode - Validation + - Notes * - ``enable_roce`` - driverinit - - Type: Boolean - - If the device supports RoCE disablement, RoCE enablement state controls + - Boolean + - If the device supports RoCE disablement, RoCE enablement state controls device support for RoCE capability. Otherwise, the control occurs in the driver stack. When RoCE is disabled at the driver level, only raw ethernet QPs are supported. * - ``io_eq_size`` - driverinit - The range is between 64 and 4096. + - * - ``event_eq_size`` - driverinit - The range is between 64 and 4096. + - * - ``max_macs`` - driverinit - The range is between 1 and 2^31. Only power of 2 values are supported. + - + * - ``enable_sriov`` + - permanent + - Boolean + - Applies to each physical function (PF) independently, if the device + supports it. Otherwise, it applies symmetrically to all PFs. + * - ``total_vfs`` + - permanent + - The range is between 1 and a device-specific max. + - Applies to each physical function (PF) independently, if the device + supports it. Otherwise, it applies symmetrically to all PFs. + +Note: permanent parameters such as ``enable_sriov`` and ``total_vfs`` require FW reset to take effect + +.. 
code-block:: bash + + # setup parameters + devlink dev param set pci/0000:01:00.0 name enable_sriov value true cmode permanent + devlink dev param set pci/0000:01:00.0 name total_vfs value 8 cmode permanent + + # Fw reset + devlink dev reload pci/0000:01:00.0 action fw_activate + + # for PCI related config such as sriov PCI reset/rescan is required: + echo 1 >/sys/bus/pci/devices/0000:01:00.0/remove + echo 1 >/sys/bus/pci/rescan + grep ^ /sys/bus/pci/devices/0000:01:00.0/sriov_* + + * - ``num_doorbells`` + - driverinit + - This controls the number of channel doorbells used by the netdev. In all + cases, an additional doorbell is allocated and used for non-channel + communication (e.g. for PTP, HWS, etc.). Supported values are: + + - 0: No channel-specific doorbells, use the global one for everything. + - [1, max_num_channels]: Spread netdev channels equally across these + doorbells. The ``mlx5`` driver also implements the following driver-specific parameters. @@ -116,6 +155,68 @@ parameters. - u32 - driverinit - Control the size (in packets) of the hairpin queues. + * - ``pcie_cong_inbound_high`` + - u16 + - driverinit + - High threshold configuration for PCIe congestion events. The firmware + will send an event once device side inbound PCIe traffic went + above the configured high threshold for a long enough period (at least + 200ms). + + See pci_bw_inbound_high ethtool stat. + + Units are 0.01 %. Accepted values are in range [0, 10000]. + pcie_cong_inbound_low < pcie_cong_inbound_high. + Default value: 9000 (Corresponds to 90%). + * - ``pcie_cong_inbound_low`` + - u16 + - driverinit + - Low threshold configuration for PCIe congestion events. The firmware + will send an event once device side inbound PCIe traffic went + below the configured low threshold, only after having been previously in + a congested state. + + See pci_bw_inbound_low ethtool stat. + + Units are 0.01 %. Accepted values are in range [0, 10000]. + pcie_cong_inbound_low < pcie_cong_inbound_high. 
+ Default value: 7500. + * - ``pcie_cong_outbound_high`` + - u16 + - driverinit + - High threshold configuration for PCIe congestion events. The firmware + will send an event once device side outbound PCIe traffic went + above the configured high threshold for a long enough period (at least + 200ms). + + See pci_bw_outbound_high ethtool stat. + + Units are 0.01 %. Accepted values are in range [0, 10000]. + pcie_cong_outbound_low < pcie_cong_outbound_high. + Default value: 9000 (Corresponds to 90%). + * - ``pcie_cong_outbound_low`` + - u16 + - driverinit + - Low threshold configuration for PCIe congestion events. The firmware + will send an event once device side outbound PCIe traffic went + below the configured low threshold, only after having been previously in + a congested state. + + See pci_bw_outbound_low ethtool stat. + + Units are 0.01 %. Accepted values are in range [0, 10000]. + pcie_cong_outbound_low < pcie_cong_outbound_high. + Default value: 7500. + + * - ``cqe_compress_type`` + - string + - permanent + - Configure which mechanism/algorithm should be used by the NIC that will + affect the rate (aggressiveness) of compressed CQEs depending on PCIe bus + conditions and other internal NIC factors. This mode affects all queues + that enable compression. + * ``balanced`` : Merges fewer CQEs, resulting in a moderate compression ratio but maintaining a balance between bandwidth savings and performance + * ``aggressive`` : Merges more CQEs into a single entry, achieving a higher compression rate and maximizing performance, particularly under high traffic loads The ``mlx5`` driver supports reloading via ``DEVLINK_CMD_RELOAD`` @@ -284,6 +385,12 @@ Description of the vnic counters: amount of Interconnect Host Memory (ICM) consumed by the vnic in granularity of 4KB. ICM is host memory allocated by SW upon HCA request and is used for storing data structures that control HCA operation. 
+- bar_uar_access + number of WRITE or READ access operations to the UAR on the PCIe BAR. +- odp_local_triggered_page_fault + number of locally-triggered page-faults due to ODP. +- odp_remote_triggered_page_fault + number of remotely-triggered page-faults due to ODP. User commands examples: diff --git a/MAINTAINERS b/MAINTAINERS index 260294985f..b6d545721b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7475,6 +7475,15 @@ F: fs/mnt_idmapping.c F: include/linux/mnt_idmapping.* F: tools/testing/selftests/mount_setattr/ +FILESYSTEMS [STACKABLE] +M: Miklos Szeredi +M: Amir Goldstein +L: linux-fsdevel@vger.kernel.org +L: linux-unionfs@vger.kernel.org +S: Maintained +F: fs/backing-file.c +F: include/linux/backing-file.h + FINTEK F75375S HARDWARE MONITOR AND FAN CONTROLLER DRIVER M: Riku Voipio L: linux-hwmon@vger.kernel.org diff --git a/arch/arc/include/asm/jump_label.h b/arch/arc/include/asm/jump_label.h index 9d96180797..a339223d9e 100644 --- a/arch/arc/include/asm/jump_label.h +++ b/arch/arc/include/asm/jump_label.h @@ -31,7 +31,7 @@ static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { - asm_volatile_goto(".balign "__stringify(JUMP_LABEL_NOP_SIZE)" \n" + asm goto(".balign "__stringify(JUMP_LABEL_NOP_SIZE)" \n" "1: \n" "nop \n" ".pushsection __jump_table, \"aw\" \n" @@ -47,7 +47,7 @@ l_yes: static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch) { - asm_volatile_goto(".balign "__stringify(JUMP_LABEL_NOP_SIZE)" \n" + asm goto(".balign "__stringify(JUMP_LABEL_NOP_SIZE)" \n" "1: \n" "b %l[l_yes] \n" ".pushsection __jump_table, \"aw\" \n" diff --git a/arch/arc/kernel/troubleshoot.c b/arch/arc/kernel/troubleshoot.c index 7654c2e42d..134c48374e 100644 --- a/arch/arc/kernel/troubleshoot.c +++ b/arch/arc/kernel/troubleshoot.c @@ -90,10 +90,12 @@ static void show_faulting_vma(unsigned long address) */ if (vma) { char buf[ARC_PATH_MAX]; - char *nm = "?"; + char *nm = "anon"; if (vma->vm_file) { - nm =
file_path(vma->vm_file, buf, ARC_PATH_MAX-1); + /* XXX: can we use %pD below and get rid of buf? */ + nm = d_path(file_user_path(vma->vm_file), buf, + ARC_PATH_MAX-1); if (IS_ERR(nm)) nm = "?"; } diff --git a/arch/arm/include/asm/jump_label.h b/arch/arm/include/asm/jump_label.h index e12d7d096f..e4eb54f6cd 100644 --- a/arch/arm/include/asm/jump_label.h +++ b/arch/arm/include/asm/jump_label.h @@ -11,7 +11,7 @@ static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { - asm_volatile_goto("1:\n\t" + asm goto("1:\n\t" WASM(nop) "\n\t" ".pushsection __jump_table, \"aw\"\n\t" ".word 1b, %l[l_yes], %c0\n\t" @@ -25,7 +25,7 @@ l_yes: static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch) { - asm_volatile_goto("1:\n\t" + asm goto("1:\n\t" WASM(b) " %l[l_yes]\n\t" ".pushsection __jump_table, \"aw\"\n\t" ".word 1b, %l[l_yes], %c0\n\t" diff --git a/arch/arm64/include/asm/alternative-macros.h b/arch/arm64/include/asm/alternative-macros.h index 210bb43cff..d328f549b1 100644 --- a/arch/arm64/include/asm/alternative-macros.h +++ b/arch/arm64/include/asm/alternative-macros.h @@ -229,7 +229,7 @@ alternative_has_cap_likely(const unsigned long cpucap) if (!cpucap_is_possible(cpucap)) return false; - asm_volatile_goto( + asm goto( ALTERNATIVE_CB("b %l[l_no]", %[cpucap], alt_cb_patch_nops) : : [cpucap] "i" (cpucap) @@ -247,7 +247,7 @@ alternative_has_cap_unlikely(const unsigned long cpucap) if (!cpucap_is_possible(cpucap)) return false; - asm_volatile_goto( + asm goto( ALTERNATIVE("nop", "b %l[l_yes]", %[cpucap]) : : [cpucap] "i" (cpucap) diff --git a/arch/arm64/include/asm/jump_label.h b/arch/arm64/include/asm/jump_label.h index 48ddc0f45d..6aafbb7899 100644 --- a/arch/arm64/include/asm/jump_label.h +++ b/arch/arm64/include/asm/jump_label.h @@ -18,7 +18,7 @@ static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch) { - asm_volatile_goto( + asm goto( "1: nop \n\t" " .pushsection 
__jump_table, \"aw\" \n\t" " .align 3 \n\t" @@ -35,7 +35,7 @@ l_yes: static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch) { - asm_volatile_goto( + asm goto( "1: b %l[l_yes] \n\t" " .pushsection __jump_table, \"aw\" \n\t" " .align 3 \n\t" diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index 9debb2f184..9998f4ce4c 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -627,8 +627,10 @@ static void vgic_its_invalidate_cache(struct vgic_its *its) unsigned long idx; xa_for_each(&its->translation_cache, idx, irq) { - xa_erase(&its->translation_cache, idx); - vgic_put_irq(kvm, irq); + /* Only the context that erases the entry drops its cache ref. */ + irq = xa_erase(&its->translation_cache, idx); + if (irq) + vgic_put_irq(kvm, irq); } } diff --git a/arch/mips/include/asm/jump_label.h b/arch/mips/include/asm/jump_label.h index c5c6864e64..405c85173f 100644 --- a/arch/mips/include/asm/jump_label.h +++ b/arch/mips/include/asm/jump_label.h @@ -36,7 +36,7 @@ static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { - asm_volatile_goto("1:\t" B_INSN " 2f\n\t" + asm goto("1:\t" B_INSN " 2f\n\t" "2:\t.insn\n\t" ".pushsection __jump_table, \"aw\"\n\t" WORD_INSN " 1b, %l[l_yes], %0\n\t" @@ -50,7 +50,7 @@ l_yes: static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch) { - asm_volatile_goto("1:\t" J_INSN " %l[l_yes]\n\t" + asm goto("1:\t" J_INSN " %l[l_yes]\n\t" ".pushsection __jump_table, \"aw\"\n\t" WORD_INSN " 1b, %l[l_yes], %0\n\t" ".popsection\n\t" diff --git a/arch/parisc/include/asm/jump_label.h b/arch/parisc/include/asm/jump_label.h index 7efb1aa2f7..9e06acd0e5 100644 --- a/arch/parisc/include/asm/jump_label.h +++ b/arch/parisc/include/asm/jump_label.h @@ -11,7 +11,7 @@ static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { - asm_volatile_goto("1:\n\t" + asm goto("1:\n\t" 
"nop\n\t" ".pushsection __jump_table, \"aw\"\n\t" ".word 1b - ., %l[l_yes] - .\n\t" @@ -26,7 +26,7 @@ l_yes: static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch) { - asm_volatile_goto("1:\n\t" + asm goto("1:\n\t" "b,n %l[l_yes]\n\t" ".pushsection __jump_table, \"aw\"\n\t" ".word 1b - ., %l[l_yes] - .\n\t" diff --git a/arch/powerpc/include/asm/book3s/64/kup.h b/arch/powerpc/include/asm/book3s/64/kup.h index 23545732dd..c82323b864 100644 --- a/arch/powerpc/include/asm/book3s/64/kup.h +++ b/arch/powerpc/include/asm/book3s/64/kup.h @@ -328,7 +328,7 @@ static inline unsigned long get_kuap(void) return mfspr(SPRN_AMR); } -static inline void set_kuap(unsigned long value) +static __always_inline void set_kuap(unsigned long value) { if (!mmu_has_feature(MMU_FTR_BOOK3S_KUAP)) return; @@ -398,7 +398,7 @@ static __always_inline void allow_user_access(void __user *to, const void __user #endif /* !CONFIG_PPC_KUAP */ -static inline void prevent_user_access(unsigned long dir) +static __always_inline void prevent_user_access(unsigned long dir) { set_kuap(AMR_KUAP_BLOCKED); if (static_branch_unlikely(&uaccess_flush_key)) diff --git a/arch/powerpc/include/asm/jump_label.h b/arch/powerpc/include/asm/jump_label.h index 93ce3ec253..2f2a86ed22 100644 --- a/arch/powerpc/include/asm/jump_label.h +++ b/arch/powerpc/include/asm/jump_label.h @@ -17,7 +17,7 @@ static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { - asm_volatile_goto("1:\n\t" + asm goto("1:\n\t" "nop # arch_static_branch\n\t" ".pushsection __jump_table, \"aw\"\n\t" ".long 1b - ., %l[l_yes] - .\n\t" @@ -32,7 +32,7 @@ l_yes: static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch) { - asm_volatile_goto("1:\n\t" + asm goto("1:\n\t" "b %l[l_yes] # arch_static_branch_jump\n\t" ".pushsection __jump_table, \"aw\"\n\t" ".long 1b - ., %l[l_yes] - .\n\t" diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h 
index fd515d2e7d..1f645841ff 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -73,7 +73,7 @@ __pu_failed: \ * are no aliasing issues. */ #define __put_user_asm_goto(x, addr, label, op) \ - asm_volatile_goto( \ + asm goto( \ "1: " op "%U1%X1 %0,%1 # put_user\n" \ EX_TABLE(1b, %l2) \ : \ @@ -86,7 +86,7 @@ __pu_failed: \ __put_user_asm_goto(x, ptr, label, "std") #else /* __powerpc64__ */ #define __put_user_asm2_goto(x, addr, label) \ - asm_volatile_goto( \ + asm goto( \ "1: stw%X1 %0, %1\n" \ "2: stw%X1 %L0, %L1\n" \ EX_TABLE(1b, %l2) \ @@ -130,7 +130,7 @@ do { \ #ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT #define __get_user_asm_goto(x, addr, label, op) \ - asm_volatile_goto( \ + asm_goto_output( \ "1: "op"%U1%X1 %0, %1 # get_user\n" \ EX_TABLE(1b, %l2) \ : "=r" (x) \ @@ -143,7 +143,7 @@ do { \ __get_user_asm_goto(x, addr, label, "ld") #else /* __powerpc64__ */ #define __get_user_asm2_goto(x, addr, label) \ - asm_volatile_goto( \ + asm_goto_output( \ "1: lwz%X1 %0, %1\n" \ "2: lwz%X1 %L0, %L1\n" \ EX_TABLE(1b, %l2) \ diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 2a1d2f4ada..218f95e560 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -242,7 +242,7 @@ notrace void arch_local_irq_restore(unsigned long mask) * This allows interrupts to be unmasked without hard disabling, and * also without new hard interrupts coming in ahead of pending ones. 
*/ - asm_volatile_goto( + asm goto( "1: \n" " lbz 9,%0(13) \n" " cmpwi 9,0 \n" diff --git a/arch/s390/include/asm/jump_label.h b/arch/s390/include/asm/jump_label.h index 895f774bbc..bf78cf381d 100644 --- a/arch/s390/include/asm/jump_label.h +++ b/arch/s390/include/asm/jump_label.h @@ -25,7 +25,7 @@ */ static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { - asm_volatile_goto("0: brcl 0,%l[label]\n" + asm goto("0: brcl 0,%l[label]\n" ".pushsection __jump_table,\"aw\"\n" ".balign 8\n" ".long 0b-.,%l[label]-.\n" @@ -39,7 +39,7 @@ label: static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch) { - asm_volatile_goto("0: brcl 15,%l[label]\n" + asm goto("0: brcl 15,%l[label]\n" ".pushsection __jump_table,\"aw\"\n" ".balign 8\n" ".long 0b-.,%l[label]-.\n" diff --git a/arch/sparc/include/asm/jump_label.h b/arch/sparc/include/asm/jump_label.h index 94eb529dcb..2718cbea82 100644 --- a/arch/sparc/include/asm/jump_label.h +++ b/arch/sparc/include/asm/jump_label.h @@ -10,7 +10,7 @@ static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { - asm_volatile_goto("1:\n\t" + asm goto("1:\n\t" "nop\n\t" "nop\n\t" ".pushsection __jump_table, \"aw\"\n\t" @@ -26,7 +26,7 @@ l_yes: static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch) { - asm_volatile_goto("1:\n\t" + asm goto("1:\n\t" "b %l[l_yes]\n\t" "nop\n\t" ".pushsection __jump_table, \"aw\"\n\t" diff --git a/arch/um/include/asm/cpufeature.h b/arch/um/include/asm/cpufeature.h index 4b6d1b526b..66fe06db87 100644 --- a/arch/um/include/asm/cpufeature.h +++ b/arch/um/include/asm/cpufeature.h @@ -75,7 +75,7 @@ extern void setup_clear_cpu_cap(unsigned int bit); */ static __always_inline bool _static_cpu_has(u16 bit) { - asm_volatile_goto("1: jmp 6f\n" + asm goto("1: jmp 6f\n" "2:\n" ".skip -(((5f-4f) - (2b-1b)) > 0) * " "((5f-4f) - (2b-1b)),0x90\n" diff --git a/arch/x86/include/asm/jump_label.h 
b/arch/x86/include/asm/jump_label.h index 071572e23d..cbbef32517 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -24,7 +24,7 @@ static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { - asm_volatile_goto("1:" + asm goto("1:" "jmp %l[l_yes] # objtool NOPs this \n\t" JUMP_TABLE_ENTRY : : "i" (key), "i" (2 | branch) : : l_yes); @@ -38,7 +38,7 @@ l_yes: static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch) { - asm_volatile_goto("1:" + asm goto("1:" ".byte " __stringify(BYTES_NOP5) "\n\t" JUMP_TABLE_ENTRY : : "i" (key), "i" (branch) : : l_yes); @@ -52,7 +52,7 @@ l_yes: static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch) { - asm_volatile_goto("1:" + asm goto("1:" "jmp %l[l_yes]\n\t" JUMP_TABLE_ENTRY : : "i" (key), "i" (branch) : : l_yes); diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h index 7fa6112164..1919ccf493 100644 --- a/arch/x86/include/asm/rmwcc.h +++ b/arch/x86/include/asm/rmwcc.h @@ -18,7 +18,7 @@ #define __GEN_RMWcc(fullop, _var, cc, clobbers, ...) 
\ ({ \ bool c = false; \ - asm_volatile_goto (fullop "; j" #cc " %l[cc_label]" \ + asm goto (fullop "; j" #cc " %l[cc_label]" \ : : [var] "m" (_var), ## __VA_ARGS__ \ : clobbers : cc_label); \ if (0) { \ diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 9d45aff761..1c5513b04f 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -205,7 +205,7 @@ static inline void clwb(volatile void *__p) #ifdef CONFIG_X86_USER_SHADOW_STACK static inline int write_user_shstk_64(u64 __user *addr, u64 val) { - asm_volatile_goto("1: wrussq %[val], (%[addr])\n" + asm goto("1: wrussq %[val], (%[addr])\n" _ASM_EXTABLE(1b, %l[fail]) :: [addr] "r" (addr), [val] "r" (val) :: fail); diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 34f500f9b0..3a7755c1a4 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -133,7 +133,7 @@ extern int __get_user_bad(void); #ifdef CONFIG_X86_32 #define __put_user_goto_u64(x, addr, label) \ - asm_volatile_goto("\n" \ + asm goto("\n" \ "1: movl %%eax,0(%1)\n" \ "2: movl %%edx,4(%1)\n" \ _ASM_EXTABLE_UA(1b, %l2) \ @@ -295,7 +295,7 @@ do { \ } while (0) #define __get_user_asm(x, addr, itype, ltype, label) \ - asm_volatile_goto("\n" \ + asm_goto_output("\n" \ "1: mov"itype" %[umem],%[output]\n" \ _ASM_EXTABLE_UA(1b, %l2) \ : [output] ltype(x) \ @@ -375,7 +375,7 @@ do { \ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \ __typeof__(*(_ptr)) __old = *_old; \ __typeof__(*(_ptr)) __new = (_new); \ - asm_volatile_goto("\n" \ + asm_goto_output("\n" \ "1: " LOCK_PREFIX "cmpxchg"itype" %[new], %[ptr]\n"\ _ASM_EXTABLE_UA(1b, %l[label]) \ : CC_OUT(z) (success), \ @@ -394,7 +394,7 @@ do { \ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \ __typeof__(*(_ptr)) __old = *_old; \ __typeof__(*(_ptr)) __new = (_new); \ - asm_volatile_goto("\n" \ + asm_goto_output("\n" \ "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n" \ 
_ASM_EXTABLE_UA(1b, %l[label]) \ : CC_OUT(z) (success), \ @@ -477,7 +477,7 @@ struct __large_struct { unsigned long buf[100]; }; * aliasing issues. */ #define __put_user_goto(x, addr, itype, ltype, label) \ - asm_volatile_goto("\n" \ + asm goto("\n" \ "1: mov"itype" %0,%1\n" \ _ASM_EXTABLE_UA(1b, %l2) \ : : ltype(x), "m" (__m(addr)) \ diff --git a/arch/x86/kvm/svm/svm_ops.h b/arch/x86/kvm/svm/svm_ops.h index 36c8af87a7..4e725854c6 100644 --- a/arch/x86/kvm/svm/svm_ops.h +++ b/arch/x86/kvm/svm/svm_ops.h @@ -8,7 +8,7 @@ #define svm_asm(insn, clobber...) \ do { \ - asm_volatile_goto("1: " __stringify(insn) "\n\t" \ + asm goto("1: " __stringify(insn) "\n\t" \ _ASM_EXTABLE(1b, %l[fault]) \ ::: clobber : fault); \ return; \ @@ -18,7 +18,7 @@ fault: \ #define svm_asm1(insn, op1, clobber...) \ do { \ - asm_volatile_goto("1: " __stringify(insn) " %0\n\t" \ + asm goto("1: " __stringify(insn) " %0\n\t" \ _ASM_EXTABLE(1b, %l[fault]) \ :: op1 : clobber : fault); \ return; \ @@ -28,7 +28,7 @@ fault: \ #define svm_asm2(insn, op1, op2, clobber...) 
\ do { \ - asm_volatile_goto("1: " __stringify(insn) " %1, %0\n\t" \ + asm goto("1: " __stringify(insn) " %1, %0\n\t" \ _ASM_EXTABLE(1b, %l[fault]) \ :: op1, op2 : clobber : fault); \ return; \ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 09fc5c6d13..bc94324bf7 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -745,7 +745,7 @@ static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx, */ static int kvm_cpu_vmxoff(void) { - asm_volatile_goto("1: vmxoff\n\t" + asm goto("1: vmxoff\n\t" _ASM_EXTABLE(1b, %l[fault]) ::: "cc", "memory" : fault); @@ -2807,7 +2807,7 @@ static int kvm_cpu_vmxon(u64 vmxon_pointer) cr4_set_bits(X86_CR4_VMXE); - asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t" + asm goto("1: vmxon %[vmxon_pointer]\n\t" _ASM_EXTABLE(1b, %l[fault]) : : [vmxon_pointer] "m"(vmxon_pointer) : : fault); diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index f41ce3c241..8060e5fc6d 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -94,7 +94,7 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field) #ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT - asm_volatile_goto("1: vmread %[field], %[output]\n\t" + asm_goto_output("1: vmread %[field], %[output]\n\t" "jna %l[do_fail]\n\t" _ASM_EXTABLE(1b, %l[do_exception]) @@ -188,7 +188,7 @@ static __always_inline unsigned long vmcs_readl(unsigned long field) #define vmx_asm1(insn, op1, error_args...) \ do { \ - asm_volatile_goto("1: " __stringify(insn) " %0\n\t" \ + asm goto("1: " __stringify(insn) " %0\n\t" \ ".byte 0x2e\n\t" /* branch not taken hint */ \ "jna %l[error]\n\t" \ _ASM_EXTABLE(1b, %l[fault]) \ @@ -205,7 +205,7 @@ fault: \ #define vmx_asm2(insn, op1, op2, error_args...) 
\ do { \ - asm_volatile_goto("1: " __stringify(insn) " %1, %0\n\t" \ + asm goto("1: " __stringify(insn) " %1, %0\n\t" \ ".byte 0x2e\n\t" /* branch not taken hint */ \ "jna %l[error]\n\t" \ _ASM_EXTABLE(1b, %l[fault]) \ diff --git a/arch/xtensa/include/asm/jump_label.h b/arch/xtensa/include/asm/jump_label.h index c812bf8502..46c8596259 100644 --- a/arch/xtensa/include/asm/jump_label.h +++ b/arch/xtensa/include/asm/jump_label.h @@ -13,7 +13,7 @@ static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { - asm_volatile_goto("1:\n\t" + asm goto("1:\n\t" "_nop\n\t" ".pushsection __jump_table, \"aw\"\n\t" ".word 1b, %l[l_yes], %c0\n\t" @@ -38,7 +38,7 @@ static __always_inline bool arch_static_branch_jump(struct static_key *key, * make it reachable and wrap both into a no-transform block * to avoid any assembler interference with this. */ - asm_volatile_goto("1:\n\t" + asm goto("1:\n\t" ".begin no-transform\n\t" "_j %l[l_yes]\n\t" "2:\n\t" diff --git a/configs/kernel-5.14.0-aarch64-64k-debug.config b/configs/kernel-5.14.0-aarch64-64k-debug.config index 0a74e67cb0..33c15607ba 100644 --- a/configs/kernel-5.14.0-aarch64-64k-debug.config +++ b/configs/kernel-5.14.0-aarch64-64k-debug.config @@ -7360,6 +7360,7 @@ CONFIG_HTE_TEGRA194_TEST=m CONFIG_DCACHE_WORD_ACCESS=y # CONFIG_VALIDATE_FS_PARSER is not set CONFIG_FS_IOMAP=y +CONFIG_FS_STACK=y CONFIG_BUFFER_HEAD=y CONFIG_LEGACY_DIRECT_IO=y # CONFIG_EXT2_FS is not set diff --git a/configs/kernel-5.14.0-aarch64-64k.config b/configs/kernel-5.14.0-aarch64-64k.config index 3331e60b37..467b46857e 100644 --- a/configs/kernel-5.14.0-aarch64-64k.config +++ b/configs/kernel-5.14.0-aarch64-64k.config @@ -7337,6 +7337,7 @@ CONFIG_HTE_TEGRA194=m CONFIG_DCACHE_WORD_ACCESS=y # CONFIG_VALIDATE_FS_PARSER is not set CONFIG_FS_IOMAP=y +CONFIG_FS_STACK=y CONFIG_BUFFER_HEAD=y CONFIG_LEGACY_DIRECT_IO=y # CONFIG_EXT2_FS is not set diff --git a/configs/kernel-5.14.0-aarch64-debug.config 
b/configs/kernel-5.14.0-aarch64-debug.config index b22f681e24..b6fbe75e5e 100644 --- a/configs/kernel-5.14.0-aarch64-debug.config +++ b/configs/kernel-5.14.0-aarch64-debug.config @@ -7365,6 +7365,7 @@ CONFIG_HTE_TEGRA194_TEST=m CONFIG_DCACHE_WORD_ACCESS=y # CONFIG_VALIDATE_FS_PARSER is not set CONFIG_FS_IOMAP=y +CONFIG_FS_STACK=y CONFIG_BUFFER_HEAD=y CONFIG_LEGACY_DIRECT_IO=y # CONFIG_EXT2_FS is not set diff --git a/configs/kernel-5.14.0-aarch64-rt-64k-debug.config b/configs/kernel-5.14.0-aarch64-rt-64k-debug.config index cc471d4111..603fff1224 100644 --- a/configs/kernel-5.14.0-aarch64-rt-64k-debug.config +++ b/configs/kernel-5.14.0-aarch64-rt-64k-debug.config @@ -7364,6 +7364,7 @@ CONFIG_HTE_TEGRA194_TEST=m CONFIG_DCACHE_WORD_ACCESS=y # CONFIG_VALIDATE_FS_PARSER is not set CONFIG_FS_IOMAP=y +CONFIG_FS_STACK=y CONFIG_BUFFER_HEAD=y CONFIG_LEGACY_DIRECT_IO=y # CONFIG_EXT2_FS is not set diff --git a/configs/kernel-5.14.0-aarch64-rt-64k.config b/configs/kernel-5.14.0-aarch64-rt-64k.config index 2a2643c17f..7b25c1ac7c 100644 --- a/configs/kernel-5.14.0-aarch64-rt-64k.config +++ b/configs/kernel-5.14.0-aarch64-rt-64k.config @@ -7342,6 +7342,7 @@ CONFIG_HTE_TEGRA194=m CONFIG_DCACHE_WORD_ACCESS=y # CONFIG_VALIDATE_FS_PARSER is not set CONFIG_FS_IOMAP=y +CONFIG_FS_STACK=y CONFIG_BUFFER_HEAD=y CONFIG_LEGACY_DIRECT_IO=y # CONFIG_EXT2_FS is not set diff --git a/configs/kernel-5.14.0-aarch64-rt-debug.config b/configs/kernel-5.14.0-aarch64-rt-debug.config index d6b5fcd0e2..5ccacb314b 100644 --- a/configs/kernel-5.14.0-aarch64-rt-debug.config +++ b/configs/kernel-5.14.0-aarch64-rt-debug.config @@ -7367,6 +7367,7 @@ CONFIG_HTE_TEGRA194_TEST=m CONFIG_DCACHE_WORD_ACCESS=y # CONFIG_VALIDATE_FS_PARSER is not set CONFIG_FS_IOMAP=y +CONFIG_FS_STACK=y CONFIG_BUFFER_HEAD=y CONFIG_LEGACY_DIRECT_IO=y # CONFIG_EXT2_FS is not set diff --git a/configs/kernel-5.14.0-aarch64-rt.config b/configs/kernel-5.14.0-aarch64-rt.config index 6dbd2079a8..087009d5f0 100644 --- 
a/configs/kernel-5.14.0-aarch64-rt.config +++ b/configs/kernel-5.14.0-aarch64-rt.config @@ -7345,6 +7345,7 @@ CONFIG_HTE_TEGRA194=m CONFIG_DCACHE_WORD_ACCESS=y # CONFIG_VALIDATE_FS_PARSER is not set CONFIG_FS_IOMAP=y +CONFIG_FS_STACK=y CONFIG_BUFFER_HEAD=y CONFIG_LEGACY_DIRECT_IO=y # CONFIG_EXT2_FS is not set diff --git a/configs/kernel-5.14.0-aarch64.config b/configs/kernel-5.14.0-aarch64.config index 512f5e976e..78b99656ed 100644 --- a/configs/kernel-5.14.0-aarch64.config +++ b/configs/kernel-5.14.0-aarch64.config @@ -7342,6 +7342,7 @@ CONFIG_HTE_TEGRA194=m CONFIG_DCACHE_WORD_ACCESS=y # CONFIG_VALIDATE_FS_PARSER is not set CONFIG_FS_IOMAP=y +CONFIG_FS_STACK=y CONFIG_BUFFER_HEAD=y CONFIG_LEGACY_DIRECT_IO=y # CONFIG_EXT2_FS is not set diff --git a/configs/kernel-5.14.0-ppc64le-debug.config b/configs/kernel-5.14.0-ppc64le-debug.config index 9f7f188211..c0ae302a4c 100644 --- a/configs/kernel-5.14.0-ppc64le-debug.config +++ b/configs/kernel-5.14.0-ppc64le-debug.config @@ -5757,6 +5757,7 @@ CONFIG_NVMEM_SYSFS=y CONFIG_DCACHE_WORD_ACCESS=y # CONFIG_VALIDATE_FS_PARSER is not set CONFIG_FS_IOMAP=y +CONFIG_FS_STACK=y CONFIG_BUFFER_HEAD=y CONFIG_LEGACY_DIRECT_IO=y # CONFIG_EXT2_FS is not set diff --git a/configs/kernel-5.14.0-ppc64le-kvm-debug.config b/configs/kernel-5.14.0-ppc64le-kvm-debug.config index ad380f6506..af3c1ae444 100644 --- a/configs/kernel-5.14.0-ppc64le-kvm-debug.config +++ b/configs/kernel-5.14.0-ppc64le-kvm-debug.config @@ -5774,6 +5774,7 @@ CONFIG_NVMEM_SYSFS=y CONFIG_DCACHE_WORD_ACCESS=y # CONFIG_VALIDATE_FS_PARSER is not set CONFIG_FS_IOMAP=y +CONFIG_FS_STACK=y CONFIG_BUFFER_HEAD=y CONFIG_LEGACY_DIRECT_IO=y # CONFIG_EXT2_FS is not set diff --git a/configs/kernel-5.14.0-ppc64le-kvm.config b/configs/kernel-5.14.0-ppc64le-kvm.config index bf4bf76e3f..607ae0bc88 100644 --- a/configs/kernel-5.14.0-ppc64le-kvm.config +++ b/configs/kernel-5.14.0-ppc64le-kvm.config @@ -5771,6 +5771,7 @@ CONFIG_NVMEM_SYSFS=y CONFIG_DCACHE_WORD_ACCESS=y # 
CONFIG_VALIDATE_FS_PARSER is not set CONFIG_FS_IOMAP=y +CONFIG_FS_STACK=y CONFIG_BUFFER_HEAD=y CONFIG_LEGACY_DIRECT_IO=y # CONFIG_EXT2_FS is not set diff --git a/configs/kernel-5.14.0-ppc64le.config b/configs/kernel-5.14.0-ppc64le.config index 83f86139f1..6c455066e6 100644 --- a/configs/kernel-5.14.0-ppc64le.config +++ b/configs/kernel-5.14.0-ppc64le.config @@ -5754,6 +5754,7 @@ CONFIG_NVMEM_SYSFS=y CONFIG_DCACHE_WORD_ACCESS=y # CONFIG_VALIDATE_FS_PARSER is not set CONFIG_FS_IOMAP=y +CONFIG_FS_STACK=y CONFIG_BUFFER_HEAD=y CONFIG_LEGACY_DIRECT_IO=y # CONFIG_EXT2_FS is not set diff --git a/configs/kernel-5.14.0-s390x-debug.config b/configs/kernel-5.14.0-s390x-debug.config index 58262f5454..289fdeeff6 100644 --- a/configs/kernel-5.14.0-s390x-debug.config +++ b/configs/kernel-5.14.0-s390x-debug.config @@ -3079,6 +3079,7 @@ CONFIG_NVMEM_SYSFS=y CONFIG_DCACHE_WORD_ACCESS=y # CONFIG_VALIDATE_FS_PARSER is not set CONFIG_FS_IOMAP=y +CONFIG_FS_STACK=y CONFIG_BUFFER_HEAD=y CONFIG_LEGACY_DIRECT_IO=y # CONFIG_EXT2_FS is not set diff --git a/configs/kernel-5.14.0-s390x.config b/configs/kernel-5.14.0-s390x.config index 2b75e563fb..14d59f77bb 100644 --- a/configs/kernel-5.14.0-s390x.config +++ b/configs/kernel-5.14.0-s390x.config @@ -3102,6 +3102,7 @@ CONFIG_NVMEM_SYSFS=y CONFIG_DCACHE_WORD_ACCESS=y # CONFIG_VALIDATE_FS_PARSER is not set CONFIG_FS_IOMAP=y +CONFIG_FS_STACK=y CONFIG_BUFFER_HEAD=y CONFIG_LEGACY_DIRECT_IO=y # CONFIG_EXT2_FS is not set diff --git a/configs/kernel-5.14.0-x86_64-debug.config b/configs/kernel-5.14.0-x86_64-debug.config index f1af46c280..416659d34b 100644 --- a/configs/kernel-5.14.0-x86_64-debug.config +++ b/configs/kernel-5.14.0-x86_64-debug.config @@ -8170,6 +8170,7 @@ CONFIG_INTEL_QEP=m CONFIG_DCACHE_WORD_ACCESS=y # CONFIG_VALIDATE_FS_PARSER is not set CONFIG_FS_IOMAP=y +CONFIG_FS_STACK=y CONFIG_BUFFER_HEAD=y CONFIG_LEGACY_DIRECT_IO=y # CONFIG_EXT2_FS is not set diff --git a/configs/kernel-5.14.0-x86_64-rt-debug.config 
b/configs/kernel-5.14.0-x86_64-rt-debug.config index dbe0a9f474..a4251639dc 100644 --- a/configs/kernel-5.14.0-x86_64-rt-debug.config +++ b/configs/kernel-5.14.0-x86_64-rt-debug.config @@ -8221,6 +8221,7 @@ CONFIG_INTEL_QEP=m CONFIG_DCACHE_WORD_ACCESS=y # CONFIG_VALIDATE_FS_PARSER is not set CONFIG_FS_IOMAP=y +CONFIG_FS_STACK=y CONFIG_BUFFER_HEAD=y CONFIG_LEGACY_DIRECT_IO=y # CONFIG_EXT2_FS is not set diff --git a/configs/kernel-5.14.0-x86_64-rt.config b/configs/kernel-5.14.0-x86_64-rt.config index 2ebc3263d7..1ece4021c4 100644 --- a/configs/kernel-5.14.0-x86_64-rt.config +++ b/configs/kernel-5.14.0-x86_64-rt.config @@ -8194,6 +8194,7 @@ CONFIG_INTEL_QEP=m CONFIG_DCACHE_WORD_ACCESS=y # CONFIG_VALIDATE_FS_PARSER is not set CONFIG_FS_IOMAP=y +CONFIG_FS_STACK=y CONFIG_BUFFER_HEAD=y CONFIG_LEGACY_DIRECT_IO=y # CONFIG_EXT2_FS is not set diff --git a/configs/kernel-5.14.0-x86_64.config b/configs/kernel-5.14.0-x86_64.config index d1a12a983a..ce39dea203 100644 --- a/configs/kernel-5.14.0-x86_64.config +++ b/configs/kernel-5.14.0-x86_64.config @@ -8143,6 +8143,7 @@ CONFIG_INTEL_QEP=m CONFIG_DCACHE_WORD_ACCESS=y # CONFIG_VALIDATE_FS_PARSER is not set CONFIG_FS_IOMAP=y +CONFIG_FS_STACK=y CONFIG_BUFFER_HEAD=y CONFIG_LEGACY_DIRECT_IO=y # CONFIG_EXT2_FS is not set diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 4ef4e2dc47..3a01e1862d 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2049,7 +2049,7 @@ static int binder_translate_binder(struct flat_binder_object *fp, ret = -EINVAL; goto done; } - if (security_binder_transfer_binder(proc->tsk, target_proc->tsk)) { + if (security_binder_transfer_binder(proc->cred, target_proc->cred)) { ret = -EPERM; goto done; } @@ -2095,7 +2095,7 @@ static int binder_translate_handle(struct flat_binder_object *fp, proc->pid, thread->pid, fp->handle); return -EINVAL; } - if (security_binder_transfer_binder(proc->tsk, target_proc->tsk)) { + if (security_binder_transfer_binder(proc->cred, 
target_proc->cred)) { ret = -EPERM; goto done; } @@ -2183,7 +2183,7 @@ static int binder_translate_fd(u32 fd, binder_size_t fd_offset, ret = -EBADF; goto err_fget; } - ret = security_binder_transfer_file(proc->tsk, target_proc->tsk, file); + ret = security_binder_transfer_file(proc->cred, target_proc->cred, file); if (ret < 0) { ret = -EPERM; goto err_security; @@ -2588,8 +2588,8 @@ static void binder_transaction(struct binder_proc *proc, return_error_line = __LINE__; goto err_invalid_target_handle; } - if (security_binder_transaction(proc->tsk, - target_proc->tsk) < 0) { + if (security_binder_transaction(proc->cred, + target_proc->cred) < 0) { return_error = BR_FAILED_REPLY; return_error_param = -EPERM; return_error_line = __LINE__; @@ -4554,7 +4554,7 @@ static int binder_ioctl_set_ctx_mgr(struct file *filp, ret = -EBUSY; goto out; } - ret = security_binder_set_context_mgr(proc->tsk); + ret = security_binder_set_context_mgr(proc->cred); if (ret < 0) goto out; if (uid_valid(context->binder_context_mgr_uid)) { diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 7fe40bbba2..e2e1bd4d5a 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -12220,6 +12220,11 @@ static int amdgpu_dm_atomic_check(struct drm_device *dev, } if (dc_resource_is_dsc_encoding_supported(dc)) { + for_each_oldnew_crtc_in_state(state, crtc, old_crtc_state, new_crtc_state, i) { + dm_new_crtc_state = to_dm_crtc_state(new_crtc_state); + dm_new_crtc_state->mode_changed_independent_from_dsc = new_crtc_state->mode_changed; + } + for_each_oldnew_crtc_in_state(state, crtc, old_crtc_state, new_crtc_state, i) { if (drm_atomic_crtc_needs_modeset(new_crtc_state)) { ret = add_affected_mst_dsc_crtcs(state, crtc); diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h index 8ca7389575..7bf4b8369f 100644 --- 
a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h @@ -962,6 +962,7 @@ struct dm_crtc_state { bool freesync_vrr_info_changed; + bool mode_changed_independent_from_dsc; bool dsc_force_changed; bool vrr_supported; struct mod_freesync_config freesync_config; diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c index 5e92eaa67a..2e0895f4f9 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c @@ -1744,9 +1744,11 @@ int pre_validate_dsc(struct drm_atomic_state *state, int ind = find_crtc_index_in_state_by_stream(state, stream); if (ind >= 0) { + struct dm_crtc_state *dm_new_crtc_state = to_dm_crtc_state(state->crtcs[ind].new_state); + DRM_INFO_ONCE("%s:%d MST_DSC no mode changed for stream 0x%p\n", __func__, __LINE__, stream); - state->crtcs[ind].new_state->mode_changed = 0; + dm_new_crtc_state->base.mode_changed = dm_new_crtc_state->mode_changed_independent_from_dsc; } } } diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index 9419ab4435..a2cf6135fc 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -95,7 +95,6 @@ static struct workqueue_struct *iwcm_wq; struct iwcm_work { struct work_struct work; struct iwcm_id_private *cm_id; - struct list_head list; struct iw_cm_event event; struct list_head free_list; }; @@ -179,7 +178,6 @@ static int alloc_work_entries(struct iwcm_id_private *cm_id_priv, int count) return -ENOMEM; } work->cm_id = cm_id_priv; - INIT_LIST_HEAD(&work->list); put_work(work); } return 0; @@ -214,7 +212,6 @@ static void free_cm_id(struct iwcm_id_private *cm_id_priv) static bool iwcm_deref_id(struct iwcm_id_private *cm_id_priv) { if (refcount_dec_and_test(&cm_id_priv->refcount)) { - BUG_ON(!list_empty(&cm_id_priv->work_list)); free_cm_id(cm_id_priv); return true; } @@ -261,7 
+258,6 @@ struct iw_cm_id *iw_create_cm_id(struct ib_device *device, refcount_set(&cm_id_priv->refcount, 1); init_waitqueue_head(&cm_id_priv->connect_wait); init_completion(&cm_id_priv->destroy_comp); - INIT_LIST_HEAD(&cm_id_priv->work_list); INIT_LIST_HEAD(&cm_id_priv->work_free_list); return &cm_id_priv->id; @@ -1008,13 +1004,13 @@ static int process_event(struct iwcm_id_private *cm_id_priv, } /* - * Process events on the work_list for the cm_id. If the callback - * function requests that the cm_id be deleted, a flag is set in the - * cm_id flags to indicate that when the last reference is - * removed, the cm_id is to be destroyed. This is necessary to - * distinguish between an object that will be destroyed by the app - * thread asleep on the destroy_comp list vs. an object destroyed - * here synchronously when the last reference is removed. + * Process events for the cm_id. If the callback function requests + * that the cm_id be deleted, a flag is set in the cm_id flags to + * indicate that when the last reference is removed, the cm_id is + * to be destroyed. This is necessary to distinguish between an + * object that will be destroyed by the app thread asleep on the + * destroy_comp list vs. an object destroyed here synchronously + * when the last reference is removed. 
*/ static void cm_work_handler(struct work_struct *_work) { @@ -1025,35 +1021,26 @@ static void cm_work_handler(struct work_struct *_work) int ret = 0; spin_lock_irqsave(&cm_id_priv->lock, flags); - while (!list_empty(&cm_id_priv->work_list)) { - work = list_first_entry(&cm_id_priv->work_list, - struct iwcm_work, list); - list_del_init(&work->list); - levent = work->event; - put_work(work); - spin_unlock_irqrestore(&cm_id_priv->lock, flags); - - if (!test_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags)) { - ret = process_event(cm_id_priv, &levent); - if (ret) { - destroy_cm_id(&cm_id_priv->id); - WARN_ON_ONCE(iwcm_deref_id(cm_id_priv)); - } - } else - pr_debug("dropping event %d\n", levent.event); - if (iwcm_deref_id(cm_id_priv)) - return; - spin_lock_irqsave(&cm_id_priv->lock, flags); - } + levent = work->event; + put_work(work); spin_unlock_irqrestore(&cm_id_priv->lock, flags); + + if (!test_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags)) { + ret = process_event(cm_id_priv, &levent); + if (ret) { + destroy_cm_id(&cm_id_priv->id); + WARN_ON_ONCE(iwcm_deref_id(cm_id_priv)); + } + } else + pr_debug("dropping event %d\n", levent.event); + if (iwcm_deref_id(cm_id_priv)) + return; } /* * This function is called on interrupt context. Schedule events on * the iwcm_wq thread to allow callback functions to downcall into - * the CM and/or block. Events are queued to a per-CM_ID - * work_list. If this is the first event on the work_list, the work - * element is also queued on the iwcm_wq thread. + * the CM and/or block. * * Each event holds a reference on the cm_id. 
Until the last posted * event has been delivered and processed, the cm_id cannot be @@ -1095,7 +1082,6 @@ static int cm_event_handler(struct iw_cm_id *cm_id, } refcount_inc(&cm_id_priv->refcount); - list_add_tail(&work->list, &cm_id_priv->work_list); queue_work(iwcm_wq, &work->work); out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); diff --git a/drivers/infiniband/core/iwcm.h b/drivers/infiniband/core/iwcm.h index bf74639be1..b56fb12ede 100644 --- a/drivers/infiniband/core/iwcm.h +++ b/drivers/infiniband/core/iwcm.h @@ -50,7 +50,6 @@ struct iwcm_id_private { struct ib_qp *qp; struct completion destroy_comp; wait_queue_head_t connect_wait; - struct list_head work_list; spinlock_t lock; refcount_t refcount; struct list_head work_free_list; diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile index 11878ddf7c..dd7bb377f4 100644 --- a/drivers/infiniband/hw/mlx5/Makefile +++ b/drivers/infiniband/hw/mlx5/Makefile @@ -8,6 +8,7 @@ mlx5_ib-y := ah.o \ cq.o \ data_direct.o \ dm.o \ + dmah.o \ doorbell.o \ fs.o \ gsi.o \ diff --git a/drivers/infiniband/hw/mlx5/counters.c b/drivers/infiniband/hw/mlx5/counters.c index b847084dcd..e042e0719e 100644 --- a/drivers/infiniband/hw/mlx5/counters.c +++ b/drivers/infiniband/hw/mlx5/counters.c @@ -16,6 +16,18 @@ struct mlx5_ib_counter { u32 type; }; +struct mlx5_rdma_counter { + struct rdma_counter rdma_counter; + + struct mlx5_fc *fc[MLX5_IB_OPCOUNTER_MAX]; + struct xarray qpn_opfc_xa; +}; + +static struct mlx5_rdma_counter *to_mcounter(struct rdma_counter *counter) +{ + return container_of(counter, struct mlx5_rdma_counter, rdma_counter); +} + #define INIT_Q_COUNTER(_name) \ { .name = #_name, .offset = MLX5_BYTE_OFF(query_q_counter_out, _name)} @@ -398,7 +410,7 @@ static int do_get_hw_stats(struct ib_device *ibdev, return ret; /* We don't expose device counters over Vports */ - if (is_mdev_switchdev_mode(dev->mdev) && port_num != 0) + if (is_mdev_switchdev_mode(dev->mdev) && dev->is_rep && 
port_num != 0) goto done; if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) { @@ -418,7 +430,7 @@ static int do_get_hw_stats(struct ib_device *ibdev, */ goto done; } - ret = mlx5_lag_query_cong_counters(dev->mdev, + ret = mlx5_lag_query_cong_counters(mdev, stats->value + cnts->num_q_counters, cnts->num_cong_counters, @@ -602,7 +614,7 @@ static int mlx5_ib_counter_dealloc(struct rdma_counter *counter) return 0; WARN_ON(!xa_empty(&mcounter->qpn_opfc_xa)); - mlx5r_fs_destroy_fcs(dev, counter); + mlx5r_fs_destroy_fcs(dev, mcounter->fc); MLX5_SET(dealloc_q_counter_in, in, opcode, MLX5_CMD_OP_DEALLOC_Q_COUNTER); MLX5_SET(dealloc_q_counter_in, in, counter_set_id, counter->id); @@ -612,6 +624,7 @@ static int mlx5_ib_counter_dealloc(struct rdma_counter *counter) static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter, struct ib_qp *qp, u32 port) { + struct mlx5_rdma_counter *mcounter = to_mcounter(counter); struct mlx5_ib_dev *dev = to_mdev(qp->device); bool new = false; int err; @@ -635,7 +648,11 @@ static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter, if (err) goto fail_set_counter; - err = mlx5r_fs_bind_op_fc(qp, counter, port); + if (!counter->mode.bind_opcnt) + return 0; + + err = mlx5r_fs_bind_op_fc(qp, mcounter->fc, &mcounter->qpn_opfc_xa, + port); if (err) goto fail_bind_op_fc; @@ -655,9 +672,12 @@ fail_set_counter: static int mlx5_ib_counter_unbind_qp(struct ib_qp *qp, u32 port) { struct rdma_counter *counter = qp->counter; + struct mlx5_rdma_counter *mcounter; int err; - mlx5r_fs_unbind_op_fc(qp, counter); + mcounter = to_mcounter(counter); + + mlx5r_fs_unbind_op_fc(qp, &mcounter->qpn_opfc_xa); err = mlx5_ib_qp_set_counter(qp, NULL); if (err) @@ -666,7 +686,9 @@ static int mlx5_ib_counter_unbind_qp(struct ib_qp *qp, u32 port) return 0; fail_set_counter: - mlx5r_fs_bind_op_fc(qp, counter, port); + if (counter->mode.bind_opcnt) + mlx5r_fs_bind_op_fc(qp, mcounter->fc, + &mcounter->qpn_opfc_xa, port); return err; } diff --git 
a/drivers/infiniband/hw/mlx5/counters.h b/drivers/infiniband/hw/mlx5/counters.h index bd03cee420..a04e7dd594 100644 --- a/drivers/infiniband/hw/mlx5/counters.h +++ b/drivers/infiniband/hw/mlx5/counters.h @@ -8,19 +8,6 @@ #include "mlx5_ib.h" -struct mlx5_rdma_counter { - struct rdma_counter rdma_counter; - - struct mlx5_fc *fc[MLX5_IB_OPCOUNTER_MAX]; - struct xarray qpn_opfc_xa; -}; - -static inline struct mlx5_rdma_counter * -to_mcounter(struct rdma_counter *counter) -{ - return container_of(counter, struct mlx5_rdma_counter, rdma_counter); -} - int mlx5_ib_counters_init(struct mlx5_ib_dev *dev); void mlx5_ib_counters_cleanup(struct mlx5_ib_dev *dev); void mlx5_ib_counters_clear_description(struct ib_counters *counters); diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 3657ab8f84..aa11a4b5a2 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -645,7 +645,7 @@ int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) { struct mlx5_core_dev *mdev = to_mdev(ibcq->device)->mdev; struct mlx5_ib_cq *cq = to_mcq(ibcq); - void __iomem *uar_page = mdev->priv.uar->map; + void __iomem *uar_page = mdev->priv.bfreg.up->map; unsigned long irq_flags; int ret = 0; @@ -920,7 +920,7 @@ static int create_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, cq->buf.frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); - *index = dev->mdev->priv.uar->index; + *index = dev->mdev->priv.bfreg.up->index; return 0; @@ -1017,15 +1017,18 @@ int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, if (cq->create_flags & IB_UVERBS_CQ_FLAGS_IGNORE_OVERRUN) MLX5_SET(cqc, cqc, oi, 1); + if (udata) { + cq->mcq.comp = mlx5_add_cq_to_tasklet; + cq->mcq.tasklet_ctx.comp = mlx5_ib_cq_comp; + } else { + cq->mcq.comp = mlx5_ib_cq_comp; + } + err = mlx5_core_create_cq(dev->mdev, &cq->mcq, cqb, inlen, out, sizeof(out)); if (err) goto err_cqb; mlx5_ib_dbg(dev, "cqn 0x%x\n", cq->mcq.cqn); - if (udata) - 
cq->mcq.tasklet_ctx.comp = mlx5_ib_cq_comp; - else - cq->mcq.comp = mlx5_ib_cq_comp; cq->mcq.event = mlx5_ib_cq_event; INIT_LIST_HEAD(&cq->wc_list); @@ -1052,20 +1055,31 @@ err_cqb: return err; } -int mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) +int mlx5_ib_pre_destroy_cq(struct ib_cq *cq) { struct mlx5_ib_dev *dev = to_mdev(cq->device); struct mlx5_ib_cq *mcq = to_mcq(cq); + + return mlx5_core_destroy_cq(dev->mdev, &mcq->mcq); +} + +void mlx5_ib_post_destroy_cq(struct ib_cq *cq) +{ + destroy_cq_kernel(to_mdev(cq->device), to_mcq(cq)); +} + +int mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) +{ int ret; - ret = mlx5_core_destroy_cq(dev->mdev, &mcq->mcq); + ret = mlx5_ib_pre_destroy_cq(cq); if (ret) return ret; if (udata) - destroy_cq_user(mcq, udata); + destroy_cq_user(to_mcq(cq), udata); else - destroy_cq_kernel(dev, mcq); + mlx5_ib_post_destroy_cq(cq); return 0; } diff --git a/drivers/infiniband/hw/mlx5/data_direct.c b/drivers/infiniband/hw/mlx5/data_direct.c index b9ba84afaa..b81ac5709b 100644 --- a/drivers/infiniband/hw/mlx5/data_direct.c +++ b/drivers/infiniband/hw/mlx5/data_direct.c @@ -35,7 +35,7 @@ static int mlx5_data_direct_vpd_get_vuid(struct mlx5_data_direct_dev *dev) vpd_data = pci_vpd_alloc(pdev, &vpd_size); if (IS_ERR(vpd_data)) { - pci_err(pdev, "Unable to read VPD, err=%ld\n", PTR_ERR(vpd_data)); + pci_err(pdev, "Unable to read VPD, err=%pe\n", vpd_data); return PTR_ERR(vpd_data); } diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 6485ce3208..e52a73bb94 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -159,7 +159,7 @@ int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user, u64 req_ucaps) uctx = MLX5_ADDR_OF(create_uctx_in, in, uctx); if (is_user && (MLX5_CAP_GEN(dev->mdev, uctx_cap) & MLX5_UCTX_CAP_RAW_TX) && - capable(CAP_NET_RAW)) + rdma_dev_has_raw_cap(&dev->ib_dev)) cap |= MLX5_UCTX_CAP_RAW_TX; if (is_user && 
(MLX5_CAP_GEN(dev->mdev, uctx_cap) & @@ -233,6 +233,7 @@ static u16 get_legacy_obj_type(u16 opcode) { switch (opcode) { case MLX5_CMD_OP_CREATE_RQ: + case MLX5_CMD_OP_CREATE_RMP: return MLX5_EVENT_QUEUE_TYPE_RQ; case MLX5_CMD_OP_CREATE_QP: return MLX5_EVENT_QUEUE_TYPE_QP; @@ -1224,6 +1225,11 @@ static void devx_obj_build_destroy_cmd(void *in, void *out, void *din, MLX5_GET(create_flow_table_in, in, other_vport)); MLX5_SET(destroy_flow_table_in, din, vport_number, MLX5_GET(create_flow_table_in, in, vport_number)); + MLX5_SET(destroy_flow_table_in, din, other_eswitch, + MLX5_GET(create_flow_table_in, in, other_eswitch)); + MLX5_SET(destroy_flow_table_in, din, eswitch_owner_vhca_id, + MLX5_GET(create_flow_table_in, in, + eswitch_owner_vhca_id)); MLX5_SET(destroy_flow_table_in, din, table_type, MLX5_GET(create_flow_table_in, in, table_type)); MLX5_SET(destroy_flow_table_in, din, table_id, *obj_id); @@ -1236,6 +1242,11 @@ static void devx_obj_build_destroy_cmd(void *in, void *out, void *din, MLX5_GET(create_flow_group_in, in, other_vport)); MLX5_SET(destroy_flow_group_in, din, vport_number, MLX5_GET(create_flow_group_in, in, vport_number)); + MLX5_SET(destroy_flow_group_in, din, other_eswitch, + MLX5_GET(create_flow_group_in, in, other_eswitch)); + MLX5_SET(destroy_flow_group_in, din, eswitch_owner_vhca_id, + MLX5_GET(create_flow_group_in, in, + eswitch_owner_vhca_id)); MLX5_SET(destroy_flow_group_in, din, table_type, MLX5_GET(create_flow_group_in, in, table_type)); MLX5_SET(destroy_flow_group_in, din, table_id, @@ -1250,6 +1261,10 @@ static void devx_obj_build_destroy_cmd(void *in, void *out, void *din, MLX5_GET(set_fte_in, in, other_vport)); MLX5_SET(delete_fte_in, din, vport_number, MLX5_GET(set_fte_in, in, vport_number)); + MLX5_SET(delete_fte_in, din, other_eswitch, + MLX5_GET(set_fte_in, in, other_eswitch)); + MLX5_SET(delete_fte_in, din, eswitch_owner_vhca_id, + MLX5_GET(set_fte_in, in, eswitch_owner_vhca_id)); MLX5_SET(delete_fte_in, din, table_type, 
MLX5_GET(set_fte_in, in, table_type)); MLX5_SET(delete_fte_in, din, table_id, @@ -1393,6 +1408,10 @@ static int devx_handle_mkey_create(struct mlx5_ib_dev *dev, } MLX5_SET(create_mkey_in, in, mkey_umem_valid, 1); + /* TPH is not allowed to bypass the regular kernel's verbs flow */ + MLX5_SET(mkc, mkc, pcie_tph_en, 0); + MLX5_SET(mkc, mkc, pcie_tph_steering_tag_index, + MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX); return 0; } @@ -1958,6 +1977,7 @@ subscribe_event_xa_alloc(struct mlx5_devx_event_table *devx_event_table, /* Level1 is valid for future use, no need to free */ return -ENOMEM; + INIT_LIST_HEAD(&obj_event->obj_sub_list); err = xa_insert(&event->object_ids, key_level2, obj_event, @@ -1966,7 +1986,6 @@ subscribe_event_xa_alloc(struct mlx5_devx_event_table *devx_event_table, kfree(obj_event); return err; } - INIT_LIST_HEAD(&obj_event->obj_sub_list); } return 0; diff --git a/drivers/infiniband/hw/mlx5/dm.c b/drivers/infiniband/hw/mlx5/dm.c index b4c97fb62a..9ded2b7c1e 100644 --- a/drivers/infiniband/hw/mlx5/dm.c +++ b/drivers/infiniband/hw/mlx5/dm.c @@ -282,7 +282,7 @@ static struct ib_dm *handle_alloc_dm_memic(struct ib_ucontext *ctx, int err; u64 address; - if (!MLX5_CAP_DEV_MEM(dm_db->dev, memic)) + if (!dm_db || !MLX5_CAP_DEV_MEM(dm_db->dev, memic)) return ERR_PTR(-EOPNOTSUPP); dm = kzalloc(sizeof(*dm), GFP_KERNEL); diff --git a/drivers/infiniband/hw/mlx5/dmah.c b/drivers/infiniband/hw/mlx5/dmah.c new file mode 100644 index 0000000000..362a88992f --- /dev/null +++ b/drivers/infiniband/hw/mlx5/dmah.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved + */ + +#include +#include +#include "dmah.h" + +#define UVERBS_MODULE_NAME mlx5_ib +#include + +static int mlx5_ib_alloc_dmah(struct ib_dmah *ibdmah, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_core_dev *mdev = to_mdev(ibdmah->device)->mdev; + struct mlx5_ib_dmah *dmah = to_mdmah(ibdmah); + u16 st_bits = BIT(IB_DMAH_CPU_ID_EXISTS) | + BIT(IB_DMAH_MEM_TYPE_EXISTS); + int err; + + /* PH is a must for TPH following PCIe spec 6.2-1.0 */ + if (!(ibdmah->valid_fields & BIT(IB_DMAH_PH_EXISTS))) + return -EINVAL; + + /* ST is optional; however, partial data for it is not allowed */ + if (ibdmah->valid_fields & st_bits) { + if ((ibdmah->valid_fields & st_bits) != st_bits) + return -EINVAL; + err = mlx5_st_alloc_index(mdev, ibdmah->mem_type, + ibdmah->cpu_id, &dmah->st_index); + if (err) + return err; + } + + return 0; +} + +static int mlx5_ib_dealloc_dmah(struct ib_dmah *ibdmah, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_dmah *dmah = to_mdmah(ibdmah); + struct mlx5_core_dev *mdev = to_mdev(ibdmah->device)->mdev; + + if (ibdmah->valid_fields & BIT(IB_DMAH_CPU_ID_EXISTS)) + return mlx5_st_dealloc_index(mdev, dmah->st_index); + + return 0; +} + +const struct ib_device_ops mlx5_ib_dev_dmah_ops = { + .alloc_dmah = mlx5_ib_alloc_dmah, + .dealloc_dmah = mlx5_ib_dealloc_dmah, +}; diff --git a/drivers/infiniband/hw/mlx5/dmah.h b/drivers/infiniband/hw/mlx5/dmah.h new file mode 100644 index 0000000000..68de72b474 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/dmah.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved + */ + +#ifndef _MLX5_IB_DMAH_H +#define _MLX5_IB_DMAH_H + +#include "mlx5_ib.h" + +extern const struct ib_device_ops mlx5_ib_dev_dmah_ops; + +struct mlx5_ib_dmah { + struct ib_dmah ibdmah; + u16 st_index; +}; + +static inline struct mlx5_ib_dmah *to_mdmah(struct ib_dmah *ibdmah) +{ + return container_of(ibdmah, struct mlx5_ib_dmah, ibdmah); +} + +#endif /* _MLX5_IB_DMAH_H */ diff --git a/drivers/infiniband/hw/mlx5/fs.c b/drivers/infiniband/hw/mlx5/fs.c index 0ff9f18a71..d17823ce7f 100644 --- a/drivers/infiniband/hw/mlx5/fs.c +++ b/drivers/infiniband/hw/mlx5/fs.c @@ -691,22 +691,13 @@ static bool __maybe_unused mlx5_ib_shared_ft_allowed(struct ib_device *device) return MLX5_CAP_GEN(dev->mdev, shared_object_to_user_object_allowed); } -static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_ib_dev *dev, - struct mlx5_flow_namespace *ns, +static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_flow_namespace *ns, struct mlx5_ib_flow_prio *prio, - int priority, - int num_entries, int num_groups, - u32 flags, u16 vport) + struct mlx5_flow_table_attr *ft_attr) { - struct mlx5_flow_table_attr ft_attr = {}; struct mlx5_flow_table *ft; - ft_attr.prio = priority; - ft_attr.max_fte = num_entries; - ft_attr.flags = flags; - ft_attr.vport = vport; - ft_attr.autogroup.max_num_groups = num_groups; - ft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr); + ft = mlx5_create_auto_grouped_flow_table(ns, ft_attr); if (IS_ERR(ft)) return ERR_CAST(ft); @@ -720,6 +711,7 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, enum flow_table_type ft_type) { bool dont_trap = flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP; + struct mlx5_flow_table_attr ft_attr = {}; struct mlx5_flow_namespace *ns = NULL; enum mlx5_flow_namespace_type fn_type; struct mlx5_ib_flow_prio *prio; @@ -797,11 +789,14 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, max_table_size = min_t(int, num_entries, max_table_size); ft = prio->flow_table; - if 
(!ft) - return _get_prio(dev, ns, prio, priority, max_table_size, - num_groups, flags, 0); + if (ft) + return prio; - return prio; + ft_attr.prio = priority; + ft_attr.max_fte = max_table_size; + ft_attr.flags = flags; + ft_attr.autogroup.max_num_groups = num_groups; + return _get_prio(ns, prio, &ft_attr); } enum { @@ -950,6 +945,7 @@ static int get_per_qp_prio(struct mlx5_ib_dev *dev, enum mlx5_ib_optional_counter_type type) { enum mlx5_ib_optional_counter_type per_qp_type; + struct mlx5_flow_table_attr ft_attr = {}; enum mlx5_flow_namespace_type fn_type; struct mlx5_flow_namespace *ns; struct mlx5_ib_flow_prio *prio; @@ -1003,7 +999,10 @@ static int get_per_qp_prio(struct mlx5_ib_dev *dev, if (prio->flow_table) return 0; - prio = _get_prio(dev, ns, prio, priority, MLX5_FS_MAX_POOL_SIZE, 1, 0, 0); + ft_attr.prio = priority; + ft_attr.max_fte = MLX5_FS_MAX_POOL_SIZE; + ft_attr.autogroup.max_num_groups = 1; + prio = _get_prio(ns, prio, &ft_attr); if (IS_ERR(prio)) return PTR_ERR(prio); @@ -1012,14 +1011,14 @@ static int get_per_qp_prio(struct mlx5_ib_dev *dev, return 0; } -static struct mlx5_per_qp_opfc * -get_per_qp_opfc(struct mlx5_rdma_counter *mcounter, u32 qp_num, bool *new) +static struct mlx5_per_qp_opfc *get_per_qp_opfc(struct xarray *qpn_opfc_xa, + u32 qp_num, bool *new) { struct mlx5_per_qp_opfc *per_qp_opfc; *new = false; - per_qp_opfc = xa_load(&mcounter->qpn_opfc_xa, qp_num); + per_qp_opfc = xa_load(qpn_opfc_xa, qp_num); if (per_qp_opfc) return per_qp_opfc; per_qp_opfc = kzalloc(sizeof(*per_qp_opfc), GFP_KERNEL); @@ -1032,7 +1031,8 @@ get_per_qp_opfc(struct mlx5_rdma_counter *mcounter, u32 qp_num, bool *new) } static int add_op_fc_rules(struct mlx5_ib_dev *dev, - struct mlx5_rdma_counter *mcounter, + struct mlx5_fc *fc_arr[MLX5_IB_OPCOUNTER_MAX], + struct xarray *qpn_opfc_xa, struct mlx5_per_qp_opfc *per_qp_opfc, struct mlx5_ib_flow_prio *prio, enum mlx5_ib_optional_counter_type type, @@ -1055,7 +1055,7 @@ static int add_op_fc_rules(struct mlx5_ib_dev 
*dev, return 0; } - opfc->fc = mcounter->fc[type]; + opfc->fc = fc_arr[type]; spec = kcalloc(MAX_OPFC_RULES, sizeof(*spec), GFP_KERNEL); if (!spec) { @@ -1148,8 +1148,7 @@ static int add_op_fc_rules(struct mlx5_ib_dev *dev, } prio->refcount += spec_num; - err = xa_err(xa_store(&mcounter->qpn_opfc_xa, qp_num, per_qp_opfc, - GFP_KERNEL)); + err = xa_err(xa_store(qpn_opfc_xa, qp_num, per_qp_opfc, GFP_KERNEL)); if (err) goto del_rules; @@ -1168,8 +1167,9 @@ null_fc: return err; } -static bool is_fc_shared_and_in_use(struct mlx5_rdma_counter *mcounter, - u32 type, struct mlx5_fc **fc) +static bool +is_fc_shared_and_in_use(struct mlx5_fc *fc_arr[MLX5_IB_OPCOUNTER_MAX], u32 type, + struct mlx5_fc **fc) { u32 shared_fc_type; @@ -1190,7 +1190,7 @@ static bool is_fc_shared_and_in_use(struct mlx5_rdma_counter *mcounter, return false; } - *fc = mcounter->fc[shared_fc_type]; + *fc = fc_arr[shared_fc_type]; if (!(*fc)) return false; @@ -1198,24 +1198,23 @@ static bool is_fc_shared_and_in_use(struct mlx5_rdma_counter *mcounter, } void mlx5r_fs_destroy_fcs(struct mlx5_ib_dev *dev, - struct rdma_counter *counter) + struct mlx5_fc *fc_arr[MLX5_IB_OPCOUNTER_MAX]) { - struct mlx5_rdma_counter *mcounter = to_mcounter(counter); struct mlx5_fc *in_use_fc; int i; for (i = MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS_PER_QP; i <= MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP; i++) { - if (!mcounter->fc[i]) + if (!fc_arr[i]) continue; - if (is_fc_shared_and_in_use(mcounter, i, &in_use_fc)) { - mcounter->fc[i] = NULL; + if (is_fc_shared_and_in_use(fc_arr, i, &in_use_fc)) { + fc_arr[i] = NULL; continue; } - mlx5_fc_destroy(dev->mdev, mcounter->fc[i]); - mcounter->fc[i] = NULL; + mlx5_fc_destroy(dev->mdev, fc_arr[i]); + fc_arr[i] = NULL; } } @@ -1223,6 +1222,7 @@ int mlx5_ib_fs_add_op_fc(struct mlx5_ib_dev *dev, u32 port_num, struct mlx5_ib_op_fc *opfc, enum mlx5_ib_optional_counter_type type) { + struct mlx5_flow_table_attr ft_attr = {}; enum mlx5_flow_namespace_type fn_type; int priority, i, err, spec_num; 
struct mlx5_flow_act flow_act = {}; @@ -1304,8 +1304,10 @@ int mlx5_ib_fs_add_op_fc(struct mlx5_ib_dev *dev, u32 port_num, if (err) goto free; - prio = _get_prio(dev, ns, prio, priority, - dev->num_ports * MAX_OPFC_RULES, 1, 0, 0); + ft_attr.prio = priority; + ft_attr.max_fte = dev->num_ports * MAX_OPFC_RULES; + ft_attr.autogroup.max_num_groups = 1; + prio = _get_prio(ns, prio, &ft_attr); if (IS_ERR(prio)) { err = PTR_ERR(prio); goto put_prio; @@ -1359,16 +1361,15 @@ void mlx5_ib_fs_remove_op_fc(struct mlx5_ib_dev *dev, put_per_qp_prio(dev, type); } -void mlx5r_fs_unbind_op_fc(struct ib_qp *qp, struct rdma_counter *counter) +void mlx5r_fs_unbind_op_fc(struct ib_qp *qp, struct xarray *qpn_opfc_xa) { - struct mlx5_rdma_counter *mcounter = to_mcounter(counter); - struct mlx5_ib_dev *dev = to_mdev(counter->device); + struct mlx5_ib_dev *dev = to_mdev(qp->device); struct mlx5_per_qp_opfc *per_qp_opfc; struct mlx5_ib_op_fc *in_use_opfc; struct mlx5_ib_flow_prio *prio; int i, j; - per_qp_opfc = xa_load(&mcounter->qpn_opfc_xa, qp->qp_num); + per_qp_opfc = xa_load(qpn_opfc_xa, qp->qp_num); if (!per_qp_opfc) return; @@ -1394,13 +1395,13 @@ void mlx5r_fs_unbind_op_fc(struct ib_qp *qp, struct rdma_counter *counter) } kfree(per_qp_opfc); - xa_erase(&mcounter->qpn_opfc_xa, qp->qp_num); + xa_erase(qpn_opfc_xa, qp->qp_num); } -int mlx5r_fs_bind_op_fc(struct ib_qp *qp, struct rdma_counter *counter, - u32 port) +int mlx5r_fs_bind_op_fc(struct ib_qp *qp, + struct mlx5_fc *fc_arr[MLX5_IB_OPCOUNTER_MAX], + struct xarray *qpn_opfc_xa, u32 port) { - struct mlx5_rdma_counter *mcounter = to_mcounter(counter); struct mlx5_ib_dev *dev = to_mdev(qp->device); struct mlx5_per_qp_opfc *per_qp_opfc; struct mlx5_ib_flow_prio *prio; @@ -1410,9 +1411,6 @@ int mlx5r_fs_bind_op_fc(struct ib_qp *qp, struct rdma_counter *counter, int i, err, per_qp_type; bool new; - if (!counter->mode.bind_opcnt) - return 0; - cnts = &dev->port[port - 1].cnts; for (i = 0; i <= MLX5_IB_OPCOUNTER_RDMA_RX_BYTES; i++) { @@ 
-1424,23 +1422,22 @@ int mlx5r_fs_bind_op_fc(struct ib_qp *qp, struct rdma_counter *counter, prio = get_opfc_prio(dev, per_qp_type); WARN_ON(!prio->flow_table); - if (is_fc_shared_and_in_use(mcounter, per_qp_type, &in_use_fc)) - mcounter->fc[per_qp_type] = in_use_fc; + if (is_fc_shared_and_in_use(fc_arr, per_qp_type, &in_use_fc)) + fc_arr[per_qp_type] = in_use_fc; - if (!mcounter->fc[per_qp_type]) { - mcounter->fc[per_qp_type] = mlx5_fc_create(dev->mdev, - false); - if (IS_ERR(mcounter->fc[per_qp_type])) - return PTR_ERR(mcounter->fc[per_qp_type]); + if (!fc_arr[per_qp_type]) { + fc_arr[per_qp_type] = mlx5_fc_create(dev->mdev, false); + if (IS_ERR(fc_arr[per_qp_type])) + return PTR_ERR(fc_arr[per_qp_type]); } - per_qp_opfc = get_per_qp_opfc(mcounter, qp->qp_num, &new); + per_qp_opfc = get_per_qp_opfc(qpn_opfc_xa, qp->qp_num, &new); if (!per_qp_opfc) { err = -ENOMEM; goto free_fc; } - err = add_op_fc_rules(dev, mcounter, per_qp_opfc, prio, - per_qp_type, qp->qp_num, port); + err = add_op_fc_rules(dev, fc_arr, qpn_opfc_xa, per_qp_opfc, + prio, per_qp_type, qp->qp_num, port); if (err) goto del_rules; } @@ -1448,12 +1445,12 @@ int mlx5r_fs_bind_op_fc(struct ib_qp *qp, struct rdma_counter *counter, return 0; del_rules: - mlx5r_fs_unbind_op_fc(qp, counter); + mlx5r_fs_unbind_op_fc(qp, qpn_opfc_xa); if (new) kfree(per_qp_opfc); free_fc: - if (xa_empty(&mcounter->qpn_opfc_xa)) - mlx5r_fs_destroy_fcs(dev, counter); + if (xa_empty(qpn_opfc_xa)) + mlx5r_fs_destroy_fcs(dev, fc_arr); return err; } @@ -1645,11 +1642,6 @@ static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev, return _create_flow_rule(dev, ft_prio, flow_attr, dst, 0, NULL); } -enum { - LEFTOVERS_MC, - LEFTOVERS_UC, -}; - static struct mlx5_ib_flow_handler *create_leftovers_rule(struct mlx5_ib_dev *dev, struct mlx5_ib_flow_prio *ft_prio, struct ib_flow_attr *flow_attr, @@ -1659,43 +1651,32 @@ static struct mlx5_ib_flow_handler *create_leftovers_rule(struct mlx5_ib_dev *de struct 
mlx5_ib_flow_handler *handler = NULL; static struct { - struct ib_flow_attr flow_attr; struct ib_flow_spec_eth eth_flow; - } leftovers_specs[] = { - [LEFTOVERS_MC] = { - .flow_attr = { - .num_of_specs = 1, - .size = sizeof(leftovers_specs[0]) - }, - .eth_flow = { - .type = IB_FLOW_SPEC_ETH, - .size = sizeof(struct ib_flow_spec_eth), - .mask = {.dst_mac = {0x1} }, - .val = {.dst_mac = {0x1} } - } - }, - [LEFTOVERS_UC] = { - .flow_attr = { - .num_of_specs = 1, - .size = sizeof(leftovers_specs[0]) - }, - .eth_flow = { - .type = IB_FLOW_SPEC_ETH, - .size = sizeof(struct ib_flow_spec_eth), - .mask = {.dst_mac = {0x1} }, - .val = {.dst_mac = {} } - } - } - }; + struct ib_flow_attr flow_attr; + } leftovers_wc = { .flow_attr = { .num_of_specs = 1, + .size = sizeof(leftovers_wc) }, + .eth_flow = { + .type = IB_FLOW_SPEC_ETH, + .size = sizeof(struct ib_flow_spec_eth), + .mask = { .dst_mac = { 0x1 } }, + .val = { .dst_mac = { 0x1 } } } }; - handler = create_flow_rule(dev, ft_prio, - &leftovers_specs[LEFTOVERS_MC].flow_attr, - dst); + static struct { + struct ib_flow_spec_eth eth_flow; + struct ib_flow_attr flow_attr; + } leftovers_uc = { .flow_attr = { .num_of_specs = 1, + .size = sizeof(leftovers_uc) }, + .eth_flow = { + .type = IB_FLOW_SPEC_ETH, + .size = sizeof(struct ib_flow_spec_eth), + .mask = { .dst_mac = { 0x1 } }, + .val = { .dst_mac = {} } } }; + + handler = create_flow_rule(dev, ft_prio, &leftovers_wc.flow_attr, dst); if (!IS_ERR(handler) && flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT) { handler_ucast = create_flow_rule(dev, ft_prio, - &leftovers_specs[LEFTOVERS_UC].flow_attr, - dst); + &leftovers_uc.flow_attr, dst); if (IS_ERR(handler_ucast)) { mlx5_del_flow_rules(handler->rule); ft_prio->refcount--; @@ -1893,7 +1874,7 @@ static int mlx5_ib_fill_transport_ns_info(struct mlx5_ib_dev *dev, u32 *flags, u16 *vport_idx, u16 *vport, struct mlx5_core_dev **ft_mdev, - u32 ib_port) + u32 ib_port, u16 *esw_owner_vhca_id) { struct mlx5_core_dev *esw_mdev; @@ -1907,8 +1888,13 
@@ static int mlx5_ib_fill_transport_ns_info(struct mlx5_ib_dev *dev, return -EINVAL; esw_mdev = mlx5_eswitch_get_core_dev(dev->port[ib_port - 1].rep->esw); - if (esw_mdev != dev->mdev) - return -EOPNOTSUPP; + if (esw_mdev != dev->mdev) { + if (!MLX5_CAP_ADV_RDMA(dev->mdev, + rdma_transport_manager_other_eswitch)) + return -EOPNOTSUPP; + *flags |= MLX5_FLOW_TABLE_OTHER_ESWITCH; + *esw_owner_vhca_id = MLX5_CAP_GEN(esw_mdev, vhca_id); + } *flags |= MLX5_FLOW_TABLE_OTHER_VPORT; *ft_mdev = esw_mdev; @@ -1924,8 +1910,10 @@ _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority, bool mcast, u32 ib_port) { struct mlx5_core_dev *ft_mdev = dev->mdev; + struct mlx5_flow_table_attr ft_attr = {}; struct mlx5_flow_namespace *ns = NULL; struct mlx5_ib_flow_prio *prio = NULL; + u16 esw_owner_vhca_id = 0; int max_table_size = 0; u16 vport_idx = 0; bool esw_encap; @@ -1982,11 +1970,13 @@ _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority, break; case MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX: case MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_TX: - if (ib_port == 0 || user_priority > MLX5_RDMA_TRANSPORT_BYPASS_PRIO) + if (ib_port == 0 || + user_priority >= MLX5_RDMA_TRANSPORT_BYPASS_PRIO) return ERR_PTR(-EINVAL); ret = mlx5_ib_fill_transport_ns_info(dev, ns_type, &flags, &vport_idx, &vport, - &ft_mdev, ib_port); + &ft_mdev, ib_port, + &esw_owner_vhca_id); if (ret) return ERR_PTR(ret); @@ -2032,10 +2022,10 @@ _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority, prio = &dev->flow_db->rdma_tx[priority]; break; case MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX: - prio = &dev->flow_db->rdma_transport_rx[ib_port - 1]; + prio = &dev->flow_db->rdma_transport_rx[priority][ib_port - 1]; break; case MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_TX: - prio = &dev->flow_db->rdma_transport_tx[ib_port - 1]; + prio = &dev->flow_db->rdma_transport_tx[priority][ib_port - 1]; break; default: return ERR_PTR(-EINVAL); } @@ -2046,8 +2036,13 @@ _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority, if 
(prio->flow_table) return prio; - return _get_prio(dev, ns, prio, priority, max_table_size, - MLX5_FS_MAX_TYPES, flags, vport); + ft_attr.prio = priority; + ft_attr.max_fte = max_table_size; + ft_attr.flags = flags; + ft_attr.vport = vport; + ft_attr.esw_owner_vhca_id = esw_owner_vhca_id; + ft_attr.autogroup.max_num_groups = MLX5_FS_MAX_TYPES; + return _get_prio(ns, prio, &ft_attr); } static struct mlx5_ib_flow_handler * @@ -2474,7 +2469,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( struct mlx5_ib_dev *dev; u32 flags; - if (!capable(CAP_NET_RAW)) + if (!rdma_uattrs_has_raw_cap(attrs)) return -EPERM; fs_matcher = uverbs_attr_get_obj(attrs, @@ -3005,7 +3000,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_STEERING_ANCHOR_CREATE)( u32 ft_id; int err; - if (!capable(CAP_NET_RAW)) + if (!rdma_dev_has_raw_cap(&dev->ib_dev)) return -EPERM; err = uverbs_get_const(&ib_uapi_ft_type, attrs, @@ -3482,31 +3477,40 @@ static const struct ib_device_ops flow_ops = { int mlx5_ib_fs_init(struct mlx5_ib_dev *dev) { + int i, j; + dev->flow_db = kzalloc(sizeof(*dev->flow_db), GFP_KERNEL); if (!dev->flow_db) return -ENOMEM; - dev->flow_db->rdma_transport_rx = kcalloc(dev->num_ports, - sizeof(struct mlx5_ib_flow_prio), - GFP_KERNEL); - if (!dev->flow_db->rdma_transport_rx) - goto free_flow_db; + for (i = 0; i < MLX5_RDMA_TRANSPORT_BYPASS_PRIO; i++) { + dev->flow_db->rdma_transport_rx[i] = + kcalloc(dev->num_ports, + sizeof(struct mlx5_ib_flow_prio), GFP_KERNEL); + if (!dev->flow_db->rdma_transport_rx[i]) + goto free_rdma_transport_rx; + } - dev->flow_db->rdma_transport_tx = kcalloc(dev->num_ports, - sizeof(struct mlx5_ib_flow_prio), - GFP_KERNEL); - if (!dev->flow_db->rdma_transport_tx) - goto free_rdma_transport_rx; + for (j = 0; j < MLX5_RDMA_TRANSPORT_BYPASS_PRIO; j++) { + dev->flow_db->rdma_transport_tx[j] = + kcalloc(dev->num_ports, + sizeof(struct mlx5_ib_flow_prio), GFP_KERNEL); + if (!dev->flow_db->rdma_transport_tx[j]) + goto free_rdma_transport_tx; + } 
mutex_init(&dev->flow_db->lock); ib_set_device_ops(&dev->ib_dev, &flow_ops); return 0; +free_rdma_transport_tx: + while (j--) + kfree(dev->flow_db->rdma_transport_tx[j]); free_rdma_transport_rx: - kfree(dev->flow_db->rdma_transport_rx); -free_flow_db: + while (i--) + kfree(dev->flow_db->rdma_transport_rx[i]); kfree(dev->flow_db); return -ENOMEM; } diff --git a/drivers/infiniband/hw/mlx5/fs.h b/drivers/infiniband/hw/mlx5/fs.h index 2ebe86e5be..7abba0e283 100644 --- a/drivers/infiniband/hw/mlx5/fs.h +++ b/drivers/infiniband/hw/mlx5/fs.h @@ -13,6 +13,8 @@ void mlx5_ib_fs_cleanup_anchor(struct mlx5_ib_dev *dev); static inline void mlx5_ib_fs_cleanup(struct mlx5_ib_dev *dev) { + int i; + /* When a steering anchor is created, a special flow table is also * created for the user to reference. Since the user can reference it, * the kernel cannot trust that when the user destroys the steering @@ -25,8 +27,10 @@ static inline void mlx5_ib_fs_cleanup(struct mlx5_ib_dev *dev) * is a safe assumption that all references are gone. */ mlx5_ib_fs_cleanup_anchor(dev); - kfree(dev->flow_db->rdma_transport_tx); - kfree(dev->flow_db->rdma_transport_rx); + for (i = 0; i < MLX5_RDMA_TRANSPORT_BYPASS_PRIO; i++) + kfree(dev->flow_db->rdma_transport_tx[i]); + for (i = 0; i < MLX5_RDMA_TRANSPORT_BYPASS_PRIO; i++) + kfree(dev->flow_db->rdma_transport_rx[i]); kfree(dev->flow_db); } #endif /* _MLX5_IB_FS_H */ diff --git a/drivers/infiniband/hw/mlx5/gsi.c b/drivers/infiniband/hw/mlx5/gsi.c index b804f2dd56..d5487834ed 100644 --- a/drivers/infiniband/hw/mlx5/gsi.c +++ b/drivers/infiniband/hw/mlx5/gsi.c @@ -131,8 +131,9 @@ int mlx5_ib_create_gsi(struct ib_pd *pd, struct mlx5_ib_qp *mqp, gsi->cq = ib_alloc_cq(pd->device, gsi, attr->cap.max_send_wr, 0, IB_POLL_SOFTIRQ); if (IS_ERR(gsi->cq)) { - mlx5_ib_warn(dev, "unable to create send CQ for GSI QP. error %ld\n", - PTR_ERR(gsi->cq)); + mlx5_ib_warn(dev, + "unable to create send CQ for GSI QP. 
error %pe\n", + gsi->cq); ret = PTR_ERR(gsi->cq); goto err_free_wrs; } @@ -147,8 +148,9 @@ int mlx5_ib_create_gsi(struct ib_pd *pd, struct mlx5_ib_qp *mqp, gsi->rx_qp = ib_create_qp(pd, &hw_init_attr); if (IS_ERR(gsi->rx_qp)) { - mlx5_ib_warn(dev, "unable to create hardware GSI QP. error %ld\n", - PTR_ERR(gsi->rx_qp)); + mlx5_ib_warn(dev, + "unable to create hardware GSI QP. error %pe\n", + gsi->rx_qp); ret = PTR_ERR(gsi->rx_qp); goto err_destroy_cq; } @@ -294,8 +296,9 @@ static void setup_qp(struct mlx5_ib_gsi_qp *gsi, u16 qp_index) qp = create_gsi_ud_qp(gsi); if (IS_ERR(qp)) { - mlx5_ib_warn(dev, "unable to create hardware UD QP for GSI: %ld\n", - PTR_ERR(qp)); + mlx5_ib_warn(dev, + "unable to create hardware UD QP for GSI: %pe\n", + qp); return; } diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c index 49af1cfbe6..bbecca4051 100644 --- a/drivers/infiniband/hw/mlx5/ib_rep.c +++ b/drivers/infiniband/hw/mlx5/ib_rep.c @@ -44,6 +44,63 @@ static void mlx5_ib_num_ports_update(struct mlx5_core_dev *dev, u32 *num_ports) } } +static int mlx5_ib_set_owner_transport(struct mlx5_core_dev *cur_owner, + struct mlx5_core_dev *new_owner) +{ + int ret; + + if (!MLX5_CAP_FLOWTABLE_RDMA_TRANSPORT_TX(cur_owner, ft_support) || + !MLX5_CAP_FLOWTABLE_RDMA_TRANSPORT_RX(cur_owner, ft_support)) + return 0; + + if (!MLX5_CAP_ADV_RDMA(new_owner, rdma_transport_manager) || + !MLX5_CAP_ADV_RDMA(new_owner, rdma_transport_manager_other_eswitch)) + return 0; + + ret = mlx5_fs_set_root_dev(cur_owner, new_owner, + FS_FT_RDMA_TRANSPORT_TX); + if (ret) + return ret; + + ret = mlx5_fs_set_root_dev(cur_owner, new_owner, + FS_FT_RDMA_TRANSPORT_RX); + if (ret) { + mlx5_fs_set_root_dev(cur_owner, cur_owner, + FS_FT_RDMA_TRANSPORT_TX); + return ret; + } + + return 0; +} + +static void mlx5_ib_release_transport(struct mlx5_core_dev *dev) +{ + struct mlx5_core_dev *peer_dev; + int i, ret; + + mlx5_lag_for_each_peer_mdev(dev, peer_dev, i) { + ret = 
mlx5_ib_set_owner_transport(peer_dev, peer_dev); + WARN_ON_ONCE(ret); + } +} + +static int mlx5_ib_take_transport(struct mlx5_core_dev *dev) +{ + struct mlx5_core_dev *peer_dev; + int ret; + int i; + + mlx5_lag_for_each_peer_mdev(dev, peer_dev, i) { + ret = mlx5_ib_set_owner_transport(peer_dev, dev); + if (ret) { + mlx5_ib_release_transport(dev); + return ret; + } + } + + return 0; +} + static int mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) { @@ -88,9 +145,18 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) else return mlx5_ib_set_vport_rep(lag_master, rep, vport_index); - ibdev = ib_alloc_device(mlx5_ib_dev, ib_dev); - if (!ibdev) - return -ENOMEM; + if (mlx5_lag_is_shared_fdb(dev)) { + ret = mlx5_ib_take_transport(lag_master); + if (ret) + return ret; + } + + ibdev = ib_alloc_device_with_net(mlx5_ib_dev, ib_dev, + mlx5_core_net(lag_master)); + if (!ibdev) { + ret = -ENOMEM; + goto release_transport; + } ibdev->port = kcalloc(num_ports, sizeof(*ibdev->port), GFP_KERNEL); @@ -126,6 +192,10 @@ fail_add: kfree(ibdev->port); fail_port: ib_dealloc_device(&ibdev->ib_dev); +release_transport: + if (mlx5_lag_is_shared_fdb(lag_master)) + mlx5_ib_release_transport(lag_master); + return ret; } @@ -181,6 +251,7 @@ mlx5_ib_vport_rep_unload(struct mlx5_eswitch_rep *rep) esw = peer_mdev->priv.eswitch; mlx5_eswitch_unregister_vport_reps(esw, REP_IB); } + mlx5_ib_release_transport(mdev); } __mlx5_ib_remove(dev, dev->profile, MLX5_IB_STAGE_MAX); } diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 64f1e0fafd..36f06ea8a6 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -50,6 +51,7 @@ #include #include "macsec.h" #include "data_direct.h" +#include "dmah.h" #define UVERBS_MODULE_NAME mlx5_ib #include @@ -485,6 +487,10 @@ static int 
translate_eth_ext_proto_oper(u32 eth_proto_oper, u16 *active_speed, *active_width = IB_WIDTH_2X; *active_speed = IB_SPEED_NDR; break; + case MLX5E_PROT_MASK(MLX5E_200GAUI_1_200GBASE_CR1_KR1): + *active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_XDR; + break; case MLX5E_PROT_MASK(MLX5E_400GAUI_8_400GBASE_CR8): *active_width = IB_WIDTH_8X; *active_speed = IB_SPEED_HDR; @@ -493,10 +499,18 @@ static int translate_eth_ext_proto_oper(u32 eth_proto_oper, u16 *active_speed, *active_width = IB_WIDTH_4X; *active_speed = IB_SPEED_NDR; break; + case MLX5E_PROT_MASK(MLX5E_400GAUI_2_400GBASE_CR2_KR2): + *active_width = IB_WIDTH_2X; + *active_speed = IB_SPEED_XDR; + break; case MLX5E_PROT_MASK(MLX5E_800GAUI_8_800GBASE_CR8_KR8): *active_width = IB_WIDTH_8X; *active_speed = IB_SPEED_NDR; break; + case MLX5E_PROT_MASK(MLX5E_800GAUI_4_800GBASE_CR4_KR4): + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_XDR; + break; default: return -EINVAL; } @@ -828,7 +842,7 @@ static int mlx5_query_node_guid(struct mlx5_ib_dev *dev, break; case MLX5_VPORT_ACCESS_METHOD_NIC: - err = mlx5_query_nic_vport_node_guid(dev->mdev, &tmp); + err = mlx5_query_nic_vport_node_guid(dev->mdev, 0, false, &tmp); break; default: @@ -870,6 +884,51 @@ static void fill_esw_mgr_reg_c0(struct mlx5_core_dev *mdev, resp->reg_c0.mask = mlx5_eswitch_get_vport_metadata_mask(); } +/* + * Calculate maximum SQ overhead across all QP types. + * Other QP types (REG_UMR, UC, RC, UD/SMI/GSI, XRC_TGT) + * have smaller overhead than the types calculated below, + * so they are implicitly included. 
+ */ +static u32 mlx5_ib_calc_max_sq_overhead(void) +{ + u32 max_overhead_xrc, overhead_ud_lso, a, b; + + /* XRC_INI */ + max_overhead_xrc = sizeof(struct mlx5_wqe_xrc_seg); + max_overhead_xrc += sizeof(struct mlx5_wqe_ctrl_seg); + a = sizeof(struct mlx5_wqe_atomic_seg) + + sizeof(struct mlx5_wqe_raddr_seg); + b = sizeof(struct mlx5_wqe_umr_ctrl_seg) + + sizeof(struct mlx5_mkey_seg) + + MLX5_IB_SQ_UMR_INLINE_THRESHOLD / MLX5_IB_UMR_OCTOWORD; + max_overhead_xrc += max(a, b); + + /* UD with LSO */ + overhead_ud_lso = sizeof(struct mlx5_wqe_ctrl_seg); + overhead_ud_lso += sizeof(struct mlx5_wqe_eth_pad); + overhead_ud_lso += sizeof(struct mlx5_wqe_eth_seg); + overhead_ud_lso += sizeof(struct mlx5_wqe_datagram_seg); + + return max(max_overhead_xrc, overhead_ud_lso); +} + +static u32 mlx5_ib_calc_max_qp_wr(struct mlx5_ib_dev *dev) +{ + struct mlx5_core_dev *mdev = dev->mdev; + u32 max_wqe_bb_units = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz); + u32 max_wqe_size; + /* max QP overhead + 1 SGE, no inline, no special features */ + max_wqe_size = mlx5_ib_calc_max_sq_overhead() + + sizeof(struct mlx5_wqe_data_seg); + + max_wqe_size = roundup_pow_of_two(max_wqe_size); + + max_wqe_size = ALIGN(max_wqe_size, MLX5_SEND_WQE_BB); + + return (max_wqe_bb_units * MLX5_SEND_WQE_BB) / max_wqe_size; +} + static int mlx5_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props, struct ib_udata *uhw) @@ -1028,7 +1087,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->max_mr_size = ~0ull; props->page_size_cap = ~(min_page_size - 1); props->max_qp = 1 << MLX5_CAP_GEN(mdev, log_max_qp); - props->max_qp_wr = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz); + props->max_qp_wr = mlx5_ib_calc_max_qp_wr(dev); max_rq_sg = MLX5_CAP_GEN(mdev, max_wqe_sz_rq) / sizeof(struct mlx5_wqe_data_seg); max_sq_desc = min_t(int, MLX5_CAP_GEN(mdev, max_wqe_sz_sq), 512); @@ -1779,10 +1838,45 @@ static void deallocate_uars(struct mlx5_ib_dev *dev, context->devx_uid); } +static int 
mlx5_ib_enable_lb_mp(struct mlx5_core_dev *master, + struct mlx5_core_dev *slave, + struct mlx5_ib_lb_state *lb_state) +{ + int err; + + err = mlx5_nic_vport_update_local_lb(master, true); + if (err) + return err; + + err = mlx5_nic_vport_update_local_lb(slave, true); + if (err) + goto out; + + lb_state->force_enable = true; + return 0; + +out: + mlx5_nic_vport_update_local_lb(master, false); + return err; +} + +static void mlx5_ib_disable_lb_mp(struct mlx5_core_dev *master, + struct mlx5_core_dev *slave, + struct mlx5_ib_lb_state *lb_state) +{ + mlx5_nic_vport_update_local_lb(slave, false); + mlx5_nic_vport_update_local_lb(master, false); + + lb_state->force_enable = false; +} + int mlx5_ib_enable_lb(struct mlx5_ib_dev *dev, bool td, bool qp) { int err = 0; + if (dev->lb.force_enable) + return 0; + mutex_lock(&dev->lb.mutex); if (td) dev->lb.user_td++; @@ -1804,6 +1898,9 @@ int mlx5_ib_enable_lb(struct mlx5_ib_dev *dev, bool td, bool qp) void mlx5_ib_disable_lb(struct mlx5_ib_dev *dev, bool td, bool qp) { + if (dev->lb.force_enable) + return; + mutex_lock(&dev->lb.mutex); if (td) dev->lb.user_td--; @@ -2954,14 +3051,16 @@ int mlx5_ib_dev_res_cq_init(struct mlx5_ib_dev *dev) pd = ib_alloc_pd(ibdev, 0); if (IS_ERR(pd)) { ret = PTR_ERR(pd); - mlx5_ib_err(dev, "Couldn't allocate PD for res init, err=%d\n", ret); + mlx5_ib_err(dev, "Couldn't allocate PD for res init, err=%pe\n", + pd); goto unlock; } cq = ib_create_cq(ibdev, NULL, NULL, NULL, &cq_attr); if (IS_ERR(cq)) { ret = PTR_ERR(cq); - mlx5_ib_err(dev, "Couldn't create CQ for res init, err=%d\n", ret); + mlx5_ib_err(dev, "Couldn't create CQ for res init, err=%pe\n", + cq); ib_dealloc_pd(pd); goto unlock; } @@ -3005,7 +3104,9 @@ int mlx5_ib_dev_res_srq_init(struct mlx5_ib_dev *dev) s0 = ib_create_srq(devr->p0, &attr); if (IS_ERR(s0)) { ret = PTR_ERR(s0); - mlx5_ib_err(dev, "Couldn't create SRQ 0 for res init, err=%d\n", ret); + mlx5_ib_err(dev, + "Couldn't create SRQ 0 for res init, err=%pe\n", + s0); goto unlock; 
} @@ -3017,8 +3118,11 @@ int mlx5_ib_dev_res_srq_init(struct mlx5_ib_dev *dev) s1 = ib_create_srq(devr->p0, &attr); if (IS_ERR(s1)) { ret = PTR_ERR(s1); - mlx5_ib_err(dev, "Couldn't create SRQ 1 for res init, err=%d\n", ret); + mlx5_ib_err(dev, + "Couldn't create SRQ 1 for res init, err=%pe\n", + s1); ib_destroy_srq(s0); + goto unlock; } devr->s0 = s0; @@ -3078,6 +3182,7 @@ mlx5_ib_create_data_direct_resources(struct mlx5_ib_dev *dev) { int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); struct mlx5_core_dev *mdev = dev->mdev; + bool ro_supp = false; void *mkc; u32 mkey; u32 pdn; @@ -3106,14 +3211,37 @@ mlx5_ib_create_data_direct_resources(struct mlx5_ib_dev *dev) MLX5_SET(mkc, mkc, length64, 1); MLX5_SET(mkc, mkc, qpn, 0xffffff); err = mlx5_core_create_mkey(mdev, &mkey, in, inlen); - kvfree(in); if (err) - goto err; + goto err_mkey; dev->ddr.mkey = mkey; dev->ddr.pdn = pdn; + + /* create another mkey with RO support */ + if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write)) { + MLX5_SET(mkc, mkc, relaxed_ordering_write, 1); + ro_supp = true; + } + + if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read)) { + MLX5_SET(mkc, mkc, relaxed_ordering_read, 1); + ro_supp = true; + } + + if (ro_supp) { + err = mlx5_core_create_mkey(mdev, &mkey, in, inlen); + /* RO is defined as best effort */ + if (!err) { + dev->ddr.mkey_ro = mkey; + dev->ddr.mkey_ro_valid = true; + } + } + + kvfree(in); return 0; +err_mkey: + kvfree(in); err: mlx5_core_dealloc_pd(mdev, pdn); return err; @@ -3122,6 +3250,10 @@ err: static void mlx5_ib_free_data_direct_resources(struct mlx5_ib_dev *dev) { + + if (dev->ddr.mkey_ro_valid) + mlx5_core_destroy_mkey(dev->mdev, dev->ddr.mkey_ro); + mlx5_core_destroy_mkey(dev->mdev, dev->ddr.mkey); mlx5_core_dealloc_pd(dev->mdev, dev->ddr.pdn); } @@ -3483,6 +3615,8 @@ static void mlx5_ib_unbind_slave_port(struct mlx5_ib_dev *ibdev, lockdep_assert_held(&mlx5_ib_multiport_mutex); + mlx5_ib_disable_lb_mp(ibdev->mdev, mpi->mdev, &ibdev->lb); + 
mlx5_core_mp_event_replay(ibdev->mdev, MLX5_DRIVER_EVENT_AFFILIATION_REMOVED, NULL); @@ -3578,6 +3712,10 @@ static bool mlx5_ib_bind_slave_port(struct mlx5_ib_dev *ibdev, MLX5_DRIVER_EVENT_AFFILIATION_DONE, &key); + err = mlx5_ib_enable_lb_mp(ibdev->mdev, mpi->mdev, &ibdev->lb); + if (err) + goto unbind; + return true; unbind: @@ -4145,7 +4283,9 @@ static const struct ib_device_ops mlx5_ib_dev_ops = { .modify_port = mlx5_ib_modify_port, .modify_qp = mlx5_ib_modify_qp, .modify_srq = mlx5_ib_modify_srq, + .pre_destroy_cq = mlx5_ib_pre_destroy_cq, .poll_cq = mlx5_ib_poll_cq, + .post_destroy_cq = mlx5_ib_post_destroy_cq, .post_recv = mlx5_ib_post_recv_nodrain, .post_send = mlx5_ib_post_send_nodrain, .post_srq_recv = mlx5_ib_post_srq_recv, @@ -4167,6 +4307,7 @@ static const struct ib_device_ops mlx5_ib_dev_ops = { INIT_RDMA_OBJ_SIZE(ib_ah, mlx5_ib_ah, ibah), INIT_RDMA_OBJ_SIZE(ib_counters, mlx5_ib_mcounters, ibcntrs), INIT_RDMA_OBJ_SIZE(ib_cq, mlx5_ib_cq, ibcq), + INIT_RDMA_OBJ_SIZE(ib_dmah, mlx5_ib_dmah, ibdmah), INIT_RDMA_OBJ_SIZE(ib_pd, mlx5_ib_pd, ibpd), INIT_RDMA_OBJ_SIZE(ib_qp, mlx5_ib_qp, ibqp), INIT_RDMA_OBJ_SIZE(ib_srq, mlx5_ib_srq, ibsrq), @@ -4294,6 +4435,9 @@ static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev) MLX5_GENERAL_OBJ_TYPES_CAP_SW_ICM) ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_dm_ops); + if (mdev->st) + ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_dmah_ops); + ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_ops); if (IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)) @@ -4422,17 +4566,6 @@ static void mlx5_ib_stage_cong_debugfs_cleanup(struct mlx5_ib_dev *dev) mlx5_core_native_port_num(dev->mdev) - 1); } -static int mlx5_ib_stage_uar_init(struct mlx5_ib_dev *dev) -{ - dev->mdev->priv.uar = mlx5_get_uars_page(dev->mdev); - return PTR_ERR_OR_ZERO(dev->mdev->priv.uar); -} - -static void mlx5_ib_stage_uar_cleanup(struct mlx5_ib_dev *dev) -{ - mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar); -} - static int mlx5_ib_stage_bfrag_init(struct mlx5_ib_dev 
*dev) { int err; @@ -4661,9 +4794,6 @@ static const struct mlx5_ib_profile pf_profile = { STAGE_CREATE(MLX5_IB_STAGE_CONG_DEBUGFS, mlx5_ib_stage_cong_debugfs_init, mlx5_ib_stage_cong_debugfs_cleanup), - STAGE_CREATE(MLX5_IB_STAGE_UAR, - mlx5_ib_stage_uar_init, - mlx5_ib_stage_uar_cleanup), STAGE_CREATE(MLX5_IB_STAGE_BFREG, mlx5_ib_stage_bfrag_init, mlx5_ib_stage_bfrag_cleanup), @@ -4721,9 +4851,6 @@ const struct mlx5_ib_profile raw_eth_profile = { STAGE_CREATE(MLX5_IB_STAGE_CONG_DEBUGFS, mlx5_ib_stage_cong_debugfs_init, mlx5_ib_stage_cong_debugfs_cleanup), - STAGE_CREATE(MLX5_IB_STAGE_UAR, - mlx5_ib_stage_uar_init, - mlx5_ib_stage_uar_cleanup), STAGE_CREATE(MLX5_IB_STAGE_BFREG, mlx5_ib_stage_bfrag_init, mlx5_ib_stage_bfrag_cleanup), @@ -4795,7 +4922,8 @@ static struct ib_device *mlx5_ib_add_sub_dev(struct ib_device *parent, !MLX5_CAP_GEN_2(mparent->mdev, multiplane_qp_ud)) return ERR_PTR(-EOPNOTSUPP); - mplane = ib_alloc_device(mlx5_ib_dev, ib_dev); + mplane = ib_alloc_device_with_net(mlx5_ib_dev, ib_dev, + mlx5_core_net(mparent->mdev)); if (!mplane) return ERR_PTR(-ENOMEM); @@ -4909,7 +5037,8 @@ static int mlx5r_probe(struct auxiliary_device *adev, num_ports = max(MLX5_CAP_GEN(mdev, num_ports), MLX5_CAP_GEN(mdev, num_vhca_ports)); - dev = ib_alloc_device(mlx5_ib_dev, ib_dev); + dev = ib_alloc_device_with_net(mlx5_ib_dev, ib_dev, + mlx5_core_net(mdev)); if (!dev) return -ENOMEM; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index c84ef94bb9..b20d3e5efd 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -100,19 +100,6 @@ unsigned long __mlx5_umem_find_best_quantized_pgoff( __mlx5_bit_sz(typ, page_offset_fld), 0, scale, \ page_offset_quantized) -static inline unsigned long -mlx5_umem_dmabuf_find_best_pgsz(struct ib_umem_dmabuf *umem_dmabuf) -{ - /* - * mkeys used for dmabuf are fixed at PAGE_SIZE because we must be able - * to hold any sgl after a move operation. 
Ideally the mkc page size - * could be changed at runtime to be optimal, but right now the driver - * cannot do that. - */ - return ib_umem_find_best_pgsz(&umem_dmabuf->umem, PAGE_SIZE, - umem_dmabuf->umem.iova); -} - enum { MLX5_IB_MMAP_OFFSET_START = 9, MLX5_IB_MMAP_OFFSET_END = 255, @@ -316,8 +303,8 @@ struct mlx5_ib_flow_db { struct mlx5_ib_flow_prio rdma_tx[MLX5_IB_NUM_FLOW_FT]; struct mlx5_ib_flow_prio opfcs[MLX5_IB_OPCOUNTER_MAX]; struct mlx5_flow_table *lag_demux_ft; - struct mlx5_ib_flow_prio *rdma_transport_rx; - struct mlx5_ib_flow_prio *rdma_transport_tx; + struct mlx5_ib_flow_prio *rdma_transport_rx[MLX5_RDMA_TRANSPORT_BYPASS_PRIO]; + struct mlx5_ib_flow_prio *rdma_transport_tx[MLX5_RDMA_TRANSPORT_BYPASS_PRIO]; /* Protect flow steering bypass flow tables * when add/del flow rules. * only single add/removal of flow steering rule could be done @@ -348,6 +335,7 @@ struct mlx5_ib_flow_db { #define MLX5_IB_UPD_XLT_ACCESS BIT(5) #define MLX5_IB_UPD_XLT_INDIRECT BIT(6) #define MLX5_IB_UPD_XLT_DOWNGRADE BIT(7) +#define MLX5_IB_UPD_XLT_KEEP_PGSZ BIT(8) /* Private QP creation flags to be passed in ib_qp_init_attr.create_flags. 
* @@ -646,8 +634,13 @@ enum mlx5_mkey_type { MLX5_MKEY_IMPLICIT_CHILD, }; +/* Used for non-existent ph value */ +#define MLX5_IB_NO_PH 0xff + struct mlx5r_cache_rb_key { u8 ats:1; + u8 ph; + u16 st_index; unsigned int access_mode; unsigned int access_flags; unsigned int ndescs; @@ -735,6 +728,8 @@ struct mlx5_ib_mr { struct mlx5_ib_mr *dd_crossed_mr; struct list_head dd_node; u8 revoked :1; + /* Indicates previous dmabuf page fault occurred */ + u8 dmabuf_faulted:1; struct mlx5_ib_mkey null_mmkey; }; }; @@ -855,6 +850,8 @@ struct mlx5_ib_port_resources { struct mlx5_data_direct_resources { u32 pdn; u32 mkey; + u32 mkey_ro; + u8 mkey_ro_valid :1; }; struct mlx5_ib_resources { @@ -895,13 +892,14 @@ void mlx5_ib_fs_remove_op_fc(struct mlx5_ib_dev *dev, struct mlx5_ib_op_fc *opfc, enum mlx5_ib_optional_counter_type type); -int mlx5r_fs_bind_op_fc(struct ib_qp *qp, struct rdma_counter *counter, - u32 port); +int mlx5r_fs_bind_op_fc(struct ib_qp *qp, + struct mlx5_fc *fc_arr[MLX5_IB_OPCOUNTER_MAX], + struct xarray *qpn_opfc_xa, u32 port); -void mlx5r_fs_unbind_op_fc(struct ib_qp *qp, struct rdma_counter *counter); +void mlx5r_fs_unbind_op_fc(struct ib_qp *qp, struct xarray *qpn_opfc_xa); void mlx5r_fs_destroy_fcs(struct mlx5_ib_dev *dev, - struct rdma_counter *counter); + struct mlx5_fc *fc_arr[MLX5_IB_OPCOUNTER_MAX]); struct mlx5_ib_multiport_info; @@ -1002,7 +1000,6 @@ enum mlx5_ib_stages { MLX5_IB_STAGE_ODP, MLX5_IB_STAGE_COUNTERS, MLX5_IB_STAGE_CONG_DEBUGFS, - MLX5_IB_STAGE_UAR, MLX5_IB_STAGE_BFREG, MLX5_IB_STAGE_PRE_IB_REG_UMR, MLX5_IB_STAGE_WHITELIST_UID, @@ -1110,6 +1107,7 @@ struct mlx5_ib_lb_state { u32 user_td; int qps; bool enabled; + bool force_enable; }; struct mlx5_ib_pf_eq { @@ -1369,6 +1367,8 @@ int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, struct uverbs_attr_bundle *attrs); int mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); +int 
mlx5_ib_pre_destroy_cq(struct ib_cq *cq); +void mlx5_ib_post_destroy_cq(struct ib_cq *cq); int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period); int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata); @@ -1747,20 +1747,75 @@ static inline u32 smi_to_native_portnum(struct mlx5_ib_dev *dev, u32 port) return (port - 1) / dev->num_ports + 1; } +static inline unsigned int get_max_log_entity_size_cap(struct mlx5_ib_dev *dev, + int access_mode) +{ + int max_log_size = 0; + + if (access_mode == MLX5_MKC_ACCESS_MODE_MTT) + max_log_size = + MLX5_CAP_GEN_2(dev->mdev, max_mkey_log_entity_size_mtt); + else if (access_mode == MLX5_MKC_ACCESS_MODE_KSM) + max_log_size = MLX5_CAP_GEN_2( + dev->mdev, max_mkey_log_entity_size_fixed_buffer); + + if (!max_log_size || + (max_log_size > 31 && + !MLX5_CAP_GEN_2(dev->mdev, umr_log_entity_size_5))) + max_log_size = 31; + + return max_log_size; +} + +static inline unsigned int get_min_log_entity_size_cap(struct mlx5_ib_dev *dev, + int access_mode) +{ + int min_log_size = 0; + + if (access_mode == MLX5_MKC_ACCESS_MODE_KSM && + MLX5_CAP_GEN_2(dev->mdev, + min_mkey_log_entity_size_fixed_buffer_valid)) + min_log_size = MLX5_CAP_GEN_2( + dev->mdev, min_mkey_log_entity_size_fixed_buffer); + else + min_log_size = + MLX5_CAP_GEN_2(dev->mdev, log_min_mkey_entity_size); + + min_log_size = max(min_log_size, MLX5_ADAPTER_PAGE_SHIFT); + return min_log_size; +} + /* * For mkc users, instead of a page_offset the command has a start_iova which * specifies both the page_offset and the on-the-wire IOVA */ static __always_inline unsigned long mlx5_umem_mkc_find_best_pgsz(struct mlx5_ib_dev *dev, struct ib_umem *umem, - u64 iova) + u64 iova, int access_mode) { - int page_size_bits = - MLX5_CAP_GEN_2(dev->mdev, umr_log_entity_size_5) ? 
6 : 5; - unsigned long bitmap = - __mlx5_log_page_size_to_bitmap(page_size_bits, 0); + unsigned int max_log_entity_size_cap, min_log_entity_size_cap; + unsigned long bitmap; + + max_log_entity_size_cap = get_max_log_entity_size_cap(dev, access_mode); + min_log_entity_size_cap = get_min_log_entity_size_cap(dev, access_mode); + + bitmap = GENMASK_ULL(max_log_entity_size_cap, min_log_entity_size_cap); + + /* In KSM mode HW requires IOVA and mkey's page size to be aligned */ + if (access_mode == MLX5_MKC_ACCESS_MODE_KSM && iova) + bitmap &= GENMASK_ULL(__ffs64(iova), 0); return ib_umem_find_best_pgsz(umem, bitmap, iova); } +static inline unsigned long +mlx5_umem_dmabuf_find_best_pgsz(struct ib_umem_dmabuf *umem_dmabuf, + int access_mode) +{ + return mlx5_umem_mkc_find_best_pgsz(to_mdev(umem_dmabuf->umem.ibdev), + &umem_dmabuf->umem, + umem_dmabuf->umem.iova, + access_mode); +} + #endif /* MLX5_IB_H */ diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 247f7248a0..325fa04cbe 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -44,6 +44,7 @@ #include "mlx5_ib.h" #include "umr.h" #include "data_direct.h" +#include "dmah.h" enum { MAX_PENDING_REG_MR = 8, @@ -57,7 +58,7 @@ create_mkey_callback(int status, struct mlx5_async_work *context); static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, u64 iova, int access_flags, unsigned long page_size, bool populate, - int access_mode); + int access_mode, u16 st_index, u8 ph); static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr); static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr, @@ -256,6 +257,14 @@ static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc) get_mkc_octo_size(ent->rb_key.access_mode, ent->rb_key.ndescs)); MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT); + + if (ent->rb_key.ph != MLX5_IB_NO_PH) { + MLX5_SET(mkc, mkc, pcie_tph_en, 1); + MLX5_SET(mkc, mkc, pcie_tph_ph, ent->rb_key.ph); + if 
(ent->rb_key.st_index != MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX) + MLX5_SET(mkc, mkc, pcie_tph_steering_tag_index, + ent->rb_key.st_index); + } } /* Asynchronously schedule new MRs to be populated in the cache. */ @@ -525,7 +534,7 @@ static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent) ent->fill_to_high_water = false; if (ent->pending) queue_delayed_work(ent->dev->cache.wq, &ent->dwork, - msecs_to_jiffies(1000)); + secs_to_jiffies(1)); else mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0); } @@ -576,7 +585,7 @@ static void __cache_work_func(struct mlx5_cache_ent *ent) "add keys command failed, err %d\n", err); queue_delayed_work(cache->wq, &ent->dwork, - msecs_to_jiffies(1000)); + secs_to_jiffies(1)); } } } else if (ent->mkeys_queue.ci > 2 * ent->limit) { @@ -641,6 +650,14 @@ static int cache_ent_key_cmp(struct mlx5r_cache_rb_key key1, if (res) return res; + res = key1.st_index - key2.st_index; + if (res) + return res; + + res = key1.ph - key2.ph; + if (res) + return res; + /* * keep ndescs the last in the compare table since the find function * searches for an exact match on all properties and only closest @@ -712,6 +729,8 @@ mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev, smallest->rb_key.access_mode == rb_key.access_mode && smallest->rb_key.access_flags == rb_key.access_flags && smallest->rb_key.ats == rb_key.ats && + smallest->rb_key.st_index == rb_key.st_index && + smallest->rb_key.ph == rb_key.ph && smallest->rb_key.ndescs <= ndescs_limit) ? 
smallest : NULL; @@ -786,7 +805,8 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, struct mlx5r_cache_rb_key rb_key = { .ndescs = ndescs, .access_mode = access_mode, - .access_flags = get_unchangeable_access_flags(dev, access_flags) + .access_flags = get_unchangeable_access_flags(dev, access_flags), + .ph = MLX5_IB_NO_PH, }; struct mlx5_cache_ent *ent = mkey_cache_ent_from_rb_key(dev, rb_key); @@ -943,6 +963,7 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) struct rb_root *root = &dev->cache.rb_root; struct mlx5r_cache_rb_key rb_key = { .access_mode = MLX5_MKC_ACCESS_MODE_MTT, + .ph = MLX5_IB_NO_PH, }; struct mlx5_cache_ent *ent; struct rb_node *node; @@ -1119,7 +1140,8 @@ static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem, static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, struct ib_umem *umem, u64 iova, - int access_flags, int access_mode) + int access_flags, int access_mode, + u16 st_index, u8 ph) { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5r_cache_rb_key rb_key = {}; @@ -1130,7 +1152,8 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, if (umem->is_dmabuf) page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova); else - page_size = mlx5_umem_mkc_find_best_pgsz(dev, umem, iova); + page_size = mlx5_umem_mkc_find_best_pgsz(dev, umem, iova, + access_mode); if (WARN_ON(!page_size)) return ERR_PTR(-EINVAL); @@ -1138,6 +1161,8 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, rb_key.ndescs = ib_umem_num_dma_blocks(umem, page_size); rb_key.ats = mlx5_umem_needs_ats(dev, umem, access_flags); rb_key.access_flags = get_unchangeable_access_flags(dev, access_flags); + rb_key.st_index = st_index; + rb_key.ph = ph; ent = mkey_cache_ent_from_rb_key(dev, rb_key); /* * If the MR can't come from the cache then synchronously create an uncached @@ -1145,7 +1170,8 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, */ if (!ent) { mutex_lock(&dev->slow_path_mutex); - mr = 
reg_create(pd, umem, iova, access_flags, page_size, false, access_mode); + mr = reg_create(pd, umem, iova, access_flags, page_size, false, access_mode, + st_index, ph); mutex_unlock(&dev->slow_path_mutex); if (IS_ERR(mr)) return mr; @@ -1230,7 +1256,7 @@ err_1: static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, u64 iova, int access_flags, unsigned long page_size, bool populate, - int access_mode) + int access_mode, u16 st_index, u8 ph) { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_ib_mr *mr; @@ -1240,7 +1266,8 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, u32 *in; int err; bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)) && - (access_mode == MLX5_MKC_ACCESS_MODE_MTT); + (access_mode == MLX5_MKC_ACCESS_MODE_MTT) && + (ph == MLX5_IB_NO_PH); bool ksm_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM); if (!page_size) @@ -1304,6 +1331,13 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, get_octo_len(iova, umem->length, mr->page_shift)); } + if (ph != MLX5_IB_NO_PH) { + MLX5_SET(mkc, mkc, pcie_tph_en, 1); + MLX5_SET(mkc, mkc, pcie_tph_ph, ph); + if (st_index != MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX) + MLX5_SET(mkc, mkc, pcie_tph_steering_tag_index, st_index); + } + err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); if (err) { mlx5_ib_warn(dev, "create mkey failed\n"); @@ -1423,24 +1457,37 @@ struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm, } static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem, - u64 iova, int access_flags) + u64 iova, int access_flags, + struct ib_dmah *dmah) { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_ib_mr *mr = NULL; bool xlt_with_umr; + u16 st_index = MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX; + u8 ph = MLX5_IB_NO_PH; int err; + if (dmah) { + struct mlx5_ib_dmah *mdmah = to_mdmah(dmah); + + ph = dmah->ph; + if (dmah->valid_fields & BIT(IB_DMAH_CPU_ID_EXISTS)) + st_index = mdmah->st_index; 
+ } + xlt_with_umr = mlx5r_umr_can_load_pas(dev, umem->length); if (xlt_with_umr) { mr = alloc_cacheable_mr(pd, umem, iova, access_flags, - MLX5_MKC_ACCESS_MODE_MTT); + MLX5_MKC_ACCESS_MODE_MTT, + st_index, ph); } else { - unsigned long page_size = - mlx5_umem_mkc_find_best_pgsz(dev, umem, iova); + unsigned long page_size = mlx5_umem_mkc_find_best_pgsz( + dev, umem, iova, MLX5_MKC_ACCESS_MODE_MTT); mutex_lock(&dev->slow_path_mutex); mr = reg_create(pd, umem, iova, access_flags, page_size, - true, MLX5_MKC_ACCESS_MODE_MTT); + true, MLX5_MKC_ACCESS_MODE_MTT, + st_index, ph); mutex_unlock(&dev->slow_path_mutex); } if (IS_ERR(mr)) { @@ -1504,7 +1551,9 @@ static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length, return ERR_CAST(odp); mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags, - MLX5_MKC_ACCESS_MODE_MTT); + MLX5_MKC_ACCESS_MODE_MTT, + MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX, + MLX5_IB_NO_PH); if (IS_ERR(mr)) { ib_umem_release(&odp->umem); return ERR_CAST(mr); @@ -1535,7 +1584,8 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, struct ib_umem *umem; int err; - if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || dmah) + if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || + ((access_flags & IB_ACCESS_ON_DEMAND) && dmah)) return ERR_PTR(-EOPNOTSUPP); mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n", @@ -1551,7 +1601,7 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, umem = ib_umem_get(&dev->ib_dev, start, length, access_flags); if (IS_ERR(umem)) return ERR_CAST(umem); - return create_real_mr(pd, umem, iova, access_flags); + return create_real_mr(pd, umem, iova, access_flags, dmah); } static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach) @@ -1576,12 +1626,15 @@ static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = { static struct ib_mr * reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device, u64 offset, u64 length, u64 
virt_addr, - int fd, int access_flags, int access_mode) + int fd, int access_flags, int access_mode, + struct ib_dmah *dmah) { bool pinned_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM); struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_ib_mr *mr = NULL; struct ib_umem_dmabuf *umem_dmabuf; + u16 st_index = MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX; + u8 ph = MLX5_IB_NO_PH; int err; err = mlx5r_umr_resource_init(dev); @@ -1599,13 +1652,21 @@ reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device, fd, access_flags); if (IS_ERR(umem_dmabuf)) { - mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n", - PTR_ERR(umem_dmabuf)); + mlx5_ib_dbg(dev, "umem_dmabuf get failed (%pe)\n", umem_dmabuf); return ERR_CAST(umem_dmabuf); } + if (dmah) { + struct mlx5_ib_dmah *mdmah = to_mdmah(dmah); + + ph = dmah->ph; + if (dmah->valid_fields & BIT(IB_DMAH_CPU_ID_EXISTS)) + st_index = mdmah->st_index; + } + mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr, - access_flags, access_mode); + access_flags, access_mode, + st_index, ph); if (IS_ERR(mr)) { ib_umem_release(&umem_dmabuf->umem); return ERR_CAST(mr); @@ -1655,14 +1716,15 @@ reg_user_mr_dmabuf_by_data_direct(struct ib_pd *pd, u64 offset, goto end; } - /* The device's 'data direct mkey' was created without RO flags to - * simplify things and allow for a single mkey per device. - * Since RO is not a must, mask it out accordingly. + /* If no device's 'data direct mkey' with RO flags exists + * mask it out accordingly. 
*/ - access_flags &= ~IB_ACCESS_RELAXED_ORDERING; + if (!dev->ddr.mkey_ro_valid) + access_flags &= ~IB_ACCESS_RELAXED_ORDERING; crossed_mr = reg_user_mr_dmabuf(pd, &data_direct_dev->pdev->dev, offset, length, virt_addr, fd, - access_flags, MLX5_MKC_ACCESS_MODE_KSM); + access_flags, MLX5_MKC_ACCESS_MODE_KSM, + NULL); if (IS_ERR(crossed_mr)) { ret = PTR_ERR(crossed_mr); goto end; @@ -1697,7 +1759,7 @@ struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, int err; if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || - !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) || dmah) + !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) return ERR_PTR(-EOPNOTSUPP); if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS)) { @@ -1722,7 +1784,8 @@ struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, return reg_user_mr_dmabuf(pd, pd->device->dma_device, offset, length, virt_addr, - fd, access_flags, MLX5_MKC_ACCESS_MODE_MTT); + fd, access_flags, MLX5_MKC_ACCESS_MODE_MTT, + dmah); } /* @@ -1756,7 +1819,8 @@ static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr, if (!mlx5r_umr_can_load_pas(dev, new_umem->length)) return false; - *page_size = mlx5_umem_mkc_find_best_pgsz(dev, new_umem, iova); + *page_size = mlx5_umem_mkc_find_best_pgsz( + dev, new_umem, iova, mr->mmkey.cache_ent->rb_key.access_mode); if (WARN_ON(!*page_size)) return false; return (mr->mmkey.cache_ent->rb_key.ndescs) >= @@ -1819,7 +1883,8 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, struct mlx5_ib_mr *mr = to_mmr(ib_mr); int err; - if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || mr->data_direct) + if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || mr->data_direct || + mr->mmkey.rb_key.ph != MLX5_IB_NO_PH) return ERR_PTR(-EOPNOTSUPP); mlx5_ib_dbg( @@ -1863,7 +1928,7 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, atomic_sub(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages); return create_real_mr(new_pd, 
umem, mr->ibmr.iova, - new_access_flags); + new_access_flags, NULL); } /* @@ -1894,7 +1959,7 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, } return NULL; } - return create_real_mr(new_pd, new_umem, iova, new_access_flags); + return create_real_mr(new_pd, new_umem, iova, new_access_flags, NULL); } /* @@ -2080,7 +2145,7 @@ static int mlx5r_handle_mkey_cleanup(struct mlx5_ib_mr *mr) ent->in_use--; if (ent->is_tmp && !ent->tmp_cleanup_scheduled) { mod_delayed_work(ent->dev->cache.wq, &ent->dwork, - msecs_to_jiffies(30 * 1000)); + secs_to_jiffies(30)); ent->tmp_cleanup_scheduled = true; } spin_unlock_irq(&ent->mkeys_queue.lock); diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index f6abd64f07..bafb21f288 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -97,33 +97,28 @@ struct mlx5_pagefault { * a pagefault. */ #define MMU_NOTIFIER_TIMEOUT 1000 -#define MLX5_IMR_MTT_BITS (30 - PAGE_SHIFT) -#define MLX5_IMR_MTT_SHIFT (MLX5_IMR_MTT_BITS + PAGE_SHIFT) -#define MLX5_IMR_MTT_ENTRIES BIT_ULL(MLX5_IMR_MTT_BITS) -#define MLX5_IMR_MTT_SIZE BIT_ULL(MLX5_IMR_MTT_SHIFT) -#define MLX5_IMR_MTT_MASK (~(MLX5_IMR_MTT_SIZE - 1)) - -#define MLX5_KSM_PAGE_SHIFT MLX5_IMR_MTT_SHIFT - static u64 mlx5_imr_ksm_entries; +static u64 mlx5_imr_mtt_entries; +static u64 mlx5_imr_mtt_size; +static u8 mlx5_imr_mtt_shift; +static u8 mlx5_imr_ksm_page_shift; -static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries, +static void populate_ksm(struct mlx5_ksm *pksm, size_t idx, size_t nentries, struct mlx5_ib_mr *imr, int flags) { struct mlx5_core_dev *dev = mr_to_mdev(imr)->mdev; - struct mlx5_klm *end = pklm + nentries; - int step = MLX5_CAP_ODP(dev, mem_page_fault) ? MLX5_IMR_MTT_SIZE : 0; + struct mlx5_ksm *end = pksm + nentries; + u64 step = MLX5_CAP_ODP(dev, mem_page_fault) ? mlx5_imr_mtt_size : 0; __be32 key = MLX5_CAP_ODP(dev, mem_page_fault) ? 
cpu_to_be32(imr->null_mmkey.key) : mr_to_mdev(imr)->mkeys.null_mkey; u64 va = - MLX5_CAP_ODP(dev, mem_page_fault) ? idx * MLX5_IMR_MTT_SIZE : 0; + MLX5_CAP_ODP(dev, mem_page_fault) ? idx * mlx5_imr_mtt_size : 0; if (flags & MLX5_IB_UPD_XLT_ZAP) { - for (; pklm != end; pklm++, idx++, va += step) { - pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); - pklm->key = key; - pklm->va = cpu_to_be64(va); + for (; pksm != end; pksm++, idx++, va += step) { + pksm->key = key; + pksm->va = cpu_to_be64(va); } return; } @@ -147,16 +142,15 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries, */ lockdep_assert_held(&to_ib_umem_odp(imr->umem)->umem_mutex); - for (; pklm != end; pklm++, idx++, va += step) { + for (; pksm != end; pksm++, idx++, va += step) { struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx); - pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); if (mtt) { - pklm->key = cpu_to_be32(mtt->ibmr.lkey); - pklm->va = cpu_to_be64(idx * MLX5_IMR_MTT_SIZE); + pksm->key = cpu_to_be32(mtt->ibmr.lkey); + pksm->va = cpu_to_be64(idx * mlx5_imr_mtt_size); } else { - pklm->key = key; - pklm->va = cpu_to_be64(va); + pksm->key = key; + pksm->va = cpu_to_be64(va); } } } @@ -201,7 +195,7 @@ int mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries, struct mlx5_ib_mr *mr, int flags) { if (flags & MLX5_IB_UPD_XLT_INDIRECT) { - populate_klm(xlt, idx, nentries, mr, flags); + populate_ksm(xlt, idx, nentries, mr, flags); return 0; } else { return populate_mtt(xlt, idx, nentries, mr, flags); @@ -226,7 +220,7 @@ static void free_implicit_child_mr_work(struct work_struct *work) mutex_lock(&odp_imr->umem_mutex); mlx5r_umr_update_xlt(mr->parent, - ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT, 1, 0, + ib_umem_start(odp) >> mlx5_imr_mtt_shift, 1, 0, MLX5_IB_UPD_XLT_INDIRECT | MLX5_IB_UPD_XLT_ATOMIC); mutex_unlock(&odp_imr->umem_mutex); mlx5_ib_dereg_mr(&mr->ibmr, NULL); @@ -237,7 +231,7 @@ static void free_implicit_child_mr_work(struct work_struct *work) static void 
destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr) { struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); - unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT; + unsigned long idx = ib_umem_start(odp) >> mlx5_imr_mtt_shift; struct mlx5_ib_mr *imr = mr->parent; /* @@ -425,7 +419,10 @@ static void internal_fill_odp_caps(struct mlx5_ib_dev *dev) if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) && MLX5_CAP_GEN(dev->mdev, null_mkey) && MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset) && - !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled)) + !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled) && + mlx5_imr_ksm_entries != 0 && + !(mlx5_imr_ksm_page_shift > + get_max_log_entity_size_cap(dev, MLX5_MKC_ACCESS_MODE_KSM))) caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT; } @@ -476,14 +473,14 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, int err; odp = ib_umem_odp_alloc_child(to_ib_umem_odp(imr->umem), - idx * MLX5_IMR_MTT_SIZE, - MLX5_IMR_MTT_SIZE, &mlx5_mn_ops); + idx * mlx5_imr_mtt_size, + mlx5_imr_mtt_size, &mlx5_mn_ops); if (IS_ERR(odp)) return ERR_CAST(odp); mr = mlx5_mr_cache_alloc(dev, imr->access_flags, MLX5_MKC_ACCESS_MODE_MTT, - MLX5_IMR_MTT_ENTRIES); + mlx5_imr_mtt_entries); if (IS_ERR(mr)) { ib_umem_odp_release(odp); return mr; @@ -495,7 +492,7 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, mr->umem = &odp->umem; mr->ibmr.lkey = mr->mmkey.key; mr->ibmr.rkey = mr->mmkey.key; - mr->ibmr.iova = idx * MLX5_IMR_MTT_SIZE; + mr->ibmr.iova = idx * mlx5_imr_mtt_size; mr->parent = imr; odp->private = mr; @@ -506,7 +503,7 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, refcount_set(&mr->mmkey.usecount, 2); err = mlx5r_umr_update_xlt(mr, 0, - MLX5_IMR_MTT_ENTRIES, + mlx5_imr_mtt_entries, PAGE_SHIFT, MLX5_IB_UPD_XLT_ZAP | MLX5_IB_UPD_XLT_ENABLE); @@ -611,7 +608,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, struct mlx5_ib_mr *imr; int err; - if 
(!mlx5r_umr_can_load_pas(dev, MLX5_IMR_MTT_ENTRIES * PAGE_SIZE)) + if (!mlx5r_umr_can_load_pas(dev, mlx5_imr_mtt_entries * PAGE_SIZE)) return ERR_PTR(-EOPNOTSUPP); umem_odp = ib_umem_odp_alloc_implicit(&dev->ib_dev, access_flags); @@ -647,7 +644,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, err = mlx5r_umr_update_xlt(imr, 0, mlx5_imr_ksm_entries, - MLX5_KSM_PAGE_SHIFT, + mlx5_imr_ksm_page_shift, MLX5_IB_UPD_XLT_INDIRECT | MLX5_IB_UPD_XLT_ZAP | MLX5_IB_UPD_XLT_ENABLE); @@ -750,20 +747,20 @@ static int pagefault_implicit_mr(struct mlx5_ib_mr *imr, struct ib_umem_odp *odp_imr, u64 user_va, size_t bcnt, u32 *bytes_mapped, u32 flags) { - unsigned long end_idx = (user_va + bcnt - 1) >> MLX5_IMR_MTT_SHIFT; + unsigned long end_idx = (user_va + bcnt - 1) >> mlx5_imr_mtt_shift; unsigned long upd_start_idx = end_idx + 1; unsigned long upd_len = 0; unsigned long npages = 0; int err; int ret; - if (unlikely(user_va >= mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE || - mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE - user_va < bcnt)) + if (unlikely(user_va >= mlx5_imr_ksm_entries * mlx5_imr_mtt_size || + mlx5_imr_ksm_entries * mlx5_imr_mtt_size - user_va < bcnt)) return -EFAULT; /* Fault each child mr that intersects with our interval. */ while (bcnt) { - unsigned long idx = user_va >> MLX5_IMR_MTT_SHIFT; + unsigned long idx = user_va >> mlx5_imr_mtt_shift; struct ib_umem_odp *umem_odp; struct mlx5_ib_mr *mtt; u64 len; @@ -836,9 +833,13 @@ static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt, u32 *bytes_mapped, u32 flags) { struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem); + int access_mode = mr->data_direct ? 
MLX5_MKC_ACCESS_MODE_KSM : + MLX5_MKC_ACCESS_MODE_MTT; + unsigned int old_page_shift = mr->page_shift; + unsigned int page_shift; + unsigned long page_size; u32 xlt_flags = 0; int err; - unsigned long page_size; if (flags & MLX5_PF_FLAGS_ENABLE) xlt_flags |= MLX5_IB_UPD_XLT_ENABLE; @@ -850,20 +851,33 @@ static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt, return err; } - page_size = mlx5_umem_dmabuf_find_best_pgsz(umem_dmabuf); + page_size = mlx5_umem_dmabuf_find_best_pgsz(umem_dmabuf, access_mode); if (!page_size) { ib_umem_dmabuf_unmap_pages(umem_dmabuf); err = -EINVAL; } else { - if (mr->data_direct) - err = mlx5r_umr_update_data_direct_ksm_pas(mr, xlt_flags); - else - err = mlx5r_umr_update_mr_pas(mr, xlt_flags); + page_shift = order_base_2(page_size); + if (page_shift != mr->page_shift && mr->dmabuf_faulted) { + err = mlx5r_umr_dmabuf_update_pgsz(mr, xlt_flags, + page_shift); + } else { + mr->page_shift = page_shift; + if (mr->data_direct) + err = mlx5r_umr_update_data_direct_ksm_pas( + mr, xlt_flags); + else + err = mlx5r_umr_update_mr_pas(mr, + xlt_flags); + } } dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv); - if (err) + if (err) { + mr->page_shift = old_page_shift; return err; + } + + mr->dmabuf_faulted = 1; if (bytes_mapped) *bytes_mapped += bcnt; @@ -1866,6 +1880,7 @@ int mlx5_odp_init_mkey_cache(struct mlx5_ib_dev *dev) struct mlx5r_cache_rb_key rb_key = { .access_mode = MLX5_MKC_ACCESS_MODE_KSM, .ndescs = mlx5_imr_ksm_entries, + .ph = MLX5_IB_NO_PH, }; struct mlx5_cache_ent *ent; @@ -1906,9 +1921,25 @@ void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *dev) int mlx5_ib_odp_init(void) { - mlx5_imr_ksm_entries = BIT_ULL(get_order(TASK_SIZE) - - MLX5_IMR_MTT_BITS); + u32 log_va_pages = ilog2(TASK_SIZE) - PAGE_SHIFT; + u8 mlx5_imr_mtt_bits; + /* 48 is default ARM64 VA space and covers X86 4-level paging which is 47 */ + if (log_va_pages <= 48 - PAGE_SHIFT) + mlx5_imr_mtt_shift = 30; + /* 56 is x86-64, 5-level paging */ + else if 
(log_va_pages <= 56 - PAGE_SHIFT) + mlx5_imr_mtt_shift = 34; + else + return 0; + + mlx5_imr_mtt_size = BIT_ULL(mlx5_imr_mtt_shift); + mlx5_imr_mtt_bits = mlx5_imr_mtt_shift - PAGE_SHIFT; + mlx5_imr_mtt_entries = BIT_ULL(mlx5_imr_mtt_bits); + mlx5_imr_ksm_entries = BIT_ULL(get_order(TASK_SIZE) - + mlx5_imr_mtt_bits); + + mlx5_imr_ksm_page_shift = mlx5_imr_mtt_shift; return 0; } diff --git a/drivers/infiniband/hw/mlx5/qpc.c b/drivers/infiniband/hw/mlx5/qpc.c index d3dcc27220..146d03ae40 100644 --- a/drivers/infiniband/hw/mlx5/qpc.c +++ b/drivers/infiniband/hw/mlx5/qpc.c @@ -21,8 +21,10 @@ mlx5_get_rsc(struct mlx5_qp_table *table, u32 rsn) spin_lock_irqsave(&table->lock, flags); common = radix_tree_lookup(&table->tree, rsn); - if (common) + if (common && !common->invalid) refcount_inc(&common->refcount); + else + common = NULL; spin_unlock_irqrestore(&table->lock, flags); @@ -178,6 +180,18 @@ static int create_resource_common(struct mlx5_ib_dev *dev, return 0; } +static void modify_resource_common_state(struct mlx5_ib_dev *dev, + struct mlx5_core_qp *qp, + bool invalid) +{ + struct mlx5_qp_table *table = &dev->qp_table; + unsigned long flags; + + spin_lock_irqsave(&table->lock, flags); + qp->common.invalid = invalid; + spin_unlock_irqrestore(&table->lock, flags); +} + static void destroy_resource_common(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp) { @@ -609,8 +623,20 @@ err_destroy_rq: int mlx5_core_destroy_rq_tracked(struct mlx5_ib_dev *dev, struct mlx5_core_qp *rq) { + int ret; + + /* The rq destruction can be called again in case it fails, hence we + * mark the common resource as invalid and only once FW destruction + * is completed successfully we actually destroy the resources. 
+ */ + modify_resource_common_state(dev, rq, true); + ret = destroy_rq_tracked(dev, rq->qpn, rq->uid); + if (ret) { + modify_resource_common_state(dev, rq, false); + return ret; + } destroy_resource_common(dev, rq); - return destroy_rq_tracked(dev, rq->qpn, rq->uid); + return 0; } static void destroy_sq_tracked(struct mlx5_ib_dev *dev, u32 sqn, u16 uid) diff --git a/drivers/infiniband/hw/mlx5/std_types.c b/drivers/infiniband/hw/mlx5/std_types.c index bdb5684110..2fcf553044 100644 --- a/drivers/infiniband/hw/mlx5/std_types.c +++ b/drivers/infiniband/hw/mlx5/std_types.c @@ -83,33 +83,14 @@ static int fill_vport_icm_addr(struct mlx5_core_dev *mdev, u16 vport, static int fill_vport_vhca_id(struct mlx5_core_dev *mdev, u16 vport, struct mlx5_ib_uapi_query_port *info) { - size_t out_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out); - u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {}; - void *out; - int err; + int err = mlx5_vport_get_vhca_id(mdev, vport, &info->vport_vhca_id); - out = kzalloc(out_sz, GFP_KERNEL); - if (!out) - return -ENOMEM; - - MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); - MLX5_SET(query_hca_cap_in, in, other_function, true); - MLX5_SET(query_hca_cap_in, in, function_id, vport); - MLX5_SET(query_hca_cap_in, in, op_mod, - MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE | - HCA_CAP_OPMOD_GET_CUR); - - err = mlx5_cmd_exec(mdev, in, sizeof(in), out, out_sz); if (err) - goto out; - - info->vport_vhca_id = MLX5_GET(query_hca_cap_out, out, - capability.cmd_hca_cap.vhca_id); + return err; info->flags |= MLX5_IB_UAPI_QUERY_PORT_VPORT_VHCA_ID; -out: - kfree(out); - return err; + + return 0; } static int fill_multiport_info(struct mlx5_ib_dev *dev, u32 port_num, diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c index 5be4426a28..4e562e0dd9 100644 --- a/drivers/infiniband/hw/mlx5/umr.c +++ b/drivers/infiniband/hw/mlx5/umr.c @@ -32,13 +32,15 @@ static __be64 get_umr_disable_mr_mask(void) return cpu_to_be64(result); } -static __be64 
get_umr_update_translation_mask(void) +static __be64 get_umr_update_translation_mask(struct mlx5_ib_dev *dev) { u64 result; result = MLX5_MKEY_MASK_LEN | MLX5_MKEY_MASK_PAGE_SIZE | MLX5_MKEY_MASK_START_ADDR; + if (MLX5_CAP_GEN_2(dev->mdev, umr_log_entity_size_5)) + result |= MLX5_MKEY_MASK_PAGE_SIZE_5; return cpu_to_be64(result); } @@ -654,9 +656,12 @@ static void mlx5r_umr_final_update_xlt(struct mlx5_ib_dev *dev, flags & MLX5_IB_UPD_XLT_ENABLE || flags & MLX5_IB_UPD_XLT_ADDR; if (update_translation) { - wqe->ctrl_seg.mkey_mask |= get_umr_update_translation_mask(); + wqe->ctrl_seg.mkey_mask |= get_umr_update_translation_mask(dev); if (!mr->ibmr.length) MLX5_SET(mkc, &wqe->mkey_seg, length64, 1); + if (flags & MLX5_IB_UPD_XLT_KEEP_PGSZ) + wqe->ctrl_seg.mkey_mask &= + cpu_to_be64(~MLX5_MKEY_MASK_PAGE_SIZE); } wqe->ctrl_seg.xlt_octowords = @@ -664,46 +669,78 @@ static void mlx5r_umr_final_update_xlt(struct mlx5_ib_dev *dev, wqe->data_seg.byte_count = cpu_to_be32(sg->length); } +static void +_mlx5r_umr_init_wqe(struct mlx5_ib_mr *mr, struct mlx5r_umr_wqe *wqe, + struct ib_sge *sg, unsigned int flags, + unsigned int page_shift, bool dd) +{ + struct mlx5_ib_dev *dev = mr_to_mdev(mr); + + mlx5r_umr_set_update_xlt_ctrl_seg(&wqe->ctrl_seg, flags, sg); + mlx5r_umr_set_update_xlt_mkey_seg(dev, &wqe->mkey_seg, mr, page_shift); + if (dd) /* Use the data direct internal kernel PD */ + MLX5_SET(mkc, &wqe->mkey_seg, pd, dev->ddr.pdn); + mlx5r_umr_set_update_xlt_data_seg(&wqe->data_seg, sg); +} + static int -_mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags, bool dd) +_mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags, bool dd, + size_t start_block, size_t nblocks) { size_t ent_size = dd ? 
sizeof(struct mlx5_ksm) : sizeof(struct mlx5_mtt); struct mlx5_ib_dev *dev = mr_to_mdev(mr); struct device *ddev = &dev->mdev->pdev->dev; struct mlx5r_umr_wqe wqe = {}; + size_t processed_blocks = 0; struct ib_block_iter biter; + size_t cur_block_idx = 0; struct mlx5_ksm *cur_ksm; struct mlx5_mtt *cur_mtt; size_t orig_sg_length; + size_t total_blocks; size_t final_size; void *curr_entry; struct ib_sge sg; void *entry; - u64 offset = 0; + u64 offset; int err = 0; - entry = mlx5r_umr_create_xlt(dev, &sg, - ib_umem_num_dma_blocks(mr->umem, 1 << mr->page_shift), - ent_size, flags); + total_blocks = ib_umem_num_dma_blocks(mr->umem, 1UL << mr->page_shift); + if (start_block > total_blocks) + return -EINVAL; + + /* nblocks 0 means update all blocks starting from start_block */ + if (nblocks) + total_blocks = nblocks; + + entry = mlx5r_umr_create_xlt(dev, &sg, total_blocks, ent_size, flags); if (!entry) return -ENOMEM; orig_sg_length = sg.length; - mlx5r_umr_set_update_xlt_ctrl_seg(&wqe.ctrl_seg, flags, &sg); - mlx5r_umr_set_update_xlt_mkey_seg(dev, &wqe.mkey_seg, mr, - mr->page_shift); - if (dd) { - /* Use the data direct internal kernel PD */ - MLX5_SET(mkc, &wqe.mkey_seg, pd, dev->ddr.pdn); - cur_ksm = entry; - } else { - cur_mtt = entry; - } - mlx5r_umr_set_update_xlt_data_seg(&wqe.data_seg, &sg); + _mlx5r_umr_init_wqe(mr, &wqe, &sg, flags, mr->page_shift, dd); + + /* Set initial translation offset to start_block */ + offset = (u64)start_block * ent_size; + mlx5r_umr_update_offset(&wqe.ctrl_seg, offset); + + if (dd) + cur_ksm = entry; + else + cur_mtt = entry; curr_entry = entry; + rdma_umem_for_each_dma_block(mr->umem, &biter, BIT(mr->page_shift)) { + if (cur_block_idx < start_block) { + cur_block_idx++; + continue; + } + + if (nblocks && processed_blocks >= nblocks) + break; + if (curr_entry == entry + sg.length) { dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE); @@ -724,7 +761,16 @@ _mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int 
flags, bool dd) if (dd) { cur_ksm->va = cpu_to_be64(rdma_block_iter_dma_address(&biter)); - cur_ksm->key = cpu_to_be32(dev->ddr.mkey); + if (mr->access_flags & IB_ACCESS_RELAXED_ORDERING && + dev->ddr.mkey_ro_valid) + cur_ksm->key = cpu_to_be32(dev->ddr.mkey_ro); + else + cur_ksm->key = cpu_to_be32(dev->ddr.mkey); + if (mr->umem->is_dmabuf && + (flags & MLX5_IB_UPD_XLT_ZAP)) { + cur_ksm->va = 0; + cur_ksm->key = 0; + } cur_ksm++; curr_entry = cur_ksm; } else { @@ -736,6 +782,8 @@ _mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags, bool dd) cur_mtt++; curr_entry = cur_mtt; } + + processed_blocks++; } final_size = curr_entry - entry; @@ -752,13 +800,32 @@ err: return err; } -int mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr *mr, unsigned int flags) +int mlx5r_umr_update_data_direct_ksm_pas_range(struct mlx5_ib_mr *mr, + unsigned int flags, + size_t start_block, + size_t nblocks) { /* No invalidation flow is expected */ - if (WARN_ON(!mr->umem->is_dmabuf) || (flags & MLX5_IB_UPD_XLT_ZAP)) + if (WARN_ON(!mr->umem->is_dmabuf) || ((flags & MLX5_IB_UPD_XLT_ZAP) && + !(flags & MLX5_IB_UPD_XLT_KEEP_PGSZ))) return -EINVAL; - return _mlx5r_umr_update_mr_pas(mr, flags, true); + return _mlx5r_umr_update_mr_pas(mr, flags, true, start_block, nblocks); +} + +int mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr *mr, + unsigned int flags) +{ + return mlx5r_umr_update_data_direct_ksm_pas_range(mr, flags, 0, 0); +} + +int mlx5r_umr_update_mr_pas_range(struct mlx5_ib_mr *mr, unsigned int flags, + size_t start_block, size_t nblocks) +{ + if (WARN_ON(mr->umem->is_odp)) + return -EINVAL; + + return _mlx5r_umr_update_mr_pas(mr, flags, false, start_block, nblocks); } /* @@ -768,10 +835,7 @@ int mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr *mr, unsigned int fla */ int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags) { - if (WARN_ON(mr->umem->is_odp)) - return -EINVAL; - - return _mlx5r_umr_update_mr_pas(mr, flags, false); + return 
mlx5r_umr_update_mr_pas_range(mr, flags, 0, 0); } static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev) @@ -864,3 +928,202 @@ int mlx5r_umr_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, mlx5r_umr_unmap_free_xlt(dev, xlt, &sg); return err; } + +/* + * Update only the page-size (log_page_size) field of an existing memory key + * using UMR. This is useful when the MR's physical layout stays the same + * but the optimal page shift has changed (e.g. dmabuf after pages are + * pinned and the HW can switch from 4K to huge-page alignment). + */ +int mlx5r_umr_update_mr_page_shift(struct mlx5_ib_mr *mr, + unsigned int page_shift, + bool dd) +{ + struct mlx5_ib_dev *dev = mr_to_mdev(mr); + struct mlx5r_umr_wqe wqe = {}; + int err; + + /* Build UMR wqe: we touch only PAGE_SIZE, so use the dedicated mask */ + wqe.ctrl_seg.mkey_mask = get_umr_update_translation_mask(dev); + + /* MR must be free while page size is modified */ + wqe.ctrl_seg.flags = MLX5_UMR_CHECK_FREE | MLX5_UMR_INLINE; + + /* Fill mkey segment with the new page size, keep the rest unchanged */ + MLX5_SET(mkc, &wqe.mkey_seg, log_page_size, page_shift); + + if (dd) + MLX5_SET(mkc, &wqe.mkey_seg, pd, dev->ddr.pdn); + else + MLX5_SET(mkc, &wqe.mkey_seg, pd, to_mpd(mr->ibmr.pd)->pdn); + + MLX5_SET64(mkc, &wqe.mkey_seg, start_addr, mr->ibmr.iova); + MLX5_SET64(mkc, &wqe.mkey_seg, len, mr->ibmr.length); + MLX5_SET(mkc, &wqe.mkey_seg, qpn, 0xffffff); + MLX5_SET(mkc, &wqe.mkey_seg, mkey_7_0, + mlx5_mkey_variant(mr->mmkey.key)); + + err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, false); + if (!err) + mr->page_shift = page_shift; + + return err; +} + +static inline int +_mlx5r_dmabuf_umr_update_pas(struct mlx5_ib_mr *mr, unsigned int flags, + size_t start_block, size_t nblocks, bool dd) +{ + if (dd) + return mlx5r_umr_update_data_direct_ksm_pas_range(mr, flags, + start_block, + nblocks); + else + return mlx5r_umr_update_mr_pas_range(mr, flags, start_block, + nblocks); +} + +/** + * This 
function makes an mkey non-present by zapping the translation entries of + * the mkey by zapping (zeroing out) the first N entries, where N is determined + * by the largest page size supported by the device and the MR length. + * It then updates the mkey's page size to the largest possible value, ensuring + * the MR is completely non-present and safe for further updates. + * It is useful to update the page size of a dmabuf MR on a page fault. + * + * Return: On success, returns the number of entries that were zapped. + * On error, returns a negative error code. + */ +static int _mlx5r_umr_zap_mkey(struct mlx5_ib_mr *mr, + unsigned int flags, + unsigned int page_shift, + size_t *nblocks, + bool dd) +{ + unsigned int old_page_shift = mr->page_shift; + struct mlx5_ib_dev *dev = mr_to_mdev(mr); + unsigned int max_page_shift; + size_t page_shift_nblocks; + unsigned int max_log_size; + int access_mode; + int err; + + access_mode = dd ? MLX5_MKC_ACCESS_MODE_KSM : MLX5_MKC_ACCESS_MODE_MTT; + flags |= MLX5_IB_UPD_XLT_KEEP_PGSZ | MLX5_IB_UPD_XLT_ZAP | + MLX5_IB_UPD_XLT_ATOMIC; + max_log_size = get_max_log_entity_size_cap(dev, access_mode); + max_page_shift = order_base_2(mr->ibmr.length); + max_page_shift = min(max(max_page_shift, page_shift), max_log_size); + /* Count blocks in units of max_page_shift, we will zap exactly this + * many to make the whole MR non-present. + * Block size must be aligned to MLX5_UMR_FLEX_ALIGNMENT since it may + * be used as offset into the XLT later on. + */ + *nblocks = ib_umem_num_dma_blocks(mr->umem, 1UL << max_page_shift); + if (dd) + *nblocks = ALIGN(*nblocks, MLX5_UMR_KSM_NUM_ENTRIES_ALIGNMENT); + else + *nblocks = ALIGN(*nblocks, MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT); + page_shift_nblocks = ib_umem_num_dma_blocks(mr->umem, + 1UL << page_shift); + /* If the number of blocks at max possible page shift is greater than + * the number of blocks at the new page size, we should just go over the + * whole mkey entries. 
+ */ + if (*nblocks >= page_shift_nblocks) + *nblocks = 0; + + /* Make the first nblocks entries non-present without changing + * page size yet. + */ + if (*nblocks) + mr->page_shift = max_page_shift; + err = _mlx5r_dmabuf_umr_update_pas(mr, flags, 0, *nblocks, dd); + if (err) { + mr->page_shift = old_page_shift; + return err; + } + + /* Change page size to the max page size now that the MR is completely + * non-present. + */ + if (*nblocks) { + err = mlx5r_umr_update_mr_page_shift(mr, max_page_shift, dd); + if (err) { + mr->page_shift = old_page_shift; + return err; + } + } + + return 0; +} + +/** + * mlx5r_umr_dmabuf_update_pgsz - Safely update DMABUF MR page size and its + * entries accordingly + * @mr: The memory region to update + * @xlt_flags: Translation table update flags + * @page_shift: The new (optimized) page shift to use + * + * This function updates the page size and mkey translation entries for a DMABUF + * MR in a safe, multi-step process to avoid exposing partially updated mappings + * The update is performed in 5 steps: + * 1. Make the first X entries non-present, while X is calculated to be + * minimal according to a large page shift that can be used to cover the + * MR length. + * 2. Update the page size to the large supported page size + * 3. Load the remaining N-X entries according to the (optimized) page_shift + * 4. Update the page size according to the (optimized) page_shift + * 5. Load the first X entries with the correct translations + * + * This ensures that at no point is the MR accessible with a partially updated + * translation table, maintaining correctness and preventing access to stale or + * inconsistent mappings. + * + * Returns 0 on success or a negative error code on failure. 
+ */ +int mlx5r_umr_dmabuf_update_pgsz(struct mlx5_ib_mr *mr, u32 xlt_flags, + unsigned int page_shift) +{ + unsigned int old_page_shift = mr->page_shift; + size_t zapped_blocks; + size_t total_blocks; + int err; + + err = _mlx5r_umr_zap_mkey(mr, xlt_flags, page_shift, &zapped_blocks, + mr->data_direct); + if (err) + return err; + + /* _mlx5r_umr_zap_mkey already enables the mkey */ + xlt_flags &= ~MLX5_IB_UPD_XLT_ENABLE; + mr->page_shift = page_shift; + total_blocks = ib_umem_num_dma_blocks(mr->umem, 1UL << mr->page_shift); + if (zapped_blocks && zapped_blocks < total_blocks) { + /* Update PAS according to the new page size but don't update + * the page size in the mkey yet. + */ + err = _mlx5r_dmabuf_umr_update_pas( + mr, + xlt_flags | MLX5_IB_UPD_XLT_KEEP_PGSZ, + zapped_blocks, + total_blocks - zapped_blocks, + mr->data_direct); + if (err) + goto err; + } + + err = mlx5r_umr_update_mr_page_shift(mr, mr->page_shift, + mr->data_direct); + if (err) + goto err; + err = _mlx5r_dmabuf_umr_update_pas(mr, xlt_flags, 0, zapped_blocks, + mr->data_direct); + if (err) + goto err; + + return 0; +err: + mr->page_shift = old_page_shift; + return err; +} diff --git a/drivers/infiniband/hw/mlx5/umr.h b/drivers/infiniband/hw/mlx5/umr.h index 4a02c9b5aa..e9361f0140 100644 --- a/drivers/infiniband/hw/mlx5/umr.h +++ b/drivers/infiniband/hw/mlx5/umr.h @@ -94,9 +94,20 @@ struct mlx5r_umr_wqe { int mlx5r_umr_revoke_mr(struct mlx5_ib_mr *mr); int mlx5r_umr_rereg_pd_access(struct mlx5_ib_mr *mr, struct ib_pd *pd, int access_flags); -int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags); +int mlx5r_umr_update_data_direct_ksm_pas_range(struct mlx5_ib_mr *mr, + unsigned int flags, + size_t start_block, + size_t nblocks); int mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr *mr, unsigned int flags); +int mlx5r_umr_update_mr_pas_range(struct mlx5_ib_mr *mr, unsigned int flags, + size_t start_block, size_t nblocks); +int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, 
unsigned int flags); int mlx5r_umr_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, int page_shift, int flags); +int mlx5r_umr_update_mr_page_shift(struct mlx5_ib_mr *mr, + unsigned int page_shift, + bool dd); +int mlx5r_umr_dmabuf_update_pgsz(struct mlx5_ib_mr *mr, u32 xlt_flags, + unsigned int page_shift); #endif /* _MLX5_IB_UMR_H */ diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c index 9f54aa90a3..dde1910dd8 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c @@ -350,7 +350,7 @@ int pvrdma_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata) uresp.qp_tab_size = vdev->dsr->caps.max_qp; ret = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); if (ret) { - pvrdma_uar_free(vdev, &context->uar); + /* pvrdma_dealloc_ucontext() also frees the UAR */ pvrdma_dealloc_ucontext(&context->ibucontext); return -EFAULT; } diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index 0d7cd7d433..a0bf39bdcb 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -158,11 +158,10 @@ struct iavf_vlan { enum iavf_vlan_state_t { IAVF_VLAN_INVALID, IAVF_VLAN_ADD, /* filter needs to be added */ - IAVF_VLAN_IS_NEW, /* filter is new, wait for PF answer */ - IAVF_VLAN_ACTIVE, /* filter is accepted by PF */ - IAVF_VLAN_DISABLE, /* filter needs to be deleted by PF, then marked INACTIVE */ - IAVF_VLAN_INACTIVE, /* filter is inactive, we are in IFF_DOWN */ - IAVF_VLAN_REMOVE, /* filter needs to be removed from list */ + IAVF_VLAN_ADDING, /* ADD sent to PF, waiting for response */ + IAVF_VLAN_ACTIVE, /* PF confirmed, filter is in HW */ + IAVF_VLAN_REMOVE, /* filter queued for DEL from PF */ + IAVF_VLAN_REMOVING, /* DEL sent to PF, waiting for response */ }; struct iavf_vlan_filter { diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c 
b/drivers/net/ethernet/intel/iavf/iavf_main.c index 7a5efc9ea6..f50dcf75bd 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -781,10 +781,13 @@ iavf_vlan_filter *iavf_add_vlan(struct iavf_adapter *adapter, adapter->num_vlan_filters++; iavf_schedule_aq_request(adapter, IAVF_FLAG_AQ_ADD_VLAN_FILTER); } else if (f->state == IAVF_VLAN_REMOVE) { - /* IAVF_VLAN_REMOVE means that VLAN wasn't yet removed. - * We can safely only change the state here. - */ + /* DEL not yet sent to PF, cancel it */ f->state = IAVF_VLAN_ACTIVE; + } else if (f->state == IAVF_VLAN_REMOVING) { + /* DEL already sent to PF, re-add after completion */ + f->state = IAVF_VLAN_ADD; + iavf_schedule_aq_request(adapter, + IAVF_FLAG_AQ_ADD_VLAN_FILTER); } clearout: @@ -812,37 +815,19 @@ static void iavf_del_vlan(struct iavf_adapter *adapter, struct iavf_vlan vlan) list_del(&f->list); kfree(f); adapter->num_vlan_filters--; - } else { + } else if (f->state != IAVF_VLAN_REMOVING) { f->state = IAVF_VLAN_REMOVE; iavf_schedule_aq_request(adapter, IAVF_FLAG_AQ_DEL_VLAN_FILTER); } + /* If REMOVING, DEL is already sent to PF; completion + * handler will free the filter when PF confirms. 
+ */ } spin_unlock_bh(&adapter->mac_vlan_list_lock); } -/** - * iavf_restore_filters - * @adapter: board private structure - * - * Restore existing non MAC filters when VF netdev comes back up - **/ -static void iavf_restore_filters(struct iavf_adapter *adapter) -{ - struct iavf_vlan_filter *f; - - /* re-add all VLAN filters */ - spin_lock_bh(&adapter->mac_vlan_list_lock); - - list_for_each_entry(f, &adapter->vlan_filter_list, list) { - if (f->state == IAVF_VLAN_INACTIVE) - f->state = IAVF_VLAN_ADD; - } - - spin_unlock_bh(&adapter->mac_vlan_list_lock); - adapter->aq_required |= IAVF_FLAG_AQ_ADD_VLAN_FILTER; -} /** * iavf_get_num_vlans_added - get number of VLANs added @@ -1261,13 +1246,12 @@ static void iavf_up_complete(struct iavf_adapter *adapter) } /** - * iavf_clear_mac_vlan_filters - Remove mac and vlan filters not sent to PF - * yet and mark other to be removed. + * iavf_clear_mac_filters - Remove MAC filters not sent to PF yet and mark + * others to be removed. * @adapter: board private structure **/ -static void iavf_clear_mac_vlan_filters(struct iavf_adapter *adapter) +static void iavf_clear_mac_filters(struct iavf_adapter *adapter) { - struct iavf_vlan_filter *vlf, *vlftmp; struct iavf_mac_filter *f, *ftmp; spin_lock_bh(&adapter->mac_vlan_list_lock); @@ -1286,11 +1270,6 @@ static void iavf_clear_mac_vlan_filters(struct iavf_adapter *adapter) } } - /* disable all VLAN filters */ - list_for_each_entry_safe(vlf, vlftmp, &adapter->vlan_filter_list, - list) - vlf->state = IAVF_VLAN_DISABLE; - spin_unlock_bh(&adapter->mac_vlan_list_lock); } @@ -1386,7 +1365,7 @@ void iavf_down(struct iavf_adapter *adapter) iavf_napi_disable_all(adapter); iavf_irq_disable(adapter); - iavf_clear_mac_vlan_filters(adapter); + iavf_clear_mac_filters(adapter); iavf_clear_cloud_filters(adapter); iavf_clear_fdir_filters(adapter); iavf_clear_adv_rss_conf(adapter); @@ -1403,8 +1382,6 @@ void iavf_down(struct iavf_adapter *adapter) */ if (!list_empty(&adapter->mac_filter_list)) 
adapter->aq_required |= IAVF_FLAG_AQ_DEL_MAC_FILTER; - if (!list_empty(&adapter->vlan_filter_list)) - adapter->aq_required |= IAVF_FLAG_AQ_DEL_VLAN_FILTER; if (!list_empty(&adapter->cloud_filter_list)) adapter->aq_required |= IAVF_FLAG_AQ_DEL_CLOUD_FILTER; if (!list_empty(&adapter->fdir_list_head)) @@ -4559,8 +4536,6 @@ static int iavf_open(struct net_device *netdev) spin_unlock_bh(&adapter->mac_vlan_list_lock); - /* Restore filters that were removed with IFF_DOWN */ - iavf_restore_filters(adapter); iavf_restore_fdir_filters(adapter); iavf_configure(adapter); diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c index 88156082a4..147adb76f6 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c +++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c @@ -746,7 +746,7 @@ static void iavf_vlan_add_reject(struct iavf_adapter *adapter) spin_lock_bh(&adapter->mac_vlan_list_lock); list_for_each_entry_safe(f, ftmp, &adapter->vlan_filter_list, list) { - if (f->state == IAVF_VLAN_IS_NEW) { + if (f->state == IAVF_VLAN_ADDING) { list_del(&f->list); kfree(f); adapter->num_vlan_filters--; @@ -812,7 +812,7 @@ void iavf_add_vlans(struct iavf_adapter *adapter) if (f->state == IAVF_VLAN_ADD) { vvfl->vlan_id[i] = f->vlan.vid; i++; - f->state = IAVF_VLAN_IS_NEW; + f->state = IAVF_VLAN_ADDING; if (i == count) break; } @@ -874,7 +874,7 @@ void iavf_add_vlans(struct iavf_adapter *adapter) vlan->tpid = f->vlan.tpid; i++; - f->state = IAVF_VLAN_IS_NEW; + f->state = IAVF_VLAN_ADDING; } } @@ -911,22 +911,12 @@ void iavf_del_vlans(struct iavf_adapter *adapter) spin_lock_bh(&adapter->mac_vlan_list_lock); list_for_each_entry_safe(f, ftmp, &adapter->vlan_filter_list, list) { - /* since VLAN capabilities are not allowed, we dont want to send - * a VLAN delete request because it will most likely fail and - * create unnecessary errors/noise, so just free the VLAN - * filters marked for removal to enable bailing out before - * sending a 
virtchnl message - */ if (f->state == IAVF_VLAN_REMOVE && !VLAN_FILTERING_ALLOWED(adapter)) { list_del(&f->list); kfree(f); adapter->num_vlan_filters--; - } else if (f->state == IAVF_VLAN_DISABLE && - !VLAN_FILTERING_ALLOWED(adapter)) { - f->state = IAVF_VLAN_INACTIVE; - } else if (f->state == IAVF_VLAN_REMOVE || - f->state == IAVF_VLAN_DISABLE) { + } else if (f->state == IAVF_VLAN_REMOVE) { count++; } } @@ -958,18 +948,10 @@ void iavf_del_vlans(struct iavf_adapter *adapter) vvfl->vsi_id = adapter->vsi_res->vsi_id; vvfl->num_elements = count; - list_for_each_entry_safe(f, ftmp, &adapter->vlan_filter_list, list) { - if (f->state == IAVF_VLAN_DISABLE) { + list_for_each_entry(f, &adapter->vlan_filter_list, list) { + if (f->state == IAVF_VLAN_REMOVE) { vvfl->vlan_id[i] = f->vlan.vid; - f->state = IAVF_VLAN_INACTIVE; - i++; - if (i == count) - break; - } else if (f->state == IAVF_VLAN_REMOVE) { - vvfl->vlan_id[i] = f->vlan.vid; - list_del(&f->list); - kfree(f); - adapter->num_vlan_filters--; + f->state = IAVF_VLAN_REMOVING; i++; if (i == count) break; @@ -1006,9 +988,8 @@ void iavf_del_vlans(struct iavf_adapter *adapter) vvfl_v2->vport_id = adapter->vsi_res->vsi_id; vvfl_v2->num_elements = count; - list_for_each_entry_safe(f, ftmp, &adapter->vlan_filter_list, list) { - if (f->state == IAVF_VLAN_DISABLE || - f->state == IAVF_VLAN_REMOVE) { + list_for_each_entry(f, &adapter->vlan_filter_list, list) { + if (f->state == IAVF_VLAN_REMOVE) { struct virtchnl_vlan_supported_caps *filtering_support = &adapter->vlan_v2_caps.filtering.filtering_support; struct virtchnl_vlan *vlan; @@ -1022,13 +1003,7 @@ void iavf_del_vlans(struct iavf_adapter *adapter) vlan->tci = f->vlan.vid; vlan->tpid = f->vlan.tpid; - if (f->state == IAVF_VLAN_DISABLE) { - f->state = IAVF_VLAN_INACTIVE; - } else { - list_del(&f->list); - kfree(f); - adapter->num_vlan_filters--; - } + f->state = IAVF_VLAN_REMOVING; i++; if (i == count) break; @@ -2391,10 +2366,6 @@ void iavf_virtchnl_completion(struct 
iavf_adapter *adapter, ether_addr_copy(adapter->hw.mac.addr, netdev->dev_addr); wake_up(&adapter->vc_waitqueue); break; - case VIRTCHNL_OP_DEL_VLAN: - dev_err(&adapter->pdev->dev, "Failed to delete VLAN filter, error %s\n", - iavf_stat_str(&adapter->hw, v_retval)); - break; case VIRTCHNL_OP_DEL_ETH_ADDR: dev_err(&adapter->pdev->dev, "Failed to delete MAC filter, error %s\n", iavf_stat_str(&adapter->hw, v_retval)); @@ -2906,17 +2877,42 @@ void iavf_virtchnl_completion(struct iavf_adapter *adapter, spin_unlock_bh(&adapter->adv_rss_lock); } break; + case VIRTCHNL_OP_ADD_VLAN: case VIRTCHNL_OP_ADD_VLAN_V2: { struct iavf_vlan_filter *f; + if (v_retval) + break; + spin_lock_bh(&adapter->mac_vlan_list_lock); list_for_each_entry(f, &adapter->vlan_filter_list, list) { - if (f->state == IAVF_VLAN_IS_NEW) + if (f->state == IAVF_VLAN_ADDING) f->state = IAVF_VLAN_ACTIVE; } spin_unlock_bh(&adapter->mac_vlan_list_lock); } break; + case VIRTCHNL_OP_DEL_VLAN: + case VIRTCHNL_OP_DEL_VLAN_V2: { + struct iavf_vlan_filter *f, *ftmp; + + spin_lock_bh(&adapter->mac_vlan_list_lock); + list_for_each_entry_safe(f, ftmp, &adapter->vlan_filter_list, + list) { + if (f->state == IAVF_VLAN_REMOVING) { + if (v_retval) { + /* PF rejected DEL, keep filter */ + f->state = IAVF_VLAN_ACTIVE; + } else { + list_del(&f->list); + kfree(f); + adapter->num_vlan_filters--; + } + } + } + spin_unlock_bh(&adapter->mac_vlan_list_lock); + } + break; case VIRTCHNL_OP_ENABLE_VLAN_STRIPPING: /* PF enabled vlan strip on this VF. * Update netdev->features if needed to be in sync with ethtool. 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig index 6ec7d6e018..8ef2ac2060 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig +++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig @@ -8,7 +8,6 @@ config MLX5_CORE depends on PCI select AUXILIARY_BUS select NET_DEVLINK - depends on VXLAN || !VXLAN depends on MLXFW || !MLXFW depends on PTP_1588_CLOCK_OPTIONAL depends on PCI_HYPERV_INTERFACE || !PCI_HYPERV_INTERFACE diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index 568bbe5f83..d77696f46e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -17,7 +17,7 @@ mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \ fs_counters.o fs_ft_pool.o rl.o lag/debugfs.o lag/lag.o dev.o events.o wq.o lib/gid.o \ lib/devcom.o lib/pci_vsc.o lib/dm.o lib/fs_ttc.o diag/fs_tracepoint.o \ diag/fw_tracer.o diag/crdump.o devlink.o diag/rsc_dump.o diag/reporter_vnic.o \ - fw_reset.o qos.o lib/tout.o lib/aso.o wc.o fs_pool.o + fw_reset.o qos.o lib/tout.o lib/aso.o wc.o fs_pool.o lib/nv_param.o # # Netdev basic @@ -29,7 +29,7 @@ mlx5_core-$(CONFIG_MLX5_CORE_EN) += en/rqt.o en/tir.o en/rss.o en/rx_res.o \ en/reporter_tx.o en/reporter_rx.o en/params.o en/xsk/pool.o \ en/xsk/setup.o en/xsk/rx.o en/xsk/tx.o en/devlink.o en/ptp.o \ en/qos.o en/htb.o en/trap.o en/fs_tt_redirect.o en/selq.o \ - lib/crypto.o lib/sd.o + lib/crypto.o lib/sd.o en/pcie_cong_event.o # # Netdev extra @@ -69,7 +69,7 @@ mlx5_core-$(CONFIG_MLX5_TC_SAMPLE) += en/tc/sample.o # Core extra # mlx5_core-$(CONFIG_MLX5_ESWITCH) += eswitch.o eswitch_offloads.o eswitch_offloads_termtbl.o \ - ecpf.o rdma.o esw/legacy.o \ + ecpf.o rdma.o esw/legacy.o esw/adj_vport.o \ esw/devlink_port.o esw/vporttbl.o esw/qos.o esw/ipsec.o mlx5_core-$(CONFIG_MLX5_ESWITCH) += esw/acl/helper.o \ @@ -85,7 +85,9 @@ 
mlx5_core-$(CONFIG_MLX5_BRIDGE) += esw/bridge.o esw/bridge_mcast.o esw/bridge mlx5_core-$(CONFIG_HWMON) += hwmon.o mlx5_core-$(CONFIG_MLX5_MPFS) += lib/mpfs.o -mlx5_core-$(CONFIG_VXLAN) += lib/vxlan.o +ifneq ($(CONFIG_VXLAN),) + mlx5_core-y += lib/vxlan.o +endif mlx5_core-$(CONFIG_PTP_1588_CLOCK) += lib/clock.o mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += lib/hv.o lib/hv_vhca.o @@ -154,7 +156,8 @@ mlx5_core-$(CONFIG_MLX5_HW_STEERING) += steering/hws/cmd.o \ steering/hws/vport.o \ steering/hws/bwc_complex.o \ steering/hws/fs_hws_pools.o \ - steering/hws/fs_hws.o + steering/hws/fs_hws.o \ + steering/hws/action_ste_pool.o # # SF device @@ -166,5 +169,10 @@ mlx5_core-$(CONFIG_MLX5_SF) += sf/vhca_event.o sf/dev/dev.o sf/dev/driver.o irq_ # mlx5_core-$(CONFIG_MLX5_SF_MANAGER) += sf/cmd.o sf/hw_table.o sf/devlink.o +# +# TPH support +# +mlx5_core-$(CONFIG_PCIE_TPH) += lib/st.o + obj-$(CONFIG_MLX5_DPLL) += mlx5_dpll.o mlx5_dpll-y := dpll.o diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index e53dbdc0a7..5b08e5ffe0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -181,6 +181,7 @@ static int cmd_alloc_index(struct mlx5_cmd *cmd, struct mlx5_cmd_work_ent *ent) static void cmd_free_index(struct mlx5_cmd *cmd, int idx) { lockdep_assert_held(&cmd->alloc_lock); + cmd->ent_arr[idx] = NULL; set_bit(idx, &cmd->vars.bitmask); } @@ -294,6 +295,10 @@ static void poll_timeout(struct mlx5_cmd_work_ent *ent) return; } cond_resched(); + if (mlx5_cmd_is_down(dev)) { + ent->ret = -ENXIO; + return; + } } while (time_before(jiffies, poll_end)); ent->ret = -ETIMEDOUT; @@ -927,8 +932,7 @@ static void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool force static void cb_timeout_handler(struct work_struct *work) { - struct delayed_work *dwork = container_of(work, struct delayed_work, - work); + struct delayed_work *dwork = to_delayed_work(work); struct 
mlx5_cmd_work_ent *ent = container_of(dwork, struct mlx5_cmd_work_ent, cb_timeout_work); @@ -1071,7 +1075,7 @@ static void cmd_work_handler(struct work_struct *work) poll_timeout(ent); /* make sure we read the descriptor after ownership is SW */ rmb(); - mlx5_cmd_comp_handler(dev, 1ULL << ent->idx, (ent->ret == -ETIMEDOUT)); + mlx5_cmd_comp_handler(dev, 1ULL << ent->idx, !!ent->ret); } } @@ -1197,6 +1201,44 @@ out_err: return err; } +/* Check if all command slots are stalled (timed out and not recovered). + * returns true if all slots timed out on a recent command and have not been + * completed by FW yet. (stalled state) + * false otherwise (at least one slot is not stalled). + * + * In such odd situation "all_stalled", this serves as a protection mechanism + * to avoid blocking the kernel for long periods of time in case FW is not + * responding to commands. + */ +static bool mlx5_cmd_all_stalled(struct mlx5_core_dev *dev) +{ + struct mlx5_cmd *cmd = &dev->cmd; + bool all_stalled = true; + unsigned long flags; + int i; + + spin_lock_irqsave(&cmd->alloc_lock, flags); + + /* at least one command slot is free */ + if (bitmap_weight(&cmd->vars.bitmask, cmd->vars.max_reg_cmds) > 0) { + all_stalled = false; + goto out; + } + + for_each_clear_bit(i, &cmd->vars.bitmask, cmd->vars.max_reg_cmds) { + struct mlx5_cmd_work_ent *ent = dev->cmd.ent_arr[i]; + + if (!test_bit(MLX5_CMD_ENT_STATE_TIMEDOUT, &ent->state)) { + all_stalled = false; + break; + } + } +out: + spin_unlock_irqrestore(&cmd->alloc_lock, flags); + + return all_stalled; +} + /* Notes: * 1. Callback functions may not sleep * 2. 
page queue commands do not support asynchrous completion @@ -1227,6 +1269,15 @@ static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in, if (callback && page_queue) return -EINVAL; + if (!page_queue && mlx5_cmd_all_stalled(dev)) { + mlx5_core_err_rl(dev, + "All CMD slots are stalled, aborting command\n"); + /* there's no reason to wait and block the whole kernel if FW + * isn't currently responding to all slots, fail immediately + */ + return -EAGAIN; + } + ent = cmd_alloc_ent(cmd, in, out, uout, uout_size, callback, context, page_queue); if (IS_ERR(ent)) @@ -1697,6 +1748,13 @@ static void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool force if (test_bit(i, &vector)) { ent = cmd->ent_arr[i]; + if (forced && ent->ret == -ETIMEDOUT) + set_bit(MLX5_CMD_ENT_STATE_TIMEDOUT, + &ent->state); + else if (!forced) /* real FW completion */ + clear_bit(MLX5_CMD_ENT_STATE_TIMEDOUT, + &ent->state); + /* if we already completed the command, ignore it */ if (!test_and_clear_bit(MLX5_CMD_ENT_STATE_PENDING_COMP, &ent->state)) { @@ -1948,8 +2006,8 @@ static int cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, err = mlx5_cmd_invoke(dev, inb, outb, out, out_size, callback, context, pages_queue, token, force_polling); - if (callback) - return err; + if (callback && !err) + return 0; if (err > 0) /* Failed in FW, command didn't execute */ err = deliv_status_to_err(err); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c index 1fd403713b..60f7ab1d72 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c @@ -66,8 +66,8 @@ void mlx5_cq_tasklet_cb(struct tasklet_struct *t) tasklet_schedule(&ctx->task); } -static void mlx5_add_cq_to_tasklet(struct mlx5_core_cq *cq, - struct mlx5_eqe *eqe) +void mlx5_add_cq_to_tasklet(struct mlx5_core_cq *cq, + struct mlx5_eqe *eqe) { unsigned long flags; struct mlx5_eq_tasklet *tasklet_ctx = 
cq->tasklet_ctx.priv; @@ -95,7 +95,15 @@ static void mlx5_add_cq_to_tasklet(struct mlx5_core_cq *cq, if (schedule_tasklet) tasklet_schedule(&tasklet_ctx->task); } +EXPORT_SYMBOL(mlx5_add_cq_to_tasklet); +static void mlx5_core_cq_dummy_cb(struct mlx5_core_cq *cq, struct mlx5_eqe *eqe) +{ + mlx5_core_err(cq->eq->core.dev, + "CQ default completion callback, CQ #%u\n", cq->cqn); +} + +#define MLX5_CQ_INIT_CMD_SN cpu_to_be32(2 << 28) /* Callers must verify outbox status in case of err */ int mlx5_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, u32 *in, int inlen, u32 *out, int outlen) @@ -121,10 +129,19 @@ int mlx5_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, cq->arm_sn = 0; cq->eq = eq; cq->uid = MLX5_GET(create_cq_in, in, uid); + + /* Kernel CQs must set the arm_db address prior to calling + * this function, allowing for the proper value to be + * initialized. User CQs are responsible for their own + * initialization since they do not use the arm_db field. + */ + if (cq->arm_db) + *cq->arm_db = MLX5_CQ_INIT_CMD_SN; + refcount_set(&cq->refcount, 1); init_completion(&cq->free); if (!cq->comp) - cq->comp = mlx5_add_cq_to_tasklet; + cq->comp = mlx5_core_cq_dummy_cb; /* assuming CQ will be deleted before the EQ */ cq->tasklet_ctx.priv = &eq->tasklet_ctx; INIT_LIST_HEAD(&cq->tasklet_ctx.list); @@ -145,7 +162,6 @@ int mlx5_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, mlx5_core_dbg(dev, "failed adding CP 0x%x to debug file system\n", cq->cqn); - cq->uar = dev->priv.uar; cq->irqn = eq->core.irqn; return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c index 36806e813c..1301c56e20 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c @@ -613,3 +613,19 @@ void mlx5_debug_cq_remove(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq) cq->dbg = NULL; } } + +static int vhca_id_show(struct seq_file *file, 
void *priv) +{ + struct mlx5_core_dev *dev = file->private; + + seq_printf(file, "0x%x\n", MLX5_CAP_GEN(dev, vhca_id)); + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(vhca_id); + +void mlx5_vhca_debugfs_init(struct mlx5_core_dev *dev) +{ + debugfs_create_file("vhca_id", 0400, dev->priv.dbg.dbg_root, dev, + &vhca_id_fops); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/dev.c b/drivers/net/ethernet/mellanox/mlx5/core/dev.c index 891bbbbfbb..781e39b5aa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/dev.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/dev.c @@ -564,10 +564,28 @@ int mlx5_rescan_drivers_locked(struct mlx5_core_dev *dev) bool mlx5_same_hw_devs(struct mlx5_core_dev *dev, struct mlx5_core_dev *peer_dev) { - u64 fsystem_guid, psystem_guid; + u8 fsystem_guid[MLX5_SW_IMAGE_GUID_MAX_BYTES]; + u8 psystem_guid[MLX5_SW_IMAGE_GUID_MAX_BYTES]; + u8 flen; + u8 plen; - fsystem_guid = mlx5_query_nic_system_image_guid(dev); - psystem_guid = mlx5_query_nic_system_image_guid(peer_dev); + mlx5_query_nic_sw_system_image_guid(dev, fsystem_guid, &flen); + mlx5_query_nic_sw_system_image_guid(peer_dev, psystem_guid, &plen); - return (fsystem_guid && psystem_guid && fsystem_guid == psystem_guid); + return plen && flen && flen == plen && + !memcmp(fsystem_guid, psystem_guid, flen); +} + +void mlx5_core_reps_aux_devs_remove(struct mlx5_core_dev *dev) +{ + struct mlx5_priv *priv = &dev->priv; + + if (priv->adev[MLX5_INTERFACE_PROTOCOL_ETH]) + device_lock_assert(&priv->adev[MLX5_INTERFACE_PROTOCOL_ETH]->adev.dev); + else + mlx5_core_err(dev, "ETH driver already removed\n"); + if (priv->adev[MLX5_INTERFACE_PROTOCOL_IB_REP]) + del_adev(&priv->adev[MLX5_INTERFACE_PROTOCOL_IB_REP]->adev); + if (priv->adev[MLX5_INTERFACE_PROTOCOL_ETH_REP]) + del_adev(&priv->adev[MLX5_INTERFACE_PROTOCOL_ETH_REP]->adev); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c index 3b27da79ba..9fb39f42a6 100644 --- 
a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c @@ -10,6 +10,7 @@ #include "esw/qos.h" #include "sf/dev/dev.h" #include "sf/sf.h" +#include "lib/nv_param.h" static int mlx5_devlink_flash_update(struct devlink *devlink, struct devlink_flash_update_params *params, @@ -35,6 +36,55 @@ static u16 mlx5_fw_ver_subminor(u32 version) return version & 0xffff; } +static int mlx5_devlink_serial_numbers_put(struct mlx5_core_dev *dev, + struct devlink_info_req *req, + struct netlink_ext_ack *extack) +{ + struct pci_dev *pdev = dev->pdev; + unsigned int vpd_size, kw_len; + char *str, *end; + u8 *vpd_data; + int err = 0; + int start; + + vpd_data = pci_vpd_alloc(pdev, &vpd_size); + if (IS_ERR(vpd_data)) + return 0; + + start = pci_vpd_find_ro_info_keyword(vpd_data, vpd_size, + PCI_VPD_RO_KEYWORD_SERIALNO, &kw_len); + if (start >= 0) { + str = kstrndup(vpd_data + start, kw_len, GFP_KERNEL); + if (!str) { + err = -ENOMEM; + goto end; + } + end = strchrnul(str, ' '); + *end = '\0'; + err = devlink_info_board_serial_number_put(req, str); + kfree(str); + if (err) + goto end; + } + + start = pci_vpd_find_ro_info_keyword(vpd_data, vpd_size, "V3", &kw_len); + if (start >= 0) { + str = kstrndup(vpd_data + start, kw_len, GFP_KERNEL); + if (!str) { + err = -ENOMEM; + goto end; + } + err = devlink_info_serial_number_put(req, str); + kfree(str); + if (err) + goto end; + } + +end: + kfree(vpd_data); + return err; +} + #define DEVLINK_FW_STRING_LEN 32 static int @@ -49,6 +99,10 @@ mlx5_devlink_info_get(struct devlink *devlink, struct devlink_info_req *req, if (!mlx5_core_is_pf(dev)) return 0; + err = mlx5_devlink_serial_numbers_put(dev, req, extack); + if (err) + return err; + err = devlink_info_version_fixed_put(req, "fw.psid", dev->board_id); if (err) return err; @@ -107,7 +161,7 @@ static int mlx5_devlink_reload_fw_activate(struct devlink *devlink, struct netli if (err) return err; - mlx5_unload_one_devl_locked(dev, true); + 
mlx5_sync_reset_unload_flow(dev, true); err = mlx5_health_wait_pci_up(dev); if (err) NL_SET_ERR_MSG_MOD(extack, "FW activate aborted, PCI reads fail after reset"); @@ -143,6 +197,11 @@ static int mlx5_devlink_reload_down(struct devlink *devlink, bool netns_change, struct pci_dev *pdev = dev->pdev; int ret = 0; + if (mlx5_fw_reset_in_progress(dev)) { + NL_SET_ERR_MSG_MOD(extack, "Can't reload during firmware reset"); + return -EBUSY; + } + if (mlx5_dev_is_lightweight(dev)) { if (action != DEVLINK_RELOAD_ACTION_DRIVER_REINIT) return -EOPNOTSUPP; @@ -150,11 +209,6 @@ static int mlx5_devlink_reload_down(struct devlink *devlink, bool netns_change, return 0; } - if (mlx5_lag_is_active(dev)) { - NL_SET_ERR_MSG_MOD(extack, "reload is unsupported in Lag mode"); - return -EOPNOTSUPP; - } - if (mlx5_core_is_mp_slave(dev)) { NL_SET_ERR_MSG_MOD(extack, "reload is unsupported for multi port slave"); return -EOPNOTSUPP; @@ -323,6 +377,8 @@ static const struct devlink_ops mlx5_devlink_ops = { .eswitch_encap_mode_get = mlx5_devlink_eswitch_encap_mode_get, .rate_leaf_tx_share_set = mlx5_esw_devlink_rate_leaf_tx_share_set, .rate_leaf_tx_max_set = mlx5_esw_devlink_rate_leaf_tx_max_set, + .rate_leaf_tc_bw_set = mlx5_esw_devlink_rate_leaf_tc_bw_set, + .rate_node_tc_bw_set = mlx5_esw_devlink_rate_node_tc_bw_set, .rate_node_tx_share_set = mlx5_esw_devlink_rate_node_tx_share_set, .rate_node_tx_max_set = mlx5_esw_devlink_rate_node_tx_max_set, .rate_node_new = mlx5_esw_devlink_rate_node_new, @@ -479,6 +535,25 @@ mlx5_devlink_hairpin_queue_size_validate(struct devlink *devlink, u32 id, return 0; } +static int mlx5_devlink_num_doorbells_validate(struct devlink *devlink, u32 id, + union devlink_param_value val, + struct netlink_ext_ack *extack) +{ + struct mlx5_core_dev *mdev = devlink_priv(devlink); + u32 val32 = val.vu32; + u32 max_num_channels; + + max_num_channels = mlx5e_get_max_num_channels(mdev); + if (val32 > max_num_channels) { + NL_SET_ERR_MSG_FMT_MOD(extack, + "Requested 
num_doorbells (%u) exceeds max number of channels (%u)", + val32, max_num_channels); + return -EINVAL; + } + + return 0; +} + static void mlx5_devlink_hairpin_params_init_values(struct devlink *devlink) { struct mlx5_core_dev *dev = devlink_priv(devlink); @@ -558,6 +633,9 @@ static const struct devlink_param mlx5_devlink_eth_params[] = { "hairpin_queue_size", DEVLINK_PARAM_TYPE_U32, BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), NULL, NULL, mlx5_devlink_hairpin_queue_size_validate), + DEVLINK_PARAM_GENERIC(NUM_DOORBELLS, + BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), NULL, NULL, + mlx5_devlink_num_doorbells_validate), }; static int mlx5_devlink_eth_params_register(struct devlink *devlink) @@ -581,6 +659,10 @@ static int mlx5_devlink_eth_params_register(struct devlink *devlink) mlx5_devlink_hairpin_params_init_values(devlink); + value.vu32 = MLX5_DEFAULT_NUM_DOORBELLS; + devl_param_driverinit_value_set(devlink, + DEVLINK_PARAM_GENERIC_ID_NUM_DOORBELLS, + value); return 0; } @@ -595,6 +677,105 @@ static void mlx5_devlink_eth_params_unregister(struct devlink *devlink) ARRAY_SIZE(mlx5_devlink_eth_params)); } +#define MLX5_PCIE_CONG_THRESH_MAX 10000 +#define MLX5_PCIE_CONG_THRESH_DEF_LOW 7500 +#define MLX5_PCIE_CONG_THRESH_DEF_HIGH 9000 + +static int +mlx5_devlink_pcie_cong_thresh_validate(struct devlink *devl, u32 id, + union devlink_param_value val, + struct netlink_ext_ack *extack) +{ + if (val.vu16 > MLX5_PCIE_CONG_THRESH_MAX) { + NL_SET_ERR_MSG_FMT_MOD(extack, "Value %u > max supported (%u)", + val.vu16, MLX5_PCIE_CONG_THRESH_MAX); + + return -EINVAL; + } + + switch (id) { + case MLX5_DEVLINK_PARAM_ID_PCIE_CONG_IN_LOW: + case MLX5_DEVLINK_PARAM_ID_PCIE_CONG_IN_HIGH: + case MLX5_DEVLINK_PARAM_ID_PCIE_CONG_OUT_LOW: + case MLX5_DEVLINK_PARAM_ID_PCIE_CONG_OUT_HIGH: + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static void mlx5_devlink_pcie_cong_init_values(struct devlink *devlink) +{ + union devlink_param_value value; + u32 id; + + value.vu16 = 
MLX5_PCIE_CONG_THRESH_DEF_LOW; + id = MLX5_DEVLINK_PARAM_ID_PCIE_CONG_IN_LOW; + devl_param_driverinit_value_set(devlink, id, value); + + value.vu16 = MLX5_PCIE_CONG_THRESH_DEF_HIGH; + id = MLX5_DEVLINK_PARAM_ID_PCIE_CONG_IN_HIGH; + devl_param_driverinit_value_set(devlink, id, value); + + value.vu16 = MLX5_PCIE_CONG_THRESH_DEF_LOW; + id = MLX5_DEVLINK_PARAM_ID_PCIE_CONG_OUT_LOW; + devl_param_driverinit_value_set(devlink, id, value); + + value.vu16 = MLX5_PCIE_CONG_THRESH_DEF_HIGH; + id = MLX5_DEVLINK_PARAM_ID_PCIE_CONG_OUT_HIGH; + devl_param_driverinit_value_set(devlink, id, value); +} + +static const struct devlink_param mlx5_devlink_pcie_cong_params[] = { + DEVLINK_PARAM_DRIVER(MLX5_DEVLINK_PARAM_ID_PCIE_CONG_IN_LOW, + "pcie_cong_inbound_low", DEVLINK_PARAM_TYPE_U16, + BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), NULL, NULL, + mlx5_devlink_pcie_cong_thresh_validate), + DEVLINK_PARAM_DRIVER(MLX5_DEVLINK_PARAM_ID_PCIE_CONG_IN_HIGH, + "pcie_cong_inbound_high", DEVLINK_PARAM_TYPE_U16, + BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), NULL, NULL, + mlx5_devlink_pcie_cong_thresh_validate), + DEVLINK_PARAM_DRIVER(MLX5_DEVLINK_PARAM_ID_PCIE_CONG_OUT_LOW, + "pcie_cong_outbound_low", DEVLINK_PARAM_TYPE_U16, + BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), NULL, NULL, + mlx5_devlink_pcie_cong_thresh_validate), + DEVLINK_PARAM_DRIVER(MLX5_DEVLINK_PARAM_ID_PCIE_CONG_OUT_HIGH, + "pcie_cong_outbound_high", DEVLINK_PARAM_TYPE_U16, + BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), NULL, NULL, + mlx5_devlink_pcie_cong_thresh_validate), +}; + +static int mlx5_devlink_pcie_cong_params_register(struct devlink *devlink) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + int err; + + if (!mlx5_pcie_cong_event_supported(dev)) + return 0; + + err = devl_params_register(devlink, mlx5_devlink_pcie_cong_params, + ARRAY_SIZE(mlx5_devlink_pcie_cong_params)); + if (err) + return err; + + mlx5_devlink_pcie_cong_init_values(devlink); + + return 0; +} + +static void mlx5_devlink_pcie_cong_params_unregister(struct devlink 
*devlink) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + + if (!mlx5_pcie_cong_event_supported(dev)) + return; + + devl_params_unregister(devlink, mlx5_devlink_pcie_cong_params, + ARRAY_SIZE(mlx5_devlink_pcie_cong_params)); +} + static int mlx5_devlink_enable_rdma_validate(struct devlink *devlink, u32 id, union devlink_param_value val, struct netlink_ext_ack *extack) @@ -840,8 +1021,20 @@ int mlx5_devlink_params_register(struct devlink *devlink) if (err) goto max_uc_list_err; + err = mlx5_devlink_pcie_cong_params_register(devlink); + if (err) + goto pcie_cong_err; + + err = mlx5_nv_param_register_dl_params(devlink); + if (err) + goto nv_param_err; + return 0; +nv_param_err: + mlx5_devlink_pcie_cong_params_unregister(devlink); +pcie_cong_err: + mlx5_devlink_max_uc_list_params_unregister(devlink); max_uc_list_err: mlx5_devlink_auxdev_params_unregister(devlink); auxdev_reg_err: @@ -852,6 +1045,8 @@ auxdev_reg_err: void mlx5_devlink_params_unregister(struct devlink *devlink) { + mlx5_nv_param_unregister_dl_params(devlink); + mlx5_devlink_pcie_cong_params_unregister(devlink); mlx5_devlink_max_uc_list_params_unregister(devlink); mlx5_devlink_auxdev_params_unregister(devlink); devl_params_unregister(devlink, mlx5_devlink_params, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.h b/drivers/net/ethernet/mellanox/mlx5/core/devlink.h index 961f75da62..c9555119a6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.h @@ -22,6 +22,11 @@ enum mlx5_devlink_param_id { MLX5_DEVLINK_PARAM_ID_ESW_MULTIPORT, MLX5_DEVLINK_PARAM_ID_HAIRPIN_NUM_QUEUES, MLX5_DEVLINK_PARAM_ID_HAIRPIN_QUEUE_SIZE, + MLX5_DEVLINK_PARAM_ID_PCIE_CONG_IN_LOW, + MLX5_DEVLINK_PARAM_ID_PCIE_CONG_IN_HIGH, + MLX5_DEVLINK_PARAM_ID_PCIE_CONG_OUT_LOW, + MLX5_DEVLINK_PARAM_ID_PCIE_CONG_OUT_HIGH, + MLX5_DEVLINK_PARAM_ID_CQE_COMPRESSION_TYPE }; struct mlx5_trap_ctx { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c 
b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c index 080e7eab52..6b4ec457ce 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c @@ -33,6 +33,7 @@ #include "lib/eq.h" #include "fw_tracer.h" #include "fw_tracer_tracepoint.h" +#include static int mlx5_query_mtrc_caps(struct mlx5_fw_tracer *tracer) { @@ -54,7 +55,7 @@ static int mlx5_query_mtrc_caps(struct mlx5_fw_tracer *tracer) if (!MLX5_GET(mtrc_cap, out, trace_to_memory)) { mlx5_core_dbg(dev, "FWTracer: Device does not support logging traces to memory\n"); - return -ENOTSUPP; + return -EOPNOTSUPP; } tracer->trc_ver = MLX5_GET(mtrc_cap, out, trc_ver); @@ -358,6 +359,47 @@ static const char *VAL_PARM = "%llx"; static const char *REPLACE_64_VAL_PARM = "%x%x"; static const char *PARAM_CHAR = "%"; +static bool mlx5_is_valid_spec(const char *str) +{ + /* Parse format specifiers to find the actual type. + * Structure: %[flags][width][.precision][length]type + * Skip flags, width, precision & length. + */ + while (isdigit(*str) || *str == '#' || *str == '.' || *str == 'l') + str++; + + /* Check if it's a valid integer/hex specifier or %%: + * Valid formats: %x, %d, %i, %u, etc. 
+ */ + if (*str != 'x' && *str != 'X' && *str != 'd' && *str != 'i' && + *str != 'u' && *str != 'c' && *str != '%') + return false; + + return true; +} + +static bool mlx5_tracer_validate_params(const char *str) +{ + const char *substr = str; + + if (!str) + return false; + + substr = strstr(substr, PARAM_CHAR); + while (substr) { + if (!mlx5_is_valid_spec(substr + 1)) + return false; + + if (*(substr + 1) == '%') + substr = strstr(substr + 2, PARAM_CHAR); + else + substr = strstr(substr + 1, PARAM_CHAR); + + } + + return true; +} + static int mlx5_tracer_message_hash(u32 message_id) { return jhash_1word(message_id, 0) & (MESSAGE_HASH_SIZE - 1); @@ -419,6 +461,10 @@ static int mlx5_tracer_get_num_of_params(char *str) char *substr, *pstr = str; int num_of_params = 0; + /* Validate that all parameters are valid before processing */ + if (!mlx5_tracer_validate_params(str)) + return -EINVAL; + /* replace %llx with %x%x */ substr = strstr(pstr, VAL_PARM); while (substr) { @@ -427,11 +473,15 @@ static int mlx5_tracer_get_num_of_params(char *str) substr = strstr(pstr, VAL_PARM); } - /* count all the % characters */ + /* count all the % characters, but skip %% (escaped percent) */ substr = strstr(str, PARAM_CHAR); while (substr) { - num_of_params += 1; - str = substr + 1; + if (*(substr + 1) != '%') { + num_of_params += 1; + str = substr + 1; + } else { + str = substr + 2; + } substr = strstr(str, PARAM_CHAR); } @@ -570,14 +620,17 @@ void mlx5_tracer_print_trace(struct tracer_string_format *str_frmt, { char tmp[512]; - snprintf(tmp, sizeof(tmp), str_frmt->string, - str_frmt->params[0], - str_frmt->params[1], - str_frmt->params[2], - str_frmt->params[3], - str_frmt->params[4], - str_frmt->params[5], - str_frmt->params[6]); + if (str_frmt->invalid_string) + snprintf(tmp, sizeof(tmp), "BAD_FORMAT: %s", str_frmt->string); + else + snprintf(tmp, sizeof(tmp), str_frmt->string, + str_frmt->params[0], + str_frmt->params[1], + str_frmt->params[2], + str_frmt->params[3], + 
str_frmt->params[4], + str_frmt->params[5], + str_frmt->params[6]); trace_mlx5_fw(dev->tracer, trace_timestamp, str_frmt->lost, str_frmt->event_id, tmp); @@ -609,6 +662,13 @@ static int mlx5_tracer_handle_raw_string(struct mlx5_fw_tracer *tracer, return 0; } +static void mlx5_tracer_handle_bad_format_string(struct mlx5_fw_tracer *tracer, + struct tracer_string_format *cur_string) +{ + cur_string->invalid_string = true; + list_add_tail(&cur_string->list, &tracer->ready_strings_list); +} + static int mlx5_tracer_handle_string_trace(struct mlx5_fw_tracer *tracer, struct tracer_event *tracer_event) { @@ -619,12 +679,18 @@ static int mlx5_tracer_handle_string_trace(struct mlx5_fw_tracer *tracer, if (!cur_string) return mlx5_tracer_handle_raw_string(tracer, tracer_event); - cur_string->num_of_params = mlx5_tracer_get_num_of_params(cur_string->string); - cur_string->last_param_num = 0; cur_string->event_id = tracer_event->event_id; cur_string->tmsn = tracer_event->string_event.tmsn; cur_string->timestamp = tracer_event->string_event.timestamp; cur_string->lost = tracer_event->lost_event; + cur_string->last_param_num = 0; + cur_string->num_of_params = mlx5_tracer_get_num_of_params(cur_string->string); + if (cur_string->num_of_params < 0) { + pr_debug("%s Invalid format string parameters\n", + __func__); + mlx5_tracer_handle_bad_format_string(tracer, cur_string); + return 0; + } if (cur_string->num_of_params == 0) /* trace with no params */ list_add_tail(&cur_string->list, &tracer->ready_strings_list); } else { @@ -634,6 +700,11 @@ static int mlx5_tracer_handle_string_trace(struct mlx5_fw_tracer *tracer, __func__, tracer_event->string_event.tmsn); return mlx5_tracer_handle_raw_string(tracer, tracer_event); } + if (cur_string->num_of_params < 0) { + pr_debug("%s string parameter of invalid string, dumping\n", + __func__); + return 0; + } cur_string->last_param_num += 1; if (cur_string->last_param_num > TRACER_MAX_PARAMS) { pr_debug("%s Number of params exceeds the max 
(%d)\n", diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.h b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.h index 5c548bb74f..30d0bcba88 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.h @@ -125,6 +125,7 @@ struct tracer_string_format { struct list_head list; u32 timestamp; bool lost; + bool invalid_string; }; enum mlx5_fw_tracer_ownership_state { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.c index 878f9b46bf..7cae0c6e5e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. */ +#include + #include "reporter_vnic.h" #include "en_stats.h" #include "devlink.h" @@ -105,6 +107,15 @@ void mlx5_reporter_vnic_diagnose_counters(struct mlx5_core_dev *dev, } if (MLX5_CAP_GEN(dev, nic_cap_reg)) mlx5_reporter_vnic_diagnose_counter_icm(dev, fmsg, vport_num, other_vport); + if (MLX5_CAP_GEN(dev, vnic_env_cnt_bar_uar_access)) + devlink_fmsg_u32_pair_put(fmsg, "bar_uar_access", + VNIC_ENV_GET(&vnic, bar_uar_access)); + if (MLX5_CAP_GEN(dev, vnic_env_cnt_odp_page_fault)) { + devlink_fmsg_u32_pair_put(fmsg, "odp_local_triggered_page_fault", + VNIC_ENV_GET(&vnic, odp_local_triggered_page_fault)); + devlink_fmsg_u32_pair_put(fmsg, "odp_remote_triggered_page_fault", + VNIC_ENV_GET(&vnic, odp_remote_triggered_page_fault)); + } devlink_fmsg_obj_nest_end(fmsg); devlink_fmsg_pair_nest_end(fmsg); @@ -136,8 +147,8 @@ void mlx5_reporter_vnic_create(struct mlx5_core_dev *dev) dev); if (IS_ERR(health->vnic_reporter)) mlx5_core_warn(dev, - "Failed to create vnic reporter, err = %ld\n", - PTR_ERR(health->vnic_reporter)); + "Failed to create vnic reporter, err = %pe\n", + 
health->vnic_reporter); } void mlx5_reporter_vnic_destroy(struct mlx5_core_dev *dev) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 5b0d03b3ef..041c986f70 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -84,9 +84,10 @@ struct page_pool; #define MLX5E_SHAMPO_LOG_MAX_HEADER_ENTRY_SIZE (9) #define MLX5E_SHAMPO_WQ_HEADER_PER_PAGE (PAGE_SIZE >> MLX5E_SHAMPO_LOG_MAX_HEADER_ENTRY_SIZE) #define MLX5E_SHAMPO_LOG_WQ_HEADER_PER_PAGE (PAGE_SHIFT - MLX5E_SHAMPO_LOG_MAX_HEADER_ENTRY_SIZE) -#define MLX5E_SHAMPO_WQ_BASE_HEAD_ENTRY_SIZE (64) -#define MLX5E_SHAMPO_WQ_RESRV_SIZE (64 * 1024) -#define MLX5E_SHAMPO_WQ_BASE_RESRV_SIZE (4096) +#define MLX5E_SHAMPO_WQ_BASE_HEAD_ENTRY_SIZE_SHIFT (6) +#define MLX5E_SHAMPO_WQ_RESRV_SIZE_BASE_SHIFT (12) +#define MLX5E_SHAMPO_WQ_LOG_RESRV_SIZE (16) +#define MLX5E_SHAMPO_WQ_RESRV_SIZE BIT(MLX5E_SHAMPO_WQ_LOG_RESRV_SIZE) #define MLX5_MPWRQ_MIN_LOG_STRIDE_SZ(mdev) \ (6 + MLX5_CAP_GEN(mdev, cache_line_128byte)) /* HW restriction */ @@ -177,7 +178,8 @@ static inline u16 mlx5_min_rx_wqes(int wq_type, u32 wq_size) } /* Use this function to get max num channels (rxqs/txqs) only to create netdev */ -static inline int mlx5e_get_max_num_channels(struct mlx5_core_dev *mdev) +static inline unsigned int +mlx5e_get_max_num_channels(struct mlx5_core_dev *mdev) { return is_kdump_kernel() ? 
MLX5E_MIN_NUM_CHANNELS : @@ -278,10 +280,6 @@ enum packet_merge { struct mlx5e_packet_merge_param { enum packet_merge type; u32 timeout; - struct { - u8 match_criteria_type; - u8 alignment_granularity; - } shampo; }; struct mlx5e_params { @@ -347,6 +345,7 @@ struct mlx5e_cq { /* data path - accessed per napi poll */ u16 event_ctr; struct napi_struct *napi; + struct mlx5_uars_page *uar; struct mlx5_core_cq mcq; struct mlx5e_ch_stats *ch_stats; @@ -378,7 +377,7 @@ struct mlx5e_sq_dma { enum mlx5e_dma_map_type type; }; -/* Keep this enum consistent with with the corresponding strings array +/* Keep this enum consistent with the corresponding strings array * declared in en/reporter_tx.c */ enum { @@ -387,7 +386,6 @@ enum { MLX5E_SQ_STATE_RECOVERING, MLX5E_SQ_STATE_IPSEC, MLX5E_SQ_STATE_DIM, - MLX5E_SQ_STATE_VLAN_NEED_L2_INLINE, MLX5E_SQ_STATE_PENDING_XSK_TX, MLX5E_SQ_STATE_PENDING_TLS_RX_RESYNC, MLX5E_NUM_SQ_STATES, /* Must be kept last */ @@ -634,15 +632,16 @@ struct mlx5e_dma_info { }; struct mlx5e_shampo_hd { - u32 mkey; struct mlx5e_frag_page *pages; u32 hd_per_wq; + u32 hd_per_page; u16 hd_per_wqe; - u16 pages_per_wq; + u8 log_hd_per_page; + u8 log_hd_entry_size; unsigned long *bitmap; u16 pi; u16 ci; - __be32 key; + __be32 mkey_be; }; struct mlx5e_hw_gro_data { @@ -700,7 +699,7 @@ struct mlx5e_rq { struct mlx5e_rq_stats *stats; struct mlx5e_cq cq; struct mlx5e_cq_decomp cqd; - struct hwtstamp_config *tstamp; + struct kernel_hwtstamp_config *hwtstamp_config; struct mlx5_clock *clock; struct mlx5e_icosq *icosq; struct mlx5e_priv *priv; @@ -721,13 +720,18 @@ struct mlx5e_rq { struct bpf_prog __rcu *xdp_prog; struct mlx5e_xdpsq *xdpsq; DECLARE_BITMAP(flags, 8); + + /* page pools */ struct page_pool *page_pool; + struct page_pool *hd_page_pool; + struct mlx5e_xdp_buff mxbuf; /* AF_XDP zero-copy */ struct xsk_buff_pool *xsk_pool; struct work_struct recover_work; + struct work_struct rx_timeout_work; /* control */ struct mlx5_wq_ctrl wq_ctrl; @@ -783,12 +787,12 @@ 
struct mlx5e_channel { /* control */ struct mlx5e_priv *priv; struct mlx5_core_dev *mdev; - struct hwtstamp_config *tstamp; DECLARE_BITMAP(state, MLX5E_CHANNEL_NUM_STATES); int ix; int vec_ix; int sd_ix; int cpu; + struct mlx5_sq_bfreg *bfreg; /* Sync between icosq recovery and XSK enable/disable. */ struct mutex icosq_recovery_lock; @@ -916,12 +920,14 @@ struct mlx5e_priv { u8 max_opened_tc; bool tx_ptp_opened; bool rx_ptp_opened; - struct hwtstamp_config tstamp; + struct kernel_hwtstamp_config hwtstamp_config; u16 q_counter[MLX5_SD_MAX_GROUP_SZ]; u16 drop_rq_q_counter; struct notifier_block events_nb; struct notifier_block blocking_events_nb; + struct mlx5e_pcie_cong_event *cong_event; + struct udp_tunnel_nic_info nic_info; #ifdef CONFIG_MLX5_CORE_EN_DCB struct mlx5e_dcbx dcbx; @@ -952,7 +958,7 @@ struct mlx5e_priv { }; struct mlx5e_dev { - struct mlx5e_priv *priv; + struct net_device *netdev; struct devlink_port dl_port; }; @@ -1019,8 +1025,11 @@ void mlx5e_self_test(struct net_device *ndev, struct ethtool_test *etest, u64 *buf); void mlx5e_set_rx_mode_work(struct work_struct *work); -int mlx5e_hwstamp_set(struct mlx5e_priv *priv, struct ifreq *ifr); -int mlx5e_hwstamp_get(struct mlx5e_priv *priv, struct ifreq *ifr); +int mlx5e_hwtstamp_set(struct mlx5e_priv *priv, + struct kernel_hwtstamp_config *config, + struct netlink_ext_ack *extack); +int mlx5e_hwtstamp_get(struct mlx5e_priv *priv, + struct kernel_hwtstamp_config *config); int mlx5e_modify_rx_cqe_compression_locked(struct mlx5e_priv *priv, bool val, bool rx_filter); int mlx5e_vlan_rx_add_vid(struct net_device *dev, __always_unused __be16 proto, @@ -1059,6 +1068,7 @@ struct mlx5e_create_cq_param { struct mlx5e_ch_stats *ch_stats; int node; int ix; + struct mlx5_uars_page *uar; }; struct mlx5e_cq_param; @@ -1145,7 +1155,9 @@ extern const struct ethtool_ops mlx5e_ethtool_ops; int mlx5e_create_mkey(struct mlx5_core_dev *mdev, u32 pdn, u32 *mkey); int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev, bool 
create_tises); void mlx5e_destroy_mdev_resources(struct mlx5_core_dev *mdev); -int mlx5e_refresh_tirs(struct mlx5e_priv *priv, bool enable_uc_lb, +int mlx5e_modify_tirs_lb(struct mlx5_core_dev *mdev, bool enable_uc_lb, + bool enable_mc_lb); +int mlx5e_refresh_tirs(struct mlx5_core_dev *mdev, bool enable_uc_lb, bool enable_mc_lb); void mlx5e_mkey_set_relaxed_ordering(struct mlx5_core_dev *mdev, void *mkc); @@ -1226,14 +1238,17 @@ struct net_device * mlx5e_create_netdev(struct mlx5_core_dev *mdev, const struct mlx5e_profile *profile); int mlx5e_attach_netdev(struct mlx5e_priv *priv); void mlx5e_detach_netdev(struct mlx5e_priv *priv); -void mlx5e_destroy_netdev(struct mlx5e_priv *priv); -int mlx5e_netdev_change_profile(struct mlx5e_priv *priv, - const struct mlx5e_profile *new_profile, void *new_ppriv); -void mlx5e_netdev_attach_nic_profile(struct mlx5e_priv *priv); +void mlx5e_destroy_netdev(struct net_device *netdev); +int mlx5e_netdev_change_profile(struct net_device *netdev, + struct mlx5_core_dev *mdev, + const struct mlx5e_profile *new_profile, + void *new_ppriv); +void mlx5e_netdev_attach_nic_profile(struct net_device *netdev, + struct mlx5_core_dev *mdev); void mlx5e_set_netdev_mtu_boundaries(struct mlx5e_priv *priv); void mlx5e_build_nic_params(struct mlx5e_priv *priv, struct mlx5e_xsk *xsk, u16 mtu); -void mlx5e_set_xdp_feature(struct net_device *netdev); +void mlx5e_set_xdp_feature(struct mlx5e_priv *priv); netdev_features_t mlx5e_features_check(struct sk_buff *skb, struct net_device *netdev, netdev_features_t features); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/dcbnl.h b/drivers/net/ethernet/mellanox/mlx5/core/en/dcbnl.h index b59aee75de..2c98a5299d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/dcbnl.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/dcbnl.h @@ -26,7 +26,6 @@ struct mlx5e_dcbx { u8 cap; /* Buffer configuration */ - bool manual_buffer; u32 cable_len; u32 xoff; u16 port_buff_cell_sz; diff --git 
a/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c index 0b1ac6e5c8..8818f65d1f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c @@ -40,11 +40,8 @@ void mlx5e_destroy_devlink(struct mlx5e_dev *mlx5e_dev) static void mlx5e_devlink_get_port_parent_id(struct mlx5_core_dev *dev, struct netdev_phys_item_id *ppid) { - u64 parent_id; - - parent_id = mlx5_query_nic_system_image_guid(dev); - ppid->id_len = sizeof(parent_id); - memcpy(ppid->id, &parent_id, sizeof(parent_id)); + BUILD_BUG_ON(MLX5_SW_IMAGE_GUID_MAX_BYTES > MAX_PHYS_ITEM_ID_LEN); + mlx5_query_nic_sw_system_image_guid(dev, ppid->id, &ppid->id_len); } int mlx5e_devlink_port_register(struct mlx5e_dev *mlx5e_dev, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h index b5c3a2a9d2..eb142f3584 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h @@ -18,7 +18,8 @@ enum { enum { MLX5E_TC_PRIO = 0, - MLX5E_NIC_PRIO + MLX5E_PROMISC_PRIO, + MLX5E_NIC_PRIO, }; struct mlx5e_flow_table { @@ -56,7 +57,7 @@ struct mlx5e_l2_table { bool promisc_enabled; }; -#define MLX5E_NUM_INDIR_TIRS (MLX5_NUM_TT - 1) +#define MLX5E_NUM_INDIR_TIRS (MLX5_NUM_INDIR_TIRS) #define MLX5_HASH_IP (MLX5_HASH_FIELD_SEL_SRC_IP |\ MLX5_HASH_FIELD_SEL_DST_IP) @@ -68,9 +69,13 @@ struct mlx5e_l2_table { MLX5_HASH_FIELD_SEL_DST_IP |\ MLX5_HASH_FIELD_SEL_IPSEC_SPI) -/* NIC prio FTS */ +/* NIC promisc FT level */ enum { MLX5E_PROMISC_FT_LEVEL, +}; + +/* NIC prio FTS */ +enum { MLX5E_VLAN_FT_LEVEL, MLX5E_L2_FT_LEVEL, MLX5E_TTC_FT_LEVEL, @@ -87,6 +92,7 @@ enum { MLX5E_ACCEL_FS_ESP_FT_LEVEL = MLX5E_INNER_TTC_FT_LEVEL + 1, MLX5E_ACCEL_FS_ESP_FT_ERR_LEVEL, MLX5E_ACCEL_FS_POL_FT_LEVEL, + MLX5E_ACCEL_FS_POL_MISS_FT_LEVEL, MLX5E_ACCEL_FS_ESP_FT_ROCE_LEVEL, #endif }; @@ -126,7 +132,8 @@ struct mlx5e_ptp_fs; void 
mlx5e_set_ttc_params(struct mlx5e_flow_steering *fs, struct mlx5e_rx_res *rx_res, - struct ttc_params *ttc_params, bool tunnel); + struct ttc_params *ttc_params, bool tunnel, + bool ipsec_rss); void mlx5e_destroy_ttc_table(struct mlx5e_flow_steering *fs); int mlx5e_create_ttc_table(struct mlx5e_flow_steering *fs, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c index b4f3bd7d34..195863b2c0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c @@ -138,8 +138,8 @@ void mlx5e_hv_vhca_stats_create(struct mlx5e_priv *priv) if (IS_ERR_OR_NULL(agent)) { if (IS_ERR(agent)) netdev_warn(priv->netdev, - "Failed to create hv vhca stats agent, err = %ld\n", - PTR_ERR(agent)); + "Failed to create hv vhca stats agent, err = %pe\n", + agent); kvfree(priv->stats_agent.buf); return; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/mapping.c b/drivers/net/ethernet/mellanox/mlx5/core/en/mapping.c index 4e72ca8070..1de18c7e96 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/mapping.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/mapping.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "mapping.h" @@ -24,7 +25,8 @@ struct mapping_ctx { struct delayed_work dwork; struct list_head pending_list; spinlock_t pending_list_lock; /* Guards pending list */ - u64 id; + u8 id[MLX5_SW_IMAGE_GUID_MAX_BYTES]; + u8 id_len; u8 type; struct list_head list; refcount_t refcount; @@ -220,13 +222,15 @@ mapping_create(size_t data_size, u32 max_id, bool delayed_removal) } struct mapping_ctx * -mapping_create_for_id(u64 id, u8 type, size_t data_size, u32 max_id, bool delayed_removal) +mapping_create_for_id(u8 *id, u8 id_len, u8 type, size_t data_size, u32 max_id, + bool delayed_removal) { struct mapping_ctx *ctx; mutex_lock(&shared_ctx_lock); list_for_each_entry(ctx, &shared_ctx_list, list) { - if (ctx->id == id && 
ctx->type == type) { + if (ctx->type == type && ctx->id_len == id_len && + !memcmp(id, ctx->id, id_len)) { if (refcount_inc_not_zero(&ctx->refcount)) goto unlock; break; @@ -237,7 +241,8 @@ mapping_create_for_id(u64 id, u8 type, size_t data_size, u32 max_id, bool delaye if (IS_ERR(ctx)) goto unlock; - ctx->id = id; + memcpy(ctx->id, id, id_len); + ctx->id_len = id_len; ctx->type = type; list_add(&ctx->list, &shared_ctx_list); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/mapping.h b/drivers/net/ethernet/mellanox/mlx5/core/en/mapping.h index 4e2119f0f4..e86a103d58 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/mapping.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/mapping.h @@ -27,6 +27,7 @@ void mapping_destroy(struct mapping_ctx *ctx); /* adds mapping with an id or get an existing mapping with the same id */ struct mapping_ctx * -mapping_create_for_id(u64 id, u8 type, size_t data_size, u32 max_id, bool delayed_removal); +mapping_create_for_id(u8 *id, u8 id_len, u8 type, size_t data_size, u32 max_id, + bool delayed_removal); #endif /* __MLX5_MAPPING_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c index 58ec5e44aa..c948a80a0e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c @@ -99,7 +99,7 @@ u8 mlx5e_mpwrq_umr_entry_size(enum mlx5e_mpwrq_umr_mode mode) return sizeof(struct mlx5_ksm) * 4; } WARN_ONCE(1, "MPWRQ UMR mode %d is not known\n", mode); - return 0; + return 1; } u8 mlx5e_mpwrq_log_wqe_sz(struct mlx5_core_dev *mdev, u8 page_shift, @@ -414,25 +414,10 @@ u8 mlx5e_mpwqe_get_log_rq_size(struct mlx5_core_dev *mdev, return params->log_rq_mtu_frames - log_pkts_per_wqe; } -u8 mlx5e_shampo_get_log_hd_entry_size(struct mlx5_core_dev *mdev, - struct mlx5e_params *params) +static u8 mlx5e_shampo_get_log_pkt_per_rsrv(struct mlx5e_params *params) { - return order_base_2(DIV_ROUND_UP(MLX5E_RX_MAX_HEAD, 
MLX5E_SHAMPO_WQ_BASE_HEAD_ENTRY_SIZE)); -} - -u8 mlx5e_shampo_get_log_rsrv_size(struct mlx5_core_dev *mdev, - struct mlx5e_params *params) -{ - return order_base_2(MLX5E_SHAMPO_WQ_RESRV_SIZE / MLX5E_SHAMPO_WQ_BASE_RESRV_SIZE); -} - -u8 mlx5e_shampo_get_log_pkt_per_rsrv(struct mlx5_core_dev *mdev, - struct mlx5e_params *params) -{ - u32 resrv_size = BIT(mlx5e_shampo_get_log_rsrv_size(mdev, params)) * - MLX5E_SHAMPO_WQ_BASE_RESRV_SIZE; - - return order_base_2(DIV_ROUND_UP(resrv_size, params->sw_mtu)); + return order_base_2(DIV_ROUND_UP(MLX5E_SHAMPO_WQ_RESRV_SIZE, + params->sw_mtu)); } u8 mlx5e_mpwqe_get_log_stride_size(struct mlx5_core_dev *mdev, @@ -626,6 +611,7 @@ void mlx5e_build_create_cq_param(struct mlx5e_create_cq_param *ccp, struct mlx5e .ch_stats = c->stats, .node = cpu_to_node(c->cpu), .ix = c->vec_ix, + .uar = c->bfreg->up, }; } @@ -825,7 +811,7 @@ static void mlx5e_build_common_cq_param(struct mlx5_core_dev *mdev, { void *cqc = param->cqc; - MLX5_SET(cqc, cqc, uar_page, mdev->priv.uar->index); + MLX5_SET(cqc, cqc, uar_page, mdev->priv.bfreg.up->index); if (MLX5_CAP_GEN(mdev, cqe_128_always) && cache_line_size() >= 128) MLX5_SET(cqc, cqc, cqe_sz, CQE_STRIDE_128_PAD); } @@ -834,13 +820,12 @@ static u32 mlx5e_shampo_get_log_cq_size(struct mlx5_core_dev *mdev, struct mlx5e_params *params, struct mlx5e_xsk_param *xsk) { - int rsrv_size = BIT(mlx5e_shampo_get_log_rsrv_size(mdev, params)) * - MLX5E_SHAMPO_WQ_BASE_RESRV_SIZE; u16 num_strides = BIT(mlx5e_mpwqe_get_log_num_strides(mdev, params, xsk)); - int pkt_per_rsrv = BIT(mlx5e_shampo_get_log_pkt_per_rsrv(mdev, params)); u8 log_stride_sz = mlx5e_mpwqe_get_log_stride_size(mdev, params, xsk); + int pkt_per_rsrv = BIT(mlx5e_shampo_get_log_pkt_per_rsrv(params)); int wq_size = BIT(mlx5e_mpwqe_get_log_rq_size(mdev, params, xsk)); int wqe_size = BIT(log_stride_sz) * num_strides; + int rsrv_size = MLX5E_SHAMPO_WQ_RESRV_SIZE; /* +1 is for the case that the pkt_per_rsrv dont consume the reservation * so we get a filler 
cqe for the rest of the reservation. @@ -901,6 +886,7 @@ int mlx5e_build_rq_param(struct mlx5_core_dev *mdev, { void *rqc = param->rqc; void *wq = MLX5_ADDR_OF(rqc, rqc, wq); + u32 lro_timeout; int ndsegs = 1; int err; @@ -926,22 +912,27 @@ int mlx5e_build_rq_param(struct mlx5_core_dev *mdev, MLX5_SET(wq, wq, log_wqe_stride_size, log_wqe_stride_size - MLX5_MPWQE_LOG_STRIDE_SZ_BASE); MLX5_SET(wq, wq, log_wq_sz, mlx5e_mpwqe_get_log_rq_size(mdev, params, xsk)); - if (params->packet_merge.type == MLX5E_PACKET_MERGE_SHAMPO) { - MLX5_SET(wq, wq, shampo_enable, true); - MLX5_SET(wq, wq, log_reservation_size, - mlx5e_shampo_get_log_rsrv_size(mdev, params)); - MLX5_SET(wq, wq, - log_max_num_of_packets_per_reservation, - mlx5e_shampo_get_log_pkt_per_rsrv(mdev, params)); - MLX5_SET(wq, wq, log_headers_entry_size, - mlx5e_shampo_get_log_hd_entry_size(mdev, params)); - MLX5_SET(rqc, rqc, reservation_timeout, - mlx5e_choose_lro_timeout(mdev, MLX5E_DEFAULT_SHAMPO_TIMEOUT)); - MLX5_SET(rqc, rqc, shampo_match_criteria_type, - params->packet_merge.shampo.match_criteria_type); - MLX5_SET(rqc, rqc, shampo_no_match_alignment_granularity, - params->packet_merge.shampo.alignment_granularity); - } + if (params->packet_merge.type != MLX5E_PACKET_MERGE_SHAMPO) + break; + + MLX5_SET(wq, wq, shampo_enable, true); + MLX5_SET(wq, wq, log_reservation_size, + MLX5E_SHAMPO_WQ_LOG_RESRV_SIZE - + MLX5E_SHAMPO_WQ_RESRV_SIZE_BASE_SHIFT); + MLX5_SET(wq, wq, + log_max_num_of_packets_per_reservation, + mlx5e_shampo_get_log_pkt_per_rsrv(params)); + MLX5_SET(wq, wq, log_headers_entry_size, + MLX5E_SHAMPO_LOG_HEADER_ENTRY_SIZE - + MLX5E_SHAMPO_WQ_BASE_HEAD_ENTRY_SIZE_SHIFT); + lro_timeout = + mlx5e_choose_lro_timeout(mdev, + MLX5E_DEFAULT_SHAMPO_TIMEOUT); + MLX5_SET(rqc, rqc, reservation_timeout, lro_timeout); + MLX5_SET(rqc, rqc, shampo_match_criteria_type, + MLX5_RQC_SHAMPO_MATCH_CRITERIA_TYPE_EXTENDED); + MLX5_SET(rqc, rqc, shampo_no_match_alignment_granularity, + 
MLX5_RQC_SHAMPO_NO_MATCH_ALIGNMENT_GRANULARITY_STRIDE); break; } default: /* MLX5_WQ_TYPE_CYCLIC */ @@ -1044,18 +1035,17 @@ u32 mlx5e_shampo_hd_per_wqe(struct mlx5_core_dev *mdev, struct mlx5e_params *params, struct mlx5e_rq_param *rq_param) { - int resv_size = BIT(mlx5e_shampo_get_log_rsrv_size(mdev, params)) * - MLX5E_SHAMPO_WQ_BASE_RESRV_SIZE; u16 num_strides = BIT(mlx5e_mpwqe_get_log_num_strides(mdev, params, NULL)); - int pkt_per_resv = BIT(mlx5e_shampo_get_log_pkt_per_rsrv(mdev, params)); u8 log_stride_sz = mlx5e_mpwqe_get_log_stride_size(mdev, params, NULL); + int pkt_per_rsrv = BIT(mlx5e_shampo_get_log_pkt_per_rsrv(params)); int wqe_size = BIT(log_stride_sz) * num_strides; + int rsrv_size = MLX5E_SHAMPO_WQ_RESRV_SIZE; u32 hd_per_wqe; /* Assumption: hd_per_wqe % 8 == 0. */ - hd_per_wqe = (wqe_size / resv_size) * pkt_per_resv; - mlx5_core_dbg(mdev, "%s hd_per_wqe = %d rsrv_size = %d wqe_size = %d pkt_per_resv = %d\n", - __func__, hd_per_wqe, resv_size, wqe_size, pkt_per_resv); + hd_per_wqe = (wqe_size / rsrv_size) * pkt_per_rsrv; + mlx5_core_dbg(mdev, "%s hd_per_wqe = %d rsrv_size = %d wqe_size = %d pkt_per_rsrv = %d\n", + __func__, hd_per_wqe, rsrv_size, wqe_size, pkt_per_rsrv); return hd_per_wqe; } @@ -1240,7 +1230,6 @@ static void mlx5e_build_async_icosq_param(struct mlx5_core_dev *mdev, void mlx5e_build_xdpsq_param(struct mlx5_core_dev *mdev, struct mlx5e_params *params, - struct mlx5e_xsk_param *xsk, struct mlx5e_sq_param *param) { void *sqc = param->sqc; @@ -1267,7 +1256,7 @@ int mlx5e_build_channel_param(struct mlx5_core_dev *mdev, async_icosq_log_wq_sz = mlx5e_build_async_icosq_log_wq_sz(mdev); mlx5e_build_sq_param(mdev, params, &cparam->txq_sq); - mlx5e_build_xdpsq_param(mdev, params, NULL, &cparam->xdp_sq); + mlx5e_build_xdpsq_param(mdev, params, &cparam->xdp_sq); mlx5e_build_icosq_param(mdev, icosq_log_wq_sz, &cparam->icosq); mlx5e_build_async_icosq_param(mdev, async_icosq_log_wq_sz, &cparam->async_icosq); diff --git 
a/drivers/net/ethernet/mellanox/mlx5/core/en/params.h b/drivers/net/ethernet/mellanox/mlx5/core/en/params.h index bd5877acc5..00617c65fe 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.h @@ -51,6 +51,7 @@ struct mlx5e_create_sq_param { u32 tisn; u8 tis_lst_sz; u8 min_inline_mode; + u32 uar_page; }; /* Striding RQ dynamic parameters */ @@ -95,12 +96,6 @@ bool mlx5e_rx_mpwqe_is_linear_skb(struct mlx5_core_dev *mdev, u8 mlx5e_mpwqe_get_log_rq_size(struct mlx5_core_dev *mdev, struct mlx5e_params *params, struct mlx5e_xsk_param *xsk); -u8 mlx5e_shampo_get_log_hd_entry_size(struct mlx5_core_dev *mdev, - struct mlx5e_params *params); -u8 mlx5e_shampo_get_log_rsrv_size(struct mlx5_core_dev *mdev, - struct mlx5e_params *params); -u8 mlx5e_shampo_get_log_pkt_per_rsrv(struct mlx5_core_dev *mdev, - struct mlx5e_params *params); u32 mlx5e_shampo_hd_per_wqe(struct mlx5_core_dev *mdev, struct mlx5e_params *params, struct mlx5e_rq_param *rq_param); @@ -138,7 +133,6 @@ void mlx5e_build_tx_cq_param(struct mlx5_core_dev *mdev, struct mlx5e_cq_param *param); void mlx5e_build_xdpsq_param(struct mlx5_core_dev *mdev, struct mlx5e_params *params, - struct mlx5e_xsk_param *xsk, struct mlx5e_sq_param *param); int mlx5e_build_channel_param(struct mlx5_core_dev *mdev, struct mlx5e_params *params, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/pcie_cong_event.c b/drivers/net/ethernet/mellanox/mlx5/core/en/pcie_cong_event.c new file mode 100644 index 0000000000..2eb666a46f --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/pcie_cong_event.c @@ -0,0 +1,376 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. 
+ +#include "../devlink.h" +#include "en.h" +#include "pcie_cong_event.h" + +#define MLX5E_CONG_HIGH_STATE 0x7 + +enum { + MLX5E_INBOUND_CONG = BIT(0), + MLX5E_OUTBOUND_CONG = BIT(1), +}; + +struct mlx5e_pcie_cong_thresh { + u16 inbound_high; + u16 inbound_low; + u16 outbound_high; + u16 outbound_low; +}; + +struct mlx5e_pcie_cong_stats { + u32 pci_bw_inbound_high; + u32 pci_bw_inbound_low; + u32 pci_bw_outbound_high; + u32 pci_bw_outbound_low; + u32 pci_bw_stale_event; +}; + +struct mlx5e_pcie_cong_event { + u64 obj_id; + + struct mlx5e_priv *priv; + + /* For event notifier and workqueue. */ + struct work_struct work; + struct mlx5_nb nb; + + /* Stores last read state. */ + u8 state; + + /* For ethtool stats group. */ + struct mlx5e_pcie_cong_stats stats; +}; + + +static const struct counter_desc mlx5e_pcie_cong_stats_desc[] = { + { MLX5E_DECLARE_STAT(struct mlx5e_pcie_cong_stats, + pci_bw_inbound_high) }, + { MLX5E_DECLARE_STAT(struct mlx5e_pcie_cong_stats, + pci_bw_inbound_low) }, + { MLX5E_DECLARE_STAT(struct mlx5e_pcie_cong_stats, + pci_bw_outbound_high) }, + { MLX5E_DECLARE_STAT(struct mlx5e_pcie_cong_stats, + pci_bw_outbound_low) }, + { MLX5E_DECLARE_STAT(struct mlx5e_pcie_cong_stats, + pci_bw_stale_event) }, +}; + +#define NUM_PCIE_CONG_COUNTERS ARRAY_SIZE(mlx5e_pcie_cong_stats_desc) + +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(pcie_cong) +{ + return priv->cong_event ? 
NUM_PCIE_CONG_COUNTERS : 0; +} + +static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(pcie_cong) {} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(pcie_cong) +{ + if (!priv->cong_event) + return; + + for (int i = 0; i < NUM_PCIE_CONG_COUNTERS; i++) + ethtool_puts(data, mlx5e_pcie_cong_stats_desc[i].format); +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(pcie_cong) +{ + if (!priv->cong_event) + return; + + for (int i = 0; i < NUM_PCIE_CONG_COUNTERS; i++) { + u32 ctr = MLX5E_READ_CTR32_CPU(&priv->cong_event->stats, + mlx5e_pcie_cong_stats_desc, + i); + + mlx5e_ethtool_put_stat(data, ctr); + } +} + +MLX5E_DEFINE_STATS_GRP(pcie_cong, 0); + +static int +mlx5_cmd_pcie_cong_event_set(struct mlx5_core_dev *dev, + const struct mlx5e_pcie_cong_thresh *config, + u64 *obj_id) +{ + u32 in[MLX5_ST_SZ_DW(pcie_cong_event_cmd_in)] = {}; + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)]; + void *cong_obj; + void *hdr; + int err; + + hdr = MLX5_ADDR_OF(pcie_cong_event_cmd_in, in, hdr); + cong_obj = MLX5_ADDR_OF(pcie_cong_event_cmd_in, in, cong_obj); + + MLX5_SET(general_obj_in_cmd_hdr, hdr, opcode, + MLX5_CMD_OP_CREATE_GENERAL_OBJECT); + + MLX5_SET(general_obj_in_cmd_hdr, hdr, obj_type, + MLX5_GENERAL_OBJECT_TYPES_PCIE_CONG_EVENT); + + MLX5_SET(pcie_cong_event_obj, cong_obj, inbound_event_en, 1); + MLX5_SET(pcie_cong_event_obj, cong_obj, outbound_event_en, 1); + + MLX5_SET(pcie_cong_event_obj, cong_obj, + inbound_cong_high_threshold, config->inbound_high); + MLX5_SET(pcie_cong_event_obj, cong_obj, + inbound_cong_low_threshold, config->inbound_low); + + MLX5_SET(pcie_cong_event_obj, cong_obj, + outbound_cong_high_threshold, config->outbound_high); + MLX5_SET(pcie_cong_event_obj, cong_obj, + outbound_cong_low_threshold, config->outbound_low); + + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (err) + return err; + + *obj_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id); + + mlx5_core_dbg(dev, "PCIe congestion event (obj_id=%llu) created. 
Config: in: [%u, %u], out: [%u, %u]\n", + *obj_id, + config->inbound_high, config->inbound_low, + config->outbound_high, config->outbound_low); + + return 0; +} + +static int mlx5_cmd_pcie_cong_event_destroy(struct mlx5_core_dev *dev, + u64 obj_id) +{ + u32 in[MLX5_ST_SZ_DW(pcie_cong_event_cmd_in)] = {}; + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)]; + void *hdr; + + hdr = MLX5_ADDR_OF(pcie_cong_event_cmd_in, in, hdr); + MLX5_SET(general_obj_in_cmd_hdr, hdr, opcode, + MLX5_CMD_OP_DESTROY_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, hdr, obj_type, + MLX5_GENERAL_OBJECT_TYPES_PCIE_CONG_EVENT); + MLX5_SET(general_obj_in_cmd_hdr, hdr, obj_id, obj_id); + + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +static int mlx5_cmd_pcie_cong_event_query(struct mlx5_core_dev *dev, + u64 obj_id, + u32 *state) +{ + u32 in[MLX5_ST_SZ_DW(pcie_cong_event_cmd_in)] = {}; + u32 out[MLX5_ST_SZ_DW(pcie_cong_event_cmd_out)]; + void *obj; + void *hdr; + u8 cong; + int err; + + hdr = MLX5_ADDR_OF(pcie_cong_event_cmd_in, in, hdr); + + MLX5_SET(general_obj_in_cmd_hdr, hdr, opcode, + MLX5_CMD_OP_QUERY_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, hdr, obj_type, + MLX5_GENERAL_OBJECT_TYPES_PCIE_CONG_EVENT); + MLX5_SET(general_obj_in_cmd_hdr, hdr, obj_id, obj_id); + + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (err) + return err; + + obj = MLX5_ADDR_OF(pcie_cong_event_cmd_out, out, cong_obj); + + if (state) { + cong = MLX5_GET(pcie_cong_event_obj, obj, inbound_cong_state); + if (cong == MLX5E_CONG_HIGH_STATE) + *state |= MLX5E_INBOUND_CONG; + + cong = MLX5_GET(pcie_cong_event_obj, obj, outbound_cong_state); + if (cong == MLX5E_CONG_HIGH_STATE) + *state |= MLX5E_OUTBOUND_CONG; + } + + return 0; +} + +static void mlx5e_pcie_cong_event_work(struct work_struct *work) +{ + struct mlx5e_pcie_cong_event *cong_event; + struct mlx5_core_dev *dev; + struct mlx5e_priv *priv; + u32 new_cong_state = 0; + u32 changes; + int err; + + cong_event = 
container_of(work, struct mlx5e_pcie_cong_event, work); + priv = cong_event->priv; + dev = priv->mdev; + + err = mlx5_cmd_pcie_cong_event_query(dev, cong_event->obj_id, + &new_cong_state); + if (err) { + mlx5_core_warn(dev, "Error %d when querying PCIe cong event object (obj_id=%llu).\n", + err, cong_event->obj_id); + return; + } + + changes = cong_event->state ^ new_cong_state; + if (!changes) { + cong_event->stats.pci_bw_stale_event++; + return; + } + + cong_event->state = new_cong_state; + + if (changes & MLX5E_INBOUND_CONG) { + if (new_cong_state & MLX5E_INBOUND_CONG) + cong_event->stats.pci_bw_inbound_high++; + else + cong_event->stats.pci_bw_inbound_low++; + } + + if (changes & MLX5E_OUTBOUND_CONG) { + if (new_cong_state & MLX5E_OUTBOUND_CONG) + cong_event->stats.pci_bw_outbound_high++; + else + cong_event->stats.pci_bw_outbound_low++; + } +} + +static int mlx5e_pcie_cong_event_handler(struct notifier_block *nb, + unsigned long event, void *eqe) +{ + struct mlx5e_pcie_cong_event *cong_event; + + cong_event = mlx5_nb_cof(nb, struct mlx5e_pcie_cong_event, nb); + queue_work(cong_event->priv->wq, &cong_event->work); + + return NOTIFY_OK; +} + +static int +mlx5e_pcie_cong_get_thresh_config(struct mlx5_core_dev *dev, + struct mlx5e_pcie_cong_thresh *config) +{ + u32 ids[4] = { + MLX5_DEVLINK_PARAM_ID_PCIE_CONG_IN_LOW, + MLX5_DEVLINK_PARAM_ID_PCIE_CONG_IN_HIGH, + MLX5_DEVLINK_PARAM_ID_PCIE_CONG_OUT_LOW, + MLX5_DEVLINK_PARAM_ID_PCIE_CONG_OUT_HIGH, + }; + struct devlink *devlink = priv_to_devlink(dev); + union devlink_param_value val[4]; + + for (int i = 0; i < 4; i++) { + u32 id = ids[i]; + int err; + + err = devl_param_driverinit_value_get(devlink, id, &val[i]); + if (err) + return err; + } + + config->inbound_low = val[0].vu16; + config->inbound_high = val[1].vu16; + config->outbound_low = val[2].vu16; + config->outbound_high = val[3].vu16; + + return 0; +} + +static int +mlx5e_thresh_config_validate(struct mlx5_core_dev *mdev, + const struct mlx5e_pcie_cong_thresh 
*config) +{ + int err = 0; + + if (config->inbound_low >= config->inbound_high) { + err = -EINVAL; + mlx5_core_err(mdev, "PCIe inbound congestion threshold configuration invalid: low (%u) >= high (%u).\n", + config->inbound_low, config->inbound_high); + } + + if (config->outbound_low >= config->outbound_high) { + err = -EINVAL; + mlx5_core_err(mdev, "PCIe outbound congestion threshold configuration invalid: low (%u) >= high (%u).\n", + config->outbound_low, config->outbound_high); + } + + return err; +} + +int mlx5e_pcie_cong_event_init(struct mlx5e_priv *priv) +{ + struct mlx5e_pcie_cong_thresh thresh_config = {}; + struct mlx5e_pcie_cong_event *cong_event; + struct mlx5_core_dev *mdev = priv->mdev; + int err; + + if (!mlx5_pcie_cong_event_supported(mdev)) + return 0; + + err = mlx5e_pcie_cong_get_thresh_config(mdev, &thresh_config); + if (WARN_ON(err)) + return err; + + err = mlx5e_thresh_config_validate(mdev, &thresh_config); + if (err) { + mlx5_core_err(mdev, "PCIe congestion event feature disabled\n"); + return err; + } + + cong_event = kvzalloc_node(sizeof(*cong_event), GFP_KERNEL, + mdev->priv.numa_node); + if (!cong_event) + return -ENOMEM; + + INIT_WORK(&cong_event->work, mlx5e_pcie_cong_event_work); + MLX5_NB_INIT(&cong_event->nb, mlx5e_pcie_cong_event_handler, + OBJECT_CHANGE); + + cong_event->priv = priv; + + err = mlx5_cmd_pcie_cong_event_set(mdev, &thresh_config, + &cong_event->obj_id); + if (err) { + mlx5_core_warn(mdev, "Error creating a PCIe congestion event object\n"); + goto err_free; + } + + err = mlx5_eq_notifier_register(mdev, &cong_event->nb); + if (err) { + mlx5_core_warn(mdev, "Error registering notifier for the PCIe congestion event\n"); + goto err_obj_destroy; + } + + priv->cong_event = cong_event; + + return 0; + +err_obj_destroy: + mlx5_cmd_pcie_cong_event_destroy(mdev, cong_event->obj_id); +err_free: + kvfree(cong_event); + + return err; +} + +void mlx5e_pcie_cong_event_cleanup(struct mlx5e_priv *priv) +{ + struct mlx5e_pcie_cong_event 
*cong_event = priv->cong_event; + struct mlx5_core_dev *mdev = priv->mdev; + + if (!cong_event) + return; + + priv->cong_event = NULL; + + mlx5_eq_notifier_unregister(mdev, &cong_event->nb); + cancel_work_sync(&cong_event->work); + + if (mlx5_cmd_pcie_cong_event_destroy(mdev, cong_event->obj_id)) + mlx5_core_warn(mdev, "Error destroying PCIe congestion event (obj_id=%llu)\n", + cong_event->obj_id); + + kvfree(cong_event); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/pcie_cong_event.h b/drivers/net/ethernet/mellanox/mlx5/core/en/pcie_cong_event.h new file mode 100644 index 0000000000..b1ea46bf64 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/pcie_cong_event.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. */ + +#ifndef __MLX5_PCIE_CONG_EVENT_H__ +#define __MLX5_PCIE_CONG_EVENT_H__ + +int mlx5e_pcie_cong_event_init(struct mlx5e_priv *priv); +void mlx5e_pcie_cong_event_cleanup(struct mlx5e_priv *priv); + +#endif /* __MLX5_PCIE_CONG_EVENT_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c index 8e25f4ef5c..4720523813 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c @@ -272,8 +272,8 @@ static int port_update_shared_buffer(struct mlx5_core_dev *mdev, /* Total shared buffer size is split in a ratio of 3:1 between * lossy and lossless pools respectively. 
*/ - lossy_epool_size = (shared_buffer_size / 4) * 3; lossless_ipool_size = shared_buffer_size / 4; + lossy_epool_size = shared_buffer_size - lossless_ipool_size; mlx5e_port_set_sbpr(mdev, 0, MLX5_EGRESS_DIR, MLX5_LOSSY_POOL, 0, lossy_epool_size); @@ -288,14 +288,12 @@ static int port_set_buffer(struct mlx5e_priv *priv, u16 port_buff_cell_sz = priv->dcbx.port_buff_cell_sz; struct mlx5_core_dev *mdev = priv->mdev; int sz = MLX5_ST_SZ_BYTES(pbmc_reg); - u32 new_headroom_size = 0; - u32 current_headroom_size; + u32 current_headroom_cells = 0; + u32 new_headroom_cells = 0; void *in; int err; int i; - current_headroom_size = port_buffer->headroom_size; - in = kzalloc(sz, GFP_KERNEL); if (!in) return -ENOMEM; @@ -306,12 +304,14 @@ static int port_set_buffer(struct mlx5e_priv *priv, for (i = 0; i < MLX5E_MAX_NETWORK_BUFFER; i++) { void *buffer = MLX5_ADDR_OF(pbmc_reg, in, buffer[i]); + current_headroom_cells += MLX5_GET(bufferx_reg, buffer, size); + u64 size = port_buffer->buffer[i].size; u64 xoff = port_buffer->buffer[i].xoff; u64 xon = port_buffer->buffer[i].xon; - new_headroom_size += size; do_div(size, port_buff_cell_sz); + new_headroom_cells += size; do_div(xoff, port_buff_cell_sz); do_div(xon, port_buff_cell_sz); MLX5_SET(bufferx_reg, buffer, size, size); @@ -320,10 +320,8 @@ static int port_set_buffer(struct mlx5e_priv *priv, MLX5_SET(bufferx_reg, buffer, xon_threshold, xon); } - new_headroom_size /= port_buff_cell_sz; - current_headroom_size /= port_buff_cell_sz; - err = port_update_shared_buffer(priv->mdev, current_headroom_size, - new_headroom_size); + err = port_update_shared_buffer(priv->mdev, current_headroom_cells, + new_headroom_cells); if (err) goto out; @@ -331,6 +329,9 @@ static int port_set_buffer(struct mlx5e_priv *priv, if (err) goto out; + /* RO bits should be set to 0 on write */ + MLX5_SET(pbmc_reg, in, port_buffer_size, 0); + err = mlx5e_port_set_pbmc(mdev, in); out: kfree(in); @@ -574,7 +575,6 @@ int mlx5e_port_manual_buffer_config(struct 
mlx5e_priv *priv, if (err) return err; } - priv->dcbx.xoff = xoff; /* Apply the settings */ if (update_buffer) { @@ -583,6 +583,8 @@ int mlx5e_port_manual_buffer_config(struct mlx5e_priv *priv, return err; } + priv->dcbx.xoff = xoff; + if (update_prio2buffer) err = mlx5e_port_set_priority2buffer(priv->mdev, prio2buffer); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c index 131ed97ca9..bd58c1771a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c @@ -81,7 +81,7 @@ static struct mlx5e_skb_cb_hwtstamp *mlx5e_skb_cb_get_hwts(struct sk_buff *skb) } static void mlx5e_skb_cb_hwtstamp_tx(struct sk_buff *skb, - struct mlx5e_ptp_cq_stats *cq_stats) + struct mlx5e_ptpsq *ptpsq) { struct skb_shared_hwtstamps hwts = {}; ktime_t diff; @@ -91,8 +91,17 @@ static void mlx5e_skb_cb_hwtstamp_tx(struct sk_buff *skb, /* Maximal allowed diff is 1 / 128 second */ if (diff > (NSEC_PER_SEC >> 7)) { - cq_stats->abort++; - cq_stats->abort_abs_diff_ns += diff; + struct mlx5e_txqsq *sq = &ptpsq->txqsq; + + ptpsq->cq_stats->abort++; + ptpsq->cq_stats->abort_abs_diff_ns += diff; + if (diff > (NSEC_PER_SEC >> 1) && + !test_and_set_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state)) { + netdev_warn(sq->channel->netdev, + "PTP TX timestamp difference between CQE and port exceeds threshold: %lld ns, recovering SQ %u\n", + (s64)diff, sq->sqn); + queue_work(sq->priv->wq, &ptpsq->report_unhealthy_work); + } return; } @@ -102,7 +111,7 @@ static void mlx5e_skb_cb_hwtstamp_tx(struct sk_buff *skb, void mlx5e_skb_cb_hwtstamp_handler(struct sk_buff *skb, int hwtstamp_type, ktime_t hwtstamp, - struct mlx5e_ptp_cq_stats *cq_stats) + struct mlx5e_ptpsq *ptpsq) { switch (hwtstamp_type) { case (MLX5E_SKB_CB_CQE_HWTSTAMP): @@ -120,7 +129,7 @@ void mlx5e_skb_cb_hwtstamp_handler(struct sk_buff *skb, int hwtstamp_type, !mlx5e_skb_cb_get_hwts(skb)->port_hwtstamp) return; - 
mlx5e_skb_cb_hwtstamp_tx(skb, cq_stats); + mlx5e_skb_cb_hwtstamp_tx(skb, ptpsq); memset(skb->cb, 0, sizeof(struct mlx5e_skb_cb_hwtstamp)); } @@ -208,7 +217,7 @@ static void mlx5e_ptp_handle_ts_cqe(struct mlx5e_ptpsq *ptpsq, hwtstamp = mlx5e_cqe_ts_to_ns(sq->ptp_cyc2time, sq->clock, get_cqe_ts(cqe)); mlx5e_skb_cb_hwtstamp_handler(skb, MLX5E_SKB_CB_PORT_HWTSTAMP, - hwtstamp, ptpsq->cq_stats); + hwtstamp, ptpsq); ptpsq->cq_stats->cqe++; mlx5e_ptpsq_mark_ts_cqes_undelivered(ptpsq, hwtstamp); @@ -333,14 +342,12 @@ static int mlx5e_ptp_alloc_txqsq(struct mlx5e_ptp *c, int txq_ix, sq->mdev = mdev; sq->ch_ix = MLX5E_PTP_CHANNEL_IX; sq->txq_ix = txq_ix; - sq->uar_map = mdev->mlx5e_res.hw_objs.bfreg.map; + sq->uar_map = c->bfreg->map; sq->min_inline_mode = params->tx_min_inline_mode; sq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); sq->stats = &c->priv->ptp_stats.sq[tc]; sq->ptpsq = ptpsq; INIT_WORK(&sq->recover_work, mlx5e_tx_err_cqe_work); - if (!MLX5_CAP_ETH(mdev, wqe_vlan_insert)) - set_bit(MLX5E_SQ_STATE_VLAN_NEED_L2_INLINE, &sq->state); sq->stop_room = param->stop_room; sq->ptp_cyc2time = mlx5_sq_ts_translator(mdev); @@ -473,6 +480,7 @@ static int mlx5e_ptp_open_txqsq(struct mlx5e_ptp *c, u32 tisn, csp.wq_ctrl = &txqsq->wq_ctrl; csp.min_inline_mode = txqsq->min_inline_mode; csp.ts_cqe_to_dest_cqn = ptpsq->ts_cq.mcq.cqn; + csp.uar_page = c->bfreg->index; err = mlx5e_create_sq_rdy(c->mdev, sqp, &csp, 0, &txqsq->sqn); if (err) @@ -564,6 +572,7 @@ static int mlx5e_ptp_open_tx_cqs(struct mlx5e_ptp *c, ccp.ch_stats = c->stats; ccp.napi = &c->napi; ccp.ix = MLX5E_PTP_CHANNEL_IX; + ccp.uar = c->bfreg->up; cq_param = &cparams->txq_sq_param.cqp; @@ -613,6 +622,7 @@ static int mlx5e_ptp_open_rx_cq(struct mlx5e_ptp *c, ccp.ch_stats = c->stats; ccp.napi = &c->napi; ccp.ix = MLX5E_PTP_CHANNEL_IX; + ccp.uar = c->bfreg->up; cq_param = &cparams->rq_param.cqp; @@ -697,7 +707,7 @@ static int mlx5e_init_ptp_rq(struct mlx5e_ptp *c, struct mlx5e_params *params, rq->netdev = 
priv->netdev; rq->priv = priv; rq->clock = mdev->clock; - rq->tstamp = &priv->tstamp; + rq->hwtstamp_config = &priv->hwtstamp_config; rq->mdev = mdev; rq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); rq->stats = &c->priv->ptp_stats.rq; @@ -880,13 +890,13 @@ int mlx5e_ptp_open(struct mlx5e_priv *priv, struct mlx5e_params *params, c->priv = priv; c->mdev = priv->mdev; - c->tstamp = &priv->tstamp; c->pdev = mlx5_core_dma_dev(priv->mdev); c->netdev = priv->netdev; c->mkey_be = cpu_to_be32(priv->mdev->mlx5e_res.hw_objs.mkey); c->num_tc = mlx5e_get_dcb_num_tc(params); c->stats = &priv->ptp_stats.ch; c->lag_port = lag_port; + c->bfreg = &mdev->priv.bfreg; err = mlx5e_ptp_set_state(c, params); if (err) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h index 883c044852..2a457a2ed7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h @@ -64,8 +64,8 @@ struct mlx5e_ptp { /* control */ struct mlx5e_priv *priv; struct mlx5_core_dev *mdev; - struct hwtstamp_config *tstamp; DECLARE_BITMAP(state, MLX5E_PTP_STATE_NUM_STATES); + struct mlx5_sq_bfreg *bfreg; }; static inline bool mlx5e_use_ptpsq(struct sk_buff *skb) @@ -147,7 +147,7 @@ enum { void mlx5e_skb_cb_hwtstamp_handler(struct sk_buff *skb, int hwtstamp_type, ktime_t hwtstamp, - struct mlx5e_ptp_cq_stats *cq_stats); + struct mlx5e_ptpsq *ptpsq); void mlx5e_skb_cb_hwtstamp_init(struct sk_buff *skb); #endif /* __MLX5_EN_PTP_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c index f0744a45db..4e461cb03b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c @@ -374,7 +374,7 @@ void mlx5e_reactivate_qos_sq(struct mlx5e_priv *priv, u16 qid, struct netdev_que void mlx5e_reset_qdisc(struct net_device *dev, u16 qid) { struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, 
qid); - struct Qdisc *qdisc = dev_queue->qdisc_sleeping; + struct Qdisc *qdisc = rtnl_dereference(dev_queue->qdisc_sleeping); if (!qdisc) return; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c index 0f5d7ea895..87a2ad6952 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c @@ -30,15 +30,11 @@ static bool mlx5_esw_bridge_dev_same_hw(struct net_device *dev, struct mlx5_eswi { struct mlx5e_priv *priv = netdev_priv(dev); struct mlx5_core_dev *mdev, *esw_mdev; - u64 system_guid, esw_system_guid; mdev = priv->mdev; esw_mdev = esw->dev; - system_guid = mlx5_query_nic_system_image_guid(mdev); - esw_system_guid = mlx5_query_nic_system_image_guid(esw_mdev); - - return system_guid == esw_system_guid; + return mlx5_same_hw_devs(mdev, esw_mdev); } static struct net_device * @@ -488,8 +484,8 @@ static int mlx5_esw_bridge_switchdev_event(struct notifier_block *nb, fdb_info, br_offloads); if (IS_ERR(work)) { - WARN_ONCE(1, "Failed to init switchdev work, err=%ld", - PTR_ERR(work)); + WARN_ONCE(1, "Failed to init switchdev work, err=%pe", + work); return notifier_from_errno(PTR_ERR(work)); } @@ -527,7 +523,8 @@ void mlx5e_rep_bridge_init(struct mlx5e_priv *priv) br_offloads = mlx5_esw_bridge_init(esw); rtnl_unlock(); if (IS_ERR(br_offloads)) { - esw_warn(mdev, "Failed to init esw bridge (err=%ld)\n", PTR_ERR(br_offloads)); + esw_warn(mdev, "Failed to init esw bridge (err=%pe)\n", + br_offloads); return; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c index e106f06964..0686fbdd5a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c @@ -170,16 +170,23 @@ static int mlx5e_rx_reporter_err_rq_cqe_recover(void *ctx) static int mlx5e_rx_reporter_timeout_recover(void *ctx) 
{ struct mlx5_eq_comp *eq; + struct mlx5e_priv *priv; struct mlx5e_rq *rq; int err; rq = ctx; + priv = rq->priv; + + mutex_lock(&priv->state_lock); + eq = rq->cq.mcq.eq; err = mlx5e_health_channel_eq_recover(rq->netdev, eq, rq->cq.ch_stats); if (err && rq->icosq) clear_bit(MLX5E_SQ_STATE_ENABLED, &rq->icosq->state); + mutex_unlock(&priv->state_lock); + return err; } @@ -311,7 +318,8 @@ mlx5e_rx_reporter_diagnose_common_ptp_config(struct mlx5e_priv *priv, struct mlx struct devlink_fmsg *fmsg) { mlx5e_health_fmsg_named_obj_nest_start(fmsg, "PTP"); - devlink_fmsg_u32_pair_put(fmsg, "filter_type", priv->tstamp.rx_filter); + devlink_fmsg_u32_pair_put(fmsg, "filter_type", + priv->hwtstamp_config.rx_filter); mlx5e_rx_reporter_diagnose_generic_rq(&ptp_ch->rq, fmsg); mlx5e_health_fmsg_named_obj_nest_end(fmsg); } @@ -645,6 +653,7 @@ void mlx5e_reporter_icosq_resume_recovery(struct mlx5e_channel *c) } #define MLX5E_REPORTER_RX_GRACEFUL_PERIOD 500 +#define MLX5E_REPORTER_RX_BURST_PERIOD 500 static const struct devlink_health_reporter_ops mlx5_rx_reporter_ops = { .name = "rx", @@ -652,6 +661,7 @@ static const struct devlink_health_reporter_ops mlx5_rx_reporter_ops = { .diagnose = mlx5e_rx_reporter_diagnose, .dump = mlx5e_rx_reporter_dump, .default_graceful_period = MLX5E_REPORTER_RX_GRACEFUL_PERIOD, + .default_burst_period = MLX5E_REPORTER_RX_BURST_PERIOD, }; void mlx5e_reporter_rx_create(struct mlx5e_priv *priv) @@ -663,8 +673,8 @@ void mlx5e_reporter_rx_create(struct mlx5e_priv *priv) &mlx5_rx_reporter_ops, priv); if (IS_ERR(reporter)) { - netdev_warn(priv->netdev, "Failed to create rx reporter, err = %ld\n", - PTR_ERR(reporter)); + netdev_warn(priv->netdev, "Failed to create rx reporter, err = %pe\n", + reporter); return; } priv->rx_reporter = reporter; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c index a107aad018..f10b9c5bf5 100644 --- 
a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c @@ -13,7 +13,6 @@ static const char * const sq_sw_state_type_name[] = { [MLX5E_SQ_STATE_RECOVERING] = "recovering", [MLX5E_SQ_STATE_IPSEC] = "ipsec", [MLX5E_SQ_STATE_DIM] = "dim", - [MLX5E_SQ_STATE_VLAN_NEED_L2_INLINE] = "vlan_need_l2_inline", [MLX5E_SQ_STATE_PENDING_XSK_TX] = "pending_xsk_tx", [MLX5E_SQ_STATE_PENDING_TLS_RX_RESYNC] = "pending_tls_rx_resync", }; @@ -316,6 +315,30 @@ out: mlx5e_health_fmsg_named_obj_nest_end(fmsg); } +static void +mlx5e_tx_reporter_diagnose_tis_config(struct devlink_health_reporter *reporter, + struct devlink_fmsg *fmsg) +{ + struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); + u8 num_tc = mlx5e_get_dcb_num_tc(&priv->channels.params); + u32 tc, i, tisn; + + devlink_fmsg_arr_pair_nest_start(fmsg, "TIS Config"); + for (i = 0; i < mlx5e_get_num_lag_ports(priv->mdev); i++) { + for (tc = 0; tc < num_tc; tc++) { + tisn = mlx5e_profile_get_tisn(priv->mdev, priv, + priv->profile, i, tc); + + devlink_fmsg_obj_nest_start(fmsg); + devlink_fmsg_u32_pair_put(fmsg, "lag port", i); + devlink_fmsg_u32_pair_put(fmsg, "tc", tc); + devlink_fmsg_u32_pair_put(fmsg, "tisn", tisn); + devlink_fmsg_obj_nest_end(fmsg); + } + } + devlink_fmsg_arr_pair_nest_end(fmsg); +} + static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter, struct devlink_fmsg *fmsg, struct netlink_ext_ack *extack) @@ -331,6 +354,7 @@ static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter, goto unlock; mlx5e_tx_reporter_diagnose_common_config(reporter, fmsg); + mlx5e_tx_reporter_diagnose_tis_config(reporter, fmsg); devlink_fmsg_arr_pair_nest_start(fmsg, "SQs"); for (i = 0; i < priv->channels.num; i++) { @@ -519,14 +543,16 @@ void mlx5e_reporter_tx_ptpsq_unhealthy(struct mlx5e_ptpsq *ptpsq) mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx); } -#define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500 
+#define MLX5E_REPORTER_TX_GRACEFUL_PERIOD 500 +#define MLX5E_REPORTER_TX_BURST_PERIOD 500 static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = { .name = "tx", .recover = mlx5e_tx_reporter_recover, .diagnose = mlx5e_tx_reporter_diagnose, .dump = mlx5e_tx_reporter_dump, - .default_graceful_period = MLX5_REPORTER_TX_GRACEFUL_PERIOD, + .default_graceful_period = MLX5E_REPORTER_TX_GRACEFUL_PERIOD, + .default_burst_period = MLX5E_REPORTER_TX_BURST_PERIOD, }; void mlx5e_reporter_tx_create(struct mlx5e_priv *priv) @@ -539,8 +565,8 @@ void mlx5e_reporter_tx_create(struct mlx5e_priv *priv) priv); if (IS_ERR(reporter)) { netdev_warn(priv->netdev, - "Failed to create tx reporter, err = %ld\n", - PTR_ERR(reporter)); + "Failed to create tx reporter, err = %pe\n", + reporter); return; } priv->tx_reporter = reporter; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rss.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rss.c index 74cd111ee3..88b0e1050d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rss.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rss.c @@ -75,15 +75,14 @@ struct mlx5e_rss { struct mlx5e_tir *inner_tir[MLX5E_NUM_INDIR_TIRS]; struct mlx5e_rqt rqt; struct mlx5_core_dev *mdev; /* primary */ - u32 drop_rqn; - bool inner_ft_support; + struct mlx5e_rss_params params; bool enabled; refcount_t refcnt; }; bool mlx5e_rss_get_inner_ft_support(struct mlx5e_rss *rss) { - return rss->inner_ft_support; + return rss->params.inner_ft_support; } void mlx5e_rss_params_indir_modify_actual_size(struct mlx5e_rss *rss, u32 num_channels) @@ -91,7 +90,7 @@ void mlx5e_rss_params_indir_modify_actual_size(struct mlx5e_rss *rss, u32 num_ch rss->indir.actual_table_size = mlx5e_rqt_size(rss->mdev, num_channels); } -int mlx5e_rss_params_indir_init(struct mlx5e_rss_params_indir *indir, struct mlx5_core_dev *mdev, +int mlx5e_rss_params_indir_init(struct mlx5e_rss_params_indir *indir, u32 actual_table_size, u32 max_table_size) { indir->table = 
kvmalloc_array(max_table_size, sizeof(*indir->table), GFP_KERNEL); @@ -139,7 +138,8 @@ static struct mlx5e_rss *mlx5e_rss_init_copy(const struct mlx5e_rss *from) if (!rss) return ERR_PTR(-ENOMEM); - err = mlx5e_rss_params_indir_init(&rss->indir, from->mdev, from->indir.actual_table_size, + err = mlx5e_rss_params_indir_init(&rss->indir, + from->indir.actual_table_size, from->indir.max_table_size); if (err) goto err_free_rss; @@ -192,11 +192,12 @@ mlx5e_rss_get_tt_config(struct mlx5e_rss *rss, enum mlx5_traffic_types tt) return rss_tt; } -static int mlx5e_rss_create_tir(struct mlx5e_rss *rss, - enum mlx5_traffic_types tt, - const struct mlx5e_packet_merge_param *init_pkt_merge_param, - bool inner) +static int +mlx5e_rss_create_tir(struct mlx5e_rss *rss, enum mlx5_traffic_types tt, + const struct mlx5e_packet_merge_param *pkt_merge_param, + bool inner) { + bool rss_inner = rss->params.inner_ft_support; struct mlx5e_rss_params_traffic_type rss_tt; struct mlx5e_tir_builder *builder; struct mlx5e_tir **tir_p; @@ -204,7 +205,7 @@ static int mlx5e_rss_create_tir(struct mlx5e_rss *rss, u32 rqtn; int err; - if (inner && !rss->inner_ft_support) { + if (inner && !rss_inner) { mlx5e_rss_warn(rss->mdev, "Cannot create inner indirect TIR[%d], RSS inner FT is not supported.\n", tt); @@ -227,9 +228,11 @@ static int mlx5e_rss_create_tir(struct mlx5e_rss *rss, rqtn = mlx5e_rqt_get_rqtn(&rss->rqt); mlx5e_tir_builder_build_rqt(builder, rss->mdev->mlx5e_res.hw_objs.td.tdn, - rqtn, rss->inner_ft_support); - mlx5e_tir_builder_build_packet_merge(builder, init_pkt_merge_param); + rqtn, rss_inner); + mlx5e_tir_builder_build_packet_merge(builder, pkt_merge_param); rss_tt = mlx5e_rss_get_tt_config(rss, tt); + mlx5e_tir_builder_build_self_lb_block(builder, rss->params.self_lb_blk, + rss->params.self_lb_blk); mlx5e_tir_builder_build_rss(builder, &rss->hash, &rss_tt, inner); err = mlx5e_tir_init(tir, builder, rss->mdev, true); @@ -264,15 +267,16 @@ static void mlx5e_rss_destroy_tir(struct 
mlx5e_rss *rss, enum mlx5_traffic_types *tir_p = NULL; } -static int mlx5e_rss_create_tirs(struct mlx5e_rss *rss, - const struct mlx5e_packet_merge_param *init_pkt_merge_param, - bool inner) +static int +mlx5e_rss_create_tirs(struct mlx5e_rss *rss, + const struct mlx5e_packet_merge_param *pkt_merge_param, + bool inner) { enum mlx5_traffic_types tt, max_tt; int err; for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++) { - err = mlx5e_rss_create_tir(rss, tt, init_pkt_merge_param, inner); + err = mlx5e_rss_create_tir(rss, tt, pkt_merge_param, inner); if (err) goto err_destroy_tirs; } @@ -335,7 +339,7 @@ static int mlx5e_rss_update_tirs(struct mlx5e_rss *rss) tt, err); } - if (!rss->inner_ft_support) + if (!rss->params.inner_ft_support) continue; err = mlx5e_rss_update_tir(rss, tt, true); @@ -355,14 +359,16 @@ static int mlx5e_rss_init_no_tirs(struct mlx5e_rss *rss) refcount_set(&rss->refcnt, 1); return mlx5e_rqt_init_direct(&rss->rqt, rss->mdev, true, - rss->drop_rqn, rss->indir.max_table_size); + rss->params.drop_rqn, + rss->indir.max_table_size); } -struct mlx5e_rss *mlx5e_rss_init(struct mlx5_core_dev *mdev, bool inner_ft_support, u32 drop_rqn, - const struct mlx5e_packet_merge_param *init_pkt_merge_param, - enum mlx5e_rss_init_type type, unsigned int nch, - unsigned int max_nch) +struct mlx5e_rss * +mlx5e_rss_init(struct mlx5_core_dev *mdev, + const struct mlx5e_rss_params *params, + const struct mlx5e_rss_init_params *init_params) { + u32 rqt_max_size, rqt_size; struct mlx5e_rss *rss; int err; @@ -370,29 +376,31 @@ struct mlx5e_rss *mlx5e_rss_init(struct mlx5_core_dev *mdev, bool inner_ft_suppo if (!rss) return ERR_PTR(-ENOMEM); - err = mlx5e_rss_params_indir_init(&rss->indir, mdev, - mlx5e_rqt_size(mdev, nch), - mlx5e_rqt_size(mdev, max_nch)); + rqt_size = mlx5e_rqt_size(mdev, init_params->nch); + rqt_max_size = mlx5e_rqt_size(mdev, init_params->max_nch); + err = mlx5e_rss_params_indir_init(&rss->indir, rqt_size, rqt_max_size); if (err) goto err_free_rss; rss->mdev = 
mdev; - rss->inner_ft_support = inner_ft_support; - rss->drop_rqn = drop_rqn; + rss->params = *params; err = mlx5e_rss_init_no_tirs(rss); if (err) goto err_free_indir; - if (type == MLX5E_RSS_INIT_NO_TIRS) + if (init_params->type == MLX5E_RSS_INIT_NO_TIRS) goto out; - err = mlx5e_rss_create_tirs(rss, init_pkt_merge_param, false); + err = mlx5e_rss_create_tirs(rss, init_params->pkt_merge_param, + false); if (err) goto err_destroy_rqt; - if (inner_ft_support) { - err = mlx5e_rss_create_tirs(rss, init_pkt_merge_param, true); + if (params->inner_ft_support) { + err = mlx5e_rss_create_tirs(rss, + init_params->pkt_merge_param, + true); if (err) goto err_destroy_tirs; } @@ -418,7 +426,7 @@ int mlx5e_rss_cleanup(struct mlx5e_rss *rss) mlx5e_rss_destroy_tirs(rss, false); - if (rss->inner_ft_support) + if (rss->params.inner_ft_support) mlx5e_rss_destroy_tirs(rss, true); mlx5e_rqt_destroy(&rss->rqt); @@ -448,7 +456,7 @@ u32 mlx5e_rss_get_tirn(struct mlx5e_rss *rss, enum mlx5_traffic_types tt, { struct mlx5e_tir *tir; - WARN_ON(inner && !rss->inner_ft_support); + WARN_ON(inner && !rss->params.inner_ft_support); tir = rss_get_tir(rss, tt, inner); WARN_ON(!tir); @@ -468,10 +476,10 @@ bool mlx5e_rss_valid_tir(struct mlx5e_rss *rss, enum mlx5_traffic_types tt, bool /* Fill the "tirn" output parameter. * Create the requested TIR if it's its first usage. 
*/ -int mlx5e_rss_obtain_tirn(struct mlx5e_rss *rss, - enum mlx5_traffic_types tt, - const struct mlx5e_packet_merge_param *init_pkt_merge_param, - bool inner, u32 *tirn) +int +mlx5e_rss_obtain_tirn(struct mlx5e_rss *rss, enum mlx5_traffic_types tt, + const struct mlx5e_packet_merge_param *pkt_merge_param, + bool inner, u32 *tirn) { struct mlx5e_tir *tir; @@ -479,7 +487,7 @@ int mlx5e_rss_obtain_tirn(struct mlx5e_rss *rss, if (!tir) { /* TIR doesn't exist, create one */ int err; - err = mlx5e_rss_create_tir(rss, tt, init_pkt_merge_param, inner); + err = mlx5e_rss_create_tir(rss, tt, pkt_merge_param, inner); if (err) return err; tir = rss_get_tir(rss, tt, inner); @@ -512,10 +520,11 @@ void mlx5e_rss_disable(struct mlx5e_rss *rss) int err; rss->enabled = false; - err = mlx5e_rqt_redirect_direct(&rss->rqt, rss->drop_rqn, NULL); + err = mlx5e_rqt_redirect_direct(&rss->rqt, rss->params.drop_rqn, NULL); if (err) mlx5e_rss_warn(rss->mdev, "Failed to redirect RQT %#x to drop RQ %#x: err = %d\n", - mlx5e_rqt_get_rqtn(&rss->rqt), rss->drop_rqn, err); + mlx5e_rqt_get_rqtn(&rss->rqt), + rss->params.drop_rqn, err); } int mlx5e_rss_packet_merge_set_param(struct mlx5e_rss *rss, @@ -548,7 +557,7 @@ int mlx5e_rss_packet_merge_set_param(struct mlx5e_rss *rss, } inner_tir: - if (!rss->inner_ft_support) + if (!rss->params.inner_ft_support) continue; tir = rss_get_tir(rss, tt, true); @@ -567,7 +576,8 @@ inner_tir: return final_err; } -int mlx5e_rss_get_rxfh(struct mlx5e_rss *rss, u32 *indir, u8 *key, u8 *hfunc, bool *symmetric) +void mlx5e_rss_get_rxfh(struct mlx5e_rss *rss, u32 *indir, u8 *key, u8 *hfunc, + bool *symmetric) { if (indir) memcpy(indir, rss->indir.table, @@ -582,8 +592,6 @@ int mlx5e_rss_get_rxfh(struct mlx5e_rss *rss, u32 *indir, u8 *key, u8 *hfunc, bo if (symmetric) *symmetric = rss->hash.symmetric; - - return 0; } int mlx5e_rss_set_rxfh(struct mlx5e_rss *rss, const u32 *indir, @@ -682,7 +690,7 @@ int mlx5e_rss_set_hash_fields(struct mlx5e_rss *rss, enum 
mlx5_traffic_types tt, return err; } - if (!(rss->inner_ft_support)) + if (!(rss->params.inner_ft_support)) return 0; err = mlx5e_rss_update_tir(rss, tt, true); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rss.h b/drivers/net/ethernet/mellanox/mlx5/core/en/rss.h index 8ac9021900..17664757a5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rss.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rss.h @@ -13,19 +13,32 @@ enum mlx5e_rss_init_type { MLX5E_RSS_INIT_TIRS }; +struct mlx5e_rss_init_params { + enum mlx5e_rss_init_type type; + const struct mlx5e_packet_merge_param *pkt_merge_param; + unsigned int nch; + unsigned int max_nch; +}; + +struct mlx5e_rss_params { + bool inner_ft_support; + u32 drop_rqn; + bool self_lb_blk; +}; + struct mlx5e_rss_params_traffic_type mlx5e_rss_get_default_tt_config(enum mlx5_traffic_types tt); struct mlx5e_rss; -int mlx5e_rss_params_indir_init(struct mlx5e_rss_params_indir *indir, struct mlx5_core_dev *mdev, +int mlx5e_rss_params_indir_init(struct mlx5e_rss_params_indir *indir, u32 actual_table_size, u32 max_table_size); void mlx5e_rss_params_indir_cleanup(struct mlx5e_rss_params_indir *indir); void mlx5e_rss_params_indir_modify_actual_size(struct mlx5e_rss *rss, u32 num_channels); -struct mlx5e_rss *mlx5e_rss_init(struct mlx5_core_dev *mdev, bool inner_ft_support, u32 drop_rqn, - const struct mlx5e_packet_merge_param *init_pkt_merge_param, - enum mlx5e_rss_init_type type, unsigned int nch, - unsigned int max_nch); +struct mlx5e_rss * +mlx5e_rss_init(struct mlx5_core_dev *mdev, + const struct mlx5e_rss_params *params, + const struct mlx5e_rss_init_params *init_params); int mlx5e_rss_cleanup(struct mlx5e_rss *rss); void mlx5e_rss_refcnt_inc(struct mlx5e_rss *rss); @@ -37,17 +50,18 @@ u32 mlx5e_rss_get_tirn(struct mlx5e_rss *rss, enum mlx5_traffic_types tt, bool inner); bool mlx5e_rss_valid_tir(struct mlx5e_rss *rss, enum mlx5_traffic_types tt, bool inner); u32 mlx5e_rss_get_rqtn(struct mlx5e_rss *rss); -int 
mlx5e_rss_obtain_tirn(struct mlx5e_rss *rss, - enum mlx5_traffic_types tt, - const struct mlx5e_packet_merge_param *init_pkt_merge_param, - bool inner, u32 *tirn); +int +mlx5e_rss_obtain_tirn(struct mlx5e_rss *rss, enum mlx5_traffic_types tt, + const struct mlx5e_packet_merge_param *pkt_merge_param, + bool inner, u32 *tirn); void mlx5e_rss_enable(struct mlx5e_rss *rss, u32 *rqns, u32 *vhca_ids, unsigned int num_rqns); void mlx5e_rss_disable(struct mlx5e_rss *rss); int mlx5e_rss_packet_merge_set_param(struct mlx5e_rss *rss, struct mlx5e_packet_merge_param *pkt_merge_param); -int mlx5e_rss_get_rxfh(struct mlx5e_rss *rss, u32 *indir, u8 *key, u8 *hfunc, bool *symmetric); +void mlx5e_rss_get_rxfh(struct mlx5e_rss *rss, u32 *indir, u8 *key, u8 *hfunc, + bool *symmetric); int mlx5e_rss_set_rxfh(struct mlx5e_rss *rss, const u32 *indir, const u8 *key, const u8 *hfunc, const bool *symmetric, u32 *rqns, u32 *vhca_ids, unsigned int num_rqns); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.c index 5fcbe47337..55c117b7d8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.c @@ -54,51 +54,74 @@ static int mlx5e_rx_res_rss_init_def(struct mlx5e_rx_res *res, unsigned int init_nch) { bool inner_ft_support = res->features & MLX5E_RX_RES_FEATURE_INNER_FT; + struct mlx5e_rss_init_params init_params; + struct mlx5e_rss_params rss_params; struct mlx5e_rss *rss; if (WARN_ON(res->rss[0])) return -EINVAL; - rss = mlx5e_rss_init(res->mdev, inner_ft_support, res->drop_rqn, - &res->pkt_merge_param, MLX5E_RSS_INIT_TIRS, init_nch, res->max_nch); + init_params = (struct mlx5e_rss_init_params) { + .type = MLX5E_RSS_INIT_TIRS, + .pkt_merge_param = &res->pkt_merge_param, + .nch = init_nch, + .max_nch = res->max_nch, + }; + + rss_params = (struct mlx5e_rss_params) { + .inner_ft_support = inner_ft_support, + .drop_rqn = res->drop_rqn, + .self_lb_blk = + 
res->features & MLX5E_RX_RES_FEATURE_SELF_LB_BLOCK, + }; + + rss = mlx5e_rss_init(res->mdev, &rss_params, &init_params); if (IS_ERR(rss)) return PTR_ERR(rss); - mlx5e_rss_set_indir_uniform(rss, init_nch); + mlx5e_rss_set_indir_uniform(rss, init_params.nch); res->rss[0] = rss; return 0; } -int mlx5e_rx_res_rss_init(struct mlx5e_rx_res *res, u32 *rss_idx, unsigned int init_nch) +int mlx5e_rx_res_rss_init(struct mlx5e_rx_res *res, u32 rss_idx, unsigned int init_nch) { bool inner_ft_support = res->features & MLX5E_RX_RES_FEATURE_INNER_FT; + struct mlx5e_rss_init_params init_params; + struct mlx5e_rss_params rss_params; struct mlx5e_rss *rss; - int i; - for (i = 1; i < MLX5E_MAX_NUM_RSS; i++) - if (!res->rss[i]) - break; - - if (i == MLX5E_MAX_NUM_RSS) + if (WARN_ON_ONCE(res->rss[rss_idx])) return -ENOSPC; - rss = mlx5e_rss_init(res->mdev, inner_ft_support, res->drop_rqn, - &res->pkt_merge_param, MLX5E_RSS_INIT_NO_TIRS, init_nch, - res->max_nch); + init_params = (struct mlx5e_rss_init_params) { + .type = MLX5E_RSS_INIT_NO_TIRS, + .pkt_merge_param = &res->pkt_merge_param, + .nch = init_nch, + .max_nch = res->max_nch, + }; + + rss_params = (struct mlx5e_rss_params) { + .inner_ft_support = inner_ft_support, + .drop_rqn = res->drop_rqn, + .self_lb_blk = + res->features & MLX5E_RX_RES_FEATURE_SELF_LB_BLOCK, + }; + + rss = mlx5e_rss_init(res->mdev, &rss_params, &init_params); if (IS_ERR(rss)) return PTR_ERR(rss); - mlx5e_rss_set_indir_uniform(rss, init_nch); + mlx5e_rss_set_indir_uniform(rss, init_params.nch); if (res->rss_active) { u32 *vhca_ids = get_vhca_ids(res, 0); mlx5e_rss_enable(rss, res->rss_rqns, vhca_ids, res->rss_nch); } - res->rss[i] = rss; - *rss_idx = i; + res->rss[rss_idx] = rss; return 0; } @@ -193,19 +216,17 @@ void mlx5e_rx_res_rss_set_indir_uniform(struct mlx5e_rx_res *res, unsigned int n mlx5e_rss_set_indir_uniform(res->rss[0], nch); } -int mlx5e_rx_res_rss_get_rxfh(struct mlx5e_rx_res *res, u32 rss_idx, - u32 *indir, u8 *key, u8 *hfunc, bool *symmetric) 
+void mlx5e_rx_res_rss_get_rxfh(struct mlx5e_rx_res *res, u32 rss_idx, + u32 *indir, u8 *key, u8 *hfunc, bool *symmetric) { - struct mlx5e_rss *rss; + struct mlx5e_rss *rss = NULL; - if (rss_idx >= MLX5E_MAX_NUM_RSS) - return -EINVAL; + if (rss_idx < MLX5E_MAX_NUM_RSS) + rss = res->rss[rss_idx]; + if (WARN_ON_ONCE(!rss)) + return; - rss = res->rss[rss_idx]; - if (!rss) - return -ENOENT; - - return mlx5e_rss_get_rxfh(rss, indir, key, hfunc, symmetric); + mlx5e_rss_get_rxfh(rss, indir, key, hfunc, symmetric); } int mlx5e_rx_res_rss_set_rxfh(struct mlx5e_rx_res *res, u32 rss_idx, @@ -329,6 +350,7 @@ static struct mlx5e_rx_res *mlx5e_rx_res_alloc(struct mlx5_core_dev *mdev, unsig static int mlx5e_rx_res_channels_init(struct mlx5e_rx_res *res) { bool inner_ft_support = res->features & MLX5E_RX_RES_FEATURE_INNER_FT; + bool self_lb_blk = res->features & MLX5E_RX_RES_FEATURE_SELF_LB_BLOCK; struct mlx5e_tir_builder *builder; int err = 0; int ix; @@ -359,6 +381,8 @@ static int mlx5e_rx_res_channels_init(struct mlx5e_rx_res *res) mlx5e_rqt_get_rqtn(&res->channels[ix].direct_rqt), inner_ft_support); mlx5e_tir_builder_build_packet_merge(builder, &res->pkt_merge_param); + mlx5e_tir_builder_build_self_lb_block(builder, self_lb_blk, + self_lb_blk); mlx5e_tir_builder_build_direct(builder); err = mlx5e_tir_init(&res->channels[ix].direct_tir, builder, res->mdev, true); @@ -446,7 +470,7 @@ static void mlx5e_rx_res_ptp_destroy(struct mlx5e_rx_res *res) struct mlx5e_rx_res * mlx5e_rx_res_create(struct mlx5_core_dev *mdev, enum mlx5e_rx_res_features features, unsigned int max_nch, u32 drop_rqn, - const struct mlx5e_packet_merge_param *init_pkt_merge_param, + const struct mlx5e_packet_merge_param *pkt_merge_param, unsigned int init_nch) { bool multi_vhca = features & MLX5E_RX_RES_FEATURE_MULTI_VHCA; @@ -462,7 +486,7 @@ mlx5e_rx_res_create(struct mlx5_core_dev *mdev, enum mlx5e_rx_res_features featu res->max_nch = max_nch; res->drop_rqn = drop_rqn; - res->pkt_merge_param = 
*init_pkt_merge_param; + res->pkt_merge_param = *pkt_merge_param; init_rwsem(&res->pkt_merge_param_sem); err = mlx5e_rx_res_rss_init_def(res, init_nch); @@ -579,8 +603,6 @@ void mlx5e_rx_res_channels_activate(struct mlx5e_rx_res *res, struct mlx5e_chann for (ix = 0; ix < nch; ix++) mlx5e_rx_res_channel_activate_direct(res, chs, ix); - for (ix = nch; ix < res->max_nch; ix++) - mlx5e_rx_res_channel_deactivate_direct(res, ix); if (res->features & MLX5E_RX_RES_FEATURE_PTP) { u32 rqn; @@ -603,7 +625,7 @@ void mlx5e_rx_res_channels_deactivate(struct mlx5e_rx_res *res) mlx5e_rx_res_rss_disable(res); - for (ix = 0; ix < res->max_nch; ix++) + for (ix = 0; ix < res->rss_nch; ix++) mlx5e_rx_res_channel_deactivate_direct(res, ix); if (res->features & MLX5E_RX_RES_FEATURE_PTP) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.h b/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.h index 3e09d91281..675780120a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.h @@ -21,13 +21,14 @@ enum mlx5e_rx_res_features { MLX5E_RX_RES_FEATURE_INNER_FT = BIT(0), MLX5E_RX_RES_FEATURE_PTP = BIT(1), MLX5E_RX_RES_FEATURE_MULTI_VHCA = BIT(2), + MLX5E_RX_RES_FEATURE_SELF_LB_BLOCK = BIT(3), }; /* Setup */ struct mlx5e_rx_res * mlx5e_rx_res_create(struct mlx5_core_dev *mdev, enum mlx5e_rx_res_features features, unsigned int max_nch, u32 drop_rqn, - const struct mlx5e_packet_merge_param *init_pkt_merge_param, + const struct mlx5e_packet_merge_param *pkt_merge_param, unsigned int init_nch); void mlx5e_rx_res_destroy(struct mlx5e_rx_res *res); @@ -48,8 +49,9 @@ void mlx5e_rx_res_xsk_update(struct mlx5e_rx_res *res, struct mlx5e_channels *ch /* Configuration API */ void mlx5e_rx_res_rss_set_indir_uniform(struct mlx5e_rx_res *res, unsigned int nch); -int mlx5e_rx_res_rss_get_rxfh(struct mlx5e_rx_res *res, u32 rss_idx, - u32 *indir, u8 *key, u8 *hfunc, bool *symmetric); +void mlx5e_rx_res_rss_get_rxfh(struct mlx5e_rx_res 
*res, u32 rss_idx, + u32 *indir, u8 *key, u8 *hfunc, + bool *symmetric); int mlx5e_rx_res_rss_set_rxfh(struct mlx5e_rx_res *res, u32 rss_idx, const u32 *indir, const u8 *key, const u8 *hfunc, const bool *symmetric); @@ -61,7 +63,7 @@ int mlx5e_rx_res_rss_set_hash_fields(struct mlx5e_rx_res *res, u32 rss_idx, int mlx5e_rx_res_packet_merge_set_param(struct mlx5e_rx_res *res, struct mlx5e_packet_merge_param *pkt_merge_param); -int mlx5e_rx_res_rss_init(struct mlx5e_rx_res *res, u32 *rss_idx, unsigned int init_nch); +int mlx5e_rx_res_rss_init(struct mlx5e_rx_res *res, u32 rss_idx, unsigned int init_nch); int mlx5e_rx_res_rss_destroy(struct mlx5e_rx_res *res, u32 rss_idx); int mlx5e_rx_res_rss_cnt(struct mlx5e_rx_res *res); int mlx5e_rx_res_rss_index(struct mlx5e_rx_res *res, struct mlx5e_rss *rss); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/vlan.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/vlan.c index a13c5e707b..9bdb5820c5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/vlan.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/vlan.c @@ -94,29 +94,30 @@ mlx5e_tc_act_vlan_add_push_action(struct mlx5e_priv *priv, struct net_device **out_dev, struct netlink_ext_ack *extack) { - struct net_device *vlan_dev = *out_dev; - struct flow_action_entry vlan_act = { - .id = FLOW_ACTION_VLAN_PUSH, - .vlan.vid = vlan_dev_vlan_id(vlan_dev), - .vlan.proto = vlan_dev_vlan_proto(vlan_dev), - .vlan.prio = 0, - }; - int err; + do { + struct net_device *vlan_dev = *out_dev; + struct flow_action_entry vlan_act = { + .id = FLOW_ACTION_VLAN_PUSH, + .vlan.vid = vlan_dev_vlan_id(vlan_dev), + .vlan.proto = vlan_dev_vlan_proto(vlan_dev), + .vlan.prio = 0, + }; + int err; - err = parse_tc_vlan_action(priv, &vlan_act, attr->esw_attr, &attr->action, extack, NULL); - if (err) - return err; + err = parse_tc_vlan_action(priv, &vlan_act, attr->esw_attr, + &attr->action, extack, NULL); + if (err) + return err; - rcu_read_lock(); - *out_dev = 
dev_get_by_index_rcu(dev_net(vlan_dev), dev_get_iflink(vlan_dev)); - rcu_read_unlock(); - if (!*out_dev) - return -ENODEV; + rcu_read_lock(); + *out_dev = dev_get_by_index_rcu(dev_net(vlan_dev), + dev_get_iflink(vlan_dev)); + rcu_read_unlock(); + if (!*out_dev) + return -ENODEV; + } while (is_vlan_dev(*out_dev)); - if (is_vlan_dev(*out_dev)) - err = mlx5e_tc_act_vlan_add_push_action(priv, attr, out_dev, extack); - - return err; + return 0; } int diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_hmfs.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_hmfs.c index a4263137fe..d3db6146fc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_hmfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_hmfs.c @@ -136,8 +136,8 @@ mlx5_ct_fs_hmfs_matcher_get(struct mlx5_ct_fs *fs, struct mlx5_flow_spec *spec, hws_bwc_matcher = mlx5_ct_fs_hmfs_matcher_create(fs, tbl, spec, ipv4, tcp, gre); if (IS_ERR(hws_bwc_matcher)) { netdev_warn(fs->netdev, - "ct_fs_hmfs: failed to create bwc matcher (nat %d, ipv4 %d, tcp %d, gre %d), err: %ld\n", - nat, ipv4, tcp, gre, PTR_ERR(hws_bwc_matcher)); + "ct_fs_hmfs: failed to create bwc matcher (nat %d, ipv4 %d, tcp %d, gre %d), err: %pe\n", + nat, ipv4, tcp, gre, hws_bwc_matcher); hmfs_matcher = ERR_CAST(hws_bwc_matcher); goto out_unlock; @@ -173,6 +173,8 @@ static void mlx5_ct_fs_hmfs_fill_rule_actions(struct mlx5_ct_fs_hmfs *fs_hmfs, memset(rule_actions, 0, NUM_CT_HMFS_RULES * sizeof(*rule_actions)); rule_actions[0].action = mlx5_fc_get_hws_action(fs_hmfs->ctx, attr->counter); + rule_actions[0].counter.offset = + attr->counter->id - attr->counter->bulk->base_id; /* Modify header is special, it may require extra arguments outside the action itself. 
*/ if (mh_action->mh_data) { rule_actions[1].modify_header.offset = mh_action->mh_data->offset; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_smfs.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_smfs.c index 0c97c58999..4d6924b644 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_smfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_smfs.c @@ -148,8 +148,8 @@ mlx5_ct_fs_smfs_matcher_get(struct mlx5_ct_fs *fs, bool nat, bool ipv4, bool tcp dr_matcher = mlx5_ct_fs_smfs_matcher_create(fs, tbl, ipv4, tcp, gre, prio); if (IS_ERR(dr_matcher)) { netdev_warn(fs->netdev, - "ct_fs_smfs: failed to create matcher (nat %d, ipv4 %d, tcp %d, gre %d), err: %ld\n", - nat, ipv4, tcp, gre, PTR_ERR(dr_matcher)); + "ct_fs_smfs: failed to create matcher (nat %d, ipv4 %d, tcp %d, gre %d), err: %pe\n", + nat, ipv4, tcp, gre, dr_matcher); smfs_matcher = ERR_CAST(dr_matcher); goto out_unlock; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/int_port.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/int_port.c index 8afcec0c5d..991f470506 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/int_port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/int_port.c @@ -93,8 +93,8 @@ mlx5e_int_port_create_rx_rule(struct mlx5_eswitch *esw, flow_rule = mlx5_add_flow_rules(esw->offloads.ft_offloads, spec, &flow_act, dest, 1); if (IS_ERR(flow_rule)) - mlx5_core_warn(esw->dev, "ft offloads: Failed to add internal vport rx rule err %ld\n", - PTR_ERR(flow_rule)); + mlx5_core_warn(esw->dev, "ft offloads: Failed to add internal vport rx rule err %pe\n", + flow_rule); kvfree(spec); @@ -307,7 +307,8 @@ mlx5e_tc_int_port_init(struct mlx5e_priv *priv) { struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; struct mlx5e_tc_int_port_priv *int_port_priv; - u64 mapping_id; + u8 mapping_id[MLX5_SW_IMAGE_GUID_MAX_BYTES]; + u8 id_len; if (!mlx5e_tc_int_port_supported(esw)) return NULL; @@ -316,14 +317,15 @@ mlx5e_tc_int_port_init(struct mlx5e_priv 
*priv) if (!int_port_priv) return NULL; - mapping_id = mlx5_query_nic_system_image_guid(priv->mdev); + mlx5_query_nic_sw_system_image_guid(priv->mdev, mapping_id, &id_len); - int_port_priv->metadata_mapping = mapping_create_for_id(mapping_id, MAPPING_TYPE_INT_PORT, + int_port_priv->metadata_mapping = mapping_create_for_id(mapping_id, id_len, + MAPPING_TYPE_INT_PORT, sizeof(u32) * 2, (1 << ESW_VPORT_BITS) - 1, true); if (IS_ERR(int_port_priv->metadata_mapping)) { - mlx5_core_warn(priv->mdev, "Can't allocate metadata mapping of int port offload, err=%ld\n", - PTR_ERR(int_port_priv->metadata_mapping)); + mlx5_core_warn(priv->mdev, "Can't allocate metadata mapping of int port offload, err=%pe\n", + int_port_priv->metadata_mapping); goto err_mapping; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c index 81332cd4a5..fc0e57403d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c @@ -1195,6 +1195,7 @@ mlx5_tc_ct_block_flow_offload_add(struct mlx5_ct_ft *ft, struct flow_action_entry *meta_action; unsigned long cookie = flow->cookie; struct mlx5_ct_entry *entry; + bool has_nat; int err; meta_action = mlx5_tc_ct_get_ct_metadata_action(flow_rule); @@ -1236,6 +1237,8 @@ mlx5_tc_ct_block_flow_offload_add(struct mlx5_ct_ft *ft, err = mlx5_tc_ct_rule_to_tuple_nat(&entry->tuple_nat, flow_rule); if (err) goto err_set; + has_nat = memcmp(&entry->tuple, &entry->tuple_nat, + sizeof(entry->tuple)); spin_lock_bh(&ct_priv->ht_lock); @@ -1244,7 +1247,7 @@ mlx5_tc_ct_block_flow_offload_add(struct mlx5_ct_ft *ft, if (err) goto err_entries; - if (memcmp(&entry->tuple, &entry->tuple_nat, sizeof(entry->tuple))) { + if (has_nat) { err = rhashtable_lookup_insert_fast(&ct_priv->ct_tuples_nat_ht, &entry->tuple_nat_node, tuples_nat_ht_params); @@ -2284,9 +2287,10 @@ mlx5_tc_ct_init(struct mlx5e_priv *priv, struct mlx5_fs_chains *chains, enum 
mlx5_flow_namespace_type ns_type, struct mlx5e_post_act *post_act) { + u8 mapping_id[MLX5_SW_IMAGE_GUID_MAX_BYTES]; struct mlx5_tc_ct_priv *ct_priv; struct mlx5_core_dev *dev; - u64 mapping_id; + u8 id_len; int err; dev = priv->mdev; @@ -2298,16 +2302,18 @@ mlx5_tc_ct_init(struct mlx5e_priv *priv, struct mlx5_fs_chains *chains, if (!ct_priv) goto err_alloc; - mapping_id = mlx5_query_nic_system_image_guid(dev); + mlx5_query_nic_sw_system_image_guid(dev, mapping_id, &id_len); - ct_priv->zone_mapping = mapping_create_for_id(mapping_id, MAPPING_TYPE_ZONE, + ct_priv->zone_mapping = mapping_create_for_id(mapping_id, id_len, + MAPPING_TYPE_ZONE, sizeof(u16), 0, true); if (IS_ERR(ct_priv->zone_mapping)) { err = PTR_ERR(ct_priv->zone_mapping); goto err_mapping_zone; } - ct_priv->labels_mapping = mapping_create_for_id(mapping_id, MAPPING_TYPE_LABELS, + ct_priv->labels_mapping = mapping_create_for_id(mapping_id, id_len, + MAPPING_TYPE_LABELS, sizeof(u32) * 4, 0, true); if (IS_ERR(ct_priv->labels_mapping)) { err = PTR_ERR(ct_priv->labels_mapping); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c index a0fc76a1bc..0735d10f2b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c @@ -172,8 +172,8 @@ void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv, &reformat_params, MLX5_FLOW_NAMESPACE_FDB); if (IS_ERR(e->pkt_reformat)) { - mlx5_core_warn(priv->mdev, "Failed to offload cached encapsulation header, %lu\n", - PTR_ERR(e->pkt_reformat)); + mlx5_core_warn(priv->mdev, "Failed to offload cached encapsulation header, %pe\n", + e->pkt_reformat); return; } e->flags |= MLX5_ENCAP_ENTRY_VALID; @@ -1845,8 +1845,8 @@ static int mlx5e_tc_tun_fib_event(struct notifier_block *nb, unsigned long event queue_work(priv->wq, &fib_work->work); } else if (IS_ERR(fib_work)) { NL_SET_ERR_MSG_MOD(info->extack, "Failed to init fib work"); - 
mlx5_core_warn(priv->mdev, "Failed to init fib work, %ld\n", - PTR_ERR(fib_work)); + mlx5_core_warn(priv->mdev, "Failed to init fib work, %pe\n", + fib_work); } break; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tir.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tir.c index 19499072f6..0b55e77f19 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tir.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tir.c @@ -146,6 +146,31 @@ void mlx5e_tir_builder_build_direct(struct mlx5e_tir_builder *builder) MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FN_INVERTED_XOR8); } +static void mlx5e_tir_context_self_lb_block(void *tirc, bool enable_uc_lb, + bool enable_mc_lb) +{ + u8 lb_flags = 0; + + if (enable_uc_lb) + lb_flags = MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST; + if (enable_mc_lb) + lb_flags |= MLX5_TIRC_SELF_LB_BLOCK_BLOCK_MULTICAST; + + MLX5_SET(tirc, tirc, self_lb_block, lb_flags); +} + +void mlx5e_tir_builder_build_self_lb_block(struct mlx5e_tir_builder *builder, + bool enable_uc_lb, + bool enable_mc_lb) +{ + void *tirc = mlx5e_tir_builder_get_tirc(builder); + + if (builder->modify) + MLX5_SET(modify_tir_in, builder->in, bitmask.self_lb_en, 1); + + mlx5e_tir_context_self_lb_block(tirc, enable_uc_lb, enable_mc_lb); +} + void mlx5e_tir_builder_build_tls(struct mlx5e_tir_builder *builder) { void *tirc = mlx5e_tir_builder_get_tirc(builder); @@ -153,9 +178,7 @@ void mlx5e_tir_builder_build_tls(struct mlx5e_tir_builder *builder) WARN_ON(builder->modify); MLX5_SET(tirc, tirc, tls_en, 1); - MLX5_SET(tirc, tirc, self_lb_block, - MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST | - MLX5_TIRC_SELF_LB_BLOCK_BLOCK_MULTICAST); + mlx5e_tir_context_self_lb_block(tirc, true, true); } int mlx5e_tir_init(struct mlx5e_tir *tir, struct mlx5e_tir_builder *builder, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tir.h b/drivers/net/ethernet/mellanox/mlx5/core/en/tir.h index e8df3aaf65..958eeb959a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tir.h +++ 
b/drivers/net/ethernet/mellanox/mlx5/core/en/tir.h @@ -35,6 +35,9 @@ void mlx5e_tir_builder_build_rss(struct mlx5e_tir_builder *builder, const struct mlx5e_rss_params_traffic_type *rss_tt, bool inner); void mlx5e_tir_builder_build_direct(struct mlx5e_tir_builder *builder); +void mlx5e_tir_builder_build_self_lb_block(struct mlx5e_tir_builder *builder, + bool enable_uc_lb, + bool enable_mc_lb); void mlx5e_tir_builder_build_tls(struct mlx5e_tir_builder *builder); struct mlx5_core_dev; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c b/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c index 140606fcd2..1b1c89014b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c @@ -47,7 +47,7 @@ static void mlx5e_init_trap_rq(struct mlx5e_trap *t, struct mlx5e_params *params rq->netdev = priv->netdev; rq->priv = priv; rq->clock = mdev->clock; - rq->tstamp = &priv->tstamp; + rq->hwtstamp_config = &priv->hwtstamp_config; rq->mdev = mdev; rq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); rq->stats = &priv->trap_stats.rq; @@ -76,6 +76,7 @@ static int mlx5e_open_trap_rq(struct mlx5e_priv *priv, struct mlx5e_trap *t) ccp.ch_stats = t->stats; ccp.napi = &t->napi; ccp.ix = 0; + ccp.uar = mdev->priv.bfreg.up; err = mlx5e_open_cq(priv->mdev, trap_moder, &rq_param->cqp, &ccp, &rq->cq); if (err) return err; @@ -143,7 +144,6 @@ static struct mlx5e_trap *mlx5e_open_trap(struct mlx5e_priv *priv) t->priv = priv; t->mdev = priv->mdev; - t->tstamp = &priv->tstamp; t->pdev = mlx5_core_dma_dev(priv->mdev); t->netdev = priv->netdev; t->mkey_be = cpu_to_be32(priv->mdev->mlx5e_res.hw_objs.mkey); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/trap.h b/drivers/net/ethernet/mellanox/mlx5/core/en/trap.h index aa3f17658c..394e917ea2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/trap.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/trap.h @@ -22,7 +22,6 @@ struct mlx5e_trap { /* control */ struct mlx5e_priv 
*priv; struct mlx5_core_dev *mdev; - struct hwtstamp_config *tstamp; DECLARE_BITMAP(state, MLX5E_CHANNEL_NUM_STATES); struct mlx5e_params params; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h index e837c21d3d..07945e182b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h @@ -92,7 +92,7 @@ int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget); void mlx5e_free_rx_descs(struct mlx5e_rq *rq); void mlx5e_free_rx_missing_descs(struct mlx5e_rq *rq); -static inline bool mlx5e_rx_hw_stamp(struct hwtstamp_config *config) +static inline bool mlx5e_rx_hw_stamp(struct kernel_hwtstamp_config *config) { return config->rx_filter == HWTSTAMP_FILTER_ALL; } @@ -309,10 +309,7 @@ mlx5e_notify_hw(struct mlx5_wq_cyc *wq, u16 pc, void __iomem *uar_map, static inline void mlx5e_cq_arm(struct mlx5e_cq *cq) { - struct mlx5_core_cq *mcq; - - mcq = &cq->mcq; - mlx5_cq_arm(mcq, MLX5_CQ_DB_REQ_NOT, mcq->uar->map, cq->wq.cc); + mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, cq->uar->map, cq->wq.cc); } static inline struct mlx5e_sq_dma * diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index 1f9d012231..027c551873 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -178,7 +178,7 @@ static int mlx5e_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp) { const struct mlx5e_xdp_buff *_ctx = (void *)ctx; - if (unlikely(!mlx5e_rx_hw_stamp(_ctx->rq->tstamp))) + if (unlikely(!mlx5e_rx_hw_stamp(_ctx->rq->hwtstamp_config))) return -ENODATA; *timestamp = mlx5e_cqe_ts_to_ns(_ctx->rq->ptp_cyc2time, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c index d743e82336..5981c71cae 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c +++ 
b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c @@ -54,7 +54,7 @@ static void mlx5e_build_xsk_cparam(struct mlx5_core_dev *mdev, struct mlx5e_channel_param *cparam) { mlx5e_build_rq_param(mdev, params, xsk, &cparam->rq); - mlx5e_build_xdpsq_param(mdev, params, xsk, &cparam->xdp_sq); + mlx5e_build_xdpsq_param(mdev, params, &cparam->xdp_sq); } static int mlx5e_init_xsk_rq(struct mlx5e_channel *c, @@ -71,7 +71,7 @@ static int mlx5e_init_xsk_rq(struct mlx5e_channel *c, rq->pdev = c->pdev; rq->netdev = c->netdev; rq->priv = c->priv; - rq->tstamp = c->tstamp; + rq->hwtstamp_config = &c->priv->hwtstamp_config; rq->clock = mdev->clock; rq->icosq = &c->icosq; rq->ix = c->ix; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/fs_tcp.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/fs_tcp.c index 4f83e31727..1febdc5b81 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/fs_tcp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/fs_tcp.c @@ -138,7 +138,7 @@ struct mlx5_flow_handle *mlx5e_accel_fs_add_sk(struct mlx5e_flow_steering *fs, flow = mlx5_add_flow_rules(ft->t, spec, &flow_act, &dest, 1); if (IS_ERR(flow)) - fs_err(fs, "mlx5_add_flow_rules() failed, flow is %ld\n", PTR_ERR(flow)); + fs_err(fs, "mlx5_add_flow_rules() failed, flow is %pe\n", flow); out: kvfree(spec); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c index 77f61cd28a..9c7064187e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "en.h" #include "eswitch.h" @@ -259,9 +260,15 @@ static void mlx5e_ipsec_init_macs(struct mlx5e_ipsec_sa_entry *sa_entry, struct mlx5_accel_esp_xfrm_attrs *attrs) { struct mlx5_core_dev *mdev = mlx5e_ipsec_sa2dev(sa_entry); + struct mlx5e_ipsec_addr *addrs = &attrs->addrs; struct net_device *netdev = sa_entry->dev; + 
struct xfrm_state *x = sa_entry->x; + struct dst_entry *rt_dst_entry; + struct flowi4 fl4 = {}; + struct flowi6 fl6 = {}; struct neighbour *n; u8 addr[ETH_ALEN]; + struct rtable *rt; const void *pkey; u8 *dst, *src; @@ -274,18 +281,91 @@ static void mlx5e_ipsec_init_macs(struct mlx5e_ipsec_sa_entry *sa_entry, case XFRM_DEV_OFFLOAD_IN: src = attrs->dmac; dst = attrs->smac; - pkey = &attrs->addrs.saddr.a4; + + switch (addrs->family) { + case AF_INET: + fl4.flowi4_proto = x->sel.proto; + fl4.daddr = addrs->saddr.a4; + fl4.saddr = addrs->daddr.a4; + pkey = &addrs->saddr.a4; + break; + case AF_INET6: + fl6.flowi6_proto = x->sel.proto; + memcpy(fl6.daddr.s6_addr32, addrs->saddr.a6, 16); + memcpy(fl6.saddr.s6_addr32, addrs->daddr.a6, 16); + pkey = &addrs->saddr.a6; + break; + default: + return; + } break; case XFRM_DEV_OFFLOAD_OUT: src = attrs->smac; dst = attrs->dmac; - pkey = &attrs->addrs.daddr.a4; + switch (addrs->family) { + case AF_INET: + fl4.flowi4_proto = x->sel.proto; + fl4.daddr = addrs->daddr.a4; + fl4.saddr = addrs->saddr.a4; + pkey = &addrs->daddr.a4; + break; + case AF_INET6: + fl6.flowi6_proto = x->sel.proto; + memcpy(fl6.daddr.s6_addr32, addrs->daddr.a6, 16); + memcpy(fl6.saddr.s6_addr32, addrs->saddr.a6, 16); + pkey = &addrs->daddr.a6; + break; + default: + return; + } break; default: return; } ether_addr_copy(src, addr); + + /* Destination can refer to a routed network, so perform FIB lookup + * to resolve nexthop and get its MAC. Neighbour resolution is used as + * fallback. 
+ */ + switch (addrs->family) { + case AF_INET: + rt = ip_route_output_key(dev_net(netdev), &fl4); + if (IS_ERR(rt)) + goto neigh; + + if (rt->rt_type != RTN_UNICAST) { + ip_rt_put(rt); + goto neigh; + } + rt_dst_entry = &rt->dst; + break; + case AF_INET6: + if (!IS_ENABLED(CONFIG_IPV6) || + ip6_dst_lookup(dev_net(netdev), NULL, &rt_dst_entry, &fl6)) + goto neigh; + break; + default: + return; + } + + n = dst_neigh_lookup(rt_dst_entry, pkey); + if (!n) { + dst_release(rt_dst_entry); + goto neigh; + } + + neigh_ha_snapshot(addr, n, netdev); + ether_addr_copy(dst, addr); + if (attrs->dir == XFRM_DEV_OFFLOAD_OUT && + is_zero_ether_addr(addr)) + neigh_event_send(n, NULL); + dst_release(rt_dst_entry); + neigh_release(n); + return; + +neigh: n = neigh_lookup(&arp_tbl, pkey, netdev); if (!n) { n = neigh_create(&arp_tbl, pkey, netdev); @@ -350,7 +430,8 @@ void mlx5e_ipsec_build_accel_xfrm_attrs(struct mlx5e_ipsec_sa_entry *sa_entry, attrs->replay_esn.esn = sa_entry->esn_state.esn; attrs->replay_esn.esn_msb = sa_entry->esn_state.esn_msb; attrs->replay_esn.overlap = sa_entry->esn_state.overlap; - if (attrs->dir == XFRM_DEV_OFFLOAD_OUT) + if (attrs->dir == XFRM_DEV_OFFLOAD_OUT || + x->xso.type != XFRM_DEV_OFFLOAD_PACKET) goto skip_replay_window; switch (x->replay_esn->replay_window) { @@ -694,6 +775,7 @@ static int mlx5e_xfrm_add_state(struct net_device *dev, struct netlink_ext_ack *extack) { struct mlx5e_ipsec_sa_entry *sa_entry = NULL; + bool allow_tunnel_mode = false; struct mlx5e_ipsec *ipsec; struct mlx5e_priv *priv; gfp_t gfp; @@ -725,6 +807,21 @@ static int mlx5e_xfrm_add_state(struct net_device *dev, goto err_xfrm; } + err = mlx5_eswitch_block_mode(priv->mdev); + if (err) + goto unblock_ipsec; + + if (x->props.mode == XFRM_MODE_TUNNEL && + x->xso.type == XFRM_DEV_OFFLOAD_PACKET) { + allow_tunnel_mode = mlx5e_ipsec_fs_tunnel_allowed(sa_entry); + if (!allow_tunnel_mode) { + NL_SET_ERR_MSG_MOD(extack, + "Packet offload tunnel mode is disabled due to encap settings"); + 
err = -EINVAL; + goto unblock_mode; + } + } + /* check esn */ if (x->props.flags & XFRM_STATE_ESN) mlx5e_ipsec_update_esn_state(sa_entry); @@ -739,7 +836,7 @@ static int mlx5e_xfrm_add_state(struct net_device *dev, err = mlx5_ipsec_create_work(sa_entry); if (err) - goto unblock_ipsec; + goto unblock_encap; err = mlx5e_ipsec_create_dwork(sa_entry); if (err) @@ -754,14 +851,6 @@ static int mlx5e_xfrm_add_state(struct net_device *dev, if (err) goto err_hw_ctx; - if (x->props.mode == XFRM_MODE_TUNNEL && - x->xso.type == XFRM_DEV_OFFLOAD_PACKET && - !mlx5e_ipsec_fs_tunnel_enabled(sa_entry)) { - NL_SET_ERR_MSG_MOD(extack, "Packet offload tunnel mode is disabled due to encap settings"); - err = -EINVAL; - goto err_add_rule; - } - /* We use *_bh() variant because xfrm_timer_handler(), which runs * in softirq context, can reach our state delete logic and we need * xa_erase_bh() there. @@ -777,8 +866,7 @@ static int mlx5e_xfrm_add_state(struct net_device *dev, queue_delayed_work(ipsec->wq, &sa_entry->dwork->dwork, MLX5_IPSEC_RESCHED); - if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET && - x->props.mode == XFRM_MODE_TUNNEL) { + if (allow_tunnel_mode) { xa_lock_bh(&ipsec->sadb); __xa_set_mark(&ipsec->sadb, sa_entry->ipsec_obj_id, MLX5E_IPSEC_TUNNEL_SA); @@ -787,6 +875,11 @@ static int mlx5e_xfrm_add_state(struct net_device *dev, out: x->xso.offload_handle = (unsigned long)sa_entry; + if (allow_tunnel_mode) + mlx5_eswitch_unblock_encap(priv->mdev); + + mlx5_eswitch_unblock_mode(priv->mdev); + return 0; err_add_rule: @@ -799,6 +892,11 @@ release_work: if (sa_entry->work) kfree(sa_entry->work->data); kfree(sa_entry->work); +unblock_encap: + if (allow_tunnel_mode) + mlx5_eswitch_unblock_encap(priv->mdev); +unblock_mode: + mlx5_eswitch_unblock_mode(priv->mdev); unblock_ipsec: mlx5_eswitch_unblock_ipsec(priv->mdev); err_xfrm: diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h index ffcd0cdeb7..f8eaaf3796 100644 
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h @@ -185,6 +185,7 @@ struct mlx5e_ipsec_rx_create_attr { u32 family; int prio; int pol_level; + int pol_miss_level; int sa_level; int status_level; enum mlx5_flow_namespace_type chains_ns; @@ -318,7 +319,7 @@ void mlx5e_accel_ipsec_fs_del_rule(struct mlx5e_ipsec_sa_entry *sa_entry); int mlx5e_accel_ipsec_fs_add_pol(struct mlx5e_ipsec_pol_entry *pol_entry); void mlx5e_accel_ipsec_fs_del_pol(struct mlx5e_ipsec_pol_entry *pol_entry); void mlx5e_accel_ipsec_fs_modify(struct mlx5e_ipsec_sa_entry *sa_entry); -bool mlx5e_ipsec_fs_tunnel_enabled(struct mlx5e_ipsec_sa_entry *sa_entry); +bool mlx5e_ipsec_fs_tunnel_allowed(struct mlx5e_ipsec_sa_entry *sa_entry); int mlx5_ipsec_create_sa_ctx(struct mlx5e_ipsec_sa_entry *sa_entry); void mlx5_ipsec_free_sa_ctx(struct mlx5e_ipsec_sa_entry *sa_entry); @@ -341,6 +342,7 @@ void mlx5e_ipsec_build_accel_xfrm_attrs(struct mlx5e_ipsec_sa_entry *sa_entry, void mlx5e_ipsec_handle_mpv_event(int event, struct mlx5e_priv *slave_priv, struct mlx5e_priv *master_priv); void mlx5e_ipsec_send_event(struct mlx5e_priv *priv, int event); +void mlx5e_ipsec_disable_events(struct mlx5e_priv *priv); static inline struct mlx5_core_dev * mlx5e_ipsec_sa2dev(struct mlx5e_ipsec_sa_entry *sa_entry) @@ -386,6 +388,10 @@ static inline void mlx5e_ipsec_handle_mpv_event(int event, struct mlx5e_priv *sl static inline void mlx5e_ipsec_send_event(struct mlx5e_priv *priv, int event) { } + +static inline void mlx5e_ipsec_disable_events(struct mlx5e_priv *priv) +{ +} #endif #endif /* __MLX5E_IPSEC_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c index 98b6a3a623..91cfabc450 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c @@ -61,6 +61,7 @@ struct mlx5e_ipsec_rx { 
struct mlx5_flow_table *pol_miss_ft; struct mlx5_flow_handle *pol_miss_rule; u8 allow_tunnel_mode : 1; + u8 ttc_rules_added : 1; }; /* IPsec RX flow steering */ @@ -585,6 +586,20 @@ out: return err; } +static struct mlx5_flow_destination +ipsec_rx_decrypted_pkt_def_dest(struct mlx5_ttc_table *ttc, u32 family) +{ + struct mlx5_flow_destination dest; + + if (!mlx5_ttc_has_esp_flow_group(ttc)) + return mlx5_ttc_get_default_dest(ttc, family2tt(family)); + + dest.ft = mlx5_get_ttc_flow_table(ttc); + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + + return dest; +} + static void ipsec_rx_update_default_dest(struct mlx5e_ipsec_rx *rx, struct mlx5_flow_destination *old_dest, struct mlx5_flow_destination *new_dest) @@ -598,10 +613,10 @@ static void handle_ipsec_rx_bringup(struct mlx5e_ipsec *ipsec, u32 family) { struct mlx5e_ipsec_rx *rx = ipsec_rx(ipsec, family, XFRM_DEV_OFFLOAD_PACKET); struct mlx5_flow_namespace *ns = mlx5e_fs_get_ns(ipsec->fs, false); + struct mlx5_ttc_table *ttc = mlx5e_fs_get_ttc(ipsec->fs, false); struct mlx5_flow_destination old_dest, new_dest; - old_dest = mlx5_ttc_get_default_dest(mlx5e_fs_get_ttc(ipsec->fs, false), - family2tt(family)); + old_dest = ipsec_rx_decrypted_pkt_def_dest(ttc, family); mlx5_ipsec_fs_roce_rx_create(ipsec->mdev, ipsec->roce, ns, &old_dest, family, MLX5E_ACCEL_FS_ESP_FT_ROCE_LEVEL, MLX5E_NIC_PRIO); @@ -614,12 +629,12 @@ static void handle_ipsec_rx_bringup(struct mlx5e_ipsec *ipsec, u32 family) static void handle_ipsec_rx_cleanup(struct mlx5e_ipsec *ipsec, u32 family) { struct mlx5e_ipsec_rx *rx = ipsec_rx(ipsec, family, XFRM_DEV_OFFLOAD_PACKET); + struct mlx5_ttc_table *ttc = mlx5e_fs_get_ttc(ipsec->fs, false); struct mlx5_flow_destination old_dest, new_dest; old_dest.ft = mlx5_ipsec_fs_roce_ft_get(ipsec->roce, family); old_dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; - new_dest = mlx5_ttc_get_default_dest(mlx5e_fs_get_ttc(ipsec->fs, false), - family2tt(family)); + new_dest = ipsec_rx_decrypted_pkt_def_dest(ttc, 
family); ipsec_rx_update_default_dest(rx, &old_dest, &new_dest); mlx5_ipsec_fs_roce_rx_destroy(ipsec->roce, family, ipsec->mdev); @@ -669,10 +684,13 @@ static void ipsec_mpv_work_handler(struct work_struct *_work) complete(&work->master_priv->ipsec->comp); } -static void ipsec_rx_ft_disconnect(struct mlx5e_ipsec *ipsec, u32 family) +static void ipsec_rx_ft_disconnect(struct mlx5e_ipsec *ipsec, + struct mlx5e_ipsec_rx *rx, u32 family) { struct mlx5_ttc_table *ttc = mlx5e_fs_get_ttc(ipsec->fs, false); + if (rx->ttc_rules_added) + mlx5_ttc_destroy_ipsec_rules(ttc); mlx5_ttc_fwd_default_dest(ttc, family2tt(family)); } @@ -707,7 +725,7 @@ static void rx_destroy(struct mlx5_core_dev *mdev, struct mlx5e_ipsec *ipsec, { /* disconnect */ if (rx != ipsec->rx_esw) - ipsec_rx_ft_disconnect(ipsec, family); + ipsec_rx_ft_disconnect(ipsec, rx, family); mlx5_del_flow_rules(rx->sa.rule); mlx5_destroy_flow_group(rx->sa.group); @@ -747,6 +765,7 @@ static void ipsec_rx_create_attr_set(struct mlx5e_ipsec *ipsec, attr->family = family; attr->prio = MLX5E_NIC_PRIO; attr->pol_level = MLX5E_ACCEL_FS_POL_FT_LEVEL; + attr->pol_miss_level = MLX5E_ACCEL_FS_POL_MISS_FT_LEVEL; attr->sa_level = MLX5E_ACCEL_FS_ESP_FT_LEVEL; attr->status_level = MLX5E_ACCEL_FS_ESP_FT_ERR_LEVEL; attr->chains_ns = MLX5_FLOW_NAMESPACE_KERNEL; @@ -763,7 +782,7 @@ static int ipsec_rx_status_pass_dest_get(struct mlx5e_ipsec *ipsec, if (rx == ipsec->rx_esw) return mlx5_esw_ipsec_rx_status_pass_dest_get(ipsec, dest); - *dest = mlx5_ttc_get_default_dest(attr->ttc, family2tt(attr->family)); + *dest = ipsec_rx_decrypted_pkt_def_dest(attr->ttc, attr->family); err = mlx5_ipsec_fs_roce_rx_create(ipsec->mdev, ipsec->roce, attr->ns, dest, attr->family, MLX5E_ACCEL_FS_ESP_FT_ROCE_LEVEL, attr->prio); @@ -806,10 +825,16 @@ static void ipsec_rx_ft_connect(struct mlx5e_ipsec *ipsec, struct mlx5e_ipsec_rx_create_attr *attr) { struct mlx5_flow_destination dest = {}; + struct mlx5_ttc_table *ttc, *inner_ttc; dest.type = 
MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; dest.ft = rx->ft.sa; - mlx5_ttc_fwd_dest(attr->ttc, family2tt(attr->family), &dest); + if (mlx5_ttc_fwd_dest(attr->ttc, family2tt(attr->family), &dest)) + return; + + ttc = mlx5e_fs_get_ttc(ipsec->fs, false); + inner_ttc = mlx5e_fs_get_ttc(ipsec->fs, true); + rx->ttc_rules_added = !mlx5_ttc_create_ipsec_rules(ttc, inner_ttc); } static int ipsec_rx_chains_create_miss(struct mlx5e_ipsec *ipsec, @@ -833,7 +858,7 @@ static int ipsec_rx_chains_create_miss(struct mlx5e_ipsec *ipsec, ft_attr.max_fte = 1; ft_attr.autogroup.max_num_groups = 1; - ft_attr.level = attr->pol_level; + ft_attr.level = attr->pol_miss_level; ft_attr.prio = attr->prio; ft = mlx5_create_auto_grouped_flow_table(attr->ns, &ft_attr); @@ -1044,7 +1069,9 @@ static int rx_create(struct mlx5_core_dev *mdev, struct mlx5e_ipsec *ipsec, /* Create FT */ if (mlx5_ipsec_device_caps(mdev) & MLX5_IPSEC_CAP_TUNNEL) - rx->allow_tunnel_mode = mlx5_eswitch_block_encap(mdev); + rx->allow_tunnel_mode = + mlx5_eswitch_block_encap(mdev, rx == ipsec->rx_esw); + if (rx->allow_tunnel_mode) flags = MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT; ft = ipsec_ft_create(attr.ns, attr.sa_level, attr.prio, 1, 2, flags); @@ -1285,7 +1312,9 @@ static int tx_create(struct mlx5e_ipsec *ipsec, struct mlx5e_ipsec_tx *tx, goto err_status_rule; if (mlx5_ipsec_device_caps(mdev) & MLX5_IPSEC_CAP_TUNNEL) - tx->allow_tunnel_mode = mlx5_eswitch_block_encap(mdev); + tx->allow_tunnel_mode = + mlx5_eswitch_block_encap(mdev, tx == ipsec->tx_esw); + if (tx->allow_tunnel_mode) flags = MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT; ft = ipsec_ft_create(tx->ns, attr.sa_level, attr.prio, 1, 4, flags); @@ -1704,8 +1733,8 @@ static int setup_modify_header(struct mlx5e_ipsec *ipsec, int type, u32 val, u8 modify_hdr = mlx5_modify_header_alloc(mdev, ns_type, num_of_actions, action); if (IS_ERR(modify_hdr)) { - mlx5_core_err(mdev, "Failed to allocate modify_header %ld\n", - PTR_ERR(modify_hdr)); + mlx5_core_err(mdev, "Failed to allocate 
modify_header %pe\n", + modify_hdr); return PTR_ERR(modify_hdr); } @@ -2821,18 +2850,24 @@ void mlx5e_accel_ipsec_fs_modify(struct mlx5e_ipsec_sa_entry *sa_entry) memcpy(sa_entry, &sa_entry_shadow, sizeof(*sa_entry)); } -bool mlx5e_ipsec_fs_tunnel_enabled(struct mlx5e_ipsec_sa_entry *sa_entry) +bool mlx5e_ipsec_fs_tunnel_allowed(struct mlx5e_ipsec_sa_entry *sa_entry) { - struct mlx5_accel_esp_xfrm_attrs *attrs = &sa_entry->attrs; - struct mlx5e_ipsec_rx *rx; - struct mlx5e_ipsec_tx *tx; + struct mlx5e_ipsec *ipsec = sa_entry->ipsec; + struct xfrm_state *x = sa_entry->x; + bool from_fdb; - rx = ipsec_rx(sa_entry->ipsec, attrs->addrs.family, attrs->type); - tx = ipsec_tx(sa_entry->ipsec, attrs->type); - if (sa_entry->attrs.dir == XFRM_DEV_OFFLOAD_OUT) - return tx->allow_tunnel_mode; + if (x->xso.dir == XFRM_DEV_OFFLOAD_OUT) { + struct mlx5e_ipsec_tx *tx = ipsec_tx(ipsec, x->xso.type); - return rx->allow_tunnel_mode; + from_fdb = (tx == ipsec->tx_esw); + } else { + struct mlx5e_ipsec_rx *rx = ipsec_rx(ipsec, x->props.family, + x->xso.type); + + from_fdb = (rx == ipsec->rx_esw); + } + + return mlx5_eswitch_block_encap(ipsec->mdev, from_fdb); } void mlx5e_ipsec_handle_mpv_event(int event, struct mlx5e_priv *slave_priv, @@ -2858,9 +2893,30 @@ void mlx5e_ipsec_handle_mpv_event(int event, struct mlx5e_priv *slave_priv, void mlx5e_ipsec_send_event(struct mlx5e_priv *priv, int event) { - if (!priv->ipsec) - return; /* IPsec not supported */ + if (!priv->ipsec || mlx5_devcom_comp_get_size(priv->devcom) < 2) + return; /* IPsec not supported or no peers */ mlx5_devcom_send_event(priv->devcom, event, event, priv); wait_for_completion(&priv->ipsec->comp); } + +void mlx5e_ipsec_disable_events(struct mlx5e_priv *priv) +{ + struct mlx5_devcom_comp_dev *tmp = NULL; + struct mlx5e_priv *peer_priv; + + if (!priv->devcom) + return; + + if (!mlx5_devcom_for_each_peer_begin(priv->devcom)) + goto out; + + peer_priv = mlx5_devcom_get_next_peer_data(priv->devcom, &tmp); + if (peer_priv && 
peer_priv->ipsec) + complete_all(&peer_priv->ipsec->comp); + + mlx5_devcom_for_each_peer_end(priv->devcom); +out: + mlx5_devcom_unregister_component(priv->devcom); + priv->devcom = NULL; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c index 820debf3fb..ef7322d381 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c @@ -42,8 +42,7 @@ u32 mlx5_ipsec_device_caps(struct mlx5_core_dev *mdev) if (MLX5_CAP_IPSEC(mdev, ipsec_full_offload) && (mdev->priv.steering->mode == MLX5_FLOW_STEERING_MODE_DMFS || - (mdev->priv.steering->mode == MLX5_FLOW_STEERING_MODE_SMFS && - is_mdev_legacy_mode(mdev)))) { + is_mdev_legacy_mode(mdev))) { if (MLX5_CAP_FLOWTABLE_NIC_TX(mdev, reformat_add_esp_trasport) && MLX5_CAP_FLOWTABLE_NIC_RX(mdev, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c index 727fa7c185..6056106edc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c @@ -327,6 +327,10 @@ void mlx5e_ipsec_offload_handle_rx_skb(struct net_device *netdev, if (unlikely(!sa_entry)) { rcu_read_unlock(); atomic64_inc(&ipsec->sw_stats.ipsec_rx_drop_sadb_miss); + /* Clear secpath to prevent invalid dereference + * in downstream XFRM policy checks. 
+ */ + secpath_reset(skb); return; } xfrm_state_hold(sa_entry->x); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c index 6ab02f3fc2..528b04d4de 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c @@ -1676,7 +1676,7 @@ void mlx5e_macsec_tx_build_eseg(struct mlx5e_macsec *macsec, if (!fs_id) return; - eseg->flow_table_metadata = cpu_to_be32(MLX5_ETH_WQE_FT_META_MACSEC | fs_id << 2); + eseg->flow_table_metadata = cpu_to_be32(MLX5_MACSEC_TX_METADATA(fs_id)); } void mlx5e_macsec_offload_handle_rx_skb(struct net_device *netdev, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c index 6ed3a32b7e..5a2ac7b6f2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c @@ -30,6 +30,7 @@ * SOFTWARE. */ +#include "devlink.h" #include "en.h" #include "lib/crypto.h" @@ -140,9 +141,22 @@ err_close_tises: return err; } +static unsigned int +mlx5e_get_devlink_param_num_doorbells(struct mlx5_core_dev *dev) +{ + const u32 param_id = DEVLINK_PARAM_GENERIC_ID_NUM_DOORBELLS; + struct devlink *devlink = priv_to_devlink(dev); + union devlink_param_value val; + int err; + + err = devl_param_driverinit_value_get(devlink, param_id, &val); + return err ? 
MLX5_DEFAULT_NUM_DOORBELLS : val.vu32; +} + int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev, bool create_tises) { struct mlx5e_hw_objs *res = &mdev->mlx5e_res.hw_objs; + unsigned int num_doorbells, i; int err; err = mlx5_core_alloc_pd(mdev, &res->pdn); @@ -163,17 +177,30 @@ int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev, bool create_tises) goto err_dealloc_transport_domain; } - err = mlx5_alloc_bfreg(mdev, &res->bfreg, false, false); - if (err) { - mlx5_core_err(mdev, "alloc bfreg failed, %d\n", err); + num_doorbells = min(mlx5e_get_devlink_param_num_doorbells(mdev), + mlx5e_get_max_num_channels(mdev)); + res->bfregs = kcalloc(num_doorbells, sizeof(*res->bfregs), GFP_KERNEL); + if (!res->bfregs) { + err = -ENOMEM; goto err_destroy_mkey; } + for (i = 0; i < num_doorbells; i++) { + err = mlx5_alloc_bfreg(mdev, res->bfregs + i, false, false); + if (err) { + mlx5_core_warn(mdev, + "could only allocate %d/%d doorbells, err %d.\n", + i, num_doorbells, err); + break; + } + } + res->num_bfregs = i; + if (create_tises) { err = mlx5e_create_tises(mdev, res->tisn); if (err) { mlx5_core_err(mdev, "alloc tises failed, %d\n", err); - goto err_destroy_bfreg; + goto err_destroy_bfregs; } res->tisn_valid = true; } @@ -183,15 +210,17 @@ int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev, bool create_tises) mdev->mlx5e_res.dek_priv = mlx5_crypto_dek_init(mdev); if (IS_ERR(mdev->mlx5e_res.dek_priv)) { - mlx5_core_err(mdev, "crypto dek init failed, %ld\n", - PTR_ERR(mdev->mlx5e_res.dek_priv)); + mlx5_core_err(mdev, "crypto dek init failed, %pe\n", + mdev->mlx5e_res.dek_priv); mdev->mlx5e_res.dek_priv = NULL; } return 0; -err_destroy_bfreg: - mlx5_free_bfreg(mdev, &res->bfreg); +err_destroy_bfregs: + for (i = 0; i < res->num_bfregs; i++) + mlx5_free_bfreg(mdev, res->bfregs + i); + kfree(res->bfregs); err_destroy_mkey: mlx5_core_destroy_mkey(mdev, res->mkey); err_dealloc_transport_domain: @@ -209,52 +238,52 @@ void mlx5e_destroy_mdev_resources(struct 
mlx5_core_dev *mdev) mdev->mlx5e_res.dek_priv = NULL; if (res->tisn_valid) mlx5e_destroy_tises(mdev, res->tisn); - mlx5_free_bfreg(mdev, &res->bfreg); + for (unsigned int i = 0; i < res->num_bfregs; i++) + mlx5_free_bfreg(mdev, res->bfregs + i); + kfree(res->bfregs); mlx5_core_destroy_mkey(mdev, res->mkey); mlx5_core_dealloc_transport_domain(mdev, res->td.tdn); mlx5_core_dealloc_pd(mdev, res->pdn); memset(res, 0, sizeof(*res)); } -int mlx5e_refresh_tirs(struct mlx5e_priv *priv, bool enable_uc_lb, - bool enable_mc_lb) +int mlx5e_modify_tirs_lb(struct mlx5_core_dev *mdev, bool enable_uc_lb, + bool enable_mc_lb) { - struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_tir_builder *builder; struct mlx5e_tir *tir; - u8 lb_flags = 0; - int err = 0; - u32 tirn = 0; - int inlen; - void *in; + int err = 0; - inlen = MLX5_ST_SZ_BYTES(modify_tir_in); - in = kvzalloc(inlen, GFP_KERNEL); - if (!in) + builder = mlx5e_tir_builder_alloc(true); + if (!builder) return -ENOMEM; - if (enable_uc_lb) - lb_flags = MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST; - - if (enable_mc_lb) - lb_flags |= MLX5_TIRC_SELF_LB_BLOCK_BLOCK_MULTICAST; - - if (lb_flags) - MLX5_SET(modify_tir_in, in, ctx.self_lb_block, lb_flags); - - MLX5_SET(modify_tir_in, in, bitmask.self_lb_en, 1); + mlx5e_tir_builder_build_self_lb_block(builder, enable_uc_lb, + enable_mc_lb); mutex_lock(&mdev->mlx5e_res.hw_objs.td.list_lock); list_for_each_entry(tir, &mdev->mlx5e_res.hw_objs.td.tirs_list, list) { - tirn = tir->tirn; - err = mlx5_core_modify_tir(mdev, tirn, in); - if (err) + err = mlx5e_tir_modify(tir, builder); + if (err) { + mlx5_core_err(mdev, + "modify tir(0x%x) enable_lb uc(%d) mc(%d) failed, %d\n", + mlx5e_tir_get_tirn(tir), + enable_uc_lb, enable_mc_lb, err); break; + } } mutex_unlock(&mdev->mlx5e_res.hw_objs.td.list_lock); - kvfree(in); - if (err) - netdev_err(priv->netdev, "refresh tir(0x%x) failed, %d\n", tirn, err); + mlx5e_tir_builder_free(builder); return err; } + +int mlx5e_refresh_tirs(struct mlx5_core_dev 
*mdev, bool enable_uc_lb, + bool enable_mc_lb) +{ + if (MLX5_CAP_GEN(mdev, tis_tir_td_order)) + return 0; /* refresh not needed */ + + return mlx5e_modify_tirs_lb(mdev, enable_uc_lb, enable_mc_lb); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c index 8705cffc74..585ac619e5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c @@ -31,14 +31,15 @@ */ #include #include +#include #include "en.h" #include "en/port.h" #include "en/port_buffer.h" #define MLX5E_MAX_BW_ALLOC 100 /* Max percentage of BW allocation */ -#define MLX5E_100MB (100000) -#define MLX5E_1GB (1000000) +#define MLX5E_100MB_TO_KB (100 * MEGA / KILO) +#define MLX5E_1GB_TO_KB (GIGA / KILO) #define MLX5E_CEE_STATE_UP 1 #define MLX5E_CEE_STATE_DOWN 0 @@ -362,6 +363,7 @@ static int mlx5e_dcbnl_ieee_getpfc(struct net_device *dev, static int mlx5e_dcbnl_ieee_setpfc(struct net_device *dev, struct ieee_pfc *pfc) { + u8 buffer_ownership = MLX5_BUF_OWNERSHIP_UNKNOWN; struct mlx5e_priv *priv = netdev_priv(dev); struct mlx5_core_dev *mdev = priv->mdev; u32 old_cable_len = priv->dcbx.cable_len; @@ -389,7 +391,14 @@ static int mlx5e_dcbnl_ieee_setpfc(struct net_device *dev, if (MLX5_BUFFER_SUPPORTED(mdev)) { pfc_new.pfc_en = (changed & MLX5E_PORT_BUFFER_PFC) ? 
pfc->pfc_en : curr_pfc_en; - if (priv->dcbx.manual_buffer) + ret = mlx5_query_port_buffer_ownership(mdev, + &buffer_ownership); + if (ret) + netdev_err(dev, + "%s, Failed to get buffer ownership: %d\n", + __func__, ret); + + if (buffer_ownership == MLX5_BUF_OWNERSHIP_SW_OWNED) ret = mlx5e_port_manual_buffer_config(priv, changed, dev->mtu, &pfc_new, NULL, NULL); @@ -564,10 +573,10 @@ static int mlx5e_dcbnl_ieee_getmaxrate(struct net_device *netdev, for (i = 0; i <= mlx5_max_tc(mdev); i++) { switch (max_bw_unit[i]) { case MLX5_100_MBPS_UNIT: - maxrate->tc_maxrate[i] = max_bw_value[i] * MLX5E_100MB; + maxrate->tc_maxrate[i] = max_bw_value[i] * MLX5E_100MB_TO_KB; break; case MLX5_GBPS_UNIT: - maxrate->tc_maxrate[i] = max_bw_value[i] * MLX5E_1GB; + maxrate->tc_maxrate[i] = max_bw_value[i] * MLX5E_1GB_TO_KB; break; case MLX5_BW_NO_LIMIT: break; @@ -587,32 +596,55 @@ static int mlx5e_dcbnl_ieee_setmaxrate(struct net_device *netdev, struct mlx5_core_dev *mdev = priv->mdev; u8 max_bw_value[IEEE_8021QAZ_MAX_TCS]; u8 max_bw_unit[IEEE_8021QAZ_MAX_TCS]; - __u64 upper_limit_mbps = roundup(255 * MLX5E_100MB, MLX5E_1GB); + u64 upper_limit_100mbps; + u64 upper_limit_gbps; int i; + struct { + int scale; + const char *units_str; + } units[] = { + [MLX5_100_MBPS_UNIT] = { + .scale = 100, + .units_str = "Mbps", + }, + [MLX5_GBPS_UNIT] = { + .scale = 1, + .units_str = "Gbps", + }, + }; memset(max_bw_value, 0, sizeof(max_bw_value)); memset(max_bw_unit, 0, sizeof(max_bw_unit)); + upper_limit_100mbps = U8_MAX * MLX5E_100MB_TO_KB; + upper_limit_gbps = U8_MAX * MLX5E_1GB_TO_KB; for (i = 0; i <= mlx5_max_tc(mdev); i++) { if (!maxrate->tc_maxrate[i]) { max_bw_unit[i] = MLX5_BW_NO_LIMIT; continue; } - if (maxrate->tc_maxrate[i] < upper_limit_mbps) { + if (maxrate->tc_maxrate[i] <= upper_limit_100mbps) { max_bw_value[i] = div_u64(maxrate->tc_maxrate[i], - MLX5E_100MB); + MLX5E_100MB_TO_KB); max_bw_value[i] = max_bw_value[i] ? 
max_bw_value[i] : 1; max_bw_unit[i] = MLX5_100_MBPS_UNIT; - } else { + } else if (maxrate->tc_maxrate[i] <= upper_limit_gbps) { max_bw_value[i] = div_u64(maxrate->tc_maxrate[i], - MLX5E_1GB); + MLX5E_1GB_TO_KB); max_bw_unit[i] = MLX5_GBPS_UNIT; + } else { + netdev_err(netdev, + "tc_%d maxrate %llu Kbps exceeds limit %llu\n", + i, maxrate->tc_maxrate[i], + upper_limit_gbps); + return -EINVAL; } } for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { - netdev_dbg(netdev, "%s: tc_%d <=> max_bw %d Gbps\n", - __func__, i, max_bw_value[i]); + netdev_dbg(netdev, "%s: tc_%d <=> max_bw %u %s\n", __func__, i, + max_bw_value[i] * units[max_bw_unit[i]].scale, + units[max_bw_unit[i]].units_str); } return mlx5_modify_port_ets_rate_limit(mdev, max_bw_value, max_bw_unit); @@ -982,7 +1014,6 @@ static int mlx5e_dcbnl_setbuffer(struct net_device *dev, if (!changed) return 0; - priv->dcbx.manual_buffer = true; err = mlx5e_port_manual_buffer_config(priv, changed, dev->mtu, NULL, buffer_size, prio2buffer); return err; @@ -1250,7 +1281,6 @@ void mlx5e_dcbnl_initialize(struct mlx5e_priv *priv) priv->dcbx.cap |= DCB_CAP_DCBX_HOST; priv->dcbx.port_buff_cell_sz = mlx5e_query_port_buffers_cell_size(priv); - priv->dcbx.manual_buffer = false; priv->dcbx.cable_len = MLX5E_DEFAULT_CABLE_LEN; mlx5e_ets_init(priv); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c index 298bb74ec5..d1d629697e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c @@ -113,7 +113,7 @@ int mlx5e_dim_rx_change(struct mlx5e_rq *rq, bool enable) __set_bit(MLX5E_RQ_STATE_DIM, &rq->state); } else { __clear_bit(MLX5E_RQ_STATE_DIM, &rq->state); - + synchronize_net(); mlx5e_dim_disable(rq->dim); rq->dim = NULL; } @@ -140,7 +140,7 @@ int mlx5e_dim_tx_change(struct mlx5e_txqsq *sq, bool enable) __set_bit(MLX5E_SQ_STATE_DIM, &sq->state); } else { __clear_bit(MLX5E_SQ_STATE_DIM, &sq->state); - + synchronize_net(); 
mlx5e_dim_disable(sq->dim); sq->dim = NULL; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 8578f03783..12ecb949bc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -32,6 +32,7 @@ #include #include +#include #include "en.h" #include "en/channels.h" @@ -365,11 +366,6 @@ void mlx5e_ethtool_get_ringparam(struct mlx5e_priv *priv, param->tx_max_pending = 1 << MLX5E_PARAMS_MAXIMUM_LOG_SQ_SIZE; param->rx_pending = 1 << priv->channels.params.log_rq_mtu_frames; param->tx_pending = 1 << priv->channels.params.log_sq_size; - - kernel_param->tcp_data_split = - (priv->channels.params.packet_merge.type == MLX5E_PACKET_MERGE_SHAMPO) ? - ETHTOOL_TCP_DATA_SPLIT_ENABLED : - ETHTOOL_TCP_DATA_SPLIT_DISABLED; } static void mlx5e_get_ringparam(struct net_device *dev, @@ -382,6 +378,27 @@ static void mlx5e_get_ringparam(struct net_device *dev, mlx5e_ethtool_get_ringparam(priv, param, kernel_param); } +static bool mlx5e_ethtool_set_tcp_data_split(struct mlx5e_priv *priv, + u8 tcp_data_split, + struct netlink_ext_ack *extack) +{ + struct net_device *dev = priv->netdev; + + if (tcp_data_split == ETHTOOL_TCP_DATA_SPLIT_ENABLED && + !(dev->features & NETIF_F_GRO_HW)) { + NL_SET_ERR_MSG_MOD(extack, + "TCP-data-split is not supported when GRO HW is disabled"); + return false; + } + + /* Might need to disable HW-GRO if it was kept on due to hds. 
*/ + if (tcp_data_split == ETHTOOL_TCP_DATA_SPLIT_DISABLED && + dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED) + netdev_update_features(priv->netdev); + + return true; +} + int mlx5e_ethtool_set_ringparam(struct mlx5e_priv *priv, struct ethtool_ringparam *param, struct netlink_ext_ack *extack) @@ -440,6 +457,11 @@ static int mlx5e_set_ringparam(struct net_device *dev, { struct mlx5e_priv *priv = netdev_priv(dev); + if (!mlx5e_ethtool_set_tcp_data_split(priv, + kernel_param->tcp_data_split, + extack)) + return -EINVAL; + return mlx5e_ethtool_set_ringparam(priv, param, extack); } @@ -1458,61 +1480,58 @@ static u32 mlx5e_get_rxfh_indir_size(struct net_device *netdev) static int mlx5e_get_rxfh(struct net_device *netdev, struct ethtool_rxfh_param *rxfh) { struct mlx5e_priv *priv = netdev_priv(netdev); - u32 rss_context = rxfh->rss_context; bool symmetric; - int err; mutex_lock(&priv->state_lock); - err = mlx5e_rx_res_rss_get_rxfh(priv->rx_res, rss_context, - rxfh->indir, rxfh->key, &rxfh->hfunc, &symmetric); + mlx5e_rx_res_rss_get_rxfh(priv->rx_res, 0, rxfh->indir, rxfh->key, + &rxfh->hfunc, &symmetric); mutex_unlock(&priv->state_lock); - if (err) - return err; - if (symmetric) rxfh->input_xfrm = RXH_XFRM_SYM_OR_XOR; return 0; } -static int mlx5e_set_rxfh(struct net_device *dev, struct ethtool_rxfh_param *rxfh, +static int mlx5e_rxfh_hfunc_check(struct mlx5e_priv *priv, + const struct ethtool_rxfh_param *rxfh, + struct netlink_ext_ack *extack) +{ + unsigned int count; + + count = priv->channels.params.num_channels; + + if (rxfh->hfunc == ETH_RSS_HASH_XOR) { + unsigned int xor8_max_channels = mlx5e_rqt_max_num_channels_allowed_for_xor8(); + + if (count > xor8_max_channels) { + NL_SET_ERR_MSG_FMT_MOD( + extack, + "Number of channels (%u) exceeds the max for XOR8 RSS (%u)", + count, xor8_max_channels); + return -EINVAL; + } + } + + return 0; +} + +static int mlx5e_set_rxfh(struct net_device *dev, + struct ethtool_rxfh_param *rxfh, struct netlink_ext_ack *extack) { 
bool symmetric = rxfh->input_xfrm == RXH_XFRM_SYM_OR_XOR; struct mlx5e_priv *priv = netdev_priv(dev); - u32 *rss_context = &rxfh->rss_context; u8 hfunc = rxfh->hfunc; - unsigned int count; int err; mutex_lock(&priv->state_lock); - count = priv->channels.params.num_channels; - - if (hfunc == ETH_RSS_HASH_XOR) { - unsigned int xor8_max_channels = mlx5e_rqt_max_num_channels_allowed_for_xor8(); - - if (count > xor8_max_channels) { - err = -EINVAL; - netdev_err(priv->netdev, "%s: Cannot set RSS hash function to XOR, current number of channels (%d) exceeds the maximum allowed for XOR8 RSS hfunc (%d)\n", - __func__, count, xor8_max_channels); - goto unlock; - } - } - - if (*rss_context && rxfh->rss_delete) { - err = mlx5e_rx_res_rss_destroy(priv->rx_res, *rss_context); + err = mlx5e_rxfh_hfunc_check(priv, rxfh, extack); + if (err) goto unlock; - } - if (*rss_context == ETH_RXFH_CONTEXT_ALLOC) { - err = mlx5e_rx_res_rss_init(priv->rx_res, rss_context, count); - if (err) - goto unlock; - } - - err = mlx5e_rx_res_rss_set_rxfh(priv->rx_res, *rss_context, + err = mlx5e_rx_res_rss_set_rxfh(priv->rx_res, rxfh->rss_context, rxfh->indir, rxfh->key, hfunc == ETH_RSS_HASH_NO_CHANGE ? NULL : &hfunc, rxfh->input_xfrm == RXH_XFRM_NO_CHANGE ? 
NULL : &symmetric); @@ -1522,6 +1541,86 @@ unlock: return err; } +static int mlx5e_create_rxfh_context(struct net_device *dev, + struct ethtool_rxfh_context *ctx, + const struct ethtool_rxfh_param *rxfh, + struct netlink_ext_ack *extack) +{ + bool symmetric = rxfh->input_xfrm == RXH_XFRM_SYM_OR_XOR; + struct mlx5e_priv *priv = netdev_priv(dev); + u8 hfunc = rxfh->hfunc; + int err; + + mutex_lock(&priv->state_lock); + + err = mlx5e_rxfh_hfunc_check(priv, rxfh, extack); + if (err) + goto unlock; + + err = mlx5e_rx_res_rss_init(priv->rx_res, rxfh->rss_context, + priv->channels.params.num_channels); + if (err) + goto unlock; + + err = mlx5e_rx_res_rss_set_rxfh(priv->rx_res, rxfh->rss_context, + rxfh->indir, rxfh->key, + hfunc == ETH_RSS_HASH_NO_CHANGE ? NULL : &hfunc, + rxfh->input_xfrm == RXH_XFRM_NO_CHANGE ? NULL : &symmetric); + if (err) + goto unlock; + + mlx5e_rx_res_rss_get_rxfh(priv->rx_res, rxfh->rss_context, + ethtool_rxfh_context_indir(ctx), + ethtool_rxfh_context_key(ctx), + &ctx->hfunc, &symmetric); + if (symmetric) + ctx->input_xfrm = RXH_XFRM_SYM_OR_XOR; + +unlock: + mutex_unlock(&priv->state_lock); + return err; +} + +static int mlx5e_modify_rxfh_context(struct net_device *dev, + struct ethtool_rxfh_context *ctx, + const struct ethtool_rxfh_param *rxfh, + struct netlink_ext_ack *extack) +{ + bool symmetric = rxfh->input_xfrm == RXH_XFRM_SYM_OR_XOR; + struct mlx5e_priv *priv = netdev_priv(dev); + u8 hfunc = rxfh->hfunc; + int err; + + mutex_lock(&priv->state_lock); + + err = mlx5e_rxfh_hfunc_check(priv, rxfh, extack); + if (err) + goto unlock; + + err = mlx5e_rx_res_rss_set_rxfh(priv->rx_res, rxfh->rss_context, + rxfh->indir, rxfh->key, + hfunc == ETH_RSS_HASH_NO_CHANGE ? NULL : &hfunc, + rxfh->input_xfrm == RXH_XFRM_NO_CHANGE ? 
NULL : &symmetric); + +unlock: + mutex_unlock(&priv->state_lock); + return err; +} + +static int mlx5e_remove_rxfh_context(struct net_device *dev, + struct ethtool_rxfh_context *ctx, + u32 rss_context, + struct netlink_ext_ack *extack) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + int err; + + mutex_lock(&priv->state_lock); + err = mlx5e_rx_res_rss_destroy(priv->rx_res, rss_context); + mutex_unlock(&priv->state_lock); + return err; +} + #define MLX5E_PFC_PREVEN_AUTO_TOUT_MSEC 100 #define MLX5E_PFC_PREVEN_TOUT_MAX_MSEC 8000 #define MLX5E_PFC_PREVEN_MINOR_PRECENT 85 @@ -1686,6 +1785,7 @@ int mlx5e_ethtool_get_ts_info(struct mlx5e_priv *priv, return 0; info->so_timestamping = SOF_TIMESTAMPING_TX_HARDWARE | + SOF_TIMESTAMPING_TX_SOFTWARE | SOF_TIMESTAMPING_RX_HARDWARE | SOF_TIMESTAMPING_RAW_HARDWARE; @@ -1926,7 +2026,7 @@ static int mlx5e_get_module_info(struct net_device *netdev, int size_read = 0; u8 data[4] = {0}; - size_read = mlx5_query_module_eeprom(dev, 0, 2, data); + size_read = mlx5_query_module_eeprom(dev, 0, 2, data, NULL); if (size_read < 2) return -EIO; @@ -1968,6 +2068,7 @@ static int mlx5e_get_module_eeprom(struct net_device *netdev, struct mlx5_core_dev *mdev = priv->mdev; int offset = ee->offset; int size_read; + u8 status = 0; int i = 0; if (!ee->len) @@ -1977,15 +2078,15 @@ static int mlx5e_get_module_eeprom(struct net_device *netdev, while (i < ee->len) { size_read = mlx5_query_module_eeprom(mdev, offset, ee->len - i, - data + i); - + data + i, &status); if (!size_read) /* Done reading */ return 0; if (size_read < 0) { - netdev_err(priv->netdev, "%s: mlx5_query_eeprom failed:0x%x\n", - __func__, size_read); + netdev_err(netdev, + "%s: mlx5_query_eeprom failed:0x%x, status %u\n", + __func__, size_read, status); return size_read; } @@ -2005,6 +2106,7 @@ static int mlx5e_get_module_eeprom_by_page(struct net_device *netdev, struct mlx5_core_dev *mdev = priv->mdev; u8 *data = page_data->data; int size_read; + u8 status = 0; int i = 0; if 
(!page_data->length) @@ -2018,20 +2120,19 @@ static int mlx5e_get_module_eeprom_by_page(struct net_device *netdev, query.page = page_data->page; while (i < page_data->length) { query.size = page_data->length - i; - size_read = mlx5_query_module_eeprom_by_page(mdev, &query, data + i); + size_read = mlx5_query_module_eeprom_by_page(mdev, &query, + data + i, &status); /* Done reading, return how many bytes was read */ if (!size_read) return i; - if (size_read == -EINVAL) - return -EINVAL; if (size_read < 0) { NL_SET_ERR_MSG_FMT_MOD( extack, - "Query module eeprom by page failed, read %u bytes, err %d", - i, size_read); - return i; + "Query module eeprom by page failed, read %u bytes, err %d, status %u", + i, size_read, status); + return size_read; } i += size_read; @@ -2177,7 +2278,7 @@ static int set_pflag_rx_cqe_compress(struct net_device *netdev, if (!MLX5_CAP_GEN(mdev, cqe_compression)) return -EOPNOTSUPP; - rx_filter = priv->tstamp.rx_filter != HWTSTAMP_FILTER_NONE; + rx_filter = priv->hwtstamp_config.rx_filter != HWTSTAMP_FILTER_NONE; err = mlx5e_modify_rx_cqe_compression_locked(priv, enable, rx_filter); if (err) return err; @@ -2192,7 +2293,6 @@ static int set_pflag_rx_striding_rq(struct net_device *netdev, bool enable) struct mlx5e_priv *priv = netdev_priv(netdev); struct mlx5_core_dev *mdev = priv->mdev; struct mlx5e_params new_params; - int err; if (enable) { /* Checking the regular RQ here; mlx5e_validate_xsk_param called @@ -2213,14 +2313,7 @@ static int set_pflag_rx_striding_rq(struct net_device *netdev, bool enable) MLX5E_SET_PFLAG(&new_params, MLX5E_PFLAG_RX_STRIDING_RQ, enable); mlx5e_set_rq_type(mdev, &new_params); - err = mlx5e_safe_switch_params(priv, &new_params, NULL, NULL, true); - if (err) - return err; - - /* update XDP supported features */ - mlx5e_set_xdp_feature(netdev); - - return 0; + return mlx5e_safe_switch_params(priv, &new_params, NULL, NULL, true); } static int set_pflag_rx_no_csum_complete(struct net_device *netdev, bool enable) @@ 
-2398,21 +2491,18 @@ static int mlx5e_set_rxfh_fields(struct net_device *dev, return mlx5e_ethtool_set_rxfh_fields(priv, cmd, extack); } +static u32 mlx5e_get_rx_ring_count(struct net_device *dev) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + return priv->channels.params.num_channels; +} + static int mlx5e_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *info, u32 *rule_locs) { struct mlx5e_priv *priv = netdev_priv(dev); - /* ETHTOOL_GRXRINGS is needed by ethtool -x which is not part - * of rxnfc. We keep this logic out of mlx5e_ethtool_get_rxnfc, - * to avoid breaking "ethtool -x" when mlx5e_ethtool_get_rxnfc - * is compiled out via CONFIG_MLX5_EN_RXNFC=n. - */ - if (info->cmd == ETHTOOL_GRXRINGS) { - info->data = priv->channels.params.num_channels; - return 0; - } - return mlx5e_ethtool_get_rxnfc(priv, info, rule_locs); } @@ -2636,14 +2726,15 @@ static void mlx5e_get_ts_stats(struct net_device *netdev, const struct ethtool_ops mlx5e_ethtool_ops = { .cap_link_lanes_supported = true, - .cap_rss_ctx_supported = true, .rxfh_per_ctx_fields = true, .rxfh_per_ctx_key = true, + .rxfh_max_num_contexts = MLX5E_MAX_NUM_RSS, .supported_coalesce_params = ETHTOOL_COALESCE_USECS | ETHTOOL_COALESCE_MAX_FRAMES | ETHTOOL_COALESCE_USE_ADAPTIVE | ETHTOOL_COALESCE_USE_CQE, .supported_input_xfrm = RXH_XFRM_SYM_OR_XOR, + .supported_ring_params = ETHTOOL_RING_USE_TCP_DATA_SPLIT, .get_drvinfo = mlx5e_get_drvinfo, .get_link = ethtool_op_get_link, .get_link_ext_state = mlx5e_get_link_ext_state, @@ -2666,8 +2757,12 @@ const struct ethtool_ops mlx5e_ethtool_ops = { .set_rxfh = mlx5e_set_rxfh, .get_rxfh_fields = mlx5e_get_rxfh_fields, .set_rxfh_fields = mlx5e_set_rxfh_fields, + .create_rxfh_context = mlx5e_create_rxfh_context, + .modify_rxfh_context = mlx5e_modify_rxfh_context, + .remove_rxfh_context = mlx5e_remove_rxfh_context, .get_rxnfc = mlx5e_get_rxnfc, .set_rxnfc = mlx5e_set_rxnfc, + .get_rx_ring_count = mlx5e_get_rx_ring_count, .get_tunable = mlx5e_get_tunable, 
.set_tunable = mlx5e_set_tunable, .get_pause_stats = mlx5e_get_pause_stats, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c index 05058710d2..b18ef92837 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c @@ -776,7 +776,7 @@ static int mlx5e_create_promisc_table(struct mlx5e_flow_steering *fs) ft_attr.max_fte = MLX5E_PROMISC_TABLE_SIZE; ft_attr.autogroup.max_num_groups = 1; ft_attr.level = MLX5E_PROMISC_FT_LEVEL; - ft_attr.prio = MLX5E_NIC_PRIO; + ft_attr.prio = MLX5E_PROMISC_PRIO; ft->t = mlx5_create_auto_grouped_flow_table(fs->ns, &ft_attr); if (IS_ERR(ft->t)) { @@ -901,6 +901,9 @@ static void mlx5e_set_inner_ttc_params(struct mlx5e_flow_steering *fs, ft_attr->prio = MLX5E_NIC_PRIO; for (tt = 0; tt < MLX5_NUM_TT; tt++) { + if (mlx5_ttc_is_decrypted_esp_tt(tt)) + continue; + ttc_params->dests[tt].type = MLX5_FLOW_DESTINATION_TYPE_TIR; ttc_params->dests[tt].tir_num = tt == MLX5_TT_ANY ? 
@@ -910,9 +913,17 @@ static void mlx5e_set_inner_ttc_params(struct mlx5e_flow_steering *fs, } } +static bool mlx5e_ipsec_rss_supported(struct mlx5_core_dev *mdev) +{ + return MLX5_CAP_NIC_RX_FT_FIELD_SUPPORT_2(mdev, ipsec_next_header) && + MLX5_CAP_NIC_RX_FT_FIELD_SUPPORT_2(mdev, outer_l4_type_ext) && + MLX5_CAP_NIC_RX_FT_FIELD_SUPPORT_2(mdev, inner_l4_type_ext); +} + void mlx5e_set_ttc_params(struct mlx5e_flow_steering *fs, struct mlx5e_rx_res *rx_res, - struct ttc_params *ttc_params, bool tunnel) + struct ttc_params *ttc_params, bool tunnel, + bool ipsec_rss) { struct mlx5_flow_table_attr *ft_attr = &ttc_params->ft_attr; @@ -923,7 +934,13 @@ void mlx5e_set_ttc_params(struct mlx5e_flow_steering *fs, ft_attr->level = MLX5E_TTC_FT_LEVEL; ft_attr->prio = MLX5E_NIC_PRIO; + ttc_params->ipsec_rss = ipsec_rss && + mlx5e_ipsec_rss_supported(fs->mdev); + for (tt = 0; tt < MLX5_NUM_TT; tt++) { + if (mlx5_ttc_is_decrypted_esp_tt(tt)) + continue; + ttc_params->dests[tt].type = MLX5_FLOW_DESTINATION_TYPE_TIR; ttc_params->dests[tt].tir_num = tt == MLX5_TT_ANY ? 
@@ -1289,7 +1306,7 @@ int mlx5e_create_ttc_table(struct mlx5e_flow_steering *fs, { struct ttc_params ttc_params = {}; - mlx5e_set_ttc_params(fs, rx_res, &ttc_params, true); + mlx5e_set_ttc_params(fs, rx_res, &ttc_params, true, true); fs->ttc = mlx5_create_ttc_table(fs->mdev, &ttc_params); return PTR_ERR_OR_ZERO(fs->ttc); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c index 79916f1abd..63bdef5b4b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c @@ -704,7 +704,7 @@ static int validate_flow(struct mlx5e_priv *priv, num_tuples += ret; break; default: - return -ENOTSUPP; + return -EOPNOTSUPP; } if ((fs->flow_type & FLOW_EXT)) { ret = validate_vlan(fs); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 9bd166f489..47416e5437 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -39,7 +39,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -74,10 +76,12 @@ #include "en/trap.h" #include "lib/devcom.h" #include "lib/sd.h" +#include "en/pcie_cong_event.h" static bool mlx5e_hw_gro_supported(struct mlx5_core_dev *mdev) { - if (!MLX5_CAP_GEN(mdev, shampo)) + if (!MLX5_CAP_GEN(mdev, shampo) || + !MLX5_CAP_SHAMPO(mdev, shampo_header_split_data_merge)) return false; /* Our HW-GRO implementation relies on "KSM Mkey" for @@ -228,13 +232,17 @@ static int mlx5e_devcom_event_mpv(int event, void *my_data, void *event_data) static int mlx5e_devcom_init_mpv(struct mlx5e_priv *priv, u64 *data) { + struct mlx5_devcom_match_attr attr = { + .key.val = *data, + }; + priv->devcom = mlx5_devcom_register_component(priv->mdev->priv.devc, MLX5_DEVCOM_MPV, - *data, + &attr, mlx5e_devcom_event_mpv, priv); - if (IS_ERR(priv->devcom)) - return PTR_ERR(priv->devcom); 
+ if (!priv->devcom) + return -EINVAL; if (mlx5_core_is_mp_master(priv->mdev)) { mlx5_devcom_send_event(priv->devcom, MPV_DEVCOM_MASTER_UP, @@ -247,7 +255,7 @@ static int mlx5e_devcom_init_mpv(struct mlx5e_priv *priv, u64 *data) static void mlx5e_devcom_cleanup_mpv(struct mlx5e_priv *priv) { - if (IS_ERR_OR_NULL(priv->devcom)) + if (!priv->devcom) return; if (mlx5_core_is_mp_master(priv->mdev)) { @@ -257,6 +265,7 @@ static void mlx5e_devcom_cleanup_mpv(struct mlx5e_priv *priv) } mlx5_devcom_unregister_component(priv->devcom); + priv->devcom = NULL; } static int blocking_event(struct notifier_block *nb, unsigned long event, void *data) @@ -330,47 +339,6 @@ static inline void mlx5e_build_umr_wqe(struct mlx5e_rq *rq, ucseg->mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_FREE); } -static int mlx5e_rq_shampo_hd_alloc(struct mlx5e_rq *rq, int node) -{ - rq->mpwqe.shampo = kvzalloc_node(sizeof(*rq->mpwqe.shampo), - GFP_KERNEL, node); - if (!rq->mpwqe.shampo) - return -ENOMEM; - return 0; -} - -static void mlx5e_rq_shampo_hd_free(struct mlx5e_rq *rq) -{ - kvfree(rq->mpwqe.shampo); -} - -static int mlx5e_rq_shampo_hd_info_alloc(struct mlx5e_rq *rq, int node) -{ - struct mlx5e_shampo_hd *shampo = rq->mpwqe.shampo; - - shampo->bitmap = bitmap_zalloc_node(shampo->hd_per_wq, GFP_KERNEL, - node); - shampo->pages = kvzalloc_node(array_size(shampo->hd_per_wq, - sizeof(*shampo->pages)), - GFP_KERNEL, node); - if (!shampo->bitmap || !shampo->pages) - goto err_nomem; - - return 0; - -err_nomem: - bitmap_free(shampo->bitmap); - kvfree(shampo->pages); - - return -ENOMEM; -} - -static void mlx5e_rq_shampo_hd_info_free(struct mlx5e_rq *rq) -{ - bitmap_free(rq->mpwqe.shampo->bitmap); - kvfree(rq->mpwqe.shampo->pages); -} - static int mlx5e_rq_alloc_mpwqe_info(struct mlx5e_rq *rq, int node) { int wq_sz = mlx5_wq_ll_get_size(&rq->mpwqe.wq); @@ -583,19 +551,26 @@ static int mlx5e_create_rq_umr_mkey(struct mlx5_core_dev *mdev, struct mlx5e_rq } static int mlx5e_create_rq_hd_umr_mkey(struct 
mlx5_core_dev *mdev, - struct mlx5e_rq *rq) + u16 hd_per_wq, __be32 *umr_mkey) { u32 max_ksm_size = BIT(MLX5_CAP_GEN(mdev, log_max_klm_list_size)); + u32 mkey; + int err; - if (max_ksm_size < rq->mpwqe.shampo->hd_per_wq) { + if (max_ksm_size < hd_per_wq) { mlx5_core_err(mdev, "max ksm list size 0x%x is smaller than shampo header buffer list size 0x%x\n", - max_ksm_size, rq->mpwqe.shampo->hd_per_wq); + max_ksm_size, hd_per_wq); return -EINVAL; } - return mlx5e_create_umr_ksm_mkey(mdev, rq->mpwqe.shampo->hd_per_wq, - MLX5E_SHAMPO_LOG_HEADER_ENTRY_SIZE, - &rq->mpwqe.shampo->mkey); + err = mlx5e_create_umr_ksm_mkey(mdev, hd_per_wq, + MLX5E_SHAMPO_LOG_HEADER_ENTRY_SIZE, + &mkey); + if (err) + return err; + + *umr_mkey = cpu_to_be32(mkey); + return 0; } static void mlx5e_init_frags_partition(struct mlx5e_rq *rq) @@ -706,6 +681,27 @@ static void mlx5e_rq_err_cqe_work(struct work_struct *recover_work) mlx5e_reporter_rq_cqe_err(rq); } +static void mlx5e_rq_timeout_work(struct work_struct *timeout_work) +{ + struct mlx5e_rq *rq = container_of(timeout_work, + struct mlx5e_rq, + rx_timeout_work); + + /* Acquire netdev instance lock to synchronize with channel close and + * reopen flows. Either successfully obtain the lock, or detect that + * channels are closing for another reason, making this work no longer + * necessary. 
+ */ + while (!netdev_trylock(rq->netdev)) { + if (!test_bit(MLX5E_STATE_CHANNELS_ACTIVE, &rq->priv->state)) + return; + msleep(20); + } + + mlx5e_reporter_rx_timeout(rq); + netdev_unlock(rq->netdev); +} + static int mlx5e_alloc_mpwqe_rq_drop_page(struct mlx5e_rq *rq) { rq->wqe_overflow.page = alloc_page(GFP_KERNEL); @@ -738,7 +734,7 @@ static int mlx5e_init_rxq_rq(struct mlx5e_channel *c, struct mlx5e_params *param rq->pdev = c->pdev; rq->netdev = c->netdev; rq->priv = c->priv; - rq->tstamp = c->tstamp; + rq->hwtstamp_config = &c->priv->hwtstamp_config; rq->clock = mdev->clock; rq->icosq = &c->icosq; rq->ix = c->ix; @@ -757,6 +753,40 @@ static int mlx5e_init_rxq_rq(struct mlx5e_channel *c, struct mlx5e_params *param xdp_frag_size); } +static int mlx5e_rq_shampo_hd_info_alloc(struct mlx5e_rq *rq, u16 hd_per_wq, + int node) +{ + struct mlx5e_shampo_hd *shampo = rq->mpwqe.shampo; + + shampo->hd_per_wq = hd_per_wq; + + shampo->bitmap = bitmap_zalloc_node(hd_per_wq, GFP_KERNEL, node); + shampo->pages = kvzalloc_node(array_size(hd_per_wq, + sizeof(*shampo->pages)), + GFP_KERNEL, node); + if (!shampo->bitmap || !shampo->pages) + goto err_nomem; + + return 0; + +err_nomem: + kvfree(shampo->pages); + bitmap_free(shampo->bitmap); + + return -ENOMEM; +} + +static void mlx5e_rq_shampo_hd_info_free(struct mlx5e_rq *rq) +{ + kvfree(rq->mpwqe.shampo->pages); + bitmap_free(rq->mpwqe.shampo->bitmap); +} + +static bool mlx5_rq_needs_separate_hd_pool(struct mlx5e_rq *rq) +{ + return false; +} + static int mlx5_rq_shampo_alloc(struct mlx5_core_dev *mdev, struct mlx5e_params *params, struct mlx5e_rq_param *rqp, @@ -764,42 +794,94 @@ static int mlx5_rq_shampo_alloc(struct mlx5_core_dev *mdev, u32 *pool_size, int node) { + void *wqc = MLX5_ADDR_OF(rqc, rqp->rqc, wq); + u8 log_hd_per_page, log_hd_entry_size; + u16 hd_per_wq, hd_per_wqe; + u32 hd_pool_size; + int wq_size; int err; if (!test_bit(MLX5E_RQ_STATE_SHAMPO, &rq->state)) return 0; - err = mlx5e_rq_shampo_hd_alloc(rq, node); + + 
rq->mpwqe.shampo = kvzalloc_node(sizeof(*rq->mpwqe.shampo), + GFP_KERNEL, node); + if (!rq->mpwqe.shampo) + return -ENOMEM; + + /* split headers data structures */ + hd_per_wq = mlx5e_shampo_hd_per_wq(mdev, params, rqp); + err = mlx5e_rq_shampo_hd_info_alloc(rq, hd_per_wq, node); if (err) - goto out; - rq->mpwqe.shampo->hd_per_wq = - mlx5e_shampo_hd_per_wq(mdev, params, rqp); - err = mlx5e_create_rq_hd_umr_mkey(mdev, rq); + goto err_shampo_hd_info_alloc; + + err = mlx5e_create_rq_hd_umr_mkey(mdev, hd_per_wq, + &rq->mpwqe.shampo->mkey_be); if (err) - goto err_shampo_hd; - err = mlx5e_rq_shampo_hd_info_alloc(rq, node); - if (err) - goto err_shampo_info; + goto err_umr_mkey; + + hd_per_wqe = mlx5e_shampo_hd_per_wqe(mdev, params, rqp); + wq_size = BIT(MLX5_GET(wq, wqc, log_wq_sz)); + + BUILD_BUG_ON(MLX5E_SHAMPO_LOG_MAX_HEADER_ENTRY_SIZE > PAGE_SHIFT); + if (hd_per_wqe >= MLX5E_SHAMPO_WQ_HEADER_PER_PAGE) { + log_hd_per_page = MLX5E_SHAMPO_LOG_WQ_HEADER_PER_PAGE; + log_hd_entry_size = MLX5E_SHAMPO_LOG_MAX_HEADER_ENTRY_SIZE; + } else { + log_hd_per_page = order_base_2(hd_per_wqe); + log_hd_entry_size = order_base_2(PAGE_SIZE / hd_per_wqe); + } + + rq->mpwqe.shampo->hd_per_wqe = hd_per_wqe; + rq->mpwqe.shampo->hd_per_page = BIT(log_hd_per_page); + rq->mpwqe.shampo->log_hd_per_page = log_hd_per_page; + rq->mpwqe.shampo->log_hd_entry_size = log_hd_entry_size; + + hd_pool_size = (hd_per_wqe * wq_size) >> log_hd_per_page; + + if (mlx5_rq_needs_separate_hd_pool(rq)) { + /* Separate page pool for shampo headers */ + struct page_pool_params pp_params = { }; + + pp_params.order = 0; + pp_params.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV; + pp_params.pool_size = hd_pool_size; + pp_params.nid = node; + pp_params.dev = rq->pdev; + pp_params.napi = rq->cq.napi; + pp_params.netdev = rq->netdev; + pp_params.dma_dir = rq->buff.map_dir; + pp_params.max_len = PAGE_SIZE; + + rq->hd_page_pool = page_pool_create(&pp_params); + if (IS_ERR(rq->hd_page_pool)) { + err = 
PTR_ERR(rq->hd_page_pool); + rq->hd_page_pool = NULL; + goto err_hds_page_pool; + } + } else { + /* Common page pool, reserve space for headers. */ + *pool_size += hd_pool_size; + rq->hd_page_pool = NULL; + } + + /* gro only data structures */ rq->hw_gro_data = kvzalloc_node(sizeof(*rq->hw_gro_data), GFP_KERNEL, node); if (!rq->hw_gro_data) { err = -ENOMEM; goto err_hw_gro_data; } - rq->mpwqe.shampo->key = - cpu_to_be32(rq->mpwqe.shampo->mkey); - rq->mpwqe.shampo->hd_per_wqe = - mlx5e_shampo_hd_per_wqe(mdev, params, rqp); - rq->mpwqe.shampo->pages_per_wq = - rq->mpwqe.shampo->hd_per_wq / MLX5E_SHAMPO_WQ_HEADER_PER_PAGE; - *pool_size += rq->mpwqe.shampo->pages_per_wq; + return 0; err_hw_gro_data: + page_pool_destroy(rq->hd_page_pool); +err_hds_page_pool: + mlx5_core_destroy_mkey(mdev, be32_to_cpu(rq->mpwqe.shampo->mkey_be)); +err_umr_mkey: mlx5e_rq_shampo_hd_info_free(rq); -err_shampo_info: - mlx5_core_destroy_mkey(mdev, rq->mpwqe.shampo->mkey); -err_shampo_hd: - mlx5e_rq_shampo_hd_free(rq); -out: +err_shampo_hd_info_alloc: + kvfree(rq->mpwqe.shampo); return err; } @@ -809,9 +891,12 @@ static void mlx5e_rq_free_shampo(struct mlx5e_rq *rq) return; kvfree(rq->hw_gro_data); + if (rq->hd_page_pool != rq->page_pool) + page_pool_destroy(rq->hd_page_pool); mlx5e_rq_shampo_hd_info_free(rq); - mlx5_core_destroy_mkey(rq->mdev, rq->mpwqe.shampo->mkey); - mlx5e_rq_shampo_hd_free(rq); + mlx5_core_destroy_mkey(rq->mdev, + be32_to_cpu(rq->mpwqe.shampo->mkey_be)); + kvfree(rq->mpwqe.shampo); } static int mlx5e_alloc_rq(struct mlx5e_params *params, @@ -829,6 +914,7 @@ static int mlx5e_alloc_rq(struct mlx5e_params *params, rqp->wq.db_numa_node = node; INIT_WORK(&rq->recover_work, mlx5e_rq_err_cqe_work); + INIT_WORK(&rq->rx_timeout_work, mlx5e_rq_timeout_work); if (params->xdp_prog) bpf_prog_inc(params->xdp_prog); @@ -914,6 +1000,8 @@ static int mlx5e_alloc_rq(struct mlx5e_params *params, if (xsk) { err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, MEM_TYPE_XSK_BUFF_POOL, NULL); + if 
(err) + goto err_free_by_rq_type; xsk_pool_set_rxq_info(rq->xsk_pool, &rq->xdp_rxq); } else { /* Create a page_pool and register it with rxq */ @@ -940,12 +1028,15 @@ static int mlx5e_alloc_rq(struct mlx5e_params *params, rq->page_pool = NULL; goto err_free_by_rq_type; } - if (xdp_rxq_info_is_reg(&rq->xdp_rxq)) + if (!rq->hd_page_pool) + rq->hd_page_pool = rq->page_pool; + if (xdp_rxq_info_is_reg(&rq->xdp_rxq)) { err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, MEM_TYPE_PAGE_POOL, rq->page_pool); + if (err) + goto err_destroy_page_pool; + } } - if (err) - goto err_destroy_page_pool; for (i = 0; i < wq_sz; i++) { if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) { @@ -1073,7 +1164,8 @@ int mlx5e_create_rq(struct mlx5e_rq *rq, struct mlx5e_rq_param *param, u16 q_cou if (test_bit(MLX5E_RQ_STATE_SHAMPO, &rq->state)) { MLX5_SET(wq, wq, log_headers_buffer_entry_num, order_base_2(rq->mpwqe.shampo->hd_per_wq)); - MLX5_SET(wq, wq, headers_mkey, rq->mpwqe.shampo->mkey); + MLX5_SET(wq, wq, headers_mkey, + be32_to_cpu(rq->mpwqe.shampo->mkey_be)); } mlx5_fill_page_frag_array(&rq->wq_ctrl.buf, @@ -1203,7 +1295,8 @@ int mlx5e_wait_for_min_rx_wqes(struct mlx5e_rq *rq, int wait_time) netdev_warn(rq->netdev, "Failed to get min RX wqes on Channel[%d] RQN[0x%x] wq cur_sz(%d) min_rx_wqes(%d)\n", rq->ix, rq->rqn, mlx5e_rqwq_get_cur_sz(rq), min_wqes); - mlx5e_reporter_rx_timeout(rq); + queue_work(rq->priv->wq, &rq->rx_timeout_work); + return -ETIMEDOUT; } @@ -1374,6 +1467,7 @@ void mlx5e_close_rq(struct mlx5e_rq *rq) if (rq->dim) cancel_work_sync(&rq->dim->work); cancel_work_sync(&rq->recover_work); + cancel_work_sync(&rq->rx_timeout_work); mlx5e_destroy_rq(rq); mlx5e_free_rx_descs(rq); mlx5e_free_rq(rq); @@ -1454,7 +1548,7 @@ static int mlx5e_alloc_xdpsq(struct mlx5e_channel *c, sq->pdev = c->pdev; sq->mkey_be = c->mkey_be; sq->channel = c; - sq->uar_map = mdev->mlx5e_res.hw_objs.bfreg.map; + sq->uar_map = c->bfreg->map; sq->min_inline_mode = params->tx_min_inline_mode; sq->hw_mtu 
= MLX5E_SW2HW_MTU(params, params->sw_mtu) - ETH_FCS_LEN; sq->xsk_pool = xsk_pool; @@ -1539,7 +1633,7 @@ static int mlx5e_alloc_icosq(struct mlx5e_channel *c, int err; sq->channel = c; - sq->uar_map = mdev->mlx5e_res.hw_objs.bfreg.map; + sq->uar_map = c->bfreg->map; sq->reserved_room = param->stop_room; param->wq.db_numa_node = cpu_to_node(c->cpu); @@ -1624,13 +1718,11 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c, sq->priv = c->priv; sq->ch_ix = c->ix; sq->txq_ix = txq_ix; - sq->uar_map = mdev->mlx5e_res.hw_objs.bfreg.map; + sq->uar_map = c->bfreg->map; sq->min_inline_mode = params->tx_min_inline_mode; sq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); sq->max_sq_mpw_wqebbs = mlx5e_get_max_sq_aligned_wqebbs(mdev); INIT_WORK(&sq->recover_work, mlx5e_tx_err_cqe_work); - if (!MLX5_CAP_ETH(mdev, wqe_vlan_insert)) - set_bit(MLX5E_SQ_STATE_VLAN_NEED_L2_INLINE, &sq->state); if (mlx5_ipsec_device_caps(c->priv->mdev)) set_bit(MLX5E_SQ_STATE_IPSEC, &sq->state); if (param->is_mpw) @@ -1702,7 +1794,7 @@ static int mlx5e_create_sq(struct mlx5_core_dev *mdev, MLX5_SET(sqc, sqc, flush_in_error_en, 1); MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); - MLX5_SET(wq, wq, uar_page, mdev->mlx5e_res.hw_objs.bfreg.index); + MLX5_SET(wq, wq, uar_page, csp->uar_page); MLX5_SET(wq, wq, log_wq_pg_sz, csp->wq_ctrl->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); MLX5_SET64(wq, wq, dbr_addr, csp->wq_ctrl->db.dma); @@ -1806,6 +1898,7 @@ int mlx5e_open_txqsq(struct mlx5e_channel *c, u32 tisn, int txq_ix, csp.cqn = sq->cq.mcq.cqn; csp.wq_ctrl = &sq->wq_ctrl; csp.min_inline_mode = sq->min_inline_mode; + csp.uar_page = c->bfreg->index; err = mlx5e_create_sq_rdy(c->mdev, param, &csp, qos_queue_group_id, &sq->sqn); if (err) goto err_free_txqsq; @@ -1963,6 +2056,7 @@ static int mlx5e_open_icosq(struct mlx5e_channel *c, struct mlx5e_params *params csp.cqn = sq->cq.mcq.cqn; csp.wq_ctrl = &sq->wq_ctrl; csp.min_inline_mode = params->tx_min_inline_mode; + csp.uar_page = c->bfreg->index; err = 
mlx5e_create_sq_rdy(c->mdev, param, &csp, 0, &sq->sqn); if (err) goto err_free_icosq; @@ -2023,6 +2117,7 @@ int mlx5e_open_xdpsq(struct mlx5e_channel *c, struct mlx5e_params *params, csp.cqn = sq->cq.mcq.cqn; csp.wq_ctrl = &sq->wq_ctrl; csp.min_inline_mode = sq->min_inline_mode; + csp.uar_page = c->bfreg->index; set_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); err = mlx5e_create_sq_rdy(c->mdev, param, &csp, 0, &sq->sqn); @@ -2093,6 +2188,7 @@ static void mlx5e_close_xdpredirect_sq(struct mlx5e_xdpsq *xdpsq) static int mlx5e_alloc_cq_common(struct mlx5_core_dev *mdev, struct net_device *netdev, struct workqueue_struct *workqueue, + struct mlx5_uars_page *uar, struct mlx5e_cq_param *param, struct mlx5e_cq *cq) { @@ -2109,7 +2205,6 @@ static int mlx5e_alloc_cq_common(struct mlx5_core_dev *mdev, mcq->set_ci_db = cq->wq_ctrl.db.db; mcq->arm_db = cq->wq_ctrl.db.db + 1; *mcq->set_ci_db = 0; - *mcq->arm_db = 0; mcq->vector = param->eq_ix; mcq->comp = mlx5e_completion_event; mcq->event = mlx5e_cq_error_event; @@ -2124,6 +2219,7 @@ static int mlx5e_alloc_cq_common(struct mlx5_core_dev *mdev, cq->mdev = mdev; cq->netdev = netdev; cq->workqueue = workqueue; + cq->uar = uar; return 0; } @@ -2139,7 +2235,8 @@ static int mlx5e_alloc_cq(struct mlx5_core_dev *mdev, param->wq.db_numa_node = ccp->node; param->eq_ix = ccp->ix; - err = mlx5e_alloc_cq_common(mdev, ccp->netdev, ccp->wq, param, cq); + err = mlx5e_alloc_cq_common(mdev, ccp->netdev, ccp->wq, + ccp->uar, param, cq); cq->napi = ccp->napi; cq->ch_stats = ccp->ch_stats; @@ -2184,7 +2281,7 @@ static int mlx5e_create_cq(struct mlx5e_cq *cq, struct mlx5e_cq_param *param) MLX5_SET(cqc, cqc, cq_period_mode, mlx5e_cq_period_mode(param->cq_period_mode)); MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn); - MLX5_SET(cqc, cqc, uar_page, mdev->priv.uar->index); + MLX5_SET(cqc, cqc, uar_page, cq->uar->index); MLX5_SET(cqc, cqc, log_page_size, cq->wq_ctrl.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); MLX5_SET64(cqc, cqc, dbr_addr, 
cq->wq_ctrl.db.dma); @@ -2501,7 +2598,7 @@ static int mlx5e_open_queues(struct mlx5e_channel *c, if (err) goto err_close_icosq_cq; - if (netdev_ops->ndo_xdp_xmit) { + if (netdev_ops->ndo_xdp_xmit && c->xdp) { c->xdpsq = mlx5e_open_xdpredirect_sq(c, params, cparam, &ccp); if (IS_ERR(c->xdpsq)) { err = PTR_ERR(c->xdpsq); @@ -2651,6 +2748,20 @@ void mlx5e_trigger_napi_sched(struct napi_struct *napi) local_bh_enable(); } +static void mlx5e_channel_pick_doorbell(struct mlx5e_channel *c) +{ + struct mlx5e_hw_objs *hw_objs = &c->mdev->mlx5e_res.hw_objs; + + /* No dedicated Ethernet doorbells, use the global one. */ + if (hw_objs->num_bfregs == 0) { + c->bfreg = &c->mdev->priv.bfreg; + return; + } + + /* Round-robin between doorbells. */ + c->bfreg = hw_objs->bfregs + c->vec_ix % hw_objs->num_bfregs; +} + static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, struct mlx5e_params *params, struct xsk_buff_pool *xsk_pool, @@ -2691,7 +2802,6 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, c->priv = priv; c->mdev = mdev; - c->tstamp = &priv->tstamp; c->ix = ix; c->vec_ix = vec_ix; c->sd_ix = mlx5_sd_ch_ix_get_dev_ix(mdev, ix); @@ -2705,6 +2815,8 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, c->aff_mask = irq_get_effective_affinity_mask(irq); c->lag_port = mlx5e_enumerate_lag_port(mdev, ix); + mlx5e_channel_pick_doorbell(c); + netif_napi_add_config(netdev, &c->napi, mlx5e_napi_poll, ix); netif_napi_set_irq(&c->napi, irq); @@ -3239,16 +3351,17 @@ static int mlx5e_switch_priv_params(struct mlx5e_priv *priv, } } + mlx5e_set_xdp_feature(priv); return 0; } static int mlx5e_switch_priv_channels(struct mlx5e_priv *priv, + struct mlx5e_channels *old_chs, struct mlx5e_channels *new_chs, mlx5e_fp_preactivate preactivate, void *context) { struct net_device *netdev = priv->netdev; - struct mlx5e_channels old_chs; int carrier_ok; int err = 0; @@ -3257,7 +3370,6 @@ static int mlx5e_switch_priv_channels(struct mlx5e_priv *priv, 
mlx5e_deactivate_priv_channels(priv); - old_chs = priv->channels; priv->channels = *new_chs; /* New channels are ready to roll, call the preactivate hook if needed @@ -3266,12 +3378,14 @@ static int mlx5e_switch_priv_channels(struct mlx5e_priv *priv, if (preactivate) { err = preactivate(priv, context); if (err) { - priv->channels = old_chs; + priv->channels = *old_chs; goto out; } } - mlx5e_close_channels(&old_chs); + mlx5e_set_xdp_feature(priv); + if (!MLX5_CAP_GEN(priv->mdev, tis_tir_td_order)) + mlx5e_close_channels(old_chs); priv->profile->update_rx(priv); mlx5e_selq_apply(&priv->selq); @@ -3290,16 +3404,20 @@ int mlx5e_safe_switch_params(struct mlx5e_priv *priv, mlx5e_fp_preactivate preactivate, void *context, bool reset) { - struct mlx5e_channels *new_chs; + struct mlx5e_channels *old_chs, *new_chs; int err; reset &= test_bit(MLX5E_STATE_OPENED, &priv->state); if (!reset) return mlx5e_switch_priv_params(priv, params, preactivate, context); + old_chs = kzalloc(sizeof(*old_chs), GFP_KERNEL); new_chs = kzalloc(sizeof(*new_chs), GFP_KERNEL); - if (!new_chs) - return -ENOMEM; + if (!old_chs || !new_chs) { + err = -ENOMEM; + goto err_free_chs; + } + new_chs->params = *params; mlx5e_selq_prepare_params(&priv->selq, &new_chs->params); @@ -3308,11 +3426,18 @@ int mlx5e_safe_switch_params(struct mlx5e_priv *priv, if (err) goto err_cancel_selq; - err = mlx5e_switch_priv_channels(priv, new_chs, preactivate, context); + *old_chs = priv->channels; + + err = mlx5e_switch_priv_channels(priv, old_chs, new_chs, + preactivate, context); if (err) goto err_close; + if (MLX5_CAP_GEN(priv->mdev, tis_tir_td_order)) + mlx5e_close_channels(old_chs); + kfree(new_chs); + kfree(old_chs); return 0; err_close: @@ -3320,7 +3445,9 @@ err_close: err_cancel_selq: mlx5e_selq_cancel(&priv->selq); +err_free_chs: kfree(new_chs); + kfree(old_chs); return err; } @@ -3331,8 +3458,8 @@ int mlx5e_safe_reopen_channels(struct mlx5e_priv *priv) void mlx5e_timestamp_init(struct mlx5e_priv *priv) { - 
priv->tstamp.tx_type = HWTSTAMP_TX_OFF; - priv->tstamp.rx_filter = HWTSTAMP_FILTER_NONE; + priv->hwtstamp_config.tx_type = HWTSTAMP_TX_OFF; + priv->hwtstamp_config.rx_filter = HWTSTAMP_FILTER_NONE; } static void mlx5e_modify_admin_state(struct mlx5_core_dev *mdev, @@ -3476,7 +3603,8 @@ static int mlx5e_alloc_drop_cq(struct mlx5e_priv *priv, param->wq.buf_numa_node = dev_to_node(mlx5_core_dma_dev(mdev)); param->wq.db_numa_node = dev_to_node(mlx5_core_dma_dev(mdev)); - return mlx5e_alloc_cq_common(priv->mdev, priv->netdev, priv->wq, param, cq); + return mlx5e_alloc_cq_common(priv->mdev, priv->netdev, priv->wq, + mdev->priv.bfreg.up, param, cq); } int mlx5e_open_drop_rq(struct mlx5e_priv *priv, @@ -3905,6 +4033,8 @@ mlx5e_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) mlx5e_queue_update_stats(priv); } + netdev_stats_to_stats64(stats, &dev->stats); + if (mlx5e_is_uplink_rep(priv)) { struct mlx5e_vport_stats *vstats = &priv->stats.vport; @@ -3921,21 +4051,21 @@ mlx5e_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) mlx5e_fold_sw_stats64(priv, stats); } - stats->rx_missed_errors = priv->stats.qcnt.rx_out_of_buffer; - stats->rx_dropped = PPORT_2863_GET(pstats, if_in_discards); + stats->rx_missed_errors += priv->stats.qcnt.rx_out_of_buffer; + stats->rx_dropped += PPORT_2863_GET(pstats, if_in_discards); - stats->rx_length_errors = + stats->rx_length_errors += PPORT_802_3_GET(pstats, a_in_range_length_errors) + PPORT_802_3_GET(pstats, a_out_of_range_length_field) + PPORT_802_3_GET(pstats, a_frame_too_long_errors) + VNIC_ENV_GET(&priv->stats.vnic, eth_wqe_too_small); - stats->rx_crc_errors = + stats->rx_crc_errors += PPORT_802_3_GET(pstats, a_frame_check_sequence_errors); - stats->rx_frame_errors = PPORT_802_3_GET(pstats, a_alignment_errors); - stats->tx_aborted_errors = PPORT_2863_GET(pstats, if_out_discards); - stats->rx_errors = stats->rx_length_errors + stats->rx_crc_errors + - stats->rx_frame_errors; - stats->tx_errors = 
stats->tx_aborted_errors + stats->tx_carrier_errors; + stats->rx_frame_errors += PPORT_802_3_GET(pstats, a_alignment_errors); + stats->tx_aborted_errors += PPORT_2863_GET(pstats, if_out_discards); + stats->rx_errors += stats->rx_length_errors + stats->rx_crc_errors + + stats->rx_frame_errors; + stats->tx_errors += stats->tx_aborted_errors + stats->tx_carrier_errors; } static void mlx5e_nic_set_rx_mode(struct mlx5e_priv *priv) @@ -4029,10 +4159,6 @@ static int set_feature_hw_gro(struct net_device *netdev, bool enable) if (enable) { new_params.packet_merge.type = MLX5E_PACKET_MERGE_SHAMPO; - new_params.packet_merge.shampo.match_criteria_type = - MLX5_RQC_SHAMPO_MATCH_CRITERIA_TYPE_EXTENDED; - new_params.packet_merge.shampo.alignment_granularity = - MLX5_RQC_SHAMPO_NO_MATCH_ALIGNMENT_GRANULARITY_STRIDE; } else if (new_params.packet_merge.type == MLX5E_PACKET_MERGE_SHAMPO) { new_params.packet_merge.type = MLX5E_PACKET_MERGE_NONE; } else { @@ -4268,23 +4394,22 @@ static int mlx5e_handle_feature(struct net_device *netdev, return 0; } -void mlx5e_set_xdp_feature(struct net_device *netdev) +void mlx5e_set_xdp_feature(struct mlx5e_priv *priv) { - struct mlx5e_priv *priv = netdev_priv(netdev); struct mlx5e_params *params = &priv->channels.params; - xdp_features_t val; + struct net_device *netdev = priv->netdev; + xdp_features_t val = 0; - if (!netdev->netdev_ops->ndo_bpf || - params->packet_merge.type != MLX5E_PACKET_MERGE_NONE) { - xdp_clear_features_flag(netdev); - return; - } + if (netdev->netdev_ops->ndo_bpf && + params->packet_merge.type == MLX5E_PACKET_MERGE_NONE) + val = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | + NETDEV_XDP_ACT_XSK_ZEROCOPY | + NETDEV_XDP_ACT_RX_SG; + + if (netdev->netdev_ops->ndo_xdp_xmit && params->xdp_prog) + val |= NETDEV_XDP_ACT_NDO_XMIT | + NETDEV_XDP_ACT_NDO_XMIT_SG; - val = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | - NETDEV_XDP_ACT_XSK_ZEROCOPY | - NETDEV_XDP_ACT_RX_SG | - NETDEV_XDP_ACT_NDO_XMIT | - NETDEV_XDP_ACT_NDO_XMIT_SG; 
xdp_set_features_flag(netdev, val); } @@ -4320,9 +4445,6 @@ int mlx5e_set_features(struct net_device *netdev, netdev_features_t features) return -EINVAL; } - /* update XDP supported features */ - mlx5e_set_xdp_feature(netdev); - return 0; } @@ -4359,6 +4481,7 @@ static netdev_features_t mlx5e_fix_uplink_rep_features(struct net_device *netdev static netdev_features_t mlx5e_fix_features(struct net_device *netdev, netdev_features_t features) { + struct netdev_config *cfg = netdev->cfg_pending; struct mlx5e_priv *priv = netdev_priv(netdev); struct mlx5e_vlan_table *vlan; struct mlx5e_params *params; @@ -4425,6 +4548,13 @@ static netdev_features_t mlx5e_fix_features(struct net_device *netdev, } } + /* The header-data split ring param requires HW GRO to stay enabled. */ + if (cfg && cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED && + !(features & NETIF_F_GRO_HW)) { + netdev_warn(netdev, "Keeping HW-GRO enabled, TCP header-data split depends on it\n"); + features |= NETIF_F_GRO_HW; + } + if (mlx5e_is_uplink_rep(priv)) { features = mlx5e_fix_uplink_rep_features(netdev, features); netdev->netns_immutable = true; @@ -4622,22 +4752,23 @@ static int mlx5e_hwstamp_config_ptp_rx(struct mlx5e_priv *priv, bool ptp_rx) &new_params.ptp_rx, true); } -int mlx5e_hwstamp_set(struct mlx5e_priv *priv, struct ifreq *ifr) +int mlx5e_hwtstamp_set(struct mlx5e_priv *priv, + struct kernel_hwtstamp_config *config, + struct netlink_ext_ack *extack) { - struct hwtstamp_config config; bool rx_cqe_compress_def; bool ptp_rx; int err; if (!MLX5_CAP_GEN(priv->mdev, device_frequency_khz) || - (mlx5_clock_get_ptp_index(priv->mdev) == -1)) + (mlx5_clock_get_ptp_index(priv->mdev) == -1)) { + NL_SET_ERR_MSG_MOD(extack, + "Timestamps are not supported on this device"); return -EOPNOTSUPP; - - if (copy_from_user(&config, ifr->ifr_data, sizeof(config))) - return -EFAULT; + } /* TX HW timestamp */ - switch (config.tx_type) { + switch (config->tx_type) { case HWTSTAMP_TX_OFF: case HWTSTAMP_TX_ON: break; @@ 
-4649,7 +4780,7 @@ int mlx5e_hwstamp_set(struct mlx5e_priv *priv, struct ifreq *ifr) rx_cqe_compress_def = priv->channels.params.rx_cqe_compress_def; /* RX HW timestamp */ - switch (config.rx_filter) { + switch (config->rx_filter) { case HWTSTAMP_FILTER_NONE: ptp_rx = false; break; @@ -4668,7 +4799,7 @@ int mlx5e_hwstamp_set(struct mlx5e_priv *priv, struct ifreq *ifr) case HWTSTAMP_FILTER_PTP_V2_SYNC: case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: case HWTSTAMP_FILTER_NTP_ALL: - config.rx_filter = HWTSTAMP_FILTER_ALL; + config->rx_filter = HWTSTAMP_FILTER_ALL; /* ptp_rx is set if both HW TS is set and CQE * compression is set */ @@ -4681,47 +4812,50 @@ int mlx5e_hwstamp_set(struct mlx5e_priv *priv, struct ifreq *ifr) if (!mlx5e_profile_feature_cap(priv->profile, PTP_RX)) err = mlx5e_hwstamp_config_no_ptp_rx(priv, - config.rx_filter != HWTSTAMP_FILTER_NONE); + config->rx_filter != HWTSTAMP_FILTER_NONE); else err = mlx5e_hwstamp_config_ptp_rx(priv, ptp_rx); if (err) goto err_unlock; - memcpy(&priv->tstamp, &config, sizeof(config)); + priv->hwtstamp_config = *config; mutex_unlock(&priv->state_lock); /* might need to fix some features */ netdev_update_features(priv->netdev); - return copy_to_user(ifr->ifr_data, &config, - sizeof(config)) ? -EFAULT : 0; + return 0; err_unlock: mutex_unlock(&priv->state_lock); return err; } -int mlx5e_hwstamp_get(struct mlx5e_priv *priv, struct ifreq *ifr) +static int mlx5e_hwtstamp_set_ndo(struct net_device *netdev, + struct kernel_hwtstamp_config *config, + struct netlink_ext_ack *extack) { - struct hwtstamp_config *cfg = &priv->tstamp; + struct mlx5e_priv *priv = netdev_priv(netdev); + return mlx5e_hwtstamp_set(priv, config, extack); +} + +int mlx5e_hwtstamp_get(struct mlx5e_priv *priv, + struct kernel_hwtstamp_config *config) +{ if (!MLX5_CAP_GEN(priv->mdev, device_frequency_khz)) return -EOPNOTSUPP; - return copy_to_user(ifr->ifr_data, cfg, sizeof(*cfg)) ? 
-EFAULT : 0; + *config = priv->hwtstamp_config; + + return 0; } -static int mlx5e_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +static int mlx5e_hwtstamp_get_ndo(struct net_device *dev, + struct kernel_hwtstamp_config *config) { struct mlx5e_priv *priv = netdev_priv(dev); - switch (cmd) { - case SIOCSHWTSTAMP: - return mlx5e_hwstamp_set(priv, ifr); - case SIOCGHWTSTAMP: - return mlx5e_hwstamp_get(priv, ifr); - default: - return -EOPNOTSUPP; - } + return mlx5e_hwtstamp_get(priv, config); } #ifdef CONFIG_MLX5_ESWITCH @@ -5165,13 +5299,14 @@ const struct net_device_ops mlx5e_netdev_ops = { .ndo_set_features = mlx5e_set_features, .ndo_fix_features = mlx5e_fix_features, .ndo_change_mtu = mlx5e_change_nic_mtu, - .ndo_eth_ioctl = mlx5e_ioctl, .ndo_set_tx_maxrate = mlx5e_set_tx_maxrate, .ndo_features_check = mlx5e_features_check, .ndo_tx_timeout = mlx5e_tx_timeout, .ndo_bpf = mlx5e_xdp, .ndo_xdp_xmit = mlx5e_xdp_xmit, .ndo_xsk_wakeup = mlx5e_xsk_wakeup, + .ndo_hwtstamp_get = mlx5e_hwtstamp_get_ndo, + .ndo_hwtstamp_set = mlx5e_hwtstamp_set_ndo, #ifdef CONFIG_MLX5_EN_ARFS .ndo_rx_flow_steer = mlx5e_rx_flow_steer, #endif @@ -5446,6 +5581,103 @@ static const struct netdev_stat_ops mlx5e_stat_ops = { .get_base_stats = mlx5e_get_base_stats, }; +struct mlx5_qmgmt_data { + struct mlx5e_channel *c; + struct mlx5e_channel_param cparam; +}; + +static int mlx5e_queue_mem_alloc(struct net_device *dev, void *newq, + int queue_index) +{ + struct mlx5_qmgmt_data *new = (struct mlx5_qmgmt_data *)newq; + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5e_channels *chs = &priv->channels; + struct mlx5e_params params = chs->params; + struct mlx5_core_dev *mdev; + int err; + + mutex_lock(&priv->state_lock); + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) { + err = -ENODEV; + goto unlock; + } + + if (queue_index >= chs->num) { + err = -ERANGE; + goto unlock; + } + + if (MLX5E_GET_PFLAG(&chs->params, MLX5E_PFLAG_TX_PORT_TS) || + chs->params.ptp_rx || + 
chs->params.xdp_prog || + priv->htb) { + netdev_err(priv->netdev, + "Cloning channels with Port/rx PTP, XDP or HTB is not supported\n"); + err = -EOPNOTSUPP; + goto unlock; + } + + mdev = mlx5_sd_ch_ix_get_dev(priv->mdev, queue_index); + err = mlx5e_build_channel_param(mdev, &params, &new->cparam); + if (err) + goto unlock; + + err = mlx5e_open_channel(priv, queue_index, &params, NULL, &new->c); +unlock: + mutex_unlock(&priv->state_lock); + return err; +} + +static void mlx5e_queue_mem_free(struct net_device *dev, void *mem) +{ + struct mlx5_qmgmt_data *data = (struct mlx5_qmgmt_data *)mem; + + /* not supposed to happen since mlx5e_queue_start never fails + * but this is how this should be implemented just in case + */ + if (data->c) + mlx5e_close_channel(data->c); +} + +static int mlx5e_queue_stop(struct net_device *dev, void *oldq, int queue_index) +{ + /* In mlx5 a txq cannot be simply stopped in isolation, only restarted. + * mlx5e_queue_start does not fail, we stop the old queue there. + * TODO: Improve this. 
+ */ + return 0; +} + +static int mlx5e_queue_start(struct net_device *dev, void *newq, + int queue_index) +{ + struct mlx5_qmgmt_data *new = (struct mlx5_qmgmt_data *)newq; + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5e_channel *old; + + mutex_lock(&priv->state_lock); + + /* stop and close the old */ + old = priv->channels.c[queue_index]; + mlx5e_deactivate_priv_channels(priv); + /* close old before activating new, to avoid napi conflict */ + mlx5e_close_channel(old); + + /* start the new */ + priv->channels.c[queue_index] = new->c; + mlx5e_activate_priv_channels(priv); + mutex_unlock(&priv->state_lock); + return 0; +} + +static const struct netdev_queue_mgmt_ops mlx5e_queue_mgmt_ops = { + .ndo_queue_mem_size = sizeof(struct mlx5_qmgmt_data), + .ndo_queue_mem_alloc = mlx5e_queue_mem_alloc, + .ndo_queue_mem_free = mlx5e_queue_mem_free, + .ndo_queue_start = mlx5e_queue_start, + .ndo_queue_stop = mlx5e_queue_stop, +}; + static void mlx5e_build_nic_netdev(struct net_device *netdev) { struct mlx5e_priv *priv = netdev_priv(netdev); @@ -5456,6 +5688,7 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev) SET_NETDEV_DEV(netdev, mdev->device); netdev->netdev_ops = &mlx5e_netdev_ops; + netdev->queue_mgmt_ops = &mlx5e_queue_mgmt_ops; netdev->xdp_metadata_ops = &mlx5e_xdp_metadata_ops; netdev->xsk_tx_metadata_ops = &mlx5e_xsk_tx_metadata_ops; @@ -5496,17 +5729,17 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev) MLX5E_MPWRQ_UMR_MODE_ALIGNED)) netdev->vlan_features |= NETIF_F_LRO; + if (mlx5e_hw_gro_supported(mdev) && + mlx5e_check_fragmented_striding_rq_cap(mdev, PAGE_SHIFT, + MLX5E_MPWRQ_UMR_MODE_ALIGNED)) + netdev->vlan_features |= NETIF_F_GRO_HW; + netdev->hw_features = netdev->vlan_features; netdev->hw_features |= NETIF_F_HW_VLAN_CTAG_TX; netdev->hw_features |= NETIF_F_HW_VLAN_CTAG_RX; netdev->hw_features |= NETIF_F_HW_VLAN_CTAG_FILTER; netdev->hw_features |= NETIF_F_HW_VLAN_STAG_TX; - if (mlx5e_hw_gro_supported(mdev) && - 
mlx5e_check_fragmented_striding_rq_cap(mdev, PAGE_SHIFT, - MLX5E_MPWRQ_UMR_MODE_ALIGNED)) - netdev->hw_features |= NETIF_F_GRO_HW; - if (mlx5e_tunnel_any_tx_proto_supported(mdev)) { netdev->hw_enc_features |= NETIF_F_HW_CSUM; netdev->hw_enc_features |= NETIF_F_TSO; @@ -5586,7 +5819,7 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev) netdev->priv_flags |= IFF_UNICAST_FLT; netif_set_tso_max_size(netdev, GSO_MAX_SIZE); - mlx5e_set_xdp_feature(netdev); + mlx5e_set_xdp_feature(priv); mlx5e_set_netdev_dev_addr(netdev); mlx5e_macsec_build_netdev(priv); mlx5e_ipsec_build_netdev(priv); @@ -5679,7 +5912,7 @@ static int mlx5e_nic_init(struct mlx5_core_dev *mdev, rtnl_lock(); /* update XDP supported features */ - mlx5e_set_xdp_feature(netdev); + mlx5e_set_xdp_feature(priv); if (take_rtnl) rtnl_unlock(); @@ -5831,6 +6064,7 @@ static void mlx5e_nic_enable(struct mlx5e_priv *priv) if (mlx5e_monitor_counter_supported(priv)) mlx5e_monitor_counter_init(priv); + mlx5e_pcie_cong_event_init(priv); mlx5e_hv_vhca_stats_create(priv); if (netdev->reg_state != NETREG_REGISTERED) return; @@ -5861,10 +6095,12 @@ static void mlx5e_nic_disable(struct mlx5e_priv *priv) mlx5e_nic_set_rx_mode(priv); + mlx5e_pcie_cong_event_cleanup(priv); mlx5e_hv_vhca_stats_destroy(priv); if (mlx5e_monitor_counter_supported(priv)) mlx5e_monitor_counter_cleanup(priv); + mlx5e_ipsec_disable_events(priv); mlx5e_disable_blocking_events(priv); if (priv->en_trap) { mlx5e_deactivate_trap(priv); @@ -5880,7 +6116,7 @@ static void mlx5e_nic_disable(struct mlx5e_priv *priv) static int mlx5e_update_nic_rx(struct mlx5e_priv *priv) { - return mlx5e_refresh_tirs(priv, false, false); + return mlx5e_refresh_tirs(priv->mdev, false, false); } static const struct mlx5e_profile mlx5e_nic_profile = { @@ -6033,6 +6269,7 @@ err_free_cpumask: void mlx5e_priv_cleanup(struct mlx5e_priv *priv) { + bool destroying = test_bit(MLX5E_STATE_DESTROYING, &priv->state); int i; /* bail if change profile failed and also rollback failed 
*/ @@ -6059,6 +6296,8 @@ void mlx5e_priv_cleanup(struct mlx5e_priv *priv) } memset(priv, 0, sizeof(*priv)); + if (destroying) /* restore destroying bit, to allow unload */ + set_bit(MLX5E_STATE_DESTROYING, &priv->state); } static unsigned int mlx5e_get_max_num_txqs(struct mlx5_core_dev *mdev, @@ -6285,19 +6524,28 @@ profile_cleanup: return err; } -int mlx5e_netdev_change_profile(struct mlx5e_priv *priv, - const struct mlx5e_profile *new_profile, void *new_ppriv) +int mlx5e_netdev_change_profile(struct net_device *netdev, + struct mlx5_core_dev *mdev, + const struct mlx5e_profile *new_profile, + void *new_ppriv) { - const struct mlx5e_profile *orig_profile = priv->profile; - struct net_device *netdev = priv->netdev; - struct mlx5_core_dev *mdev = priv->mdev; - void *orig_ppriv = priv->ppriv; + struct mlx5e_priv *priv = netdev_priv(netdev); + const struct mlx5e_profile *orig_profile; int err, rollback_err; + void *orig_ppriv; - /* cleanup old profile */ - mlx5e_detach_netdev(priv); - priv->profile->cleanup(priv); - mlx5e_priv_cleanup(priv); + orig_profile = priv->profile; + orig_ppriv = priv->ppriv; + + /* NULL could happen if previous change_profile failed to rollback */ + if (priv->profile) { + WARN_ON_ONCE(priv->mdev != mdev); + /* cleanup old profile */ + mlx5e_detach_netdev(priv); + priv->profile->cleanup(priv); + mlx5e_priv_cleanup(priv); + } + /* priv members are not valid from this point ... 
*/ if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { mlx5e_netdev_init_profile(netdev, mdev, new_profile, new_ppriv); @@ -6314,23 +6562,33 @@ int mlx5e_netdev_change_profile(struct mlx5e_priv *priv, return 0; rollback: + if (!orig_profile) { + netdev_warn(netdev, "no original profile to rollback to\n"); + priv->profile = NULL; + return err; + } + rollback_err = mlx5e_netdev_attach_profile(netdev, mdev, orig_profile, orig_ppriv); - if (rollback_err) - netdev_err(netdev, "%s: failed to rollback to orig profile, %d\n", - __func__, rollback_err); + if (rollback_err) { + netdev_err(netdev, "failed to rollback to orig profile, %d\n", + rollback_err); + priv->profile = NULL; + } return err; } -void mlx5e_netdev_attach_nic_profile(struct mlx5e_priv *priv) +void mlx5e_netdev_attach_nic_profile(struct net_device *netdev, + struct mlx5_core_dev *mdev) { - mlx5e_netdev_change_profile(priv, &mlx5e_nic_profile, NULL); + mlx5e_netdev_change_profile(netdev, mdev, &mlx5e_nic_profile, NULL); } -void mlx5e_destroy_netdev(struct mlx5e_priv *priv) +void mlx5e_destroy_netdev(struct net_device *netdev) { - struct net_device *netdev = priv->netdev; + struct mlx5e_priv *priv = netdev_priv(netdev); - mlx5e_priv_cleanup(priv); + if (priv->profile) + mlx5e_priv_cleanup(priv); free_netdev(netdev); } @@ -6338,8 +6596,8 @@ static int _mlx5e_resume(struct auxiliary_device *adev) { struct mlx5_adev *edev = container_of(adev, struct mlx5_adev, adev); struct mlx5e_dev *mlx5e_dev = auxiliary_get_drvdata(adev); - struct mlx5e_priv *priv = mlx5e_dev->priv; - struct net_device *netdev = priv->netdev; + struct mlx5e_priv *priv = netdev_priv(mlx5e_dev->netdev); + struct net_device *netdev = mlx5e_dev->netdev; struct mlx5_core_dev *mdev = edev->mdev; struct mlx5_core_dev *pos, *to; int err, i; @@ -6385,10 +6643,11 @@ static int mlx5e_resume(struct auxiliary_device *adev) static int _mlx5e_suspend(struct auxiliary_device *adev, bool pre_netdev_reg) { + struct mlx5_adev *edev = container_of(adev, struct 
mlx5_adev, adev); struct mlx5e_dev *mlx5e_dev = auxiliary_get_drvdata(adev); - struct mlx5e_priv *priv = mlx5e_dev->priv; - struct net_device *netdev = priv->netdev; - struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_priv *priv = netdev_priv(mlx5e_dev->netdev); + struct net_device *netdev = mlx5e_dev->netdev; + struct mlx5_core_dev *mdev = edev->mdev; struct mlx5_core_dev *pos; int i; @@ -6449,11 +6708,11 @@ static int _mlx5e_probe(struct auxiliary_device *adev) goto err_devlink_port_unregister; } SET_NETDEV_DEVLINK_PORT(netdev, &mlx5e_dev->dl_port); + mlx5e_dev->netdev = netdev; mlx5e_build_nic_netdev(netdev); priv = netdev_priv(netdev); - mlx5e_dev->priv = priv; priv->profile = profile; priv->ppriv = NULL; @@ -6486,7 +6745,7 @@ err_resume: err_profile_cleanup: profile->cleanup(priv); err_destroy_netdev: - mlx5e_destroy_netdev(priv); + mlx5e_destroy_netdev(netdev); err_devlink_port_unregister: mlx5e_devlink_port_unregister(mlx5e_dev); err_devlink_unregister: @@ -6516,17 +6775,21 @@ static void _mlx5e_remove(struct auxiliary_device *adev) { struct mlx5_adev *edev = container_of(adev, struct mlx5_adev, adev); struct mlx5e_dev *mlx5e_dev = auxiliary_get_drvdata(adev); - struct mlx5e_priv *priv = mlx5e_dev->priv; + struct net_device *netdev = mlx5e_dev->netdev; + struct mlx5e_priv *priv = netdev_priv(netdev); struct mlx5_core_dev *mdev = edev->mdev; + mlx5_eswitch_safe_aux_devs_remove(mdev); mlx5_core_uplink_netdev_set(mdev, NULL); - mlx5e_dcbnl_delete_app(priv); + + if (priv->profile) + mlx5e_dcbnl_delete_app(priv); /* When unload driver, the netdev is in registered state * if it's from legacy mode. If from switchdev mode, it * is already unregistered before changing to NIC profile. 
*/ - if (priv->netdev->reg_state == NETREG_REGISTERED) { - unregister_netdev(priv->netdev); + if (netdev->reg_state == NETREG_REGISTERED) { + unregister_netdev(netdev); _mlx5e_suspend(adev, false); } else { struct mlx5_core_dev *pos; @@ -6541,7 +6804,7 @@ static void _mlx5e_remove(struct auxiliary_device *adev) /* Avoid cleanup if profile rollback failed. */ if (priv->profile) priv->profile->cleanup(priv); - mlx5e_destroy_netdev(priv); + mlx5e_destroy_netdev(netdev); mlx5e_devlink_port_unregister(mlx5e_dev); mlx5e_destroy_devlink(mlx5e_dev); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 58cd153ccc..493e0f01b5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -803,6 +803,7 @@ static const struct net_device_ops mlx5e_netdev_ops_rep = { .ndo_stop = mlx5e_rep_close, .ndo_start_xmit = mlx5e_xmit, .ndo_setup_tc = mlx5e_rep_setup_tc, + .ndo_set_mac_address = eth_mac_addr, .ndo_get_stats64 = mlx5e_rep_get_stats, .ndo_has_offload_stats = mlx5e_rep_has_offload_stats, .ndo_get_offload_stats = mlx5e_rep_get_offload_stats, @@ -865,7 +866,7 @@ static void mlx5e_build_rep_params(struct net_device *netdev) if (take_rtnl) rtnl_lock(); /* update XDP supported features */ - mlx5e_set_xdp_feature(netdev); + mlx5e_set_xdp_feature(priv); if (take_rtnl) rtnl_unlock(); @@ -970,7 +971,7 @@ static int mlx5e_create_rep_ttc_table(struct mlx5e_priv *priv) MLX5_FLOW_NAMESPACE_KERNEL), false); /* The inner_ttc in the ttc params is intentionally not set */ - mlx5e_set_ttc_params(priv->fs, priv->rx_res, &ttc_params, false); + mlx5e_set_ttc_params(priv->fs, priv->rx_res, &ttc_params, false, false); if (rep->vport != MLX5_VPORT_UPLINK) /* To give uplik rep TTC a lower level for chaining from root ft */ @@ -1442,8 +1443,8 @@ static void mlx5e_rep_vnic_reporter_create(struct mlx5e_priv *priv, rpriv); if (IS_ERR(reporter)) { mlx5_core_err(priv->mdev, - "Failed 
to create representor vnic reporter, err = %ld\n", - PTR_ERR(reporter)); + "Failed to create representor vnic reporter, err = %pe\n", + reporter); return; } @@ -1498,12 +1499,20 @@ static const struct mlx5e_profile mlx5e_uplink_rep_profile = { static int mlx5e_vport_uplink_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) { - struct mlx5e_priv *priv = netdev_priv(mlx5_uplink_netdev_get(dev)); struct mlx5e_rep_priv *rpriv = mlx5e_rep_to_rep_priv(rep); + struct net_device *netdev; + int err; - rpriv->netdev = priv->netdev; - return mlx5e_netdev_change_profile(priv, &mlx5e_uplink_rep_profile, - rpriv); + netdev = mlx5_uplink_netdev_get(dev); + if (!netdev) + return 0; + + /* must not use netdev_priv(netdev), it might not be initialized yet */ + rpriv->netdev = netdev; + err = mlx5e_netdev_change_profile(netdev, dev, + &mlx5e_uplink_rep_profile, rpriv); + mlx5_uplink_netdev_put(dev, netdev); + return err; } static void @@ -1529,7 +1538,7 @@ mlx5e_vport_uplink_rep_unload(struct mlx5e_rep_priv *rpriv) if (!(priv->mdev->priv.flags & MLX5_PRIV_FLAGS_SWITCH_LEGACY)) unregister_netdev(netdev); - mlx5e_netdev_attach_nic_profile(priv); + mlx5e_netdev_attach_nic_profile(netdev, priv->mdev); } static int @@ -1595,7 +1604,7 @@ err_cleanup_profile: priv->profile->cleanup(priv); err_destroy_netdev: - mlx5e_destroy_netdev(netdev_priv(netdev)); + mlx5e_destroy_netdev(netdev); return err; } @@ -1630,8 +1639,16 @@ mlx5e_vport_rep_unload(struct mlx5_eswitch_rep *rep) { struct mlx5e_rep_priv *rpriv = mlx5e_rep_to_rep_priv(rep); struct net_device *netdev = rpriv->netdev; - struct mlx5e_priv *priv = netdev_priv(netdev); - void *ppriv = priv->ppriv; + struct mlx5e_priv *priv; + void *ppriv; + + if (!netdev) { + ppriv = rpriv; + goto free_ppriv; + } + + priv = netdev_priv(netdev); + ppriv = priv->ppriv; if (rep->vport == MLX5_VPORT_UPLINK) { mlx5e_vport_uplink_rep_unload(rpriv); @@ -1642,7 +1659,7 @@ mlx5e_vport_rep_unload(struct mlx5_eswitch_rep *rep) 
mlx5e_rep_vnic_reporter_destroy(priv); mlx5e_detach_netdev(priv); priv->profile->cleanup(priv); - mlx5e_destroy_netdev(priv); + mlx5e_destroy_netdev(netdev); free_ppriv: kvfree(ppriv); /* mlx5e_rep_priv */ } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 12ca0a3e85..3c34d8f736 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -273,12 +273,12 @@ static inline u32 mlx5e_decompress_cqes_start(struct mlx5e_rq *rq, #define MLX5E_PAGECNT_BIAS_MAX (PAGE_SIZE / 64) -static int mlx5e_page_alloc_fragmented(struct mlx5e_rq *rq, +static int mlx5e_page_alloc_fragmented(struct page_pool *pool, struct mlx5e_frag_page *frag_page) { struct page *page; - page = page_pool_dev_alloc_pages(rq->page_pool); + page = page_pool_dev_alloc_pages(pool); if (unlikely(!page)) return -ENOMEM; @@ -292,14 +292,14 @@ static int mlx5e_page_alloc_fragmented(struct mlx5e_rq *rq, return 0; } -static void mlx5e_page_release_fragmented(struct mlx5e_rq *rq, +static void mlx5e_page_release_fragmented(struct page_pool *pool, struct mlx5e_frag_page *frag_page) { u16 drain_count = MLX5E_PAGECNT_BIAS_MAX - frag_page->frags; struct page *page = frag_page->page; if (page_pool_unref_page(page, drain_count) == 0) - page_pool_put_unrefed_page(rq->page_pool, page, -1, true); + page_pool_put_unrefed_page(pool, page, -1, true); } static inline int mlx5e_get_rx_frag(struct mlx5e_rq *rq, @@ -313,7 +313,8 @@ static inline int mlx5e_get_rx_frag(struct mlx5e_rq *rq, * offset) should just use the new one without replenishing again * by themselves. 
*/ - err = mlx5e_page_alloc_fragmented(rq, frag->frag_page); + err = mlx5e_page_alloc_fragmented(rq->page_pool, + frag->frag_page); return err; } @@ -332,7 +333,7 @@ static inline void mlx5e_put_rx_frag(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *frag) { if (mlx5e_frag_can_release(frag)) - mlx5e_page_release_fragmented(rq, frag->frag_page); + mlx5e_page_release_fragmented(rq->page_pool, frag->frag_page); } static inline struct mlx5e_wqe_frag_info *get_frag(struct mlx5e_rq *rq, u16 ix) @@ -586,7 +587,8 @@ mlx5e_free_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi) struct mlx5e_frag_page *frag_page; frag_page = &wi->alloc_units.frag_pages[i]; - mlx5e_page_release_fragmented(rq, frag_page); + mlx5e_page_release_fragmented(rq->page_pool, + frag_page); } } } @@ -645,17 +647,20 @@ static void build_ksm_umr(struct mlx5e_icosq *sq, struct mlx5e_umr_wqe *umr_wqe, umr_wqe->hdr.uctrl.mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_FREE); } -static struct mlx5e_frag_page *mlx5e_shampo_hd_to_frag_page(struct mlx5e_rq *rq, int header_index) +static struct mlx5e_frag_page *mlx5e_shampo_hd_to_frag_page(struct mlx5e_rq *rq, + int header_index) { - BUILD_BUG_ON(MLX5E_SHAMPO_LOG_MAX_HEADER_ENTRY_SIZE > PAGE_SHIFT); + struct mlx5e_shampo_hd *shampo = rq->mpwqe.shampo; - return &rq->mpwqe.shampo->pages[header_index >> MLX5E_SHAMPO_LOG_WQ_HEADER_PER_PAGE]; + return &shampo->pages[header_index >> shampo->log_hd_per_page]; } -static u64 mlx5e_shampo_hd_offset(int header_index) +static u64 mlx5e_shampo_hd_offset(struct mlx5e_rq *rq, int header_index) { - return (header_index & (MLX5E_SHAMPO_WQ_HEADER_PER_PAGE - 1)) << - MLX5E_SHAMPO_LOG_MAX_HEADER_ENTRY_SIZE; + struct mlx5e_shampo_hd *shampo = rq->mpwqe.shampo; + u32 hd_per_page = shampo->hd_per_page; + + return (header_index & (hd_per_page - 1)) << shampo->log_hd_entry_size; } static void mlx5e_free_rx_shampo_hd_entry(struct mlx5e_rq *rq, u16 header_index); @@ -668,34 +673,32 @@ static int mlx5e_build_shampo_hd_umr(struct mlx5e_rq 
*rq, u16 pi, header_offset, err, wqe_bbs; u32 lkey = rq->mdev->mlx5e_res.hw_objs.mkey; struct mlx5e_umr_wqe *umr_wqe; - int headroom, i = 0; + int headroom, i; headroom = rq->buff.headroom; wqe_bbs = MLX5E_KSM_UMR_WQEBBS(ksm_entries); pi = mlx5e_icosq_get_next_pi(sq, wqe_bbs); umr_wqe = mlx5_wq_cyc_get_wqe(&sq->wq, pi); - build_ksm_umr(sq, umr_wqe, shampo->key, index, ksm_entries); + build_ksm_umr(sq, umr_wqe, shampo->mkey_be, index, ksm_entries); - WARN_ON_ONCE(ksm_entries & (MLX5E_SHAMPO_WQ_HEADER_PER_PAGE - 1)); - while (i < ksm_entries) { - struct mlx5e_frag_page *frag_page = mlx5e_shampo_hd_to_frag_page(rq, index); + for (i = 0; i < ksm_entries; i++, index++) { + struct mlx5e_frag_page *frag_page; u64 addr; - err = mlx5e_page_alloc_fragmented(rq, frag_page); - if (unlikely(err)) - goto err_unmap; - + frag_page = mlx5e_shampo_hd_to_frag_page(rq, index); + header_offset = mlx5e_shampo_hd_offset(rq, index); + if (!header_offset) { + err = mlx5e_page_alloc_fragmented(rq->hd_page_pool, + frag_page); + if (err) + goto err_unmap; + } addr = page_pool_get_dma_addr(frag_page->page); - - for (int j = 0; j < MLX5E_SHAMPO_WQ_HEADER_PER_PAGE; j++) { - header_offset = mlx5e_shampo_hd_offset(index++); - - umr_wqe->inline_ksms[i++] = (struct mlx5_ksm) { - .key = cpu_to_be32(lkey), - .va = cpu_to_be64(addr + header_offset + headroom), - }; - } + umr_wqe->inline_ksms[i] = (struct mlx5_ksm) { + .key = cpu_to_be32(lkey), + .va = cpu_to_be64(addr + header_offset + headroom), + }; } sq->db.wqe_info[pi] = (struct mlx5e_icosq_wqe_info) { @@ -711,13 +714,14 @@ static int mlx5e_build_shampo_hd_umr(struct mlx5e_rq *rq, return 0; err_unmap: - while (--i) { + while (--i >= 0) { --index; - header_offset = mlx5e_shampo_hd_offset(index); + header_offset = mlx5e_shampo_hd_offset(rq, index); if (!header_offset) { struct mlx5e_frag_page *frag_page = mlx5e_shampo_hd_to_frag_page(rq, index); - mlx5e_page_release_fragmented(rq, frag_page); + mlx5e_page_release_fragmented(rq->hd_page_pool, + 
frag_page); } } @@ -732,12 +736,11 @@ static int mlx5e_alloc_rx_hd_mpwqe(struct mlx5e_rq *rq) struct mlx5e_icosq *sq = rq->icosq; int i, err, max_ksm_entries, len; - max_ksm_entries = ALIGN_DOWN(MLX5E_MAX_KSM_PER_WQE(rq->mdev), - MLX5E_SHAMPO_WQ_HEADER_PER_PAGE); + max_ksm_entries = MLX5E_MAX_KSM_PER_WQE(rq->mdev); ksm_entries = bitmap_find_window(shampo->bitmap, shampo->hd_per_wqe, shampo->hd_per_wq, shampo->pi); - ksm_entries = ALIGN_DOWN(ksm_entries, MLX5E_SHAMPO_WQ_HEADER_PER_PAGE); + ksm_entries = ALIGN_DOWN(ksm_entries, shampo->hd_per_page); if (!ksm_entries) return 0; @@ -793,7 +796,7 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) for (i = 0; i < rq->mpwqe.pages_per_wqe; i++, frag_page++) { dma_addr_t addr; - err = mlx5e_page_alloc_fragmented(rq, frag_page); + err = mlx5e_page_alloc_fragmented(rq->page_pool, frag_page); if (unlikely(err)) goto err_unmap; addr = page_pool_get_dma_addr(frag_page->page); @@ -838,7 +841,7 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) err_unmap: while (--i >= 0) { frag_page--; - mlx5e_page_release_fragmented(rq, frag_page); + mlx5e_page_release_fragmented(rq->page_pool, frag_page); } bitmap_fill(wi->skip_release_bitmap, rq->mpwqe.pages_per_wqe); @@ -854,10 +857,10 @@ mlx5e_free_rx_shampo_hd_entry(struct mlx5e_rq *rq, u16 header_index) { struct mlx5e_shampo_hd *shampo = rq->mpwqe.shampo; - if (((header_index + 1) & (MLX5E_SHAMPO_WQ_HEADER_PER_PAGE - 1)) == 0) { + if (((header_index + 1) & (shampo->hd_per_page - 1)) == 0) { struct mlx5e_frag_page *frag_page = mlx5e_shampo_hd_to_frag_page(rq, header_index); - mlx5e_page_release_fragmented(rq, frag_page); + mlx5e_page_release_fragmented(rq->hd_page_pool, frag_page); } clear_bit(header_index, shampo->bitmap); } @@ -1102,6 +1105,8 @@ INDIRECT_CALLABLE_SCOPE bool mlx5e_post_rx_mpwqes(struct mlx5e_rq *rq) if (rq->page_pool) page_pool_nid_changed(rq->page_pool, numa_mem_id()); + if (rq->hd_page_pool) + page_pool_nid_changed(rq->hd_page_pool, 
numa_mem_id()); head = rq->mpwqe.actual_wq_head; i = missing; @@ -1156,8 +1161,9 @@ static void mlx5e_lro_update_tcp_hdr(struct mlx5_cqe64 *cqe, struct tcphdr *tcp) } } -static void mlx5e_lro_update_hdr(struct sk_buff *skb, struct mlx5_cqe64 *cqe, - u32 cqe_bcnt) +static unsigned int mlx5e_lro_update_hdr(struct sk_buff *skb, + struct mlx5_cqe64 *cqe, + u32 cqe_bcnt) { struct ethhdr *eth = (struct ethhdr *)(skb->data); struct tcphdr *tcp; @@ -1207,14 +1213,17 @@ static void mlx5e_lro_update_hdr(struct sk_buff *skb, struct mlx5_cqe64 *cqe, tcp->check = tcp_v6_check(payload_len, &ipv6->saddr, &ipv6->daddr, check); } + + return (unsigned int)((unsigned char *)tcp + tcp->doff * 4 - skb->data); } static void *mlx5e_shampo_get_packet_hd(struct mlx5e_rq *rq, u16 header_index) { struct mlx5e_frag_page *frag_page = mlx5e_shampo_hd_to_frag_page(rq, header_index); - u16 head_offset = mlx5e_shampo_hd_offset(header_index) + rq->buff.headroom; + u16 head_offset = mlx5e_shampo_hd_offset(rq, header_index); + void *addr = page_address(frag_page->page); - return page_address(frag_page->page) + head_offset; + return addr + head_offset + rq->buff.headroom; } static void mlx5e_shampo_update_ipv4_udp_hdr(struct mlx5e_rq *rq, struct iphdr *ipv4) @@ -1563,8 +1572,10 @@ static inline void mlx5e_build_rx_skb(struct mlx5_cqe64 *cqe, mlx5e_macsec_offload_handle_rx_skb(netdev, skb, cqe); if (lro_num_seg > 1) { - mlx5e_lro_update_hdr(skb, cqe, cqe_bcnt); - skb_shinfo(skb)->gso_size = DIV_ROUND_UP(cqe_bcnt, lro_num_seg); + unsigned int hdrlen = mlx5e_lro_update_hdr(skb, cqe, cqe_bcnt); + + skb_shinfo(skb)->gso_size = DIV_ROUND_UP(cqe_bcnt - hdrlen, lro_num_seg); + skb_shinfo(skb)->gso_segs = lro_num_seg; /* Subtract one since we already counted this as one * "regular" packet in mlx5e_complete_rx_cqe() */ @@ -1573,7 +1584,7 @@ static inline void mlx5e_build_rx_skb(struct mlx5_cqe64 *cqe, stats->lro_bytes += cqe_bcnt; } - if (unlikely(mlx5e_rx_hw_stamp(rq->tstamp))) + if 
(unlikely(mlx5e_rx_hw_stamp(rq->hwtstamp_config))) skb_hwtstamps(skb)->hwtstamp = mlx5e_cqe_ts_to_ns(rq->ptp_cyc2time, rq->clock, get_cqe_ts(cqe)); skb_record_rx_queue(skb, rq->ix); @@ -1722,6 +1733,7 @@ mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi struct skb_shared_info *sinfo; u32 frag_consumed_bytes; struct bpf_prog *prog; + u8 nr_frags_free = 0; struct sk_buff *skb; dma_addr_t addr; u32 truesize; @@ -1763,14 +1775,23 @@ mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi } prog = rcu_dereference(rq->xdp_prog); - if (prog && mlx5e_xdp_handle(rq, prog, mxbuf)) { - if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) { - struct mlx5e_wqe_frag_info *pwi; + if (prog) { + u8 old_nr_frags = sinfo->nr_frags; - for (pwi = head_wi; pwi < wi; pwi++) - pwi->frag_page->frags++; + if (mlx5e_xdp_handle(rq, prog, mxbuf)) { + if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, + rq->flags)) { + struct mlx5e_wqe_frag_info *pwi; + + for (pwi = head_wi; pwi < wi; pwi++) + pwi->frag_page->frags++; + } + return NULL; /* page/packet was consumed by XDP */ } - return NULL; /* page/packet was consumed by XDP */ + + nr_frags_free = old_nr_frags - sinfo->nr_frags; + if (unlikely(nr_frags_free)) + truesize -= nr_frags_free * frag_info->frag_stride; } skb = mlx5e_build_linear_skb( @@ -1786,7 +1807,7 @@ mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi if (xdp_buff_has_frags(&mxbuf->xdp)) { /* sinfo->nr_frags is reset by build_skb, calculate again. 
*/ - xdp_update_skb_shared_info(skb, wi - head_wi - 1, + xdp_update_skb_shared_info(skb, wi - head_wi - nr_frags_free - 1, sinfo->xdp_frags_size, truesize, xdp_buff_is_frag_pfmemalloc( &mxbuf->xdp)); @@ -1994,6 +2015,7 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w u32 byte_cnt = cqe_bcnt; struct skb_shared_info *sinfo; unsigned int truesize = 0; + u32 pg_consumed_bytes; struct bpf_prog *prog; struct sk_buff *skb; u32 linear_frame_sz; @@ -2006,7 +2028,8 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w if (prog) { /* area for bpf_xdp_[store|load]_bytes */ net_prefetchw(page_address(frag_page->page) + frag_offset); - if (unlikely(mlx5e_page_alloc_fragmented(rq, &wi->linear_page))) { + if (unlikely(mlx5e_page_alloc_fragmented(rq->page_pool, + &wi->linear_page))) { rq->stats->buff_alloc_err++; return NULL; } @@ -2045,7 +2068,8 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w while (byte_cnt) { /* Non-linear mode, hence non-XSK, which always uses PAGE_SIZE. 
*/ - u32 pg_consumed_bytes = min_t(u32, PAGE_SIZE - frag_offset, byte_cnt); + pg_consumed_bytes = + min_t(u32, PAGE_SIZE - frag_offset, byte_cnt); if (test_bit(MLX5E_RQ_STATE_SHAMPO, &rq->state)) truesize += pg_consumed_bytes; @@ -2061,6 +2085,10 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w } if (prog) { + u8 nr_frags_free, old_nr_frags = sinfo->nr_frags; + u8 new_nr_frags; + u32 len; + if (mlx5e_xdp_handle(rq, prog, mxbuf)) { if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) { struct mlx5e_frag_page *pfp; @@ -2070,28 +2098,39 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w wi->linear_page.frags++; } - mlx5e_page_release_fragmented(rq, &wi->linear_page); + mlx5e_page_release_fragmented(rq->page_pool, + &wi->linear_page); return NULL; /* page/packet was consumed by XDP */ } + new_nr_frags = sinfo->nr_frags; + nr_frags_free = old_nr_frags - new_nr_frags; + if (unlikely(nr_frags_free)) + truesize -= (nr_frags_free - 1) * PAGE_SIZE + + ALIGN(pg_consumed_bytes, + BIT(rq->mpwqe.log_stride_sz)); + + len = mxbuf->xdp.data_end - mxbuf->xdp.data; + skb = mlx5e_build_linear_skb( rq, mxbuf->xdp.data_hard_start, linear_frame_sz, - mxbuf->xdp.data - mxbuf->xdp.data_hard_start, 0, + mxbuf->xdp.data - mxbuf->xdp.data_hard_start, len, mxbuf->xdp.data - mxbuf->xdp.data_meta); if (unlikely(!skb)) { - mlx5e_page_release_fragmented(rq, &wi->linear_page); + mlx5e_page_release_fragmented(rq->page_pool, + &wi->linear_page); return NULL; } skb_mark_for_recycle(skb); wi->linear_page.frags++; - mlx5e_page_release_fragmented(rq, &wi->linear_page); + mlx5e_page_release_fragmented(rq->page_pool, &wi->linear_page); if (xdp_buff_has_frags(&mxbuf->xdp)) { struct mlx5e_frag_page *pagep; /* sinfo->nr_frags is reset by build_skb, calculate again. 
*/ - xdp_update_skb_shared_info(skb, frag_page - head_page, + xdp_update_skb_shared_info(skb, new_nr_frags, sinfo->xdp_frags_size, truesize, xdp_buff_is_frag_pfmemalloc( &mxbuf->xdp)); @@ -2100,8 +2139,11 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w do pagep->frags++; while (++pagep < frag_page); + + headlen = min_t(u16, MLX5E_RX_MAX_HEAD - len, + skb->data_len); + __pskb_pull_tail(skb, headlen); } - __pskb_pull_tail(skb, headlen); } else { dma_addr_t addr; @@ -2193,20 +2235,22 @@ mlx5e_skb_from_cqe_shampo(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, struct mlx5_cqe64 *cqe, u16 header_index) { struct mlx5e_frag_page *frag_page = mlx5e_shampo_hd_to_frag_page(rq, header_index); - dma_addr_t page_dma_addr = page_pool_get_dma_addr(frag_page->page); - u16 head_offset = mlx5e_shampo_hd_offset(header_index); - dma_addr_t dma_addr = page_dma_addr + head_offset; + u16 head_offset = mlx5e_shampo_hd_offset(rq, header_index); + struct mlx5e_shampo_hd *shampo = rq->mpwqe.shampo; u16 head_size = cqe->shampo.header_size; u16 rx_headroom = rq->buff.headroom; struct sk_buff *skb = NULL; void *hdr, *data; u32 frag_size; + dma_addr_t page_dma_addr = page_pool_get_dma_addr(frag_page->page); + dma_addr_t dma_addr = page_dma_addr + head_offset; hdr = page_address(frag_page->page) + head_offset; + data = hdr + rx_headroom; frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + head_size); - if (likely(frag_size <= BIT(MLX5E_SHAMPO_LOG_MAX_HEADER_ENTRY_SIZE))) { + if (likely(frag_size <= BIT(shampo->log_hd_entry_size))) { /* build SKB around header */ dma_sync_single_range_for_cpu(rq->pdev, dma_addr, 0, frag_size, rq->buff.map_dir); net_prefetchw(hdr); @@ -2279,7 +2323,10 @@ mlx5e_hw_gro_skb_has_enough_space(struct sk_buff *skb, u16 data_bcnt) { int nr_frags = skb_shinfo(skb)->nr_frags; - return PAGE_SIZE * nr_frags + data_bcnt <= GRO_LEGACY_MAX_SIZE; + if (PAGE_SIZE >= GRO_LEGACY_MAX_SIZE) + return skb->len + data_bcnt <= GRO_LEGACY_MAX_SIZE; + else + return 
PAGE_SIZE * nr_frags + data_bcnt <= GRO_LEGACY_MAX_SIZE; } static void mlx5e_handle_rx_cqe_mpwrq_shampo(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) @@ -2560,7 +2607,6 @@ static inline void mlx5i_complete_rx_cqe(struct mlx5e_rq *rq, u32 cqe_bcnt, struct sk_buff *skb) { - struct hwtstamp_config *tstamp; struct mlx5e_rq_stats *stats; struct net_device *netdev; struct mlx5e_priv *priv; @@ -2584,7 +2630,6 @@ static inline void mlx5i_complete_rx_cqe(struct mlx5e_rq *rq, } priv = mlx5i_epriv(netdev); - tstamp = &priv->tstamp; stats = &priv->channel_stats[rq->ix]->rq; flags_rqpn = be32_to_cpu(cqe->flags_rqpn); @@ -2620,7 +2665,7 @@ static inline void mlx5i_complete_rx_cqe(struct mlx5e_rq *rq, stats->csum_none++; } - if (unlikely(mlx5e_rx_hw_stamp(tstamp))) + if (unlikely(mlx5e_rx_hw_stamp(&priv->hwtstamp_config))) skb_hwtstamps(skb)->hwtstamp = mlx5e_cqe_ts_to_ns(rq->ptp_cyc2time, rq->clock, get_cqe_ts(cqe)); skb_record_rx_queue(skb, rq->ix); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c index 2f7a543fec..fcad464bc4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c @@ -214,7 +214,7 @@ static int mlx5e_test_loopback_setup(struct mlx5e_priv *priv, return err; } - err = mlx5e_refresh_tirs(priv, true, false); + err = mlx5e_modify_tirs_lb(priv->mdev, true, false); if (err) goto out; @@ -243,7 +243,7 @@ static void mlx5e_test_loopback_cleanup(struct mlx5e_priv *priv, mlx5_nic_vport_update_local_lb(priv->mdev, false); dev_remove_pack(&lbtp->pt); - mlx5e_refresh_tirs(priv, false, false); + mlx5e_modify_tirs_lb(priv->mdev, false, false); } static int mlx5e_cond_loopback(struct mlx5e_priv *priv) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c index 1c121b4350..c6185ddba0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c +++ 
b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c @@ -1466,6 +1466,7 @@ static void fec_set_block_stats(struct mlx5e_priv *priv, case MLX5E_FEC_RS_528_514: case MLX5E_FEC_RS_544_514: case MLX5E_FEC_LLRS_272_257_1: + case MLX5E_FEC_RS_544_514_INTERLEAVED_QUAD: fec_set_rs_stats(fec_stats, out); return; case MLX5E_FEC_FIRECODE: @@ -2424,8 +2425,7 @@ static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(ptp) } if (priv->rx_ptp_opened) { for (i = 0; i < NUM_PTP_RQ_STATS; i++) - ethtool_sprintf(data, ptp_rq_stats_desc[i].format, - MLX5E_PTP_CHANNEL_IX); + ethtool_puts(data, ptp_rq_stats_desc[i].format); } } @@ -2613,6 +2613,7 @@ mlx5e_stats_grp_t mlx5e_nic_stats_grps[] = { #ifdef CONFIG_MLX5_MACSEC &MLX5E_STATS_GRP(macsec_hw), #endif + &MLX5E_STATS_GRP(pcie_cong), }; unsigned int mlx5e_nic_stats_grps_num(struct mlx5e_priv *priv) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h index 8de6fcbd3a..72dbcc1928 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h @@ -54,7 +54,7 @@ #define MLX5E_DECLARE_PTP_TX_STAT(type, fld) "ptp_tx%d_"#fld, offsetof(type, fld) #define MLX5E_DECLARE_PTP_CH_STAT(type, fld) "ptp_ch_"#fld, offsetof(type, fld) #define MLX5E_DECLARE_PTP_CQ_STAT(type, fld) "ptp_cq%d_"#fld, offsetof(type, fld) -#define MLX5E_DECLARE_PTP_RQ_STAT(type, fld) "ptp_rq%d_"#fld, offsetof(type, fld) +#define MLX5E_DECLARE_PTP_RQ_STAT(type, fld) "ptp_rq0_"#fld, offsetof(type, fld) #define MLX5E_DECLARE_QOS_TX_STAT(type, fld) "qos_tx%d_"#fld, offsetof(type, fld) @@ -535,5 +535,6 @@ extern MLX5E_DECLARE_STATS_GRP(ipsec_hw); extern MLX5E_DECLARE_STATS_GRP(ipsec_sw); extern MLX5E_DECLARE_STATS_GRP(ptp); extern MLX5E_DECLARE_STATS_GRP(macsec_hw); +extern MLX5E_DECLARE_STATS_GRP(pcie_cong); #endif /* __MLX5_EN_STATS_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 
f1d908f611..40ae3c6177 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -66,6 +66,7 @@ #include "lib/devcom.h" #include "lib/geneve.h" #include "lib/fs_chains.h" +#include "lib/mlx5.h" #include "diag/en_tc_tracepoint.h" #include #include "lag/lag.h" @@ -757,11 +758,11 @@ static int mlx5e_hairpin_create_indirect_rqt(struct mlx5e_hairpin *hp) struct mlx5e_priv *priv = hp->func_priv; struct mlx5_core_dev *mdev = priv->mdev; struct mlx5e_rss_params_indir indir; + u32 rqt_size; int err; - err = mlx5e_rss_params_indir_init(&indir, mdev, - mlx5e_rqt_size(mdev, hp->num_channels), - mlx5e_rqt_size(mdev, hp->num_channels)); + rqt_size = mlx5e_rqt_size(mdev, hp->num_channels); + err = mlx5e_rss_params_indir_init(&indir, rqt_size, rqt_size); if (err) return err; @@ -837,6 +838,9 @@ static void mlx5e_hairpin_set_ttc_params(struct mlx5e_hairpin *hp, ttc_params->ns_type = MLX5_FLOW_NAMESPACE_KERNEL; for (tt = 0; tt < MLX5_NUM_TT; tt++) { + if (mlx5_ttc_is_decrypted_esp_tt(tt)) + continue; + ttc_params->dests[tt].type = MLX5_FLOW_DESTINATION_TYPE_TIR; ttc_params->dests[tt].tir_num = tt == MLX5_TT_ANY ? 
@@ -2028,9 +2032,8 @@ err_out: return err; } -static bool mlx5_flow_has_geneve_opt(struct mlx5e_tc_flow *flow) +static bool mlx5_flow_has_geneve_opt(struct mlx5_flow_spec *spec) { - struct mlx5_flow_spec *spec = &flow->attr->parse_attr->spec; void *headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters_3); @@ -2069,7 +2072,7 @@ static void mlx5e_tc_del_fdb_flow(struct mlx5e_priv *priv, } complete_all(&flow->del_hw_done); - if (mlx5_flow_has_geneve_opt(flow)) + if (mlx5_flow_has_geneve_opt(&attr->parse_attr->spec)) mlx5_geneve_tlv_option_del(priv->mdev->geneve); if (flow->decap_route) @@ -2144,11 +2147,14 @@ static void mlx5e_tc_del_fdb_peer_flow(struct mlx5e_tc_flow *flow, static void mlx5e_tc_del_fdb_peers_flow(struct mlx5e_tc_flow *flow) { + struct mlx5_devcom_comp_dev *devcom; + struct mlx5_devcom_comp_dev *pos; + struct mlx5_eswitch *peer_esw; int i; - for (i = 0; i < MLX5_MAX_PORTS; i++) { - if (i == mlx5_get_dev_index(flow->priv->mdev)) - continue; + devcom = flow->priv->mdev->priv.eswitch->devcom; + mlx5_devcom_for_each_peer_entry(devcom, peer_esw, pos) { + i = mlx5_get_dev_index(peer_esw->dev); mlx5e_tc_del_fdb_peer_flow(flow, i); } } @@ -2574,12 +2580,13 @@ static int parse_tunnel_attr(struct mlx5e_priv *priv, err = mlx5e_tc_tun_parse(filter_dev, priv, tmp_spec, f, match_level); if (err) { - kvfree(tmp_spec); NL_SET_ERR_MSG_MOD(extack, "Failed to parse tunnel attributes"); netdev_warn(priv->netdev, "Failed to parse tunnel attributes"); - return err; + } else { + err = mlx5e_tc_set_attr_rx_tun(flow, tmp_spec); } - err = mlx5e_tc_set_attr_rx_tun(flow, tmp_spec); + if (mlx5_flow_has_geneve_opt(tmp_spec)) + mlx5_geneve_tlv_option_del(priv->mdev->geneve); kvfree(tmp_spec); if (err) return err; @@ -3610,15 +3617,11 @@ static bool same_port_devs(struct mlx5e_priv *priv, struct mlx5e_priv *peer_priv bool mlx5e_same_hw_devs(struct mlx5e_priv *priv, struct mlx5e_priv *peer_priv) { struct mlx5_core_dev *fmdev, *pmdev; - u64 fsystem_guid, 
psystem_guid; fmdev = priv->mdev; pmdev = peer_priv->mdev; - fsystem_guid = mlx5_query_nic_system_image_guid(fmdev); - psystem_guid = mlx5_query_nic_system_image_guid(pmdev); - - return (fsystem_guid == psystem_guid); + return mlx5_same_hw_devs(fmdev, pmdev); } static int @@ -5233,10 +5236,11 @@ static void mlx5e_tc_nic_destroy_miss_table(struct mlx5e_priv *priv) int mlx5e_tc_nic_init(struct mlx5e_priv *priv) { struct mlx5e_tc_table *tc = mlx5e_fs_get_tc(priv->fs); + u8 mapping_id[MLX5_SW_IMAGE_GUID_MAX_BYTES]; struct mlx5_core_dev *dev = priv->mdev; struct mapping_ctx *chains_mapping; struct mlx5_chains_attr attr = {}; - u64 mapping_id; + u8 id_len; int err; mlx5e_mod_hdr_tbl_init(&tc->mod_hdr); @@ -5252,11 +5256,13 @@ int mlx5e_tc_nic_init(struct mlx5e_priv *priv) lockdep_set_class(&tc->ht.mutex, &tc_ht_lock_key); lockdep_init_map(&tc->ht.run_work.lockdep_map, "tc_ht_wq_key", &tc_ht_wq_key, 0); - mapping_id = mlx5_query_nic_system_image_guid(dev); + mlx5_query_nic_sw_system_image_guid(dev, mapping_id, &id_len); - chains_mapping = mapping_create_for_id(mapping_id, MAPPING_TYPE_CHAIN, + chains_mapping = mapping_create_for_id(mapping_id, id_len, + MAPPING_TYPE_CHAIN, sizeof(struct mlx5_mapped_obj), - MLX5E_TC_TABLE_CHAIN_TAG_MASK, true); + MLX5E_TC_TABLE_CHAIN_TAG_MASK, + true); if (IS_ERR(chains_mapping)) { err = PTR_ERR(chains_mapping); @@ -5387,13 +5393,15 @@ void mlx5e_tc_ht_cleanup(struct rhashtable *tc_ht) int mlx5e_tc_esw_init(struct mlx5_rep_uplink_priv *uplink_priv) { const size_t sz_enc_opts = sizeof(struct tunnel_match_enc_opts); + u8 mapping_id[MLX5_SW_IMAGE_GUID_MAX_BYTES]; + struct mlx5_devcom_match_attr attr = {}; struct netdev_phys_item_id ppid; struct mlx5e_rep_priv *rpriv; struct mapping_ctx *mapping; struct mlx5_eswitch *esw; struct mlx5e_priv *priv; - u64 mapping_id, key; int err = 0; + u8 id_len; rpriv = container_of(uplink_priv, struct mlx5e_rep_priv, uplink_priv); priv = netdev_priv(rpriv->netdev); @@ -5411,9 +5419,9 @@ int 
mlx5e_tc_esw_init(struct mlx5_rep_uplink_priv *uplink_priv) uplink_priv->tc_psample = mlx5e_tc_sample_init(esw, uplink_priv->post_act); - mapping_id = mlx5_query_nic_system_image_guid(esw->dev); + mlx5_query_nic_sw_system_image_guid(esw->dev, mapping_id, &id_len); - mapping = mapping_create_for_id(mapping_id, MAPPING_TYPE_TUNNEL, + mapping = mapping_create_for_id(mapping_id, id_len, MAPPING_TYPE_TUNNEL, sizeof(struct tunnel_match_key), TUNNEL_INFO_BITS_MASK, true); @@ -5426,8 +5434,10 @@ int mlx5e_tc_esw_init(struct mlx5_rep_uplink_priv *uplink_priv) /* Two last values are reserved for stack devices slow path table mark * and bridge ingress push mark. */ - mapping = mapping_create_for_id(mapping_id, MAPPING_TYPE_TUNNEL_ENC_OPTS, - sz_enc_opts, ENC_OPTS_BITS_MASK - 2, true); + mapping = mapping_create_for_id(mapping_id, id_len, + MAPPING_TYPE_TUNNEL_ENC_OPTS, + sz_enc_opts, ENC_OPTS_BITS_MASK - 2, + true); if (IS_ERR(mapping)) { err = PTR_ERR(mapping); goto err_enc_opts_mapping; @@ -5448,8 +5458,10 @@ int mlx5e_tc_esw_init(struct mlx5_rep_uplink_priv *uplink_priv) err = dev_get_port_parent_id(priv->netdev, &ppid, false); if (!err) { - memcpy(&key, &ppid.id, sizeof(key)); - mlx5_esw_offloads_devcom_init(esw, key); + memcpy(&attr.key.buf, &ppid.id, ppid.id_len); + attr.flags = MLX5_DEVCOM_MATCH_FLAGS_NS; + attr.net = mlx5_core_net(esw->dev); + mlx5_esw_offloads_devcom_init(esw, &attr); } return 0; @@ -5504,12 +5516,16 @@ int mlx5e_tc_num_filters(struct mlx5e_priv *priv, unsigned long flags) void mlx5e_tc_clean_fdb_peer_flows(struct mlx5_eswitch *esw) { + struct mlx5_devcom_comp_dev *devcom; + struct mlx5_devcom_comp_dev *pos; struct mlx5e_tc_flow *flow, *tmp; + struct mlx5_eswitch *peer_esw; int i; - for (i = 0; i < MLX5_MAX_PORTS; i++) { - if (i == mlx5_get_dev_index(esw->dev)) - continue; + devcom = esw->devcom; + + mlx5_devcom_for_each_peer_entry(devcom, peer_esw, pos) { + i = mlx5_get_dev_index(peer_esw->dev); list_for_each_entry_safe(flow, tmp, 
&esw->offloads.peer_flows[i], peer[i]) mlx5e_tc_del_fdb_peers_flow(flow); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index 4fd853d19e..5d12f19dfe 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -256,8 +256,7 @@ mlx5e_tx_wqe_inline_mode(struct mlx5e_txqsq *sq, struct sk_buff *skb, mode = sq->min_inline_mode; - if (skb_vlan_tag_present(skb) && - test_bit(MLX5E_SQ_STATE_VLAN_NEED_L2_INLINE, &sq->state)) + if (skb_vlan_tag_present(skb)) mode = max_t(u8, MLX5_INLINE_MODE_L2, mode); return mode; @@ -337,10 +336,11 @@ static void mlx5e_sq_calc_wqe_attr(struct sk_buff *skb, const struct mlx5e_tx_at }; } -static void mlx5e_tx_skb_update_hwts_flags(struct sk_buff *skb) +static void mlx5e_tx_skb_update_ts_flags(struct sk_buff *skb) { if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; + skb_tx_timestamp(skb); } static void mlx5e_tx_check_stop(struct mlx5e_txqsq *sq) @@ -392,7 +392,7 @@ mlx5e_txwqe_complete(struct mlx5e_txqsq *sq, struct sk_buff *skb, cseg->opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | attr->opcode); cseg->qpn_ds = cpu_to_be32((sq->sqn << 8) | wqe_attr->ds_cnt); - mlx5e_tx_skb_update_hwts_flags(skb); + mlx5e_tx_skb_update_ts_flags(skb); sq->pc += wi->num_wqebbs; @@ -482,12 +482,6 @@ mlx5e_sq_xmit_wqe(struct mlx5e_txqsq *sq, struct sk_buff *skb, } eseg->inline_hdr.sz |= cpu_to_be16(ihs); dseg += wqe_attr->ds_cnt_inl; - } else if (skb_vlan_tag_present(skb)) { - eseg->insert.type = cpu_to_be16(MLX5_ETH_WQE_INSERT_VLAN); - if (skb->vlan_proto == cpu_to_be16(ETH_P_8021AD)) - eseg->insert.type |= cpu_to_be16(MLX5_ETH_WQE_SVLAN); - eseg->insert.vlan_tci = cpu_to_be16(skb_vlan_tag_get(skb)); - stats->added_vlan_packets++; } dseg += wqe_attr->ds_cnt_ids; @@ -625,7 +619,7 @@ mlx5e_sq_xmit_mpwqe(struct mlx5e_txqsq *sq, struct sk_buff *skb, mlx5e_dma_push(sq, txd.dma_addr, 
txd.len, MLX5E_DMA_MAP_SINGLE); mlx5e_skb_fifo_push(&sq->db.skb_fifo, skb); mlx5e_tx_mpwqe_add_dseg(sq, &txd); - mlx5e_tx_skb_update_hwts_flags(skb); + mlx5e_tx_skb_update_ts_flags(skb); if (unlikely(mlx5e_tx_mpwqe_is_full(&sq->mpwqe))) { /* Might stop the queue and affect the retval of __netdev_tx_sent_queue. */ @@ -659,7 +653,7 @@ static void mlx5e_cqe_ts_id_eseg(struct mlx5e_ptpsq *ptpsq, struct sk_buff *skb, struct mlx5_wqe_eth_seg *eseg) { if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) - eseg->flow_table_metadata = + eseg->flow_table_metadata |= cpu_to_be32(mlx5e_ptp_metadata_fifo_peek(&ptpsq->metadata_freelist)); } @@ -755,7 +749,7 @@ static void mlx5e_consume_skb(struct mlx5e_txqsq *sq, struct sk_buff *skb, hwts.hwtstamp = mlx5e_cqe_ts_to_ns(sq->ptp_cyc2time, sq->clock, ts); if (sq->ptpsq) { mlx5e_skb_cb_hwtstamp_handler(skb, MLX5E_SKB_CB_CQE_HWTSTAMP, - hwts.hwtstamp, sq->ptpsq->cq_stats); + hwts.hwtstamp, sq->ptpsq); } else { skb_tstamp_tx(skb, &hwts); sq->stats->timestamps++; @@ -939,7 +933,11 @@ void mlx5e_free_txqsq_descs(struct mlx5e_txqsq *sq) sq->dma_fifo_cc = dma_fifo_cc; sq->cc = sqcc; - netdev_tx_completed_queue(sq->txq, npkts, nbytes); + /* Do not update BQL for TXQs that got replaced by new active ones, as + * netdev_tx_reset_queue() is called for them in mlx5e_activate_txqsq(). + */ + if (sq == sq->priv->txq2sq[sq->txq_ix]) + netdev_tx_completed_queue(sq->txq, npkts, nbytes); } #ifdef CONFIG_MLX5_CORE_IPOIB diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index dfb079e59d..25499da177 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -32,9 +32,7 @@ enum { MLX5_EQ_STATE_ALWAYS_ARMED = 0xb, }; -enum { - MLX5_EQ_DOORBEL_OFFSET = 0x40, -}; +#define MLX5_EQ_DOORBELL_OFFSET 0x40 /* budget must be smaller than MLX5_NUM_SPARE_EQE to guarantee that we update * the ci before we polled all the entries in the EQ. 
MLX5_NUM_SPARE_EQE is @@ -309,7 +307,7 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, eqc = MLX5_ADDR_OF(create_eq_in, in, eq_context_entry); MLX5_SET(eqc, eqc, log_eq_size, eq->fbc.log_sz); - MLX5_SET(eqc, eqc, uar_page, priv->uar->index); + MLX5_SET(eqc, eqc, uar_page, priv->bfreg.up->index); MLX5_SET(eqc, eqc, intr, vecidx); MLX5_SET(eqc, eqc, log_page_size, eq->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); @@ -322,7 +320,7 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, eq->eqn = MLX5_GET(create_eq_out, out, eq_number); eq->irqn = pci_irq_vector(dev->pdev, vecidx); eq->dev = dev; - eq->doorbell = priv->uar->map + MLX5_EQ_DOORBEL_OFFSET; + eq->doorbell = priv->bfreg.up->map + MLX5_EQ_DOORBELL_OFFSET; err = mlx5_debug_eq_add(dev, eq); if (err) @@ -585,6 +583,9 @@ static void gather_async_events_mask(struct mlx5_core_dev *dev, u64 mask[4]) async_event_mask |= (1ull << MLX5_EVENT_TYPE_OBJECT_CHANGE); + if (mlx5_pcie_cong_event_supported(dev)) + async_event_mask |= (1ull << MLX5_EVENT_TYPE_OBJECT_CHANGE); + mask[0] = async_event_mask; if (MLX5_CAP_GEN(dev, event_cap)) @@ -873,19 +874,25 @@ static int comp_irq_request_sf(struct mlx5_core_dev *dev, u16 vecidx) { struct mlx5_irq_pool *pool = mlx5_irq_table_get_comp_irq_pool(dev); struct mlx5_eq_table *table = dev->priv.eq_table; - struct irq_affinity_desc af_desc = {}; + struct irq_affinity_desc *af_desc; struct mlx5_irq *irq; - /* In case SF irq pool does not exist, fallback to the PF irqs*/ + /* In case SF irq pool does not exist, fallback to the PF irqs */ if (!mlx5_irq_pool_is_sf_pool(pool)) return comp_irq_request_pci(dev, vecidx); - af_desc.is_managed = false; - cpumask_copy(&af_desc.mask, cpu_online_mask); - cpumask_andnot(&af_desc.mask, &af_desc.mask, &table->used_cpus); - irq = mlx5_irq_affinity_request(dev, pool, &af_desc); - if (IS_ERR(irq)) + af_desc = kvzalloc(sizeof(*af_desc), GFP_KERNEL); + if (!af_desc) + return -ENOMEM; + + af_desc->is_managed = false; + 
cpumask_copy(&af_desc->mask, cpu_online_mask); + cpumask_andnot(&af_desc->mask, &af_desc->mask, &table->used_cpus); + irq = mlx5_irq_affinity_request(dev, pool, af_desc); + if (IS_ERR(irq)) { + kvfree(af_desc); return PTR_ERR(irq); + } cpumask_or(&table->used_cpus, &table->used_cpus, mlx5_irq_get_affinity_mask(irq)); mlx5_core_dbg(pool->dev, "IRQ %u mapped to cpu %*pbl, %u EQs on this irq\n", @@ -893,6 +900,8 @@ static int comp_irq_request_sf(struct mlx5_core_dev *dev, u16 vecidx) cpumask_pr_args(mlx5_irq_get_affinity_mask(irq)), mlx5_irq_read_locked(irq) / MLX5_EQ_REFS_PER_IRQ); + kvfree(af_desc); + return xa_err(xa_store(&table->comp_irqs, vecidx, irq, GFP_KERNEL)); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c index 7dd1dc3f77..c9a1654d83 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c @@ -87,8 +87,8 @@ int esw_acl_egress_lgcy_setup(struct mlx5_eswitch *esw, drop_counter = mlx5_fc_create(esw->dev, false); if (IS_ERR(drop_counter)) { esw_warn(esw->dev, - "vport[%d] configure egress drop rule counter err(%ld)\n", - vport->vport, PTR_ERR(drop_counter)); + "vport[%d] configure egress drop rule counter err(%pe)\n", + vport->vport, drop_counter); drop_counter = NULL; } vport->egress.legacy.drop_counter = drop_counter; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c index 1c37098e09..49a637829c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c @@ -188,7 +188,7 @@ int esw_acl_ingress_lgcy_setup(struct mlx5_eswitch *esw, if (IS_ERR(vport->ingress.acl)) { err = PTR_ERR(vport->ingress.acl); vport->ingress.acl = NULL; - return err; + goto out; } err = esw_acl_ingress_lgcy_groups_create(esw, vport); diff 
--git a/drivers/net/ethernet/mellanox/mlx5/core/esw/adj_vport.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/adj_vport.c new file mode 100644 index 0000000000..250af09b5a --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/adj_vport.c @@ -0,0 +1,202 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +#include "fs_core.h" +#include "eswitch.h" + +int mlx5_esw_adj_vport_modify(struct mlx5_core_dev *dev, u16 vport, + bool connect) +{ + u32 in[MLX5_ST_SZ_DW(modify_vport_state_in)] = {}; + + MLX5_SET(modify_vport_state_in, in, opcode, + MLX5_CMD_OP_MODIFY_VPORT_STATE); + MLX5_SET(modify_vport_state_in, in, op_mod, + MLX5_VPORT_STATE_OP_MOD_ESW_VPORT); + MLX5_SET(modify_vport_state_in, in, other_vport, 1); + MLX5_SET(modify_vport_state_in, in, vport_number, vport); + MLX5_SET(modify_vport_state_in, in, ingress_connect_valid, 1); + MLX5_SET(modify_vport_state_in, in, egress_connect_valid, 1); + MLX5_SET(modify_vport_state_in, in, ingress_connect, connect); + MLX5_SET(modify_vport_state_in, in, egress_connect, connect); + MLX5_SET(modify_vport_state_in, in, admin_state, connect); + return mlx5_cmd_exec_in(dev, modify_vport_state, in); +} + +static void mlx5_esw_destroy_esw_vport(struct mlx5_core_dev *dev, u16 vport) +{ + u32 in[MLX5_ST_SZ_DW(destroy_esw_vport_in)] = {}; + + MLX5_SET(destroy_esw_vport_in, in, opcode, + MLX5_CMD_OPCODE_DESTROY_ESW_VPORT); + MLX5_SET(destroy_esw_vport_in, in, vport_num, vport); + + mlx5_cmd_exec_in(dev, destroy_esw_vport, in); +} + +static int mlx5_esw_create_esw_vport(struct mlx5_core_dev *dev, u16 vhca_id, + u16 *vport_num) +{ + u32 out[MLX5_ST_SZ_DW(create_esw_vport_out)] = {}; + u32 in[MLX5_ST_SZ_DW(create_esw_vport_in)] = {}; + int err; + + MLX5_SET(create_esw_vport_in, in, opcode, + MLX5_CMD_OPCODE_CREATE_ESW_VPORT); + MLX5_SET(create_esw_vport_in, in, managed_vhca_id, vhca_id); + + err = mlx5_cmd_exec(dev, in, sizeof(in), out, 
sizeof(out)); + if (!err) + *vport_num = MLX5_GET(create_esw_vport_out, out, vport_num); + + return err; +} + +static int mlx5_esw_adj_vport_create(struct mlx5_eswitch *esw, u16 vhca_id, + const void *rid_info_reg) +{ + struct mlx5_vport *vport; + u16 vport_num; + int err; + + err = mlx5_esw_create_esw_vport(esw->dev, vhca_id, &vport_num); + if (err) { + esw_warn(esw->dev, + "Failed to create adjacent vport for vhca_id %d, err %d\n", + vhca_id, err); + return err; + } + + esw_debug(esw->dev, "Created adjacent vport[%d] %d for vhca_id 0x%x\n", + esw->last_vport_idx, vport_num, vhca_id); + + err = mlx5_esw_vport_alloc(esw, esw->last_vport_idx++, vport_num); + if (err) + goto destroy_esw_vport; + + xa_set_mark(&esw->vports, vport_num, MLX5_ESW_VPT_VF); + vport = mlx5_eswitch_get_vport(esw, vport_num); + vport->adjacent = true; + vport->vhca_id = vhca_id; + + vport->adj_info.parent_pci_devfn = + MLX5_GET(function_vhca_rid_info_reg, rid_info_reg, + parent_pci_device_function); + vport->adj_info.function_id = + MLX5_GET(function_vhca_rid_info_reg, rid_info_reg, function_id); + + mlx5_fs_vport_egress_acl_ns_add(esw->dev->priv.steering, vport->index); + mlx5_fs_vport_ingress_acl_ns_add(esw->dev->priv.steering, vport->index); + err = mlx5_esw_offloads_rep_add(esw, vport); + if (err) + goto acl_ns_remove; + + return 0; + +acl_ns_remove: + mlx5_fs_vport_ingress_acl_ns_remove(esw->dev->priv.steering, + vport->index); + mlx5_fs_vport_egress_acl_ns_remove(esw->dev->priv.steering, + vport->index); + mlx5_esw_vport_free(esw, vport); +destroy_esw_vport: + mlx5_esw_destroy_esw_vport(esw->dev, vport_num); + return err; +} + +static void mlx5_esw_adj_vport_destroy(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + u16 vport_num = vport->vport; + + esw_debug(esw->dev, "Destroying adjacent vport %d for vhca_id 0x%x\n", + vport_num, vport->vhca_id); + + mlx5_esw_offloads_rep_remove(esw, vport); + mlx5_fs_vport_egress_acl_ns_remove(esw->dev->priv.steering, + vport->index); + 
mlx5_fs_vport_ingress_acl_ns_remove(esw->dev->priv.steering, + vport->index); + mlx5_esw_vport_free(esw, vport); + /* Reset the vport index back so new adj vports can use this index. + * When vport count can incrementally change, this needs to be modified. + */ + esw->last_vport_idx--; + mlx5_esw_destroy_esw_vport(esw->dev, vport_num); +} + +void mlx5_esw_adjacent_vhcas_cleanup(struct mlx5_eswitch *esw) +{ + struct mlx5_vport *vport; + unsigned long i; + + if (!MLX5_CAP_GEN_2(esw->dev, delegated_vhca_max)) + return; + + mlx5_esw_for_each_vf_vport(esw, i, vport, U16_MAX) { + if (!vport->adjacent) + continue; + mlx5_esw_adj_vport_destroy(esw, vport); + } +} + +void mlx5_esw_adjacent_vhcas_setup(struct mlx5_eswitch *esw) +{ + u32 delegated_vhca_max = MLX5_CAP_GEN_2(esw->dev, delegated_vhca_max); + u32 in[MLX5_ST_SZ_DW(query_delegated_vhca_in)] = {}; + int outlen, err, i = 0; + u8 *out; + u32 count; + + if (!delegated_vhca_max) + return; + + outlen = MLX5_ST_SZ_BYTES(query_delegated_vhca_out) + + delegated_vhca_max * + MLX5_ST_SZ_BYTES(delegated_function_vhca_rid_info); + + esw_debug(esw->dev, "delegated_vhca_max=%d\n", delegated_vhca_max); + + out = kvzalloc(outlen, GFP_KERNEL); + if (!out) + return; + + MLX5_SET(query_delegated_vhca_in, in, opcode, + MLX5_CMD_OPCODE_QUERY_DELEGATED_VHCA); + + err = mlx5_cmd_exec(esw->dev, in, sizeof(in), out, outlen); + if (err) { + kvfree(out); + esw_warn(esw->dev, "Failed to query delegated vhca, err %d\n", + err); + return; + } + + count = MLX5_GET(query_delegated_vhca_out, out, functions_count); + esw_debug(esw->dev, "Delegated vhca functions count %d\n", count); + + for (i = 0; i < count; i++) { + const void *rid_info, *rid_info_reg; + u16 vhca_id; + + rid_info = MLX5_ADDR_OF(query_delegated_vhca_out, out, + delegated_function_vhca_rid_info[i]); + + rid_info_reg = MLX5_ADDR_OF(delegated_function_vhca_rid_info, + rid_info, function_vhca_rid_info); + + vhca_id = MLX5_GET(function_vhca_rid_info_reg, rid_info_reg, + vhca_id); + 
esw_debug(esw->dev, "Delegating vhca_id 0x%x\n", vhca_id); + + err = mlx5_esw_adj_vport_create(esw, vhca_id, rid_info_reg); + if (err) { + esw_warn(esw->dev, + "Failed to init adjacent vhca 0x%x, err %d\n", + vhca_id, err); + break; + } + } + + kvfree(out); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c index 76e35c827d..60e1004777 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c @@ -81,7 +81,8 @@ mlx5_esw_bridge_table_create(int max_fte, u32 level, struct mlx5_eswitch *esw) ft_attr.prio = FDB_BR_OFFLOAD; fdb = mlx5_create_flow_table(ns, &ft_attr); if (IS_ERR(fdb)) - esw_warn(dev, "Failed to create bridge FDB Table (err=%ld)\n", PTR_ERR(fdb)); + esw_warn(dev, "Failed to create bridge FDB Table (err=%pe)\n", + fdb); return fdb; } @@ -121,8 +122,8 @@ mlx5_esw_bridge_ingress_vlan_proto_fg_create(unsigned int from, unsigned int to, kvfree(in); if (IS_ERR(fg)) esw_warn(esw->dev, - "Failed to create VLAN(proto=%x) flow group for bridge ingress table (err=%ld)\n", - vlan_proto, PTR_ERR(fg)); + "Failed to create VLAN(proto=%x) flow group for bridge ingress table (err=%pe)\n", + vlan_proto, fg); return fg; } @@ -180,8 +181,8 @@ mlx5_esw_bridge_ingress_vlan_proto_filter_fg_create(unsigned int from, unsigned fg = mlx5_create_flow_group(ingress_ft, in); if (IS_ERR(fg)) esw_warn(esw->dev, - "Failed to create bridge ingress table VLAN filter flow group (err=%ld)\n", - PTR_ERR(fg)); + "Failed to create bridge ingress table VLAN filter flow group (err=%pe)\n", + fg); kvfree(in); return fg; } @@ -237,8 +238,8 @@ mlx5_esw_bridge_ingress_mac_fg_create(struct mlx5_eswitch *esw, struct mlx5_flow fg = mlx5_create_flow_group(ingress_ft, in); if (IS_ERR(fg)) esw_warn(esw->dev, - "Failed to create MAC flow group for bridge ingress table (err=%ld)\n", - PTR_ERR(fg)); + "Failed to create MAC flow group for bridge ingress table (err=%pe)\n", 
+ fg); kvfree(in); return fg; @@ -274,8 +275,8 @@ mlx5_esw_bridge_egress_vlan_proto_fg_create(unsigned int from, unsigned int to, fg = mlx5_create_flow_group(egress_ft, in); if (IS_ERR(fg)) esw_warn(esw->dev, - "Failed to create VLAN flow group for bridge egress table (err=%ld)\n", - PTR_ERR(fg)); + "Failed to create VLAN flow group for bridge egress table (err=%pe)\n", + fg); kvfree(in); return fg; } @@ -324,8 +325,8 @@ mlx5_esw_bridge_egress_mac_fg_create(struct mlx5_eswitch *esw, struct mlx5_flow_ fg = mlx5_create_flow_group(egress_ft, in); if (IS_ERR(fg)) esw_warn(esw->dev, - "Failed to create bridge egress table MAC flow group (err=%ld)\n", - PTR_ERR(fg)); + "Failed to create bridge egress table MAC flow group (err=%pe)\n", + fg); kvfree(in); return fg; } @@ -354,8 +355,8 @@ mlx5_esw_bridge_egress_miss_fg_create(struct mlx5_eswitch *esw, struct mlx5_flow fg = mlx5_create_flow_group(egress_ft, in); if (IS_ERR(fg)) esw_warn(esw->dev, - "Failed to create bridge egress table miss flow group (err=%ld)\n", - PTR_ERR(fg)); + "Failed to create bridge egress table miss flow group (err=%pe)\n", + fg); kvfree(in); return fg; } @@ -501,8 +502,8 @@ mlx5_esw_bridge_egress_table_init(struct mlx5_esw_bridge_offloads *br_offloads, if (mlx5_esw_bridge_pkt_reformat_vlan_pop_supported(esw)) { miss_fg = mlx5_esw_bridge_egress_miss_fg_create(esw, egress_ft); if (IS_ERR(miss_fg)) { - esw_warn(esw->dev, "Failed to create miss flow group (err=%ld)\n", - PTR_ERR(miss_fg)); + esw_warn(esw->dev, "Failed to create miss flow group (err=%pe)\n", + miss_fg); miss_fg = NULL; goto skip_miss_flow; } @@ -510,8 +511,8 @@ mlx5_esw_bridge_egress_table_init(struct mlx5_esw_bridge_offloads *br_offloads, miss_pkt_reformat = mlx5_esw_bridge_pkt_reformat_vlan_pop_create(esw); if (IS_ERR(miss_pkt_reformat)) { esw_warn(esw->dev, - "Failed to alloc packet reformat REMOVE_HEADER (err=%ld)\n", - PTR_ERR(miss_pkt_reformat)); + "Failed to alloc packet reformat REMOVE_HEADER (err=%pe)\n", + miss_pkt_reformat); 
miss_pkt_reformat = NULL; mlx5_destroy_flow_group(miss_fg); miss_fg = NULL; @@ -522,8 +523,8 @@ mlx5_esw_bridge_egress_table_init(struct mlx5_esw_bridge_offloads *br_offloads, br_offloads->skip_ft, miss_pkt_reformat); if (IS_ERR(miss_handle)) { - esw_warn(esw->dev, "Failed to create miss flow (err=%ld)\n", - PTR_ERR(miss_handle)); + esw_warn(esw->dev, "Failed to create miss flow (err=%pe)\n", + miss_handle); miss_handle = NULL; mlx5_packet_reformat_dealloc(esw->dev, miss_pkt_reformat); miss_pkt_reformat = NULL; @@ -1048,8 +1049,8 @@ mlx5_esw_bridge_vlan_push_create(u16 vlan_proto, struct mlx5_esw_bridge_vlan *vl &reformat_params, MLX5_FLOW_NAMESPACE_FDB); if (IS_ERR(pkt_reformat)) { - esw_warn(esw->dev, "Failed to alloc packet reformat INSERT_HEADER (err=%ld)\n", - PTR_ERR(pkt_reformat)); + esw_warn(esw->dev, "Failed to alloc packet reformat INSERT_HEADER (err=%pe)\n", + pkt_reformat); return PTR_ERR(pkt_reformat); } @@ -1076,8 +1077,8 @@ mlx5_esw_bridge_vlan_pop_create(struct mlx5_esw_bridge_vlan *vlan, struct mlx5_e pkt_reformat = mlx5_esw_bridge_pkt_reformat_vlan_pop_create(esw); if (IS_ERR(pkt_reformat)) { - esw_warn(esw->dev, "Failed to alloc packet reformat REMOVE_HEADER (err=%ld)\n", - PTR_ERR(pkt_reformat)); + esw_warn(esw->dev, "Failed to alloc packet reformat REMOVE_HEADER (err=%pe)\n", + pkt_reformat); return PTR_ERR(pkt_reformat); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c index b7102e14d2..89a58dee50 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c @@ -7,11 +7,7 @@ static void mlx5_esw_get_port_parent_id(struct mlx5_core_dev *dev, struct netdev_phys_item_id *ppid) { - u64 parent_id; - - parent_id = mlx5_query_nic_system_image_guid(dev); - ppid->id_len = sizeof(parent_id); - memcpy(ppid->id, &parent_id, sizeof(parent_id)); + mlx5_query_nic_sw_system_image_guid(dev, ppid->id, 
&ppid->id_len); } static bool mlx5_esw_devlink_port_supported(struct mlx5_eswitch *esw, u16 vport_num) @@ -27,6 +23,7 @@ static void mlx5_esw_offloads_pf_vf_devlink_port_attrs_set(struct mlx5_eswitch * { struct mlx5_core_dev *dev = esw->dev; struct netdev_phys_item_id ppid = {}; + struct mlx5_vport *vport; u32 controller_num = 0; bool external; u16 pfnum; @@ -42,15 +39,25 @@ static void mlx5_esw_offloads_pf_vf_devlink_port_attrs_set(struct mlx5_eswitch * dl_port->attrs.switch_id.id_len = ppid.id_len; devlink_port_attrs_pci_pf_set(dl_port, controller_num, pfnum, external); } else if (mlx5_eswitch_is_vf_vport(esw, vport_num)) { + u16 func_id = vport_num - 1; + + vport = mlx5_eswitch_get_vport(esw, vport_num); memcpy(dl_port->attrs.switch_id.id, ppid.id, ppid.id_len); dl_port->attrs.switch_id.id_len = ppid.id_len; + if (vport->adjacent) { + func_id = vport->adj_info.function_id; + pfnum = vport->adj_info.parent_pci_devfn; + } + devlink_port_attrs_pci_vf_set(dl_port, controller_num, pfnum, - vport_num - 1, external); + func_id, external); } else if (mlx5_core_is_ec_vf_vport(esw->dev, vport_num)) { + u16 base_vport = mlx5_core_ec_vf_vport_base(dev); + memcpy(dl_port->attrs.switch_id.id, ppid.id, ppid.id_len); dl_port->attrs.switch_id.id_len = ppid.id_len; devlink_port_attrs_pci_vf_set(dl_port, 0, pfnum, - vport_num - 1, false); + vport_num - base_vport, false); } } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/legacy.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/legacy.c index 76382626ad..929adeb50a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/legacy.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/legacy.c @@ -66,7 +66,6 @@ static void esw_destroy_legacy_fdb_table(struct mlx5_eswitch *esw) esw->fdb_table.legacy.addr_grp = NULL; esw->fdb_table.legacy.allmulti_grp = NULL; esw->fdb_table.legacy.promisc_grp = NULL; - atomic64_set(&esw->user_count, 0); } static int esw_create_legacy_fdb_table(struct mlx5_eswitch *esw) diff --git 
a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c index ad9f6fca9b..2e11574b3a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c @@ -64,11 +64,19 @@ static void esw_qos_domain_release(struct mlx5_eswitch *esw) enum sched_node_type { SCHED_NODE_TYPE_VPORTS_TSAR, SCHED_NODE_TYPE_VPORT, + SCHED_NODE_TYPE_TC_ARBITER_TSAR, + SCHED_NODE_TYPE_RATE_LIMITER, + SCHED_NODE_TYPE_VPORT_TC, + SCHED_NODE_TYPE_VPORTS_TC_TSAR, }; static const char * const sched_node_type_str[] = { [SCHED_NODE_TYPE_VPORTS_TSAR] = "vports TSAR", [SCHED_NODE_TYPE_VPORT] = "vport", + [SCHED_NODE_TYPE_TC_ARBITER_TSAR] = "TC Arbiter TSAR", + [SCHED_NODE_TYPE_RATE_LIMITER] = "Rate Limiter", + [SCHED_NODE_TYPE_VPORT_TC] = "vport TC", + [SCHED_NODE_TYPE_VPORTS_TC_TSAR] = "vports TC TSAR", }; struct mlx5_esw_sched_node { @@ -92,6 +100,10 @@ struct mlx5_esw_sched_node { struct mlx5_vport *vport; /* Level in the hierarchy. The root node level is 1. */ u8 level; + /* Valid only when this node represents a traffic class. */ + u8 tc; + /* Valid only for a TC arbiter node or vport TC arbiter. */ + u32 tc_bw[DEVLINK_RATE_TCS_MAX]; }; static void esw_qos_node_attach_to_parent(struct mlx5_esw_sched_node *node) @@ -106,6 +118,13 @@ static void esw_qos_node_attach_to_parent(struct mlx5_esw_sched_node *node) } } +static int esw_qos_num_tcs(struct mlx5_core_dev *dev) +{ + int num_tcs = mlx5_max_tc(dev) + 1; + + return num_tcs < DEVLINK_RATE_TCS_MAX ? 
num_tcs : DEVLINK_RATE_TCS_MAX; +} + static void esw_qos_node_set_parent(struct mlx5_esw_sched_node *node, struct mlx5_esw_sched_node *parent) { @@ -116,8 +135,38 @@ esw_qos_node_set_parent(struct mlx5_esw_sched_node *node, struct mlx5_esw_sched_ esw_qos_node_attach_to_parent(node); } +static void esw_qos_nodes_set_parent(struct list_head *nodes, + struct mlx5_esw_sched_node *parent) +{ + struct mlx5_esw_sched_node *node, *tmp; + + list_for_each_entry_safe(node, tmp, nodes, entry) { + esw_qos_node_set_parent(node, parent); + if (!list_empty(&node->children) && + parent->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) { + struct mlx5_esw_sched_node *child; + + list_for_each_entry(child, &node->children, entry) { + struct mlx5_vport *vport = child->vport; + + if (vport) + vport->qos.sched_node->parent = parent; + } + } + } +} + void mlx5_esw_qos_vport_qos_free(struct mlx5_vport *vport) { + if (vport->qos.sched_nodes) { + int num_tcs = esw_qos_num_tcs(vport->qos.sched_node->esw->dev); + int i; + + for (i = 0; i < num_tcs; i++) + kfree(vport->qos.sched_nodes[i]); + kfree(vport->qos.sched_nodes); + } + kfree(vport->qos.sched_node); memset(&vport->qos, 0, sizeof(vport->qos)); } @@ -141,16 +190,37 @@ mlx5_esw_qos_vport_get_parent(const struct mlx5_vport *vport) static void esw_qos_sched_elem_warn(struct mlx5_esw_sched_node *node, int err, const char *op) { - if (node->vport) { + switch (node->type) { + case SCHED_NODE_TYPE_VPORTS_TC_TSAR: + esw_warn(node->esw->dev, + "E-Switch %s %s scheduling element failed (tc=%d,err=%d)\n", + op, sched_node_type_str[node->type], node->tc, err); + break; + case SCHED_NODE_TYPE_VPORT_TC: + esw_warn(node->esw->dev, + "E-Switch %s %s scheduling element failed (vport=%d,tc=%d,err=%d)\n", + op, + sched_node_type_str[node->type], + node->vport->vport, node->tc, err); + break; + case SCHED_NODE_TYPE_VPORT: esw_warn(node->esw->dev, "E-Switch %s %s scheduling element failed (vport=%d,err=%d)\n", op, sched_node_type_str[node->type], node->vport->vport, 
err); - return; + break; + case SCHED_NODE_TYPE_RATE_LIMITER: + case SCHED_NODE_TYPE_TC_ARBITER_TSAR: + case SCHED_NODE_TYPE_VPORTS_TSAR: + esw_warn(node->esw->dev, + "E-Switch %s %s scheduling element failed (err=%d)\n", + op, sched_node_type_str[node->type], err); + break; + default: + esw_warn(node->esw->dev, + "E-Switch %s scheduling element failed (err=%d)\n", + op, err); + break; } - - esw_warn(node->esw->dev, - "E-Switch %s %s scheduling element failed (err=%d)\n", - op, sched_node_type_str[node->type], err); } static int esw_qos_node_create_sched_element(struct mlx5_esw_sched_node *node, void *ctx, @@ -233,6 +303,24 @@ static int esw_qos_sched_elem_config(struct mlx5_esw_sched_node *node, u32 max_r return 0; } +static int esw_qos_create_rate_limit_element(struct mlx5_esw_sched_node *node, + struct netlink_ext_ack *extack) +{ + u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {}; + + if (!mlx5_qos_element_type_supported( + node->esw->dev, + SCHEDULING_CONTEXT_ELEMENT_TYPE_RATE_LIMIT, + SCHEDULING_HIERARCHY_E_SWITCH)) + return -EOPNOTSUPP; + + MLX5_SET(scheduling_context, sched_ctx, max_average_bw, node->max_rate); + MLX5_SET(scheduling_context, sched_ctx, element_type, + SCHEDULING_CONTEXT_ELEMENT_TYPE_RATE_LIMIT); + + return esw_qos_node_create_sched_element(node, sched_ctx, extack); +} + static u32 esw_qos_calculate_min_rate_divider(struct mlx5_eswitch *esw, struct mlx5_esw_sched_node *parent) { @@ -253,24 +341,19 @@ static u32 esw_qos_calculate_min_rate_divider(struct mlx5_eswitch *esw, if (max_guarantee) return max_t(u32, max_guarantee / fw_max_bw_share, 1); - /* If nodes max min_rate divider is 0 but their parent has bw_share - * configured, then set bw_share for nodes to minimal value. - */ - - if (parent && parent->bw_share) - return 1; - /* If the node nodes has min_rate configured, a divider of 0 sets all * nodes' bw_share to 0, effectively disabling min guarantees. 
*/ return 0; } -static u32 esw_qos_calc_bw_share(u32 min_rate, u32 divider, u32 fw_max) +static u32 esw_qos_calc_bw_share(u32 value, u32 divider, u32 fw_max) { if (!divider) return 0; - return min_t(u32, max_t(u32, DIV_ROUND_UP(min_rate, divider), MLX5_MIN_BW_SHARE), fw_max); + return min_t(u32, fw_max, + max_t(u32, + DIV_ROUND_UP(value, divider), MLX5_MIN_BW_SHARE)); } static void esw_qos_update_sched_node_bw_share(struct mlx5_esw_sched_node *node, @@ -297,7 +380,13 @@ static void esw_qos_normalize_min_rate(struct mlx5_eswitch *esw, if (node->esw != esw || node->ix == esw->qos.root_tsar_ix) continue; - esw_qos_update_sched_node_bw_share(node, divider, extack); + /* Vports TC TSARs don't have a minimum rate configured, + * so there's no need to update the bw_share on them. + */ + if (node->type != SCHED_NODE_TYPE_VPORTS_TC_TSAR) { + esw_qos_update_sched_node_bw_share(node, divider, + extack); + } if (list_empty(&node->children)) continue; @@ -306,6 +395,20 @@ static void esw_qos_normalize_min_rate(struct mlx5_eswitch *esw, } } +static u32 esw_qos_calculate_tc_bw_divider(u32 *tc_bw) +{ + u32 total = 0; + int i; + + for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++) + total += tc_bw[i]; + + /* If total is zero, tc-bw config is disabled and we shouldn't reach + * here. + */ + return WARN_ON(!total) ? 
1 : total; +} + static int esw_qos_set_node_min_rate(struct mlx5_esw_sched_node *node, u32 min_rate, struct netlink_ext_ack *extack) { @@ -350,28 +453,65 @@ esw_qos_create_node_sched_elem(struct mlx5_core_dev *dev, u32 parent_element_id, tsar_ix); } -static int esw_qos_vport_create_sched_element(struct mlx5_esw_sched_node *vport_node, - struct netlink_ext_ack *extack) +static int +esw_qos_vport_create_sched_element(struct mlx5_esw_sched_node *vport_node, + struct netlink_ext_ack *extack) { + struct mlx5_esw_sched_node *parent = vport_node->parent; u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {}; struct mlx5_core_dev *dev = vport_node->esw->dev; void *attr; - if (!mlx5_qos_element_type_supported(dev, - SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT, - SCHEDULING_HIERARCHY_E_SWITCH)) + if (!mlx5_qos_element_type_supported( + dev, + SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT, + SCHEDULING_HIERARCHY_E_SWITCH)) return -EOPNOTSUPP; MLX5_SET(scheduling_context, sched_ctx, element_type, SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT); attr = MLX5_ADDR_OF(scheduling_context, sched_ctx, element_attributes); MLX5_SET(vport_element, attr, vport_number, vport_node->vport->vport); - MLX5_SET(scheduling_context, sched_ctx, parent_element_id, vport_node->parent->ix); - MLX5_SET(scheduling_context, sched_ctx, max_average_bw, vport_node->max_rate); + MLX5_SET(scheduling_context, sched_ctx, parent_element_id, + parent ? 
parent->ix : vport_node->esw->qos.root_tsar_ix); + MLX5_SET(scheduling_context, sched_ctx, max_average_bw, + vport_node->max_rate); return esw_qos_node_create_sched_element(vport_node, sched_ctx, extack); } +static int +esw_qos_vport_tc_create_sched_element(struct mlx5_esw_sched_node *vport_tc_node, + u32 rate_limit_elem_ix, + struct netlink_ext_ack *extack) +{ + u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {}; + struct mlx5_core_dev *dev = vport_tc_node->esw->dev; + void *attr; + + if (!mlx5_qos_element_type_supported( + dev, + SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT_TC, + SCHEDULING_HIERARCHY_E_SWITCH)) + return -EOPNOTSUPP; + + MLX5_SET(scheduling_context, sched_ctx, element_type, + SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT_TC); + attr = MLX5_ADDR_OF(scheduling_context, sched_ctx, element_attributes); + MLX5_SET(vport_tc_element, attr, vport_number, + vport_tc_node->vport->vport); + MLX5_SET(vport_tc_element, attr, traffic_class, vport_tc_node->tc); + MLX5_SET(scheduling_context, sched_ctx, max_bw_obj_id, + rate_limit_elem_ix); + MLX5_SET(scheduling_context, sched_ctx, parent_element_id, + vport_tc_node->parent->ix); + MLX5_SET(scheduling_context, sched_ctx, bw_share, + vport_tc_node->bw_share); + + return esw_qos_node_create_sched_element(vport_tc_node, sched_ctx, + extack); +} + static struct mlx5_esw_sched_node * __esw_qos_alloc_node(struct mlx5_eswitch *esw, u32 tsar_ix, enum sched_node_type type, struct mlx5_esw_sched_node *parent) @@ -388,6 +528,14 @@ __esw_qos_alloc_node(struct mlx5_eswitch *esw, u32 tsar_ix, enum sched_node_type node->parent = parent; INIT_LIST_HEAD(&node->children); esw_qos_node_attach_to_parent(node); + if (!parent) { + /* The caller is responsible for inserting the node into the + * parent list if necessary. This function can also be used with + * a NULL parent, which doesn't necessarily indicate that it + * refers to the root scheduling element. 
+ */ + list_del_init(&node->entry); + } return node; } @@ -404,6 +552,147 @@ static void esw_qos_destroy_node(struct mlx5_esw_sched_node *node, struct netlin __esw_qos_free_node(node); } +static int esw_qos_create_vports_tc_node(struct mlx5_esw_sched_node *parent, + u8 tc, struct netlink_ext_ack *extack) +{ + u32 tsar_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {}; + struct mlx5_core_dev *dev = parent->esw->dev; + struct mlx5_esw_sched_node *vports_tc_node; + void *attr; + int err; + + if (!mlx5_qos_element_type_supported( + dev, + SCHEDULING_CONTEXT_ELEMENT_TYPE_TSAR, + SCHEDULING_HIERARCHY_E_SWITCH) || + !mlx5_qos_tsar_type_supported(dev, + TSAR_ELEMENT_TSAR_TYPE_DWRR, + SCHEDULING_HIERARCHY_E_SWITCH)) + return -EOPNOTSUPP; + + vports_tc_node = __esw_qos_alloc_node(parent->esw, 0, + SCHED_NODE_TYPE_VPORTS_TC_TSAR, + parent); + if (!vports_tc_node) { + NL_SET_ERR_MSG_MOD(extack, "E-Switch alloc node failed"); + esw_warn(dev, "Failed to alloc vports TC node (tc=%d)\n", tc); + return -ENOMEM; + } + + attr = MLX5_ADDR_OF(scheduling_context, tsar_ctx, element_attributes); + MLX5_SET(tsar_element, attr, tsar_type, TSAR_ELEMENT_TSAR_TYPE_DWRR); + MLX5_SET(tsar_element, attr, traffic_class, tc); + MLX5_SET(scheduling_context, tsar_ctx, parent_element_id, parent->ix); + MLX5_SET(scheduling_context, tsar_ctx, element_type, + SCHEDULING_CONTEXT_ELEMENT_TYPE_TSAR); + + err = esw_qos_node_create_sched_element(vports_tc_node, tsar_ctx, + extack); + if (err) + goto err_create_sched_element; + + vports_tc_node->tc = tc; + + return 0; + +err_create_sched_element: + __esw_qos_free_node(vports_tc_node); + return err; +} + +static void +esw_qos_tc_arbiter_get_bw_shares(struct mlx5_esw_sched_node *tc_arbiter_node, + u32 *tc_bw) +{ + memcpy(tc_bw, tc_arbiter_node->tc_bw, sizeof(tc_arbiter_node->tc_bw)); +} + +static void +esw_qos_set_tc_arbiter_bw_shares(struct mlx5_esw_sched_node *tc_arbiter_node, + u32 *tc_bw, struct netlink_ext_ack *extack) +{ + struct mlx5_eswitch *esw = 
tc_arbiter_node->esw; + struct mlx5_esw_sched_node *vports_tc_node; + u32 divider, fw_max_bw_share; + + fw_max_bw_share = MLX5_CAP_QOS(esw->dev, max_tsar_bw_share); + divider = esw_qos_calculate_tc_bw_divider(tc_bw); + list_for_each_entry(vports_tc_node, &tc_arbiter_node->children, entry) { + u8 tc = vports_tc_node->tc; + u32 bw_share; + + tc_arbiter_node->tc_bw[tc] = tc_bw[tc]; + bw_share = tc_bw[tc] * fw_max_bw_share; + bw_share = esw_qos_calc_bw_share(bw_share, divider, + fw_max_bw_share); + esw_qos_sched_elem_config(vports_tc_node, 0, bw_share, extack); + } +} + +static void +esw_qos_destroy_vports_tc_nodes(struct mlx5_esw_sched_node *tc_arbiter_node, + struct netlink_ext_ack *extack) +{ + struct mlx5_esw_sched_node *vports_tc_node, *tmp; + + list_for_each_entry_safe(vports_tc_node, tmp, + &tc_arbiter_node->children, entry) + esw_qos_destroy_node(vports_tc_node, extack); +} + +static int +esw_qos_create_vports_tc_nodes(struct mlx5_esw_sched_node *tc_arbiter_node, + struct netlink_ext_ack *extack) +{ + struct mlx5_eswitch *esw = tc_arbiter_node->esw; + int err, i, num_tcs = esw_qos_num_tcs(esw->dev); + + for (i = 0; i < num_tcs; i++) { + err = esw_qos_create_vports_tc_node(tc_arbiter_node, i, extack); + if (err) + goto err_tc_node_create; + } + + return 0; + +err_tc_node_create: + esw_qos_destroy_vports_tc_nodes(tc_arbiter_node, NULL); + return err; +} + +static int esw_qos_create_tc_arbiter_sched_elem( + struct mlx5_esw_sched_node *tc_arbiter_node, + struct netlink_ext_ack *extack) +{ + u32 tsar_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {}; + u32 tsar_parent_ix; + void *attr; + + if (!mlx5_qos_tsar_type_supported(tc_arbiter_node->esw->dev, + TSAR_ELEMENT_TSAR_TYPE_TC_ARB, + SCHEDULING_HIERARCHY_E_SWITCH)) { + NL_SET_ERR_MSG_MOD(extack, + "E-Switch TC Arbiter scheduling element is not supported"); + return -EOPNOTSUPP; + } + + attr = MLX5_ADDR_OF(scheduling_context, tsar_ctx, element_attributes); + MLX5_SET(tsar_element, attr, tsar_type, 
TSAR_ELEMENT_TSAR_TYPE_TC_ARB); + tsar_parent_ix = tc_arbiter_node->parent ? tc_arbiter_node->parent->ix : + tc_arbiter_node->esw->qos.root_tsar_ix; + MLX5_SET(scheduling_context, tsar_ctx, parent_element_id, + tsar_parent_ix); + MLX5_SET(scheduling_context, tsar_ctx, element_type, + SCHEDULING_CONTEXT_ELEMENT_TYPE_TSAR); + MLX5_SET(scheduling_context, tsar_ctx, max_average_bw, + tc_arbiter_node->max_rate); + MLX5_SET(scheduling_context, tsar_ctx, bw_share, + tc_arbiter_node->bw_share); + + return esw_qos_node_create_sched_element(tc_arbiter_node, tsar_ctx, + extack); +} + static struct mlx5_esw_sched_node * __esw_qos_create_vports_sched_node(struct mlx5_eswitch *esw, struct mlx5_esw_sched_node *parent, struct netlink_ext_ack *extack) @@ -426,6 +715,7 @@ __esw_qos_create_vports_sched_node(struct mlx5_eswitch *esw, struct mlx5_esw_sch goto err_alloc_node; } + list_add_tail(&node->entry, &esw->qos.domain->nodes); esw_qos_normalize_min_rate(esw, NULL, extack); trace_mlx5_esw_node_qos_create(esw->dev, node, node->ix); @@ -467,6 +757,9 @@ static void __esw_qos_destroy_node(struct mlx5_esw_sched_node *node, struct netl { struct mlx5_eswitch *esw = node->esw; + if (node->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) + esw_qos_destroy_vports_tc_nodes(node, extack); + trace_mlx5_esw_node_qos_destroy(esw->dev, node, node->ix); esw_qos_destroy_node(node, extack); esw_qos_normalize_min_rate(esw, NULL, extack); @@ -487,45 +780,15 @@ static int esw_qos_create(struct mlx5_eswitch *esw, struct netlink_ext_ack *exta return err; } - if (MLX5_CAP_QOS(dev, log_esw_max_sched_depth)) { - esw->qos.node0 = __esw_qos_create_vports_sched_node(esw, NULL, extack); - } else { - /* The eswitch doesn't support scheduling nodes. - * Create a software-only node0 using the root TSAR to attach vport QoS to. 
- */ - if (!__esw_qos_alloc_node(esw, - esw->qos.root_tsar_ix, - SCHED_NODE_TYPE_VPORTS_TSAR, - NULL)) - esw->qos.node0 = ERR_PTR(-ENOMEM); - } - if (IS_ERR(esw->qos.node0)) { - err = PTR_ERR(esw->qos.node0); - esw_warn(dev, "E-Switch create rate node 0 failed (%d)\n", err); - goto err_node0; - } refcount_set(&esw->qos.refcnt, 1); return 0; - -err_node0: - if (mlx5_destroy_scheduling_element_cmd(esw->dev, SCHEDULING_HIERARCHY_E_SWITCH, - esw->qos.root_tsar_ix)) - esw_warn(esw->dev, "E-Switch destroy root TSAR failed.\n"); - - return err; } static void esw_qos_destroy(struct mlx5_eswitch *esw) { int err; - if (esw->qos.node0->ix != esw->qos.root_tsar_ix) - __esw_qos_destroy_node(esw->qos.node0, NULL); - else - __esw_qos_free_node(esw->qos.node0); - esw->qos.node0 = NULL; - err = mlx5_destroy_scheduling_element_cmd(esw->dev, SCHEDULING_HIERARCHY_E_SWITCH, esw->qos.root_tsar_ix); @@ -555,36 +818,271 @@ static void esw_qos_put(struct mlx5_eswitch *esw) esw_qos_destroy(esw); } -static void esw_qos_vport_disable(struct mlx5_vport *vport, struct netlink_ext_ack *extack) +static void +esw_qos_tc_arbiter_scheduling_teardown(struct mlx5_esw_sched_node *node, + struct netlink_ext_ack *extack) +{ + /* Clean up all Vports TC nodes within the TC arbiter node. */ + esw_qos_destroy_vports_tc_nodes(node, extack); + /* Destroy the scheduling element for the TC arbiter node itself. */ + esw_qos_node_destroy_sched_element(node, extack); +} + +static int esw_qos_tc_arbiter_scheduling_setup(struct mlx5_esw_sched_node *node, + struct netlink_ext_ack *extack) +{ + u32 curr_ix = node->ix; + int err; + + err = esw_qos_create_tc_arbiter_sched_elem(node, extack); + if (err) + return err; + /* Initialize the vports TC nodes within created TC arbiter TSAR. 
*/ + err = esw_qos_create_vports_tc_nodes(node, extack); + if (err) + goto err_vports_tc_nodes; + + node->type = SCHED_NODE_TYPE_TC_ARBITER_TSAR; + + return 0; + +err_vports_tc_nodes: + /* If initialization fails, clean up the scheduling element + * for the TC arbiter node. + */ + esw_qos_node_destroy_sched_element(node, NULL); + node->ix = curr_ix; + return err; +} + +static int +esw_qos_create_vport_tc_sched_node(struct mlx5_vport *vport, + u32 rate_limit_elem_ix, + struct mlx5_esw_sched_node *vports_tc_node, + struct netlink_ext_ack *extack) +{ + struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node; + struct mlx5_esw_sched_node *vport_tc_node; + u8 tc = vports_tc_node->tc; + int err; + + vport_tc_node = __esw_qos_alloc_node(vport_node->esw, 0, + SCHED_NODE_TYPE_VPORT_TC, + vports_tc_node); + if (!vport_tc_node) + return -ENOMEM; + + vport_tc_node->min_rate = vport_node->min_rate; + vport_tc_node->tc = tc; + vport_tc_node->vport = vport; + err = esw_qos_vport_tc_create_sched_element(vport_tc_node, + rate_limit_elem_ix, + extack); + if (err) + goto err_out; + + vport->qos.sched_nodes[tc] = vport_tc_node; + + return 0; +err_out: + __esw_qos_free_node(vport_tc_node); + return err; +} + +static void +esw_qos_destroy_vport_tc_sched_elements(struct mlx5_vport *vport, + struct netlink_ext_ack *extack) +{ + int i, num_tcs = esw_qos_num_tcs(vport->qos.sched_node->esw->dev); + + for (i = 0; i < num_tcs; i++) { + if (vport->qos.sched_nodes[i]) { + __esw_qos_destroy_node(vport->qos.sched_nodes[i], + extack); + } + } + + kfree(vport->qos.sched_nodes); + vport->qos.sched_nodes = NULL; +} + +static int +esw_qos_create_vport_tc_sched_elements(struct mlx5_vport *vport, + enum sched_node_type type, + struct netlink_ext_ack *extack) +{ + struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node; + struct mlx5_esw_sched_node *tc_arbiter_node, *vports_tc_node; + int err, num_tcs = esw_qos_num_tcs(vport_node->esw->dev); + u32 rate_limit_elem_ix; + + 
vport->qos.sched_nodes = kcalloc(num_tcs, + sizeof(struct mlx5_esw_sched_node *), + GFP_KERNEL); + if (!vport->qos.sched_nodes) { + NL_SET_ERR_MSG_MOD(extack, + "Allocating the vport TC scheduling elements failed."); + return -ENOMEM; + } + + rate_limit_elem_ix = type == SCHED_NODE_TYPE_RATE_LIMITER ? + vport_node->ix : 0; + tc_arbiter_node = type == SCHED_NODE_TYPE_RATE_LIMITER ? + vport_node->parent : vport_node; + list_for_each_entry(vports_tc_node, &tc_arbiter_node->children, entry) { + err = esw_qos_create_vport_tc_sched_node(vport, + rate_limit_elem_ix, + vports_tc_node, + extack); + if (err) + goto err_create_vport_tc; + } + + return 0; + +err_create_vport_tc: + esw_qos_destroy_vport_tc_sched_elements(vport, NULL); + + return err; +} + +static int +esw_qos_vport_tc_enable(struct mlx5_vport *vport, enum sched_node_type type, + struct netlink_ext_ack *extack) { struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node; struct mlx5_esw_sched_node *parent = vport_node->parent; + int err; - esw_qos_node_destroy_sched_element(vport_node, extack); + if (type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) { + int new_level, max_level; + + /* Increase the parent's level by 2 to account for both the + * TC arbiter and the vports TC scheduling element. + */ + new_level = (parent ? parent->level : 2) + 2; + max_level = 1 << MLX5_CAP_QOS(vport_node->esw->dev, + log_esw_max_sched_depth); + if (new_level > max_level) { + NL_SET_ERR_MSG_FMT_MOD(extack, + "TC arbitration on leafs is not supported beyond max depth %d", + max_level); + return -EOPNOTSUPP; + } + } + + esw_assert_qos_lock_held(vport->dev->priv.eswitch); + + if (type == SCHED_NODE_TYPE_RATE_LIMITER) + err = esw_qos_create_rate_limit_element(vport_node, extack); + else + err = esw_qos_tc_arbiter_scheduling_setup(vport_node, extack); + if (err) + return err; + + /* Rate limiters impact multiple nodes not directly connected to them + * and are not direct members of the QoS hierarchy. 
+ * Unlink it from the parent to reflect that. + */ + if (type == SCHED_NODE_TYPE_RATE_LIMITER) { + list_del_init(&vport_node->entry); + vport_node->level = 0; + } + + err = esw_qos_create_vport_tc_sched_elements(vport, type, extack); + if (err) + goto err_sched_nodes; + + return 0; + +err_sched_nodes: + if (type == SCHED_NODE_TYPE_RATE_LIMITER) { + esw_qos_node_destroy_sched_element(vport_node, NULL); + esw_qos_node_attach_to_parent(vport_node); + } else { + esw_qos_tc_arbiter_scheduling_teardown(vport_node, NULL); + } + return err; +} + +static void esw_qos_vport_tc_disable(struct mlx5_vport *vport, + struct netlink_ext_ack *extack) +{ + struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node; + enum sched_node_type curr_type = vport_node->type; + + esw_qos_destroy_vport_tc_sched_elements(vport, extack); + + if (curr_type == SCHED_NODE_TYPE_RATE_LIMITER) + esw_qos_node_destroy_sched_element(vport_node, extack); + else + esw_qos_tc_arbiter_scheduling_teardown(vport_node, extack); +} + +static int esw_qos_set_vport_tcs_min_rate(struct mlx5_vport *vport, + u32 min_rate, + struct netlink_ext_ack *extack) +{ + struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node; + int err, i, num_tcs = esw_qos_num_tcs(vport_node->esw->dev); + + for (i = 0; i < num_tcs; i++) { + err = esw_qos_set_node_min_rate(vport->qos.sched_nodes[i], + min_rate, extack); + if (err) + goto err_out; + } + vport_node->min_rate = min_rate; + + return 0; +err_out: + for (--i; i >= 0; i--) { + esw_qos_set_node_min_rate(vport->qos.sched_nodes[i], + vport_node->min_rate, extack); + } + return err; +} + +static void esw_qos_vport_disable(struct mlx5_vport *vport, struct netlink_ext_ack *extack) +{ + struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node; + enum sched_node_type curr_type = vport_node->type; + + if (curr_type == SCHED_NODE_TYPE_VPORT) + esw_qos_node_destroy_sched_element(vport_node, extack); + else + esw_qos_vport_tc_disable(vport, extack); vport_node->bw_share = 0; + 
memset(vport_node->tc_bw, 0, sizeof(vport_node->tc_bw)); list_del_init(&vport_node->entry); - esw_qos_normalize_min_rate(parent->esw, parent, extack); + esw_qos_normalize_min_rate(vport_node->esw, vport_node->parent, extack); trace_mlx5_esw_vport_qos_destroy(vport_node->esw->dev, vport); } -static int esw_qos_vport_enable(struct mlx5_vport *vport, struct mlx5_esw_sched_node *parent, +static int esw_qos_vport_enable(struct mlx5_vport *vport, + enum sched_node_type type, + struct mlx5_esw_sched_node *parent, struct netlink_ext_ack *extack) { + struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node; int err; esw_assert_qos_lock_held(vport->dev->priv.eswitch); - esw_qos_node_set_parent(vport->qos.sched_node, parent); - err = esw_qos_vport_create_sched_element(vport->qos.sched_node, extack); + esw_qos_node_set_parent(vport_node, parent); + if (type == SCHED_NODE_TYPE_VPORT) + err = esw_qos_vport_create_sched_element(vport_node, extack); + else + err = esw_qos_vport_tc_enable(vport, type, extack); if (err) return err; - esw_qos_normalize_min_rate(parent->esw, parent, extack); - trace_mlx5_esw_vport_qos_create(vport->dev, vport, - vport->qos.sched_node->max_rate, - vport->qos.sched_node->bw_share); + vport_node->type = type; + esw_qos_normalize_min_rate(vport_node->esw, parent, extack); + trace_mlx5_esw_vport_qos_create(vport->dev, vport, vport_node->max_rate, + vport_node->bw_share); return 0; } @@ -595,6 +1093,7 @@ static int mlx5_esw_qos_vport_enable(struct mlx5_vport *vport, enum sched_node_t { struct mlx5_eswitch *esw = vport->dev->priv.eswitch; struct mlx5_esw_sched_node *sched_node; + struct mlx5_eswitch *parent_esw; int err; esw_assert_qos_lock_held(esw); @@ -602,16 +1101,20 @@ static int mlx5_esw_qos_vport_enable(struct mlx5_vport *vport, enum sched_node_t if (err) return err; - parent = parent ?: esw->qos.node0; - sched_node = __esw_qos_alloc_node(parent->esw, 0, type, parent); - if (!sched_node) + parent_esw = parent ? 
parent->esw : esw; + sched_node = __esw_qos_alloc_node(parent_esw, 0, type, parent); + if (!sched_node) { + esw_qos_put(esw); return -ENOMEM; + } + if (!parent) + list_add_tail(&sched_node->entry, &esw->qos.domain->nodes); sched_node->max_rate = max_rate; sched_node->min_rate = min_rate; sched_node->vport = vport; vport->qos.sched_node = sched_node; - err = esw_qos_vport_enable(vport, parent, extack); + err = esw_qos_vport_enable(vport, type, parent, extack); if (err) { __esw_qos_free_node(sched_node); esw_qos_put(esw); @@ -621,6 +1124,19 @@ static int mlx5_esw_qos_vport_enable(struct mlx5_vport *vport, enum sched_node_t return err; } +static void mlx5_esw_qos_vport_disable_locked(struct mlx5_vport *vport) +{ + struct mlx5_eswitch *esw = vport->dev->priv.eswitch; + + esw_assert_qos_lock_held(esw); + if (!vport->qos.sched_node) + return; + + esw_qos_vport_disable(vport, NULL); + mlx5_esw_qos_vport_qos_free(vport); + esw_qos_put(esw); +} + void mlx5_esw_qos_vport_disable(struct mlx5_vport *vport) { struct mlx5_eswitch *esw = vport->dev->priv.eswitch; @@ -632,11 +1148,9 @@ void mlx5_esw_qos_vport_disable(struct mlx5_vport *vport) goto unlock; parent = vport->qos.sched_node->parent; - WARN(parent != esw->qos.node0, "Disabling QoS on port before detaching it from node"); + WARN(parent, "Disabling QoS on port before detaching it from node"); - esw_qos_vport_disable(vport, NULL); - mlx5_esw_qos_vport_qos_free(vport); - esw_qos_put(esw); + mlx5_esw_qos_vport_disable_locked(vport); unlock: esw_qos_unlock(esw); } @@ -666,6 +1180,8 @@ static int mlx5_esw_qos_set_vport_min_rate(struct mlx5_vport *vport, u32 min_rat if (!vport_node) return mlx5_esw_qos_vport_enable(vport, SCHED_NODE_TYPE_VPORT, NULL, 0, min_rate, extack); + else if (vport_node->type == SCHED_NODE_TYPE_RATE_LIMITER) + return esw_qos_set_vport_tcs_min_rate(vport, min_rate, extack); else return esw_qos_set_node_min_rate(vport_node, min_rate, extack); } @@ -698,69 +1214,316 @@ bool 
mlx5_esw_qos_get_vport_rate(struct mlx5_vport *vport, u32 *max_rate, u32 *m return enabled; } -static int esw_qos_vport_update_parent(struct mlx5_vport *vport, struct mlx5_esw_sched_node *parent, +static int esw_qos_vport_tc_check_type(enum sched_node_type curr_type, + enum sched_node_type new_type, struct netlink_ext_ack *extack) { - struct mlx5_eswitch *esw = vport->dev->priv.eswitch; - struct mlx5_esw_sched_node *curr_parent; + if (curr_type == SCHED_NODE_TYPE_TC_ARBITER_TSAR && + new_type == SCHED_NODE_TYPE_RATE_LIMITER) { + NL_SET_ERR_MSG_MOD(extack, + "Cannot switch from vport-level TC arbitration to node-level TC arbitration"); + return -EOPNOTSUPP; + } + + if (curr_type == SCHED_NODE_TYPE_RATE_LIMITER && + new_type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) { + NL_SET_ERR_MSG_MOD(extack, + "Cannot switch from node-level TC arbitration to vport-level TC arbitration"); + return -EOPNOTSUPP; + } + + return 0; +} + +static int esw_qos_vport_update(struct mlx5_vport *vport, + enum sched_node_type type, + struct mlx5_esw_sched_node *parent, + struct netlink_ext_ack *extack) +{ + struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node; + struct mlx5_esw_sched_node *curr_parent = vport_node->parent; + enum sched_node_type curr_type = vport_node->type; + u32 curr_tc_bw[DEVLINK_RATE_TCS_MAX] = {0}; int err; - esw_assert_qos_lock_held(esw); - curr_parent = vport->qos.sched_node->parent; - parent = parent ?: esw->qos.node0; - if (curr_parent == parent) + esw_assert_qos_lock_held(vport->dev->priv.eswitch); + if (curr_type == type && curr_parent == parent) return 0; + err = esw_qos_vport_tc_check_type(curr_type, type, extack); + if (err) + return err; + + if (curr_type == SCHED_NODE_TYPE_TC_ARBITER_TSAR && curr_type == type) + esw_qos_tc_arbiter_get_bw_shares(vport_node, curr_tc_bw); + esw_qos_vport_disable(vport, extack); - err = esw_qos_vport_enable(vport, parent, extack); + err = esw_qos_vport_enable(vport, type, parent, extack); if (err) { - if 
(esw_qos_vport_enable(vport, curr_parent, NULL)) - esw_warn(parent->esw->dev, "vport restore QoS failed (vport=%d)\n", - vport->vport); + esw_qos_vport_enable(vport, curr_type, curr_parent, NULL); + extack = NULL; + } + + if (curr_type == SCHED_NODE_TYPE_TC_ARBITER_TSAR && curr_type == type) { + esw_qos_set_tc_arbiter_bw_shares(vport_node, curr_tc_bw, + extack); } return err; } -static u32 mlx5_esw_qos_lag_link_speed_get_locked(struct mlx5_core_dev *mdev) +static int esw_qos_vport_update_parent(struct mlx5_vport *vport, struct mlx5_esw_sched_node *parent, + struct netlink_ext_ack *extack) +{ + struct mlx5_eswitch *esw = vport->dev->priv.eswitch; + struct mlx5_esw_sched_node *curr_parent; + enum sched_node_type type; + + esw_assert_qos_lock_held(esw); + curr_parent = vport->qos.sched_node->parent; + if (curr_parent == parent) + return 0; + + /* Set vport QoS type based on parent node type if different from + * default QoS; otherwise, use the vport's current QoS type. + */ + if (parent && parent->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) + type = SCHED_NODE_TYPE_RATE_LIMITER; + else if (curr_parent && + curr_parent->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) + type = SCHED_NODE_TYPE_VPORT; + else + type = vport->qos.sched_node->type; + + return esw_qos_vport_update(vport, type, parent, extack); +} + +static void +esw_qos_switch_vport_tcs_to_vport(struct mlx5_esw_sched_node *tc_arbiter_node, + struct mlx5_esw_sched_node *node, + struct netlink_ext_ack *extack) +{ + struct mlx5_esw_sched_node *vports_tc_node, *vport_tc_node, *tmp; + + vports_tc_node = list_first_entry(&tc_arbiter_node->children, + struct mlx5_esw_sched_node, + entry); + + list_for_each_entry_safe(vport_tc_node, tmp, &vports_tc_node->children, + entry) + esw_qos_vport_update_parent(vport_tc_node->vport, node, extack); +} + +static int esw_qos_switch_tc_arbiter_node_to_vports( + struct mlx5_esw_sched_node *tc_arbiter_node, + struct mlx5_esw_sched_node *node, + struct netlink_ext_ack *extack) +{ + u32 
parent_tsar_ix = node->parent ? + node->parent->ix : node->esw->qos.root_tsar_ix; + int err; + + err = esw_qos_create_node_sched_elem(node->esw->dev, parent_tsar_ix, + node->max_rate, node->bw_share, + &node->ix); + if (err) { + NL_SET_ERR_MSG_MOD(extack, + "Failed to create scheduling element for vports node when disabling vports TC QoS"); + return err; + } + + node->type = SCHED_NODE_TYPE_VPORTS_TSAR; + + /* Disable TC QoS for vports in the arbiter node. */ + esw_qos_switch_vport_tcs_to_vport(tc_arbiter_node, node, extack); + + return 0; +} + +static int esw_qos_switch_vports_node_to_tc_arbiter( + struct mlx5_esw_sched_node *node, + struct mlx5_esw_sched_node *tc_arbiter_node, + struct netlink_ext_ack *extack) +{ + struct mlx5_esw_sched_node *vport_node, *tmp; + struct mlx5_vport *vport; + int err; + + /* Enable TC QoS for each vport in the node. */ + list_for_each_entry_safe(vport_node, tmp, &node->children, entry) { + vport = vport_node->vport; + err = esw_qos_vport_update_parent(vport, tc_arbiter_node, + extack); + if (err) + goto err_out; + } + + /* Destroy the current vports node TSAR. */ + err = mlx5_destroy_scheduling_element_cmd(node->esw->dev, + SCHEDULING_HIERARCHY_E_SWITCH, + node->ix); + if (err) + goto err_out; + + return 0; +err_out: + /* Restore vports back into the node if an error occurs. 
*/ + esw_qos_switch_vport_tcs_to_vport(tc_arbiter_node, node, NULL); + + return err; +} + +static struct mlx5_esw_sched_node * +esw_qos_move_node(struct mlx5_esw_sched_node *curr_node) +{ + struct mlx5_esw_sched_node *new_node; + + new_node = __esw_qos_alloc_node(curr_node->esw, curr_node->ix, + curr_node->type, NULL); + if (!new_node) + return ERR_PTR(-ENOMEM); + + esw_qos_nodes_set_parent(&curr_node->children, new_node); + return new_node; +} + +static int esw_qos_node_disable_tc_arbitration(struct mlx5_esw_sched_node *node, + struct netlink_ext_ack *extack) +{ + struct mlx5_esw_sched_node *curr_node; + int err; + + if (node->type != SCHED_NODE_TYPE_TC_ARBITER_TSAR) + return 0; + + /* Allocate a new rate node to hold the current state, which will allow + * for restoring the vports back to this node after disabling TC + * arbitration. + */ + curr_node = esw_qos_move_node(node); + if (IS_ERR(curr_node)) { + NL_SET_ERR_MSG_MOD(extack, "Failed setting up vports node"); + return PTR_ERR(curr_node); + } + + /* Disable TC QoS for all vports, and assign them back to the node. */ + err = esw_qos_switch_tc_arbiter_node_to_vports(curr_node, node, extack); + if (err) + goto err_out; + + /* Clean up the TC arbiter node after disabling TC QoS for vports. */ + esw_qos_tc_arbiter_scheduling_teardown(curr_node, extack); + goto out; +err_out: + esw_qos_nodes_set_parent(&curr_node->children, node); +out: + __esw_qos_free_node(curr_node); + return err; +} + +static int esw_qos_node_enable_tc_arbitration(struct mlx5_esw_sched_node *node, + struct netlink_ext_ack *extack) +{ + struct mlx5_esw_sched_node *curr_node, *child; + int err, new_level, max_level; + + if (node->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) + return 0; + + /* Increase the hierarchy level by one to account for the additional + * vports TC scheduling node, and verify that the new level does not + * exceed the maximum allowed depth. 
+ */ + new_level = node->level + 1; + max_level = 1 << MLX5_CAP_QOS(node->esw->dev, log_esw_max_sched_depth); + if (new_level > max_level) { + NL_SET_ERR_MSG_FMT_MOD(extack, + "TC arbitration on nodes is not supported beyond max depth %d", + max_level); + return -EOPNOTSUPP; + } + + /* Ensure the node does not contain non-leaf children before assigning + * TC bandwidth. + */ + if (!list_empty(&node->children)) { + list_for_each_entry(child, &node->children, entry) { + if (!child->vport) { + NL_SET_ERR_MSG_MOD(extack, + "Cannot configure TC bandwidth on a node with non-leaf children"); + return -EOPNOTSUPP; + } + } + } + + /* Allocate a new node that will store the information of the current + * node. This will be used later to restore the node if necessary. + */ + curr_node = esw_qos_move_node(node); + if (IS_ERR(curr_node)) { + NL_SET_ERR_MSG_MOD(extack, "Failed setting up node TC QoS"); + return PTR_ERR(curr_node); + } + + /* Initialize the TC arbiter node for QoS management. + * This step prepares the node for handling Traffic Class arbitration. + */ + err = esw_qos_tc_arbiter_scheduling_setup(node, extack); + if (err) + goto err_setup; + + /* Enable TC QoS for each vport within the current node. */ + err = esw_qos_switch_vports_node_to_tc_arbiter(curr_node, node, extack); + if (err) + goto err_switch_vports; + goto out; + +err_switch_vports: + esw_qos_tc_arbiter_scheduling_teardown(node, NULL); + node->ix = curr_node->ix; + node->type = curr_node->type; +err_setup: + esw_qos_nodes_set_parent(&curr_node->children, node); +out: + __esw_qos_free_node(curr_node); + return err; +} + +static u32 mlx5_esw_qos_lag_link_speed_get(struct mlx5_core_dev *mdev, + bool take_rtnl) { struct ethtool_link_ksettings lksettings; struct net_device *slave, *master; u32 speed = SPEED_UNKNOWN; - /* Lock ensures a stable reference to master and slave netdevice - * while port speed of master is queried. 
- */ - ASSERT_RTNL(); - slave = mlx5_uplink_netdev_get(mdev); if (!slave) goto out; + if (take_rtnl) + rtnl_lock(); master = netdev_master_upper_dev_get(slave); if (master && !__ethtool_get_link_ksettings(master, &lksettings)) speed = lksettings.base.speed; + if (take_rtnl) + rtnl_unlock(); out: + mlx5_uplink_netdev_put(mdev, slave); return speed; } static int mlx5_esw_qos_max_link_speed_get(struct mlx5_core_dev *mdev, u32 *link_speed_max, - bool hold_rtnl_lock, struct netlink_ext_ack *extack) + bool take_rtnl, + struct netlink_ext_ack *extack) { int err; if (!mlx5_lag_is_active(mdev)) goto skip_lag; - if (hold_rtnl_lock) - rtnl_lock(); - - *link_speed_max = mlx5_esw_qos_lag_link_speed_get_locked(mdev); - - if (hold_rtnl_lock) - rtnl_unlock(); + *link_speed_max = mlx5_esw_qos_lag_link_speed_get(mdev, take_rtnl); if (*link_speed_max != (u32)SPEED_UNKNOWN) return 0; @@ -848,6 +1611,57 @@ static int esw_qos_devlink_rate_to_mbps(struct mlx5_core_dev *mdev, const char * return 0; } +static bool esw_qos_validate_unsupported_tc_bw(struct mlx5_eswitch *esw, + u32 *tc_bw) +{ + int i, num_tcs = esw_qos_num_tcs(esw->dev); + + for (i = num_tcs; i < DEVLINK_RATE_TCS_MAX; i++) { + if (tc_bw[i]) + return false; + } + + return true; +} + +static bool esw_qos_vport_validate_unsupported_tc_bw(struct mlx5_vport *vport, + u32 *tc_bw) +{ + struct mlx5_esw_sched_node *node = vport->qos.sched_node; + struct mlx5_eswitch *esw = vport->dev->priv.eswitch; + + esw = (node && node->parent) ? 
node->parent->esw : esw; + + return esw_qos_validate_unsupported_tc_bw(esw, tc_bw); +} + +static bool esw_qos_tc_bw_disabled(u32 *tc_bw) +{ + int i; + + for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++) { + if (tc_bw[i]) + return false; + } + + return true; +} + +static void esw_vport_qos_prune_empty(struct mlx5_vport *vport) +{ + struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node; + + esw_assert_qos_lock_held(vport->dev->priv.eswitch); + if (!vport_node) + return; + + if (vport_node->parent || vport_node->max_rate || + vport_node->min_rate || !esw_qos_tc_bw_disabled(vport_node->tc_bw)) + return; + + mlx5_esw_qos_vport_disable_locked(vport); +} + int mlx5_esw_qos_init(struct mlx5_eswitch *esw) { if (esw->qos.domain) @@ -881,6 +1695,10 @@ int mlx5_esw_devlink_rate_leaf_tx_share_set(struct devlink_rate *rate_leaf, void esw_qos_lock(esw); err = mlx5_esw_qos_set_vport_min_rate(vport, tx_share, extack); + if (err) + goto out; + esw_vport_qos_prune_empty(vport); +out: esw_qos_unlock(esw); return err; } @@ -902,6 +1720,95 @@ int mlx5_esw_devlink_rate_leaf_tx_max_set(struct devlink_rate *rate_leaf, void * esw_qos_lock(esw); err = mlx5_esw_qos_set_vport_max_rate(vport, tx_max, extack); + if (err) + goto out; + esw_vport_qos_prune_empty(vport); +out: + esw_qos_unlock(esw); + return err; +} + +int mlx5_esw_devlink_rate_leaf_tc_bw_set(struct devlink_rate *rate_leaf, + void *priv, + u32 *tc_bw, + struct netlink_ext_ack *extack) +{ + struct mlx5_esw_sched_node *vport_node; + struct mlx5_vport *vport = priv; + struct mlx5_eswitch *esw; + bool disable; + int err = 0; + + esw = vport->dev->priv.eswitch; + if (!mlx5_esw_allowed(esw)) + return -EPERM; + + disable = esw_qos_tc_bw_disabled(tc_bw); + esw_qos_lock(esw); + + if (!esw_qos_vport_validate_unsupported_tc_bw(vport, tc_bw)) { + NL_SET_ERR_MSG_MOD(extack, + "E-Switch traffic classes number is not supported"); + err = -EOPNOTSUPP; + goto unlock; + } + + vport_node = vport->qos.sched_node; + if (disable && !vport_node) + goto 
unlock; + + if (disable) { + if (vport_node->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) + err = esw_qos_vport_update(vport, SCHED_NODE_TYPE_VPORT, + vport_node->parent, extack); + esw_vport_qos_prune_empty(vport); + goto unlock; + } + + if (!vport_node) { + err = mlx5_esw_qos_vport_enable(vport, + SCHED_NODE_TYPE_TC_ARBITER_TSAR, + NULL, 0, 0, extack); + vport_node = vport->qos.sched_node; + } else { + err = esw_qos_vport_update(vport, + SCHED_NODE_TYPE_TC_ARBITER_TSAR, + vport_node->parent, extack); + } + if (!err) + esw_qos_set_tc_arbiter_bw_shares(vport_node, tc_bw, extack); +unlock: + esw_qos_unlock(esw); + return err; +} + +int mlx5_esw_devlink_rate_node_tc_bw_set(struct devlink_rate *rate_node, + void *priv, + u32 *tc_bw, + struct netlink_ext_ack *extack) +{ + struct mlx5_esw_sched_node *node = priv; + struct mlx5_eswitch *esw = node->esw; + bool disable; + int err; + + if (!esw_qos_validate_unsupported_tc_bw(esw, tc_bw)) { + NL_SET_ERR_MSG_MOD(extack, + "E-Switch traffic classes number is not supported"); + return -EOPNOTSUPP; + } + + disable = esw_qos_tc_bw_disabled(tc_bw); + esw_qos_lock(esw); + if (disable) { + err = esw_qos_node_disable_tc_arbitration(node, extack); + goto unlock; + } + + err = esw_qos_node_enable_tc_arbitration(node, extack); + if (!err) + esw_qos_set_tc_arbiter_bw_shares(node, tc_bw, extack); +unlock: esw_qos_unlock(esw); return err; } @@ -996,10 +1903,16 @@ int mlx5_esw_qos_vport_update_parent(struct mlx5_vport *vport, struct mlx5_esw_s } esw_qos_lock(esw); - if (!vport->qos.sched_node && parent) - err = mlx5_esw_qos_vport_enable(vport, SCHED_NODE_TYPE_VPORT, parent, 0, 0, extack); - else if (vport->qos.sched_node) + if (!vport->qos.sched_node && parent) { + enum sched_node_type type; + + type = parent->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR ? 
+ SCHED_NODE_TYPE_RATE_LIMITER : SCHED_NODE_TYPE_VPORT; + err = mlx5_esw_qos_vport_enable(vport, type, parent, 0, 0, + extack); + } else if (vport->qos.sched_node) { err = esw_qos_vport_update_parent(vport, parent, extack); + } esw_qos_unlock(esw); return err; } @@ -1009,14 +1922,34 @@ int mlx5_esw_devlink_rate_leaf_parent_set(struct devlink_rate *devlink_rate, void *priv, void *parent_priv, struct netlink_ext_ack *extack) { - struct mlx5_esw_sched_node *node; + struct mlx5_esw_sched_node *node = parent ? parent_priv : NULL; struct mlx5_vport *vport = priv; + int err; - if (!parent) - return mlx5_esw_qos_vport_update_parent(vport, NULL, extack); + err = mlx5_esw_qos_vport_update_parent(vport, node, extack); + if (!err) { + struct mlx5_eswitch *esw = vport->dev->priv.eswitch; - node = parent_priv; - return mlx5_esw_qos_vport_update_parent(vport, node, extack); + esw_qos_lock(esw); + esw_vport_qos_prune_empty(vport); + esw_qos_unlock(esw); + } + + return err; +} + +static bool esw_qos_is_node_empty(struct mlx5_esw_sched_node *node) +{ + if (list_empty(&node->children)) + return true; + + if (node->type != SCHED_NODE_TYPE_TC_ARBITER_TSAR) + return false; + + node = list_first_entry(&node->children, struct mlx5_esw_sched_node, + entry); + + return esw_qos_is_node_empty(node); } static int @@ -1032,23 +1965,63 @@ mlx5_esw_qos_node_validate_set_parent(struct mlx5_esw_sched_node *node, return -EOPNOTSUPP; } - if (!list_empty(&node->children)) { + if (!esw_qos_is_node_empty(node)) { NL_SET_ERR_MSG_MOD(extack, "Cannot reassign a node that contains rate objects"); return -EOPNOTSUPP; } + if (parent && parent->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) { + NL_SET_ERR_MSG_MOD(extack, + "Cannot attach a node to a parent with TC bandwidth configured"); + return -EOPNOTSUPP; + } + new_level = parent ? parent->level + 1 : 2; + if (node->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) { + /* Increase by one to account for the vports TC scheduling + * element. 
+ */ + new_level += 1; + } + max_level = 1 << MLX5_CAP_QOS(node->esw->dev, log_esw_max_sched_depth); if (new_level > max_level) { - NL_SET_ERR_MSG_MOD(extack, - "Node hierarchy depth exceeds the maximum supported level"); + NL_SET_ERR_MSG_FMT_MOD(extack, + "Node hierarchy depth %d exceeds the maximum supported level %d", + new_level, max_level); return -EOPNOTSUPP; } return 0; } +static int +esw_qos_tc_arbiter_node_update_parent(struct mlx5_esw_sched_node *node, + struct mlx5_esw_sched_node *parent, + struct netlink_ext_ack *extack) +{ + struct mlx5_esw_sched_node *curr_parent = node->parent; + u32 curr_tc_bw[DEVLINK_RATE_TCS_MAX] = {0}; + struct mlx5_eswitch *esw = node->esw; + int err; + + esw_qos_tc_arbiter_get_bw_shares(node, curr_tc_bw); + esw_qos_tc_arbiter_scheduling_teardown(node, extack); + esw_qos_node_set_parent(node, parent); + err = esw_qos_tc_arbiter_scheduling_setup(node, extack); + if (err) { + esw_qos_node_set_parent(node, curr_parent); + if (esw_qos_tc_arbiter_scheduling_setup(node, extack)) { + esw_warn(esw->dev, "Node restore QoS failed\n"); + return err; + } + } + esw_qos_set_tc_arbiter_bw_shares(node, curr_tc_bw, extack); + + return err; +} + static int esw_qos_vports_node_update_parent(struct mlx5_esw_sched_node *node, struct mlx5_esw_sched_node *parent, struct netlink_ext_ack *extack) @@ -1095,7 +2068,13 @@ static int mlx5_esw_qos_node_update_parent(struct mlx5_esw_sched_node *node, esw_qos_lock(esw); curr_parent = node->parent; - err = esw_qos_vports_node_update_parent(node, parent, extack); + if (node->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) { + err = esw_qos_tc_arbiter_node_update_parent(node, parent, + extack); + } else { + err = esw_qos_vports_node_update_parent(node, parent, extack); + } + if (err) goto out; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.h b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.h index ed40ec8f02..0a50982b0e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.h +++ 
b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.h @@ -21,6 +21,14 @@ int mlx5_esw_devlink_rate_leaf_tx_share_set(struct devlink_rate *rate_leaf, void u64 tx_share, struct netlink_ext_ack *extack); int mlx5_esw_devlink_rate_leaf_tx_max_set(struct devlink_rate *rate_leaf, void *priv, u64 tx_max, struct netlink_ext_ack *extack); +int mlx5_esw_devlink_rate_leaf_tc_bw_set(struct devlink_rate *rate_node, + void *priv, + u32 *tc_bw, + struct netlink_ext_ack *extack); +int mlx5_esw_devlink_rate_node_tc_bw_set(struct devlink_rate *rate_node, + void *priv, + u32 *tc_bw, + struct netlink_ext_ack *extack); int mlx5_esw_devlink_rate_node_tx_share_set(struct devlink_rate *rate_node, void *priv, u64 tx_share, struct netlink_ext_ack *extack); int mlx5_esw_devlink_rate_node_tx_max_set(struct devlink_rate *rate_node, void *priv, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/vporttbl.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/vporttbl.c index 749c3957a1..407062096a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/vporttbl.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/vporttbl.c @@ -45,8 +45,8 @@ esw_vport_tbl_create(struct mlx5_eswitch *esw, struct mlx5_flow_namespace *ns, ft_attr.flags = vport_ns->flags; fdb = mlx5_create_auto_grouped_flow_table(ns, &ft_attr); if (IS_ERR(fdb)) { - esw_warn(esw->dev, "Failed to create per vport FDB Table err %ld\n", - PTR_ERR(fdb)); + esw_warn(esw->dev, "Failed to create per vport FDB Table err %pe\n", + fdb); } return fdb; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index 4917d185d0..3adf2b1cd2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -257,8 +257,8 @@ __esw_fdb_set_vport_rule(struct mlx5_eswitch *esw, u16 vport, bool rx_rule, &flow_act, &dest, 1); if (IS_ERR(flow_rule)) { esw_warn(esw->dev, - "FDB: Failed to add flow rule: dmac_v(%pM) dmac_c(%pM) -> vport(%d), 
err(%ld)\n", - dmac_v, dmac_c, vport, PTR_ERR(flow_rule)); + "FDB: Failed to add flow rule: dmac_v(%pM) dmac_c(%pM) -> vport(%d), err(%pe)\n", + dmac_v, dmac_c, vport, flow_rule); flow_rule = NULL; } @@ -820,6 +820,7 @@ static int mlx5_esw_vport_caps_get(struct mlx5_eswitch *esw, struct mlx5_vport * hca_caps = MLX5_ADDR_OF(query_hca_cap_out, query_ctx, capability); vport->info.roce_enabled = MLX5_GET(cmd_hca_cap, hca_caps, roce); + vport->vhca_id = MLX5_GET(cmd_hca_cap, hca_caps, vhca_id); if (!MLX5_CAP_GEN_MAX(esw->dev, hca_cap_2)) goto out_free; @@ -839,6 +840,18 @@ out_free: return err; } +bool mlx5_esw_vport_vhca_id(struct mlx5_eswitch *esw, u16 vportn, u16 *vhca_id) +{ + struct mlx5_vport *vport; + + vport = mlx5_eswitch_get_vport(esw, vportn); + if (IS_ERR(vport) || MLX5_VPORT_INVAL_VHCA_ID(vport)) + return false; + + *vhca_id = vport->vhca_id; + return true; +} + static int esw_vport_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport) { bool vst_mode_steering = esw_vst_mode_is_steering(esw); @@ -862,13 +875,10 @@ static int esw_vport_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport) vport_num, 1, vport->info.link_state); - /* Host PF has its own mac/guid. */ - if (vport_num) { - mlx5_modify_nic_vport_mac_address(esw->dev, vport_num, - vport->info.mac); - mlx5_modify_nic_vport_node_guid(esw->dev, vport_num, - vport->info.node_guid); - } + mlx5_query_nic_vport_mac_address(esw->dev, vport_num, true, + vport->info.mac); + mlx5_query_nic_vport_node_guid(esw->dev, vport_num, true, + &vport->info.node_guid); flags = (vport->info.vlan || vport->info.qos) ? 
SET_VLAN_STRIP | SET_VLAN_INSERT : 0; @@ -929,17 +939,11 @@ int mlx5_esw_vport_enable(struct mlx5_eswitch *esw, struct mlx5_vport *vport, if (!mlx5_esw_is_manager_vport(esw, vport_num) && MLX5_CAP_GEN(esw->dev, vhca_resource_manager)) { - ret = mlx5_esw_vport_vhca_id_set(esw, vport_num); + ret = mlx5_esw_vport_vhca_id_map(esw, vport); if (ret) goto err_vhca_mapping; } - /* External controller host PF has factory programmed MAC. - * Read it from the device. - */ - if (mlx5_core_is_ecpf(esw->dev) && vport_num == MLX5_VPORT_PF) - mlx5_query_nic_vport_mac_address(esw->dev, vport_num, true, vport->info.mac); - esw_vport_change_handle_locked(vport); esw->enabled_vports++; @@ -973,7 +977,7 @@ void mlx5_esw_vport_disable(struct mlx5_eswitch *esw, struct mlx5_vport *vport) if (!mlx5_esw_is_manager_vport(esw, vport_num) && MLX5_CAP_GEN(esw->dev, vhca_resource_manager)) - mlx5_esw_vport_vhca_id_clear(esw, vport_num); + mlx5_esw_vport_vhca_id_unmap(esw, vport); if (vport->vport != MLX5_VPORT_PF && (vport->info.ipsec_crypto_enabled || vport->info.ipsec_packet_enabled)) @@ -1038,6 +1042,25 @@ const u32 *mlx5_esw_query_functions(struct mlx5_core_dev *dev) return ERR_PTR(err); } +static int mlx5_esw_host_functions_enabled_query(struct mlx5_eswitch *esw) +{ + const u32 *query_host_out; + + if (!mlx5_core_is_ecpf_esw_manager(esw->dev)) + return 0; + + query_host_out = mlx5_esw_query_functions(esw->dev); + if (IS_ERR(query_host_out)) + return PTR_ERR(query_host_out); + + esw->esw_funcs.host_funcs_disabled = + MLX5_GET(query_esw_functions_out, query_host_out, + host_params_context.host_pf_not_exist); + + kvfree(query_host_out); + return 0; +} + static void mlx5_eswitch_event_handler_register(struct mlx5_eswitch *esw) { if (esw->mode == MLX5_ESWITCH_OFFLOADS && mlx5_eswitch_is_funcs_handler(esw->dev)) { @@ -1185,7 +1208,8 @@ void mlx5_eswitch_unload_vf_vports(struct mlx5_eswitch *esw, u16 num_vfs) unsigned long i; mlx5_esw_for_each_vf_vport(esw, i, vport, num_vfs) { - if 
(!vport->enabled) + /* Adjacent VFs are unloaded separately */ + if (!vport->enabled || vport->adjacent) continue; mlx5_eswitch_unload_pf_vf_vport(esw, vport->vport); } @@ -1204,6 +1228,42 @@ static void mlx5_eswitch_unload_ec_vf_vports(struct mlx5_eswitch *esw, } } +static void mlx5_eswitch_unload_adj_vf_vports(struct mlx5_eswitch *esw) +{ + struct mlx5_vport *vport; + unsigned long i; + + mlx5_esw_for_each_vf_vport(esw, i, vport, U16_MAX) { + if (!vport->enabled || !vport->adjacent) + continue; + mlx5_eswitch_unload_pf_vf_vport(esw, vport->vport); + } +} + +static int +mlx5_eswitch_load_adj_vf_vports(struct mlx5_eswitch *esw, + enum mlx5_eswitch_vport_event enabled_events) +{ + struct mlx5_vport *vport; + unsigned long i; + int err; + + mlx5_esw_for_each_vf_vport(esw, i, vport, U16_MAX) { + if (!vport->adjacent) + continue; + err = mlx5_eswitch_load_pf_vf_vport(esw, vport->vport, + enabled_events); + if (err) + goto unload_adj_vf_vport; + } + + return 0; + +unload_adj_vf_vport: + mlx5_eswitch_unload_adj_vf_vports(esw); + return err; +} + int mlx5_eswitch_load_vf_vports(struct mlx5_eswitch *esw, u16 num_vfs, enum mlx5_eswitch_vport_event enabled_events) { @@ -1278,17 +1338,19 @@ mlx5_eswitch_enable_pf_vf_vports(struct mlx5_eswitch *esw, esw->mode == MLX5_ESWITCH_LEGACY; /* Enable PF vport */ - if (pf_needed) { + if (pf_needed && mlx5_esw_host_functions_enabled(esw->dev)) { ret = mlx5_eswitch_load_pf_vf_vport(esw, MLX5_VPORT_PF, enabled_events); if (ret) return ret; } - /* Enable external host PF HCA */ - ret = host_pf_enable_hca(esw->dev); - if (ret) - goto pf_hca_err; + if (mlx5_esw_host_functions_enabled(esw->dev)) { + /* Enable external host PF HCA */ + ret = host_pf_enable_hca(esw->dev); + if (ret) + goto pf_hca_err; + } /* Enable ECPF vport */ if (mlx5_ecpf_vport_exists(esw->dev)) { @@ -1311,8 +1373,16 @@ mlx5_eswitch_enable_pf_vf_vports(struct mlx5_eswitch *esw, enabled_events); if (ret) goto vf_err; + + /* Enable adjacent VF vports */ + ret = 
mlx5_eswitch_load_adj_vf_vports(esw, enabled_events); + if (ret) + goto unload_vf_vports; + return 0; +unload_vf_vports: + mlx5_eswitch_unload_vf_vports(esw, esw->esw_funcs.num_vfs); vf_err: if (mlx5_core_ec_sriov_enabled(esw->dev)) mlx5_eswitch_unload_ec_vf_vports(esw, esw->esw_funcs.num_ec_vfs); @@ -1320,9 +1390,10 @@ ec_vf_err: if (mlx5_ecpf_vport_exists(esw->dev)) mlx5_eswitch_unload_pf_vf_vport(esw, MLX5_VPORT_ECPF); ecpf_err: - host_pf_disable_hca(esw->dev); + if (mlx5_esw_host_functions_enabled(esw->dev)) + host_pf_disable_hca(esw->dev); pf_hca_err: - if (pf_needed) + if (pf_needed && mlx5_esw_host_functions_enabled(esw->dev)) mlx5_eswitch_unload_pf_vf_vport(esw, MLX5_VPORT_PF); return ret; } @@ -1332,6 +1403,8 @@ pf_hca_err: */ void mlx5_eswitch_disable_pf_vf_vports(struct mlx5_eswitch *esw) { + mlx5_eswitch_unload_adj_vf_vports(esw); + mlx5_eswitch_unload_vf_vports(esw, esw->esw_funcs.num_vfs); if (mlx5_core_ec_sriov_enabled(esw->dev)) @@ -1342,10 +1415,12 @@ void mlx5_eswitch_disable_pf_vf_vports(struct mlx5_eswitch *esw) mlx5_eswitch_unload_pf_vf_vport(esw, MLX5_VPORT_ECPF); } - host_pf_disable_hca(esw->dev); + if (mlx5_esw_host_functions_enabled(esw->dev)) + host_pf_disable_hca(esw->dev); - if (mlx5_core_is_ecpf_esw_manager(esw->dev) || - esw->mode == MLX5_ESWITCH_LEGACY) + if ((mlx5_core_is_ecpf_esw_manager(esw->dev) || + esw->mode == MLX5_ESWITCH_LEGACY) && + mlx5_esw_host_functions_enabled(esw->dev)) mlx5_eswitch_unload_pf_vf_vport(esw, MLX5_VPORT_PF); } @@ -1399,22 +1474,79 @@ static void mlx5_esw_mode_change_notify(struct mlx5_eswitch *esw, u16 mode) info.new_mode = mode; - blocking_notifier_call_chain(&esw->n_head, 0, &info); + blocking_notifier_call_chain(&esw->dev->priv.esw_n_head, 0, &info); +} + +static int mlx5_esw_egress_acls_init(struct mlx5_core_dev *dev) +{ + struct mlx5_flow_steering *steering = dev->priv.steering; + int total_vports = mlx5_eswitch_get_total_vports(dev); + int err; + int i; + + for (i = 0; i < total_vports; i++) { + err 
= mlx5_fs_vport_egress_acl_ns_add(steering, i); + if (err) + goto acl_ns_remove; + } + return 0; + +acl_ns_remove: + while (i--) + mlx5_fs_vport_egress_acl_ns_remove(steering, i); + return err; +} + +static void mlx5_esw_egress_acls_cleanup(struct mlx5_core_dev *dev) +{ + struct mlx5_flow_steering *steering = dev->priv.steering; + int total_vports = mlx5_eswitch_get_total_vports(dev); + int i; + + for (i = total_vports - 1; i >= 0; i--) + mlx5_fs_vport_egress_acl_ns_remove(steering, i); +} + +static int mlx5_esw_ingress_acls_init(struct mlx5_core_dev *dev) +{ + struct mlx5_flow_steering *steering = dev->priv.steering; + int total_vports = mlx5_eswitch_get_total_vports(dev); + int err; + int i; + + for (i = 0; i < total_vports; i++) { + err = mlx5_fs_vport_ingress_acl_ns_add(steering, i); + if (err) + goto acl_ns_remove; + } + return 0; + +acl_ns_remove: + while (i--) + mlx5_fs_vport_ingress_acl_ns_remove(steering, i); + return err; +} + +static void mlx5_esw_ingress_acls_cleanup(struct mlx5_core_dev *dev) +{ + struct mlx5_flow_steering *steering = dev->priv.steering; + int total_vports = mlx5_eswitch_get_total_vports(dev); + int i; + + for (i = total_vports - 1; i >= 0; i--) + mlx5_fs_vport_ingress_acl_ns_remove(steering, i); } static int mlx5_esw_acls_ns_init(struct mlx5_eswitch *esw) { struct mlx5_core_dev *dev = esw->dev; - int total_vports; int err; if (esw->flags & MLX5_ESWITCH_VPORT_ACL_NS_CREATED) return 0; - total_vports = mlx5_eswitch_get_total_vports(dev); - if (MLX5_CAP_ESW_EGRESS_ACL(dev, ft_support)) { - err = mlx5_fs_egress_acls_init(dev, total_vports); + err = mlx5_esw_egress_acls_init(dev); if (err) return err; } else { @@ -1422,7 +1554,7 @@ static int mlx5_esw_acls_ns_init(struct mlx5_eswitch *esw) } if (MLX5_CAP_ESW_INGRESS_ACL(dev, ft_support)) { - err = mlx5_fs_ingress_acls_init(dev, total_vports); + err = mlx5_esw_ingress_acls_init(dev); if (err) goto err; } else { @@ -1433,7 +1565,7 @@ static int mlx5_esw_acls_ns_init(struct mlx5_eswitch *esw) 
err: if (MLX5_CAP_ESW_EGRESS_ACL(dev, ft_support)) - mlx5_fs_egress_acls_cleanup(dev); + mlx5_esw_egress_acls_cleanup(dev); return err; } @@ -1443,9 +1575,9 @@ static void mlx5_esw_acls_ns_cleanup(struct mlx5_eswitch *esw) esw->flags &= ~MLX5_ESWITCH_VPORT_ACL_NS_CREATED; if (MLX5_CAP_ESW_INGRESS_ACL(dev, ft_support)) - mlx5_fs_ingress_acls_cleanup(dev); + mlx5_esw_ingress_acls_cleanup(dev); if (MLX5_CAP_ESW_EGRESS_ACL(dev, ft_support)) - mlx5_fs_egress_acls_cleanup(dev); + mlx5_esw_egress_acls_cleanup(dev); } /** @@ -1674,7 +1806,8 @@ int mlx5_esw_sf_max_hpf_functions(struct mlx5_core_dev *dev, u16 *max_sfs, u16 * void *hca_caps; int err; - if (!mlx5_core_is_ecpf(dev)) { + if (!mlx5_core_is_ecpf(dev) || + !mlx5_esw_host_functions_enabled(dev)) { *max_sfs = 0; return 0; } @@ -1696,8 +1829,7 @@ out_free: return err; } -static int mlx5_esw_vport_alloc(struct mlx5_eswitch *esw, - int index, u16 vport_num) +int mlx5_esw_vport_alloc(struct mlx5_eswitch *esw, int index, u16 vport_num) { struct mlx5_vport *vport; int err; @@ -1710,6 +1842,7 @@ static int mlx5_esw_vport_alloc(struct mlx5_eswitch *esw, vport->vport = vport_num; vport->index = index; vport->info.link_state = MLX5_VPORT_ADMIN_STATE_AUTO; + vport->vhca_id = MLX5_VHCA_ID_INVALID; INIT_WORK(&vport->vport_change_handler, esw_vport_change_handler); err = xa_insert(&esw->vports, vport_num, vport, GFP_KERNEL); if (err) @@ -1723,8 +1856,9 @@ insert_err: return err; } -static void mlx5_esw_vport_free(struct mlx5_eswitch *esw, struct mlx5_vport *vport) +void mlx5_esw_vport_free(struct mlx5_eswitch *esw, struct mlx5_vport *vport) { + esw->total_vports--; xa_erase(&esw->vports, vport->vport); kfree(vport); } @@ -1750,21 +1884,23 @@ static int mlx5_esw_vports_init(struct mlx5_eswitch *esw) xa_init(&esw->vports); - err = mlx5_esw_vport_alloc(esw, idx, MLX5_VPORT_PF); - if (err) - goto err; - if (esw->first_host_vport == MLX5_VPORT_PF) - xa_set_mark(&esw->vports, idx, MLX5_ESW_VPT_HOST_FN); - idx++; - - for (i = 0; i < 
mlx5_core_max_vfs(dev); i++) { - err = mlx5_esw_vport_alloc(esw, idx, idx); + if (mlx5_esw_host_functions_enabled(dev)) { + err = mlx5_esw_vport_alloc(esw, idx, MLX5_VPORT_PF); if (err) goto err; - xa_set_mark(&esw->vports, idx, MLX5_ESW_VPT_VF); - xa_set_mark(&esw->vports, idx, MLX5_ESW_VPT_HOST_FN); + if (esw->first_host_vport == MLX5_VPORT_PF) + xa_set_mark(&esw->vports, idx, MLX5_ESW_VPT_HOST_FN); idx++; + for (i = 0; i < mlx5_core_max_vfs(dev); i++) { + err = mlx5_esw_vport_alloc(esw, idx, idx); + if (err) + goto err; + xa_set_mark(&esw->vports, idx, MLX5_ESW_VPT_VF); + xa_set_mark(&esw->vports, idx, MLX5_ESW_VPT_HOST_FN); + idx++; + } } + base_sf_num = mlx5_sf_start_function_id(dev); for (i = 0; i < mlx5_sf_max_functions(dev); i++) { err = mlx5_esw_vport_alloc(esw, idx, base_sf_num + i); @@ -1806,6 +1942,9 @@ static int mlx5_esw_vports_init(struct mlx5_eswitch *esw) err = mlx5_esw_vport_alloc(esw, idx, MLX5_VPORT_UPLINK); if (err) goto err; + + /* Adjacent vports or other dynamically create vports will use this */ + esw->last_vport_idx = ++idx; return 0; err: @@ -1864,6 +2003,7 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev) goto free_esw; esw->dev = dev; + dev->priv.eswitch = esw; esw->manager_vport = mlx5_eswitch_manager_vport(dev); esw->first_host_vport = mlx5_eswitch_first_host_vport_num(dev); @@ -1874,11 +2014,14 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev) goto abort; } + err = mlx5_esw_host_functions_enabled_query(esw); + if (err) + goto abort; + err = mlx5_esw_vports_init(esw); if (err) goto abort; - dev->priv.eswitch = esw; err = esw_offloads_init(esw); if (err) goto reps_err; @@ -1907,7 +2050,6 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev) esw->offloads.encap = DEVLINK_ESWITCH_ENCAP_MODE_BASIC; else esw->offloads.encap = DEVLINK_ESWITCH_ENCAP_MODE_NONE; - BLOCKING_INIT_NOTIFIER_HEAD(&esw->n_head); esw_info(dev, "Total vports %d, per vport: max uc(%d) max mc(%d)\n", @@ -2083,6 +2225,9 @@ int mlx5_eswitch_get_vport_config(struct 
mlx5_eswitch *esw, ivi->vf = vport - 1; mutex_lock(&esw->state_lock); + + mlx5_query_nic_vport_mac_address(esw->dev, vport, true, + evport->info.mac); ether_addr_copy(ivi->mac, evport->info.mac); ivi->linkstate = evport->info.link_state; ivi->vlan = evport->info.vlan; @@ -2233,14 +2378,16 @@ bool mlx5_esw_multipath_prereq(struct mlx5_core_dev *dev0, dev1->priv.eswitch->mode == MLX5_ESWITCH_OFFLOADS); } -int mlx5_esw_event_notifier_register(struct mlx5_eswitch *esw, struct notifier_block *nb) +int mlx5_esw_event_notifier_register(struct mlx5_core_dev *dev, + struct notifier_block *nb) { - return blocking_notifier_chain_register(&esw->n_head, nb); + return blocking_notifier_chain_register(&dev->priv.esw_n_head, nb); } -void mlx5_esw_event_notifier_unregister(struct mlx5_eswitch *esw, struct notifier_block *nb) +void mlx5_esw_event_notifier_unregister(struct mlx5_core_dev *dev, + struct notifier_block *nb) { - blocking_notifier_chain_unregister(&esw->n_head, nb); + blocking_notifier_chain_unregister(&dev->priv.esw_n_head, nb); } /** @@ -2410,3 +2557,11 @@ void mlx5_eswitch_unblock_ipsec(struct mlx5_core_dev *dev) dev->num_ipsec_offloads--; mutex_unlock(&esw->state_lock); } + +bool mlx5_esw_host_functions_enabled(const struct mlx5_core_dev *dev) +{ + if (!dev->priv.eswitch) + return true; + + return !dev->priv.eswitch->esw_funcs.host_funcs_disabled; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index 8573d36785..714ad28e84 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -197,6 +197,11 @@ static inline struct mlx5_vport *mlx5_devlink_port_vport_get(struct devlink_port return mlx5_devlink_port_get(dl_port)->vport; } +#define MLX5_VHCA_ID_INVALID (-1) + +#define MLX5_VPORT_INVAL_VHCA_ID(vport) \ + ((vport)->vhca_id == MLX5_VHCA_ID_INVALID) + struct mlx5_vport { struct mlx5_core_dev *dev; struct hlist_head 
uc_list[MLX5_L2_ADDR_HASH_SIZE]; @@ -209,13 +214,30 @@ struct mlx5_vport { struct vport_egress egress; u32 default_metadata; u32 metadata; + int vhca_id; + + bool adjacent; /* delegated vhca from adjacent function */ + struct { + u16 parent_pci_devfn; /* Adjacent parent PCI device function */ + u16 function_id; /* Function ID of the delegated VPort */ + } adj_info; struct mlx5_vport_info info; - /* Protected with the E-Switch qos domain lock. */ + /* Protected with the E-Switch qos domain lock. The Vport QoS can + * either be disabled (sched_node is NULL) or in one of three states: + * 1. Regular QoS (sched_node is a vport node). + * 2. TC QoS enabled on the vport (sched_node is a TC arbiter). + * 3. TC QoS enabled on the vport's parent node + * (sched_node is a rate limit node). + * When TC is enabled in either mode, the vport owns vport TC scheduling + * nodes. + */ struct { - /* Vport scheduling element node. */ + /* Vport scheduling node. */ struct mlx5_esw_sched_node *sched_node; + /* Array of vport traffic class scheduling nodes. */ + struct mlx5_esw_sched_node **sched_nodes; } qos; u16 vport; @@ -242,6 +264,9 @@ struct mlx5_eswitch_fdb { struct offloads_fdb { struct mlx5_flow_namespace *ns; + struct mlx5_flow_table *drop_root; + struct mlx5_flow_handle *drop_root_rule; + struct mlx5_fc *drop_root_fc; struct mlx5_flow_table *tc_miss_table; struct mlx5_flow_table *slow_fdb; struct mlx5_flow_group *send_to_vport_grp; @@ -313,6 +338,7 @@ struct mlx5_host_work { struct mlx5_esw_functions { struct mlx5_nb nb; + bool host_funcs_disabled; u16 num_vfs; u16 num_ec_vfs; }; @@ -363,16 +389,13 @@ struct mlx5_eswitch { refcount_t refcnt; u32 root_tsar_ix; struct mlx5_qos_domain *domain; - /* Contains all vports with QoS enabled but no explicit node. - * Cannot be NULL if QoS is enabled, but may be a fake node - * referencing the root TSAR if the esw doesn't support nodes. 
- */ - struct mlx5_esw_sched_node *node0; } qos; struct mlx5_esw_bridge_offloads *br_offloads; struct mlx5_esw_offload offloads; + u32 last_vport_idx; int mode; + bool offloads_inactive; u16 manager_vport; u16 first_host_vport; u8 num_peers; @@ -380,7 +403,6 @@ struct mlx5_eswitch { struct { u32 large_group_num; } params; - struct blocking_notifier_head n_head; struct xarray paired; struct mlx5_devcom_comp_dev *devcom; u16 enabled_ipsec_vf_count; @@ -405,6 +427,8 @@ int mlx5_esw_qos_modify_vport_rate(struct mlx5_eswitch *esw, u16 vport_num, u32 /* E-Switch API */ int mlx5_eswitch_init(struct mlx5_core_dev *dev); void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw); +int mlx5_esw_vport_alloc(struct mlx5_eswitch *esw, int index, u16 vport_num); +void mlx5_esw_vport_free(struct mlx5_eswitch *esw, struct mlx5_vport *vport); #define MLX5_ESWITCH_IGNORE_NUM_VFS (-1) int mlx5_eswitch_enable_locked(struct mlx5_eswitch *esw, int num_vfs); @@ -412,7 +436,8 @@ int mlx5_eswitch_enable(struct mlx5_eswitch *esw, int num_vfs); void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw, bool clear_vf); void mlx5_eswitch_disable_locked(struct mlx5_eswitch *esw); void mlx5_eswitch_disable(struct mlx5_eswitch *esw); -void mlx5_esw_offloads_devcom_init(struct mlx5_eswitch *esw, u64 key); +void mlx5_esw_offloads_devcom_init(struct mlx5_eswitch *esw, + const struct mlx5_devcom_match_attr *attr); void mlx5_esw_offloads_devcom_cleanup(struct mlx5_eswitch *esw); bool mlx5_esw_offloads_devcom_is_ready(struct mlx5_eswitch *esw); int mlx5_eswitch_set_vport_mac(struct mlx5_eswitch *esw, @@ -610,6 +635,11 @@ bool mlx5_esw_multipath_prereq(struct mlx5_core_dev *dev0, const u32 *mlx5_esw_query_functions(struct mlx5_core_dev *dev); +void mlx5_esw_adjacent_vhcas_setup(struct mlx5_eswitch *esw); +void mlx5_esw_adjacent_vhcas_cleanup(struct mlx5_eswitch *esw); +int mlx5_esw_adj_vport_modify(struct mlx5_core_dev *dev, u16 vport, + bool connect); + #define MLX5_DEBUG_ESWITCH_MASK BIT(3) #define 
esw_info(__dev, format, ...) \ @@ -812,12 +842,20 @@ struct devlink_port *mlx5_esw_offloads_devlink_port(struct mlx5_eswitch *esw, u1 int mlx5_esw_sf_max_hpf_functions(struct mlx5_core_dev *dev, u16 *max_sfs, u16 *sf_base_id); -int mlx5_esw_vport_vhca_id_set(struct mlx5_eswitch *esw, u16 vport_num); -void mlx5_esw_vport_vhca_id_clear(struct mlx5_eswitch *esw, u16 vport_num); +int mlx5_esw_vport_vhca_id_map(struct mlx5_eswitch *esw, + struct mlx5_vport *vport); +void mlx5_esw_vport_vhca_id_unmap(struct mlx5_eswitch *esw, + struct mlx5_vport *vport); int mlx5_eswitch_vhca_id_to_vport(struct mlx5_eswitch *esw, u16 vhca_id, u16 *vport_num); +bool mlx5_esw_vport_vhca_id(struct mlx5_eswitch *esw, u16 vportn, u16 *vhca_id); + +void mlx5_esw_offloads_rep_remove(struct mlx5_eswitch *esw, + const struct mlx5_vport *vport); +int mlx5_esw_offloads_rep_add(struct mlx5_eswitch *esw, + const struct mlx5_vport *vport); /** - * mlx5_esw_event_info - Indicates eswitch mode changed/changing. + * struct mlx5_esw_event_info - Indicates eswitch mode changed/changing. * * @new_mode: New mode of eswitch. 
*/ @@ -825,8 +863,10 @@ struct mlx5_esw_event_info { u16 new_mode; }; -int mlx5_esw_event_notifier_register(struct mlx5_eswitch *esw, struct notifier_block *n); -void mlx5_esw_event_notifier_unregister(struct mlx5_eswitch *esw, struct notifier_block *n); +int mlx5_esw_event_notifier_register(struct mlx5_core_dev *dev, + struct notifier_block *n); +void mlx5_esw_event_notifier_unregister(struct mlx5_core_dev *dev, + struct notifier_block *n); bool mlx5_esw_hold(struct mlx5_core_dev *dev); void mlx5_esw_release(struct mlx5_core_dev *dev); @@ -846,7 +886,7 @@ void mlx5_eswitch_offloads_single_fdb_del_one(struct mlx5_eswitch *master_esw, struct mlx5_eswitch *slave_esw); int mlx5_eswitch_reload_ib_reps(struct mlx5_eswitch *esw); -bool mlx5_eswitch_block_encap(struct mlx5_core_dev *dev); +bool mlx5_eswitch_block_encap(struct mlx5_core_dev *dev, bool from_fdb); void mlx5_eswitch_unblock_encap(struct mlx5_core_dev *dev); int mlx5_eswitch_block_mode(struct mlx5_core_dev *dev); @@ -888,6 +928,8 @@ int mlx5_esw_ipsec_vf_packet_offload_set(struct mlx5_eswitch *esw, struct mlx5_v bool enable); int mlx5_esw_ipsec_vf_packet_offload_supported(struct mlx5_core_dev *dev, u16 vport_num); +bool mlx5_esw_host_functions_enabled(const struct mlx5_core_dev *dev); +void mlx5_eswitch_safe_aux_devs_remove(struct mlx5_core_dev *dev); #else /* CONFIG_MLX5_ESWITCH */ /* eswitch API stubs */ static inline int mlx5_eswitch_init(struct mlx5_core_dev *dev) { return 0; } @@ -895,7 +937,9 @@ static inline void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw) {} static inline int mlx5_eswitch_enable(struct mlx5_eswitch *esw, int num_vfs) { return 0; } static inline void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw, bool clear_vf) {} static inline void mlx5_eswitch_disable(struct mlx5_eswitch *esw) {} -static inline void mlx5_esw_offloads_devcom_init(struct mlx5_eswitch *esw, u64 key) {} +static inline void +mlx5_esw_offloads_devcom_init(struct mlx5_eswitch *esw, + const struct 
mlx5_devcom_match_attr *attr) {} static inline void mlx5_esw_offloads_devcom_cleanup(struct mlx5_eswitch *esw) {} static inline bool mlx5_esw_offloads_devcom_is_ready(struct mlx5_eswitch *esw) { return false; } static inline bool mlx5_eswitch_is_funcs_handler(struct mlx5_core_dev *dev) { return false; } @@ -938,7 +982,8 @@ mlx5_eswitch_reload_ib_reps(struct mlx5_eswitch *esw) return 0; } -static inline bool mlx5_eswitch_block_encap(struct mlx5_core_dev *dev) +static inline bool +mlx5_eswitch_block_encap(struct mlx5_core_dev *dev, bool from_fdb) { return true; } @@ -955,6 +1000,22 @@ static inline bool mlx5_eswitch_block_ipsec(struct mlx5_core_dev *dev) } static inline void mlx5_eswitch_unblock_ipsec(struct mlx5_core_dev *dev) {} + +static inline bool +mlx5_esw_host_functions_enabled(const struct mlx5_core_dev *dev) +{ + return true; +} + +static inline bool +mlx5_esw_vport_vhca_id(struct mlx5_eswitch *esw, u16 vportn, u16 *vhca_id) +{ + return false; +} + +static inline void +mlx5_eswitch_safe_aux_devs_remove(struct mlx5_core_dev *dev) {} + #endif /* CONFIG_MLX5_ESWITCH */ #endif /* __MLX5_ESWITCH_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 0e3a977d53..3af2a51ace 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -52,6 +52,7 @@ #include "devlink.h" #include "lag/lag.h" #include "en/tc/post_meter.h" +#include "fw_reset.h" /* There are two match-all miss flows, one for unicast dst mac and * one for multicast. 
@@ -1016,8 +1017,8 @@ mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *on_esw, flow_rule = mlx5_add_flow_rules(mlx5_eswitch_get_slow_fdb(on_esw), spec, &flow_act, &dest, 1); if (IS_ERR(flow_rule)) - esw_warn(on_esw->dev, "FDB: Failed to add send to vport rule err %ld\n", - PTR_ERR(flow_rule)); + esw_warn(on_esw->dev, "FDB: Failed to add send to vport rule err %pe\n", + flow_rule); out: kvfree(spec); return flow_rule; @@ -1065,8 +1066,8 @@ mlx5_eswitch_add_send_to_vport_meta_rule(struct mlx5_eswitch *esw, u16 vport_num flow_rule = mlx5_add_flow_rules(mlx5_eswitch_get_slow_fdb(esw), spec, &flow_act, &dest, 1); if (IS_ERR(flow_rule)) - esw_warn(esw->dev, "FDB: Failed to add send to vport meta rule vport %d, err %ld\n", - vport_num, PTR_ERR(flow_rule)); + esw_warn(esw->dev, "FDB: Failed to add send to vport meta rule vport %d, err %pe\n", + vport_num, flow_rule); kvfree(spec); return flow_rule; @@ -1182,19 +1183,19 @@ static void esw_set_peer_miss_rule_source_port(struct mlx5_eswitch *esw, static int esw_add_fdb_peer_miss_rules(struct mlx5_eswitch *esw, struct mlx5_core_dev *peer_dev) { + struct mlx5_eswitch *peer_esw = peer_dev->priv.eswitch; struct mlx5_flow_destination dest = {}; struct mlx5_flow_act flow_act = {0}; struct mlx5_flow_handle **flows; - /* total vports is the same for both e-switches */ - int nvports = esw->total_vports; struct mlx5_flow_handle *flow; + struct mlx5_vport *peer_vport; struct mlx5_flow_spec *spec; - struct mlx5_vport *vport; int err, pfindex; unsigned long i; void *misc; - if (!MLX5_VPORT_MANAGER(esw->dev) && !mlx5_core_is_ecpf_esw_manager(esw->dev)) + if (!MLX5_VPORT_MANAGER(peer_dev) && + !mlx5_core_is_ecpf_esw_manager(peer_dev)) return 0; spec = kvzalloc(sizeof(*spec), GFP_KERNEL); @@ -1203,7 +1204,7 @@ static int esw_add_fdb_peer_miss_rules(struct mlx5_eswitch *esw, peer_miss_rules_setup(esw, peer_dev, spec, &dest); - flows = kvcalloc(nvports, sizeof(*flows), GFP_KERNEL); + flows = kvcalloc(peer_esw->total_vports, 
sizeof(*flows), GFP_KERNEL); if (!flows) { err = -ENOMEM; goto alloc_flows_err; @@ -1213,10 +1214,11 @@ static int esw_add_fdb_peer_miss_rules(struct mlx5_eswitch *esw, misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters); - if (mlx5_core_is_ecpf_esw_manager(esw->dev)) { - vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_PF); - esw_set_peer_miss_rule_source_port(esw, peer_dev->priv.eswitch, - spec, MLX5_VPORT_PF); + if (mlx5_core_is_ecpf_esw_manager(peer_dev) && + mlx5_esw_host_functions_enabled(peer_dev)) { + peer_vport = mlx5_eswitch_get_vport(peer_esw, MLX5_VPORT_PF); + esw_set_peer_miss_rule_source_port(esw, peer_esw, spec, + MLX5_VPORT_PF); flow = mlx5_add_flow_rules(mlx5_eswitch_get_slow_fdb(esw), spec, &flow_act, &dest, 1); @@ -1224,11 +1226,11 @@ static int esw_add_fdb_peer_miss_rules(struct mlx5_eswitch *esw, err = PTR_ERR(flow); goto add_pf_flow_err; } - flows[vport->index] = flow; + flows[peer_vport->index] = flow; } - if (mlx5_ecpf_vport_exists(esw->dev)) { - vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_ECPF); + if (mlx5_ecpf_vport_exists(peer_dev)) { + peer_vport = mlx5_eswitch_get_vport(peer_esw, MLX5_VPORT_ECPF); MLX5_SET(fte_match_set_misc, misc, source_port, MLX5_VPORT_ECPF); flow = mlx5_add_flow_rules(mlx5_eswitch_get_slow_fdb(esw), spec, &flow_act, &dest, 1); @@ -1236,36 +1238,35 @@ static int esw_add_fdb_peer_miss_rules(struct mlx5_eswitch *esw, err = PTR_ERR(flow); goto add_ecpf_flow_err; } - flows[vport->index] = flow; + flows[peer_vport->index] = flow; } - mlx5_esw_for_each_vf_vport(esw, i, vport, mlx5_core_max_vfs(esw->dev)) { - esw_set_peer_miss_rule_source_port(esw, - peer_dev->priv.eswitch, - spec, vport->vport); - + mlx5_esw_for_each_vf_vport(peer_esw, i, peer_vport, + mlx5_core_max_vfs(peer_dev)) { + esw_set_peer_miss_rule_source_port(esw, peer_esw, spec, + peer_vport->vport); flow = mlx5_add_flow_rules(mlx5_eswitch_get_slow_fdb(esw), spec, &flow_act, &dest, 1); if (IS_ERR(flow)) { err = PTR_ERR(flow); goto 
add_vf_flow_err; } - flows[vport->index] = flow; + flows[peer_vport->index] = flow; } - if (mlx5_core_ec_sriov_enabled(esw->dev)) { - mlx5_esw_for_each_ec_vf_vport(esw, i, vport, mlx5_core_max_ec_vfs(esw->dev)) { - if (i >= mlx5_core_max_ec_vfs(peer_dev)) - break; - esw_set_peer_miss_rule_source_port(esw, peer_dev->priv.eswitch, - spec, vport->vport); + if (mlx5_core_ec_sriov_enabled(peer_dev)) { + mlx5_esw_for_each_ec_vf_vport(peer_esw, i, peer_vport, + mlx5_core_max_ec_vfs(peer_dev)) { + esw_set_peer_miss_rule_source_port(esw, peer_esw, + spec, + peer_vport->vport); flow = mlx5_add_flow_rules(esw->fdb_table.offloads.slow_fdb, spec, &flow_act, &dest, 1); if (IS_ERR(flow)) { err = PTR_ERR(flow); goto add_ec_vf_flow_err; } - flows[vport->index] = flow; + flows[peer_vport->index] = flow; } } @@ -1282,25 +1283,29 @@ static int esw_add_fdb_peer_miss_rules(struct mlx5_eswitch *esw, return 0; add_ec_vf_flow_err: - mlx5_esw_for_each_ec_vf_vport(esw, i, vport, mlx5_core_max_ec_vfs(esw->dev)) { - if (!flows[vport->index]) + mlx5_esw_for_each_ec_vf_vport(peer_esw, i, peer_vport, + mlx5_core_max_ec_vfs(peer_dev)) { + if (!flows[peer_vport->index]) continue; - mlx5_del_flow_rules(flows[vport->index]); + mlx5_del_flow_rules(flows[peer_vport->index]); } add_vf_flow_err: - mlx5_esw_for_each_vf_vport(esw, i, vport, mlx5_core_max_vfs(esw->dev)) { - if (!flows[vport->index]) + mlx5_esw_for_each_vf_vport(peer_esw, i, peer_vport, + mlx5_core_max_vfs(peer_dev)) { + if (!flows[peer_vport->index]) continue; - mlx5_del_flow_rules(flows[vport->index]); + mlx5_del_flow_rules(flows[peer_vport->index]); } - if (mlx5_ecpf_vport_exists(esw->dev)) { - vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_ECPF); - mlx5_del_flow_rules(flows[vport->index]); + if (mlx5_ecpf_vport_exists(peer_dev)) { + peer_vport = mlx5_eswitch_get_vport(peer_esw, MLX5_VPORT_ECPF); + mlx5_del_flow_rules(flows[peer_vport->index]); } add_ecpf_flow_err: - if (mlx5_core_is_ecpf_esw_manager(esw->dev)) { - vport = 
mlx5_eswitch_get_vport(esw, MLX5_VPORT_PF); - mlx5_del_flow_rules(flows[vport->index]); + + if (mlx5_core_is_ecpf_esw_manager(peer_dev) && + mlx5_esw_host_functions_enabled(peer_dev)) { + peer_vport = mlx5_eswitch_get_vport(peer_esw, MLX5_VPORT_PF); + mlx5_del_flow_rules(flows[peer_vport->index]); } add_pf_flow_err: esw_warn(esw->dev, "FDB: Failed to add peer miss flow rule err %d\n", err); @@ -1313,37 +1318,35 @@ alloc_flows_err: static void esw_del_fdb_peer_miss_rules(struct mlx5_eswitch *esw, struct mlx5_core_dev *peer_dev) { + struct mlx5_eswitch *peer_esw = peer_dev->priv.eswitch; u16 peer_index = mlx5_get_dev_index(peer_dev); struct mlx5_flow_handle **flows; - struct mlx5_vport *vport; + struct mlx5_vport *peer_vport; unsigned long i; flows = esw->fdb_table.offloads.peer_miss_rules[peer_index]; if (!flows) return; - if (mlx5_core_ec_sriov_enabled(esw->dev)) { - mlx5_esw_for_each_ec_vf_vport(esw, i, vport, mlx5_core_max_ec_vfs(esw->dev)) { - /* The flow for a particular vport could be NULL if the other ECPF - * has fewer or no VFs enabled - */ - if (!flows[vport->index]) - continue; - mlx5_del_flow_rules(flows[vport->index]); - } + if (mlx5_core_ec_sriov_enabled(peer_dev)) { + mlx5_esw_for_each_ec_vf_vport(peer_esw, i, peer_vport, + mlx5_core_max_ec_vfs(peer_dev)) + mlx5_del_flow_rules(flows[peer_vport->index]); } - mlx5_esw_for_each_vf_vport(esw, i, vport, mlx5_core_max_vfs(esw->dev)) - mlx5_del_flow_rules(flows[vport->index]); + mlx5_esw_for_each_vf_vport(peer_esw, i, peer_vport, + mlx5_core_max_vfs(peer_dev)) + mlx5_del_flow_rules(flows[peer_vport->index]); - if (mlx5_ecpf_vport_exists(esw->dev)) { - vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_ECPF); - mlx5_del_flow_rules(flows[vport->index]); + if (mlx5_ecpf_vport_exists(peer_dev)) { + peer_vport = mlx5_eswitch_get_vport(peer_esw, MLX5_VPORT_ECPF); + mlx5_del_flow_rules(flows[peer_vport->index]); } - if (mlx5_core_is_ecpf_esw_manager(esw->dev)) { - vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_PF); - 
mlx5_del_flow_rules(flows[vport->index]); + if (mlx5_core_is_ecpf_esw_manager(peer_dev) && + mlx5_esw_host_functions_enabled(peer_dev)) { + peer_vport = mlx5_eswitch_get_vport(peer_esw, MLX5_VPORT_PF); + mlx5_del_flow_rules(flows[peer_vport->index]); } kvfree(flows); @@ -1572,6 +1575,7 @@ esw_chains_create(struct mlx5_eswitch *esw, struct mlx5_flow_table *miss_fdb) attr.max_grp_num = esw->params.large_group_num; attr.default_ft = miss_fdb; attr.mapping = esw->offloads.reg_c0_obj_pool; + attr.fs_base_prio = FDB_BYPASS_PATH; chains = mlx5_chains_create(dev, &attr); if (IS_ERR(chains)) { @@ -1973,7 +1977,6 @@ static void esw_destroy_offloads_fdb_tables(struct mlx5_eswitch *esw) /* Holds true only as long as DMFS is the default */ mlx5_flow_namespace_set_mode(esw->fdb_table.offloads.ns, MLX5_FLOW_STEERING_MODE_DMFS); - atomic64_set(&esw->user_count, 0); } static int esw_get_nr_ft_offloads_steering_src_ports(struct mlx5_eswitch *esw) @@ -2154,7 +2157,9 @@ mlx5_eswitch_create_vport_rx_rule(struct mlx5_eswitch *esw, u16 vport, flow_rule = mlx5_add_flow_rules(esw->offloads.ft_offloads, spec, &flow_act, dest, 1); if (IS_ERR(flow_rule)) { - esw_warn(esw->dev, "fs offloads: Failed to add vport rx rule err %ld\n", PTR_ERR(flow_rule)); + esw_warn(esw->dev, + "fs offloads: Failed to add vport rx rule err %pe\n", + flow_rule); goto out; } @@ -2173,8 +2178,8 @@ static int esw_create_vport_rx_drop_rule(struct mlx5_eswitch *esw) &flow_act, NULL, 0); if (IS_ERR(flow_rule)) { esw_warn(esw->dev, - "fs offloads: Failed to add vport rx drop rule err %ld\n", - PTR_ERR(flow_rule)); + "fs offloads: Failed to add vport rx drop rule err %pe\n", + flow_rule); return PTR_ERR(flow_rule); } @@ -2349,6 +2354,131 @@ static void esw_mode_change(struct mlx5_eswitch *esw, u16 mode) mlx5_devcom_comp_unlock(esw->dev->priv.hca_devcom_comp); } +static void mlx5_esw_fdb_drop_destroy(struct mlx5_eswitch *esw) +{ + if (!esw->fdb_table.offloads.drop_root) + return; + + esw_debug(esw->dev, "Destroying FDB drop 
root table %#x fc %#x\n", + esw->fdb_table.offloads.drop_root->id, + esw->fdb_table.offloads.drop_root_fc->id); + mlx5_del_flow_rules(esw->fdb_table.offloads.drop_root_rule); + /* Don't free flow counter here, can be reused on a later activation */ + mlx5_destroy_flow_table(esw->fdb_table.offloads.drop_root); + esw->fdb_table.offloads.drop_root_rule = NULL; + esw->fdb_table.offloads.drop_root = NULL; +} + +static int mlx5_esw_fdb_drop_create(struct mlx5_eswitch *esw) +{ + struct mlx5_flow_destination drop_fc_dst = {}; + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_flow_destination *dst = NULL; + struct mlx5_core_dev *dev = esw->dev; + struct mlx5_flow_namespace *root_ns; + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_handle *flow_rule; + struct mlx5_flow_table *table; + int err = 0, dst_num = 0; + + if (esw->fdb_table.offloads.drop_root) + return 0; + + root_ns = esw->fdb_table.offloads.ns; + + ft_attr.prio = FDB_DROP_ROOT; + ft_attr.max_fte = 1; + ft_attr.autogroup.max_num_groups = 1; + table = mlx5_create_auto_grouped_flow_table(root_ns, &ft_attr); + if (IS_ERR(table)) { + esw_warn(dev, "Failed to create fdb drop root table, err %pe\n", + table); + return PTR_ERR(table); + } + + /* Drop FC reusable, create once on first deactivation of FDB */ + if (!esw->fdb_table.offloads.drop_root_fc) { + struct mlx5_fc *counter = mlx5_fc_create(dev, 0); + + err = PTR_ERR_OR_ZERO(counter); + if (err) + esw_warn(esw->dev, "create fdb drop fc err %d\n", err); + else + esw->fdb_table.offloads.drop_root_fc = counter; + } + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP; + + if (esw->fdb_table.offloads.drop_root_fc) { + flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT; + drop_fc_dst.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; + drop_fc_dst.counter = esw->fdb_table.offloads.drop_root_fc; + dst = &drop_fc_dst; + dst_num++; + } + + flow_rule = mlx5_add_flow_rules(table, NULL, &flow_act, dst, dst_num); + err = PTR_ERR_OR_ZERO(flow_rule); + if (err) { + 
esw_warn(esw->dev, + "fs offloads: Failed to add vport rx drop rule err %d\n", + err); + goto err_flow_rule; + } + + esw->fdb_table.offloads.drop_root = table; + esw->fdb_table.offloads.drop_root_rule = flow_rule; + esw_debug(esw->dev, "Created FDB drop root table %#x fc %#x\n", + table->id, dst ? dst->counter->id : 0); + return 0; + +err_flow_rule: + /* no need to free drop fc, esw_offloads_steering_cleanup will do it */ + mlx5_destroy_flow_table(table); + return err; +} + +static void mlx5_esw_fdb_active(struct mlx5_eswitch *esw) +{ + struct mlx5_vport *vport; + unsigned long i; + + mlx5_esw_fdb_drop_destroy(esw); + mlx5_mpfs_enable(esw->dev); + + mlx5_esw_for_each_vf_vport(esw, i, vport, U16_MAX) { + if (!vport->adjacent) + continue; + esw_debug(esw->dev, "Connecting vport %d to eswitch\n", + vport->vport); + mlx5_esw_adj_vport_modify(esw->dev, vport->vport, true); + } + + esw->offloads_inactive = false; + esw_warn(esw->dev, "MPFS/FDB active\n"); +} + +static void mlx5_esw_fdb_inactive(struct mlx5_eswitch *esw) +{ + struct mlx5_vport *vport; + unsigned long i; + + mlx5_mpfs_disable(esw->dev); + mlx5_esw_fdb_drop_create(esw); + + mlx5_esw_for_each_vf_vport(esw, i, vport, U16_MAX) { + if (!vport->adjacent) + continue; + esw_debug(esw->dev, "Disconnecting vport %u from eswitch\n", + vport->vport); + + mlx5_esw_adj_vport_modify(esw->dev, vport->vport, false); + } + + esw->offloads_inactive = true; + esw_warn(esw->dev, "MPFS/FDB inactive\n"); +} + static int esw_offloads_start(struct mlx5_eswitch *esw, struct netlink_ext_ack *extack) { @@ -2373,7 +2503,20 @@ static int esw_offloads_start(struct mlx5_eswitch *esw, return 0; } -static int mlx5_esw_offloads_rep_init(struct mlx5_eswitch *esw, const struct mlx5_vport *vport) +void mlx5_esw_offloads_rep_remove(struct mlx5_eswitch *esw, + const struct mlx5_vport *vport) +{ + struct mlx5_eswitch_rep *rep = xa_load(&esw->offloads.vport_reps, + vport->vport); + + if (!rep) + return; + xa_erase(&esw->offloads.vport_reps, 
vport->vport); + kfree(rep); +} + +int mlx5_esw_offloads_rep_add(struct mlx5_eswitch *esw, + const struct mlx5_vport *vport) { struct mlx5_eswitch_rep *rep; int rep_type; @@ -2385,9 +2528,19 @@ static int mlx5_esw_offloads_rep_init(struct mlx5_eswitch *esw, const struct mlx rep->vport = vport->vport; rep->vport_index = vport->index; - for (rep_type = 0; rep_type < NUM_REP_TYPES; rep_type++) - atomic_set(&rep->rep_data[rep_type].state, REP_UNREGISTERED); - + for (rep_type = 0; rep_type < NUM_REP_TYPES; rep_type++) { + if (!esw->offloads.rep_ops[rep_type]) { + atomic_set(&rep->rep_data[rep_type].state, + REP_UNREGISTERED); + continue; + } + /* Dynamic/delegated vports add their representors after + * mlx5_eswitch_register_vport_reps, so mark them as registered + * for them to be loaded later with the others. + */ + rep->esw = esw; + atomic_set(&rep->rep_data[rep_type].state, REP_REGISTERED); + } err = xa_insert(&esw->offloads.vport_reps, rep->vport, rep, GFP_KERNEL); if (err) goto insert_err; @@ -2425,7 +2578,7 @@ static int esw_offloads_init_reps(struct mlx5_eswitch *esw) xa_init(&esw->offloads.vport_reps); mlx5_esw_for_each_vport(esw, i, vport) { - err = mlx5_esw_offloads_rep_init(esw, vport); + err = mlx5_esw_offloads_rep_add(esw, vport); if (err) goto err; } @@ -3076,7 +3229,8 @@ err_out: return err; } -void mlx5_esw_offloads_devcom_init(struct mlx5_eswitch *esw, u64 key) +void mlx5_esw_offloads_devcom_init(struct mlx5_eswitch *esw, + const struct mlx5_devcom_match_attr *attr) { int i; @@ -3095,10 +3249,10 @@ void mlx5_esw_offloads_devcom_init(struct mlx5_eswitch *esw, u64 key) esw->num_peers = 0; esw->devcom = mlx5_devcom_register_component(esw->dev->priv.devc, MLX5_DEVCOM_ESW_OFFLOADS, - key, + attr, mlx5_esw_offloads_devcom_event, esw); - if (IS_ERR(esw->devcom)) + if (!esw->devcom) return; mlx5_devcom_send_event(esw->devcom, @@ -3109,7 +3263,7 @@ void mlx5_esw_offloads_devcom_init(struct mlx5_eswitch *esw, u64 key) void mlx5_esw_offloads_devcom_cleanup(struct 
mlx5_eswitch *esw) { - if (IS_ERR_OR_NULL(esw->devcom)) + if (!esw->devcom) return; mlx5_devcom_send_event(esw->devcom, @@ -3408,6 +3562,10 @@ create_indir_err: static void esw_offloads_steering_cleanup(struct mlx5_eswitch *esw) { + mlx5_esw_fdb_drop_destroy(esw); + if (esw->fdb_table.offloads.drop_root_fc) + mlx5_fc_destroy(esw->dev, esw->fdb_table.offloads.drop_root_fc); + esw->fdb_table.offloads.drop_root_fc = NULL; esw_destroy_vport_rx_drop_rule(esw); esw_destroy_vport_rx_drop_group(esw); esw_destroy_vport_rx_group(esw); @@ -3526,13 +3684,16 @@ bool mlx5_esw_offloads_controller_valid(const struct mlx5_eswitch *esw, u32 cont int esw_offloads_enable(struct mlx5_eswitch *esw) { + u8 mapping_id[MLX5_SW_IMAGE_GUID_MAX_BYTES]; struct mapping_ctx *reg_c0_obj_pool; struct mlx5_vport *vport; unsigned long i; - u64 mapping_id; + u8 id_len; int err; mutex_init(&esw->offloads.termtbl_mutex); + mlx5_esw_adjacent_vhcas_setup(esw); + err = mlx5_rdma_enable_roce(esw->dev); if (err) goto err_roce; @@ -3549,9 +3710,10 @@ int esw_offloads_enable(struct mlx5_eswitch *esw) if (err) goto err_vport_metadata; - mapping_id = mlx5_query_nic_system_image_guid(esw->dev); + mlx5_query_nic_sw_system_image_guid(esw->dev, mapping_id, &id_len); - reg_c0_obj_pool = mapping_create_for_id(mapping_id, MAPPING_TYPE_CHAIN, + reg_c0_obj_pool = mapping_create_for_id(mapping_id, id_len, + MAPPING_TYPE_CHAIN, sizeof(struct mlx5_mapped_obj), ESW_REG_C0_USER_DATA_METADATA_MASK, true); @@ -3566,6 +3728,11 @@ int esw_offloads_enable(struct mlx5_eswitch *esw) if (err) goto err_steering_init; + if (esw->offloads_inactive) + mlx5_esw_fdb_inactive(esw); + else + mlx5_esw_fdb_active(esw); + /* Representor will control the vport link state */ mlx5_esw_for_each_vf_vport(esw, i, vport, esw->esw_funcs.num_vfs) vport->info.link_state = MLX5_VPORT_ADMIN_STATE_DOWN; @@ -3597,6 +3764,7 @@ err_vport_metadata: err_metadata: mlx5_rdma_disable_roce(esw->dev); err_roce: + mlx5_esw_adjacent_vhcas_cleanup(esw); 
mutex_destroy(&esw->offloads.termtbl_mutex); return err; } @@ -3630,6 +3798,10 @@ void esw_offloads_disable(struct mlx5_eswitch *esw) mapping_destroy(esw->offloads.reg_c0_obj_pool); esw_offloads_metadata_uninit(esw); mlx5_rdma_disable_roce(esw->dev); + mlx5_esw_adjacent_vhcas_cleanup(esw); + /* must be done after vhcas cleanup to avoid adjacent vports connect */ + if (esw->offloads_inactive) + mlx5_esw_fdb_active(esw); /* legacy mode always active */ mutex_destroy(&esw->offloads.termtbl_mutex); } @@ -3640,6 +3812,7 @@ static int esw_mode_from_devlink(u16 mode, u16 *mlx5_mode) *mlx5_mode = MLX5_ESWITCH_LEGACY; break; case DEVLINK_ESWITCH_MODE_SWITCHDEV: + case DEVLINK_ESWITCH_MODE_SWITCHDEV_INACTIVE: *mlx5_mode = MLX5_ESWITCH_OFFLOADS; break; default: @@ -3649,14 +3822,17 @@ static int esw_mode_from_devlink(u16 mode, u16 *mlx5_mode) return 0; } -static int esw_mode_to_devlink(u16 mlx5_mode, u16 *mode) +static int esw_mode_to_devlink(struct mlx5_eswitch *esw, u16 *mode) { - switch (mlx5_mode) { + switch (esw->mode) { case MLX5_ESWITCH_LEGACY: *mode = DEVLINK_ESWITCH_MODE_LEGACY; break; case MLX5_ESWITCH_OFFLOADS: - *mode = DEVLINK_ESWITCH_MODE_SWITCHDEV; + if (esw->offloads_inactive) + *mode = DEVLINK_ESWITCH_MODE_SWITCHDEV_INACTIVE; + else + *mode = DEVLINK_ESWITCH_MODE_SWITCHDEV; break; default: return -EINVAL; @@ -3739,6 +3915,94 @@ void mlx5_eswitch_unblock_mode(struct mlx5_core_dev *dev) up_write(&esw->mode_lock); } +/* Returns false only when uplink netdev exists and its netns is different from + * devlink's netns. True for all others so entering switchdev mode is allowed. 
+ */ +static bool mlx5_devlink_netdev_netns_immutable_set(struct devlink *devlink, + bool immutable) +{ + struct mlx5_core_dev *mdev = devlink_priv(devlink); + struct net_device *netdev; + bool ret; + + netdev = mlx5_uplink_netdev_get(mdev); + if (!netdev) + return true; + + rtnl_lock(); + netdev->netns_immutable = immutable; + ret = net_eq(dev_net(netdev), devlink_net(devlink)); + rtnl_unlock(); + + mlx5_uplink_netdev_put(mdev, netdev); + return ret; +} + +/* Returns true when only changing between active and inactive switchdev mode */ +static bool mlx5_devlink_switchdev_active_mode_change(struct mlx5_eswitch *esw, + u16 devlink_mode) +{ + /* current mode is not switchdev */ + if (esw->mode != MLX5_ESWITCH_OFFLOADS) + return false; + + /* new mode is not switchdev */ + if (devlink_mode != DEVLINK_ESWITCH_MODE_SWITCHDEV && + devlink_mode != DEVLINK_ESWITCH_MODE_SWITCHDEV_INACTIVE) + return false; + + /* already inactive: no change in current state */ + if (devlink_mode == DEVLINK_ESWITCH_MODE_SWITCHDEV_INACTIVE && + esw->offloads_inactive) + return false; + + /* already active: no change in current state */ + if (devlink_mode == DEVLINK_ESWITCH_MODE_SWITCHDEV && + !esw->offloads_inactive) + return false; + + down_write(&esw->mode_lock); + esw->offloads_inactive = !esw->offloads_inactive; + esw->eswitch_operation_in_progress = true; + up_write(&esw->mode_lock); + + if (esw->offloads_inactive) + mlx5_esw_fdb_inactive(esw); + else + mlx5_esw_fdb_active(esw); + + down_write(&esw->mode_lock); + esw->eswitch_operation_in_progress = false; + up_write(&esw->mode_lock); + return true; +} + +#define MLX5_ESW_HOLD_TIMEOUT_MS 7000 +#define MLX5_ESW_HOLD_RETRY_DELAY_MS 500 + +void mlx5_eswitch_safe_aux_devs_remove(struct mlx5_core_dev *dev) +{ + unsigned long timeout; + bool hold_esw = true; + + /* Wait for any concurrent eswitch mode transition to complete. 
*/ + if (!mlx5_esw_hold(dev)) { + timeout = jiffies + msecs_to_jiffies(MLX5_ESW_HOLD_TIMEOUT_MS); + while (!mlx5_esw_hold(dev)) { + if (!time_before(jiffies, timeout)) { + hold_esw = false; + break; + } + msleep(MLX5_ESW_HOLD_RETRY_DELAY_MS); + } + } + if (hold_esw) { + if (mlx5_eswitch_mode(dev) == MLX5_ESWITCH_OFFLOADS) + mlx5_core_reps_aux_devs_remove(dev); + mlx5_esw_release(dev); + } +} + int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode, struct netlink_ext_ack *extack) { @@ -3750,15 +4014,24 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode, if (IS_ERR(esw)) return PTR_ERR(esw); + if (mlx5_fw_reset_in_progress(esw->dev)) { + NL_SET_ERR_MSG_MOD(extack, "Can't change eswitch mode during firmware reset"); + return -EBUSY; + } + if (esw_mode_from_devlink(mode, &mlx5_mode)) return -EINVAL; - if (mode == DEVLINK_ESWITCH_MODE_SWITCHDEV && mlx5_get_sd(esw->dev)) { + if (mlx5_mode == MLX5_ESWITCH_OFFLOADS && mlx5_get_sd(esw->dev)) { NL_SET_ERR_MSG_MOD(extack, "Can't change E-Switch mode to switchdev when multi-PF netdev (Socket Direct) is configured."); return -EPERM; } + /* Avoid try_lock, active/inactive mode change is not restricted */ + if (mlx5_devlink_switchdev_active_mode_change(esw, mode)) + return 0; + mlx5_lag_disable_change(esw->dev); err = mlx5_esw_try_lock(esw); if (err < 0) { @@ -3781,24 +4054,36 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode, esw->eswitch_operation_in_progress = true; up_write(&esw->mode_lock); - if (mode == DEVLINK_ESWITCH_MODE_LEGACY) + if (mlx5_mode == MLX5_ESWITCH_OFFLOADS && + !mlx5_devlink_netdev_netns_immutable_set(devlink, true)) { + NL_SET_ERR_MSG_MOD(extack, + "Can't change E-Switch mode to switchdev when netdev net namespace has diverged from the devlink's."); + err = -EINVAL; + goto skip; + } + + if (mlx5_mode == MLX5_ESWITCH_LEGACY) esw->dev->priv.flags |= MLX5_PRIV_FLAGS_SWITCH_LEGACY; mlx5_eswitch_disable_locked(esw); - if (mode == 
DEVLINK_ESWITCH_MODE_SWITCHDEV) { + if (mlx5_mode == MLX5_ESWITCH_OFFLOADS) { if (mlx5_devlink_trap_get_num_active(esw->dev)) { NL_SET_ERR_MSG_MOD(extack, "Can't change mode while devlink traps are active"); err = -EOPNOTSUPP; goto skip; } + esw->offloads_inactive = + (mode == DEVLINK_ESWITCH_MODE_SWITCHDEV_INACTIVE); err = esw_offloads_start(esw, extack); - } else if (mode == DEVLINK_ESWITCH_MODE_LEGACY) { + } else if (mlx5_mode == MLX5_ESWITCH_LEGACY) { err = esw_offloads_stop(esw, extack); } else { err = -EINVAL; } skip: + if (mlx5_mode == MLX5_ESWITCH_OFFLOADS && err) + mlx5_devlink_netdev_netns_immutable_set(devlink, false); down_write(&esw->mode_lock); esw->eswitch_operation_in_progress = false; unlock: @@ -3816,7 +4101,7 @@ int mlx5_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode) if (IS_ERR(esw)) return PTR_ERR(esw); - return esw_mode_to_devlink(esw->mode, mode); + return esw_mode_to_devlink(esw, mode); } static int mlx5_esw_vports_inline_set(struct mlx5_eswitch *esw, u8 mlx5_mode, @@ -3938,23 +4223,25 @@ int mlx5_devlink_eswitch_inline_mode_get(struct devlink *devlink, u8 *mode) return esw_inline_mode_to_devlink(esw->offloads.inline_mode, mode); } -bool mlx5_eswitch_block_encap(struct mlx5_core_dev *dev) +bool mlx5_eswitch_block_encap(struct mlx5_core_dev *dev, bool from_fdb) { struct mlx5_eswitch *esw = dev->priv.eswitch; + enum devlink_eswitch_encap_mode encap; + bool allow_tunnel = false; if (!mlx5_esw_allowed(esw)) return true; down_write(&esw->mode_lock); - if (esw->mode != MLX5_ESWITCH_LEGACY && - esw->offloads.encap != DEVLINK_ESWITCH_ENCAP_MODE_NONE) { - up_write(&esw->mode_lock); - return false; + encap = esw->offloads.encap; + if (esw->mode == MLX5_ESWITCH_LEGACY || + (encap == DEVLINK_ESWITCH_ENCAP_MODE_NONE && !from_fdb)) { + allow_tunnel = true; + esw->offloads.num_block_encap++; } - - esw->offloads.num_block_encap++; up_write(&esw->mode_lock); - return true; + + return allow_tunnel; } void mlx5_eswitch_unblock_encap(struct 
mlx5_core_dev *dev) @@ -4059,7 +4346,8 @@ mlx5_eswitch_vport_has_rep(const struct mlx5_eswitch *esw, u16 vport_num) { /* Currently, only ECPF based device has representor for host PF. */ if (vport_num == MLX5_VPORT_PF && - !mlx5_core_is_ecpf_esw_manager(esw->dev)) + (!mlx5_core_is_ecpf_esw_manager(esw->dev) || + !mlx5_esw_host_functions_enabled(esw->dev))) return false; if (vport_num == MLX5_VPORT_ECPF && @@ -4161,23 +4449,28 @@ u32 mlx5_eswitch_get_vport_metadata_for_match(struct mlx5_eswitch *esw, } EXPORT_SYMBOL(mlx5_eswitch_get_vport_metadata_for_match); -int mlx5_esw_vport_vhca_id_set(struct mlx5_eswitch *esw, u16 vport_num) +int mlx5_esw_vport_vhca_id_map(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) { u16 *old_entry, *vhca_map_entry, vhca_id; - int err; - err = mlx5_vport_get_vhca_id(esw->dev, vport_num, &vhca_id); - if (err) { - esw_warn(esw->dev, "Getting vhca_id for vport failed (vport=%u,err=%d)\n", - vport_num, err); - return err; + if (WARN_ONCE(MLX5_VPORT_INVAL_VHCA_ID(vport), + "vport %d vhca_id is not set", vport->vport)) { + int err; + + err = mlx5_vport_get_vhca_id(vport->dev, vport->vport, + &vhca_id); + if (err) + return err; + vport->vhca_id = vhca_id; } + vhca_id = vport->vhca_id; vhca_map_entry = kmalloc(sizeof(*vhca_map_entry), GFP_KERNEL); if (!vhca_map_entry) return -ENOMEM; - *vhca_map_entry = vport_num; + *vhca_map_entry = vport->vport; old_entry = xa_store(&esw->offloads.vhca_map, vhca_id, vhca_map_entry, GFP_KERNEL); if (xa_is_err(old_entry)) { kfree(vhca_map_entry); @@ -4187,17 +4480,12 @@ int mlx5_esw_vport_vhca_id_set(struct mlx5_eswitch *esw, u16 vport_num) return 0; } -void mlx5_esw_vport_vhca_id_clear(struct mlx5_eswitch *esw, u16 vport_num) +void mlx5_esw_vport_vhca_id_unmap(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) { - u16 *vhca_map_entry, vhca_id; - int err; + u16 *vhca_map_entry; - err = mlx5_vport_get_vhca_id(esw->dev, vport_num, &vhca_id); - if (err) - esw_warn(esw->dev, "Getting vhca_id for vport failed 
(vport=%hu,err=%d)\n", - vport_num, err); - - vhca_map_entry = xa_erase(&esw->offloads.vhca_map, vhca_id); + vhca_map_entry = xa_erase(&esw->offloads.vhca_map, vport->vhca_id); kfree(vhca_map_entry); } @@ -4232,6 +4520,9 @@ int mlx5_devlink_port_fn_hw_addr_get(struct devlink_port *port, struct mlx5_vport *vport = mlx5_devlink_port_vport_get(port); mutex_lock(&esw->state_lock); + + mlx5_query_nic_vport_mac_address(esw->dev, vport->vport, true, + vport->info.mac); ether_addr_copy(hw_addr, vport->info.mac); *hw_addr_len = ETH_ALEN; mutex_unlock(&esw->state_lock); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c index c4de6bf8d1..ccef64fb40 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c @@ -421,6 +421,13 @@ static int mlx5_fpga_conn_create_cq(struct mlx5_fpga_conn *conn, int cq_size) __be64 *pas; u32 i; + conn->cq.mcq.cqe_sz = 64; + conn->cq.mcq.set_ci_db = conn->cq.wq_ctrl.db.db; + conn->cq.mcq.arm_db = conn->cq.wq_ctrl.db.db + 1; + *conn->cq.mcq.set_ci_db = 0; + conn->cq.mcq.vector = 0; + conn->cq.mcq.comp = mlx5_fpga_conn_cq_complete; + cq_size = roundup_pow_of_two(cq_size); MLX5_SET(cqc, temp_cqc, log_cq_size, ilog2(cq_size)); @@ -468,16 +475,7 @@ static int mlx5_fpga_conn_create_cq(struct mlx5_fpga_conn *conn, int cq_size) if (err) goto err_cqwq; - conn->cq.mcq.cqe_sz = 64; - conn->cq.mcq.set_ci_db = conn->cq.wq_ctrl.db.db; - conn->cq.mcq.arm_db = conn->cq.wq_ctrl.db.db + 1; - *conn->cq.mcq.set_ci_db = 0; - *conn->cq.mcq.arm_db = 0; - conn->cq.mcq.vector = 0; - conn->cq.mcq.comp = mlx5_fpga_conn_cq_complete; - conn->cq.mcq.uar = fdev->conn_res.uar; tasklet_setup(&conn->cq.tasklet, mlx5_fpga_conn_cq_tasklet); - mlx5_fpga_dbg(fdev, "Created CQ #0x%x\n", conn->cq.mcq.cqn); goto out; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c index 
e5c1012921..1ec61164e6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c @@ -211,7 +211,7 @@ int mlx5_fpga_device_start(struct mlx5_core_dev *mdev) max_num_qps = MLX5_CAP_FPGA(mdev, shell_caps.max_num_qps); if (!max_num_qps) { mlx5_fpga_err(fdev, "FPGA reports 0 QPs in SHELL_CAPS\n"); - err = -ENOTSUPP; + err = -EOPNOTSUPP; goto out; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c index a47c29571f..c348ee62cd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c @@ -239,6 +239,10 @@ static int mlx5_cmd_update_root_ft(struct mlx5_flow_root_namespace *ns, MLX5_SET(set_flow_table_root_in, in, vport_number, ft->vport); MLX5_SET(set_flow_table_root_in, in, other_vport, !!(ft->flags & MLX5_FLOW_TABLE_OTHER_VPORT)); + MLX5_SET(set_flow_table_root_in, in, eswitch_owner_vhca_id, + ft->esw_owner_vhca_id); + MLX5_SET(set_flow_table_root_in, in, other_eswitch, + !!(ft->flags & MLX5_FLOW_TABLE_OTHER_ESWITCH)); err = mlx5_cmd_exec_in(dev, set_flow_table_root, in); if (!err && @@ -302,6 +306,10 @@ static int mlx5_cmd_create_flow_table(struct mlx5_flow_root_namespace *ns, MLX5_SET(create_flow_table_in, in, vport_number, ft->vport); MLX5_SET(create_flow_table_in, in, other_vport, !!(ft->flags & MLX5_FLOW_TABLE_OTHER_VPORT)); + MLX5_SET(create_flow_table_in, in, eswitch_owner_vhca_id, + ft->esw_owner_vhca_id); + MLX5_SET(create_flow_table_in, in, other_eswitch, + !!(ft->flags & MLX5_FLOW_TABLE_OTHER_ESWITCH)); MLX5_SET(create_flow_table_in, in, flow_table_context.decap_en, en_decap); @@ -360,6 +368,10 @@ static int mlx5_cmd_destroy_flow_table(struct mlx5_flow_root_namespace *ns, MLX5_SET(destroy_flow_table_in, in, vport_number, ft->vport); MLX5_SET(destroy_flow_table_in, in, other_vport, !!(ft->flags & MLX5_FLOW_TABLE_OTHER_VPORT)); + MLX5_SET(destroy_flow_table_in, in, 
eswitch_owner_vhca_id, + ft->esw_owner_vhca_id); + MLX5_SET(destroy_flow_table_in, in, other_eswitch, + !!(ft->flags & MLX5_FLOW_TABLE_OTHER_ESWITCH)); err = mlx5_cmd_exec_in(dev, destroy_flow_table, in); if (!err) @@ -394,6 +406,10 @@ static int mlx5_cmd_modify_flow_table(struct mlx5_flow_root_namespace *ns, MLX5_SET(modify_flow_table_in, in, vport_number, ft->vport); MLX5_SET(modify_flow_table_in, in, other_vport, !!(ft->flags & MLX5_FLOW_TABLE_OTHER_VPORT)); + MLX5_SET(modify_flow_table_in, in, eswitch_owner_vhca_id, + ft->esw_owner_vhca_id); + MLX5_SET(modify_flow_table_in, in, other_eswitch, + !!(ft->flags & MLX5_FLOW_TABLE_OTHER_ESWITCH)); MLX5_SET(modify_flow_table_in, in, modify_field_select, MLX5_MODIFY_FLOW_TABLE_MISS_TABLE_ID); if (next_ft) { @@ -429,6 +445,10 @@ static int mlx5_cmd_create_flow_group(struct mlx5_flow_root_namespace *ns, MLX5_SET(create_flow_group_in, in, vport_number, ft->vport); MLX5_SET(create_flow_group_in, in, other_vport, !!(ft->flags & MLX5_FLOW_TABLE_OTHER_VPORT)); + MLX5_SET(create_flow_group_in, in, eswitch_owner_vhca_id, + ft->esw_owner_vhca_id); + MLX5_SET(create_flow_group_in, in, other_eswitch, + !!(ft->flags & MLX5_FLOW_TABLE_OTHER_ESWITCH)); err = mlx5_cmd_exec_inout(dev, create_flow_group, in, out); if (!err) fg->id = MLX5_GET(create_flow_group_out, out, @@ -451,6 +471,10 @@ static int mlx5_cmd_destroy_flow_group(struct mlx5_flow_root_namespace *ns, MLX5_SET(destroy_flow_group_in, in, vport_number, ft->vport); MLX5_SET(destroy_flow_group_in, in, other_vport, !!(ft->flags & MLX5_FLOW_TABLE_OTHER_VPORT)); + MLX5_SET(destroy_flow_group_in, in, eswitch_owner_vhca_id, + ft->esw_owner_vhca_id); + MLX5_SET(destroy_flow_group_in, in, other_eswitch, + !!(ft->flags & MLX5_FLOW_TABLE_OTHER_ESWITCH)); return mlx5_cmd_exec_in(dev, destroy_flow_group, in); } @@ -527,7 +551,7 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev, struct mlx5_flow_rule *dst; void *in_flow_context, *vlan; void *in_match_value; - int reformat_id = 0; + 
u32 reformat_id = 0; unsigned int inlen; int dst_cnt_size; u32 *in, action; @@ -559,6 +583,9 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev, MLX5_SET(set_fte_in, in, vport_number, ft->vport); MLX5_SET(set_fte_in, in, other_vport, !!(ft->flags & MLX5_FLOW_TABLE_OTHER_VPORT)); + MLX5_SET(set_fte_in, in, eswitch_owner_vhca_id, ft->esw_owner_vhca_id); + MLX5_SET(set_fte_in, in, other_eswitch, + !!(ft->flags & MLX5_FLOW_TABLE_OTHER_ESWITCH)); in_flow_context = MLX5_ADDR_OF(set_fte_in, in, flow_context); MLX5_SET(flow_context, in_flow_context, group_id, group_id); @@ -580,23 +607,21 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev, MLX5_SET(flow_context, in_flow_context, action, action); if (!extended_dest && fte->act_dests.action.pkt_reformat) { - struct mlx5_pkt_reformat *pkt_reformat = fte->act_dests.action.pkt_reformat; + struct mlx5_pkt_reformat *pkt_reformat = + fte->act_dests.action.pkt_reformat; - if (pkt_reformat->owner == MLX5_FLOW_RESOURCE_OWNER_SW) { - reformat_id = mlx5_fs_dr_action_get_pkt_reformat_id(pkt_reformat); - if (reformat_id < 0) { - mlx5_core_err(dev, - "Unsupported SW-owned pkt_reformat type (%d) in FW-owned table\n", - pkt_reformat->reformat_type); - err = reformat_id; - goto err_out; - } - } else { - reformat_id = fte->act_dests.action.pkt_reformat->id; + err = mlx5_fs_get_packet_reformat_id(pkt_reformat, + &reformat_id); + if (err) { + mlx5_core_err(dev, + "Unsupported pkt_reformat type (%d)\n", + pkt_reformat->reformat_type); + goto err_out; } } - MLX5_SET(flow_context, in_flow_context, packet_reformat_id, (u32)reformat_id); + MLX5_SET(flow_context, in_flow_context, packet_reformat_id, + reformat_id); if (fte->act_dests.action.modify_hdr) { if (fte->act_dests.action.modify_hdr->owner == MLX5_FLOW_RESOURCE_OWNER_SW) { @@ -790,6 +815,10 @@ static int mlx5_cmd_delete_fte(struct mlx5_flow_root_namespace *ns, MLX5_SET(delete_fte_in, in, vport_number, ft->vport); MLX5_SET(delete_fte_in, in, other_vport, !!(ft->flags & 
MLX5_FLOW_TABLE_OTHER_VPORT)); + MLX5_SET(delete_fte_in, in, eswitch_owner_vhca_id, + ft->esw_owner_vhca_id); + MLX5_SET(delete_fte_in, in, other_eswitch, + !!(ft->flags & MLX5_FLOW_TABLE_OTHER_ESWITCH)); return mlx5_cmd_exec_in(dev, delete_fte, in); } @@ -1169,7 +1198,8 @@ int mlx5_fs_cmd_set_tx_flow_table_root(struct mlx5_core_dev *dev, u32 ft_id, boo u32 out[MLX5_ST_SZ_DW(set_flow_table_root_out)] = {}; u32 in[MLX5_ST_SZ_DW(set_flow_table_root_in)] = {}; - if (disconnect && MLX5_CAP_FLOWTABLE_NIC_TX(dev, reset_root_to_default)) + if (disconnect && + !MLX5_CAP_FLOWTABLE_NIC_TX(dev, reset_root_to_default)) return -EOPNOTSUPP; MLX5_SET(set_flow_table_root_in, in, opcode, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index a22ecf1415..2b755a0035 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -113,13 +113,16 @@ #define ETHTOOL_PRIO_NUM_LEVELS 1 #define ETHTOOL_NUM_PRIOS 11 #define ETHTOOL_MIN_LEVEL (KERNEL_MIN_LEVEL + ETHTOOL_NUM_PRIOS) -/* Promiscuous, Vlan, mac, ttc, inner ttc, {UDP/ANY/aRFS/accel/{esp, esp_err}}, IPsec policy, - * {IPsec RoCE MPV,Alias table},IPsec RoCE policy +/* Vlan, mac, ttc, inner ttc, {UDP/ANY/aRFS/accel/{esp, esp_err}}, IPsec policy, + * IPsec policy miss, {IPsec RoCE MPV,Alias table},IPsec RoCE policy */ #define KERNEL_NIC_PRIO_NUM_LEVELS 11 #define KERNEL_NIC_NUM_PRIOS 1 -/* One more level for tc */ -#define KERNEL_MIN_LEVEL (KERNEL_NIC_PRIO_NUM_LEVELS + 1) +/* One more level for tc, and one more for promisc */ +#define KERNEL_MIN_LEVEL (KERNEL_NIC_PRIO_NUM_LEVELS + 2) + +#define KERNEL_NIC_PROMISC_NUM_PRIOS 1 +#define KERNEL_NIC_PROMISC_NUM_LEVELS 1 #define KERNEL_NIC_TC_NUM_PRIOS 1 #define KERNEL_NIC_TC_NUM_LEVELS 3 @@ -187,6 +190,8 @@ static struct init_tree_node { ADD_NS(MLX5_FLOW_TABLE_MISS_ACTION_DEF, ADD_MULTIPLE_PRIO(KERNEL_NIC_TC_NUM_PRIOS, KERNEL_NIC_TC_NUM_LEVELS), + 
ADD_MULTIPLE_PRIO(KERNEL_NIC_PROMISC_NUM_PRIOS, + KERNEL_NIC_PROMISC_NUM_LEVELS), ADD_MULTIPLE_PRIO(KERNEL_NIC_NUM_PRIOS, KERNEL_NIC_PRIO_NUM_LEVELS))), ADD_PRIO(0, BY_PASS_MIN_LEVEL, 0, FS_CHAINING_CAPS, @@ -934,10 +939,10 @@ static struct mlx5_flow_group *alloc_insert_flow_group(struct mlx5_flow_table *f return fg; } -static struct mlx5_flow_table *alloc_flow_table(int level, u16 vport, - enum fs_flow_table_type table_type, - enum fs_flow_table_op_mod op_mod, - u32 flags) +static struct mlx5_flow_table * +alloc_flow_table(struct mlx5_flow_table_attr *ft_attr, u16 vport, + enum fs_flow_table_type table_type, + enum fs_flow_table_op_mod op_mod) { struct mlx5_flow_table *ft; int ret; @@ -952,12 +957,13 @@ static struct mlx5_flow_table *alloc_flow_table(int level, u16 vport, return ERR_PTR(ret); } - ft->level = level; + ft->level = ft_attr->level; ft->node.type = FS_TYPE_FLOW_TABLE; ft->op_mod = op_mod; ft->type = table_type; ft->vport = vport; - ft->flags = flags; + ft->esw_owner_vhca_id = ft_attr->esw_owner_vhca_id; + ft->flags = ft_attr->flags; INIT_LIST_HEAD(&ft->fwd_rules); mutex_init(&ft->lock); @@ -1365,10 +1371,7 @@ static struct mlx5_flow_table *__mlx5_create_flow_table(struct mlx5_flow_namespa /* The level is related to the * priority level range. 
*/ - ft = alloc_flow_table(ft_attr->level, - vport, - root->table_type, - op_mod, ft_attr->flags); + ft = alloc_flow_table(ft_attr, vport, root->table_type, op_mod); if (IS_ERR(ft)) { err = PTR_ERR(ft); goto unlock_root; @@ -1830,14 +1833,35 @@ static int create_auto_flow_group(struct mlx5_flow_table *ft, return err; } +int mlx5_fs_get_packet_reformat_id(struct mlx5_pkt_reformat *pkt_reformat, + u32 *id) +{ + switch (pkt_reformat->owner) { + case MLX5_FLOW_RESOURCE_OWNER_FW: + *id = pkt_reformat->id; + return 0; + case MLX5_FLOW_RESOURCE_OWNER_SW: + return mlx5_fs_dr_action_get_pkt_reformat_id(pkt_reformat, id); + case MLX5_FLOW_RESOURCE_OWNER_HWS: + return mlx5_fs_hws_action_get_pkt_reformat_id(pkt_reformat, id); + default: + return -EINVAL; + } +} + static bool mlx5_pkt_reformat_cmp(struct mlx5_pkt_reformat *p1, struct mlx5_pkt_reformat *p2) { - return p1->owner == p2->owner && - (p1->owner == MLX5_FLOW_RESOURCE_OWNER_FW ? - p1->id == p2->id : - mlx5_fs_dr_action_get_pkt_reformat_id(p1) == - mlx5_fs_dr_action_get_pkt_reformat_id(p2)); + int err1, err2; + u32 id1, id2; + + if (p1->owner != p2->owner) + return false; + + err1 = mlx5_fs_get_packet_reformat_id(p1, &id1); + err2 = mlx5_fs_get_packet_reformat_id(p2, &id2); + + return !err1 && !err2 && id1 == id2; } static bool mlx5_flow_dests_cmp(struct mlx5_flow_destination *d1, @@ -2207,6 +2231,7 @@ try_add_to_existing_fg(struct mlx5_flow_table *ft, struct mlx5_flow_handle *rule; struct match_list *iter; bool take_write = false; + bool try_again = false; struct fs_fte *fte; u64 version = 0; int err; @@ -2271,6 +2296,7 @@ skip_search: nested_down_write_ref_node(&g->node, FS_LOCK_PARENT); if (!g->node.active) { + try_again = true; up_write_ref_node(&g->node, false); continue; } @@ -2292,7 +2318,8 @@ skip_search: tree_put_node(&fte->node, false); return rule; } - rule = ERR_PTR(-ENOENT); + err = try_again ? 
-EAGAIN : -ENOENT; + rule = ERR_PTR(err); out: kmem_cache_free(steering->ftes_cache, fte); return rule; @@ -2764,30 +2791,32 @@ struct mlx5_flow_namespace *mlx5_get_flow_namespace(struct mlx5_core_dev *dev, } EXPORT_SYMBOL(mlx5_get_flow_namespace); +struct mlx5_vport_acl_root_ns { + u16 vport_idx; + struct mlx5_flow_root_namespace *root_ns; +}; + struct mlx5_flow_namespace * mlx5_get_flow_vport_namespace(struct mlx5_core_dev *dev, enum mlx5_flow_namespace_type type, int vport_idx) { struct mlx5_flow_steering *steering = dev->priv.steering; + struct mlx5_vport_acl_root_ns *vport_ns; if (!steering) return NULL; switch (type) { case MLX5_FLOW_NAMESPACE_ESW_EGRESS: - if (vport_idx >= steering->esw_egress_acl_vports) - return NULL; - if (steering->esw_egress_root_ns && - steering->esw_egress_root_ns[vport_idx]) - return &steering->esw_egress_root_ns[vport_idx]->ns; + vport_ns = xa_load(&steering->esw_egress_root_ns, vport_idx); + if (vport_ns) + return &vport_ns->root_ns->ns; else return NULL; case MLX5_FLOW_NAMESPACE_ESW_INGRESS: - if (vport_idx >= steering->esw_ingress_acl_vports) - return NULL; - if (steering->esw_ingress_root_ns && - steering->esw_ingress_root_ns[vport_idx]) - return &steering->esw_ingress_root_ns[vport_idx]->ns; + vport_ns = xa_load(&steering->esw_ingress_root_ns, vport_idx); + if (vport_ns) + return &vport_ns->root_ns->ns; else return NULL; case MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX: @@ -3221,36 +3250,120 @@ static int init_rdma_transport_rx_root_ns_one(struct mlx5_flow_steering *steering, int vport_idx) { + struct mlx5_flow_root_namespace *root_ns; struct fs_prio *prio; + int ret; + int i; steering->rdma_transport_rx_root_ns[vport_idx] = create_root_ns(steering, FS_FT_RDMA_TRANSPORT_RX); if (!steering->rdma_transport_rx_root_ns[vport_idx]) return -ENOMEM; - /* create 1 prio*/ - prio = fs_create_prio(&steering->rdma_transport_rx_root_ns[vport_idx]->ns, - MLX5_RDMA_TRANSPORT_BYPASS_PRIO, 1); - return PTR_ERR_OR_ZERO(prio); + root_ns = 
steering->rdma_transport_rx_root_ns[vport_idx]; + + for (i = 0; i < MLX5_RDMA_TRANSPORT_BYPASS_PRIO; i++) { + prio = fs_create_prio(&root_ns->ns, i, 1); + if (IS_ERR(prio)) { + ret = PTR_ERR(prio); + goto err; + } + } + set_prio_attrs(root_ns); + return 0; + +err: + cleanup_root_ns(root_ns); + return ret; } static int init_rdma_transport_tx_root_ns_one(struct mlx5_flow_steering *steering, int vport_idx) { + struct mlx5_flow_root_namespace *root_ns; struct fs_prio *prio; + int ret; + int i; steering->rdma_transport_tx_root_ns[vport_idx] = create_root_ns(steering, FS_FT_RDMA_TRANSPORT_TX); if (!steering->rdma_transport_tx_root_ns[vport_idx]) return -ENOMEM; - /* create 1 prio*/ - prio = fs_create_prio(&steering->rdma_transport_tx_root_ns[vport_idx]->ns, - MLX5_RDMA_TRANSPORT_BYPASS_PRIO, 1); - return PTR_ERR_OR_ZERO(prio); + root_ns = steering->rdma_transport_tx_root_ns[vport_idx]; + + for (i = 0; i < MLX5_RDMA_TRANSPORT_BYPASS_PRIO; i++) { + prio = fs_create_prio(&root_ns->ns, i, 1); + if (IS_ERR(prio)) { + ret = PTR_ERR(prio); + goto err; + } + } + set_prio_attrs(root_ns); + return 0; + +err: + cleanup_root_ns(root_ns); + return ret; } +static bool mlx5_fs_ns_is_empty(struct mlx5_flow_namespace *ns) +{ + struct fs_prio *iter_prio; + + fs_for_each_prio(iter_prio, ns) { + if (iter_prio->num_ft) + return false; + } + + return true; +} + +int mlx5_fs_set_root_dev(struct mlx5_core_dev *dev, + struct mlx5_core_dev *new_dev, + enum fs_flow_table_type table_type) +{ + struct mlx5_flow_root_namespace **root; + int total_vports; + int i; + + switch (table_type) { + case FS_FT_RDMA_TRANSPORT_TX: + root = dev->priv.steering->rdma_transport_tx_root_ns; + total_vports = dev->priv.steering->rdma_transport_tx_vports; + break; + case FS_FT_RDMA_TRANSPORT_RX: + root = dev->priv.steering->rdma_transport_rx_root_ns; + total_vports = dev->priv.steering->rdma_transport_rx_vports; + break; + default: + WARN_ON_ONCE(true); + return -EINVAL; + } + + for (i = 0; i < total_vports; i++) { + 
mutex_lock(&root[i]->chain_lock); + if (!mlx5_fs_ns_is_empty(&root[i]->ns)) { + mutex_unlock(&root[i]->chain_lock); + goto err; + } + root[i]->dev = new_dev; + mutex_unlock(&root[i]->chain_lock); + } + return 0; +err: + while (i--) { + mutex_lock(&root[i]->chain_lock); + root[i]->dev = dev; + mutex_unlock(&root[i]->chain_lock); + } + /* If you hit this error try destroying all flow tables and try again */ + mlx5_core_err(dev, "Failed to set root device for RDMA TRANSPORT\n"); + return -EINVAL; +} +EXPORT_SYMBOL(mlx5_fs_set_root_dev); + static int init_rdma_transport_rx_root_ns(struct mlx5_flow_steering *steering) { struct mlx5_core_dev *dev = steering->dev; @@ -3461,6 +3574,11 @@ static int init_fdb_root_ns(struct mlx5_flow_steering *steering) if (!steering->fdb_root_ns) return -ENOMEM; + maj_prio = fs_create_prio(&steering->fdb_root_ns->ns, FDB_DROP_ROOT, 1); + err = PTR_ERR_OR_ZERO(maj_prio); + if (err) + goto out_err; + err = create_fdb_bypass(steering); if (err) goto out_err; @@ -3518,118 +3636,102 @@ out_err: return err; } -static int init_egress_acl_root_ns(struct mlx5_flow_steering *steering, int vport) +static void +mlx5_fs_remove_vport_acl_root_ns(struct xarray *esw_acl_root_ns, u16 vport_idx) { - struct fs_prio *prio; + struct mlx5_vport_acl_root_ns *vport_ns; - steering->esw_egress_root_ns[vport] = create_root_ns(steering, FS_FT_ESW_EGRESS_ACL); - if (!steering->esw_egress_root_ns[vport]) - return -ENOMEM; - - /* create 1 prio*/ - prio = fs_create_prio(&steering->esw_egress_root_ns[vport]->ns, 0, 1); - return PTR_ERR_OR_ZERO(prio); -} - -static int init_ingress_acl_root_ns(struct mlx5_flow_steering *steering, int vport) -{ - struct fs_prio *prio; - - steering->esw_ingress_root_ns[vport] = create_root_ns(steering, FS_FT_ESW_INGRESS_ACL); - if (!steering->esw_ingress_root_ns[vport]) - return -ENOMEM; - - /* create 1 prio*/ - prio = fs_create_prio(&steering->esw_ingress_root_ns[vport]->ns, 0, 1); - return PTR_ERR_OR_ZERO(prio); -} - -int 
mlx5_fs_egress_acls_init(struct mlx5_core_dev *dev, int total_vports) -{ - struct mlx5_flow_steering *steering = dev->priv.steering; - int err; - int i; - - steering->esw_egress_root_ns = - kcalloc(total_vports, - sizeof(*steering->esw_egress_root_ns), - GFP_KERNEL); - if (!steering->esw_egress_root_ns) - return -ENOMEM; - - for (i = 0; i < total_vports; i++) { - err = init_egress_acl_root_ns(steering, i); - if (err) - goto cleanup_root_ns; + vport_ns = xa_erase(esw_acl_root_ns, vport_idx); + if (vport_ns) { + cleanup_root_ns(vport_ns->root_ns); + kfree(vport_ns); } - steering->esw_egress_acl_vports = total_vports; +} + +static int +mlx5_fs_add_vport_acl_root_ns(struct mlx5_flow_steering *steering, + struct xarray *esw_acl_root_ns, + enum fs_flow_table_type table_type, + u16 vport_idx) +{ + struct mlx5_vport_acl_root_ns *vport_ns; + struct fs_prio *prio; + int err; + + /* sanity check, intended xarrays are used */ + if (WARN_ON(esw_acl_root_ns != &steering->esw_egress_root_ns && + esw_acl_root_ns != &steering->esw_ingress_root_ns)) + return -EINVAL; + + if (table_type != FS_FT_ESW_EGRESS_ACL && + table_type != FS_FT_ESW_INGRESS_ACL) { + mlx5_core_err(steering->dev, + "Invalid table type %d for egress/ingress ACLs\n", + table_type); + return -EINVAL; + } + + if (xa_load(esw_acl_root_ns, vport_idx)) + return -EEXIST; + + vport_ns = kzalloc(sizeof(*vport_ns), GFP_KERNEL); + if (!vport_ns) + return -ENOMEM; + + vport_ns->root_ns = create_root_ns(steering, table_type); + if (!vport_ns->root_ns) { + err = -ENOMEM; + goto kfree_vport_ns; + } + + /* create 1 prio*/ + prio = fs_create_prio(&vport_ns->root_ns->ns, 0, 1); + if (IS_ERR(prio)) { + err = PTR_ERR(prio); + goto cleanup_root_ns; + } + + vport_ns->vport_idx = vport_idx; + err = xa_insert(esw_acl_root_ns, vport_idx, vport_ns, GFP_KERNEL); + if (err) + goto cleanup_root_ns; return 0; cleanup_root_ns: - for (i--; i >= 0; i--) - cleanup_root_ns(steering->esw_egress_root_ns[i]); - kfree(steering->esw_egress_root_ns); - 
steering->esw_egress_root_ns = NULL; + cleanup_root_ns(vport_ns->root_ns); +kfree_vport_ns: + kfree(vport_ns); return err; } -void mlx5_fs_egress_acls_cleanup(struct mlx5_core_dev *dev) +int mlx5_fs_vport_egress_acl_ns_add(struct mlx5_flow_steering *steering, + u16 vport_idx) { - struct mlx5_flow_steering *steering = dev->priv.steering; - int i; - - if (!steering->esw_egress_root_ns) - return; - - for (i = 0; i < steering->esw_egress_acl_vports; i++) - cleanup_root_ns(steering->esw_egress_root_ns[i]); - - kfree(steering->esw_egress_root_ns); - steering->esw_egress_root_ns = NULL; + return mlx5_fs_add_vport_acl_root_ns(steering, + &steering->esw_egress_root_ns, + FS_FT_ESW_EGRESS_ACL, vport_idx); } -int mlx5_fs_ingress_acls_init(struct mlx5_core_dev *dev, int total_vports) +int mlx5_fs_vport_ingress_acl_ns_add(struct mlx5_flow_steering *steering, + u16 vport_idx) { - struct mlx5_flow_steering *steering = dev->priv.steering; - int err; - int i; - - steering->esw_ingress_root_ns = - kcalloc(total_vports, - sizeof(*steering->esw_ingress_root_ns), - GFP_KERNEL); - if (!steering->esw_ingress_root_ns) - return -ENOMEM; - - for (i = 0; i < total_vports; i++) { - err = init_ingress_acl_root_ns(steering, i); - if (err) - goto cleanup_root_ns; - } - steering->esw_ingress_acl_vports = total_vports; - return 0; - -cleanup_root_ns: - for (i--; i >= 0; i--) - cleanup_root_ns(steering->esw_ingress_root_ns[i]); - kfree(steering->esw_ingress_root_ns); - steering->esw_ingress_root_ns = NULL; - return err; + return mlx5_fs_add_vport_acl_root_ns(steering, + &steering->esw_ingress_root_ns, + FS_FT_ESW_INGRESS_ACL, vport_idx); } -void mlx5_fs_ingress_acls_cleanup(struct mlx5_core_dev *dev) +void mlx5_fs_vport_egress_acl_ns_remove(struct mlx5_flow_steering *steering, + int vport_idx) { - struct mlx5_flow_steering *steering = dev->priv.steering; - int i; + mlx5_fs_remove_vport_acl_root_ns(&steering->esw_egress_root_ns, + vport_idx); +} - if (!steering->esw_ingress_root_ns) - return; - - 
for (i = 0; i < steering->esw_ingress_acl_vports; i++) - cleanup_root_ns(steering->esw_ingress_root_ns[i]); - - kfree(steering->esw_ingress_root_ns); - steering->esw_ingress_root_ns = NULL; +void mlx5_fs_vport_ingress_acl_ns_remove(struct mlx5_flow_steering *steering, + int vport_idx) +{ + mlx5_fs_remove_vport_acl_root_ns(&steering->esw_ingress_root_ns, + vport_idx); } u32 mlx5_fs_get_capabilities(struct mlx5_core_dev *dev, enum mlx5_flow_namespace_type type) @@ -3677,6 +3779,13 @@ static int mlx5_fs_mode_validate(struct devlink *devlink, u32 id, char *value = val.vstr; u8 eswitch_mode; + eswitch_mode = mlx5_eswitch_mode(dev); + if (eswitch_mode == MLX5_ESWITCH_OFFLOADS) { + NL_SET_ERR_MSG_FMT_MOD(extack, + "Changing fs mode is not supported when eswitch offloads enabled."); + return -EOPNOTSUPP; + } + if (!strcmp(value, "dmfs")) return 0; @@ -3702,14 +3811,6 @@ static int mlx5_fs_mode_validate(struct devlink *devlink, u32 id, return -EINVAL; } - eswitch_mode = mlx5_eswitch_mode(dev); - if (eswitch_mode == MLX5_ESWITCH_OFFLOADS) { - NL_SET_ERR_MSG_FMT_MOD(extack, - "Moving to %s is not supported when eswitch offloads enabled.", - value); - return -EOPNOTSUPP; - } - return 0; } @@ -3762,6 +3863,11 @@ void mlx5_fs_core_cleanup(struct mlx5_core_dev *dev) { struct mlx5_flow_steering *steering = dev->priv.steering; + WARN_ON(!xa_empty(&steering->esw_egress_root_ns)); + WARN_ON(!xa_empty(&steering->esw_ingress_root_ns)); + xa_destroy(&steering->esw_egress_root_ns); + xa_destroy(&steering->esw_ingress_root_ns); + cleanup_root_ns(steering->root_ns); cleanup_fdb_root_ns(steering); cleanup_root_ns(steering->port_sel_root_ns); @@ -3852,6 +3958,8 @@ int mlx5_fs_core_init(struct mlx5_core_dev *dev) goto err; } + xa_init(&steering->esw_egress_root_ns); + xa_init(&steering->esw_ingress_root_ns); return 0; err: @@ -3895,6 +4003,8 @@ int mlx5_fs_core_alloc(struct mlx5_core_dev *dev) if (mlx5_fs_dr_is_supported(dev)) steering->mode = MLX5_FLOW_STEERING_MODE_SMFS; + else if 
(mlx5_fs_hws_is_supported(dev)) + steering->mode = MLX5_FLOW_STEERING_MODE_HMFS; else steering->mode = MLX5_FLOW_STEERING_MODE_DMFS; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h index 1f523fb761..1c65914252 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h @@ -58,6 +58,7 @@ struct mlx5_flow_definer { enum mlx5_flow_resource_owner { MLX5_FLOW_RESOURCE_OWNER_FW, MLX5_FLOW_RESOURCE_OWNER_SW, + MLX5_FLOW_RESOURCE_OWNER_HWS, }; struct mlx5_modify_hdr { @@ -102,24 +103,6 @@ enum fs_node_type { FS_TYPE_FLOW_DEST }; -enum fs_flow_table_type { - FS_FT_NIC_RX = 0x0, - FS_FT_NIC_TX = 0x1, - FS_FT_ESW_EGRESS_ACL = 0x2, - FS_FT_ESW_INGRESS_ACL = 0x3, - FS_FT_FDB = 0X4, - FS_FT_SNIFFER_RX = 0X5, - FS_FT_SNIFFER_TX = 0X6, - FS_FT_RDMA_RX = 0X7, - FS_FT_RDMA_TX = 0X8, - FS_FT_PORT_SEL = 0X9, - FS_FT_FDB_RX = 0xa, - FS_FT_FDB_TX = 0xb, - FS_FT_RDMA_TRANSPORT_RX = 0xd, - FS_FT_RDMA_TRANSPORT_TX = 0xe, - FS_FT_MAX_TYPE = FS_FT_RDMA_TRANSPORT_TX, -}; - enum fs_flow_table_op_mod { FS_FT_OP_MOD_NORMAL, FS_FT_OP_MOD_LAG_DEMUX, @@ -150,16 +133,14 @@ struct mlx5_flow_steering { struct mlx5_flow_root_namespace *root_ns; struct mlx5_flow_root_namespace *fdb_root_ns; struct mlx5_flow_namespace **fdb_sub_ns; - struct mlx5_flow_root_namespace **esw_egress_root_ns; - struct mlx5_flow_root_namespace **esw_ingress_root_ns; + struct xarray esw_egress_root_ns; + struct xarray esw_ingress_root_ns; struct mlx5_flow_root_namespace *sniffer_tx_root_ns; struct mlx5_flow_root_namespace *sniffer_rx_root_ns; struct mlx5_flow_root_namespace *rdma_rx_root_ns; struct mlx5_flow_root_namespace *rdma_tx_root_ns; struct mlx5_flow_root_namespace *egress_root_ns; struct mlx5_flow_root_namespace *port_sel_root_ns; - int esw_egress_acl_vports; - int esw_ingress_acl_vports; struct mlx5_flow_root_namespace **rdma_transport_rx_root_ns; struct mlx5_flow_root_namespace 
**rdma_transport_tx_root_ns; int rdma_transport_rx_vports; @@ -206,6 +187,7 @@ struct mlx5_flow_table { }; u32 id; u16 vport; + u16 esw_owner_vhca_id; unsigned int max_fte; unsigned int level; enum fs_flow_table_type type; @@ -378,15 +360,22 @@ void mlx5_fs_core_free(struct mlx5_core_dev *dev); int mlx5_fs_core_init(struct mlx5_core_dev *dev); void mlx5_fs_core_cleanup(struct mlx5_core_dev *dev); -int mlx5_fs_egress_acls_init(struct mlx5_core_dev *dev, int total_vports); -void mlx5_fs_egress_acls_cleanup(struct mlx5_core_dev *dev); -int mlx5_fs_ingress_acls_init(struct mlx5_core_dev *dev, int total_vports); -void mlx5_fs_ingress_acls_cleanup(struct mlx5_core_dev *dev); +int mlx5_fs_vport_egress_acl_ns_add(struct mlx5_flow_steering *steering, + u16 vport_idx); +int mlx5_fs_vport_ingress_acl_ns_add(struct mlx5_flow_steering *steering, + u16 vport_idx); +void mlx5_fs_vport_egress_acl_ns_remove(struct mlx5_flow_steering *steering, + int vport_idx); +void mlx5_fs_vport_ingress_acl_ns_remove(struct mlx5_flow_steering *steering, + int vport_idx); u32 mlx5_fs_get_capabilities(struct mlx5_core_dev *dev, enum mlx5_flow_namespace_type type); struct mlx5_flow_root_namespace *find_root(struct fs_node *node); +int mlx5_fs_get_packet_reformat_id(struct mlx5_pkt_reformat *pkt_reformat, + u32 *id); + #define fs_get_obj(v, _node) {v = container_of((_node), typeof(*v), node); } #define fs_list_for_each_entry(pos, root) \ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c index 57476487e3..eeb4437975 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c @@ -294,6 +294,12 @@ int mlx5_query_hca_caps(struct mlx5_core_dev *dev) return err; } + if (MLX5_CAP_GEN(dev, psp)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_PSP); + if (err) + return err; + } + return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c index 
69933addd9..4544f1968f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c @@ -6,13 +6,16 @@ #include "fw_reset.h" #include "diag/fw_tracer.h" #include "lib/tout.h" +#include "sf/sf.h" enum { MLX5_FW_RESET_FLAGS_RESET_REQUESTED, MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST, MLX5_FW_RESET_FLAGS_PENDING_COMP, MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS, - MLX5_FW_RESET_FLAGS_RELOAD_REQUIRED + MLX5_FW_RESET_FLAGS_RELOAD_REQUIRED, + MLX5_FW_RESET_FLAGS_UNLOAD_EVENT, + MLX5_FW_RESET_FLAGS_RESET_IN_PROGRESS, }; struct mlx5_fw_reset { @@ -25,6 +28,7 @@ struct mlx5_fw_reset { struct work_struct reset_reload_work; struct work_struct reset_now_work; struct work_struct reset_abort_work; + struct delayed_work reset_timeout_work; unsigned long reset_flags; u8 reset_method; struct timer_list timer; @@ -124,6 +128,16 @@ int mlx5_fw_reset_query(struct mlx5_core_dev *dev, u8 *reset_level, u8 *reset_ty return mlx5_reg_mfrl_query(dev, reset_level, reset_type, NULL, NULL); } +bool mlx5_fw_reset_in_progress(struct mlx5_core_dev *dev) +{ + struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; + + if (!fw_reset) + return false; + + return test_bit(MLX5_FW_RESET_FLAGS_RESET_IN_PROGRESS, &fw_reset->reset_flags); +} + static int mlx5_fw_reset_get_reset_method(struct mlx5_core_dev *dev, u8 *reset_method) { @@ -219,7 +233,7 @@ int mlx5_fw_reset_set_live_patch(struct mlx5_core_dev *dev) return mlx5_reg_mfrl_set(dev, MLX5_MFRL_REG_RESET_LEVEL0, 0, 0, false); } -static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev, bool unloaded) +static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev) { struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; struct devlink *devlink = priv_to_devlink(dev); @@ -228,8 +242,7 @@ static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev, bool unload if (test_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags)) { complete(&fw_reset->done); } else { - if (!unloaded) - 
mlx5_unload_one(dev, false); + mlx5_sync_reset_unload_flow(dev, false); if (mlx5_health_wait_pci_up(dev)) mlx5_core_err(dev, "reset reload flow aborted, PCI reads still not working\n"); else @@ -240,6 +253,8 @@ static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev, bool unload BIT(DEVLINK_RELOAD_ACTION_FW_ACTIVATE)); devl_unlock(devlink); } + + clear_bit(MLX5_FW_RESET_FLAGS_RESET_IN_PROGRESS, &fw_reset->reset_flags); } static void mlx5_stop_sync_reset_poll(struct mlx5_core_dev *dev) @@ -258,6 +273,8 @@ static int mlx5_sync_reset_clear_reset_requested(struct mlx5_core_dev *dev, bool return -EALREADY; } + if (current_work() != &fw_reset->reset_timeout_work.work) + cancel_delayed_work(&fw_reset->reset_timeout_work); mlx5_stop_sync_reset_poll(dev); if (poll_health) mlx5_start_health_poll(dev); @@ -272,7 +289,7 @@ static void mlx5_sync_reset_reload_work(struct work_struct *work) mlx5_sync_reset_clear_reset_requested(dev, false); mlx5_enter_error_state(dev, true); - mlx5_fw_reset_complete_reload(dev, false); + mlx5_fw_reset_complete_reload(dev); } #define MLX5_RESET_POLL_INTERVAL (HZ / 10) @@ -329,6 +346,11 @@ static int mlx5_sync_reset_set_reset_requested(struct mlx5_core_dev *dev) } mlx5_stop_health_poll(dev, true); mlx5_start_sync_reset_poll(dev); + + if (!test_bit(MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS, + &fw_reset->reset_flags)) + schedule_delayed_work(&fw_reset->reset_timeout_work, + msecs_to_jiffies(mlx5_tout_ms(dev, PCI_SYNC_UPDATE))); return 0; } @@ -428,6 +450,11 @@ static bool mlx5_is_reset_now_capable(struct mlx5_core_dev *dev, return false; } + if (!mlx5_core_is_ecpf(dev) && !mlx5_sf_table_empty(dev)) { + mlx5_core_warn(dev, "SFs should be removed before reset\n"); + return false; + } + #if IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE) if (reset_method != MLX5_MFRL_REG_PCI_RESET_METHOD_HOT_RESET) { err = mlx5_check_hotplug_interrupt(dev, bridge); @@ -447,27 +474,48 @@ static void mlx5_sync_reset_request_event(struct work_struct *work) struct mlx5_fw_reset 
*fw_reset = container_of(work, struct mlx5_fw_reset, reset_request_work); struct mlx5_core_dev *dev = fw_reset->dev; + bool nack_request = false; + struct devlink *devlink; int err; err = mlx5_fw_reset_get_reset_method(dev, &fw_reset->reset_method); - if (err) + if (err) { + nack_request = true; mlx5_core_warn(dev, "Failed reading MFRL, err %d\n", err); + } else if (!mlx5_is_reset_now_capable(dev, fw_reset->reset_method) || + test_bit(MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST, + &fw_reset->reset_flags)) { + nack_request = true; + } - if (err || test_bit(MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST, &fw_reset->reset_flags) || - !mlx5_is_reset_now_capable(dev, fw_reset->reset_method)) { + devlink = priv_to_devlink(dev); + /* For external resets, try to acquire devl_lock. Skip if devlink reset is + * pending (lock already held) + */ + if (nack_request || + (!test_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, + &fw_reset->reset_flags) && + !devl_trylock(devlink))) { err = mlx5_fw_reset_set_reset_sync_nack(dev); mlx5_core_warn(dev, "PCI Sync FW Update Reset Nack %s", err ? "Failed" : "Sent"); return; } + if (mlx5_sync_reset_set_reset_requested(dev)) - return; + goto unlock; + + set_bit(MLX5_FW_RESET_FLAGS_RESET_IN_PROGRESS, &fw_reset->reset_flags); err = mlx5_fw_reset_set_reset_sync_ack(dev); if (err) mlx5_core_warn(dev, "PCI Sync FW Update Reset Ack Failed. Error code: %d\n", err); else mlx5_core_warn(dev, "PCI Sync FW Update Reset Ack. 
Device reset is expected.\n"); + +unlock: + if (!test_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags)) + devl_unlock(devlink); } static int mlx5_pci_link_toggle(struct mlx5_core_dev *dev, u16 dev_id) @@ -586,65 +634,23 @@ static int mlx5_sync_pci_reset(struct mlx5_core_dev *dev, u8 reset_method) return err; } -static void mlx5_sync_reset_now_event(struct work_struct *work) +void mlx5_sync_reset_unload_flow(struct mlx5_core_dev *dev, bool locked) { - struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset, - reset_now_work); - struct mlx5_core_dev *dev = fw_reset->dev; - int err; - - if (mlx5_sync_reset_clear_reset_requested(dev, false)) - return; - - mlx5_core_warn(dev, "Sync Reset now. Device is going to reset.\n"); - - err = mlx5_cmd_fast_teardown_hca(dev); - if (err) { - mlx5_core_warn(dev, "Fast teardown failed, no reset done, err %d\n", err); - goto done; - } - - err = mlx5_sync_pci_reset(dev, fw_reset->reset_method); - if (err) { - mlx5_core_warn(dev, "mlx5_sync_pci_reset failed, no reset done, err %d\n", err); - set_bit(MLX5_FW_RESET_FLAGS_RELOAD_REQUIRED, &fw_reset->reset_flags); - } - - mlx5_enter_error_state(dev, true); -done: - fw_reset->ret = err; - mlx5_fw_reset_complete_reload(dev, false); -} - -static void mlx5_sync_reset_unload_event(struct work_struct *work) -{ - struct mlx5_fw_reset *fw_reset; - struct mlx5_core_dev *dev; + struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; unsigned long timeout; int poll_freq = 20; bool reset_action; u8 rst_state; int err; - fw_reset = container_of(work, struct mlx5_fw_reset, reset_unload_work); - dev = fw_reset->dev; - - if (mlx5_sync_reset_clear_reset_requested(dev, false)) - return; - - mlx5_core_warn(dev, "Sync Reset Unload. 
Function is forced down.\n"); - - err = mlx5_cmd_fast_teardown_hca(dev); - if (err) - mlx5_core_warn(dev, "Fast teardown failed, unloading, err %d\n", err); - else - mlx5_enter_error_state(dev, true); - - if (test_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags)) + if (locked) mlx5_unload_one_devl_locked(dev, false); else mlx5_unload_one(dev, false); + if (!test_bit(MLX5_FW_RESET_FLAGS_UNLOAD_EVENT, &fw_reset->reset_flags)) + return; + mlx5_set_fw_rst_ack(dev); mlx5_core_warn(dev, "Sync Reset Unload done, device reset expected\n"); @@ -672,17 +678,73 @@ static void mlx5_sync_reset_unload_event(struct work_struct *work) goto done; } - mlx5_core_warn(dev, "Sync Reset, got reset action. rst_state = %u\n", rst_state); + mlx5_core_warn(dev, "Sync Reset, got reset action. rst_state = %u\n", + rst_state); if (rst_state == MLX5_FW_RST_STATE_TOGGLE_REQ) { err = mlx5_sync_pci_reset(dev, fw_reset->reset_method); if (err) { - mlx5_core_warn(dev, "mlx5_sync_pci_reset failed, err %d\n", err); + mlx5_core_warn(dev, "mlx5_sync_pci_reset failed, err %d\n", + err); fw_reset->ret = err; } } done: - mlx5_fw_reset_complete_reload(dev, true); + clear_bit(MLX5_FW_RESET_FLAGS_UNLOAD_EVENT, &fw_reset->reset_flags); +} + +static void mlx5_sync_reset_now_event(struct work_struct *work) +{ + struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset, + reset_now_work); + struct mlx5_core_dev *dev = fw_reset->dev; + int err; + + if (mlx5_sync_reset_clear_reset_requested(dev, false)) + return; + + mlx5_core_warn(dev, "Sync Reset now. 
Device is going to reset.\n"); + + err = mlx5_cmd_fast_teardown_hca(dev); + if (err) { + mlx5_core_warn(dev, "Fast teardown failed, no reset done, err %d\n", err); + goto done; + } + + err = mlx5_sync_pci_reset(dev, fw_reset->reset_method); + if (err) { + mlx5_core_warn(dev, "mlx5_sync_pci_reset failed, no reset done, err %d\n", err); + set_bit(MLX5_FW_RESET_FLAGS_RELOAD_REQUIRED, &fw_reset->reset_flags); + } + + mlx5_enter_error_state(dev, true); +done: + fw_reset->ret = err; + mlx5_fw_reset_complete_reload(dev); +} + +static void mlx5_sync_reset_unload_event(struct work_struct *work) +{ + struct mlx5_fw_reset *fw_reset; + struct mlx5_core_dev *dev; + int err; + + fw_reset = container_of(work, struct mlx5_fw_reset, reset_unload_work); + dev = fw_reset->dev; + + if (mlx5_sync_reset_clear_reset_requested(dev, false)) + return; + + set_bit(MLX5_FW_RESET_FLAGS_UNLOAD_EVENT, &fw_reset->reset_flags); + mlx5_core_warn(dev, "Sync Reset Unload. Function is forced down.\n"); + + err = mlx5_cmd_fast_teardown_hca(dev); + if (err) + mlx5_core_warn(dev, "Fast teardown failed, unloading, err %d\n", err); + else + mlx5_enter_error_state(dev, true); + + mlx5_fw_reset_complete_reload(dev); } static void mlx5_sync_reset_abort_event(struct work_struct *work) @@ -693,6 +755,8 @@ static void mlx5_sync_reset_abort_event(struct work_struct *work) if (mlx5_sync_reset_clear_reset_requested(dev, true)) return; + + clear_bit(MLX5_FW_RESET_FLAGS_RESET_IN_PROGRESS, &fw_reset->reset_flags); mlx5_core_warn(dev, "PCI Sync FW Update Reset Aborted.\n"); } @@ -719,6 +783,20 @@ static void mlx5_sync_reset_events_handle(struct mlx5_fw_reset *fw_reset, struct } } +static void mlx5_sync_reset_timeout_work(struct work_struct *work) +{ + struct delayed_work *dwork = container_of(work, struct delayed_work, + work); + struct mlx5_fw_reset *fw_reset = + container_of(dwork, struct mlx5_fw_reset, reset_timeout_work); + struct mlx5_core_dev *dev = fw_reset->dev; + + if 
(mlx5_sync_reset_clear_reset_requested(dev, true)) + return; + clear_bit(MLX5_FW_RESET_FLAGS_RESET_IN_PROGRESS, &fw_reset->reset_flags); + mlx5_core_warn(dev, "PCI Sync FW Update Reset Timeout.\n"); +} + static int fw_reset_event_notifier(struct notifier_block *nb, unsigned long action, void *data) { struct mlx5_fw_reset *fw_reset = mlx5_nb_cof(nb, struct mlx5_fw_reset, nb); @@ -802,6 +880,8 @@ void mlx5_drain_fw_reset(struct mlx5_core_dev *dev) cancel_work_sync(&fw_reset->reset_reload_work); cancel_work_sync(&fw_reset->reset_now_work); cancel_work_sync(&fw_reset->reset_abort_work); + if (test_bit(MLX5_FW_RESET_FLAGS_RESET_REQUESTED, &fw_reset->reset_flags)) + mlx5_sync_reset_clear_reset_requested(dev, true); } static const struct devlink_param mlx5_fw_reset_devlink_params[] = { @@ -845,6 +925,8 @@ int mlx5_fw_reset_init(struct mlx5_core_dev *dev) INIT_WORK(&fw_reset->reset_reload_work, mlx5_sync_reset_reload_work); INIT_WORK(&fw_reset->reset_now_work, mlx5_sync_reset_now_event); INIT_WORK(&fw_reset->reset_abort_work, mlx5_sync_reset_abort_event); + INIT_DELAYED_WORK(&fw_reset->reset_timeout_work, + mlx5_sync_reset_timeout_work); init_completion(&fw_reset->done); return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.h b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.h index ea527d06a8..2d96b2adc1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.h @@ -10,8 +10,10 @@ int mlx5_fw_reset_query(struct mlx5_core_dev *dev, u8 *reset_level, u8 *reset_ty int mlx5_fw_reset_set_reset_sync(struct mlx5_core_dev *dev, u8 reset_type_sel, struct netlink_ext_ack *extack); int mlx5_fw_reset_set_live_patch(struct mlx5_core_dev *dev); +bool mlx5_fw_reset_in_progress(struct mlx5_core_dev *dev); int mlx5_fw_reset_wait_reset_done(struct mlx5_core_dev *dev); +void mlx5_sync_reset_unload_flow(struct mlx5_core_dev *dev, bool locked); int mlx5_fw_reset_verify_fw_complete(struct mlx5_core_dev *dev, struct 
netlink_ext_ack *extack); void mlx5_fw_reset_events_start(struct mlx5_core_dev *dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index b63c5a221e..aeeb136f5e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -718,15 +718,15 @@ void mlx5_fw_reporters_create(struct mlx5_core_dev *dev) health->fw_reporter = devl_health_reporter_create(devlink, fw_ops, dev); if (IS_ERR(health->fw_reporter)) - mlx5_core_warn(dev, "Failed to create fw reporter, err = %ld\n", - PTR_ERR(health->fw_reporter)); + mlx5_core_warn(dev, "Failed to create fw reporter, err = %pe\n", + health->fw_reporter); health->fw_fatal_reporter = devl_health_reporter_create(devlink, fw_fatal_ops, dev); if (IS_ERR(health->fw_fatal_reporter)) - mlx5_core_warn(dev, "Failed to create fw fatal reporter, err = %ld\n", - PTR_ERR(health->fw_fatal_reporter)); + mlx5_core_warn(dev, "Failed to create fw fatal reporter, err = %pe\n", + health->fw_fatal_reporter); } static void mlx5_fw_reporters_destroy(struct mlx5_core_dev *dev) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c index 4b3430ac39..3b2f54ca30 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c @@ -266,21 +266,18 @@ static int mlx5i_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd) return mlx5e_ethtool_set_rxnfc(priv, cmd); } +static u32 mlx5i_get_rx_ring_count(struct net_device *dev) +{ + struct mlx5e_priv *priv = mlx5i_epriv(dev); + + return priv->channels.params.num_channels; +} + static int mlx5i_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *info, u32 *rule_locs) { struct mlx5e_priv *priv = mlx5i_epriv(dev); - /* ETHTOOL_GRXRINGS is needed by ethtool -x which is not part - * of rxnfc. 
We keep this logic out of mlx5e_ethtool_get_rxnfc, - * to avoid breaking "ethtool -x" when mlx5e_ethtool_get_rxnfc - * is compiled out via CONFIG_MLX5_EN_RXNFC=n. - */ - if (info->cmd == ETHTOOL_GRXRINGS) { - info->data = priv->channels.params.num_channels; - return 0; - } - return mlx5e_ethtool_get_rxnfc(priv, info, rule_locs); } @@ -304,6 +301,7 @@ const struct ethtool_ops mlx5i_ethtool_ops = { .set_rxfh_fields = mlx5i_set_rxfh_fields, .get_rxnfc = mlx5i_get_rxnfc, .set_rxnfc = mlx5i_set_rxnfc, + .get_rx_ring_count = mlx5i_get_rx_ring_count, .get_link_ksettings = mlx5i_get_link_ksettings, .get_link = ethtool_op_get_link, }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c index 0979d672d4..22037785d1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c @@ -44,6 +44,23 @@ static int mlx5i_open(struct net_device *netdev); static int mlx5i_close(struct net_device *netdev); static int mlx5i_change_mtu(struct net_device *netdev, int new_mtu); +int mlx5i_hwtstamp_set(struct net_device *dev, + struct kernel_hwtstamp_config *config, + struct netlink_ext_ack *extack) +{ + struct mlx5e_priv *epriv = mlx5i_epriv(dev); + + return mlx5e_hwtstamp_set(epriv, config, extack); +} + +int mlx5i_hwtstamp_get(struct net_device *dev, + struct kernel_hwtstamp_config *config) +{ + struct mlx5e_priv *epriv = mlx5i_epriv(dev); + + return mlx5e_hwtstamp_get(epriv, config); +} + static const struct net_device_ops mlx5i_netdev_ops = { .ndo_open = mlx5i_open, .ndo_stop = mlx5i_close, @@ -51,7 +68,8 @@ static const struct net_device_ops mlx5i_netdev_ops = { .ndo_init = mlx5i_dev_init, .ndo_uninit = mlx5i_dev_cleanup, .ndo_change_mtu = mlx5i_change_mtu, - .ndo_eth_ioctl = mlx5i_ioctl, + .ndo_hwtstamp_get = mlx5i_hwtstamp_get, + .ndo_hwtstamp_set = mlx5i_hwtstamp_set, }; /* IPoIB mlx5 netdev profile */ @@ -313,7 +331,7 @@ void 
mlx5i_destroy_underlay_qp(struct mlx5_core_dev *mdev, u32 qpn) int mlx5i_update_nic_rx(struct mlx5e_priv *priv) { - return mlx5e_refresh_tirs(priv, true, true); + return mlx5e_refresh_tirs(priv->mdev, true, true); } int mlx5i_create_tis(struct mlx5_core_dev *mdev, u32 underlay_qpn, u32 *tisn) @@ -406,6 +424,7 @@ static void mlx5i_destroy_flow_steering(struct mlx5e_priv *priv) static int mlx5i_init_rx(struct mlx5e_priv *priv) { struct mlx5_core_dev *mdev = priv->mdev; + enum mlx5e_rx_res_features features; int err; priv->fs = mlx5e_fs_init(priv->profile, mdev, @@ -424,7 +443,9 @@ static int mlx5i_init_rx(struct mlx5e_priv *priv) goto err_destroy_q_counters; } - priv->rx_res = mlx5e_rx_res_create(priv->mdev, 0, priv->max_nch, priv->drop_rq.rqn, + features = MLX5E_RX_RES_FEATURE_SELF_LB_BLOCK; + priv->rx_res = mlx5e_rx_res_create(priv->mdev, features, priv->max_nch, + priv->drop_rq.rqn, &priv->channels.params.packet_merge, priv->channels.params.num_channels); if (IS_ERR(priv->rx_res)) { @@ -554,20 +575,6 @@ int mlx5i_dev_init(struct net_device *dev) return 0; } -int mlx5i_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) -{ - struct mlx5e_priv *priv = mlx5i_epriv(dev); - - switch (cmd) { - case SIOCSHWTSTAMP: - return mlx5e_hwstamp_set(priv, ifr); - case SIOCGHWTSTAMP: - return mlx5e_hwstamp_get(priv, ifr); - default: - return -EOPNOTSUPP; - } -} - void mlx5i_dev_cleanup(struct net_device *dev) { struct mlx5e_priv *priv = mlx5i_epriv(dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h index 2ab6437a1c..d67d5a72bb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h @@ -88,7 +88,11 @@ struct net_device *mlx5i_pkey_get_netdev(struct net_device *netdev, u32 qpn); /* Shared ndo functions */ int mlx5i_dev_init(struct net_device *dev); void mlx5i_dev_cleanup(struct net_device *dev); -int mlx5i_ioctl(struct net_device 
*dev, struct ifreq *ifr, int cmd); +int mlx5i_hwtstamp_set(struct net_device *dev, + struct kernel_hwtstamp_config *config, + struct netlink_ext_ack *extack); +int mlx5i_hwtstamp_get(struct net_device *dev, + struct kernel_hwtstamp_config *config); /* Parent profile functions */ int mlx5i_init(struct mlx5_core_dev *mdev, struct net_device *netdev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c index 028a76944d..04444dad3a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c @@ -140,7 +140,6 @@ static int mlx5i_pkey_close(struct net_device *netdev); static int mlx5i_pkey_dev_init(struct net_device *dev); static void mlx5i_pkey_dev_cleanup(struct net_device *netdev); static int mlx5i_pkey_change_mtu(struct net_device *netdev, int new_mtu); -static int mlx5i_pkey_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd); static const struct net_device_ops mlx5i_pkey_netdev_ops = { .ndo_open = mlx5i_pkey_open, @@ -149,7 +148,8 @@ static const struct net_device_ops mlx5i_pkey_netdev_ops = { .ndo_get_stats64 = mlx5i_get_stats, .ndo_uninit = mlx5i_pkey_dev_cleanup, .ndo_change_mtu = mlx5i_pkey_change_mtu, - .ndo_eth_ioctl = mlx5i_pkey_ioctl, + .ndo_hwtstamp_get = mlx5i_hwtstamp_get, + .ndo_hwtstamp_set = mlx5i_hwtstamp_set, }; /* Child NDOs */ @@ -184,11 +184,6 @@ static int mlx5i_pkey_dev_init(struct net_device *dev) return mlx5i_dev_init(dev); } -static int mlx5i_pkey_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) -{ - return mlx5i_ioctl(dev, ifr, cmd); -} - static void mlx5i_pkey_dev_cleanup(struct net_device *netdev) { mlx5i_parent_put(netdev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c b/drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c index 2691d88cde..14d339eceb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c +++ 
b/drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c @@ -47,29 +47,40 @@ static int cpu_get_least_loaded(struct mlx5_irq_pool *pool, static struct mlx5_irq * irq_pool_request_irq(struct mlx5_irq_pool *pool, struct irq_affinity_desc *af_desc) { - struct irq_affinity_desc auto_desc = {}; + struct irq_affinity_desc *auto_desc; struct mlx5_irq *irq; u32 irq_index; int err; + auto_desc = kvzalloc(sizeof(*auto_desc), GFP_KERNEL); + if (!auto_desc) + return ERR_PTR(-ENOMEM); + err = xa_alloc(&pool->irqs, &irq_index, NULL, pool->xa_num_irqs, GFP_KERNEL); - if (err) + if (err) { + kvfree(auto_desc); return ERR_PTR(err); + } + if (pool->irqs_per_cpu) { if (cpumask_weight(&af_desc->mask) > 1) /* if req_mask contain more then one CPU, set the least loadad CPU * of req_mask */ cpumask_set_cpu(cpu_get_least_loaded(pool, &af_desc->mask), - &auto_desc.mask); + &auto_desc->mask); else cpu_get(pool, cpumask_first(&af_desc->mask)); } + irq = mlx5_irq_alloc(pool, irq_index, - cpumask_empty(&auto_desc.mask) ? af_desc : &auto_desc, + cpumask_empty(&auto_desc->mask) ? af_desc : auto_desc, NULL); if (IS_ERR(irq)) xa_erase(&pool->irqs, irq_index); + + kvfree(auto_desc); + return irq; } @@ -139,8 +150,8 @@ mlx5_irq_affinity_request(struct mlx5_core_dev *dev, struct mlx5_irq_pool *pool, if (IS_ERR(new_irq)) { if (!least_loaded_irq) { /* We failed to create an IRQ and we didn't find an IRQ */ - mlx5_core_err(pool->dev, "Didn't find a matching IRQ. err = %ld\n", - PTR_ERR(new_irq)); + mlx5_core_err(pool->dev, "Didn't find a matching IRQ. 
err = %pe\n", + new_irq); mutex_unlock(&pool->lock); return new_irq; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c index d058cbb4a0..a459a30f36 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ -35,6 +35,7 @@ #include #include #include +#include "lib/mlx5.h" #include "lib/devcom.h" #include "mlx5_core.h" #include "eswitch.h" @@ -231,9 +232,13 @@ static void mlx5_do_bond_work(struct work_struct *work); static void mlx5_ldev_free(struct kref *ref) { struct mlx5_lag *ldev = container_of(ref, struct mlx5_lag, ref); + struct net *net; + + if (ldev->nb.notifier_call) { + net = read_pnet(&ldev->net); + unregister_netdevice_notifier_net(net, &ldev->nb); + } - if (ldev->nb.notifier_call) - unregister_netdevice_notifier_net(&init_net, &ldev->nb); mlx5_lag_mp_cleanup(ldev); cancel_delayed_work_sync(&ldev->bond_work); destroy_workqueue(ldev->wq); @@ -271,7 +276,8 @@ static struct mlx5_lag *mlx5_lag_dev_alloc(struct mlx5_core_dev *dev) INIT_DELAYED_WORK(&ldev->bond_work, mlx5_do_bond_work); ldev->nb.notifier_call = mlx5_lag_netdev_event; - if (register_netdevice_notifier_net(&init_net, &ldev->nb)) { + write_pnet(&ldev->net, mlx5_core_net(dev)); + if (register_netdevice_notifier_net(read_pnet(&ldev->net), &ldev->nb)) { ldev->nb.notifier_call = NULL; mlx5_core_err(dev, "Failed to register LAG netdev notifier\n"); } @@ -1404,6 +1410,38 @@ static int __mlx5_lag_dev_add_mdev(struct mlx5_core_dev *dev) return 0; } +static void mlx5_lag_unregister_hca_devcom_comp(struct mlx5_core_dev *dev) +{ + mlx5_devcom_unregister_component(dev->priv.hca_devcom_comp); + dev->priv.hca_devcom_comp = NULL; +} + +static int mlx5_lag_register_hca_devcom_comp(struct mlx5_core_dev *dev) +{ + struct mlx5_devcom_match_attr attr = { + .flags = MLX5_DEVCOM_MATCH_FLAGS_NS, + .net = mlx5_core_net(dev), + }; + u8 len __always_unused; + + 
mlx5_query_nic_sw_system_image_guid(dev, attr.key.buf, &len); + + /* This component is use to sync adding core_dev to lag_dev and to sync + * changes of mlx5_adev_devices between LAG layer and other layers. + */ + dev->priv.hca_devcom_comp = + mlx5_devcom_register_component(dev->priv.devc, + MLX5_DEVCOM_HCA_PORTS, + &attr, NULL, dev); + if (!dev->priv.hca_devcom_comp) { + mlx5_core_err(dev, + "Failed to register devcom HCA component."); + return -EINVAL; + } + + return 0; +} + void mlx5_lag_remove_mdev(struct mlx5_core_dev *dev) { struct mlx5_lag *ldev; @@ -1425,6 +1463,7 @@ recheck: } mlx5_ldev_remove_mdev(ldev, dev); mutex_unlock(&ldev->lock); + mlx5_lag_unregister_hca_devcom_comp(dev); mlx5_ldev_put(ldev); } @@ -1435,7 +1474,7 @@ void mlx5_lag_add_mdev(struct mlx5_core_dev *dev) if (!mlx5_lag_is_supported(dev)) return; - if (IS_ERR_OR_NULL(dev->priv.hca_devcom_comp)) + if (mlx5_lag_register_hca_devcom_comp(dev)) return; recheck: diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h index c2f256bb2b..4918eee2b3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h @@ -67,6 +67,7 @@ struct mlx5_lag { struct workqueue_struct *wq; struct delayed_work bond_work; struct notifier_block nb; + possible_net_t net; struct lag_mp lag_mp; struct mlx5_lag_port_sel port_sel; /* Protect lag fields/state changes */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c index aee17fcf3b..cdc99fe5c9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c @@ -173,10 +173,15 @@ static void mlx5_lag_fib_route_event(struct mlx5_lag *ldev, unsigned long event, } /* Handle multipath entry with lower priority value */ - if (mp->fib.mfi && mp->fib.mfi != fi && + if (mp->fib.mfi && (mp->fib.dst != fen_info->dst || mp->fib.dst_len != 
fen_info->dst_len) && - fi->fib_priority >= mp->fib.priority) + mp->fib.dst_len <= fen_info->dst_len && + !(mp->fib.dst_len == fen_info->dst_len && + fi->fib_priority < mp->fib.priority)) { + mlx5_core_dbg(ldev->pf[idx].dev, + "Multipath entry with lower priority was rejected\n"); return; + } nh_dev0 = mlx5_lag_get_next_fib_dev(ldev, fi, NULL); nh_dev1 = mlx5_lag_get_next_fib_dev(ldev, fi, nh_dev0); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c index aad52d3a90..2d86af8f0d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c @@ -67,12 +67,19 @@ err_metadata: static int enable_mpesw(struct mlx5_lag *ldev) { - int idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1); struct mlx5_core_dev *dev0; int err; + int idx; int i; - if (idx < 0 || ldev->mode != MLX5_LAG_MODE_NONE) + if (ldev->mode == MLX5_LAG_MODE_MPESW) + return 0; + + if (ldev->mode != MLX5_LAG_MODE_NONE) + return -EINVAL; + + idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1); + if (idx < 0) return -EINVAL; dev0 = ldev->pf[idx].dev; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c index 58bd749b5e..129725159a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c @@ -100,7 +100,7 @@ static int create_aso_cq(struct mlx5_aso_cq *cq, void *cqc_data) MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE); MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn); - MLX5_SET(cqc, cqc, uar_page, mdev->priv.uar->index); + MLX5_SET(cqc, cqc, uar_page, mdev->priv.bfreg.up->index); MLX5_SET(cqc, cqc, log_page_size, cq->wq_ctrl.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); MLX5_SET64(cqc, cqc, dbr_addr, cq->wq_ctrl.db.dma); @@ -129,7 +129,7 @@ static int mlx5_aso_create_cq(struct mlx5_core_dev *mdev, int numa_node, return -ENOMEM; 
MLX5_SET(cqc, cqc_data, log_cq_size, 1); - MLX5_SET(cqc, cqc_data, uar_page, mdev->priv.uar->index); + MLX5_SET(cqc, cqc_data, uar_page, mdev->priv.bfreg.up->index); if (MLX5_CAP_GEN(mdev, cqe_128_always) && cache_line_size() >= 128) MLX5_SET(cqc, cqc_data, cqe_sz, CQE_STRIDE_128_PAD); @@ -163,7 +163,7 @@ static int mlx5_aso_alloc_sq(struct mlx5_core_dev *mdev, int numa_node, struct mlx5_wq_param param; int err; - sq->uar_map = mdev->mlx5e_res.hw_objs.bfreg.map; + sq->uar_map = mdev->priv.bfreg.map; param.db_numa_node = numa_node; param.buf_numa_node = numa_node; @@ -203,7 +203,7 @@ static int create_aso_sq(struct mlx5_core_dev *mdev, int pdn, MLX5_SET(sqc, sqc, ts_format, ts_format); MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); - MLX5_SET(wq, wq, uar_page, mdev->mlx5e_res.hw_objs.bfreg.index); + MLX5_SET(wq, wq, uar_page, mdev->priv.bfreg.index); MLX5_SET(wq, wq, log_wq_pg_sz, sq->wq_ctrl.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); MLX5_SET64(wq, wq, dbr_addr, sq->wq_ctrl.db.dma); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c index 214d732d18..0ba0ef8bae 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c @@ -247,27 +247,24 @@ static bool mlx5_is_ptm_source_time_available(struct mlx5_core_dev *dev) return !!MLX5_GET(mtptm_reg, out, psta); } -static int mlx5_mtctr_syncdevicetime(ktime_t *device_time, - struct system_counterval_t *sys_counterval, - void *ctx) +static int mlx5_mtctr_read(struct mlx5_core_dev *mdev, + bool real_time_mode, + struct system_counterval_t *sys_counterval, + u64 *device) { u32 out[MLX5_ST_SZ_DW(mtctr_reg)] = {0}; u32 in[MLX5_ST_SZ_DW(mtctr_reg)] = {0}; - struct mlx5_core_dev *mdev = ctx; - bool real_time_mode; - u64 host, device; + u64 host; int err; - real_time_mode = mlx5_real_time_mode(mdev); - MLX5_SET(mtctr_reg, in, first_clock_timestamp_request, MLX5_MTCTR_REQUEST_PTM_ROOT_CLOCK); 
MLX5_SET(mtctr_reg, in, second_clock_timestamp_request, real_time_mode ? MLX5_MTCTR_REQUEST_REAL_TIME_CLOCK : - MLX5_MTCTR_REQUEST_FREE_RUNNING_COUNTER); + MLX5_MTCTR_REQUEST_FREE_RUNNING_COUNTER); - err = mlx5_core_access_reg(mdev, in, sizeof(in), out, sizeof(out), MLX5_REG_MTCTR, - 0, 0); + err = mlx5_core_access_reg(mdev, in, sizeof(in), out, sizeof(out), + MLX5_REG_MTCTR, 0, 0); if (err) return err; @@ -281,8 +278,26 @@ static int mlx5_mtctr_syncdevicetime(ktime_t *device_time, .cs_id = CSID_X86_ART, .use_nsecs = true, }; + *device = MLX5_GET64(mtctr_reg, out, second_clock_timestamp); + + return 0; +} + +static int mlx5_mtctr_syncdevicetime(ktime_t *device_time, + struct system_counterval_t *sys_counterval, + void *ctx) +{ + struct mlx5_core_dev *mdev = ctx; + bool real_time_mode; + u64 device; + int err; + + real_time_mode = mlx5_real_time_mode(mdev); + + err = mlx5_mtctr_read(mdev, real_time_mode, sys_counterval, &device); + if (err) + return err; - device = MLX5_GET64(mtctr_reg, out, second_clock_timestamp); if (real_time_mode) *device_time = ns_to_ktime(REAL_TIME_TO_NS(device >> 32, device & U32_MAX)); else @@ -291,6 +306,23 @@ static int mlx5_mtctr_syncdevicetime(ktime_t *device_time, return 0; } +static int +mlx5_mtctr_syncdevicecyclestime(ktime_t *device_time, + struct system_counterval_t *sys_counterval, + void *ctx) +{ + struct mlx5_core_dev *mdev = ctx; + u64 device; + int err; + + err = mlx5_mtctr_read(mdev, false, sys_counterval, &device); + if (err) + return err; + *device_time = ns_to_ktime(device); + + return 0; +} + static int mlx5_ptp_getcrosststamp(struct ptp_clock_info *ptp, struct system_device_crosststamp *cts) { @@ -315,6 +347,32 @@ unlock: mlx5_clock_unlock(clock); return err; } + +static int mlx5_ptp_getcrosscycles(struct ptp_clock_info *ptp, + struct system_device_crosststamp *cts) +{ + struct mlx5_clock *clock = + container_of(ptp, struct mlx5_clock, ptp_info); + struct system_time_snapshot history_begin = {0}; + struct mlx5_core_dev 
*mdev; + int err; + + mlx5_clock_lock(clock); + mdev = mlx5_clock_mdev_get(clock); + + if (!mlx5_is_ptm_source_time_available(mdev)) { + err = -EBUSY; + goto unlock; + } + + ktime_get_snapshot(&history_begin); + + err = get_device_system_crosststamp(mlx5_mtctr_syncdevicecyclestime, + mdev, &history_begin, cts); +unlock: + mlx5_clock_unlock(clock); + return err; +} #endif /* CONFIG_X86 */ static u64 mlx5_read_time(struct mlx5_core_dev *dev, @@ -513,6 +571,24 @@ out: return 0; } +static int mlx5_ptp_getcyclesx(struct ptp_clock_info *ptp, + struct timespec64 *ts, + struct ptp_system_timestamp *sts) +{ + struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, + ptp_info); + struct mlx5_core_dev *mdev; + u64 cycles; + + mlx5_clock_lock(clock); + mdev = mlx5_clock_mdev_get(clock); + + cycles = mlx5_read_time(mdev, sts, false); + *ts = ns_to_timespec64(cycles); + mlx5_clock_unlock(clock); + return 0; +} + static int mlx5_ptp_adjtime_real_time(struct mlx5_core_dev *mdev, s64 delta) { u32 in[MLX5_ST_SZ_DW(mtutc_reg)] = {}; @@ -1229,6 +1305,7 @@ static void mlx5_init_timer_max_freq_adjustment(struct mlx5_core_dev *mdev) static void mlx5_init_timer_clock(struct mlx5_core_dev *mdev) { struct mlx5_clock *clock = mdev->clock; + bool expose_cycles; /* Configure the PHC */ clock->ptp_info = mlx5_ptp_clock_info; @@ -1236,12 +1313,22 @@ static void mlx5_init_timer_clock(struct mlx5_core_dev *mdev) if (MLX5_CAP_MCAM_REG(mdev, mtutc)) mlx5_init_timer_max_freq_adjustment(mdev); + expose_cycles = !MLX5_CAP_GEN(mdev, disciplined_fr_counter) || + !mlx5_real_time_mode(mdev); + #ifdef CONFIG_X86 if (MLX5_CAP_MCAM_REG3(mdev, mtptm) && - MLX5_CAP_MCAM_REG3(mdev, mtctr) && boot_cpu_has(X86_FEATURE_ART)) + MLX5_CAP_MCAM_REG3(mdev, mtctr) && boot_cpu_has(X86_FEATURE_ART)) { clock->ptp_info.getcrosststamp = mlx5_ptp_getcrosststamp; + if (expose_cycles) + clock->ptp_info.getcrosscycles = + mlx5_ptp_getcrosscycles; + } #endif /* CONFIG_X86 */ + if (expose_cycles) + 
clock->ptp_info.getcyclesx64 = mlx5_ptp_getcyclesx; + mlx5_timecounter_init(mdev); mlx5_init_clock_info(mdev); mlx5_init_overflow_period(mdev); @@ -1278,9 +1365,9 @@ static void mlx5_init_clock_dev(struct mlx5_core_dev *mdev) clock->ptp = ptp_clock_register(&clock->ptp_info, clock->shared ? NULL : &mdev->pdev->dev); if (IS_ERR(clock->ptp)) { - mlx5_core_warn(mdev, "%sptp_clock_register failed %ld\n", + mlx5_core_warn(mdev, "%sptp_clock_register failed %pe\n", clock->shared ? "shared clock " : "", - PTR_ERR(clock->ptp)); + clock->ptp); clock->ptp = NULL; } @@ -1345,17 +1432,25 @@ static int mlx5_clock_alloc(struct mlx5_core_dev *mdev, bool shared) return 0; } -static void mlx5_shared_clock_register(struct mlx5_core_dev *mdev, u64 key) +static void mlx5_shared_clock_register(struct mlx5_core_dev *mdev, + u8 identity[MLX5_RT_CLOCK_IDENTITY_SIZE]) { struct mlx5_core_dev *peer_dev, *next = NULL; + struct mlx5_devcom_match_attr attr = {}; + struct mlx5_devcom_comp_dev *compd; struct mlx5_devcom_comp_dev *pos; - mdev->clock_state->compdev = mlx5_devcom_register_component(mdev->priv.devc, - MLX5_DEVCOM_SHARED_CLOCK, - key, NULL, mdev); - if (IS_ERR(mdev->clock_state->compdev)) + BUILD_BUG_ON(MLX5_RT_CLOCK_IDENTITY_SIZE > MLX5_DEVCOM_MATCH_KEY_MAX); + memcpy(attr.key.buf, identity, MLX5_RT_CLOCK_IDENTITY_SIZE); + + compd = mlx5_devcom_register_component(mdev->priv.devc, + MLX5_DEVCOM_SHARED_CLOCK, + &attr, NULL, mdev); + if (!compd) return; + mdev->clock_state->compdev = compd; + mlx5_devcom_comp_lock(mdev->clock_state->compdev); mlx5_devcom_for_each_peer_entry(mdev->clock_state->compdev, peer_dev, pos) { if (peer_dev->clock) { @@ -1501,7 +1596,6 @@ int mlx5_init_clock(struct mlx5_core_dev *mdev) { u8 identity[MLX5_RT_CLOCK_IDENTITY_SIZE]; struct mlx5_clock_dev_state *clock_state; - u64 key; int err; if (!MLX5_CAP_GEN(mdev, device_frequency_khz)) { @@ -1517,12 +1611,10 @@ int mlx5_init_clock(struct mlx5_core_dev *mdev) mdev->clock_state = clock_state; if 
(MLX5_CAP_MCAM_REG3(mdev, mrtcq) && mlx5_real_time_mode(mdev)) { - if (mlx5_clock_identity_get(mdev, identity)) { + if (mlx5_clock_identity_get(mdev, identity)) mlx5_core_warn(mdev, "failed to get rt clock identity, create ptp dev per function\n"); - } else { - memcpy(&key, &identity, sizeof(key)); - mlx5_shared_clock_register(mdev, key); - } + else + mlx5_shared_clock_register(mdev, identity); } if (!mdev->clock) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h index c18a652c0f..aff3aed62c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h @@ -54,7 +54,6 @@ struct mlx5_timer { struct mlx5_clock { seqlock_t lock; - struct hwtstamp_config hwtstamp_config; struct ptp_clock *ptp; struct ptp_clock_info ptp_info; struct mlx5_pps pps_info; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c index 7b0766c89f..e749618229 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c @@ -4,6 +4,7 @@ #include #include #include "lib/devcom.h" +#include "lib/mlx5.h" #include "mlx5_core.h" static LIST_HEAD(devcom_dev_list); @@ -22,11 +23,17 @@ struct mlx5_devcom_dev { struct kref ref; }; +struct mlx5_devcom_key { + u32 flags; + union mlx5_devcom_match_key key; + possible_net_t net; +}; + struct mlx5_devcom_comp { struct list_head comp_list; enum mlx5_devcom_component id; - u64 key; struct list_head comp_dev_list_head; + struct mlx5_devcom_key key; mlx5_devcom_event_handler_t handler; struct kref ref; bool ready; @@ -69,20 +76,18 @@ mlx5_devcom_dev_alloc(struct mlx5_core_dev *dev) struct mlx5_devcom_dev * mlx5_devcom_register_device(struct mlx5_core_dev *dev) { - struct mlx5_devcom_dev *devc; + struct mlx5_devcom_dev *devc = NULL; mutex_lock(&dev_list_lock); if (devcom_dev_exists(dev)) { - devc = 
ERR_PTR(-EEXIST); + mlx5_core_err(dev, "devcom device already exists"); goto out; } devc = mlx5_devcom_dev_alloc(dev); - if (!devc) { - devc = ERR_PTR(-ENOMEM); + if (!devc) goto out; - } list_add_tail(&devc->list, &devcom_dev_list); out: @@ -103,21 +108,27 @@ mlx5_devcom_dev_release(struct kref *ref) void mlx5_devcom_unregister_device(struct mlx5_devcom_dev *devc) { - if (!IS_ERR_OR_NULL(devc)) - kref_put(&devc->ref, mlx5_devcom_dev_release); + if (!devc) + return; + + kref_put(&devc->ref, mlx5_devcom_dev_release); } static struct mlx5_devcom_comp * -mlx5_devcom_comp_alloc(u64 id, u64 key, mlx5_devcom_event_handler_t handler) +mlx5_devcom_comp_alloc(u64 id, const struct mlx5_devcom_match_attr *attr, + mlx5_devcom_event_handler_t handler) { struct mlx5_devcom_comp *comp; comp = kzalloc(sizeof(*comp), GFP_KERNEL); if (!comp) - return ERR_PTR(-ENOMEM); + return NULL; comp->id = id; - comp->key = key; + comp->key.key = attr->key; + comp->key.flags = attr->flags; + if (attr->flags & MLX5_DEVCOM_MATCH_FLAGS_NS) + write_pnet(&comp->key.net, attr->net); comp->handler = handler; init_rwsem(&comp->sem); lockdep_register_key(&comp->lock_key); @@ -149,7 +160,7 @@ devcom_alloc_comp_dev(struct mlx5_devcom_dev *devc, devcom = kzalloc(sizeof(*devcom), GFP_KERNEL); if (!devcom) - return ERR_PTR(-ENOMEM); + return NULL; kref_get(&devc->ref); devcom->devc = devc; @@ -180,21 +191,34 @@ devcom_free_comp_dev(struct mlx5_devcom_comp_dev *devcom) static bool devcom_component_equal(struct mlx5_devcom_comp *devcom, enum mlx5_devcom_component id, - u64 key) + const struct mlx5_devcom_match_attr *attr) { - return devcom->id == id && devcom->key == key; + if (devcom->id != id) + return false; + + if (devcom->key.flags != attr->flags) + return false; + + if (memcmp(&devcom->key.key, &attr->key, sizeof(devcom->key.key))) + return false; + + if (devcom->key.flags & MLX5_DEVCOM_MATCH_FLAGS_NS && + !net_eq(read_pnet(&devcom->key.net), attr->net)) + return false; + + return true; } static struct 
mlx5_devcom_comp * devcom_component_get(struct mlx5_devcom_dev *devc, enum mlx5_devcom_component id, - u64 key, + const struct mlx5_devcom_match_attr *attr, mlx5_devcom_event_handler_t handler) { struct mlx5_devcom_comp *comp; devcom_for_each_component(comp) { - if (devcom_component_equal(comp, id, key)) { + if (devcom_component_equal(comp, id, attr)) { if (handler == comp->handler) { kref_get(&comp->ref); return comp; @@ -212,35 +236,32 @@ devcom_component_get(struct mlx5_devcom_dev *devc, struct mlx5_devcom_comp_dev * mlx5_devcom_register_component(struct mlx5_devcom_dev *devc, enum mlx5_devcom_component id, - u64 key, + const struct mlx5_devcom_match_attr *attr, mlx5_devcom_event_handler_t handler, void *data) { - struct mlx5_devcom_comp_dev *devcom; + struct mlx5_devcom_comp_dev *devcom = NULL; struct mlx5_devcom_comp *comp; - if (IS_ERR_OR_NULL(devc)) - return ERR_PTR(-EINVAL); + if (!devc) + return NULL; mutex_lock(&comp_list_lock); - comp = devcom_component_get(devc, id, key, handler); - if (IS_ERR(comp)) { - devcom = ERR_PTR(-EINVAL); + comp = devcom_component_get(devc, id, attr, handler); + if (IS_ERR(comp)) goto out_unlock; - } if (!comp) { - comp = mlx5_devcom_comp_alloc(id, key, handler); - if (IS_ERR(comp)) { - devcom = ERR_CAST(comp); + comp = mlx5_devcom_comp_alloc(id, attr, handler); + if (!comp) goto out_unlock; - } + list_add_tail(&comp->comp_list, &devcom_comp_list); } mutex_unlock(&comp_list_lock); devcom = devcom_alloc_comp_dev(devc, comp, data); - if (IS_ERR(devcom)) + if (!devcom) kref_put(&comp->ref, mlx5_devcom_comp_release); return devcom; @@ -252,8 +273,10 @@ out_unlock: void mlx5_devcom_unregister_component(struct mlx5_devcom_comp_dev *devcom) { - if (!IS_ERR_OR_NULL(devcom)) - devcom_free_comp_dev(devcom); + if (!devcom) + return; + + devcom_free_comp_dev(devcom); } int mlx5_devcom_comp_get_size(struct mlx5_devcom_comp_dev *devcom) @@ -272,7 +295,7 @@ int mlx5_devcom_send_event(struct mlx5_devcom_comp_dev *devcom, int err = 0; void 
*data; - if (IS_ERR_OR_NULL(devcom)) + if (!devcom) return -ENODEV; comp = devcom->comp; @@ -314,7 +337,7 @@ void mlx5_devcom_comp_set_ready(struct mlx5_devcom_comp_dev *devcom, bool ready) bool mlx5_devcom_comp_is_ready(struct mlx5_devcom_comp_dev *devcom) { - if (IS_ERR_OR_NULL(devcom)) + if (!devcom) return false; return READ_ONCE(devcom->comp->ready); @@ -324,7 +347,7 @@ bool mlx5_devcom_for_each_peer_begin(struct mlx5_devcom_comp_dev *devcom) { struct mlx5_devcom_comp *comp; - if (IS_ERR_OR_NULL(devcom)) + if (!devcom) return false; comp = devcom->comp; @@ -397,21 +420,21 @@ void *mlx5_devcom_get_next_peer_data_rcu(struct mlx5_devcom_comp_dev *devcom, void mlx5_devcom_comp_lock(struct mlx5_devcom_comp_dev *devcom) { - if (IS_ERR_OR_NULL(devcom)) + if (!devcom) return; down_write(&devcom->comp->sem); } void mlx5_devcom_comp_unlock(struct mlx5_devcom_comp_dev *devcom) { - if (IS_ERR_OR_NULL(devcom)) + if (!devcom) return; up_write(&devcom->comp->sem); } int mlx5_devcom_comp_trylock(struct mlx5_devcom_comp_dev *devcom) { - if (IS_ERR_OR_NULL(devcom)) + if (!devcom) return 0; return down_write_trylock(&devcom->comp->sem); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h index c79699b94a..91e5ae529d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h @@ -6,6 +6,22 @@ #include +enum mlx5_devom_match_flags { + MLX5_DEVCOM_MATCH_FLAGS_NS = BIT(0), +}; + +#define MLX5_DEVCOM_MATCH_KEY_MAX 32 +union mlx5_devcom_match_key { + u64 val; + u8 buf[MLX5_DEVCOM_MATCH_KEY_MAX]; +}; + +struct mlx5_devcom_match_attr { + u32 flags; + union mlx5_devcom_match_key key; + struct net *net; +}; + enum mlx5_devcom_component { MLX5_DEVCOM_ESW_OFFLOADS, MLX5_DEVCOM_MPV, @@ -25,7 +41,7 @@ void mlx5_devcom_unregister_device(struct mlx5_devcom_dev *devc); struct mlx5_devcom_comp_dev * mlx5_devcom_register_component(struct mlx5_devcom_dev *devc, 
enum mlx5_devcom_component id, - u64 key, + const struct mlx5_devcom_match_attr *attr, mlx5_devcom_event_handler_t handler, void *data); void mlx5_devcom_unregister_component(struct mlx5_devcom_comp_dev *devcom); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/dm.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/dm.c index 7c5516b0a8..8115071c34 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/dm.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/dm.c @@ -30,7 +30,7 @@ struct mlx5_dm *mlx5_dm_create(struct mlx5_core_dev *dev) dm = kzalloc(sizeof(*dm), GFP_KERNEL); if (!dm) - return ERR_PTR(-ENOMEM); + return NULL; spin_lock_init(&dm->lock); @@ -96,7 +96,7 @@ err_modify_hdr: err_steering: kfree(dm); - return ERR_PTR(-ENOMEM); + return NULL; } void mlx5_dm_cleanup(struct mlx5_core_dev *dev) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_ttc.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_ttc.c index ca9ecec358..7adad784ad 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_ttc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_ttc.c @@ -9,7 +9,7 @@ #include "mlx5_core.h" #include "lib/fs_ttc.h" -#define MLX5_TTC_MAX_NUM_GROUPS 4 +#define MLX5_TTC_MAX_NUM_GROUPS 7 #define MLX5_TTC_GROUP_TCPUDP_SIZE (MLX5_TT_IPV6_UDP + 1) struct mlx5_fs_ttc_groups { @@ -31,10 +31,14 @@ static int mlx5_fs_ttc_table_size(const struct mlx5_fs_ttc_groups *groups) /* L3/L4 traffic type classifier */ struct mlx5_ttc_table { int num_groups; + const struct mlx5_fs_ttc_groups *groups; + struct mlx5_core_dev *mdev; struct mlx5_flow_table *t; struct mlx5_flow_group **g; struct mlx5_ttc_rule rules[MLX5_NUM_TT]; struct mlx5_flow_handle *tunnel_rules[MLX5_NUM_TUNNEL_TT]; + u32 refcnt; + struct mutex mutex; /* Protect adding rules for ipsec crypto offload */ }; struct mlx5_flow_table *mlx5_get_ttc_flow_table(struct mlx5_ttc_table *ttc) @@ -163,6 +167,8 @@ static struct mlx5_etype_proto ttc_tunnel_rules[] = { enum TTC_GROUP_TYPE { TTC_GROUPS_DEFAULT = 0, 
TTC_GROUPS_USE_L4_TYPE = 1, + TTC_GROUPS_DEFAULT_ESP = 2, + TTC_GROUPS_USE_L4_TYPE_ESP = 3, }; static const struct mlx5_fs_ttc_groups ttc_groups[] = { @@ -184,6 +190,31 @@ static const struct mlx5_fs_ttc_groups ttc_groups[] = { BIT(0), }, }, + [TTC_GROUPS_DEFAULT_ESP] = { + .num_groups = 6, + .group_size = { + MLX5_TTC_GROUP_TCPUDP_SIZE + BIT(1) + + MLX5_NUM_TUNNEL_TT, + BIT(2), /* decrypted outer L4 */ + BIT(2), /* decrypted inner L4 */ + BIT(1), /* ESP */ + BIT(1), + BIT(0), + }, + }, + [TTC_GROUPS_USE_L4_TYPE_ESP] = { + .use_l4_type = true, + .num_groups = 7, + .group_size = { + MLX5_TTC_GROUP_TCPUDP_SIZE, + BIT(1) + MLX5_NUM_TUNNEL_TT, + BIT(2), /* decrypted outer L4 */ + BIT(2), /* decrypted inner L4 */ + BIT(1), /* ESP */ + BIT(1), + BIT(0), + }, + }, }; static const struct mlx5_fs_ttc_groups inner_ttc_groups[] = { @@ -207,6 +238,23 @@ static const struct mlx5_fs_ttc_groups inner_ttc_groups[] = { }, }; +static const struct mlx5_fs_ttc_groups * +mlx5_ttc_get_fs_groups(bool use_l4_type, bool ipsec_rss) +{ + if (!ipsec_rss) + return use_l4_type ? &ttc_groups[TTC_GROUPS_USE_L4_TYPE] : + &ttc_groups[TTC_GROUPS_DEFAULT]; + + return use_l4_type ? 
&ttc_groups[TTC_GROUPS_USE_L4_TYPE_ESP] : + &ttc_groups[TTC_GROUPS_DEFAULT_ESP]; +} + +bool mlx5_ttc_has_esp_flow_group(struct mlx5_ttc_table *ttc) +{ + return ttc->groups == &ttc_groups[TTC_GROUPS_DEFAULT_ESP] || + ttc->groups == &ttc_groups[TTC_GROUPS_USE_L4_TYPE_ESP]; +} + u8 mlx5_get_proto_by_tunnel_type(enum mlx5_tunnel_types tt) { return ttc_tunnel_rules[tt].proto; @@ -257,6 +305,31 @@ static u8 mlx5_etype_to_ipv(u16 ethertype) return 0; } +static void mlx5_fs_ttc_set_match_ipv_outer(struct mlx5_core_dev *mdev, + struct mlx5_flow_spec *spec, + u16 etype) +{ + int match_ipv_outer = + MLX5_CAP_FLOWTABLE_NIC_RX(mdev, + ft_field_support.outer_ip_version); + u8 ipv; + + ipv = mlx5_etype_to_ipv(etype); + if (match_ipv_outer && ipv) { + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.ip_version); + MLX5_SET(fte_match_param, spec->match_value, + outer_headers.ip_version, ipv); + } else { + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.ethertype); + MLX5_SET(fte_match_param, spec->match_value, + outer_headers.ethertype, etype); + } + + spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; +} + static void mlx5_fs_ttc_set_match_proto(void *headers_c, void *headers_v, u8 proto, bool use_l4_type) { @@ -279,16 +352,12 @@ static void mlx5_fs_ttc_set_match_proto(void *headers_c, void *headers_v, static struct mlx5_flow_handle * mlx5_generate_ttc_rule(struct mlx5_core_dev *dev, struct mlx5_flow_table *ft, struct mlx5_flow_destination *dest, u16 etype, u8 proto, - bool use_l4_type) + bool use_l4_type, bool ipsec_rss) { - int match_ipv_outer = - MLX5_CAP_FLOWTABLE_NIC_RX(dev, - ft_field_support.outer_ip_version); MLX5_DECLARE_FLOW_ACT(flow_act); struct mlx5_flow_handle *rule; struct mlx5_flow_spec *spec; int err = 0; - u8 ipv; spec = kvzalloc(sizeof(*spec), GFP_KERNEL); if (!spec) @@ -305,15 +374,15 @@ mlx5_generate_ttc_rule(struct mlx5_core_dev *dev, struct mlx5_flow_table *ft, proto, use_l4_type); } - ipv = 
mlx5_etype_to_ipv(etype); - if (match_ipv_outer && ipv) { - spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; - MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.ip_version); - MLX5_SET(fte_match_param, spec->match_value, outer_headers.ip_version, ipv); - } else if (etype) { - spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; - MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.ethertype); - MLX5_SET(fte_match_param, spec->match_value, outer_headers.ethertype, etype); + if (etype) + mlx5_fs_ttc_set_match_ipv_outer(dev, spec, etype); + + if (ipsec_rss && proto == IPPROTO_ESP) { + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + misc_parameters_2.ipsec_next_header); + MLX5_SET(fte_match_param, spec->match_value, + misc_parameters_2.ipsec_next_header, 0); + spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS_2; } rule = mlx5_add_flow_rules(ft, spec, &flow_act, dest, 1); @@ -342,12 +411,16 @@ static int mlx5_generate_ttc_table_rules(struct mlx5_core_dev *dev, for (tt = 0; tt < MLX5_NUM_TT; tt++) { struct mlx5_ttc_rule *rule = &rules[tt]; + if (mlx5_ttc_is_decrypted_esp_tt(tt)) + continue; + if (test_bit(tt, params->ignore_dests)) continue; rule->rule = mlx5_generate_ttc_rule(dev, ft, ¶ms->dests[tt], ttc_rules[tt].etype, ttc_rules[tt].proto, - use_l4_type); + use_l4_type, + params->ipsec_rss); if (IS_ERR(rule->rule)) { err = PTR_ERR(rule->rule); rule->rule = NULL; @@ -370,7 +443,7 @@ static int mlx5_generate_ttc_table_rules(struct mlx5_core_dev *dev, ¶ms->tunnel_dests[tt], ttc_tunnel_rules[tt].etype, ttc_tunnel_rules[tt].proto, - use_l4_type); + use_l4_type, false); if (IS_ERR(trules[tt])) { err = PTR_ERR(trules[tt]); trules[tt] = NULL; @@ -385,10 +458,78 @@ del_rules: return err; } -static int mlx5_create_ttc_table_groups(struct mlx5_ttc_table *ttc, - bool use_ipv, - const struct mlx5_fs_ttc_groups *groups) +static int mlx5_create_ttc_table_ipsec_groups(struct mlx5_ttc_table *ttc, + bool use_ipv, + u32 
*in, int *next_ix) { + u8 *mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria); + const struct mlx5_fs_ttc_groups *groups = ttc->groups; + int ix = *next_ix; + + MLX5_SET(fte_match_param, mc, outer_headers.ip_protocol, 0); + + /* decrypted ESP outer group */ + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.l4_type_ext); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += groups->group_size[ttc->num_groups]; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ttc->g[ttc->num_groups] = mlx5_create_flow_group(ttc->t, in); + if (IS_ERR(ttc->g[ttc->num_groups])) + goto err; + ttc->num_groups++; + + MLX5_SET(fte_match_param, mc, outer_headers.l4_type_ext, 0); + + /* decrypted ESP inner group */ + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_INNER_HEADERS); + if (use_ipv) + MLX5_SET(fte_match_param, mc, outer_headers.ip_version, 0); + else + MLX5_SET(fte_match_param, mc, outer_headers.ethertype, 0); + MLX5_SET_TO_ONES(fte_match_param, mc, inner_headers.ip_version); + MLX5_SET_TO_ONES(fte_match_param, mc, inner_headers.l4_type_ext); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += groups->group_size[ttc->num_groups]; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ttc->g[ttc->num_groups] = mlx5_create_flow_group(ttc->t, in); + if (IS_ERR(ttc->g[ttc->num_groups])) + goto err; + ttc->num_groups++; + + MLX5_SET(fte_match_param, mc, inner_headers.ip_version, 0); + MLX5_SET(fte_match_param, mc, inner_headers.l4_type_ext, 0); + + /* undecrypted ESP group */ + MLX5_SET_CFG(in, match_criteria_enable, + MLX5_MATCH_OUTER_HEADERS | MLX5_MATCH_MISC_PARAMETERS_2); + if (use_ipv) + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ip_version); + else + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ethertype); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ip_protocol); + MLX5_SET_TO_ONES(fte_match_param, mc, + misc_parameters_2.ipsec_next_header); + MLX5_SET_CFG(in, start_flow_index, ix); + ix 
+= groups->group_size[ttc->num_groups]; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ttc->g[ttc->num_groups] = mlx5_create_flow_group(ttc->t, in); + if (IS_ERR(ttc->g[ttc->num_groups])) + goto err; + ttc->num_groups++; + + *next_ix = ix; + + return 0; + +err: + return PTR_ERR(ttc->g[ttc->num_groups]); +} + +static int mlx5_create_ttc_table_groups(struct mlx5_ttc_table *ttc, + bool use_ipv) +{ + const struct mlx5_fs_ttc_groups *groups = ttc->groups; int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); int ix = 0; u32 *in; @@ -436,8 +577,18 @@ static int mlx5_create_ttc_table_groups(struct mlx5_ttc_table *ttc, goto err; ttc->num_groups++; + if (mlx5_ttc_has_esp_flow_group(ttc)) { + err = mlx5_create_ttc_table_ipsec_groups(ttc, use_ipv, in, &ix); + if (err) + goto err; + + MLX5_SET(fte_match_param, mc, + misc_parameters_2.ipsec_next_header, 0); + } + /* L3 Group */ MLX5_SET(fte_match_param, mc, outer_headers.ip_protocol, 0); + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); MLX5_SET_CFG(in, start_flow_index, ix); ix += groups->group_size[ttc->num_groups]; MLX5_SET_CFG(in, end_flow_index, ix - 1); @@ -527,6 +678,9 @@ static int mlx5_generate_inner_ttc_table_rules(struct mlx5_core_dev *dev, for (tt = 0; tt < MLX5_NUM_TT; tt++) { struct mlx5_ttc_rule *rule = &rules[tt]; + if (mlx5_ttc_is_decrypted_esp_tt(tt)) + continue; + if (test_bit(tt, params->ignore_dests)) continue; rule->rule = mlx5_generate_inner_ttc_rule(dev, ft, @@ -700,6 +854,7 @@ void mlx5_destroy_ttc_table(struct mlx5_ttc_table *ttc) kfree(ttc->g); mlx5_destroy_flow_table(ttc->t); + mutex_destroy(&ttc->mutex); kvfree(ttc); } @@ -709,7 +864,6 @@ struct mlx5_ttc_table *mlx5_create_ttc_table(struct mlx5_core_dev *dev, bool match_ipv_outer = MLX5_CAP_FLOWTABLE_NIC_RX(dev, ft_field_support.outer_ip_version); - const struct mlx5_fs_ttc_groups *groups; struct mlx5_flow_namespace *ns; struct mlx5_ttc_table *ttc; bool use_l4_type; @@ -738,11 +892,10 @@ struct mlx5_ttc_table 
*mlx5_create_ttc_table(struct mlx5_core_dev *dev, return ERR_PTR(-EOPNOTSUPP); } - groups = use_l4_type ? &ttc_groups[TTC_GROUPS_USE_L4_TYPE] : - &ttc_groups[TTC_GROUPS_DEFAULT]; + ttc->groups = mlx5_ttc_get_fs_groups(use_l4_type, params->ipsec_rss); WARN_ON_ONCE(params->ft_attr.max_fte); - params->ft_attr.max_fte = mlx5_fs_ttc_table_size(groups); + params->ft_attr.max_fte = mlx5_fs_ttc_table_size(ttc->groups); ttc->t = mlx5_create_flow_table(ns, ¶ms->ft_attr); if (IS_ERR(ttc->t)) { err = PTR_ERR(ttc->t); @@ -750,7 +903,7 @@ struct mlx5_ttc_table *mlx5_create_ttc_table(struct mlx5_core_dev *dev, return ERR_PTR(err); } - err = mlx5_create_ttc_table_groups(ttc, match_ipv_outer, groups); + err = mlx5_create_ttc_table_groups(ttc, match_ipv_outer); if (err) goto destroy_ft; @@ -758,6 +911,9 @@ struct mlx5_ttc_table *mlx5_create_ttc_table(struct mlx5_core_dev *dev, if (err) goto destroy_ft; + ttc->mdev = dev; + mutex_init(&ttc->mutex); + return ttc; destroy_ft: @@ -791,3 +947,194 @@ int mlx5_ttc_fwd_default_dest(struct mlx5_ttc_table *ttc, return mlx5_ttc_fwd_dest(ttc, type, &dest); } + +static void _mlx5_ttc_destroy_ipsec_rules(struct mlx5_ttc_table *ttc) +{ + enum mlx5_traffic_types i; + + for (i = MLX5_TT_DECRYPTED_ESP_OUTER_IPV4_TCP; + i <= MLX5_TT_DECRYPTED_ESP_INNER_IPV6_UDP; i++) { + if (!ttc->rules[i].rule) + continue; + + mlx5_del_flow_rules(ttc->rules[i].rule); + ttc->rules[i].rule = NULL; + } +} + +void mlx5_ttc_destroy_ipsec_rules(struct mlx5_ttc_table *ttc) +{ + if (!mlx5_ttc_has_esp_flow_group(ttc)) + return; + + mutex_lock(&ttc->mutex); + if (--ttc->refcnt) + goto unlock; + + _mlx5_ttc_destroy_ipsec_rules(ttc); +unlock: + mutex_unlock(&ttc->mutex); +} + +static int mlx5_ttc_get_tt_attrs(enum mlx5_traffic_types type, + u16 *etype, int *l4_type_ext, + enum mlx5_traffic_types *tir_tt) +{ + switch (type) { + case MLX5_TT_DECRYPTED_ESP_OUTER_IPV4_TCP: + case MLX5_TT_DECRYPTED_ESP_INNER_IPV4_TCP: + *etype = ETH_P_IP; + *l4_type_ext = MLX5_PACKET_L4_TYPE_EXT_TCP; 
+ *tir_tt = MLX5_TT_IPV4_TCP; + break; + case MLX5_TT_DECRYPTED_ESP_OUTER_IPV6_TCP: + case MLX5_TT_DECRYPTED_ESP_INNER_IPV6_TCP: + *etype = ETH_P_IPV6; + *l4_type_ext = MLX5_PACKET_L4_TYPE_EXT_TCP; + *tir_tt = MLX5_TT_IPV6_TCP; + break; + case MLX5_TT_DECRYPTED_ESP_OUTER_IPV4_UDP: + case MLX5_TT_DECRYPTED_ESP_INNER_IPV4_UDP: + *etype = ETH_P_IP; + *l4_type_ext = MLX5_PACKET_L4_TYPE_EXT_UDP; + *tir_tt = MLX5_TT_IPV4_UDP; + break; + case MLX5_TT_DECRYPTED_ESP_OUTER_IPV6_UDP: + case MLX5_TT_DECRYPTED_ESP_INNER_IPV6_UDP: + *etype = ETH_P_IPV6; + *l4_type_ext = MLX5_PACKET_L4_TYPE_EXT_UDP; + *tir_tt = MLX5_TT_IPV6_UDP; + break; + default: + return -EINVAL; + } + + return 0; +} + +static struct mlx5_flow_handle * +mlx5_ttc_create_ipsec_outer_rule(struct mlx5_ttc_table *ttc, + enum mlx5_traffic_types type) +{ + struct mlx5_flow_destination dest; + MLX5_DECLARE_FLOW_ACT(flow_act); + enum mlx5_traffic_types tir_tt; + struct mlx5_flow_handle *rule; + struct mlx5_flow_spec *spec; + int l4_type_ext; + u16 etype; + int err; + + err = mlx5_ttc_get_tt_attrs(type, &etype, &l4_type_ext, &tir_tt); + if (err) + return ERR_PTR(err); + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return ERR_PTR(-ENOMEM); + + mlx5_fs_ttc_set_match_ipv_outer(ttc->mdev, spec, etype); + + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.l4_type_ext); + MLX5_SET(fte_match_param, spec->match_value, + outer_headers.l4_type_ext, l4_type_ext); + + dest = mlx5_ttc_get_default_dest(ttc, tir_tt); + + rule = mlx5_add_flow_rules(ttc->t, spec, &flow_act, &dest, 1); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + mlx5_core_err(ttc->mdev, "%s: add rule failed\n", __func__); + } + + kvfree(spec); + return err ? 
ERR_PTR(err) : rule; +} + +static struct mlx5_flow_handle * +mlx5_ttc_create_ipsec_inner_rule(struct mlx5_ttc_table *ttc, + struct mlx5_ttc_table *inner_ttc, + enum mlx5_traffic_types type) +{ + struct mlx5_flow_destination dest; + MLX5_DECLARE_FLOW_ACT(flow_act); + enum mlx5_traffic_types tir_tt; + struct mlx5_flow_handle *rule; + struct mlx5_flow_spec *spec; + int l4_type_ext; + u16 etype; + int err; + + err = mlx5_ttc_get_tt_attrs(type, &etype, &l4_type_ext, &tir_tt); + if (err) + return ERR_PTR(err); + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return ERR_PTR(-ENOMEM); + + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + inner_headers.ip_version); + MLX5_SET(fte_match_param, spec->match_value, + inner_headers.ip_version, mlx5_etype_to_ipv(etype)); + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + inner_headers.l4_type_ext); + MLX5_SET(fte_match_param, spec->match_value, + inner_headers.l4_type_ext, l4_type_ext); + + dest = mlx5_ttc_get_default_dest(inner_ttc, tir_tt); + + spec->match_criteria_enable = MLX5_MATCH_INNER_HEADERS; + + rule = mlx5_add_flow_rules(ttc->t, spec, &flow_act, &dest, 1); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + mlx5_core_err(ttc->mdev, "%s: add rule failed\n", __func__); + } + + kvfree(spec); + return err ? 
ERR_PTR(err) : rule; +} + +int mlx5_ttc_create_ipsec_rules(struct mlx5_ttc_table *ttc, + struct mlx5_ttc_table *inner_ttc) +{ + struct mlx5_flow_handle *rule; + enum mlx5_traffic_types i; + + if (!mlx5_ttc_has_esp_flow_group(ttc)) + return 0; + + mutex_lock(&ttc->mutex); + if (ttc->refcnt) + goto skip; + + for (i = MLX5_TT_DECRYPTED_ESP_OUTER_IPV4_TCP; + i <= MLX5_TT_DECRYPTED_ESP_OUTER_IPV6_UDP; i++) { + rule = mlx5_ttc_create_ipsec_outer_rule(ttc, i); + if (IS_ERR(rule)) + goto err_out; + + ttc->rules[i].rule = rule; + } + + for (i = MLX5_TT_DECRYPTED_ESP_INNER_IPV4_TCP; + i <= MLX5_TT_DECRYPTED_ESP_INNER_IPV6_UDP; i++) { + rule = mlx5_ttc_create_ipsec_inner_rule(ttc, inner_ttc, i); + if (IS_ERR(rule)) + goto err_out; + + ttc->rules[i].rule = rule; + } + +skip: + ttc->refcnt++; + mutex_unlock(&ttc->mutex); + return 0; + +err_out: + _mlx5_ttc_destroy_ipsec_rules(ttc); + mutex_unlock(&ttc->mutex); + return PTR_ERR(rule); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_ttc.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_ttc.h index ab9434fe3a..95f6e56724 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_ttc.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_ttc.h @@ -18,6 +18,14 @@ enum mlx5_traffic_types { MLX5_TT_IPV4, MLX5_TT_IPV6, MLX5_TT_ANY, + MLX5_TT_DECRYPTED_ESP_OUTER_IPV4_TCP, + MLX5_TT_DECRYPTED_ESP_OUTER_IPV6_TCP, + MLX5_TT_DECRYPTED_ESP_OUTER_IPV4_UDP, + MLX5_TT_DECRYPTED_ESP_OUTER_IPV6_UDP, + MLX5_TT_DECRYPTED_ESP_INNER_IPV4_TCP, + MLX5_TT_DECRYPTED_ESP_INNER_IPV6_TCP, + MLX5_TT_DECRYPTED_ESP_INNER_IPV4_UDP, + MLX5_TT_DECRYPTED_ESP_INNER_IPV6_UDP, MLX5_NUM_TT, MLX5_NUM_INDIR_TIRS = MLX5_TT_ANY, }; @@ -47,6 +55,7 @@ struct ttc_params { bool inner_ttc; DECLARE_BITMAP(ignore_tunnel_dests, MLX5_NUM_TUNNEL_TT); struct mlx5_flow_destination tunnel_dests[MLX5_NUM_TUNNEL_TT]; + bool ipsec_rss; }; const char *mlx5_ttc_get_name(enum mlx5_traffic_types tt); @@ -70,4 +79,14 @@ int mlx5_ttc_fwd_default_dest(struct mlx5_ttc_table 
*ttc, bool mlx5_tunnel_inner_ft_supported(struct mlx5_core_dev *mdev); u8 mlx5_get_proto_by_tunnel_type(enum mlx5_tunnel_types tt); +bool mlx5_ttc_has_esp_flow_group(struct mlx5_ttc_table *ttc); +int mlx5_ttc_create_ipsec_rules(struct mlx5_ttc_table *ttc, + struct mlx5_ttc_table *inner_ttc); +void mlx5_ttc_destroy_ipsec_rules(struct mlx5_ttc_table *ttc); +static inline bool mlx5_ttc_is_decrypted_esp_tt(enum mlx5_traffic_types tt) +{ + return tt >= MLX5_TT_DECRYPTED_ESP_OUTER_IPV4_TCP && + tt <= MLX5_TT_DECRYPTED_ESP_INNER_IPV6_UDP; +} + #endif /* __MLX5_FS_TTC_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/ipsec_fs_roce.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/ipsec_fs_roce.c index b7d4b1a2ba..d524f02205 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/ipsec_fs_roce.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/ipsec_fs_roce.c @@ -164,6 +164,8 @@ ipsec_fs_roce_rx_rule_setup(struct mlx5_core_dev *mdev, roce->rule = rule; memset(spec, 0, sizeof(*spec)); + if (default_dst->type == MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE) + flow_act.flags |= FLOW_ACT_IGNORE_FLOW_LEVEL; rule = mlx5_add_flow_rules(roce->ft, spec, &flow_act, default_dst, 1); if (IS_ERR(rule)) { err = PTR_ERR(rule); @@ -178,6 +180,8 @@ ipsec_fs_roce_rx_rule_setup(struct mlx5_core_dev *mdev, goto out; flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + if (default_dst->type == MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE) + flow_act.flags &= ~FLOW_ACT_IGNORE_FLOW_LEVEL; dst.type = MLX5_FLOW_DESTINATION_TYPE_TABLE_TYPE; dst.ft = roce->ft_rdma; rule = mlx5_add_flow_rules(roce->nic_master_ft, NULL, &flow_act, &dst, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/macsec_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/macsec_fs.c index 762d55ba9e..e6be2f01da 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/macsec_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/macsec_fs.c @@ -45,11 +45,7 @@ #define MLX5_SECTAG_HEADER_SIZE_WITHOUT_SCI 0x8 #define 
MLX5_SECTAG_HEADER_SIZE_WITH_SCI (MLX5_SECTAG_HEADER_SIZE_WITHOUT_SCI + MACSEC_SCI_LEN) -/* MACsec RX flow steering */ -#define MLX5_ETH_WQE_FT_META_MACSEC_MASK 0x3E - /* MACsec fs_id handling for steering */ -#define macsec_fs_set_tx_fs_id(fs_id) (MLX5_ETH_WQE_FT_META_MACSEC | (fs_id) << 2) #define macsec_fs_set_rx_fs_id(fs_id) ((fs_id) | BIT(30)) struct mlx5_sectag_header { @@ -597,7 +593,7 @@ static int macsec_fs_tx_setup_fte(struct mlx5_macsec_fs *macsec_fs, MLX5_SET(fte_match_param, spec->match_criteria, misc_parameters_2.metadata_reg_a, MLX5_ETH_WQE_FT_META_MACSEC_MASK); MLX5_SET(fte_match_param, spec->match_value, misc_parameters_2.metadata_reg_a, - macsec_fs_set_tx_fs_id(id)); + MLX5_MACSEC_TX_METADATA(id)); *fs_id = id; flow_act->crypto.type = MLX5_FLOW_CONTEXT_ENCRYPT_DECRYPT_TYPE_MACSEC; @@ -2219,9 +2215,11 @@ static int mlx5_macsec_fs_add_roce_rule_tx(struct mlx5_macsec_fs *macsec_fs, u32 MLX5_SET(set_action_in, action, action_type, MLX5_ACTION_TYPE_SET); MLX5_SET(set_action_in, action, field, MLX5_ACTION_IN_FIELD_METADATA_REG_A); - MLX5_SET(set_action_in, action, data, macsec_fs_set_tx_fs_id(fs_id)); - MLX5_SET(set_action_in, action, offset, 0); - MLX5_SET(set_action_in, action, length, 32); + MLX5_SET(set_action_in, action, data, + mlx5_macsec_fs_set_tx_fs_id(fs_id)); + MLX5_SET(set_action_in, action, offset, + MLX5_ETH_WQE_FT_META_MACSEC_SHIFT); + MLX5_SET(set_action_in, action, length, 8); modify_hdr = mlx5_modify_header_alloc(mdev, MLX5_FLOW_NAMESPACE_RDMA_TX_MACSEC, 1, action); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/macsec_fs.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/macsec_fs.h index 34b80c3ef6..15acaff436 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/macsec_fs.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/macsec_fs.h @@ -12,6 +12,21 @@ #define MLX5_MACSEC_METADATA_MARKER(metadata) ((((metadata) >> 30) & 0x3) == 0x1) #define MLX5_MACSEC_RX_METADAT_HANDLE(metadata) ((metadata) & MLX5_MACSEC_RX_FS_ID_MASK) 
+/* MACsec TX flow steering */ +#define MLX5_ETH_WQE_FT_META_MACSEC_MASK \ + (MLX5_ETH_WQE_FT_META_MACSEC | MLX5_ETH_WQE_FT_META_MACSEC_FS_ID_MASK) +#define MLX5_ETH_WQE_FT_META_MACSEC_SHIFT MLX5_ETH_WQE_FT_META_SHIFT + +/* MACsec fs_id handling for steering */ +#define mlx5_macsec_fs_set_tx_fs_id(fs_id) \ + (((MLX5_ETH_WQE_FT_META_MACSEC) >> MLX5_ETH_WQE_FT_META_MACSEC_SHIFT) \ + | ((fs_id) << 2)) + +#define MLX5_MACSEC_TX_METADATA(fs_id) \ + (mlx5_macsec_fs_set_tx_fs_id(fs_id) << \ + MLX5_ETH_WQE_FT_META_MACSEC_SHIFT) + +/* MACsec fs_id uses 4 bits, supports up to 16 interfaces */ #define MLX5_MACSEC_NUM_OF_SUPPORTED_INTERFACES 16 struct mlx5_macsec_fs; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h index 37d5f44559..74ea5da58b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h @@ -45,14 +45,22 @@ int mlx5_crdump_enable(struct mlx5_core_dev *dev); void mlx5_crdump_disable(struct mlx5_core_dev *dev); int mlx5_crdump_collect(struct mlx5_core_dev *dev, u32 *cr_data); -static inline struct net *mlx5_core_net(struct mlx5_core_dev *dev) -{ - return devlink_net(priv_to_devlink(dev)); -} - static inline struct net_device *mlx5_uplink_netdev_get(struct mlx5_core_dev *mdev) { - return mdev->mlx5e_res.uplink_netdev; + struct mlx5e_resources *mlx5e_res = &mdev->mlx5e_res; + struct net_device *netdev; + + mutex_lock(&mlx5e_res->uplink_netdev_lock); + netdev = mlx5e_res->uplink_netdev; + netdev_hold(netdev, &mlx5e_res->tracker, GFP_KERNEL); + mutex_unlock(&mlx5e_res->uplink_netdev_lock); + return netdev; +} + +static inline void mlx5_uplink_netdev_put(struct mlx5_core_dev *mdev, + struct net_device *netdev) +{ + netdev_put(netdev, &mdev->mlx5e_res.tracker); } struct mlx5_sd; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c index 4450091e18..4a88a42ae4 100644 --- 
a/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c @@ -65,13 +65,14 @@ static int del_l2table_entry_cmd(struct mlx5_core_dev *dev, u32 index) /* UC L2 table hash node */ struct l2table_node { struct l2addr_node node; - u32 index; /* index in HW l2 table */ + int index; /* index in HW l2 table */ int ref_count; }; struct mlx5_mpfs { struct hlist_head hash[MLX5_L2_ADDR_HASH_SIZE]; struct mutex lock; /* Synchronize l2 table access */ + bool enabled; u32 size; unsigned long *bitmap; }; @@ -114,6 +115,8 @@ int mlx5_mpfs_init(struct mlx5_core_dev *dev) return -ENOMEM; } + mpfs->enabled = true; + dev->priv.mpfs = mpfs; return 0; } @@ -135,7 +138,7 @@ int mlx5_mpfs_add_mac(struct mlx5_core_dev *dev, u8 *mac) struct mlx5_mpfs *mpfs = dev->priv.mpfs; struct l2table_node *l2addr; int err = 0; - u32 index; + int index; if (!mpfs) return 0; @@ -148,30 +151,34 @@ int mlx5_mpfs_add_mac(struct mlx5_core_dev *dev, u8 *mac) goto out; } - err = alloc_l2table_index(mpfs, &index); - if (err) - goto out; - l2addr = l2addr_hash_add(mpfs->hash, mac, struct l2table_node, GFP_KERNEL); if (!l2addr) { err = -ENOMEM; - goto hash_add_err; + goto out; } - err = set_l2table_entry_cmd(dev, index, mac); - if (err) - goto set_table_entry_err; + index = -1; + + if (mpfs->enabled) { + err = alloc_l2table_index(mpfs, &index); + if (err) + goto hash_del; + err = set_l2table_entry_cmd(dev, index, mac); + if (err) + goto free_l2table_index; + mlx5_core_dbg(dev, "MPFS entry %pM, set @index (%d)\n", + l2addr->node.addr, index); + } l2addr->index = index; l2addr->ref_count = 1; mlx5_core_dbg(dev, "MPFS mac added %pM, index (%d)\n", mac, index); goto out; - -set_table_entry_err: - l2addr_hash_del(l2addr); -hash_add_err: +free_l2table_index: free_l2table_index(mpfs, index); +hash_del: + l2addr_hash_del(l2addr); out: mutex_unlock(&mpfs->lock); return err; @@ -183,7 +190,7 @@ int mlx5_mpfs_del_mac(struct mlx5_core_dev *dev, u8 *mac) struct mlx5_mpfs 
*mpfs = dev->priv.mpfs; struct l2table_node *l2addr; int err = 0; - u32 index; + int index; if (!mpfs) return 0; @@ -200,12 +207,87 @@ int mlx5_mpfs_del_mac(struct mlx5_core_dev *dev, u8 *mac) goto unlock; index = l2addr->index; - del_l2table_entry_cmd(dev, index); + if (index >= 0) { + del_l2table_entry_cmd(dev, index); + free_l2table_index(mpfs, index); + mlx5_core_dbg(dev, "MPFS entry %pM, deleted @index (%d)\n", + mac, index); + } l2addr_hash_del(l2addr); - free_l2table_index(mpfs, index); mlx5_core_dbg(dev, "MPFS mac deleted %pM, index (%d)\n", mac, index); unlock: mutex_unlock(&mpfs->lock); return err; } EXPORT_SYMBOL(mlx5_mpfs_del_mac); + +int mlx5_mpfs_enable(struct mlx5_core_dev *dev) +{ + struct mlx5_mpfs *mpfs = dev->priv.mpfs; + struct l2table_node *l2addr; + struct hlist_node *n; + int err = 0, i; + + if (!mpfs) + return -ENODEV; + + mutex_lock(&mpfs->lock); + if (mpfs->enabled) + goto out; + mpfs->enabled = true; + mlx5_core_dbg(dev, "MPFS enabling mpfs\n"); + + mlx5_mpfs_foreach(l2addr, n, mpfs, i) { + u32 index; + + err = alloc_l2table_index(mpfs, &index); + if (err) { + mlx5_core_err(dev, "Failed to allocated MPFS index for %pM, err(%d)\n", + l2addr->node.addr, err); + goto out; + } + + err = set_l2table_entry_cmd(dev, index, l2addr->node.addr); + if (err) { + mlx5_core_err(dev, "Failed to set MPFS l2table entry for %pM index=%d, err(%d)\n", + l2addr->node.addr, index, err); + free_l2table_index(mpfs, index); + goto out; + } + + l2addr->index = index; + mlx5_core_dbg(dev, "MPFS entry %pM, set @index (%d)\n", + l2addr->node.addr, l2addr->index); + } +out: + mutex_unlock(&mpfs->lock); + return err; +} + +void mlx5_mpfs_disable(struct mlx5_core_dev *dev) +{ + struct mlx5_mpfs *mpfs = dev->priv.mpfs; + struct l2table_node *l2addr; + struct hlist_node *n; + int i; + + if (!mpfs) + return; + + mutex_lock(&mpfs->lock); + if (!mpfs->enabled) + goto unlock; + mlx5_mpfs_foreach(l2addr, n, mpfs, i) { + if (l2addr->index < 0) + continue; + 
del_l2table_entry_cmd(dev, l2addr->index); + free_l2table_index(mpfs, l2addr->index); + mlx5_core_dbg(dev, "MPFS entry %pM, deleted @index (%d)\n", + l2addr->node.addr, l2addr->index); + l2addr->index = -1; + } + mpfs->enabled = false; + mlx5_core_dbg(dev, "MPFS disabled\n"); +unlock: + mutex_unlock(&mpfs->lock); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.h index 4a293542a7..9c63838ce1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.h @@ -45,6 +45,10 @@ struct l2addr_node { u8 addr[ETH_ALEN]; }; +#define mlx5_mpfs_foreach(hs, tmp, mpfs, i) \ + for (i = 0; i < MLX5_L2_ADDR_HASH_SIZE; i++) \ + hlist_for_each_entry_safe(hs, tmp, &(mpfs)->hash[i], node.hlist) + #define for_each_l2hash_node(hn, tmp, hash, i) \ for (i = 0; i < MLX5_L2_ADDR_HASH_SIZE; i++) \ hlist_for_each_entry_safe(hn, tmp, &(hash)[i], hlist) @@ -82,11 +86,16 @@ struct l2addr_node { }) #ifdef CONFIG_MLX5_MPFS +struct mlx5_core_dev; int mlx5_mpfs_init(struct mlx5_core_dev *dev); void mlx5_mpfs_cleanup(struct mlx5_core_dev *dev); +int mlx5_mpfs_enable(struct mlx5_core_dev *dev); +void mlx5_mpfs_disable(struct mlx5_core_dev *dev); #else /* #ifndef CONFIG_MLX5_MPFS */ static inline int mlx5_mpfs_init(struct mlx5_core_dev *dev) { return 0; } static inline void mlx5_mpfs_cleanup(struct mlx5_core_dev *dev) {} +static inline int mlx5_mpfs_enable(struct mlx5_core_dev *dev) { return 0; } +static inline void mlx5_mpfs_disable(struct mlx5_core_dev *dev) {} #endif #endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/nv_param.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/nv_param.c new file mode 100644 index 0000000000..459a0b4d08 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/nv_param.c @@ -0,0 +1,567 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
*/ + +#include "nv_param.h" +#include "mlx5_core.h" + +enum { + MLX5_CLASS_0_CTRL_ID_NV_GLOBAL_PCI_CONF = 0x80, + MLX5_CLASS_0_CTRL_ID_NV_GLOBAL_PCI_CAP = 0x81, + MLX5_CLASS_0_CTRL_ID_NV_SW_OFFLOAD_CONFIG = 0x10a, + + MLX5_CLASS_3_CTRL_ID_NV_PF_PCI_CONF = 0x80, +}; + +struct mlx5_ifc_configuration_item_type_class_global_bits { + u8 type_class[0x8]; + u8 parameter_index[0x18]; +}; + +struct mlx5_ifc_configuration_item_type_class_per_host_pf_bits { + u8 type_class[0x8]; + u8 pf_index[0x6]; + u8 pci_bus_index[0x8]; + u8 parameter_index[0xa]; +}; + +union mlx5_ifc_config_item_type_auto_bits { + struct mlx5_ifc_configuration_item_type_class_global_bits + configuration_item_type_class_global; + struct mlx5_ifc_configuration_item_type_class_per_host_pf_bits + configuration_item_type_class_per_host_pf; + u8 reserved_at_0[0x20]; +}; + +struct mlx5_ifc_config_item_bits { + u8 valid[0x2]; + u8 priority[0x2]; + u8 header_type[0x2]; + u8 ovr_en[0x1]; + u8 rd_en[0x1]; + u8 access_mode[0x2]; + u8 reserved_at_a[0x1]; + u8 writer_id[0x5]; + u8 version[0x4]; + u8 reserved_at_14[0x2]; + u8 host_id_valid[0x1]; + u8 length[0x9]; + + union mlx5_ifc_config_item_type_auto_bits type; + + u8 reserved_at_40[0x10]; + u8 crc16[0x10]; +}; + +struct mlx5_ifc_mnvda_reg_bits { + struct mlx5_ifc_config_item_bits configuration_item_header; + + u8 configuration_item_data[64][0x20]; +}; + +struct mlx5_ifc_nv_global_pci_conf_bits { + u8 sriov_valid[0x1]; + u8 reserved_at_1[0x10]; + u8 per_pf_total_vf[0x1]; + u8 reserved_at_12[0xe]; + + u8 sriov_en[0x1]; + u8 reserved_at_21[0xf]; + u8 total_vfs[0x10]; + + u8 reserved_at_40[0x20]; +}; + +struct mlx5_ifc_nv_global_pci_cap_bits { + u8 max_vfs_per_pf_valid[0x1]; + u8 reserved_at_1[0x13]; + u8 per_pf_total_vf_supported[0x1]; + u8 reserved_at_15[0xb]; + + u8 sriov_support[0x1]; + u8 reserved_at_21[0xf]; + u8 max_vfs_per_pf[0x10]; + + u8 reserved_at_40[0x60]; +}; + +struct mlx5_ifc_nv_pf_pci_conf_bits { + u8 reserved_at_0[0x9]; + u8 pf_total_vf_en[0x1]; + u8 
reserved_at_a[0x16]; + + u8 reserved_at_20[0x20]; + + u8 reserved_at_40[0x10]; + u8 total_vf[0x10]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_nv_sw_offload_conf_bits { + u8 ip_over_vxlan_port[0x10]; + u8 tunnel_ecn_copy_offload_disable[0x1]; + u8 pci_atomic_mode[0x3]; + u8 sr_enable[0x1]; + u8 ptp_cyc2realtime[0x1]; + u8 vector_calc_disable[0x1]; + u8 uctx_en[0x1]; + u8 prio_tag_required_en[0x1]; + u8 esw_fdb_ipv4_ttl_modify_enable[0x1]; + u8 mkey_by_name[0x1]; + u8 ip_over_vxlan_en[0x1]; + u8 one_qp_per_recovery[0x1]; + u8 cqe_compression[0x3]; + u8 tunnel_udp_entropy_proto_disable[0x1]; + u8 reserved_at_21[0x1]; + u8 ar_enable[0x1]; + u8 log_max_outstanding_wqe[0x5]; + u8 vf_migration[0x2]; + u8 log_tx_psn_win[0x6]; + u8 lro_log_timeout3[0x4]; + u8 lro_log_timeout2[0x4]; + u8 lro_log_timeout1[0x4]; + u8 lro_log_timeout0[0x4]; +}; + +#define MNVDA_HDR_SZ \ + (MLX5_ST_SZ_BYTES(mnvda_reg) - \ + MLX5_BYTE_OFF(mnvda_reg, configuration_item_data)) + +#define MLX5_SET_CFG_ITEM_TYPE(_cls_name, _mnvda_ptr, _field, _val) \ + MLX5_SET(mnvda_reg, _mnvda_ptr, \ + configuration_item_header.type.configuration_item_type_class_##_cls_name._field, \ + _val) + +#define MLX5_SET_CFG_HDR_LEN(_mnvda_ptr, _cls_name) \ + MLX5_SET(mnvda_reg, _mnvda_ptr, configuration_item_header.length, \ + MLX5_ST_SZ_BYTES(_cls_name)) + +#define MLX5_GET_CFG_HDR_LEN(_mnvda_ptr) \ + MLX5_GET(mnvda_reg, _mnvda_ptr, configuration_item_header.length) + +static int mlx5_nv_param_read(struct mlx5_core_dev *dev, void *mnvda, + size_t len) +{ + u32 param_idx, type_class; + u32 header_len; + void *cls_ptr; + int err; + + if (WARN_ON(len > MLX5_ST_SZ_BYTES(mnvda_reg)) || len < MNVDA_HDR_SZ) + return -EINVAL; /* A caller bug */ + + err = mlx5_core_access_reg(dev, mnvda, len, mnvda, len, MLX5_REG_MNVDA, + 0, 0); + if (!err) + return 0; + + cls_ptr = MLX5_ADDR_OF(mnvda_reg, mnvda, + configuration_item_header.type.configuration_item_type_class_global); + + type_class = 
MLX5_GET(configuration_item_type_class_global, cls_ptr, + type_class); + param_idx = MLX5_GET(configuration_item_type_class_global, cls_ptr, + parameter_index); + header_len = MLX5_GET_CFG_HDR_LEN(mnvda); + + mlx5_core_warn(dev, "Failed to read mnvda reg: type_class 0x%x, param_idx 0x%x, header_len %u, err %d\n", + type_class, param_idx, header_len, err); + + return -EOPNOTSUPP; +} + +static int mlx5_nv_param_write(struct mlx5_core_dev *dev, void *mnvda, + size_t len) +{ + if (WARN_ON(len > MLX5_ST_SZ_BYTES(mnvda_reg)) || len < MNVDA_HDR_SZ) + return -EINVAL; + + if (WARN_ON(MLX5_GET_CFG_HDR_LEN(mnvda) == 0)) + return -EINVAL; + + return mlx5_core_access_reg(dev, mnvda, len, mnvda, len, MLX5_REG_MNVDA, + 0, 1); +} + +static int +mlx5_nv_param_read_sw_offload_conf(struct mlx5_core_dev *dev, void *mnvda, + size_t len) +{ + MLX5_SET_CFG_ITEM_TYPE(global, mnvda, type_class, 0); + MLX5_SET_CFG_ITEM_TYPE(global, mnvda, parameter_index, + MLX5_CLASS_0_CTRL_ID_NV_SW_OFFLOAD_CONFIG); + MLX5_SET_CFG_HDR_LEN(mnvda, nv_sw_offload_conf); + + return mlx5_nv_param_read(dev, mnvda, len); +} + +static const char *const + cqe_compress_str[] = { "balanced", "aggressive" }; + +static int +mlx5_nv_param_devlink_cqe_compress_get(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + u32 mnvda[MLX5_ST_SZ_DW(mnvda_reg)] = {}; + u8 value = U8_MAX; + void *data; + int err; + + err = mlx5_nv_param_read_sw_offload_conf(dev, mnvda, sizeof(mnvda)); + if (err) + return err; + + data = MLX5_ADDR_OF(mnvda_reg, mnvda, configuration_item_data); + value = MLX5_GET(nv_sw_offload_conf, data, cqe_compression); + + if (value >= ARRAY_SIZE(cqe_compress_str)) + return -EOPNOTSUPP; + + strscpy(ctx->val.vstr, cqe_compress_str[value], sizeof(ctx->val.vstr)); + return 0; +} + +static int +mlx5_nv_param_devlink_cqe_compress_validate(struct devlink *devlink, u32 id, + union devlink_param_value val, + struct netlink_ext_ack *extack) +{ + 
int i; + + for (i = 0; i < ARRAY_SIZE(cqe_compress_str); i++) { + if (!strcmp(val.vstr, cqe_compress_str[i])) + return 0; + } + + NL_SET_ERR_MSG_MOD(extack, + "Invalid value, supported values are balanced/aggressive"); + return -EOPNOTSUPP; +} + +static int +mlx5_nv_param_devlink_cqe_compress_set(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx, + struct netlink_ext_ack *extack) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + u32 mnvda[MLX5_ST_SZ_DW(mnvda_reg)] = {}; + int err = 0; + void *data; + u8 value; + + if (!strcmp(ctx->val.vstr, "aggressive")) + value = 1; + else /* balanced: can't be anything else already validated above */ + value = 0; + + err = mlx5_nv_param_read_sw_offload_conf(dev, mnvda, sizeof(mnvda)); + if (err) { + NL_SET_ERR_MSG_MOD(extack, + "Failed to read sw_offload_conf mnvda reg"); + return err; + } + + data = MLX5_ADDR_OF(mnvda_reg, mnvda, configuration_item_data); + MLX5_SET(nv_sw_offload_conf, data, cqe_compression, value); + + return mlx5_nv_param_write(dev, mnvda, sizeof(mnvda)); +} + +static int mlx5_nv_param_read_global_pci_conf(struct mlx5_core_dev *dev, + void *mnvda, size_t len) +{ + MLX5_SET_CFG_ITEM_TYPE(global, mnvda, type_class, 0); + MLX5_SET_CFG_ITEM_TYPE(global, mnvda, parameter_index, + MLX5_CLASS_0_CTRL_ID_NV_GLOBAL_PCI_CONF); + MLX5_SET_CFG_HDR_LEN(mnvda, nv_global_pci_conf); + + return mlx5_nv_param_read(dev, mnvda, len); +} + +static int mlx5_nv_param_read_global_pci_cap(struct mlx5_core_dev *dev, + void *mnvda, size_t len) +{ + MLX5_SET_CFG_ITEM_TYPE(global, mnvda, type_class, 0); + MLX5_SET_CFG_ITEM_TYPE(global, mnvda, parameter_index, + MLX5_CLASS_0_CTRL_ID_NV_GLOBAL_PCI_CAP); + MLX5_SET_CFG_HDR_LEN(mnvda, nv_global_pci_cap); + + return mlx5_nv_param_read(dev, mnvda, len); +} + +static int mlx5_nv_param_read_per_host_pf_conf(struct mlx5_core_dev *dev, + void *mnvda, size_t len) +{ + MLX5_SET_CFG_ITEM_TYPE(per_host_pf, mnvda, type_class, 3); + MLX5_SET_CFG_ITEM_TYPE(per_host_pf, mnvda, 
parameter_index, + MLX5_CLASS_3_CTRL_ID_NV_PF_PCI_CONF); + MLX5_SET_CFG_HDR_LEN(mnvda, nv_pf_pci_conf); + + return mlx5_nv_param_read(dev, mnvda, len); +} + +static int mlx5_devlink_enable_sriov_get(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + u32 mnvda[MLX5_ST_SZ_DW(mnvda_reg)] = {}; + bool sriov_en = false; + void *data; + int err; + + err = mlx5_nv_param_read_global_pci_cap(dev, mnvda, sizeof(mnvda)); + if (err) + return err; + + data = MLX5_ADDR_OF(mnvda_reg, mnvda, configuration_item_data); + if (!MLX5_GET(nv_global_pci_cap, data, sriov_support)) { + ctx->val.vbool = false; + return 0; + } + + memset(mnvda, 0, sizeof(mnvda)); + err = mlx5_nv_param_read_global_pci_conf(dev, mnvda, sizeof(mnvda)); + if (err) + return err; + + data = MLX5_ADDR_OF(mnvda_reg, mnvda, configuration_item_data); + sriov_en = MLX5_GET(nv_global_pci_conf, data, sriov_en); + if (!MLX5_GET(nv_global_pci_conf, data, per_pf_total_vf)) { + ctx->val.vbool = sriov_en; + return 0; + } + + /* SRIOV is per PF */ + memset(mnvda, 0, sizeof(mnvda)); + err = mlx5_nv_param_read_per_host_pf_conf(dev, mnvda, sizeof(mnvda)); + if (err) + return err; + + data = MLX5_ADDR_OF(mnvda_reg, mnvda, configuration_item_data); + ctx->val.vbool = sriov_en && + MLX5_GET(nv_pf_pci_conf, data, pf_total_vf_en); + return 0; +} + +static int mlx5_devlink_enable_sriov_set(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx, + struct netlink_ext_ack *extack) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + u32 mnvda[MLX5_ST_SZ_DW(mnvda_reg)] = {}; + bool per_pf_support; + void *cap, *data; + int err; + + err = mlx5_nv_param_read_global_pci_cap(dev, mnvda, sizeof(mnvda)); + if (err) { + NL_SET_ERR_MSG_MOD(extack, + "Failed to read global PCI capability"); + return err; + } + + cap = MLX5_ADDR_OF(mnvda_reg, mnvda, configuration_item_data); + per_pf_support = MLX5_GET(nv_global_pci_cap, cap, + 
per_pf_total_vf_supported); + + if (!MLX5_GET(nv_global_pci_cap, cap, sriov_support)) { + NL_SET_ERR_MSG_MOD(extack, + "SRIOV is not supported on this device"); + return -EOPNOTSUPP; + } + + if (!per_pf_support) { + /* We don't allow global SRIOV setting on per PF devlink */ + NL_SET_ERR_MSG_MOD(extack, + "SRIOV is not per PF on this device"); + return -EOPNOTSUPP; + } + + memset(mnvda, 0, sizeof(mnvda)); + err = mlx5_nv_param_read_global_pci_conf(dev, mnvda, sizeof(mnvda)); + if (err) { + NL_SET_ERR_MSG_MOD(extack, + "Unable to read global PCI configuration"); + return err; + } + + data = MLX5_ADDR_OF(mnvda_reg, mnvda, configuration_item_data); + + /* setup per PF sriov mode */ + MLX5_SET(nv_global_pci_conf, data, sriov_valid, 1); + MLX5_SET(nv_global_pci_conf, data, sriov_en, 1); + MLX5_SET(nv_global_pci_conf, data, per_pf_total_vf, 1); + + err = mlx5_nv_param_write(dev, mnvda, sizeof(mnvda)); + if (err) { + NL_SET_ERR_MSG_MOD(extack, + "Unable to write global PCI configuration"); + return err; + } + + /* enable/disable sriov on this PF */ + memset(mnvda, 0, sizeof(mnvda)); + err = mlx5_nv_param_read_per_host_pf_conf(dev, mnvda, sizeof(mnvda)); + if (err) { + NL_SET_ERR_MSG_MOD(extack, + "Unable to read per host PF configuration"); + return err; + } + MLX5_SET(nv_pf_pci_conf, data, pf_total_vf_en, ctx->val.vbool); + return mlx5_nv_param_write(dev, mnvda, sizeof(mnvda)); +} + +static int mlx5_devlink_total_vfs_get(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + u32 mnvda[MLX5_ST_SZ_DW(mnvda_reg)] = {}; + void *data; + int err; + + data = MLX5_ADDR_OF(mnvda_reg, mnvda, configuration_item_data); + + err = mlx5_nv_param_read_global_pci_cap(dev, mnvda, sizeof(mnvda)); + if (err) + return err; + + if (!MLX5_GET(nv_global_pci_cap, data, sriov_support)) { + ctx->val.vu32 = 0; + return 0; + } + + memset(mnvda, 0, sizeof(mnvda)); + err = mlx5_nv_param_read_global_pci_conf(dev, mnvda, 
sizeof(mnvda)); + if (err) + return err; + + if (!MLX5_GET(nv_global_pci_conf, data, per_pf_total_vf)) { + ctx->val.vu32 = MLX5_GET(nv_global_pci_conf, data, total_vfs); + return 0; + } + + /* SRIOV is per PF */ + memset(mnvda, 0, sizeof(mnvda)); + err = mlx5_nv_param_read_per_host_pf_conf(dev, mnvda, sizeof(mnvda)); + if (err) + return err; + + ctx->val.vu32 = MLX5_GET(nv_pf_pci_conf, data, total_vf); + + return 0; +} + +static int mlx5_devlink_total_vfs_set(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx, + struct netlink_ext_ack *extack) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + u32 mnvda[MLX5_ST_SZ_DW(mnvda_reg)]; + void *data; + int err; + + err = mlx5_nv_param_read_global_pci_cap(dev, mnvda, sizeof(mnvda)); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Failed to read global pci cap"); + return err; + } + + data = MLX5_ADDR_OF(mnvda_reg, mnvda, configuration_item_data); + if (!MLX5_GET(nv_global_pci_cap, data, sriov_support)) { + NL_SET_ERR_MSG_MOD(extack, "Not configurable on this device"); + return -EOPNOTSUPP; + } + + if (!MLX5_GET(nv_global_pci_cap, data, per_pf_total_vf_supported)) { + /* We don't allow global SRIOV setting on per PF devlink */ + NL_SET_ERR_MSG_MOD(extack, + "SRIOV is not per PF on this device"); + return -EOPNOTSUPP; + } + + memset(mnvda, 0, sizeof(mnvda)); + err = mlx5_nv_param_read_global_pci_conf(dev, mnvda, sizeof(mnvda)); + if (err) + return err; + + MLX5_SET(nv_global_pci_conf, data, sriov_valid, 1); + MLX5_SET(nv_global_pci_conf, data, per_pf_total_vf, 1); + + err = mlx5_nv_param_write(dev, mnvda, sizeof(mnvda)); + if (err) + return err; + + memset(mnvda, 0, sizeof(mnvda)); + err = mlx5_nv_param_read_per_host_pf_conf(dev, mnvda, sizeof(mnvda)); + if (err) + return err; + + data = MLX5_ADDR_OF(mnvda_reg, mnvda, configuration_item_data); + MLX5_SET(nv_pf_pci_conf, data, total_vf, ctx->val.vu32); + return mlx5_nv_param_write(dev, mnvda, sizeof(mnvda)); +} + +static int 
mlx5_devlink_total_vfs_validate(struct devlink *devlink, u32 id, + union devlink_param_value val, + struct netlink_ext_ack *extack) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + u32 cap[MLX5_ST_SZ_DW(mnvda_reg)]; + void *data; + u16 max; + int err; + + data = MLX5_ADDR_OF(mnvda_reg, cap, configuration_item_data); + + err = mlx5_nv_param_read_global_pci_cap(dev, cap, sizeof(cap)); + if (err) + return err; + + if (!MLX5_GET(nv_global_pci_cap, data, max_vfs_per_pf_valid)) + return 0; /* optimistic, but set might fail later */ + + max = MLX5_GET(nv_global_pci_cap, data, max_vfs_per_pf); + if (val.vu16 > max) { + NL_SET_ERR_MSG_FMT_MOD(extack, + "Max allowed by device is %u", max); + return -EINVAL; + } + + return 0; +} + +static const struct devlink_param mlx5_nv_param_devlink_params[] = { + DEVLINK_PARAM_GENERIC(ENABLE_SRIOV, BIT(DEVLINK_PARAM_CMODE_PERMANENT), + mlx5_devlink_enable_sriov_get, + mlx5_devlink_enable_sriov_set, NULL), + DEVLINK_PARAM_GENERIC(TOTAL_VFS, BIT(DEVLINK_PARAM_CMODE_PERMANENT), + mlx5_devlink_total_vfs_get, + mlx5_devlink_total_vfs_set, + mlx5_devlink_total_vfs_validate), + DEVLINK_PARAM_DRIVER(MLX5_DEVLINK_PARAM_ID_CQE_COMPRESSION_TYPE, + "cqe_compress_type", DEVLINK_PARAM_TYPE_STRING, + BIT(DEVLINK_PARAM_CMODE_PERMANENT), + mlx5_nv_param_devlink_cqe_compress_get, + mlx5_nv_param_devlink_cqe_compress_set, + mlx5_nv_param_devlink_cqe_compress_validate), +}; + +int mlx5_nv_param_register_dl_params(struct devlink *devlink) +{ + if (!mlx5_core_is_pf(devlink_priv(devlink))) + return 0; + + return devl_params_register(devlink, mlx5_nv_param_devlink_params, + ARRAY_SIZE(mlx5_nv_param_devlink_params)); +} + +void mlx5_nv_param_unregister_dl_params(struct devlink *devlink) +{ + if (!mlx5_core_is_pf(devlink_priv(devlink))) + return; + + devl_params_unregister(devlink, mlx5_nv_param_devlink_params, + ARRAY_SIZE(mlx5_nv_param_devlink_params)); +} + diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/nv_param.h 
b/drivers/net/ethernet/mellanox/mlx5/core/lib/nv_param.h new file mode 100644 index 0000000000..9f4922ff77 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/nv_param.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ + +#ifndef __MLX5_NV_PARAM_H +#define __MLX5_NV_PARAM_H + +#include +#include "devlink.h" + +int mlx5_nv_param_register_dl_params(struct devlink *devlink); +void mlx5_nv_param_unregister_dl_params(struct devlink *devlink); + +#endif + diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c index eeb0b7ea05..8e17daae48 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c @@ -210,15 +210,19 @@ static void sd_cleanup(struct mlx5_core_dev *dev) static int sd_register(struct mlx5_core_dev *dev) { struct mlx5_devcom_comp_dev *devcom, *pos; + struct mlx5_devcom_match_attr attr = {}; struct mlx5_core_dev *peer, *primary; struct mlx5_sd *sd, *primary_sd; int err, i; sd = mlx5_get_sd(dev); + attr.key.val = sd->group_id; + attr.flags = MLX5_DEVCOM_MATCH_FLAGS_NS; + attr.net = mlx5_core_net(dev); devcom = mlx5_devcom_register_component(dev->priv.devc, MLX5_DEVCOM_SD_GROUP, - sd->group_id, NULL, dev); - if (IS_ERR(devcom)) - return PTR_ERR(devcom); + &attr, NULL, dev); + if (!devcom) + return -EINVAL; sd->devcom = devcom; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c new file mode 100644 index 0000000000..ef06fe6cbb --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c @@ -0,0 +1,185 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved + */ + +#include +#include + +#include "mlx5_core.h" +#include "lib/mlx5.h" + +struct mlx5_st_idx_data { + refcount_t usecount; + u16 tag; +}; + +struct mlx5_st { + /* serialize access upon alloc/free flows */ + struct mutex lock; + struct xa_limit index_limit; + struct xarray idx_xa; /* key == index, value == struct mlx5_st_idx_data */ + u8 direct_mode : 1; +}; + +struct mlx5_st *mlx5_st_create(struct mlx5_core_dev *dev) +{ + struct pci_dev *pdev = dev->pdev; + struct mlx5_st *st; + u8 direct_mode = 0; + u16 num_entries; + u32 tbl_loc; + int ret; + + if (!MLX5_CAP_GEN(dev, mkey_pcie_tph)) + return NULL; + +#ifdef CONFIG_MLX5_SF + if (mlx5_core_is_sf(dev)) + return dev->priv.parent_mdev->st; +#endif + + /* Checking whether the device is capable */ + if (!pdev->tph_cap) + return NULL; + + tbl_loc = pcie_tph_get_st_table_loc(pdev); + if (tbl_loc == PCI_TPH_LOC_NONE) + direct_mode = 1; + + if (!direct_mode) { + num_entries = pcie_tph_get_st_table_size(pdev); + /* We need a reserved entry for non TPH cases */ + if (num_entries < 2) + return NULL; + } + + /* The OS doesn't support ST */ + ret = pcie_enable_tph(pdev, PCI_TPH_ST_DS_MODE); + if (ret) + return NULL; + + st = kzalloc(sizeof(*st), GFP_KERNEL); + if (!st) + goto end; + + mutex_init(&st->lock); + xa_init_flags(&st->idx_xa, XA_FLAGS_ALLOC); + st->direct_mode = direct_mode; + if (st->direct_mode) + return st; + + /* entry 0 is reserved for non TPH cases */ + st->index_limit.min = MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX + 1; + st->index_limit.max = num_entries - 1; + + return st; + +end: + pcie_disable_tph(dev->pdev); + return NULL; +} + +void mlx5_st_destroy(struct mlx5_core_dev *dev) +{ + struct mlx5_st *st = dev->st; + + if (mlx5_core_is_sf(dev) || !st) + return; + + pcie_disable_tph(dev->pdev); + WARN_ON_ONCE(!xa_empty(&st->idx_xa)); + kfree(st); +} + +int mlx5_st_alloc_index(struct mlx5_core_dev *dev, enum tph_mem_type mem_type, + unsigned int cpu_uid, u16 *st_index) +{ + struct 
mlx5_st_idx_data *idx_data; + struct mlx5_st *st = dev->st; + unsigned long index; + u32 xa_id; + u16 tag; + int ret; + + if (!st) + return -EOPNOTSUPP; + + ret = pcie_tph_get_cpu_st(dev->pdev, mem_type, cpu_uid, &tag); + if (ret) + return ret; + + if (st->direct_mode) { + *st_index = tag; + return 0; + } + + mutex_lock(&st->lock); + + xa_for_each(&st->idx_xa, index, idx_data) { + if (tag == idx_data->tag) { + refcount_inc(&idx_data->usecount); + *st_index = index; + goto end; + } + } + + idx_data = kzalloc(sizeof(*idx_data), GFP_KERNEL); + if (!idx_data) { + ret = -ENOMEM; + goto end; + } + + refcount_set(&idx_data->usecount, 1); + idx_data->tag = tag; + + ret = xa_alloc(&st->idx_xa, &xa_id, idx_data, st->index_limit, GFP_KERNEL); + if (ret) + goto clean_idx_data; + + ret = pcie_tph_set_st_entry(dev->pdev, xa_id, tag); + if (ret) + goto clean_idx_xa; + + *st_index = xa_id; + goto end; + +clean_idx_xa: + xa_erase(&st->idx_xa, xa_id); +clean_idx_data: + kfree(idx_data); +end: + mutex_unlock(&st->lock); + return ret; +} +EXPORT_SYMBOL_GPL(mlx5_st_alloc_index); + +int mlx5_st_dealloc_index(struct mlx5_core_dev *dev, u16 st_index) +{ + struct mlx5_st_idx_data *idx_data; + struct mlx5_st *st = dev->st; + int ret = 0; + + if (!st) + return -EOPNOTSUPP; + + if (st->direct_mode) + return 0; + + mutex_lock(&st->lock); + idx_data = xa_load(&st->idx_xa, st_index); + if (WARN_ON_ONCE(!idx_data)) { + ret = -EINVAL; + goto end; + } + + if (refcount_dec_and_test(&idx_data->usecount)) { + xa_erase(&st->idx_xa, st_index); + /* We leave PCI config space as was before, no mkey will refer to it */ + } + +end: + mutex_unlock(&st->lock); + return ret; +} +EXPORT_SYMBOL_GPL(mlx5_st_dealloc_index); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c index d55e15c1f3..304912637c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c @@ -149,7 +149,7 @@ 
struct mlx5_vxlan *mlx5_vxlan_create(struct mlx5_core_dev *mdev) struct mlx5_vxlan *vxlan; if (!MLX5_CAP_ETH(mdev, tunnel_stateless_vxlan) || !mlx5_core_is_pf(mdev)) - return ERR_PTR(-ENOTSUPP); + return ERR_PTR(-EOPNOTSUPP); vxlan = kzalloc(sizeof(*vxlan), GFP_KERNEL); if (!vxlan) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 250f7005e7..622bc2c5c6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -553,6 +553,7 @@ EXPORT_SYMBOL(mlx5_is_roce_on); static int handle_hca_cap_2(struct mlx5_core_dev *dev, void *set_ctx) { + bool do_set = false; void *set_hca_cap; int err; @@ -563,17 +564,27 @@ static int handle_hca_cap_2(struct mlx5_core_dev *dev, void *set_ctx) if (err) return err; - if (!MLX5_CAP_GEN_2_MAX(dev, sw_vhca_id_valid) || - !(dev->priv.sw_vhca_id > 0)) - return 0; - set_hca_cap = MLX5_ADDR_OF(set_hca_cap_in, set_ctx, capability); memcpy(set_hca_cap, dev->caps.hca[MLX5_CAP_GENERAL_2]->cur, MLX5_ST_SZ_BYTES(cmd_hca_cap_2)); - MLX5_SET(cmd_hca_cap_2, set_hca_cap, sw_vhca_id_valid, 1); - return set_caps(dev, set_ctx, MLX5_CAP_GENERAL_2); + if (MLX5_CAP_GEN_2_MAX(dev, sw_vhca_id_valid) && + dev->priv.sw_vhca_id > 0) { + MLX5_SET(cmd_hca_cap_2, set_hca_cap, sw_vhca_id_valid, 1); + do_set = true; + } + + if (MLX5_CAP_GEN_2_MAX(dev, lag_per_mp_group)) { + MLX5_SET(cmd_hca_cap_2, set_hca_cap, lag_per_mp_group, 1); + do_set = true; + } + + /* some FW versions that support querying MLX5_CAP_GENERAL_2 + * capabilities but don't support setting them. + * Skip unnecessary update to hca_cap_2 when no changes were introduced + */ + return do_set ? 
set_caps(dev, set_ctx, MLX5_CAP_GENERAL_2) : 0; } static int handle_hca_cap(struct mlx5_core_dev *dev, void *set_ctx) @@ -973,36 +984,13 @@ static void mlx5_pci_close(struct mlx5_core_dev *dev) mlx5_pci_disable_device(dev); } -static void mlx5_register_hca_devcom_comp(struct mlx5_core_dev *dev) -{ - /* This component is use to sync adding core_dev to lag_dev and to sync - * changes of mlx5_adev_devices between LAG layer and other layers. - */ - if (!mlx5_lag_is_supported(dev)) - return; - - dev->priv.hca_devcom_comp = - mlx5_devcom_register_component(dev->priv.devc, MLX5_DEVCOM_HCA_PORTS, - mlx5_query_nic_system_image_guid(dev), - NULL, dev); - if (IS_ERR(dev->priv.hca_devcom_comp)) - mlx5_core_err(dev, "Failed to register devcom HCA component\n"); -} - -static void mlx5_unregister_hca_devcom_comp(struct mlx5_core_dev *dev) -{ - mlx5_devcom_unregister_component(dev->priv.hca_devcom_comp); -} - static int mlx5_init_once(struct mlx5_core_dev *dev) { int err; dev->priv.devc = mlx5_devcom_register_device(dev); - if (IS_ERR(dev->priv.devc)) - mlx5_core_warn(dev, "failed to register devcom device %ld\n", - PTR_ERR(dev->priv.devc)); - mlx5_register_hca_devcom_comp(dev); + if (!dev->priv.devc) + mlx5_core_warn(dev, "failed to register devcom device\n"); err = mlx5_query_board_id(dev); if (err) { @@ -1022,16 +1010,10 @@ static int mlx5_init_once(struct mlx5_core_dev *dev) goto err_irq_cleanup; } - err = mlx5_events_init(dev); - if (err) { - mlx5_core_err(dev, "failed to initialize events\n"); - goto err_eq_cleanup; - } - err = mlx5_fw_reset_init(dev); if (err) { mlx5_core_err(dev, "failed to initialize fw reset events\n"); - goto err_events_cleanup; + goto err_eq_cleanup; } mlx5_cq_debugfs_init(dev); @@ -1102,9 +1084,7 @@ static int mlx5_init_once(struct mlx5_core_dev *dev) } dev->dm = mlx5_dm_create(dev); - if (IS_ERR(dev->dm)) - mlx5_core_warn(dev, "Failed to init device memory %ld\n", PTR_ERR(dev->dm)); - + dev->st = mlx5_st_create(dev); dev->tracer = 
mlx5_fw_tracer_create(dev); dev->hv_vhca = mlx5_hv_vhca_create(dev); dev->rsc_dump = mlx5_rsc_dump_create(dev); @@ -1135,14 +1115,11 @@ err_tables_cleanup: mlx5_cleanup_reserved_gids(dev); mlx5_cq_debugfs_cleanup(dev); mlx5_fw_reset_cleanup(dev); -err_events_cleanup: - mlx5_events_cleanup(dev); err_eq_cleanup: mlx5_eq_table_cleanup(dev); err_irq_cleanup: mlx5_irq_table_cleanup(dev); err_devcom: - mlx5_unregister_hca_devcom_comp(dev); mlx5_devcom_unregister_device(dev->priv.devc); return err; @@ -1153,6 +1130,7 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev) mlx5_rsc_dump_destroy(dev); mlx5_hv_vhca_destroy(dev->hv_vhca); mlx5_fw_tracer_destroy(dev->tracer); + mlx5_st_destroy(dev); mlx5_dm_cleanup(dev); mlx5_fs_core_free(dev); mlx5_sf_table_cleanup(dev); @@ -1169,10 +1147,8 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev) mlx5_cleanup_reserved_gids(dev); mlx5_cq_debugfs_cleanup(dev); mlx5_fw_reset_cleanup(dev); - mlx5_events_cleanup(dev); mlx5_eq_table_cleanup(dev); mlx5_irq_table_cleanup(dev); - mlx5_unregister_hca_devcom_comp(dev); mlx5_devcom_unregister_device(dev->priv.devc); } @@ -1341,10 +1317,9 @@ static int mlx5_load(struct mlx5_core_dev *dev) { int err; - dev->priv.uar = mlx5_get_uars_page(dev); - if (IS_ERR(dev->priv.uar)) { - mlx5_core_err(dev, "Failed allocating uar, aborting\n"); - err = PTR_ERR(dev->priv.uar); + err = mlx5_alloc_bfreg(dev, &dev->priv.bfreg, false, false); + if (err) { + mlx5_core_err(dev, "Failed allocating bfreg, %d\n", err); return err; } @@ -1402,12 +1377,6 @@ static int mlx5_load(struct mlx5_core_dev *dev) mlx5_vhca_event_start(dev); - err = mlx5_sf_hw_table_create(dev); - if (err) { - mlx5_core_err(dev, "sf table create failed %d\n", err); - goto err_vhca; - } - err = mlx5_ec_init(dev); if (err) { mlx5_core_err(dev, "Failed to init embedded CPU\n"); @@ -1436,8 +1405,6 @@ err_sriov: mlx5_lag_remove_mdev(dev); mlx5_ec_cleanup(dev); err_ec: - mlx5_sf_hw_table_destroy(dev); -err_vhca: mlx5_vhca_event_stop(dev); 
err_set_hca: mlx5_fs_core_cleanup(dev); @@ -1455,7 +1422,7 @@ err_eq_table: err_irq_table: mlx5_pagealloc_stop(dev); mlx5_events_stop(dev); - mlx5_put_uars_page(dev, dev->priv.uar); + mlx5_free_bfreg(dev, &dev->priv.bfreg); return err; } @@ -1463,12 +1430,12 @@ static void mlx5_unload(struct mlx5_core_dev *dev) { mlx5_eswitch_disable(dev->priv.eswitch); mlx5_devlink_traps_unregister(priv_to_devlink(dev)); + mlx5_vhca_event_stop(dev); mlx5_sf_dev_table_destroy(dev); mlx5_sriov_detach(dev); mlx5_lag_remove_mdev(dev); mlx5_ec_cleanup(dev); mlx5_sf_hw_table_destroy(dev); - mlx5_vhca_event_stop(dev); mlx5_fs_core_cleanup(dev); mlx5_fpga_device_stop(dev); mlx5_rsc_dump_cleanup(dev); @@ -1480,7 +1447,7 @@ static void mlx5_unload(struct mlx5_core_dev *dev) mlx5_irq_table_destroy(dev); mlx5_pagealloc_stop(dev); mlx5_events_stop(dev); - mlx5_put_uars_page(dev, dev->priv.uar); + mlx5_free_bfreg(dev, &dev->priv.bfreg); } int mlx5_init_one_devl_locked(struct mlx5_core_dev *dev) @@ -1799,6 +1766,7 @@ static const int types[] = { MLX5_CAP_VDPA_EMULATION, MLX5_CAP_IPSEC, MLX5_CAP_PORT_SELECTION, + MLX5_CAP_PSP, MLX5_CAP_MACSEC, MLX5_CAP_ADV_VIRTUALIZATION, MLX5_CAP_CRYPTO, @@ -1838,15 +1806,49 @@ err: return -ENOMEM; } -static int vhca_id_show(struct seq_file *file, void *priv) +static int mlx5_notifiers_init(struct mlx5_core_dev *dev) { - struct mlx5_core_dev *dev = file->private; + int err; + + err = mlx5_events_init(dev); + if (err) { + mlx5_core_err(dev, "failed to initialize events\n"); + return err; + } + + BLOCKING_INIT_NOTIFIER_HEAD(&dev->priv.esw_n_head); + mlx5_vhca_state_notifier_init(dev); + + err = mlx5_sf_hw_notifier_init(dev); + if (err) + goto err_sf_hw_notifier; + + err = mlx5_sf_notifiers_init(dev); + if (err) + goto err_sf_notifiers; + + err = mlx5_sf_dev_notifier_init(dev); + if (err) + goto err_sf_dev_notifier; - seq_printf(file, "0x%x\n", MLX5_CAP_GEN(dev, vhca_id)); return 0; + +err_sf_dev_notifier: + mlx5_sf_notifiers_cleanup(dev); +err_sf_notifiers: + 
mlx5_sf_hw_notifier_cleanup(dev); +err_sf_hw_notifier: + mlx5_events_cleanup(dev); + return err; } -DEFINE_SHOW_ATTRIBUTE(vhca_id); +static void mlx5_notifiers_cleanup(struct mlx5_core_dev *dev) +{ + mlx5_sf_dev_notifier_cleanup(dev); + mlx5_sf_notifiers_cleanup(dev); + mlx5_sf_hw_notifier_cleanup(dev); + mlx5_events_cleanup(dev); +} int mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx) { @@ -1872,7 +1874,7 @@ int mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx) priv->numa_node = dev_to_node(mlx5_core_dma_dev(dev)); priv->dbg.dbg_root = debugfs_create_dir(dev_name(dev->device), mlx5_debugfs_root); - debugfs_create_file("vhca_id", 0400, priv->dbg.dbg_root, dev, &vhca_id_fops); + INIT_LIST_HEAD(&priv->traps); err = mlx5_cmd_init(dev); @@ -1903,6 +1905,10 @@ int mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx) if (err) goto err_hca_caps; + err = mlx5_notifiers_init(dev); + if (err) + goto err_notifiers_init; + /* The conjunction of sw_vhca_id with sw_owner_id will be a global * unique id per function which uses mlx5_core. 
* Those values are supplied to FW as part of the init HCA command to @@ -1917,6 +1923,8 @@ int mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx) return 0; +err_notifiers_init: + mlx5_hca_caps_free(dev); err_hca_caps: mlx5_adev_cleanup(dev); err_adev_init: @@ -1945,6 +1953,7 @@ void mlx5_mdev_uninit(struct mlx5_core_dev *dev) if (priv->sw_vhca_id > 0) ida_free(&sw_vhca_ida, dev->priv.sw_vhca_id); + mlx5_notifiers_cleanup(dev); mlx5_hca_caps_free(dev); mlx5_adev_cleanup(dev); mlx5_pagealloc_cleanup(dev); @@ -2005,6 +2014,8 @@ static int probe_one(struct pci_dev *pdev, const struct pci_device_id *id) goto err_init_one; } + mlx5_vhca_debugfs_init(dev); + pci_save_state(pdev); return 0; @@ -2215,6 +2226,7 @@ static void shutdown(struct pci_dev *pdev) mlx5_core_info(dev, "Shutdown was called\n"); set_bit(MLX5_BREAK_FW_WAIT, &dev->intf_state); + mlx5_drain_fw_reset(dev); mlx5_drain_health_wq(dev); err = mlx5_try_fast_unload(dev); if (err) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index 2e02bdea83..f2d74382fb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -258,6 +258,7 @@ int mlx5_wait_for_pages(struct mlx5_core_dev *dev, int *pages); void mlx5_cmd_flush(struct mlx5_core_dev *dev); void mlx5_cq_debugfs_init(struct mlx5_core_dev *dev); void mlx5_cq_debugfs_cleanup(struct mlx5_core_dev *dev); +void mlx5_vhca_debugfs_init(struct mlx5_core_dev *dev); int mlx5_query_pcam_reg(struct mlx5_core_dev *dev, u32 *pcam, u8 feature_group, u8 access_reg_group); @@ -290,6 +291,7 @@ int mlx5_register_device(struct mlx5_core_dev *dev); void mlx5_unregister_device(struct mlx5_core_dev *dev); void mlx5_dev_set_lightweight(struct mlx5_core_dev *dev); bool mlx5_dev_is_lightweight(struct mlx5_core_dev *dev); +void mlx5_core_reps_aux_devs_remove(struct mlx5_core_dev *dev); void mlx5_fw_reporters_create(struct mlx5_core_dev *dev); int 
mlx5_query_mtpps(struct mlx5_core_dev *dev, u32 *mtpps, u32 mtpps_size); @@ -300,6 +302,15 @@ int mlx5_set_mtppse(struct mlx5_core_dev *mdev, u8 pin, u8 arm, u8 mode); struct mlx5_dm *mlx5_dm_create(struct mlx5_core_dev *dev); void mlx5_dm_cleanup(struct mlx5_core_dev *dev); +#ifdef CONFIG_PCIE_TPH +struct mlx5_st *mlx5_st_create(struct mlx5_core_dev *dev); +void mlx5_st_destroy(struct mlx5_core_dev *dev); +#else +static inline struct mlx5_st * +mlx5_st_create(struct mlx5_core_dev *dev) { return NULL; } +static inline void mlx5_st_destroy(struct mlx5_core_dev *dev) { return; } +#endif + void mlx5_toggle_port_link(struct mlx5_core_dev *dev); int mlx5_set_port_admin_status(struct mlx5_core_dev *dev, enum mlx5_port_status status); @@ -348,16 +359,18 @@ int mlx5_set_port_fcs(struct mlx5_core_dev *mdev, u8 enable); void mlx5_query_port_fcs(struct mlx5_core_dev *mdev, bool *supported, bool *enabled); int mlx5_query_module_eeprom(struct mlx5_core_dev *dev, - u16 offset, u16 size, u8 *data); + u16 offset, u16 size, u8 *data, u8 *status); int mlx5_query_module_eeprom_by_page(struct mlx5_core_dev *dev, struct mlx5_module_eeprom_query_params *params, - u8 *data); + u8 *data, u8 *status); int mlx5_query_port_dcbx_param(struct mlx5_core_dev *mdev, u32 *out); int mlx5_set_port_dcbx_param(struct mlx5_core_dev *mdev, u32 *in); int mlx5_set_trust_state(struct mlx5_core_dev *mdev, u8 trust_state); int mlx5_query_trust_state(struct mlx5_core_dev *mdev, u8 *trust_state); +int mlx5_query_port_buffer_ownership(struct mlx5_core_dev *mdev, + u8 *buffer_ownership); int mlx5_set_dscp2prio(struct mlx5_core_dev *mdev, u8 dscp, u8 prio); int mlx5_query_dscp2prio(struct mlx5_core_dev *mdev, u8 *dscp2prio); @@ -433,13 +446,13 @@ int mlx5_init_one_light(struct mlx5_core_dev *dev); void mlx5_uninit_one_light(struct mlx5_core_dev *dev); void mlx5_unload_one_light(struct mlx5_core_dev *dev); +void mlx5_query_nic_sw_system_image_guid(struct mlx5_core_dev *mdev, u8 *buf, + u8 *len); int 
mlx5_vport_set_other_func_cap(struct mlx5_core_dev *dev, const void *hca_cap, u16 vport, u16 opmod); #define mlx5_vport_get_other_func_general_cap(dev, vport, out) \ mlx5_vport_get_other_func_cap(dev, vport, out, MLX5_CAP_GENERAL) -int mlx5_vport_get_vhca_id(struct mlx5_core_dev *dev, u16 vport, u16 *vhca_id); - static inline u32 mlx5_sriov_get_vf_total_msix(struct pci_dev *pdev) { struct mlx5_core_dev *dev = pci_get_drvdata(pdev); @@ -495,4 +508,17 @@ static inline int mlx5_max_eq_cap_get(const struct mlx5_core_dev *dev) return 1 << MLX5_CAP_GEN(dev, log_max_eq); } + +static inline bool mlx5_pcie_cong_event_supported(struct mlx5_core_dev *dev) +{ + u64 features = MLX5_CAP_GEN_2_64(dev, general_obj_types_127_64); + + if (!(features & MLX5_HCA_CAP_2_GENERAL_OBJECT_TYPES_PCIE_CONG_EVENT)) + return false; + + if (dev->sd) + return false; + + return true; +} #endif /* __MLX5_CORE_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c index 972e8e9df5..cd68c4b2c0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c @@ -291,7 +291,7 @@ static void free_4k(struct mlx5_core_dev *dev, u64 addr, u32 function) static int alloc_system_page(struct mlx5_core_dev *dev, u32 function) { struct device *device = mlx5_core_dma_dev(dev); - int nid = dev_to_node(device); + int nid = dev->priv.numa_node; struct page *page; u64 zero_addr = 1; u64 addr; @@ -489,9 +489,12 @@ static int reclaim_pages_cmd(struct mlx5_core_dev *dev, u32 func_id; u32 npages; u32 i = 0; + int err; - if (!mlx5_cmd_is_down(dev)) - return mlx5_cmd_do(dev, in, in_size, out, out_size); + err = mlx5_cmd_do(dev, in, in_size, out, out_size); + /* If FW is gone (-ENXIO), proceed to forceful reclaim */ + if (err != -ENXIO) + return err; /* No hard feelings, we want our pages back! 
*/ npages = MLX5_GET(manage_pages_in, in, input_num_entries); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c index 2c5f850c31..aa3b5878e3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c @@ -54,7 +54,7 @@ static int mlx5_core_func_to_vport(const struct mlx5_core_dev *dev, /** * mlx5_get_default_msix_vec_count - Get the default number of MSI-X vectors - * to be ssigned to each VF. + * to be assigned to each VF. * @dev: PF to work on * @num_vfs: Number of enabled VFs */ @@ -148,7 +148,7 @@ out: * Free the IRQ and other resources such as rmap from the system. * BUT doesn't free or remove reference from mlx5. * This function is very important for the shutdown flow, where we need to - * cleanup system resoruces but keep mlx5 objects alive, + * cleanup system resources but keep mlx5 objects alive, * see mlx5_irq_table_free_irqs(). */ static void mlx5_system_free_irq(struct mlx5_irq *irq) @@ -324,10 +324,8 @@ err_xa: free_irq(irq->map.virq, &irq->nh); err_req_irq: #ifdef CONFIG_RFS_ACCEL - if (i && rmap && *rmap) { - free_irq_cpu_rmap(*rmap); - *rmap = NULL; - } + if (i && rmap && *rmap) + irq_cpu_rmap_remove(*rmap, irq->map.virq); err_irq_rmap: #endif if (i && pci_msix_can_alloc_dyn(dev->pdev)) @@ -470,26 +468,32 @@ void mlx5_ctrl_irq_release(struct mlx5_core_dev *dev, struct mlx5_irq *ctrl_irq) struct mlx5_irq *mlx5_ctrl_irq_request(struct mlx5_core_dev *dev) { struct mlx5_irq_pool *pool = ctrl_irq_pool_get(dev); - struct irq_affinity_desc af_desc; + struct irq_affinity_desc *af_desc; struct mlx5_irq *irq; - cpumask_copy(&af_desc.mask, cpu_online_mask); - af_desc.is_managed = false; + af_desc = kvzalloc(sizeof(*af_desc), GFP_KERNEL); + if (!af_desc) + return ERR_PTR(-ENOMEM); + + cpumask_copy(&af_desc->mask, cpu_online_mask); + af_desc->is_managed = false; if (!mlx5_irq_pool_is_sf_pool(pool)) { /* In case we are allocating a control 
IRQ from a pci device's pool. * This can happen also for a SF if the SFs pool is empty. */ if (!pool->xa_num_irqs.max) { - cpumask_clear(&af_desc.mask); + cpumask_clear(&af_desc->mask); /* In case we only have a single IRQ for PF/VF */ - cpumask_set_cpu(cpumask_first(cpu_online_mask), &af_desc.mask); + cpumask_set_cpu(cpumask_first(cpu_online_mask), &af_desc->mask); } /* Allocate the IRQ in index 0. The vector was already allocated */ - irq = irq_pool_request_vector(pool, 0, &af_desc, NULL); + irq = irq_pool_request_vector(pool, 0, af_desc, NULL); } else { - irq = mlx5_irq_affinity_request(dev, pool, &af_desc); + irq = mlx5_irq_affinity_request(dev, pool, af_desc); } + kvfree(af_desc); + return irq; } @@ -548,16 +552,26 @@ struct mlx5_irq *mlx5_irq_request_vector(struct mlx5_core_dev *dev, u16 cpu, { struct mlx5_irq_table *table = mlx5_irq_table_get(dev); struct mlx5_irq_pool *pool = table->pcif_pool; - struct irq_affinity_desc af_desc; int offset = MLX5_IRQ_VEC_COMP_BASE; + struct irq_affinity_desc *af_desc; + struct mlx5_irq *irq; + + af_desc = kvzalloc(sizeof(*af_desc), GFP_KERNEL); + if (!af_desc) + return ERR_PTR(-ENOMEM); if (!pool->xa_num_irqs.max) offset = 0; - af_desc.is_managed = false; - cpumask_clear(&af_desc.mask); - cpumask_set_cpu(cpu, &af_desc.mask); - return mlx5_irq_request(dev, vecidx + offset, &af_desc, rmap); + af_desc->is_managed = false; + cpumask_clear(&af_desc->mask); + cpumask_set_cpu(cpu, &af_desc->mask); + + irq = mlx5_irq_request(dev, vecidx + offset, af_desc, rmap); + + kvfree(af_desc); + + return irq; } static struct mlx5_irq_pool * @@ -588,7 +602,7 @@ static void irq_pool_free(struct mlx5_irq_pool *pool) struct mlx5_irq *irq; unsigned long index; - /* There are cases in which we are destrying the irq_table before + /* There are cases in which we are destroying the irq_table before * freeing all the IRQs, fast teardown for example. Hence, free the irqs * which might not have been freed. 
*/ @@ -617,7 +631,7 @@ static int irq_pools_init(struct mlx5_core_dev *dev, int sf_vec, int pcif_vec, if (!mlx5_sf_max_functions(dev)) return 0; if (sf_vec < MLX5_IRQ_VEC_COMP_BASE_SF) { - mlx5_core_dbg(dev, "Not enught IRQs for SFs. SF may run at lower performance\n"); + mlx5_core_dbg(dev, "Not enough IRQs for SFs. SF may run at lower performance\n"); return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c index 7b99e08a79..959b568c4d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -289,11 +289,11 @@ int mlx5_query_module_num(struct mlx5_core_dev *dev, int *module_num) } static int mlx5_query_module_id(struct mlx5_core_dev *dev, int module_num, - u8 *module_id) + u8 *module_id, u8 *status) { u32 in[MLX5_ST_SZ_DW(mcia_reg)] = {}; u32 out[MLX5_ST_SZ_DW(mcia_reg)]; - int err, status; + int err; u8 *ptr; MLX5_SET(mcia_reg, in, i2c_device_address, MLX5_I2C_ADDR_LOW); @@ -308,12 +308,12 @@ static int mlx5_query_module_id(struct mlx5_core_dev *dev, int module_num, if (err) return err; - status = MLX5_GET(mcia_reg, out, status); - if (status) { - mlx5_core_err(dev, "query_mcia_reg failed: status: 0x%x\n", - status); + if (MLX5_GET(mcia_reg, out, status)) { + if (status) + *status = MLX5_GET(mcia_reg, out, status); return -EIO; } + ptr = MLX5_ADDR_OF(mcia_reg, out, dword_0); *module_id = ptr[0]; @@ -370,13 +370,14 @@ static int mlx5_mcia_max_bytes(struct mlx5_core_dev *dev) } static int mlx5_query_mcia(struct mlx5_core_dev *dev, - struct mlx5_module_eeprom_query_params *params, u8 *data) + struct mlx5_module_eeprom_query_params *params, + u8 *data, u8 *status) { u32 in[MLX5_ST_SZ_DW(mcia_reg)] = {}; u32 out[MLX5_ST_SZ_DW(mcia_reg)]; - int status, err; void *ptr; u16 size; + int err; size = min_t(int, params->size, mlx5_mcia_max_bytes(dev)); @@ -392,10 +393,9 @@ static int mlx5_query_mcia(struct mlx5_core_dev *dev, if (err) return err; - 
status = MLX5_GET(mcia_reg, out, status); - if (status) { - mlx5_core_err(dev, "query_mcia_reg failed: status: 0x%x\n", - status); + if (MLX5_GET(mcia_reg, out, status)) { + if (status) + *status = MLX5_GET(mcia_reg, out, status); return -EIO; } @@ -406,7 +406,7 @@ static int mlx5_query_mcia(struct mlx5_core_dev *dev, } int mlx5_query_module_eeprom(struct mlx5_core_dev *dev, - u16 offset, u16 size, u8 *data) + u16 offset, u16 size, u8 *data, u8 *status) { struct mlx5_module_eeprom_query_params query = {0}; u8 module_id; @@ -416,7 +416,8 @@ int mlx5_query_module_eeprom(struct mlx5_core_dev *dev, if (err) return err; - err = mlx5_query_module_id(dev, query.module_number, &module_id); + err = mlx5_query_module_id(dev, query.module_number, &module_id, + status); if (err) return err; @@ -430,7 +431,8 @@ int mlx5_query_module_eeprom(struct mlx5_core_dev *dev, mlx5_qsfp_eeprom_params_set(&query.i2c_address, &query.page, &offset); break; default: - mlx5_core_err(dev, "Module ID not recognized: 0x%x\n", module_id); + mlx5_core_dbg(dev, "Module ID not recognized: 0x%x\n", + module_id); return -EINVAL; } @@ -441,12 +443,12 @@ int mlx5_query_module_eeprom(struct mlx5_core_dev *dev, query.size = size; query.offset = offset; - return mlx5_query_mcia(dev, &query, data); + return mlx5_query_mcia(dev, &query, data, status); } int mlx5_query_module_eeprom_by_page(struct mlx5_core_dev *dev, struct mlx5_module_eeprom_query_params *params, - u8 *data) + u8 *data, u8 *status) { int err; @@ -460,7 +462,7 @@ int mlx5_query_module_eeprom_by_page(struct mlx5_core_dev *dev, return -EINVAL; } - return mlx5_query_mcia(dev, params, data); + return mlx5_query_mcia(dev, params, data, status); } static int mlx5_query_port_pvlc(struct mlx5_core_dev *dev, u32 *pvlc, @@ -968,6 +970,26 @@ int mlx5_query_trust_state(struct mlx5_core_dev *mdev, u8 *trust_state) return err; } +int mlx5_query_port_buffer_ownership(struct mlx5_core_dev *mdev, + u8 *buffer_ownership) +{ + u32 out[MLX5_ST_SZ_DW(pfcc_reg)] = 
{}; + int err; + + if (!MLX5_CAP_PCAM_FEATURE(mdev, buffer_ownership)) { + *buffer_ownership = MLX5_BUF_OWNERSHIP_UNKNOWN; + return 0; + } + + err = mlx5_query_pfcc_reg(mdev, out, sizeof(out)); + if (err) + return err; + + *buffer_ownership = MLX5_GET(pfcc_reg, out, buf_ownership); + + return 0; +} + int mlx5_set_dscp2prio(struct mlx5_core_dev *mdev, u8 dscp, u8 prio) { int sz = MLX5_ST_SZ_BYTES(qpdpm_reg); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c index 99219ea52c..f310bde3d1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c @@ -16,7 +16,6 @@ struct mlx5_sf_dev_table { struct xarray devices; phys_addr_t base_address; u64 sf_bar_length; - struct notifier_block nb; struct workqueue_struct *active_wq; struct work_struct work; u8 stop_active_wq:1; @@ -156,18 +155,23 @@ static void mlx5_sf_dev_del(struct mlx5_core_dev *dev, struct mlx5_sf_dev *sf_de static int mlx5_sf_dev_state_change_handler(struct notifier_block *nb, unsigned long event_code, void *data) { - struct mlx5_sf_dev_table *table = container_of(nb, struct mlx5_sf_dev_table, nb); + struct mlx5_core_dev *dev = container_of(nb, struct mlx5_core_dev, + priv.sf_dev_nb); + struct mlx5_sf_dev_table *table = dev->priv.sf_dev_table; const struct mlx5_vhca_state_event *event = data; struct mlx5_sf_dev *sf_dev; u16 max_functions; u16 sf_index; u16 base_id; - max_functions = mlx5_sf_max_functions(table->dev); + if (!table) + return 0; + + max_functions = mlx5_sf_max_functions(dev); if (!max_functions) return 0; - base_id = mlx5_sf_start_function_id(table->dev); + base_id = mlx5_sf_start_function_id(dev); if (event->function_id < base_id || event->function_id >= (base_id + max_functions)) return 0; @@ -177,19 +181,19 @@ mlx5_sf_dev_state_change_handler(struct notifier_block *nb, unsigned long event_ case MLX5_VHCA_STATE_INVALID: case MLX5_VHCA_STATE_ALLOCATED: if 
(sf_dev) - mlx5_sf_dev_del(table->dev, sf_dev, sf_index); + mlx5_sf_dev_del(dev, sf_dev, sf_index); break; case MLX5_VHCA_STATE_TEARDOWN_REQUEST: if (sf_dev) - mlx5_sf_dev_del(table->dev, sf_dev, sf_index); + mlx5_sf_dev_del(dev, sf_dev, sf_index); else - mlx5_core_err(table->dev, + mlx5_core_err(dev, "SF DEV: teardown state for invalid dev index=%d sfnum=0x%x\n", sf_index, event->sw_function_id); break; case MLX5_VHCA_STATE_ACTIVE: if (!sf_dev) - mlx5_sf_dev_add(table->dev, sf_index, event->function_id, + mlx5_sf_dev_add(dev, sf_index, event->function_id, event->sw_function_id); break; default: @@ -315,6 +319,15 @@ static void mlx5_sf_dev_destroy_active_works(struct mlx5_sf_dev_table *table) } } +int mlx5_sf_dev_notifier_init(struct mlx5_core_dev *dev) +{ + if (mlx5_core_is_sf(dev)) + return 0; + + dev->priv.sf_dev_nb.notifier_call = mlx5_sf_dev_state_change_handler; + return mlx5_vhca_event_notifier_register(dev, &dev->priv.sf_dev_nb); +} + void mlx5_sf_dev_table_create(struct mlx5_core_dev *dev) { struct mlx5_sf_dev_table *table; @@ -329,17 +342,12 @@ void mlx5_sf_dev_table_create(struct mlx5_core_dev *dev) goto table_err; } - table->nb.notifier_call = mlx5_sf_dev_state_change_handler; table->dev = dev; table->sf_bar_length = 1 << (MLX5_CAP_GEN(dev, log_min_sf_size) + 12); table->base_address = pci_resource_start(dev->pdev, 2); xa_init(&table->devices); dev->priv.sf_dev_table = table; - err = mlx5_vhca_event_notifier_register(dev, &table->nb); - if (err) - goto vhca_err; - err = mlx5_sf_dev_create_active_works(table); if (err) goto add_active_err; @@ -351,10 +359,8 @@ void mlx5_sf_dev_table_create(struct mlx5_core_dev *dev) arm_err: mlx5_sf_dev_destroy_active_works(table); -add_active_err: - mlx5_vhca_event_notifier_unregister(dev, &table->nb); mlx5_vhca_event_work_queues_flush(dev); -vhca_err: +add_active_err: kfree(table); dev->priv.sf_dev_table = NULL; table_err: @@ -372,6 +378,14 @@ static void mlx5_sf_dev_destroy_all(struct mlx5_sf_dev_table *table) } } 
+void mlx5_sf_dev_notifier_cleanup(struct mlx5_core_dev *dev) +{ + if (mlx5_core_is_sf(dev)) + return; + + mlx5_vhca_event_notifier_unregister(dev, &dev->priv.sf_dev_nb); +} + void mlx5_sf_dev_table_destroy(struct mlx5_core_dev *dev) { struct mlx5_sf_dev_table *table = dev->priv.sf_dev_table; @@ -380,8 +394,6 @@ void mlx5_sf_dev_table_destroy(struct mlx5_core_dev *dev) return; mlx5_sf_dev_destroy_active_works(table); - mlx5_vhca_event_notifier_unregister(dev, &table->nb); - mlx5_vhca_event_work_queues_flush(dev); /* Now that event handler is not running, it is safe to destroy * the sf device without race. diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.h b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.h index b99131e95e..3ab0449c77 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.h @@ -25,7 +25,9 @@ struct mlx5_sf_peer_devlink_event_ctx { int err; }; +int mlx5_sf_dev_notifier_init(struct mlx5_core_dev *dev); void mlx5_sf_dev_table_create(struct mlx5_core_dev *dev); +void mlx5_sf_dev_notifier_cleanup(struct mlx5_core_dev *dev); void mlx5_sf_dev_table_destroy(struct mlx5_core_dev *dev); int mlx5_sf_driver_register(void); @@ -35,10 +37,19 @@ bool mlx5_sf_dev_allocated(const struct mlx5_core_dev *dev); #else +static inline int mlx5_sf_dev_notifier_init(struct mlx5_core_dev *dev) +{ + return 0; +} + static inline void mlx5_sf_dev_table_create(struct mlx5_core_dev *dev) { } +static inline void mlx5_sf_dev_notifier_cleanup(struct mlx5_core_dev *dev) +{ +} + static inline void mlx5_sf_dev_table_destroy(struct mlx5_core_dev *dev) { } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/diag/dev_tracepoint.h b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/diag/dev_tracepoint.h index 7f7c9af5de..ce834680f5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/diag/dev_tracepoint.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/diag/dev_tracepoint.h @@ -28,7 +28,7 
@@ DECLARE_EVENT_CLASS(mlx5_sf_dev_template, __entry->hw_fn_id = sfdev->fn_id; __entry->sfnum = sfdev->sfnum; ), - TP_printk("(%s) sfdev=%pK aux_id=%d hw_id=0x%x sfnum=%u\n", + TP_printk("(%s) sfdev=%p aux_id=%d hw_id=0x%x sfnum=%u\n", __get_str(devname), __entry->sfdev, __entry->aux_id, __entry->hw_fn_id, __entry->sfnum) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c index b706f14865..c45540fe7d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c @@ -76,6 +76,7 @@ static int mlx5_sf_dev_probe(struct auxiliary_device *adev, const struct auxilia goto init_one_err; } + mlx5_vhca_debugfs_init(mdev); return 0; init_one_err: diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c index 0864ba625c..b82323b844 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c @@ -31,9 +31,6 @@ struct mlx5_sf_table { struct mlx5_core_dev *dev; /* To refer from notifier context. */ struct xarray function_ids; /* function id based lookup. */ struct mutex sf_state_lock; /* Serializes sf state among user cmds & vhca event handler. 
*/ - struct notifier_block esw_nb; - struct notifier_block vhca_nb; - struct notifier_block mdev_nb; }; static struct mlx5_sf * @@ -391,11 +388,16 @@ static bool mlx5_sf_state_update_check(const struct mlx5_sf *sf, u8 new_state) static int mlx5_sf_vhca_event(struct notifier_block *nb, unsigned long opcode, void *data) { - struct mlx5_sf_table *table = container_of(nb, struct mlx5_sf_table, vhca_nb); + struct mlx5_core_dev *dev = container_of(nb, struct mlx5_core_dev, + priv.sf_table_vhca_nb); + struct mlx5_sf_table *table = dev->priv.sf_table; const struct mlx5_vhca_state_event *event = data; bool update = false; struct mlx5_sf *sf; + if (!table) + return 0; + mutex_lock(&table->sf_state_lock); sf = mlx5_sf_lookup_by_function_id(table, event->function_id); if (!sf) @@ -407,7 +409,7 @@ static int mlx5_sf_vhca_event(struct notifier_block *nb, unsigned long opcode, v update = mlx5_sf_state_update_check(sf, event->new_vhca_state); if (update) sf->hw_state = event->new_vhca_state; - trace_mlx5_sf_update_state(table->dev, sf->port_index, sf->controller, + trace_mlx5_sf_update_state(dev, sf->port_index, sf->controller, sf->hw_fn_id, sf->hw_state); unlock: mutex_unlock(&table->sf_state_lock); @@ -425,12 +427,16 @@ static void mlx5_sf_del_all(struct mlx5_sf_table *table) static int mlx5_sf_esw_event(struct notifier_block *nb, unsigned long event, void *data) { - struct mlx5_sf_table *table = container_of(nb, struct mlx5_sf_table, esw_nb); + struct mlx5_core_dev *dev = container_of(nb, struct mlx5_core_dev, + priv.sf_table_esw_nb); const struct mlx5_esw_event_info *mode = data; + if (!dev->priv.sf_table) + return 0; + switch (mode->new_mode) { case MLX5_ESWITCH_LEGACY: - mlx5_sf_del_all(table); + mlx5_sf_del_all(dev->priv.sf_table); break; default: break; @@ -441,15 +447,16 @@ static int mlx5_sf_esw_event(struct notifier_block *nb, unsigned long event, voi static int mlx5_sf_mdev_event(struct notifier_block *nb, unsigned long event, void *data) { - struct mlx5_sf_table 
*table = container_of(nb, struct mlx5_sf_table, mdev_nb); + struct mlx5_core_dev *dev = container_of(nb, struct mlx5_core_dev, + priv.sf_table_mdev_nb); struct mlx5_sf_peer_devlink_event_ctx *event_ctx = data; + struct mlx5_sf_table *table = dev->priv.sf_table; int ret = NOTIFY_DONE; struct mlx5_sf *sf; - if (event != MLX5_DRIVER_EVENT_SF_PEER_DEVLINK) + if (!table || event != MLX5_DRIVER_EVENT_SF_PEER_DEVLINK) return NOTIFY_DONE; - mutex_lock(&table->sf_state_lock); sf = mlx5_sf_lookup_by_function_id(table, event_ctx->fn_id); if (!sf) @@ -464,10 +471,40 @@ out: return ret; } +int mlx5_sf_notifiers_init(struct mlx5_core_dev *dev) +{ + int err; + + if (mlx5_core_is_sf(dev)) + return 0; + + dev->priv.sf_table_esw_nb.notifier_call = mlx5_sf_esw_event; + err = mlx5_esw_event_notifier_register(dev, &dev->priv.sf_table_esw_nb); + if (err) + return err; + + dev->priv.sf_table_vhca_nb.notifier_call = mlx5_sf_vhca_event; + err = mlx5_vhca_event_notifier_register(dev, + &dev->priv.sf_table_vhca_nb); + if (err) + goto vhca_err; + + dev->priv.sf_table_mdev_nb.notifier_call = mlx5_sf_mdev_event; + err = mlx5_blocking_notifier_register(dev, &dev->priv.sf_table_mdev_nb); + if (err) + goto mdev_err; + + return 0; +mdev_err: + mlx5_vhca_event_notifier_unregister(dev, &dev->priv.sf_table_vhca_nb); +vhca_err: + mlx5_esw_event_notifier_unregister(dev, &dev->priv.sf_table_esw_nb); + return err; +} + int mlx5_sf_table_init(struct mlx5_core_dev *dev) { struct mlx5_sf_table *table; - int err; if (!mlx5_sf_table_supported(dev) || !mlx5_vhca_event_supported(dev)) return 0; @@ -480,28 +517,18 @@ int mlx5_sf_table_init(struct mlx5_core_dev *dev) table->dev = dev; xa_init(&table->function_ids); dev->priv.sf_table = table; - table->esw_nb.notifier_call = mlx5_sf_esw_event; - err = mlx5_esw_event_notifier_register(dev->priv.eswitch, &table->esw_nb); - if (err) - goto reg_err; - - table->vhca_nb.notifier_call = mlx5_sf_vhca_event; - err = mlx5_vhca_event_notifier_register(table->dev, 
&table->vhca_nb); - if (err) - goto vhca_err; - - table->mdev_nb.notifier_call = mlx5_sf_mdev_event; - mlx5_blocking_notifier_register(dev, &table->mdev_nb); return 0; +} -vhca_err: - mlx5_esw_event_notifier_unregister(dev->priv.eswitch, &table->esw_nb); -reg_err: - mutex_destroy(&table->sf_state_lock); - kfree(table); - dev->priv.sf_table = NULL; - return err; +void mlx5_sf_notifiers_cleanup(struct mlx5_core_dev *dev) +{ + if (mlx5_core_is_sf(dev)) + return; + + mlx5_blocking_notifier_unregister(dev, &dev->priv.sf_table_mdev_nb); + mlx5_vhca_event_notifier_unregister(dev, &dev->priv.sf_table_vhca_nb); + mlx5_esw_event_notifier_unregister(dev, &dev->priv.sf_table_esw_nb); } void mlx5_sf_table_cleanup(struct mlx5_core_dev *dev) @@ -511,10 +538,17 @@ void mlx5_sf_table_cleanup(struct mlx5_core_dev *dev) if (!table) return; - mlx5_blocking_notifier_unregister(dev, &table->mdev_nb); - mlx5_vhca_event_notifier_unregister(table->dev, &table->vhca_nb); - mlx5_esw_event_notifier_unregister(dev->priv.eswitch, &table->esw_nb); mutex_destroy(&table->sf_state_lock); WARN_ON(!xa_empty(&table->function_ids)); kfree(table); } + +bool mlx5_sf_table_empty(const struct mlx5_core_dev *dev) +{ + struct mlx5_sf_table *table = dev->priv.sf_table; + + if (!table) + return true; + + return xa_empty(&table->function_ids); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c index 1f613320fe..bd968f3b38 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c @@ -30,9 +30,7 @@ enum mlx5_sf_hwc_index { }; struct mlx5_sf_hw_table { - struct mlx5_core_dev *dev; struct mutex table_lock; /* Serializes sf deletion and vhca state change handler. 
*/ - struct notifier_block vhca_nb; struct mlx5_sf_hwc_table hwc[MLX5_SF_HWC_MAX]; }; @@ -71,14 +69,16 @@ mlx5_sf_table_fn_to_hwc(struct mlx5_sf_hw_table *table, u16 fn_id) return NULL; } -static int mlx5_sf_hw_table_id_alloc(struct mlx5_sf_hw_table *table, u32 controller, +static int mlx5_sf_hw_table_id_alloc(struct mlx5_core_dev *dev, + struct mlx5_sf_hw_table *table, + u32 controller, u32 usr_sfnum) { struct mlx5_sf_hwc_table *hwc; int free_idx = -1; int i; - hwc = mlx5_sf_controller_to_hwc(table->dev, controller); + hwc = mlx5_sf_controller_to_hwc(dev, controller); if (!hwc->sfs) return -ENOSPC; @@ -100,11 +100,13 @@ static int mlx5_sf_hw_table_id_alloc(struct mlx5_sf_hw_table *table, u32 control return free_idx; } -static void mlx5_sf_hw_table_id_free(struct mlx5_sf_hw_table *table, u32 controller, int id) +static void mlx5_sf_hw_table_id_free(struct mlx5_core_dev *dev, + struct mlx5_sf_hw_table *table, + u32 controller, int id) { struct mlx5_sf_hwc_table *hwc; - hwc = mlx5_sf_controller_to_hwc(table->dev, controller); + hwc = mlx5_sf_controller_to_hwc(dev, controller); hwc->sfs[id].allocated = false; hwc->sfs[id].pending_delete = false; } @@ -120,7 +122,7 @@ int mlx5_sf_hw_table_sf_alloc(struct mlx5_core_dev *dev, u32 controller, u32 usr return -EOPNOTSUPP; mutex_lock(&table->table_lock); - sw_id = mlx5_sf_hw_table_id_alloc(table, controller, usr_sfnum); + sw_id = mlx5_sf_hw_table_id_alloc(dev, table, controller, usr_sfnum); if (sw_id < 0) { err = sw_id; goto exist_err; @@ -151,7 +153,7 @@ int mlx5_sf_hw_table_sf_alloc(struct mlx5_core_dev *dev, u32 controller, u32 usr vhca_err: mlx5_cmd_dealloc_sf(dev, hw_fn_id); err: - mlx5_sf_hw_table_id_free(table, controller, sw_id); + mlx5_sf_hw_table_id_free(dev, table, controller, sw_id); exist_err: mutex_unlock(&table->table_lock); return err; @@ -165,7 +167,7 @@ void mlx5_sf_hw_table_sf_free(struct mlx5_core_dev *dev, u32 controller, u16 id) mutex_lock(&table->table_lock); hw_fn_id = mlx5_sf_sw_to_hw_id(dev, 
controller, id); mlx5_cmd_dealloc_sf(dev, hw_fn_id); - mlx5_sf_hw_table_id_free(table, controller, id); + mlx5_sf_hw_table_id_free(dev, table, controller, id); mutex_unlock(&table->table_lock); } @@ -216,10 +218,12 @@ static void mlx5_sf_hw_table_hwc_dealloc_all(struct mlx5_core_dev *dev, } } -static void mlx5_sf_hw_table_dealloc_all(struct mlx5_sf_hw_table *table) +static void mlx5_sf_hw_table_dealloc_all(struct mlx5_core_dev *dev, + struct mlx5_sf_hw_table *table) { - mlx5_sf_hw_table_hwc_dealloc_all(table->dev, &table->hwc[MLX5_SF_HWC_EXTERNAL]); - mlx5_sf_hw_table_hwc_dealloc_all(table->dev, &table->hwc[MLX5_SF_HWC_LOCAL]); + mlx5_sf_hw_table_hwc_dealloc_all(dev, + &table->hwc[MLX5_SF_HWC_EXTERNAL]); + mlx5_sf_hw_table_hwc_dealloc_all(dev, &table->hwc[MLX5_SF_HWC_LOCAL]); } static int mlx5_sf_hw_table_hwc_init(struct mlx5_sf_hwc_table *hwc, u16 max_fn, u16 base_id) @@ -301,7 +305,6 @@ int mlx5_sf_hw_table_init(struct mlx5_core_dev *dev) } mutex_init(&table->table_lock); - table->dev = dev; dev->priv.sf_hw_table = table; base_id = mlx5_sf_start_function_id(dev); @@ -338,19 +341,22 @@ void mlx5_sf_hw_table_cleanup(struct mlx5_core_dev *dev) mlx5_sf_hw_table_hwc_cleanup(&table->hwc[MLX5_SF_HWC_LOCAL]); mutex_destroy(&table->table_lock); kfree(table); + dev->priv.sf_hw_table = NULL; res_unregister: mlx5_sf_hw_table_res_unregister(dev); } static int mlx5_sf_hw_vhca_event(struct notifier_block *nb, unsigned long opcode, void *data) { - struct mlx5_sf_hw_table *table = container_of(nb, struct mlx5_sf_hw_table, vhca_nb); + struct mlx5_core_dev *dev = container_of(nb, struct mlx5_core_dev, + priv.sf_hw_table_vhca_nb); + struct mlx5_sf_hw_table *table = dev->priv.sf_hw_table; const struct mlx5_vhca_state_event *event = data; struct mlx5_sf_hwc_table *hwc; struct mlx5_sf_hw *sf_hw; u16 sw_id; - if (event->new_vhca_state != MLX5_VHCA_STATE_ALLOCATED) + if (!table || event->new_vhca_state != MLX5_VHCA_STATE_ALLOCATED) return 0; hwc = mlx5_sf_table_fn_to_hwc(table, 
event->function_id); @@ -365,20 +371,28 @@ static int mlx5_sf_hw_vhca_event(struct notifier_block *nb, unsigned long opcode * Hence recycle the sf hardware id for reuse. */ if (sf_hw->allocated && sf_hw->pending_delete) - mlx5_sf_hw_table_hwc_sf_free(table->dev, hwc, sw_id); + mlx5_sf_hw_table_hwc_sf_free(dev, hwc, sw_id); mutex_unlock(&table->table_lock); return 0; } -int mlx5_sf_hw_table_create(struct mlx5_core_dev *dev) +int mlx5_sf_hw_notifier_init(struct mlx5_core_dev *dev) { - struct mlx5_sf_hw_table *table = dev->priv.sf_hw_table; - - if (!table) + if (mlx5_core_is_sf(dev)) return 0; - table->vhca_nb.notifier_call = mlx5_sf_hw_vhca_event; - return mlx5_vhca_event_notifier_register(dev, &table->vhca_nb); + dev->priv.sf_hw_table_vhca_nb.notifier_call = mlx5_sf_hw_vhca_event; + return mlx5_vhca_event_notifier_register(dev, + &dev->priv.sf_hw_table_vhca_nb); +} + +void mlx5_sf_hw_notifier_cleanup(struct mlx5_core_dev *dev) +{ + if (mlx5_core_is_sf(dev)) + return; + + mlx5_vhca_event_notifier_unregister(dev, + &dev->priv.sf_hw_table_vhca_nb); } void mlx5_sf_hw_table_destroy(struct mlx5_core_dev *dev) @@ -388,9 +402,8 @@ void mlx5_sf_hw_table_destroy(struct mlx5_core_dev *dev) if (!table) return; - mlx5_vhca_event_notifier_unregister(dev, &table->vhca_nb); /* Dealloc SFs whose firmware event has been missed. 
*/ - mlx5_sf_hw_table_dealloc_all(table); + mlx5_sf_hw_table_dealloc_all(dev, table); } bool mlx5_sf_hw_table_supported(const struct mlx5_core_dev *dev) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h b/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h index 860f9ddb71..d8a934a0e9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h @@ -12,11 +12,15 @@ int mlx5_sf_hw_table_init(struct mlx5_core_dev *dev); void mlx5_sf_hw_table_cleanup(struct mlx5_core_dev *dev); -int mlx5_sf_hw_table_create(struct mlx5_core_dev *dev); +int mlx5_sf_hw_notifier_init(struct mlx5_core_dev *dev); +void mlx5_sf_hw_notifier_cleanup(struct mlx5_core_dev *dev); void mlx5_sf_hw_table_destroy(struct mlx5_core_dev *dev); +int mlx5_sf_notifiers_init(struct mlx5_core_dev *dev); int mlx5_sf_table_init(struct mlx5_core_dev *dev); +void mlx5_sf_notifiers_cleanup(struct mlx5_core_dev *dev); void mlx5_sf_table_cleanup(struct mlx5_core_dev *dev); +bool mlx5_sf_table_empty(const struct mlx5_core_dev *dev); int mlx5_devlink_sf_port_new(struct devlink *devlink, const struct devlink_port_new_attrs *add_attr, @@ -43,24 +47,42 @@ static inline void mlx5_sf_hw_table_cleanup(struct mlx5_core_dev *dev) { } -static inline int mlx5_sf_hw_table_create(struct mlx5_core_dev *dev) +static inline int mlx5_sf_hw_notifier_init(struct mlx5_core_dev *dev) { return 0; } +static inline void mlx5_sf_hw_notifier_cleanup(struct mlx5_core_dev *dev) +{ +} + static inline void mlx5_sf_hw_table_destroy(struct mlx5_core_dev *dev) { } +static inline int mlx5_sf_notifiers_init(struct mlx5_core_dev *dev) +{ + return 0; +} + static inline int mlx5_sf_table_init(struct mlx5_core_dev *dev) { return 0; } +static inline void mlx5_sf_notifiers_cleanup(struct mlx5_core_dev *dev) +{ +} + static inline void mlx5_sf_table_cleanup(struct mlx5_core_dev *dev) { } +static inline bool mlx5_sf_table_empty(const struct mlx5_core_dev *dev) +{ + return true; +} + #endif #endif 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.c index cda01ba441..b04cf6cf89 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.c @@ -9,15 +9,9 @@ #define CREATE_TRACE_POINTS #include "diag/vhca_tracepoint.h" -struct mlx5_vhca_state_notifier { - struct mlx5_core_dev *dev; - struct mlx5_nb nb; - struct blocking_notifier_head n_head; -}; - struct mlx5_vhca_event_work { struct work_struct work; - struct mlx5_vhca_state_notifier *notifier; + struct mlx5_core_dev *dev; struct mlx5_vhca_state_event event; }; @@ -95,16 +89,14 @@ mlx5_vhca_event_notify(struct mlx5_core_dev *dev, struct mlx5_vhca_state_event * mlx5_vhca_event_arm(dev, event->function_id); trace_mlx5_sf_vhca_event(dev, event); - blocking_notifier_call_chain(&dev->priv.vhca_state_notifier->n_head, 0, event); + blocking_notifier_call_chain(&dev->priv.vhca_state_n_head, 0, event); } static void mlx5_vhca_state_work_handler(struct work_struct *_work) { struct mlx5_vhca_event_work *work = container_of(_work, struct mlx5_vhca_event_work, work); - struct mlx5_vhca_state_notifier *notifier = work->notifier; - struct mlx5_core_dev *dev = notifier->dev; - mlx5_vhca_event_notify(dev, &work->event); + mlx5_vhca_event_notify(work->dev, &work->event); kfree(work); } @@ -116,8 +108,8 @@ void mlx5_vhca_events_work_enqueue(struct mlx5_core_dev *dev, int idx, struct wo static int mlx5_vhca_state_change_notifier(struct notifier_block *nb, unsigned long type, void *data) { - struct mlx5_vhca_state_notifier *notifier = - mlx5_nb_cof(nb, struct mlx5_vhca_state_notifier, nb); + struct mlx5_core_dev *dev = mlx5_nb_cof(nb, struct mlx5_core_dev, + priv.vhca_state_nb); struct mlx5_vhca_event_work *work; struct mlx5_eqe *eqe = data; int wq_idx; @@ -126,10 +118,10 @@ mlx5_vhca_state_change_notifier(struct notifier_block *nb, unsigned long type, v if (!work) return NOTIFY_DONE; 
INIT_WORK(&work->work, &mlx5_vhca_state_work_handler); - work->notifier = notifier; + work->dev = dev; work->event.function_id = be16_to_cpu(eqe->data.vhca_state.function_id); wq_idx = work->event.function_id % MLX5_DEV_MAX_WQS; - mlx5_vhca_events_work_enqueue(notifier->dev, wq_idx, &work->work); + mlx5_vhca_events_work_enqueue(dev, wq_idx, &work->work); return NOTIFY_OK; } @@ -145,9 +137,15 @@ void mlx5_vhca_state_cap_handle(struct mlx5_core_dev *dev, void *set_hca_cap) MLX5_SET(cmd_hca_cap, set_hca_cap, event_on_vhca_state_teardown_request, 1); } +void mlx5_vhca_state_notifier_init(struct mlx5_core_dev *dev) +{ + BLOCKING_INIT_NOTIFIER_HEAD(&dev->priv.vhca_state_n_head); + MLX5_NB_INIT(&dev->priv.vhca_state_nb, mlx5_vhca_state_change_notifier, + VHCA_STATE_CHANGE); +} + int mlx5_vhca_event_init(struct mlx5_core_dev *dev) { - struct mlx5_vhca_state_notifier *notifier; char wq_name[MLX5_CMD_WQ_MAX_NAME]; struct mlx5_vhca_events *events; int err, i; @@ -160,7 +158,6 @@ int mlx5_vhca_event_init(struct mlx5_core_dev *dev) return -ENOMEM; events->dev = dev; - dev->priv.vhca_events = events; for (i = 0; i < MLX5_DEV_MAX_WQS; i++) { snprintf(wq_name, MLX5_CMD_WQ_MAX_NAME, "mlx5_vhca_event%d", i); events->handler[i].wq = create_singlethread_workqueue(wq_name); @@ -169,20 +166,10 @@ int mlx5_vhca_event_init(struct mlx5_core_dev *dev) goto err_create_wq; } } + dev->priv.vhca_events = events; - notifier = kzalloc(sizeof(*notifier), GFP_KERNEL); - if (!notifier) { - err = -ENOMEM; - goto err_notifier; - } - - dev->priv.vhca_state_notifier = notifier; - notifier->dev = dev; - BLOCKING_INIT_NOTIFIER_HEAD(&notifier->n_head); - MLX5_NB_INIT(&notifier->nb, mlx5_vhca_state_change_notifier, VHCA_STATE_CHANGE); return 0; -err_notifier: err_create_wq: for (--i; i >= 0; i--) destroy_workqueue(events->handler[i].wq); @@ -211,8 +198,6 @@ void mlx5_vhca_event_cleanup(struct mlx5_core_dev *dev) if (!mlx5_vhca_event_supported(dev)) return; - kfree(dev->priv.vhca_state_notifier); - 
dev->priv.vhca_state_notifier = NULL; vhca_events = dev->priv.vhca_events; for (i = 0; i < MLX5_DEV_MAX_WQS; i++) destroy_workqueue(vhca_events->handler[i].wq); @@ -221,34 +206,30 @@ void mlx5_vhca_event_cleanup(struct mlx5_core_dev *dev) void mlx5_vhca_event_start(struct mlx5_core_dev *dev) { - struct mlx5_vhca_state_notifier *notifier; - - if (!dev->priv.vhca_state_notifier) + if (!mlx5_vhca_event_supported(dev)) return; - notifier = dev->priv.vhca_state_notifier; - mlx5_eq_notifier_register(dev, &notifier->nb); + mlx5_eq_notifier_register(dev, &dev->priv.vhca_state_nb); } void mlx5_vhca_event_stop(struct mlx5_core_dev *dev) { - struct mlx5_vhca_state_notifier *notifier; - - if (!dev->priv.vhca_state_notifier) + if (!mlx5_vhca_event_supported(dev)) return; - notifier = dev->priv.vhca_state_notifier; - mlx5_eq_notifier_unregister(dev, &notifier->nb); + mlx5_eq_notifier_unregister(dev, &dev->priv.vhca_state_nb); + + /* Flush workqueues of all pending events. */ + mlx5_vhca_event_work_queues_flush(dev); } int mlx5_vhca_event_notifier_register(struct mlx5_core_dev *dev, struct notifier_block *nb) { - if (!dev->priv.vhca_state_notifier) - return -EOPNOTSUPP; - return blocking_notifier_chain_register(&dev->priv.vhca_state_notifier->n_head, nb); + return blocking_notifier_chain_register(&dev->priv.vhca_state_n_head, + nb); } void mlx5_vhca_event_notifier_unregister(struct mlx5_core_dev *dev, struct notifier_block *nb) { - blocking_notifier_chain_unregister(&dev->priv.vhca_state_notifier->n_head, nb); + blocking_notifier_chain_unregister(&dev->priv.vhca_state_n_head, nb); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.h b/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.h index 1725ba64f8..5279042387 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.h @@ -18,6 +18,7 @@ static inline bool mlx5_vhca_event_supported(const struct mlx5_core_dev *dev) } void 
mlx5_vhca_state_cap_handle(struct mlx5_core_dev *dev, void *set_hca_cap); +void mlx5_vhca_state_notifier_init(struct mlx5_core_dev *dev); int mlx5_vhca_event_init(struct mlx5_core_dev *dev); void mlx5_vhca_event_cleanup(struct mlx5_core_dev *dev); void mlx5_vhca_event_start(struct mlx5_core_dev *dev); @@ -37,6 +38,10 @@ static inline void mlx5_vhca_state_cap_handle(struct mlx5_core_dev *dev, void *s { } +static inline void mlx5_vhca_state_notifier_init(struct mlx5_core_dev *dev) +{ +} + static inline int mlx5_vhca_event_init(struct mlx5_core_dev *dev) { return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c index b5332c54d4..fe56b59e24 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c @@ -72,6 +72,11 @@ enum mlx5hws_action_type mlx5hws_action_get_type(struct mlx5hws_action *action) return action->type; } +struct mlx5_core_dev *mlx5hws_action_get_dev(struct mlx5hws_action *action) +{ + return action->ctx->mdev; +} + static int hws_action_get_shared_stc_nic(struct mlx5hws_context *ctx, enum mlx5hws_context_shared_stc_type stc_type, u8 tbl_type) @@ -112,7 +117,7 @@ static int hws_action_get_shared_stc_nic(struct mlx5hws_context *ctx, mlx5hws_err(ctx, "No such stc_type: %d\n", stc_type); pr_warn("HWS: Invalid stc_type: %d\n", stc_type); ret = -EINVAL; - goto unlock_and_out; + goto free_shared_stc; } ret = mlx5hws_action_alloc_single_stc(ctx, &stc_attr, tbl_type, @@ -238,6 +243,7 @@ hws_action_fixup_stc_attr(struct mlx5hws_context *ctx, enum mlx5hws_table_type table_type, bool is_mirror) { + struct mlx5hws_pool *pool; bool use_fixup = false; u32 fw_tbl_type; u32 base_id; @@ -253,13 +259,11 @@ hws_action_fixup_stc_attr(struct mlx5hws_context *ctx, use_fixup = true; break; } + pool = stc_attr->ste_table.ste_pool; if (!is_mirror) - base_id = 
mlx5hws_pool_chunk_get_base_id(stc_attr->ste_table.ste_pool, - &stc_attr->ste_table.ste); + base_id = mlx5hws_pool_get_base_id(pool); else - base_id = - mlx5hws_pool_chunk_get_base_mirror_id(stc_attr->ste_table.ste_pool, - &stc_attr->ste_table.ste); + base_id = mlx5hws_pool_get_base_mirror_id(pool); *fixup_stc_attr = *stc_attr; fixup_stc_attr->ste_table.ste_obj_id = base_id; @@ -337,7 +341,7 @@ __must_hold(&ctx->ctrl_lock) if (!mlx5hws_context_cap_dynamic_reparse(ctx)) stc_attr->reparse_mode = MLX5_IFC_STC_REPARSE_IGNORE; - obj_0_id = mlx5hws_pool_chunk_get_base_id(stc_pool, stc); + obj_0_id = mlx5hws_pool_get_base_id(stc_pool); /* According to table/action limitation change the stc_attr */ use_fixup = hws_action_fixup_stc_attr(ctx, stc_attr, &fixup_stc_attr, table_type, false); @@ -353,7 +357,7 @@ __must_hold(&ctx->ctrl_lock) if (table_type == MLX5HWS_TABLE_TYPE_FDB) { u32 obj_1_id; - obj_1_id = mlx5hws_pool_chunk_get_base_mirror_id(stc_pool, stc); + obj_1_id = mlx5hws_pool_get_base_mirror_id(stc_pool); use_fixup = hws_action_fixup_stc_attr(ctx, stc_attr, &fixup_stc_attr, @@ -393,11 +397,11 @@ __must_hold(&ctx->ctrl_lock) stc_attr.action_type = MLX5_IFC_STC_ACTION_TYPE_DROP; stc_attr.action_offset = MLX5HWS_ACTION_OFFSET_HIT; stc_attr.stc_offset = stc->offset; - obj_id = mlx5hws_pool_chunk_get_base_id(stc_pool, stc); + obj_id = mlx5hws_pool_get_base_id(stc_pool); mlx5hws_cmd_stc_modify(ctx->mdev, obj_id, &stc_attr); if (table_type == MLX5HWS_TABLE_TYPE_FDB) { - obj_id = mlx5hws_pool_chunk_get_base_mirror_id(stc_pool, stc); + obj_id = mlx5hws_pool_get_base_mirror_id(stc_pool); mlx5hws_cmd_stc_modify(ctx->mdev, obj_id, &stc_attr); } @@ -1186,14 +1190,15 @@ hws_action_create_modify_header_hws(struct mlx5hws_action *action, struct mlx5hws_action_mh_pattern *pattern, u32 log_bulk_size) { + u16 num_actions, max_mh_actions = 0, hw_max_actions; struct mlx5hws_context *ctx = action->ctx; - u16 num_actions, max_mh_actions = 0; int i, ret, size_in_bytes; u32 pat_id, arg_id = 
0; __be64 *new_pattern; size_t pat_max_sz; pat_max_sz = MLX5HWS_ARG_CHUNK_SIZE_MAX * MLX5HWS_ARG_DATA_SIZE; + hw_max_actions = pat_max_sz / MLX5HWS_MODIFY_ACTION_SIZE; size_in_bytes = pat_max_sz * sizeof(__be64); new_pattern = kcalloc(num_of_patterns, size_in_bytes, GFP_KERNEL); if (!new_pattern) @@ -1203,16 +1208,20 @@ hws_action_create_modify_header_hws(struct mlx5hws_action *action, for (i = 0; i < num_of_patterns; i++) { size_t new_num_actions; size_t cur_num_actions; - u32 nope_location; + u32 nop_locations; cur_num_actions = pattern[i].sz / MLX5HWS_MODIFY_ACTION_SIZE; - mlx5hws_pat_calc_nope(pattern[i].data, cur_num_actions, - pat_max_sz / MLX5HWS_MODIFY_ACTION_SIZE, - &new_num_actions, &nope_location, - &new_pattern[i * pat_max_sz]); + ret = mlx5hws_pat_calc_nop(pattern[i].data, cur_num_actions, + hw_max_actions, &new_num_actions, + &nop_locations, + &new_pattern[i * pat_max_sz]); + if (ret) { + mlx5hws_err(ctx, "Too many actions after nop insertion\n"); + goto free_new_pat; + } - action[i].modify_header.nope_locations = nope_location; + action[i].modify_header.nop_locations = nop_locations; action[i].modify_header.num_of_actions = new_num_actions; max_mh_actions = max(max_mh_actions, new_num_actions); @@ -1259,7 +1268,7 @@ hws_action_create_modify_header_hws(struct mlx5hws_action *action, MLX5_GET(set_action_in, pattern[i].data, action_type); } else { /* Multiple modify actions require a pattern */ - if (unlikely(action[i].modify_header.nope_locations)) { + if (unlikely(action[i].modify_header.nop_locations)) { size_t pattern_sz; pattern_sz = action[i].modify_header.num_of_actions * @@ -1349,11 +1358,8 @@ free_action: } struct mlx5hws_action * -mlx5hws_action_create_dest_array(struct mlx5hws_context *ctx, - size_t num_dest, +mlx5hws_action_create_dest_array(struct mlx5hws_context *ctx, size_t num_dest, struct mlx5hws_action_dest_attr *dests, - bool ignore_flow_level, - u32 flow_source, u32 flags) { struct mlx5hws_cmd_set_fte_dest *dest_list = NULL; @@ 
-1361,8 +1367,8 @@ mlx5hws_action_create_dest_array(struct mlx5hws_context *ctx, struct mlx5hws_cmd_set_fte_attr fte_attr = {0}; struct mlx5hws_cmd_forward_tbl *fw_island; struct mlx5hws_action *action; - u32 i /*, packet_reformat_id*/; - int ret; + int ret, last_dest_idx = -1; + u32 i; if (num_dest <= 1) { mlx5hws_err(ctx, "Action must have multiple dests\n"); @@ -1391,12 +1397,9 @@ mlx5hws_action_create_dest_array(struct mlx5hws_context *ctx, MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; dest_list[i].destination_id = dests[i].dest->dest_obj.obj_id; fte_attr.action_flags |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; - fte_attr.ignore_flow_level = ignore_flow_level; - /* ToDo: In SW steering we have a handling of 'go to WIRE' - * destination here by upper layer setting 'is_wire_ft' flag - * if the destination is wire. - * This is because uplink should be last dest in the list. - */ + fte_attr.ignore_flow_level = 1; + if (dests[i].is_wire_ft) + last_dest_idx = i; break; case MLX5HWS_ACTION_TYP_VPORT: dest_list[i].destination_type = MLX5_FLOW_DESTINATION_TYPE_VPORT; @@ -1420,6 +1423,9 @@ mlx5hws_action_create_dest_array(struct mlx5hws_context *ctx, } } + if (last_dest_idx != -1) + swap(dest_list[last_dest_idx], dest_list[num_dest - 1]); + fte_attr.dests_num = num_dest; fte_attr.dests = dest_list; @@ -1575,17 +1581,15 @@ hws_action_create_dest_match_range_definer(struct mlx5hws_context *ctx) return definer; } -static struct mlx5hws_matcher_action_ste * +static struct mlx5hws_range_action_table * hws_action_create_dest_match_range_table(struct mlx5hws_context *ctx, struct mlx5hws_definer *definer, u32 miss_ft_id) { struct mlx5hws_cmd_rtc_create_attr rtc_attr = {0}; - struct mlx5hws_action_default_stc *default_stc; - struct mlx5hws_matcher_action_ste *table_ste; + struct mlx5hws_range_action_table *table_ste; struct mlx5hws_pool_attr pool_attr = {0}; struct mlx5hws_pool *ste_pool, *stc_pool; - struct mlx5hws_pool_chunk *ste; u32 *rtc_0_id, *rtc_1_id; u32 obj_id; int ret; @@ -1604,7 
+1608,6 @@ hws_action_create_dest_match_range_table(struct mlx5hws_context *ctx, pool_attr.table_type = MLX5HWS_TABLE_TYPE_FDB; pool_attr.pool_type = MLX5HWS_POOL_TYPE_STE; - pool_attr.flags = MLX5HWS_POOL_FLAGS_FOR_STE_ACTION_POOL; pool_attr.alloc_log_sz = 1; table_ste->pool = mlx5hws_pool_create(ctx, &pool_attr); if (!table_ste->pool) { @@ -1616,8 +1619,6 @@ hws_action_create_dest_match_range_table(struct mlx5hws_context *ctx, rtc_0_id = &table_ste->rtc_0_id; rtc_1_id = &table_ste->rtc_1_id; ste_pool = table_ste->pool; - ste = &table_ste->ste; - ste->order = 1; rtc_attr.log_size = 0; rtc_attr.log_depth = 0; @@ -1629,18 +1630,16 @@ hws_action_create_dest_match_range_table(struct mlx5hws_context *ctx, rtc_attr.fw_gen_wqe = true; rtc_attr.is_scnd_range = true; - obj_id = mlx5hws_pool_chunk_get_base_id(ste_pool, ste); + obj_id = mlx5hws_pool_get_base_id(ste_pool); rtc_attr.pd = ctx->pd_num; rtc_attr.ste_base = obj_id; - rtc_attr.ste_offset = ste->offset; rtc_attr.reparse_mode = mlx5hws_context_get_reparse_mode(ctx); rtc_attr.table_type = mlx5hws_table_get_res_fw_ft_type(MLX5HWS_TABLE_TYPE_FDB, false); /* STC is a single resource (obj_id), use any STC for the ID */ stc_pool = ctx->stc_pool; - default_stc = ctx->common_res.default_stc; - obj_id = mlx5hws_pool_chunk_get_base_id(stc_pool, &default_stc->default_hit); + obj_id = mlx5hws_pool_get_base_id(stc_pool); rtc_attr.stc_base = obj_id; ret = mlx5hws_cmd_rtc_create(ctx->mdev, &rtc_attr, rtc_0_id); @@ -1650,11 +1649,11 @@ hws_action_create_dest_match_range_table(struct mlx5hws_context *ctx, } /* Create mirror RTC */ - obj_id = mlx5hws_pool_chunk_get_base_mirror_id(ste_pool, ste); + obj_id = mlx5hws_pool_get_base_mirror_id(ste_pool); rtc_attr.ste_base = obj_id; rtc_attr.table_type = mlx5hws_table_get_res_fw_ft_type(MLX5HWS_TABLE_TYPE_FDB, true); - obj_id = mlx5hws_pool_chunk_get_base_mirror_id(stc_pool, &default_stc->default_hit); + obj_id = mlx5hws_pool_get_base_mirror_id(stc_pool); rtc_attr.stc_base = obj_id; ret = 
mlx5hws_cmd_rtc_create(ctx->mdev, &rtc_attr, rtc_1_id); @@ -1677,9 +1676,9 @@ free_ste: return NULL; } -static void -hws_action_destroy_dest_match_range_table(struct mlx5hws_context *ctx, - struct mlx5hws_matcher_action_ste *table_ste) +static void hws_action_destroy_dest_match_range_table( + struct mlx5hws_context *ctx, + struct mlx5hws_range_action_table *table_ste) { mutex_lock(&ctx->ctrl_lock); @@ -1691,12 +1690,11 @@ hws_action_destroy_dest_match_range_table(struct mlx5hws_context *ctx, mutex_unlock(&ctx->ctrl_lock); } -static int -hws_action_create_dest_match_range_fill_table(struct mlx5hws_context *ctx, - struct mlx5hws_matcher_action_ste *table_ste, - struct mlx5hws_action *hit_ft_action, - struct mlx5hws_definer *range_definer, - u32 min, u32 max) +static int hws_action_create_dest_match_range_fill_table( + struct mlx5hws_context *ctx, + struct mlx5hws_range_action_table *table_ste, + struct mlx5hws_action *hit_ft_action, + struct mlx5hws_definer *range_definer, u32 min, u32 max) { struct mlx5hws_wqe_gta_data_seg_ste match_wqe_data = {0}; struct mlx5hws_wqe_gta_data_seg_ste range_wqe_data = {0}; @@ -1792,7 +1790,7 @@ mlx5hws_action_create_dest_match_range(struct mlx5hws_context *ctx, u32 min, u32 max, u32 flags) { struct mlx5hws_cmd_stc_modify_attr stc_attr = {0}; - struct mlx5hws_matcher_action_ste *table_ste; + struct mlx5hws_range_action_table *table_ste; struct mlx5hws_action *hit_ft_action; struct mlx5hws_definer *definer; struct mlx5hws_action *action; @@ -1837,7 +1835,6 @@ mlx5hws_action_create_dest_match_range(struct mlx5hws_context *ctx, stc_attr.action_offset = MLX5HWS_ACTION_OFFSET_HIT; stc_attr.action_type = MLX5_IFC_STC_ACTION_TYPE_JUMP_TO_STE_TABLE; stc_attr.reparse_mode = MLX5_IFC_STC_REPARSE_IGNORE; - stc_attr.ste_table.ste = table_ste->ste; stc_attr.ste_table.ste_pool = table_ste->pool; stc_attr.ste_table.match_definer_id = ctx->caps->trivial_match_definer; @@ -2110,21 +2107,23 @@ static void hws_action_modify_write(struct 
mlx5hws_send_engine *queue, u32 arg_idx, u8 *arg_data, u16 num_of_actions, - u32 nope_locations) + u32 nop_locations) { u8 *new_arg_data = NULL; int i, j; - if (unlikely(nope_locations)) { + if (unlikely(nop_locations)) { new_arg_data = kcalloc(num_of_actions, MLX5HWS_MODIFY_ACTION_SIZE, GFP_KERNEL); if (unlikely(!new_arg_data)) return; - for (i = 0, j = 0; i < num_of_actions; i++, j++) { - memcpy(&new_arg_data[j], arg_data, MLX5HWS_MODIFY_ACTION_SIZE); - if (BIT(i) & nope_locations) + for (i = 0, j = 0; j < num_of_actions; i++, j++) { + if (BIT(i) & nop_locations) j++; + memcpy(&new_arg_data[j * MLX5HWS_MODIFY_ACTION_SIZE], + &arg_data[i * MLX5HWS_MODIFY_ACTION_SIZE], + MLX5HWS_MODIFY_ACTION_SIZE); } } @@ -2220,6 +2219,7 @@ hws_action_setter_modify_header(struct mlx5hws_actions_apply_data *apply, struct mlx5hws_action *action; u32 arg_sz, arg_idx; u8 *single_action; + u8 max_actions; __be32 stc_idx; rule_action = &apply->rule_action[setter->idx_double]; @@ -2247,21 +2247,23 @@ hws_action_setter_modify_header(struct mlx5hws_actions_apply_data *apply, apply->wqe_data[MLX5HWS_ACTION_OFFSET_DW7] = *(__be32 *)MLX5_ADDR_OF(set_action_in, single_action, data); - } else { - /* Argument offset multiple with number of args per these actions */ - arg_sz = mlx5hws_arg_get_arg_size(action->modify_header.max_num_of_actions); - arg_idx = rule_action->modify_header.offset * arg_sz; + return; + } - apply->wqe_data[MLX5HWS_ACTION_OFFSET_DW7] = htonl(arg_idx); + /* Argument offset multiple with number of args per these actions */ + max_actions = action->modify_header.max_num_of_actions; + arg_sz = mlx5hws_arg_get_arg_size(max_actions); + arg_idx = rule_action->modify_header.offset * arg_sz; - if (!(action->flags & MLX5HWS_ACTION_FLAG_SHARED)) { - apply->require_dep = 1; - hws_action_modify_write(apply->queue, - action->modify_header.arg_id + arg_idx, - rule_action->modify_header.data, - action->modify_header.num_of_actions, - action->modify_header.nope_locations); - } + 
apply->wqe_data[MLX5HWS_ACTION_OFFSET_DW7] = htonl(arg_idx); + + if (!(action->flags & MLX5HWS_ACTION_FLAG_SHARED)) { + apply->require_dep = 1; + hws_action_modify_write(apply->queue, + action->modify_header.arg_id + arg_idx, + rule_action->modify_header.data, + action->modify_header.num_of_actions, + action->modify_header.nop_locations); } } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.h index 64b76075f7..55a079fdd0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.h @@ -118,6 +118,12 @@ struct mlx5hws_action_template { u8 only_term; }; +struct mlx5hws_range_action_table { + struct mlx5hws_pool *pool; + u32 rtc_0_id; + u32 rtc_1_id; +}; + struct mlx5hws_action { u8 type; u8 flags; @@ -130,7 +136,7 @@ struct mlx5hws_action { u32 pat_id; u32 arg_id; __be64 single_action; - u32 nope_locations; + u32 nop_locations; u8 num_of_patterns; u8 single_action_type; u8 num_of_actions; @@ -186,7 +192,7 @@ struct mlx5hws_action { size_t size; } remove_header; struct { - struct mlx5hws_matcher_action_ste *table_ste; + struct mlx5hws_range_action_table *table_ste; struct mlx5hws_action *hit_ft_action; struct mlx5hws_definer *definer; } range; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.c new file mode 100644 index 0000000000..5766a9c82f --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.c @@ -0,0 +1,467 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2025 NVIDIA Corporation & Affiliates */ + +#include "internal.h" + +static const char * +hws_pool_opt_to_str(enum mlx5hws_pool_optimize opt) +{ + switch (opt) { + case MLX5HWS_POOL_OPTIMIZE_NONE: + return "rx-and-tx"; + case MLX5HWS_POOL_OPTIMIZE_ORIG: + return "rx-only"; + case 
MLX5HWS_POOL_OPTIMIZE_MIRROR: + return "tx-only"; + default: + return "unknown"; + } +} + +static int +hws_action_ste_table_create_pool(struct mlx5hws_context *ctx, + struct mlx5hws_action_ste_table *action_tbl, + enum mlx5hws_pool_optimize opt, size_t log_sz) +{ + struct mlx5hws_pool_attr pool_attr = { 0 }; + + pool_attr.pool_type = MLX5HWS_POOL_TYPE_STE; + pool_attr.table_type = MLX5HWS_TABLE_TYPE_FDB; + pool_attr.flags = MLX5HWS_POOL_FLAG_BUDDY; + pool_attr.opt_type = opt; + pool_attr.alloc_log_sz = log_sz; + + action_tbl->pool = mlx5hws_pool_create(ctx, &pool_attr); + if (!action_tbl->pool) { + mlx5hws_err(ctx, "Failed to allocate STE pool\n"); + return -EINVAL; + } + + return 0; +} + +static int hws_action_ste_table_create_single_rtc( + struct mlx5hws_context *ctx, + struct mlx5hws_action_ste_table *action_tbl, + enum mlx5hws_pool_optimize opt, size_t log_sz, bool tx) +{ + struct mlx5hws_cmd_rtc_create_attr rtc_attr = { 0 }; + u32 *rtc_id; + + rtc_attr.log_depth = 0; + rtc_attr.update_index_mode = MLX5_IFC_RTC_STE_UPDATE_MODE_BY_OFFSET; + /* Action STEs use the default always hit definer. */ + rtc_attr.match_definer_0 = ctx->caps->trivial_match_definer; + rtc_attr.is_frst_jumbo = false; + rtc_attr.miss_ft_id = 0; + rtc_attr.pd = ctx->pd_num; + rtc_attr.reparse_mode = mlx5hws_context_get_reparse_mode(ctx); + + if (tx) { + rtc_attr.table_type = FS_FT_FDB_TX; + rtc_attr.ste_base = + mlx5hws_pool_get_base_mirror_id(action_tbl->pool); + rtc_attr.stc_base = + mlx5hws_pool_get_base_mirror_id(ctx->stc_pool); + rtc_attr.log_size = + opt == MLX5HWS_POOL_OPTIMIZE_ORIG ? 0 : log_sz; + rtc_id = &action_tbl->rtc_1_id; + } else { + rtc_attr.table_type = FS_FT_FDB_RX; + rtc_attr.ste_base = mlx5hws_pool_get_base_id(action_tbl->pool); + rtc_attr.stc_base = mlx5hws_pool_get_base_id(ctx->stc_pool); + rtc_attr.log_size = + opt == MLX5HWS_POOL_OPTIMIZE_MIRROR ? 
0 : log_sz; + rtc_id = &action_tbl->rtc_0_id; + } + + return mlx5hws_cmd_rtc_create(ctx->mdev, &rtc_attr, rtc_id); +} + +static int +hws_action_ste_table_create_rtcs(struct mlx5hws_context *ctx, + struct mlx5hws_action_ste_table *action_tbl, + enum mlx5hws_pool_optimize opt, size_t log_sz) +{ + int err; + + err = hws_action_ste_table_create_single_rtc(ctx, action_tbl, opt, + log_sz, false); + if (err) + return err; + + err = hws_action_ste_table_create_single_rtc(ctx, action_tbl, opt, + log_sz, true); + if (err) { + mlx5hws_cmd_rtc_destroy(ctx->mdev, action_tbl->rtc_0_id); + return err; + } + + return 0; +} + +static void +hws_action_ste_table_destroy_rtcs(struct mlx5hws_action_ste_table *action_tbl) +{ + mlx5hws_cmd_rtc_destroy(action_tbl->pool->ctx->mdev, + action_tbl->rtc_1_id); + mlx5hws_cmd_rtc_destroy(action_tbl->pool->ctx->mdev, + action_tbl->rtc_0_id); +} + +static int +hws_action_ste_table_create_stc(struct mlx5hws_context *ctx, + struct mlx5hws_action_ste_table *action_tbl) +{ + struct mlx5hws_cmd_stc_modify_attr stc_attr = { 0 }; + + stc_attr.action_offset = MLX5HWS_ACTION_OFFSET_HIT; + stc_attr.action_type = MLX5_IFC_STC_ACTION_TYPE_JUMP_TO_STE_TABLE; + stc_attr.reparse_mode = MLX5_IFC_STC_REPARSE_IGNORE; + stc_attr.ste_table.ste_pool = action_tbl->pool; + stc_attr.ste_table.match_definer_id = ctx->caps->trivial_match_definer; + + return mlx5hws_action_alloc_single_stc(ctx, &stc_attr, + MLX5HWS_TABLE_TYPE_FDB, + &action_tbl->stc); +} + +static struct mlx5hws_action_ste_table * +hws_action_ste_table_alloc(struct mlx5hws_action_ste_pool_element *parent_elem) +{ + enum mlx5hws_pool_optimize opt = parent_elem->opt; + struct mlx5hws_context *ctx = parent_elem->ctx; + struct mlx5hws_action_ste_table *action_tbl; + size_t log_sz; + int err; + + log_sz = min(parent_elem->log_sz ? 
+ parent_elem->log_sz + + MLX5HWS_ACTION_STE_TABLE_STEP_LOG_SZ : + MLX5HWS_ACTION_STE_TABLE_INIT_LOG_SZ, + MLX5HWS_ACTION_STE_TABLE_MAX_LOG_SZ); + + action_tbl = kzalloc(sizeof(*action_tbl), GFP_KERNEL); + if (!action_tbl) + return ERR_PTR(-ENOMEM); + + err = hws_action_ste_table_create_pool(ctx, action_tbl, opt, log_sz); + if (err) + goto free_tbl; + + err = hws_action_ste_table_create_rtcs(ctx, action_tbl, opt, log_sz); + if (err) + goto destroy_pool; + + err = hws_action_ste_table_create_stc(ctx, action_tbl); + if (err) + goto destroy_rtcs; + + action_tbl->parent_elem = parent_elem; + INIT_LIST_HEAD(&action_tbl->list_node); + action_tbl->last_used = jiffies; + list_add(&action_tbl->list_node, &parent_elem->available); + parent_elem->log_sz = log_sz; + + mlx5hws_dbg(ctx, + "Allocated %s action STE table log_sz %zu; STEs (%d, %d); RTCs (%d, %d); STC %d\n", + hws_pool_opt_to_str(opt), log_sz, + mlx5hws_pool_get_base_id(action_tbl->pool), + mlx5hws_pool_get_base_mirror_id(action_tbl->pool), + action_tbl->rtc_0_id, action_tbl->rtc_1_id, + action_tbl->stc.offset); + + return action_tbl; + +destroy_rtcs: + hws_action_ste_table_destroy_rtcs(action_tbl); +destroy_pool: + mlx5hws_pool_destroy(action_tbl->pool); +free_tbl: + kfree(action_tbl); + + return ERR_PTR(err); +} + +static void +hws_action_ste_table_destroy(struct mlx5hws_action_ste_table *action_tbl) +{ + struct mlx5hws_context *ctx = action_tbl->parent_elem->ctx; + + mlx5hws_dbg(ctx, + "Destroying %s action STE table: STEs (%d, %d); RTCs (%d, %d); STC %d\n", + hws_pool_opt_to_str(action_tbl->parent_elem->opt), + mlx5hws_pool_get_base_id(action_tbl->pool), + mlx5hws_pool_get_base_mirror_id(action_tbl->pool), + action_tbl->rtc_0_id, action_tbl->rtc_1_id, + action_tbl->stc.offset); + + mlx5hws_action_free_single_stc(ctx, MLX5HWS_TABLE_TYPE_FDB, + &action_tbl->stc); + hws_action_ste_table_destroy_rtcs(action_tbl); + mlx5hws_pool_destroy(action_tbl->pool); + + list_del(&action_tbl->list_node); + kfree(action_tbl); +} 
+ +static int +hws_action_ste_pool_element_init(struct mlx5hws_context *ctx, + struct mlx5hws_action_ste_pool_element *elem, + enum mlx5hws_pool_optimize opt) +{ + elem->ctx = ctx; + elem->opt = opt; + INIT_LIST_HEAD(&elem->available); + INIT_LIST_HEAD(&elem->full); + + return 0; +} + +static void hws_action_ste_pool_element_destroy( + struct mlx5hws_action_ste_pool_element *elem) +{ + struct mlx5hws_action_ste_table *action_tbl, *p; + + /* This should be empty, but attempt to free its elements anyway. */ + list_for_each_entry_safe(action_tbl, p, &elem->full, list_node) + hws_action_ste_table_destroy(action_tbl); + + list_for_each_entry_safe(action_tbl, p, &elem->available, list_node) + hws_action_ste_table_destroy(action_tbl); +} + +static int hws_action_ste_pool_init(struct mlx5hws_context *ctx, + struct mlx5hws_action_ste_pool *pool) +{ + enum mlx5hws_pool_optimize opt; + int err; + + mutex_init(&pool->lock); + + /* Rules which are added for both RX and TX must use the same action STE + * indices for both. If we were to use a single table, then RX-only and + * TX-only rules would waste the unused entries. Thus, we use separate + * table sets for the three cases. 
+ */ + for (opt = MLX5HWS_POOL_OPTIMIZE_NONE; opt < MLX5HWS_POOL_OPTIMIZE_MAX; + opt++) { + err = hws_action_ste_pool_element_init(ctx, &pool->elems[opt], + opt); + if (err) + goto destroy_elems; + pool->elems[opt].parent_pool = pool; + } + + return 0; + +destroy_elems: + while (opt-- > MLX5HWS_POOL_OPTIMIZE_NONE) + hws_action_ste_pool_element_destroy(&pool->elems[opt]); + + return err; +} + +static void hws_action_ste_pool_destroy(struct mlx5hws_action_ste_pool *pool) +{ + int opt; + + for (opt = MLX5HWS_POOL_OPTIMIZE_MAX - 1; + opt >= MLX5HWS_POOL_OPTIMIZE_NONE; opt--) + hws_action_ste_pool_element_destroy(&pool->elems[opt]); +} + +static void hws_action_ste_pool_element_collect_stale( + struct mlx5hws_action_ste_pool_element *elem, struct list_head *cleanup) +{ + struct mlx5hws_action_ste_table *action_tbl, *p; + unsigned long expire_time, now; + + expire_time = secs_to_jiffies(MLX5HWS_ACTION_STE_POOL_EXPIRE_SECONDS); + now = jiffies; + + list_for_each_entry_safe(action_tbl, p, &elem->available, list_node) { + if (mlx5hws_pool_full(action_tbl->pool) && + time_before(action_tbl->last_used + expire_time, now)) + list_move(&action_tbl->list_node, cleanup); + } +} + +static void hws_action_ste_table_cleanup_list(struct list_head *cleanup) +{ + struct mlx5hws_action_ste_table *action_tbl, *p; + + list_for_each_entry_safe(action_tbl, p, cleanup, list_node) + hws_action_ste_table_destroy(action_tbl); +} + +static void hws_action_ste_pool_cleanup(struct work_struct *work) +{ + enum mlx5hws_pool_optimize opt; + struct mlx5hws_context *ctx; + LIST_HEAD(cleanup); + int i; + + ctx = container_of(work, struct mlx5hws_context, + action_ste_cleanup.work); + + for (i = 0; i < ctx->queues; i++) { + struct mlx5hws_action_ste_pool *p = &ctx->action_ste_pool[i]; + + mutex_lock(&p->lock); + for (opt = MLX5HWS_POOL_OPTIMIZE_NONE; + opt < MLX5HWS_POOL_OPTIMIZE_MAX; opt++) + hws_action_ste_pool_element_collect_stale( + &p->elems[opt], &cleanup); + mutex_unlock(&p->lock); + } + + 
hws_action_ste_table_cleanup_list(&cleanup); + + schedule_delayed_work(&ctx->action_ste_cleanup, + secs_to_jiffies( + MLX5HWS_ACTION_STE_POOL_CLEANUP_SECONDS)); +} + +int mlx5hws_action_ste_pool_init(struct mlx5hws_context *ctx) +{ + struct mlx5hws_action_ste_pool *pool; + size_t queues = ctx->queues; + int i, err; + + pool = kcalloc(queues, sizeof(*pool), GFP_KERNEL); + if (!pool) + return -ENOMEM; + + for (i = 0; i < queues; i++) { + err = hws_action_ste_pool_init(ctx, &pool[i]); + if (err) + goto free_pool; + } + + ctx->action_ste_pool = pool; + + INIT_DELAYED_WORK(&ctx->action_ste_cleanup, + hws_action_ste_pool_cleanup); + schedule_delayed_work( + &ctx->action_ste_cleanup, + secs_to_jiffies(MLX5HWS_ACTION_STE_POOL_CLEANUP_SECONDS)); + + return 0; + +free_pool: + while (i--) + hws_action_ste_pool_destroy(&pool[i]); + kfree(pool); + + return err; +} + +void mlx5hws_action_ste_pool_uninit(struct mlx5hws_context *ctx) +{ + size_t queues = ctx->queues; + int i; + + cancel_delayed_work_sync(&ctx->action_ste_cleanup); + + for (i = 0; i < queues; i++) + hws_action_ste_pool_destroy(&ctx->action_ste_pool[i]); + + kfree(ctx->action_ste_pool); +} + +static struct mlx5hws_action_ste_pool_element * +hws_action_ste_choose_elem(struct mlx5hws_action_ste_pool *pool, + bool skip_rx, bool skip_tx) +{ + if (skip_rx) + return &pool->elems[MLX5HWS_POOL_OPTIMIZE_MIRROR]; + + if (skip_tx) + return &pool->elems[MLX5HWS_POOL_OPTIMIZE_ORIG]; + + return &pool->elems[MLX5HWS_POOL_OPTIMIZE_NONE]; +} + +static int +hws_action_ste_table_chunk_alloc(struct mlx5hws_action_ste_table *action_tbl, + struct mlx5hws_action_ste_chunk *chunk) +{ + int err; + + err = mlx5hws_pool_chunk_alloc(action_tbl->pool, &chunk->ste); + if (err) + return err; + + chunk->action_tbl = action_tbl; + action_tbl->last_used = jiffies; + + return 0; +} + +int mlx5hws_action_ste_chunk_alloc(struct mlx5hws_action_ste_pool *pool, + bool skip_rx, bool skip_tx, + struct mlx5hws_action_ste_chunk *chunk) +{ + struct 
mlx5hws_action_ste_pool_element *elem; + struct mlx5hws_action_ste_table *action_tbl; + bool found; + int err; + + if (skip_rx && skip_tx) + return -EINVAL; + + mutex_lock(&pool->lock); + + elem = hws_action_ste_choose_elem(pool, skip_rx, skip_tx); + + mlx5hws_dbg(elem->ctx, + "Allocating action STEs skip_rx %d skip_tx %d order %d\n", + skip_rx, skip_tx, chunk->ste.order); + + found = false; + list_for_each_entry(action_tbl, &elem->available, list_node) { + if (!hws_action_ste_table_chunk_alloc(action_tbl, chunk)) { + found = true; + break; + } + } + + if (!found) { + action_tbl = hws_action_ste_table_alloc(elem); + if (IS_ERR(action_tbl)) { + err = PTR_ERR(action_tbl); + goto out; + } + + err = hws_action_ste_table_chunk_alloc(action_tbl, chunk); + if (err) + goto out; + } + + if (mlx5hws_pool_empty(action_tbl->pool)) + list_move(&action_tbl->list_node, &elem->full); + + err = 0; + +out: + mutex_unlock(&pool->lock); + + return err; +} + +void mlx5hws_action_ste_chunk_free(struct mlx5hws_action_ste_chunk *chunk) +{ + struct mutex *lock = &chunk->action_tbl->parent_elem->parent_pool->lock; + + mlx5hws_dbg(chunk->action_tbl->pool->ctx, + "Freeing action STEs offset %d order %d\n", + chunk->ste.offset, chunk->ste.order); + + mutex_lock(lock); + mlx5hws_pool_chunk_free(chunk->action_tbl->pool, &chunk->ste); + chunk->action_tbl->last_used = jiffies; + list_move(&chunk->action_tbl->list_node, + &chunk->action_tbl->parent_elem->available); + mutex_unlock(lock); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.h new file mode 100644 index 0000000000..a8ba97359e --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2025 NVIDIA Corporation & Affiliates */ + +#ifndef ACTION_STE_POOL_H_ +#define ACTION_STE_POOL_H_ + +#define 
MLX5HWS_ACTION_STE_TABLE_INIT_LOG_SZ 10 +#define MLX5HWS_ACTION_STE_TABLE_STEP_LOG_SZ 1 +#define MLX5HWS_ACTION_STE_TABLE_MAX_LOG_SZ 20 + +#define MLX5HWS_ACTION_STE_POOL_CLEANUP_SECONDS 300 +#define MLX5HWS_ACTION_STE_POOL_EXPIRE_SECONDS 300 + +struct mlx5hws_action_ste_pool_element; + +struct mlx5hws_action_ste_table { + struct mlx5hws_action_ste_pool_element *parent_elem; + /* Wraps the RTC and STE range for this given action. */ + struct mlx5hws_pool *pool; + /* Match STEs use this STC to jump to this pool's RTC. */ + struct mlx5hws_pool_chunk stc; + u32 rtc_0_id; + u32 rtc_1_id; + struct list_head list_node; + unsigned long last_used; +}; + +struct mlx5hws_action_ste_pool_element { + struct mlx5hws_context *ctx; + struct mlx5hws_action_ste_pool *parent_pool; + size_t log_sz; /* Size of the largest table so far. */ + enum mlx5hws_pool_optimize opt; + struct list_head available; + struct list_head full; +}; + +/* Central repository of action STEs. The context contains one of these pools + * per queue. + */ +struct mlx5hws_action_ste_pool { + /* Protects the entire pool. We have one pool per queue and only one + * operation can be active per rule at a given time. Thus this lock + * protects solely against concurrent garbage collection and we expect + * very little contention. + */ + struct mutex lock; + struct mlx5hws_action_ste_pool_element elems[MLX5HWS_POOL_OPTIMIZE_MAX]; +}; + +/* A chunk of STEs and the table it was allocated from. Used by rules. */ +struct mlx5hws_action_ste_chunk { + struct mlx5hws_action_ste_table *action_tbl; + struct mlx5hws_pool_chunk ste; +}; + +int mlx5hws_action_ste_pool_init(struct mlx5hws_context *ctx); + +void mlx5hws_action_ste_pool_uninit(struct mlx5hws_context *ctx); + +/* Callers are expected to fill chunk->ste.order. On success, this function + * populates chunk->action_tbl and chunk->ste.offset.
+ */ +int mlx5hws_action_ste_chunk_alloc(struct mlx5hws_action_ste_pool *pool, + bool skip_rx, bool skip_tx, + struct mlx5hws_action_ste_chunk *chunk); + +void mlx5hws_action_ste_chunk_free(struct mlx5hws_action_ste_chunk *chunk); + +#endif /* ACTION_STE_POOL_H_ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c index 19dce1ba51..6ef0c4be27 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c @@ -46,9 +46,10 @@ static void hws_bwc_unlock_all_queues(struct mlx5hws_context *ctx) } } -static void hws_bwc_matcher_init_attr(struct mlx5hws_matcher_attr *attr, +static void hws_bwc_matcher_init_attr(struct mlx5hws_bwc_matcher *bwc_matcher, u32 priority, - u8 size_log) + u8 size_log_rx, u8 size_log_tx, + struct mlx5hws_matcher_attr *attr) { memset(attr, 0, sizeof(*attr)); @@ -58,11 +59,170 @@ static void hws_bwc_matcher_init_attr(struct mlx5hws_matcher_attr *attr, attr->optimize_flow_src = MLX5HWS_MATCHER_FLOW_SRC_ANY; attr->insert_mode = MLX5HWS_MATCHER_INSERT_BY_HASH; attr->distribute_mode = MLX5HWS_MATCHER_DISTRIBUTE_BY_HASH; - attr->rule.num_log = size_log; + attr->size[MLX5HWS_MATCHER_SIZE_TYPE_RX].rule.num_log = size_log_rx; + attr->size[MLX5HWS_MATCHER_SIZE_TYPE_TX].rule.num_log = size_log_tx; attr->resizable = true; attr->max_num_of_at_attach = MLX5HWS_BWC_MATCHER_ATTACH_AT_NUM; } +static int +hws_bwc_matcher_move_all_simple(struct mlx5hws_bwc_matcher *bwc_matcher) +{ + struct mlx5hws_context *ctx = bwc_matcher->matcher->tbl->ctx; + struct mlx5hws_matcher *matcher = bwc_matcher->matcher; + int drain_error = 0, move_error = 0, poll_error = 0; + u16 bwc_queues = mlx5hws_bwc_queues(ctx); + struct mlx5hws_rule_attr rule_attr; + struct mlx5hws_bwc_rule *bwc_rule; + struct mlx5hws_send_engine *queue; + struct list_head *rules_list; + u32 pending_rules; + int i, ret = 0; + bool drain; + + 
mlx5hws_bwc_rule_fill_attr(bwc_matcher, 0, 0, &rule_attr); + + for (i = 0; i < bwc_queues; i++) { + if (list_empty(&bwc_matcher->rules[i])) + continue; + + pending_rules = 0; + rule_attr.queue_id = mlx5hws_bwc_get_queue_id(ctx, i); + rules_list = &bwc_matcher->rules[i]; + + list_for_each_entry(bwc_rule, rules_list, list_node) { + ret = mlx5hws_matcher_resize_rule_move(matcher, + bwc_rule->rule, + &rule_attr); + if (unlikely(ret)) { + if (!move_error) { + mlx5hws_err(ctx, + "Moving BWC rule: move failed (%d), attempting to move rest of the rules\n", + ret); + move_error = ret; + } + /* Rule wasn't queued, no need to poll */ + continue; + } + + pending_rules++; + drain = pending_rules >= + hws_bwc_get_burst_th(ctx, rule_attr.queue_id); + ret = mlx5hws_bwc_queue_poll(ctx, + rule_attr.queue_id, + &pending_rules, + drain); + if (unlikely(ret)) { + if (ret == -ETIMEDOUT) { + mlx5hws_err(ctx, + "Moving BWC rule: timeout polling for completions (%d), aborting rehash\n", + ret); + return ret; + } + if (!poll_error) { + mlx5hws_err(ctx, + "Moving BWC rule: polling for completions failed (%d), attempting to move rest of the rules\n", + ret); + poll_error = ret; + } + } + } + + if (pending_rules) { + queue = &ctx->send_queue[rule_attr.queue_id]; + mlx5hws_send_engine_flush_queue(queue); + ret = mlx5hws_bwc_queue_poll(ctx, + rule_attr.queue_id, + &pending_rules, + true); + if (unlikely(ret)) { + if (ret == -ETIMEDOUT) { + mlx5hws_err(ctx, + "Moving bwc rule: timeout draining completions (%d), aborting rehash\n", + ret); + return ret; + } + if (!drain_error) { + mlx5hws_err(ctx, + "Moving bwc rule: drain failed (%d), attempting to move rest of the rules\n", + ret); + drain_error = ret; + } + } + } + } + + /* Return the first error that happened */ + if (unlikely(move_error)) + return move_error; + if (unlikely(poll_error)) + return poll_error; + if (unlikely(drain_error)) + return drain_error; + + return ret; +} + +static int hws_bwc_matcher_move_all(struct mlx5hws_bwc_matcher 
*bwc_matcher) +{ + switch (bwc_matcher->matcher_type) { + case MLX5HWS_BWC_MATCHER_SIMPLE: + return hws_bwc_matcher_move_all_simple(bwc_matcher); + case MLX5HWS_BWC_MATCHER_COMPLEX_FIRST: + return mlx5hws_bwc_matcher_complex_move_first(bwc_matcher); + case MLX5HWS_BWC_MATCHER_COMPLEX_SUBMATCHER: + return mlx5hws_bwc_matcher_complex_move(bwc_matcher); + default: + return -EINVAL; + } +} + +static int hws_bwc_matcher_move(struct mlx5hws_bwc_matcher *bwc_matcher) +{ + struct mlx5hws_context *ctx = bwc_matcher->matcher->tbl->ctx; + struct mlx5hws_matcher_attr matcher_attr = {0}; + struct mlx5hws_matcher *old_matcher; + struct mlx5hws_matcher *new_matcher; + int ret; + + hws_bwc_matcher_init_attr(bwc_matcher, + bwc_matcher->priority, + bwc_matcher->rx_size.size_log, + bwc_matcher->tx_size.size_log, + &matcher_attr); + + old_matcher = bwc_matcher->matcher; + new_matcher = mlx5hws_matcher_create(old_matcher->tbl, + &bwc_matcher->mt, 1, + bwc_matcher->at, + bwc_matcher->num_of_at, + &matcher_attr); + if (!new_matcher) { + mlx5hws_err(ctx, "Rehash error: matcher creation failed\n"); + return -ENOMEM; + } + + ret = mlx5hws_matcher_resize_set_target(old_matcher, new_matcher); + if (ret) { + mlx5hws_err(ctx, "Rehash error: failed setting resize target\n"); + return ret; + } + + ret = hws_bwc_matcher_move_all(bwc_matcher); + if (ret) + mlx5hws_err(ctx, "Rehash error: moving rules failed, attempting to remove the old matcher\n"); + + /* Error during rehash can't be rolled back. + * The best option here is to allow the rehash to complete and remove + * the old matcher - can't leave the matcher in the 'in_resize' state. 
+ */ + + bwc_matcher->matcher = new_matcher; + mlx5hws_matcher_destroy(old_matcher); + + return ret; +} + int mlx5hws_bwc_matcher_create_simple(struct mlx5hws_bwc_matcher *bwc_matcher, struct mlx5hws_table *table, u32 priority, @@ -83,12 +243,20 @@ int mlx5hws_bwc_matcher_create_simple(struct mlx5hws_bwc_matcher *bwc_matcher, for (i = 0; i < bwc_queues; i++) INIT_LIST_HEAD(&bwc_matcher->rules[i]); - hws_bwc_matcher_init_attr(&attr, + hws_bwc_matcher_init_attr(bwc_matcher, priority, - MLX5HWS_BWC_MATCHER_INIT_SIZE_LOG); + bwc_matcher->rx_size.size_log, + bwc_matcher->tx_size.size_log, + &attr); + bwc_matcher->matcher_type = MLX5HWS_BWC_MATCHER_SIMPLE; bwc_matcher->priority = priority; - bwc_matcher->size_log = MLX5HWS_BWC_MATCHER_INIT_SIZE_LOG; + + bwc_matcher->size_of_at_array = MLX5HWS_BWC_MATCHER_ATTACH_AT_NUM; + bwc_matcher->at = kcalloc(bwc_matcher->size_of_at_array, + sizeof(*bwc_matcher->at), GFP_KERNEL); + if (!bwc_matcher->at) + goto free_bwc_matcher_rules; /* create dummy action template */ bwc_matcher->at[0] = @@ -96,7 +264,7 @@ int mlx5hws_bwc_matcher_create_simple(struct mlx5hws_bwc_matcher *bwc_matcher, action_types : init_action_types); if (!bwc_matcher->at[0]) { mlx5hws_err(table->ctx, "BWC matcher: failed creating action template\n"); - goto free_bwc_matcher_rules; + goto free_bwc_matcher_at_array; } bwc_matcher->num_of_at = 1; @@ -126,12 +294,28 @@ free_mt: mlx5hws_match_template_destroy(bwc_matcher->mt); free_at: mlx5hws_action_template_destroy(bwc_matcher->at[0]); +free_bwc_matcher_at_array: + kfree(bwc_matcher->at); free_bwc_matcher_rules: kfree(bwc_matcher->rules); err: return -EINVAL; } +static void +hws_bwc_matcher_init_size_rxtx(struct mlx5hws_bwc_matcher_size *size) +{ + size->size_log = MLX5HWS_BWC_MATCHER_INIT_SIZE_LOG; + atomic_set(&size->num_of_rules, 0); + atomic_set(&size->rehash_required, false); +} + +static void hws_bwc_matcher_init_size(struct mlx5hws_bwc_matcher *bwc_matcher) +{ + 
hws_bwc_matcher_init_size_rxtx(&bwc_matcher->rx_size); + hws_bwc_matcher_init_size_rxtx(&bwc_matcher->tx_size); +} + struct mlx5hws_bwc_matcher * mlx5hws_bwc_matcher_create(struct mlx5hws_table *table, u32 priority, @@ -152,7 +336,7 @@ mlx5hws_bwc_matcher_create(struct mlx5hws_table *table, if (!bwc_matcher) return NULL; - atomic_set(&bwc_matcher->num_of_rules, 0); + hws_bwc_matcher_init_size(bwc_matcher); /* Check if the required match params can be all matched * in single STE, otherwise complex matcher is needed. @@ -192,6 +376,7 @@ int mlx5hws_bwc_matcher_destroy_simple(struct mlx5hws_bwc_matcher *bwc_matcher) for (i = 0; i < bwc_matcher->num_of_at; i++) mlx5hws_action_template_destroy(bwc_matcher->at[i]); + kfree(bwc_matcher->at); mlx5hws_match_template_destroy(bwc_matcher->mt); kfree(bwc_matcher->rules); @@ -201,23 +386,27 @@ int mlx5hws_bwc_matcher_destroy_simple(struct mlx5hws_bwc_matcher *bwc_matcher) int mlx5hws_bwc_matcher_destroy(struct mlx5hws_bwc_matcher *bwc_matcher) { - u32 num_of_rules = atomic_read(&bwc_matcher->num_of_rules); + u32 rx_rules = atomic_read(&bwc_matcher->rx_size.num_of_rules); + u32 tx_rules = atomic_read(&bwc_matcher->tx_size.num_of_rules); - if (num_of_rules) + if (rx_rules || tx_rules) mlx5hws_err(bwc_matcher->matcher->tbl->ctx, - "BWC matcher destroy: matcher still has %d rules\n", - num_of_rules); + "BWC matcher destroy: matcher still has %u RX and %u TX rules\n", + rx_rules, tx_rules); - mlx5hws_bwc_matcher_destroy_simple(bwc_matcher); + if (bwc_matcher->matcher_type == MLX5HWS_BWC_MATCHER_COMPLEX_FIRST) + mlx5hws_bwc_matcher_destroy_complex(bwc_matcher); + else + mlx5hws_bwc_matcher_destroy_simple(bwc_matcher); kfree(bwc_matcher); return 0; } -static int hws_bwc_queue_poll(struct mlx5hws_context *ctx, - u16 queue_id, - u32 *pending_rules, - bool drain) +int mlx5hws_bwc_queue_poll(struct mlx5hws_context *ctx, + u16 queue_id, + u32 *pending_rules, + bool drain) { unsigned long timeout = jiffies + 
secs_to_jiffies(MLX5HWS_BWC_POLLING_TIMEOUT); @@ -320,16 +509,12 @@ static void hws_bwc_rule_list_add(struct mlx5hws_bwc_rule *bwc_rule, u16 idx) { struct mlx5hws_bwc_matcher *bwc_matcher = bwc_rule->bwc_matcher; - atomic_inc(&bwc_matcher->num_of_rules); bwc_rule->bwc_queue_idx = idx; list_add(&bwc_rule->list_node, &bwc_matcher->rules[idx]); } static void hws_bwc_rule_list_remove(struct mlx5hws_bwc_rule *bwc_rule) { - struct mlx5hws_bwc_matcher *bwc_matcher = bwc_rule->bwc_matcher; - - atomic_dec(&bwc_matcher->num_of_rules); list_del_init(&bwc_rule->list_node); } @@ -352,7 +537,8 @@ hws_bwc_rule_destroy_hws_sync(struct mlx5hws_bwc_rule *bwc_rule, if (unlikely(ret)) return ret; - ret = hws_bwc_queue_poll(ctx, rule_attr->queue_id, &expected_completions, true); + ret = mlx5hws_bwc_queue_poll(ctx, rule_attr->queue_id, + &expected_completions, true); if (unlikely(ret)) return ret; @@ -366,6 +552,80 @@ hws_bwc_rule_destroy_hws_sync(struct mlx5hws_bwc_rule *bwc_rule, return 0; } +static void hws_bwc_rule_cnt_dec(struct mlx5hws_bwc_rule *bwc_rule) +{ + struct mlx5hws_bwc_matcher *bwc_matcher = bwc_rule->bwc_matcher; + + if (!bwc_rule->skip_rx) + atomic_dec(&bwc_matcher->rx_size.num_of_rules); + if (!bwc_rule->skip_tx) + atomic_dec(&bwc_matcher->tx_size.num_of_rules); +} + +static int +hws_bwc_matcher_rehash_shrink(struct mlx5hws_bwc_matcher *bwc_matcher) +{ + struct mlx5hws_bwc_matcher_size *rx_size = &bwc_matcher->rx_size; + struct mlx5hws_bwc_matcher_size *tx_size = &bwc_matcher->tx_size; + + /* It is possible that another thread has added a rule. + * Need to check again if we really need rehash/shrink. + */ + if (atomic_read(&rx_size->num_of_rules) || + atomic_read(&tx_size->num_of_rules)) + return 0; + + /* If the current matcher RX/TX size is already at its initial size. 
*/ + if (rx_size->size_log == MLX5HWS_BWC_MATCHER_INIT_SIZE_LOG && + tx_size->size_log == MLX5HWS_BWC_MATCHER_INIT_SIZE_LOG) + return 0; + + /* Now we've done all the checking - do the shrinking: + * - reset match RTC size to the initial size + * - create new matcher + * - move the rules, which will not do anything as the matcher is empty + * - destroy the old matcher + */ + + rx_size->size_log = MLX5HWS_BWC_MATCHER_INIT_SIZE_LOG; + tx_size->size_log = MLX5HWS_BWC_MATCHER_INIT_SIZE_LOG; + + return hws_bwc_matcher_move(bwc_matcher); +} + +static int hws_bwc_rule_cnt_dec_with_shrink(struct mlx5hws_bwc_rule *bwc_rule, + u16 bwc_queue_idx) +{ + struct mlx5hws_bwc_matcher *bwc_matcher = bwc_rule->bwc_matcher; + struct mlx5hws_context *ctx = bwc_matcher->matcher->tbl->ctx; + struct mutex *queue_lock; /* Protect the queue */ + int ret; + + hws_bwc_rule_cnt_dec(bwc_rule); + + if (atomic_read(&bwc_matcher->rx_size.num_of_rules) || + atomic_read(&bwc_matcher->tx_size.num_of_rules)) + return 0; + + /* Matcher has no more rules - shrink it to save ICM. 
*/ + + queue_lock = hws_bwc_get_queue_lock(ctx, bwc_queue_idx); + mutex_unlock(queue_lock); + + hws_bwc_lock_all_queues(ctx); + ret = hws_bwc_matcher_rehash_shrink(bwc_matcher); + hws_bwc_unlock_all_queues(ctx); + + mutex_lock(queue_lock); + + if (unlikely(ret)) + mlx5hws_err(ctx, + "BWC rule deletion: shrinking empty matcher failed (%d)\n", + ret); + + return ret; +} + int mlx5hws_bwc_rule_destroy_simple(struct mlx5hws_bwc_rule *bwc_rule) { struct mlx5hws_bwc_matcher *bwc_matcher = bwc_rule->bwc_matcher; @@ -383,6 +643,7 @@ int mlx5hws_bwc_rule_destroy_simple(struct mlx5hws_bwc_rule *bwc_rule) ret = hws_bwc_rule_destroy_hws_sync(bwc_rule, &attr); hws_bwc_rule_list_remove(bwc_rule); + hws_bwc_rule_cnt_dec_with_shrink(bwc_rule, idx); mutex_unlock(queue_lock); @@ -391,9 +652,14 @@ int mlx5hws_bwc_rule_destroy_simple(struct mlx5hws_bwc_rule *bwc_rule) int mlx5hws_bwc_rule_destroy(struct mlx5hws_bwc_rule *bwc_rule) { - int ret; + bool is_complex = bwc_rule->bwc_matcher->matcher_type == + MLX5HWS_BWC_MATCHER_COMPLEX_FIRST; + int ret = 0; - ret = mlx5hws_bwc_rule_destroy_simple(bwc_rule); + if (is_complex) + ret = mlx5hws_bwc_rule_destroy_complex(bwc_rule); + else + ret = mlx5hws_bwc_rule_destroy_simple(bwc_rule); mlx5hws_bwc_rule_free(bwc_rule); return ret; @@ -433,9 +699,8 @@ hws_bwc_rule_create_sync(struct mlx5hws_bwc_rule *bwc_rule, if (unlikely(ret)) return ret; - ret = hws_bwc_queue_poll(ctx, rule_attr->queue_id, &expected_completions, true); - - return ret; + return mlx5hws_bwc_queue_poll(ctx, rule_attr->queue_id, + &expected_completions, true); } static int @@ -456,7 +721,8 @@ hws_bwc_rule_update_sync(struct mlx5hws_bwc_rule *bwc_rule, if (unlikely(ret)) return ret; - ret = hws_bwc_queue_poll(ctx, rule_attr->queue_id, &expected_completions, true); + ret = mlx5hws_bwc_queue_poll(ctx, rule_attr->queue_id, + &expected_completions, true); if (unlikely(ret)) mlx5hws_err(ctx, "Failed updating BWC rule (%d)\n", ret); @@ -464,37 +730,27 @@ hws_bwc_rule_update_sync(struct 
mlx5hws_bwc_rule *bwc_rule, } static bool -hws_bwc_matcher_size_maxed_out(struct mlx5hws_bwc_matcher *bwc_matcher) +hws_bwc_matcher_size_maxed_out(struct mlx5hws_bwc_matcher *bwc_matcher, + struct mlx5hws_bwc_matcher_size *size) { struct mlx5hws_cmd_query_caps *caps = bwc_matcher->matcher->tbl->ctx->caps; /* check the match RTC size */ - if ((bwc_matcher->size_log + - MLX5HWS_MATCHER_ASSURED_MAIN_TBL_DEPTH + - MLX5HWS_BWC_MATCHER_SIZE_LOG_STEP) > - (caps->ste_alloc_log_max - 1)) - return true; - - /* check the action RTC size */ - if ((bwc_matcher->size_log + - MLX5HWS_BWC_MATCHER_SIZE_LOG_STEP + - ilog2(roundup_pow_of_two(bwc_matcher->matcher->action_ste.max_stes)) + - MLX5HWS_MATCHER_ACTION_RTC_UPDATE_MULT) > - (caps->ste_alloc_log_max - 1)) - return true; - - return false; + return (size->size_log + MLX5HWS_MATCHER_ASSURED_MAIN_TBL_DEPTH + + MLX5HWS_BWC_MATCHER_SIZE_LOG_STEP) > + (caps->ste_alloc_log_max - 1); } static bool hws_bwc_matcher_rehash_size_needed(struct mlx5hws_bwc_matcher *bwc_matcher, + struct mlx5hws_bwc_matcher_size *size, u32 num_of_rules) { - if (unlikely(hws_bwc_matcher_size_maxed_out(bwc_matcher))) + if (unlikely(hws_bwc_matcher_size_maxed_out(bwc_matcher, size))) return false; if (unlikely((num_of_rules * 100 / MLX5HWS_BWC_MATCHER_REHASH_PERCENT_TH) >= - (1UL << bwc_matcher->size_log))) + (1UL << size->size_log))) return true; return false; @@ -520,6 +776,23 @@ hws_bwc_matcher_extend_at(struct mlx5hws_bwc_matcher *bwc_matcher, struct mlx5hws_rule_action rule_actions[]) { enum mlx5hws_action_type action_types[MLX5HWS_BWC_MAX_ACTS]; + void *p; + + if (unlikely(bwc_matcher->num_of_at >= bwc_matcher->size_of_at_array)) { + if (bwc_matcher->size_of_at_array >= MLX5HWS_MATCHER_MAX_AT) + return -ENOMEM; + bwc_matcher->size_of_at_array *= 2; + p = krealloc(bwc_matcher->at, + bwc_matcher->size_of_at_array * + sizeof(*bwc_matcher->at), + __GFP_ZERO | GFP_KERNEL); + if (!p) { + bwc_matcher->size_of_at_array /= 2; + return -ENOMEM; + } + + 
bwc_matcher->at = p; + } hws_bwc_rule_actions_to_action_types(rule_actions, action_types); @@ -534,20 +807,21 @@ hws_bwc_matcher_extend_at(struct mlx5hws_bwc_matcher *bwc_matcher, } static int -hws_bwc_matcher_extend_size(struct mlx5hws_bwc_matcher *bwc_matcher) +hws_bwc_matcher_extend_size(struct mlx5hws_bwc_matcher *bwc_matcher, + struct mlx5hws_bwc_matcher_size *size) { struct mlx5hws_context *ctx = bwc_matcher->matcher->tbl->ctx; struct mlx5hws_cmd_query_caps *caps = ctx->caps; - if (unlikely(hws_bwc_matcher_size_maxed_out(bwc_matcher))) { + if (unlikely(hws_bwc_matcher_size_maxed_out(bwc_matcher, size))) { mlx5hws_err(ctx, "Can't resize matcher: depth exceeds limit %d\n", caps->rtc_log_depth_max); return -ENOMEM; } - bwc_matcher->size_log = - min(bwc_matcher->size_log + MLX5HWS_BWC_MATCHER_SIZE_LOG_STEP, - caps->ste_alloc_log_max - MLX5HWS_MATCHER_ASSURED_MAIN_TBL_DEPTH); + size->size_log = min(size->size_log + MLX5HWS_BWC_MATCHER_SIZE_LOG_STEP, + caps->ste_alloc_log_max - + MLX5HWS_MATCHER_ASSURED_MAIN_TBL_DEPTH); return 0; } @@ -580,163 +854,42 @@ hws_bwc_matcher_find_at(struct mlx5hws_bwc_matcher *bwc_matcher, return -1; } -static int hws_bwc_matcher_move_all_simple(struct mlx5hws_bwc_matcher *bwc_matcher) -{ - struct mlx5hws_context *ctx = bwc_matcher->matcher->tbl->ctx; - u16 bwc_queues = mlx5hws_bwc_queues(ctx); - struct mlx5hws_bwc_rule **bwc_rules; - struct mlx5hws_rule_attr rule_attr; - u32 *pending_rules; - int i, j, ret = 0; - bool all_done; - u16 burst_th; - - mlx5hws_bwc_rule_fill_attr(bwc_matcher, 0, 0, &rule_attr); - - pending_rules = kcalloc(bwc_queues, sizeof(*pending_rules), GFP_KERNEL); - if (!pending_rules) - return -ENOMEM; - - bwc_rules = kcalloc(bwc_queues, sizeof(*bwc_rules), GFP_KERNEL); - if (!bwc_rules) { - ret = -ENOMEM; - goto free_pending_rules; - } - - for (i = 0; i < bwc_queues; i++) { - if (list_empty(&bwc_matcher->rules[i])) - bwc_rules[i] = NULL; - else - bwc_rules[i] = list_first_entry(&bwc_matcher->rules[i], - struct 
mlx5hws_bwc_rule, - list_node); - } - - do { - all_done = true; - - for (i = 0; i < bwc_queues; i++) { - rule_attr.queue_id = mlx5hws_bwc_get_queue_id(ctx, i); - burst_th = hws_bwc_get_burst_th(ctx, rule_attr.queue_id); - - for (j = 0; j < burst_th && bwc_rules[i]; j++) { - rule_attr.burst = !!((j + 1) % burst_th); - ret = mlx5hws_matcher_resize_rule_move(bwc_matcher->matcher, - bwc_rules[i]->rule, - &rule_attr); - if (unlikely(ret)) { - mlx5hws_err(ctx, - "Moving BWC rule failed during rehash (%d)\n", - ret); - goto free_bwc_rules; - } - - all_done = false; - pending_rules[i]++; - bwc_rules[i] = list_is_last(&bwc_rules[i]->list_node, - &bwc_matcher->rules[i]) ? - NULL : list_next_entry(bwc_rules[i], list_node); - - ret = hws_bwc_queue_poll(ctx, rule_attr.queue_id, - &pending_rules[i], false); - if (unlikely(ret)) { - mlx5hws_err(ctx, - "Moving BWC rule failed during rehash (%d)\n", - ret); - goto free_bwc_rules; - } - } - } - } while (!all_done); - - /* drain all the bwc queues */ - for (i = 0; i < bwc_queues; i++) { - if (pending_rules[i]) { - u16 queue_id = mlx5hws_bwc_get_queue_id(ctx, i); - - mlx5hws_send_engine_flush_queue(&ctx->send_queue[queue_id]); - ret = hws_bwc_queue_poll(ctx, queue_id, - &pending_rules[i], true); - if (unlikely(ret)) { - mlx5hws_err(ctx, - "Moving BWC rule failed during rehash (%d)\n", ret); - goto free_bwc_rules; - } - } - } - -free_bwc_rules: - kfree(bwc_rules); -free_pending_rules: - kfree(pending_rules); - - return ret; -} - -static int hws_bwc_matcher_move_all(struct mlx5hws_bwc_matcher *bwc_matcher) -{ - return hws_bwc_matcher_move_all_simple(bwc_matcher); -} - -static int hws_bwc_matcher_move(struct mlx5hws_bwc_matcher *bwc_matcher) -{ - struct mlx5hws_context *ctx = bwc_matcher->matcher->tbl->ctx; - struct mlx5hws_matcher_attr matcher_attr = {0}; - struct mlx5hws_matcher *old_matcher; - struct mlx5hws_matcher *new_matcher; - int ret; - - hws_bwc_matcher_init_attr(&matcher_attr, - bwc_matcher->priority, - bwc_matcher->size_log); 
- - old_matcher = bwc_matcher->matcher; - new_matcher = mlx5hws_matcher_create(old_matcher->tbl, - &bwc_matcher->mt, 1, - bwc_matcher->at, - bwc_matcher->num_of_at, - &matcher_attr); - if (!new_matcher) { - mlx5hws_err(ctx, "Rehash error: matcher creation failed\n"); - return -ENOMEM; - } - - ret = mlx5hws_matcher_resize_set_target(old_matcher, new_matcher); - if (ret) { - mlx5hws_err(ctx, "Rehash error: failed setting resize target\n"); - return ret; - } - - ret = hws_bwc_matcher_move_all(bwc_matcher); - if (ret) { - mlx5hws_err(ctx, "Rehash error: moving rules failed\n"); - return -ENOMEM; - } - - bwc_matcher->matcher = new_matcher; - mlx5hws_matcher_destroy(old_matcher); - - return 0; -} - static int hws_bwc_matcher_rehash_size(struct mlx5hws_bwc_matcher *bwc_matcher) { + bool need_rx_rehash, need_tx_rehash; int ret; - /* If the current matcher size is already at its max size, we can't - * do the rehash. Skip it and try adding the rule again - perhaps - * there was some change. + need_rx_rehash = atomic_read(&bwc_matcher->rx_size.rehash_required); + need_tx_rehash = atomic_read(&bwc_matcher->tx_size.rehash_required); + + /* It is possible that another rule has already performed rehash. + * Need to check again if we really need rehash. */ - if (hws_bwc_matcher_size_maxed_out(bwc_matcher)) + if (!need_rx_rehash && !need_tx_rehash) return 0; - /* It is possible that other rule has already performed rehash. - * Need to check again if we really need rehash. - * If the reason for rehash was size, but not any more - skip rehash. + /* If the current matcher RX/TX size is already at its max size, + * it can't be rehashed. 
*/ - if (!hws_bwc_matcher_rehash_size_needed(bwc_matcher, - atomic_read(&bwc_matcher->num_of_rules))) + if (need_rx_rehash && + hws_bwc_matcher_size_maxed_out(bwc_matcher, + &bwc_matcher->rx_size)) { + atomic_set(&bwc_matcher->rx_size.rehash_required, false); + need_rx_rehash = false; + } + if (need_tx_rehash && + hws_bwc_matcher_size_maxed_out(bwc_matcher, + &bwc_matcher->tx_size)) { + atomic_set(&bwc_matcher->tx_size.rehash_required, false); + need_tx_rehash = false; + } + + /* If both RX and TX rehash flags are now off, it means that whatever + * we wanted to rehash is now at its max size - no rehash can be done. + * Return and try adding the rule again - perhaps there was some change. + */ + if (!need_rx_rehash && !need_tx_rehash) return 0; /* Now we're done all the checking - do the rehash: @@ -745,25 +898,127 @@ hws_bwc_matcher_rehash_size(struct mlx5hws_bwc_matcher *bwc_matcher) * - move all the rules to the new matcher * - destroy the old matcher */ + atomic_set(&bwc_matcher->rx_size.rehash_required, false); + atomic_set(&bwc_matcher->tx_size.rehash_required, false); - ret = hws_bwc_matcher_extend_size(bwc_matcher); - if (ret) - return ret; + if (need_rx_rehash) { + ret = hws_bwc_matcher_extend_size(bwc_matcher, + &bwc_matcher->rx_size); + if (ret) + return ret; + } + + if (need_tx_rehash) { + ret = hws_bwc_matcher_extend_size(bwc_matcher, + &bwc_matcher->tx_size); + if (ret) + return ret; + } return hws_bwc_matcher_move(bwc_matcher); } -static int -hws_bwc_matcher_rehash_at(struct mlx5hws_bwc_matcher *bwc_matcher) +static int hws_bwc_rule_get_at_idx(struct mlx5hws_bwc_rule *bwc_rule, + struct mlx5hws_rule_action rule_actions[], + u16 bwc_queue_idx) { - /* Rehash by action template doesn't require any additional checking. - * The bwc_matcher already contains the new action template. 
- * Just do the usual rehash: - * - create new matcher - * - move all the rules to the new matcher - * - destroy the old matcher - */ - return hws_bwc_matcher_move(bwc_matcher); + struct mlx5hws_bwc_matcher *bwc_matcher = bwc_rule->bwc_matcher; + struct mlx5hws_context *ctx = bwc_matcher->matcher->tbl->ctx; + struct mutex *queue_lock; /* Protect the queue */ + int at_idx, ret; + + /* check if rehash needed due to missing action template */ + at_idx = hws_bwc_matcher_find_at(bwc_matcher, rule_actions); + if (likely(at_idx >= 0)) + return at_idx; + + /* we need to extend BWC matcher action templates array */ + queue_lock = hws_bwc_get_queue_lock(ctx, bwc_queue_idx); + mutex_unlock(queue_lock); + hws_bwc_lock_all_queues(ctx); + + /* check again - perhaps other thread already did extend_at */ + at_idx = hws_bwc_matcher_find_at(bwc_matcher, rule_actions); + if (at_idx >= 0) + goto out; + + ret = hws_bwc_matcher_extend_at(bwc_matcher, rule_actions); + if (unlikely(ret)) { + mlx5hws_err(ctx, "BWC rule: failed extending AT (%d)", ret); + at_idx = -EINVAL; + goto out; + } + + /* action templates array was extended, we need the last idx */ + at_idx = bwc_matcher->num_of_at - 1; + ret = mlx5hws_matcher_attach_at(bwc_matcher->matcher, + bwc_matcher->at[at_idx]); + if (unlikely(ret)) { + mlx5hws_err(ctx, "BWC rule: failed attaching new AT (%d)", ret); + at_idx = -EINVAL; + goto out; + } + +out: + hws_bwc_unlock_all_queues(ctx); + mutex_lock(queue_lock); + return at_idx; +} + +static void hws_bwc_rule_cnt_inc_rxtx(struct mlx5hws_bwc_rule *bwc_rule, + struct mlx5hws_bwc_matcher_size *size) +{ + u32 num_of_rules = atomic_inc_return(&size->num_of_rules); + + if (unlikely(hws_bwc_matcher_rehash_size_needed(bwc_rule->bwc_matcher, + size, num_of_rules))) + atomic_set(&size->rehash_required, true); +} + +static void hws_bwc_rule_cnt_inc(struct mlx5hws_bwc_rule *bwc_rule) +{ + struct mlx5hws_bwc_matcher *bwc_matcher = bwc_rule->bwc_matcher; + + if (!bwc_rule->skip_rx) + 
hws_bwc_rule_cnt_inc_rxtx(bwc_rule, &bwc_matcher->rx_size); + if (!bwc_rule->skip_tx) + hws_bwc_rule_cnt_inc_rxtx(bwc_rule, &bwc_matcher->tx_size); +} + +static int hws_bwc_rule_cnt_inc_with_rehash(struct mlx5hws_bwc_rule *bwc_rule, + u16 bwc_queue_idx) +{ + struct mlx5hws_bwc_matcher *bwc_matcher = bwc_rule->bwc_matcher; + struct mlx5hws_context *ctx = bwc_matcher->matcher->tbl->ctx; + struct mutex *queue_lock; /* Protect the queue */ + int ret; + + hws_bwc_rule_cnt_inc(bwc_rule); + + if (!atomic_read(&bwc_matcher->rx_size.rehash_required) && + !atomic_read(&bwc_matcher->tx_size.rehash_required)) + return 0; + + queue_lock = hws_bwc_get_queue_lock(ctx, bwc_queue_idx); + mutex_unlock(queue_lock); + + hws_bwc_lock_all_queues(ctx); + ret = hws_bwc_matcher_rehash_size(bwc_matcher); + hws_bwc_unlock_all_queues(ctx); + + mutex_lock(queue_lock); + + if (likely(!ret)) + return 0; + + /* Failed to rehash. Print a diagnostic and rollback the counters. */ + mlx5hws_err(ctx, + "BWC rule insertion: rehash to sizes [%d, %d] failed (%d)\n", + bwc_matcher->rx_size.size_log, + bwc_matcher->tx_size.size_log, ret); + hws_bwc_rule_cnt_dec(bwc_rule); + + return ret; } int mlx5hws_bwc_rule_create_simple(struct mlx5hws_bwc_rule *bwc_rule, @@ -776,7 +1031,6 @@ int mlx5hws_bwc_rule_create_simple(struct mlx5hws_bwc_rule *bwc_rule, struct mlx5hws_context *ctx = bwc_matcher->matcher->tbl->ctx; struct mlx5hws_rule_attr rule_attr; struct mutex *queue_lock; /* Protect the queue */ - u32 num_of_rules; int ret = 0; int at_idx; @@ -786,67 +1040,18 @@ int mlx5hws_bwc_rule_create_simple(struct mlx5hws_bwc_rule *bwc_rule, mutex_lock(queue_lock); - /* check if rehash needed due to missing action template */ - at_idx = hws_bwc_matcher_find_at(bwc_matcher, rule_actions); + at_idx = hws_bwc_rule_get_at_idx(bwc_rule, rule_actions, bwc_queue_idx); if (unlikely(at_idx < 0)) { - /* we need to extend BWC matcher action templates array */ mutex_unlock(queue_lock); - hws_bwc_lock_all_queues(ctx); - - ret = 
hws_bwc_matcher_extend_at(bwc_matcher, rule_actions); - if (unlikely(ret)) { - hws_bwc_unlock_all_queues(ctx); - return ret; - } - - /* action templates array was extended, we need the last idx */ - at_idx = bwc_matcher->num_of_at - 1; - - ret = mlx5hws_matcher_attach_at(bwc_matcher->matcher, - bwc_matcher->at[at_idx]); - if (unlikely(ret)) { - /* Action template attach failed, possibly due to - * requiring more action STEs. - * Need to attempt creating new matcher with all - * the action templates, including the new one. - */ - ret = hws_bwc_matcher_rehash_at(bwc_matcher); - if (unlikely(ret)) { - mlx5hws_action_template_destroy(bwc_matcher->at[at_idx]); - bwc_matcher->at[at_idx] = NULL; - bwc_matcher->num_of_at--; - - hws_bwc_unlock_all_queues(ctx); - - mlx5hws_err(ctx, - "BWC rule insertion: rehash AT failed (%d)\n", ret); - return ret; - } - } - - hws_bwc_unlock_all_queues(ctx); - mutex_lock(queue_lock); + mlx5hws_err(ctx, "BWC rule create: failed getting AT (%d)", + ret); + return -EINVAL; } - /* check if number of rules require rehash */ - num_of_rules = atomic_read(&bwc_matcher->num_of_rules); - - if (unlikely(hws_bwc_matcher_rehash_size_needed(bwc_matcher, num_of_rules))) { + ret = hws_bwc_rule_cnt_inc_with_rehash(bwc_rule, bwc_queue_idx); + if (unlikely(ret)) { mutex_unlock(queue_lock); - - hws_bwc_lock_all_queues(ctx); - ret = hws_bwc_matcher_rehash_size(bwc_matcher); - hws_bwc_unlock_all_queues(ctx); - - if (ret) { - mlx5hws_err(ctx, "BWC rule insertion: rehash size [%d -> %d] failed (%d)\n", - bwc_matcher->size_log - MLX5HWS_BWC_MATCHER_SIZE_LOG_STEP, - bwc_matcher->size_log, - ret); - return ret; - } - - mutex_lock(queue_lock); + return ret; } ret = hws_bwc_rule_create_sync(bwc_rule, @@ -860,12 +1065,29 @@ int mlx5hws_bwc_rule_create_simple(struct mlx5hws_bwc_rule *bwc_rule, return 0; /* rule inserted successfully */ } + /* Rule insertion could fail due to queue being full, timeout, or + * matcher in resize. In such cases, no point in trying to rehash. 
+ */ + if (ret == -EBUSY || ret == -ETIMEDOUT || ret == -EAGAIN) { + mutex_unlock(queue_lock); + mlx5hws_err(ctx, + "BWC rule insertion failed - %s (%d)\n", + ret == -EBUSY ? "queue is full" : + ret == -ETIMEDOUT ? "timeout" : + ret == -EAGAIN ? "matcher in resize" : "N/A", + ret); + hws_bwc_rule_cnt_dec(bwc_rule); + return ret; + } + /* At this point the rule wasn't added. * It could be because there was collision, or some other problem. - * If we don't dive deeper than API, the only thing we know is that - * the status of completion is RTE_FLOW_OP_ERROR. * Try rehash by size and insert rule again - last chance. */ + if (!bwc_rule->skip_rx) + atomic_set(&bwc_matcher->rx_size.rehash_required, true); + if (!bwc_rule->skip_tx) + atomic_set(&bwc_matcher->tx_size.rehash_required, true); mutex_unlock(queue_lock); @@ -875,6 +1097,7 @@ int mlx5hws_bwc_rule_create_simple(struct mlx5hws_bwc_rule *bwc_rule, if (ret) { mlx5hws_err(ctx, "BWC rule insertion: rehash failed (%d)\n", ret); + hws_bwc_rule_cnt_dec(bwc_rule); return ret; } @@ -890,6 +1113,7 @@ int mlx5hws_bwc_rule_create_simple(struct mlx5hws_bwc_rule *bwc_rule, if (unlikely(ret)) { mutex_unlock(queue_lock); mlx5hws_err(ctx, "BWC rule insertion failed (%d)\n", ret); + hws_bwc_rule_cnt_dec(bwc_rule); return ret; } @@ -919,13 +1143,24 @@ mlx5hws_bwc_rule_create(struct mlx5hws_bwc_matcher *bwc_matcher, if (unlikely(!bwc_rule)) return NULL; + bwc_rule->flow_source = flow_source; + mlx5hws_rule_skip(bwc_matcher->matcher, flow_source, + &bwc_rule->skip_rx, &bwc_rule->skip_tx); + bwc_queue_idx = hws_bwc_gen_queue_idx(ctx); - ret = mlx5hws_bwc_rule_create_simple(bwc_rule, - params->match_buf, - rule_actions, - flow_source, - bwc_queue_idx); + if (bwc_matcher->matcher_type == MLX5HWS_BWC_MATCHER_COMPLEX_FIRST) + ret = mlx5hws_bwc_rule_create_complex(bwc_rule, + params, + flow_source, + rule_actions, + bwc_queue_idx); + else + ret = mlx5hws_bwc_rule_create_simple(bwc_rule, + params->match_buf, + rule_actions, + flow_source, + 
bwc_queue_idx); if (unlikely(ret)) { mlx5hws_bwc_rule_free(bwc_rule); return NULL; @@ -947,57 +1182,17 @@ hws_bwc_rule_action_update(struct mlx5hws_bwc_rule *bwc_rule, idx = bwc_rule->bwc_queue_idx; - mlx5hws_bwc_rule_fill_attr(bwc_matcher, idx, 0, &rule_attr); + mlx5hws_bwc_rule_fill_attr(bwc_matcher, idx, bwc_rule->flow_source, + &rule_attr); queue_lock = hws_bwc_get_queue_lock(ctx, idx); mutex_lock(queue_lock); - /* check if rehash needed due to missing action template */ - at_idx = hws_bwc_matcher_find_at(bwc_matcher, rule_actions); + at_idx = hws_bwc_rule_get_at_idx(bwc_rule, rule_actions, idx); if (unlikely(at_idx < 0)) { - /* we need to extend BWC matcher action templates array */ mutex_unlock(queue_lock); - hws_bwc_lock_all_queues(ctx); - - /* check again - perhaps other thread already did extend_at */ - at_idx = hws_bwc_matcher_find_at(bwc_matcher, rule_actions); - if (likely(at_idx < 0)) { - ret = hws_bwc_matcher_extend_at(bwc_matcher, rule_actions); - if (unlikely(ret)) { - hws_bwc_unlock_all_queues(ctx); - mlx5hws_err(ctx, "BWC rule update: failed extending AT (%d)", ret); - return -EINVAL; - } - - /* action templates array was extended, we need the last idx */ - at_idx = bwc_matcher->num_of_at - 1; - - ret = mlx5hws_matcher_attach_at(bwc_matcher->matcher, - bwc_matcher->at[at_idx]); - if (unlikely(ret)) { - /* Action template attach failed, possibly due to - * requiring more action STEs. - * Need to attempt creating new matcher with all - * the action templates, including the new one. 
- */ - ret = hws_bwc_matcher_rehash_at(bwc_matcher); - if (unlikely(ret)) { - mlx5hws_action_template_destroy(bwc_matcher->at[at_idx]); - bwc_matcher->at[at_idx] = NULL; - bwc_matcher->num_of_at--; - - hws_bwc_unlock_all_queues(ctx); - - mlx5hws_err(ctx, - "BWC rule update: rehash AT failed (%d)\n", - ret); - return ret; - } - } - } - - hws_bwc_unlock_all_queues(ctx); - mutex_lock(queue_lock); + mlx5hws_err(ctx, "BWC rule update: failed getting AT\n"); + return -EINVAL; } ret = hws_bwc_rule_update_sync(bwc_rule, @@ -1023,5 +1218,9 @@ int mlx5hws_bwc_rule_action_update(struct mlx5hws_bwc_rule *bwc_rule, return -EINVAL; } + /* For complex rules, the update should happen on the last subrule. */ + while (bwc_rule->next_subrule) + bwc_rule = bwc_rule->next_subrule; + return hws_bwc_rule_action_update(bwc_rule, rule_actions); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h index 47f7ed1415..b905511f5c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h @@ -10,9 +10,7 @@ #define MLX5HWS_BWC_MATCHER_REHASH_BURST_TH 32 /* Max number of AT attach operations for the same matcher. - * When the limit is reached, next attempt to attach new AT - * will result in creation of a new matcher and moving all - * the rules to this matcher. + * When the limit is reached, a larger buffer is allocated for the ATs. */ #define MLX5HWS_BWC_MATCHER_ATTACH_AT_NUM 8 @@ -20,21 +18,52 @@ #define MLX5HWS_BWC_POLLING_TIMEOUT 60 +enum mlx5hws_bwc_matcher_type { + /* Standalone bwc matcher. */ + MLX5HWS_BWC_MATCHER_SIMPLE, + /* The first matcher of a complex matcher. When rules are inserted into + * a matcher of this type, they are split into subrules and inserted + * into their corresponding submatchers. + */ + MLX5HWS_BWC_MATCHER_COMPLEX_FIRST, + /* A submatcher that is part of a complex matcher. 
For most purposes + * these are treated as simple matchers, except when it comes to moving + * rules during resize. + */ + MLX5HWS_BWC_MATCHER_COMPLEX_SUBMATCHER, +}; + +struct mlx5hws_bwc_matcher_complex_data; + +struct mlx5hws_bwc_matcher_size { + u8 size_log; + atomic_t num_of_rules; + atomic_t rehash_required; +}; + struct mlx5hws_bwc_matcher { struct mlx5hws_matcher *matcher; struct mlx5hws_match_template *mt; - struct mlx5hws_action_template *at[MLX5HWS_BWC_MATCHER_ATTACH_AT_NUM]; - u32 priority; + struct mlx5hws_action_template **at; + struct mlx5hws_bwc_matcher_complex_data *complex; u8 num_of_at; - u8 size_log; - atomic_t num_of_rules; + u8 size_of_at_array; + enum mlx5hws_bwc_matcher_type matcher_type; + u32 priority; + struct mlx5hws_bwc_matcher_size rx_size; + struct mlx5hws_bwc_matcher_size tx_size; struct list_head *rules; }; struct mlx5hws_bwc_rule { struct mlx5hws_bwc_matcher *bwc_matcher; struct mlx5hws_rule *rule; + struct mlx5hws_bwc_rule *next_subrule; + struct mlx5hws_bwc_complex_subrule_data *subrule_data; + u32 flow_source; u16 bwc_queue_idx; + bool skip_rx; + bool skip_tx; struct list_head list_node; }; @@ -65,6 +94,11 @@ void mlx5hws_bwc_rule_fill_attr(struct mlx5hws_bwc_matcher *bwc_matcher, u32 flow_source, struct mlx5hws_rule_attr *rule_attr); +int mlx5hws_bwc_queue_poll(struct mlx5hws_context *ctx, + u16 queue_id, + u32 *pending_rules, + bool drain); + static inline u16 mlx5hws_bwc_queues(struct mlx5hws_context *ctx) { /* Besides the control queue, half of the queues are diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc_complex.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc_complex.c index 9fb059a651..660630f18c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc_complex.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc_complex.c @@ -3,9 +3,27 @@ #include "internal.h" -bool mlx5hws_bwc_match_params_is_complex(struct mlx5hws_context *ctx, - u8 match_criteria_enable, - 
struct mlx5hws_match_parameters *mask) +/* We chain submatchers by applying three rules on a subrule: modify header (to + * set register C6), jump to table (to the next submatcher) and the mandatory + * last rule. + */ +#define HWS_NUM_CHAIN_ACTIONS 3 + +static const struct rhashtable_params hws_rules_hash_params = { + .key_len = sizeof_field(struct mlx5hws_bwc_complex_subrule_data, + match_tag), + .key_offset = + offsetof(struct mlx5hws_bwc_complex_subrule_data, match_tag), + .head_offset = + offsetof(struct mlx5hws_bwc_complex_subrule_data, hash_node), + .automatic_shrinking = true, .min_size = 1, +}; + +static bool +hws_match_params_exceeds_definer(struct mlx5hws_context *ctx, + u8 match_criteria_enable, + struct mlx5hws_match_parameters *mask, + bool allow_jumbo) { struct mlx5hws_definer match_layout = {0}; struct mlx5hws_match_template *mt; @@ -20,11 +38,11 @@ bool mlx5hws_bwc_match_params_is_complex(struct mlx5hws_context *ctx, mask->match_sz, match_criteria_enable); if (!mt) { - mlx5hws_err(ctx, "BWC: failed creating match template\n"); + mlx5hws_err(ctx, "Complex matcher: failed creating match template\n"); return false; } - ret = mlx5hws_definer_calc_layout(ctx, mt, &match_layout); + ret = mlx5hws_definer_calc_layout(ctx, mt, &match_layout, allow_jumbo); if (ret) { /* The only case that we're interested in is E2BIG, * which means that the match parameters need to be @@ -48,20 +66,766 @@ bool mlx5hws_bwc_match_params_is_complex(struct mlx5hws_context *ctx, return is_complex; } +bool mlx5hws_bwc_match_params_is_complex(struct mlx5hws_context *ctx, + u8 match_criteria_enable, + struct mlx5hws_match_parameters *mask) +{ + return hws_match_params_exceeds_definer(ctx, match_criteria_enable, + mask, true); +} + +static int +hws_get_last_set_dword_idx(const struct mlx5hws_match_parameters *mask) +{ + int i; + + for (i = mask->match_sz / 4 - 1; i >= 0; i--) + if (mask->match_buf[i]) + return i; + + return -1; +} + +static bool hws_match_mask_is_empty(const struct 
mlx5hws_match_parameters *mask) +{ + return hws_get_last_set_dword_idx(mask) == -1; +} + +static bool hws_dword_is_inner_ipaddr_off(int dword_off) +{ + /* IPv4 and IPv6 addresses share the same entry via a union, and the + * source and dest addresses are contiguous in the fte_match_param. So + * we need to check 8 words. + */ + static const int inner_ip_dword_off = + __mlx5_dw_off(fte_match_param, inner_headers.src_ipv4_src_ipv6); + + return dword_off >= inner_ip_dword_off && + dword_off < inner_ip_dword_off + 8; +} + +static bool hws_dword_is_outer_ipaddr_off(int dword_off) +{ + static const int outer_ip_dword_off = + __mlx5_dw_off(fte_match_param, outer_headers.src_ipv4_src_ipv6); + + return dword_off >= outer_ip_dword_off && + dword_off < outer_ip_dword_off + 8; +} + +static void hws_add_dword_to_mask(struct mlx5hws_match_parameters *mask, + const struct mlx5hws_match_parameters *orig, + int dword_idx, bool *added_inner_ipv, + bool *added_outer_ipv) +{ + mask->match_buf[dword_idx] |= orig->match_buf[dword_idx]; + + *added_inner_ipv = false; + *added_outer_ipv = false; + + /* Any IP address fragment must be accompanied by a match on IP version. + * Use the `added_ipv` variables to keep track if we added IP versions + * specifically for this dword, so that we can roll them back if the + * match params become too large to fit into a definer. 
+ */ + if (hws_dword_is_inner_ipaddr_off(dword_idx) && + !MLX5_GET(fte_match_param, mask->match_buf, + inner_headers.ip_version)) { + MLX5_SET(fte_match_param, mask->match_buf, + inner_headers.ip_version, 0xf); + *added_inner_ipv = true; + } + if (hws_dword_is_outer_ipaddr_off(dword_idx) && + !MLX5_GET(fte_match_param, mask->match_buf, + outer_headers.ip_version)) { + MLX5_SET(fte_match_param, mask->match_buf, + outer_headers.ip_version, 0xf); + *added_outer_ipv = true; + } +} + +static void hws_remove_dword_from_mask(struct mlx5hws_match_parameters *mask, + int dword_idx, bool added_inner_ipv, + bool added_outer_ipv) +{ + mask->match_buf[dword_idx] = 0; + if (added_inner_ipv) + MLX5_SET(fte_match_param, mask->match_buf, + inner_headers.ip_version, 0); + if (added_outer_ipv) + MLX5_SET(fte_match_param, mask->match_buf, + outer_headers.ip_version, 0); +} + +/* Avoid leaving a single lower dword in `mask` if there are others present in + * `orig`. Splitting IPv6 addresses like this causes them to be interpreted as + * IPv4. + */ +static void hws_avoid_ipv6_split_of(struct mlx5hws_match_parameters *orig, + struct mlx5hws_match_parameters *mask, + int off) +{ + /* Masks are allocated to a full fte_match_param, but it can't hurt to + * double check. + */ + if (orig->match_sz <= off + 3 || mask->match_sz <= off + 3) + return; + + /* Lower dword is not set, nothing to do. */ + if (!mask->match_buf[off + 3]) + return; + + /* Higher dwords also present in `mask`, no ambiguity. */ + if (mask->match_buf[off] || mask->match_buf[off + 1] || + mask->match_buf[off + 2]) + return; + + /* There are no higher dwords in `orig`, i.e. we match on IPv4. */ + if (!orig->match_buf[off] && !orig->match_buf[off + 1] && + !orig->match_buf[off + 2]) + return; + + /* Put the lower dword back in `orig`. It is always safe to do this, the + * dword will just be picked up in the next submask. 
+ */ + orig->match_buf[off + 3] = mask->match_buf[off + 3]; + mask->match_buf[off + 3] = 0; +} + +static void hws_avoid_ipv6_split(struct mlx5hws_match_parameters *orig, + struct mlx5hws_match_parameters *mask) +{ + hws_avoid_ipv6_split_of(orig, mask, + __mlx5_dw_off(fte_match_param, + outer_headers.src_ipv4_src_ipv6)); + hws_avoid_ipv6_split_of(orig, mask, + __mlx5_dw_off(fte_match_param, + outer_headers.dst_ipv4_dst_ipv6)); + hws_avoid_ipv6_split_of(orig, mask, + __mlx5_dw_off(fte_match_param, + inner_headers.src_ipv4_src_ipv6)); + hws_avoid_ipv6_split_of(orig, mask, + __mlx5_dw_off(fte_match_param, + inner_headers.dst_ipv4_dst_ipv6)); +} + +/* Build a subset of the `orig` match parameters into `mask`. This subset is + * guaranteed to fit in a single definer an as such is a candidate for being a + * part of a complex matcher. Upon successful execution, the match params that + * go into `mask` are cleared from `orig`. + */ +static int hws_get_simple_params(struct mlx5hws_context *ctx, u8 match_criteria, + struct mlx5hws_match_parameters *orig, + struct mlx5hws_match_parameters *mask) +{ + bool added_inner_ipv, added_outer_ipv; + int dword_idx; + u32 *backup; + int ret; + + dword_idx = hws_get_last_set_dword_idx(orig); + /* Nothing to do, we consumed all of the match params before. */ + if (dword_idx == -1) + return 0; + + backup = kzalloc(MLX5_ST_SZ_BYTES(fte_match_param), GFP_KERNEL); + if (!backup) + return -ENOMEM; + + while (1) { + dword_idx = hws_get_last_set_dword_idx(orig); + /* Nothing to do, we consumed all of the original match params + * into this subset, which still fits into a single matcher. + */ + if (dword_idx == -1) { + ret = 0; + goto free_backup; + } + + memcpy(backup, mask->match_buf, mask->match_sz); + + /* Try to add this dword to the current subset. 
*/ + hws_add_dword_to_mask(mask, orig, dword_idx, &added_inner_ipv, + &added_outer_ipv); + + if (hws_match_params_exceeds_definer(ctx, match_criteria, mask, + false)) { + /* We just added a match param that makes the definer + * too large. Revert and return what we had before. + * Note that we can't just zero out the affected fields, + * because it's possible that the dword we're looking at + * wasn't zero before (e.g. it included auto-added + * matches in IP version. This is why we employ the + * rather cumbersome memcpy for backing up. + */ + memcpy(mask->match_buf, backup, mask->match_sz); + /* Possible future improvement: We can't add any more + * dwords, but it may be possible to squeeze in + * individual bytes, as definers have special slots for + * those. + * + * For now, keep the code simple. This results in an + * extra submatcher in some cases, but it's good enough. + */ + ret = 0; + break; + } + + /* The current subset of match params still fits in a single + * definer. Remove the dword from the original mask. + * + * Also remove any explicit match on IP version if we just + * included one here. We will still automatically add it to + * accompany any IP address fragment, but do not need to + * consider it by itself. + */ + hws_remove_dword_from_mask(orig, dword_idx, added_inner_ipv, + added_outer_ipv); + } + + /* Make sure we have not picked up a single lower dword of an IPv6 + * address, as the firmware will erroneously treat it as an IPv4 + * address. 
+ */ + hws_avoid_ipv6_split(orig, mask); + +free_backup: + kfree(backup); + + return ret; +} + +static int +hws_bwc_matcher_split_mask(struct mlx5hws_context *ctx, u8 match_criteria, + const struct mlx5hws_match_parameters *mask, + struct mlx5hws_match_parameters *submasks, + int *num_submasks) +{ + struct mlx5hws_match_parameters mask_copy; + int ret, i = 0; + + mask_copy.match_sz = MLX5_ST_SZ_BYTES(fte_match_param); + mask_copy.match_buf = kzalloc(mask_copy.match_sz, GFP_KERNEL); + if (!mask_copy.match_buf) + return -ENOMEM; + + memcpy(mask_copy.match_buf, mask->match_buf, mask->match_sz); + + while (!hws_match_mask_is_empty(&mask_copy)) { + if (i >= MLX5HWS_BWC_COMPLEX_MAX_SUBMATCHERS) { + mlx5hws_err(ctx, + "Complex matcher: mask too large for %d matchers\n", + MLX5HWS_BWC_COMPLEX_MAX_SUBMATCHERS); + ret = -E2BIG; + goto free_copy; + } + /* All but the first matcher need to match on register C6 to + * connect pieces of the complex rule together. + */ + if (i > 0) { + MLX5_SET(fte_match_param, submasks[i].match_buf, + misc_parameters_2.metadata_reg_c_6, -1); + match_criteria |= MLX5HWS_DEFINER_MATCH_CRITERIA_MISC2; + } + ret = hws_get_simple_params(ctx, match_criteria, &mask_copy, + &submasks[i]); + if (ret < 0) + goto free_copy; + i++; + } + + *num_submasks = i; + ret = 0; + +free_copy: + kfree(mask_copy.match_buf); + + return ret; +} + +static struct mlx5hws_table * +hws_isolated_table_create(const struct mlx5hws_bwc_matcher *cmatcher) +{ + struct mlx5hws_bwc_complex_submatcher *first_subm; + struct mlx5hws_cmd_ft_modify_attr ft_attr = {0}; + struct mlx5hws_table_attr tbl_attr = {0}; + struct mlx5hws_table *orig_tbl; + struct mlx5hws_context *ctx; + struct mlx5hws_table *tbl; + int ret; + + first_subm = &cmatcher->complex->submatchers[0]; + orig_tbl = first_subm->tbl; + ctx = orig_tbl->ctx; + + tbl_attr.type = orig_tbl->type; + tbl_attr.level = orig_tbl->level; + tbl = mlx5hws_table_create(ctx, &tbl_attr); + if (!tbl) + return ERR_PTR(-EINVAL); + + /* Set the 
default miss of the isolated table to point + * to the end anchor of the original matcher. + */ + mlx5hws_cmd_set_attr_connect_miss_tbl(ctx, tbl->fw_ft_type, + tbl->type, &ft_attr); + ft_attr.table_miss_id = first_subm->bwc_matcher->matcher->end_ft_id; + + ret = mlx5hws_cmd_flow_table_modify(ctx->mdev, &ft_attr, tbl->ft_id); + if (ret) { + mlx5hws_err(ctx, "Complex matcher: failed to set isolated tbl default miss\n"); + goto destroy_tbl; + } + + return tbl; + +destroy_tbl: + mlx5hws_table_destroy(tbl); + + return ERR_PTR(ret); +} + +static int hws_submatcher_init_first(struct mlx5hws_bwc_matcher *cmatcher, + struct mlx5hws_table *table, u32 priority, + u8 match_criteria, + struct mlx5hws_match_parameters *mask) +{ + enum mlx5hws_action_type action_types[HWS_NUM_CHAIN_ACTIONS]; + struct mlx5hws_bwc_complex_submatcher *subm; + int ret; + + subm = &cmatcher->complex->submatchers[0]; + + /* The first submatcher lives in the original table and does not have an + * associated jump to table action. It also points to the outer complex + * matcher. 
+ */ + subm->tbl = table; + subm->action_tbl = NULL; + subm->bwc_matcher = cmatcher; + + action_types[0] = MLX5HWS_ACTION_TYP_MODIFY_HDR; + action_types[1] = MLX5HWS_ACTION_TYP_TBL; + action_types[2] = MLX5HWS_ACTION_TYP_LAST; + + ret = mlx5hws_bwc_matcher_create_simple(subm->bwc_matcher, subm->tbl, + priority, match_criteria, mask, + action_types); + if (ret) + return ret; + + subm->bwc_matcher->matcher_type = MLX5HWS_BWC_MATCHER_COMPLEX_FIRST; + + ret = rhashtable_init(&subm->rules_hash, &hws_rules_hash_params); + if (ret) + goto destroy_matcher; + mutex_init(&subm->hash_lock); + ida_init(&subm->chain_ida); + + return 0; + +destroy_matcher: + mlx5hws_bwc_matcher_destroy_simple(subm->bwc_matcher); + + return ret; +} + +static int hws_submatcher_init(struct mlx5hws_bwc_matcher *cmatcher, int idx, + struct mlx5hws_table *table, u32 priority, + u8 match_criteria, + struct mlx5hws_match_parameters *mask) +{ + enum mlx5hws_action_type action_types[HWS_NUM_CHAIN_ACTIONS]; + struct mlx5hws_bwc_complex_submatcher *subm; + bool is_last; + int ret; + + if (!idx) + return hws_submatcher_init_first(cmatcher, table, priority, + match_criteria, mask); + + subm = &cmatcher->complex->submatchers[idx]; + is_last = idx == cmatcher->complex->num_submatchers - 1; + + subm->tbl = hws_isolated_table_create(cmatcher); + if (IS_ERR(subm->tbl)) + return PTR_ERR(subm->tbl); + + subm->action_tbl = + mlx5hws_action_create_dest_table(subm->tbl->ctx, subm->tbl, + MLX5HWS_ACTION_FLAG_HWS_FDB); + if (!subm->action_tbl) { + ret = -EINVAL; + goto destroy_tbl; + } + + subm->bwc_matcher = kzalloc(sizeof(*subm->bwc_matcher), GFP_KERNEL); + if (!subm->bwc_matcher) { + ret = -ENOMEM; + goto destroy_action; + } + + /* Every matcher other than the first also matches of register C6 to + * bind subrules together in the complex rule using the chain ids. 
+ */ + match_criteria |= MLX5HWS_DEFINER_MATCH_CRITERIA_MISC2; + + action_types[0] = MLX5HWS_ACTION_TYP_MODIFY_HDR; + action_types[1] = MLX5HWS_ACTION_TYP_TBL; + action_types[2] = MLX5HWS_ACTION_TYP_LAST; + + /* Every matcher other than the last sets register C6 and jumps to the + * next submatcher's table. The final submatcher will use the + * user-supplied actions and will attach an action template at rule + * insertion time. + */ + ret = mlx5hws_bwc_matcher_create_simple(subm->bwc_matcher, subm->tbl, + priority, match_criteria, mask, + is_last ? NULL : action_types); + if (ret) + goto free_matcher; + + subm->bwc_matcher->matcher_type = + MLX5HWS_BWC_MATCHER_COMPLEX_SUBMATCHER; + + ret = rhashtable_init(&subm->rules_hash, &hws_rules_hash_params); + if (ret) + goto destroy_matcher; + mutex_init(&subm->hash_lock); + ida_init(&subm->chain_ida); + + return 0; + +destroy_matcher: + mlx5hws_bwc_matcher_destroy_simple(subm->bwc_matcher); +free_matcher: + kfree(subm->bwc_matcher); +destroy_action: + mlx5hws_action_destroy(subm->action_tbl); +destroy_tbl: + mlx5hws_table_destroy(subm->tbl); + + return ret; +} + +static void hws_submatcher_destroy(struct mlx5hws_bwc_matcher *cmatcher, + int idx) +{ + struct mlx5hws_bwc_complex_submatcher *subm; + + subm = &cmatcher->complex->submatchers[idx]; + + ida_destroy(&subm->chain_ida); + mutex_destroy(&subm->hash_lock); + rhashtable_destroy(&subm->rules_hash); + + if (subm->bwc_matcher) { + mlx5hws_bwc_matcher_destroy_simple(subm->bwc_matcher); + if (idx) + kfree(subm->bwc_matcher); + } + + /* We own all of the isolated tables, but not the original one. 
*/ + if (idx) { + mlx5hws_action_destroy(subm->action_tbl); + mlx5hws_table_destroy(subm->tbl); + } +} + +static int +hws_complex_data_actions_init(struct mlx5hws_bwc_matcher_complex_data *cdata) +{ + struct mlx5hws_context *ctx = cdata->submatchers[0].tbl->ctx; + u8 modify_hdr_action[MLX5_ST_SZ_BYTES(set_action_in)] = {0}; + struct mlx5hws_action_mh_pattern ptrn; + int ret = 0; + + /* Create modify header action to set REG_C_6 */ + MLX5_SET(set_action_in, modify_hdr_action, + action_type, MLX5_MODIFICATION_TYPE_SET); + MLX5_SET(set_action_in, modify_hdr_action, + field, MLX5_MODI_META_REG_C_6); + MLX5_SET(set_action_in, modify_hdr_action, + length, 0); /* zero means length of 32 */ + MLX5_SET(set_action_in, modify_hdr_action, offset, 0); + MLX5_SET(set_action_in, modify_hdr_action, data, 0); + + ptrn.data = (void *)modify_hdr_action; + ptrn.sz = MLX5HWS_ACTION_DOUBLE_SIZE; + + cdata->action_metadata = + mlx5hws_action_create_modify_header(ctx, 1, &ptrn, 0, + MLX5HWS_ACTION_FLAG_HWS_FDB); + if (!cdata->action_metadata) { + mlx5hws_err(ctx, "Complex matcher: failed to create set reg C6 action\n"); + return -EINVAL; + } + + /* Create last action */ + cdata->action_last = + mlx5hws_action_create_last(ctx, MLX5HWS_ACTION_FLAG_HWS_FDB); + if (!cdata->action_last) { + mlx5hws_err(ctx, "Complex matcher: failed to create last action\n"); + ret = -EINVAL; + goto destroy_action_metadata; + } + + return 0; + +destroy_action_metadata: + mlx5hws_action_destroy(cdata->action_metadata); + + return ret; +} + +static void +hws_complex_data_actions_destroy(struct mlx5hws_bwc_matcher_complex_data *cdata) +{ + mlx5hws_action_destroy(cdata->action_last); + mlx5hws_action_destroy(cdata->action_metadata); +} + int mlx5hws_bwc_matcher_create_complex(struct mlx5hws_bwc_matcher *bwc_matcher, struct mlx5hws_table *table, - u32 priority, - u8 match_criteria_enable, + u32 priority, u8 match_criteria_enable, struct mlx5hws_match_parameters *mask) { - mlx5hws_err(table->ctx, "Complex matcher is 
not supported yet\n"); - return -EOPNOTSUPP; + struct mlx5hws_match_parameters + submasks[MLX5HWS_BWC_COMPLEX_MAX_SUBMATCHERS] = {0}; + struct mlx5hws_bwc_matcher_complex_data *cdata; + struct mlx5hws_context *ctx = table->ctx; + int num_submatchers; + int i, ret; + + for (i = 0; i < ARRAY_SIZE(submasks); i++) { + submasks[i].match_sz = MLX5_ST_SZ_BYTES(fte_match_param); + submasks[i].match_buf = kzalloc(submasks[i].match_sz, + GFP_KERNEL); + if (!submasks[i].match_buf) { + ret = -ENOMEM; + goto free_submasks; + } + } + + ret = hws_bwc_matcher_split_mask(ctx, match_criteria_enable, mask, + submasks, &num_submatchers); + if (ret) + goto free_submasks; + + cdata = kzalloc(sizeof(*cdata), GFP_KERNEL); + if (!cdata) { + ret = -ENOMEM; + goto free_submasks; + } + + bwc_matcher->complex = cdata; + cdata->num_submatchers = num_submatchers; + + for (i = 0; i < num_submatchers; i++) { + ret = hws_submatcher_init(bwc_matcher, i, table, priority, + match_criteria_enable, &submasks[i]); + if (ret) + goto destroy_submatchers; + } + + ret = hws_complex_data_actions_init(cdata); + if (ret) + goto destroy_submatchers; + + ret = 0; + goto free_submasks; + +destroy_submatchers: + while (i--) + hws_submatcher_destroy(bwc_matcher, i); + kfree(cdata); + bwc_matcher->complex = NULL; + +free_submasks: + for (i = 0; i < ARRAY_SIZE(submasks); i++) + kfree(submasks[i].match_buf); + + return ret; } void mlx5hws_bwc_matcher_destroy_complex(struct mlx5hws_bwc_matcher *bwc_matcher) { - /* nothing to do here */ + int i; + + hws_complex_data_actions_destroy(bwc_matcher->complex); + for (i = 0; i < bwc_matcher->complex->num_submatchers; i++) + hws_submatcher_destroy(bwc_matcher, i); + kfree(bwc_matcher->complex); + bwc_matcher->complex = NULL; +} + +static int +hws_complex_get_subrule_data(struct mlx5hws_bwc_rule *bwc_rule, + struct mlx5hws_bwc_complex_submatcher *subm, + u32 *match_params) +__must_hold(&subm->hash_lock) +{ + struct mlx5hws_bwc_matcher *bwc_matcher = subm->bwc_matcher; + struct 
mlx5hws_bwc_complex_subrule_data *sr_data, *old_data; + struct mlx5hws_match_template *mt; + int ret; + + sr_data = kzalloc(sizeof(*sr_data), GFP_KERNEL); + if (!sr_data) + return -ENOMEM; + + ret = ida_alloc(&subm->chain_ida, GFP_KERNEL); + if (ret < 0) + goto free_sr_data; + sr_data->chain_id = ret; + + refcount_set(&sr_data->refcount, 1); + + mt = bwc_matcher->matcher->mt; + mlx5hws_definer_create_tag(match_params, mt->fc, mt->fc_sz, + (u8 *)&sr_data->match_tag); + + old_data = rhashtable_lookup_get_insert_fast(&subm->rules_hash, + &sr_data->hash_node, + hws_rules_hash_params); + if (IS_ERR(old_data)) { + ret = PTR_ERR(old_data); + goto free_ida; + } + + if (old_data) { + /* Rule with the same tag already exists - update refcount */ + refcount_inc(&old_data->refcount); + /* Let the new rule use the same tag as the existing rule. + * Note that we don't have any indication for the rule creation + * process that a rule with similar matching params already + * exists - no harm done when this rule is be overwritten by + * the same STE. + * There's some performance advantage in skipping such cases, + * so this is left for future optimizations. 
+ */ + bwc_rule->subrule_data = old_data; + ret = 0; + goto free_ida; + } + + bwc_rule->subrule_data = sr_data; + return 0; + +free_ida: + ida_free(&subm->chain_ida, sr_data->chain_id); +free_sr_data: + kfree(sr_data); + + return ret; +} + +static void +hws_complex_put_subrule_data(struct mlx5hws_bwc_rule *bwc_rule, + struct mlx5hws_bwc_complex_submatcher *subm, + bool *is_last_rule) +__must_hold(&subm->hash_lock) +{ + struct mlx5hws_bwc_complex_subrule_data *sr_data; + + if (is_last_rule) + *is_last_rule = false; + + sr_data = bwc_rule->subrule_data; + if (refcount_dec_and_test(&sr_data->refcount)) { + rhashtable_remove_fast(&subm->rules_hash, + &sr_data->hash_node, + hws_rules_hash_params); + ida_free(&subm->chain_ida, sr_data->chain_id); + kfree(sr_data); + if (is_last_rule) + *is_last_rule = true; + } + + bwc_rule->subrule_data = NULL; +} + +static int hws_complex_subrule_create(struct mlx5hws_bwc_matcher *cmatcher, + struct mlx5hws_bwc_rule *subrule, + u32 *match_params, u32 flow_source, + int bwc_queue_idx, int subm_idx, + struct mlx5hws_rule_action *actions, + u32 *chain_id) +{ + struct mlx5hws_rule_action chain_actions[HWS_NUM_CHAIN_ACTIONS] = {0}; + u8 modify_hdr_action[MLX5_ST_SZ_BYTES(set_action_in)] = {0}; + struct mlx5hws_bwc_matcher_complex_data *cdata; + struct mlx5hws_bwc_complex_submatcher *subm; + int ret; + + cdata = cmatcher->complex; + subm = &cdata->submatchers[subm_idx]; + + mutex_lock(&subm->hash_lock); + + ret = hws_complex_get_subrule_data(subrule, subm, match_params); + if (ret) + goto unlock; + + *chain_id = subrule->subrule_data->chain_id; + + if (!actions) { + MLX5_SET(set_action_in, modify_hdr_action, data, *chain_id); + chain_actions[0].action = cdata->action_metadata; + chain_actions[0].modify_header.data = modify_hdr_action; + chain_actions[1].action = + cdata->submatchers[subm_idx + 1].action_tbl; + chain_actions[2].action = cdata->action_last; + actions = chain_actions; + } + + ret = mlx5hws_bwc_rule_create_simple(subrule, 
match_params, actions, + flow_source, bwc_queue_idx); + if (ret) + goto put_subrule_data; + + ret = 0; + goto unlock; + +put_subrule_data: + hws_complex_put_subrule_data(subrule, subm, NULL); +unlock: + mutex_unlock(&subm->hash_lock); + + return ret; +} + +static int hws_complex_subrule_destroy(struct mlx5hws_bwc_rule *bwc_rule, + struct mlx5hws_bwc_matcher *cmatcher, + int subm_idx) +{ + struct mlx5hws_bwc_matcher_complex_data *cdata; + struct mlx5hws_bwc_complex_submatcher *subm; + struct mlx5hws_context *ctx; + bool is_last_rule; + int ret = 0; + + cdata = cmatcher->complex; + subm = &cdata->submatchers[subm_idx]; + ctx = subm->tbl->ctx; + + mutex_lock(&subm->hash_lock); + + hws_complex_put_subrule_data(bwc_rule, subm, &is_last_rule); + bwc_rule->rule->skip_delete = !is_last_rule; + ret = mlx5hws_bwc_rule_destroy_simple(bwc_rule); + if (unlikely(ret)) + mlx5hws_err(ctx, + "Complex rule: failed to delete subrule %d (%d)\n", + subm_idx, ret); + + if (subm_idx) + mlx5hws_bwc_rule_free(bwc_rule); + + mutex_unlock(&subm->hash_lock); + + return ret; } int mlx5hws_bwc_rule_create_complex(struct mlx5hws_bwc_rule *bwc_rule, @@ -70,19 +834,268 @@ int mlx5hws_bwc_rule_create_complex(struct mlx5hws_bwc_rule *bwc_rule, struct mlx5hws_rule_action rule_actions[], u16 bwc_queue_idx) { - mlx5hws_err(bwc_rule->bwc_matcher->matcher->tbl->ctx, - "Complex rule is not supported yet\n"); - return -EOPNOTSUPP; + struct mlx5hws_bwc_rule + *subrules[MLX5HWS_BWC_COMPLEX_MAX_SUBMATCHERS] = {0}; + struct mlx5hws_bwc_matcher *cmatcher = bwc_rule->bwc_matcher; + struct mlx5hws_bwc_matcher_complex_data *cdata; + struct mlx5hws_rule_action *subrule_actions; + struct mlx5hws_bwc_complex_submatcher *subm; + struct mlx5hws_bwc_rule *subrule; + u32 *match_params; + u32 chain_id; + int i, ret; + + cdata = cmatcher->complex; + if (!cdata) + return -EINVAL; + + /* Duplicate user data because we will modify it to set register C6 + * values. 
For the same reason, make sure that we allocate a full + * match_param even if the user gave us fewer bytes. We need to ensure + * there is space for the match on C6. + */ + match_params = kzalloc(MLX5_ST_SZ_BYTES(fte_match_param), GFP_KERNEL); + if (!match_params) + return -ENOMEM; + + memcpy(match_params, params->match_buf, params->match_sz); + + ret = hws_complex_subrule_create(cmatcher, bwc_rule, match_params, + flow_source, bwc_queue_idx, 0, + NULL, &chain_id); + if (ret) + goto free_match_params; + subrules[0] = bwc_rule; + + for (i = 1; i < cdata->num_submatchers; i++) { + subm = &cdata->submatchers[i]; + subrule = mlx5hws_bwc_rule_alloc(subm->bwc_matcher); + if (!subrule) { + ret = -ENOMEM; + goto destroy_subrules; + } + + /* Match on the previous subrule's chain_id. This is how + * subrules are connected in steering. + */ + MLX5_SET(fte_match_param, match_params, + misc_parameters_2.metadata_reg_c_6, chain_id); + + /* The last subrule uses the complex rule's user-specified + * actions. Everything else uses the chaining rules based on the + * next table and chain_id. + */ + subrule_actions = + i == cdata->num_submatchers - 1 ? 
rule_actions : NULL; + + ret = hws_complex_subrule_create(cmatcher, subrule, + match_params, flow_source, + bwc_queue_idx, i, + subrule_actions, &chain_id); + if (ret) { + mlx5hws_bwc_rule_free(subrule); + goto destroy_subrules; + } + + subrules[i] = subrule; + } + + for (i = 0; i < cdata->num_submatchers - 1; i++) + subrules[i]->next_subrule = subrules[i + 1]; + + kfree(match_params); + + return 0; + +destroy_subrules: + while (i--) + hws_complex_subrule_destroy(subrules[i], cmatcher, i); +free_match_params: + kfree(match_params); + + return ret; } int mlx5hws_bwc_rule_destroy_complex(struct mlx5hws_bwc_rule *bwc_rule) { - return 0; + struct mlx5hws_bwc_matcher *bwc_matcher = bwc_rule->bwc_matcher; + struct mlx5hws_bwc_rule + *subrules[MLX5HWS_BWC_COMPLEX_MAX_SUBMATCHERS] = {0}; + struct mlx5hws_bwc_matcher_complex_data *cdata; + int i, err, ret_val; + + cdata = bwc_matcher->complex; + + /* Construct a list of all the subrules we need to destroy. */ + subrules[0] = bwc_rule; + for (i = 1; i < cdata->num_submatchers; i++) + subrules[i] = subrules[i - 1]->next_subrule; + + ret_val = 0; + for (i = 0; i < cdata->num_submatchers; i++) { + err = hws_complex_subrule_destroy(subrules[i], bwc_matcher, i); + /* If something goes wrong, plow along to destroy all of the + * subrules but return an error upstack. 
+ */ + if (unlikely(err)) + ret_val = err; + } + + return ret_val; } -int mlx5hws_bwc_matcher_move_all_complex(struct mlx5hws_bwc_matcher *bwc_matcher) +static void +hws_bwc_matcher_init_move(struct mlx5hws_bwc_matcher *bwc_matcher) { - mlx5hws_err(bwc_matcher->matcher->tbl->ctx, - "Moving complex rule is not supported yet\n"); - return -EOPNOTSUPP; + struct mlx5hws_context *ctx = bwc_matcher->matcher->tbl->ctx; + u16 bwc_queues = mlx5hws_bwc_queues(ctx); + struct mlx5hws_bwc_rule *bwc_rule; + struct list_head *rules_list; + int i; + + for (i = 0; i < bwc_queues; i++) { + rules_list = &bwc_matcher->rules[i]; + if (list_empty(rules_list)) + continue; + + list_for_each_entry(bwc_rule, rules_list, list_node) { + if (!bwc_rule->subrule_data) + continue; + bwc_rule->subrule_data->was_moved = false; + } + } +} + +int mlx5hws_bwc_matcher_complex_move(struct mlx5hws_bwc_matcher *bwc_matcher) +{ + struct mlx5hws_context *ctx = bwc_matcher->matcher->tbl->ctx; + struct mlx5hws_matcher *matcher = bwc_matcher->matcher; + u16 bwc_queues = mlx5hws_bwc_queues(ctx); + struct mlx5hws_bwc_rule *tmp_bwc_rule; + struct mlx5hws_rule_attr rule_attr; + int move_error = 0, poll_error = 0; + struct mlx5hws_rule *tmp_rule; + struct list_head *rules_list; + u32 expected_completions = 1; + int i, ret = 0; + + hws_bwc_matcher_init_move(bwc_matcher); + + mlx5hws_bwc_rule_fill_attr(bwc_matcher, 0, 0, &rule_attr); + + for (i = 0; i < bwc_queues; i++) { + rules_list = &bwc_matcher->rules[i]; + if (list_empty(rules_list)) + continue; + + rule_attr.queue_id = mlx5hws_bwc_get_queue_id(ctx, i); + + list_for_each_entry(tmp_bwc_rule, rules_list, list_node) { + /* Check if a rule with similar tag has already + * been moved. + */ + if (tmp_bwc_rule->subrule_data->was_moved) { + /* This rule is a duplicate of rule with + * identical tag that has already been moved + * earlier. Just update this rule's RTCs. 
+ */ + tmp_bwc_rule->rule->rtc_0 = + tmp_bwc_rule->subrule_data->rtc_0; + tmp_bwc_rule->rule->rtc_1 = + tmp_bwc_rule->subrule_data->rtc_1; + tmp_bwc_rule->rule->matcher = + tmp_bwc_rule->rule->matcher->resize_dst; + continue; + } + + /* First time we're moving rule with this tag. + * Move it for real. + */ + tmp_rule = tmp_bwc_rule->rule; + tmp_rule->skip_delete = false; + ret = mlx5hws_matcher_resize_rule_move(matcher, + tmp_rule, + &rule_attr); + if (unlikely(ret)) { + if (!move_error) { + mlx5hws_err(ctx, + "Moving complex BWC rule: move failed (%d), attempting to move rest of the rules\n", + ret); + move_error = ret; + } + /* Rule wasn't queued, no need to poll */ + continue; + } + + expected_completions = 1; + ret = mlx5hws_bwc_queue_poll(ctx, + rule_attr.queue_id, + &expected_completions, + true); + if (unlikely(ret)) { + if (ret == -ETIMEDOUT) { + mlx5hws_err(ctx, + "Moving complex BWC rule: timeout polling for completions (%d), aborting rehash\n", + ret); + return ret; + } + if (!poll_error) { + mlx5hws_err(ctx, + "Moving complex BWC rule: polling for completions failed (%d), attempting to move rest of the rules\n", + ret); + poll_error = ret; + } + } + + /* Done moving the rule to the new matcher, + * now update RTCs for all the duplicated rules. + */ + tmp_bwc_rule->subrule_data->rtc_0 = + tmp_bwc_rule->rule->rtc_0; + tmp_bwc_rule->subrule_data->rtc_1 = + tmp_bwc_rule->rule->rtc_1; + + tmp_bwc_rule->subrule_data->was_moved = true; + } + } + + /* Return the first error that happened */ + if (unlikely(move_error)) + return move_error; + if (unlikely(poll_error)) + return poll_error; + + return ret; +} + +int +mlx5hws_bwc_matcher_complex_move_first(struct mlx5hws_bwc_matcher *bwc_matcher) +{ + struct mlx5hws_context *ctx = bwc_matcher->matcher->tbl->ctx; + struct mlx5hws_bwc_matcher_complex_data *cdata; + struct mlx5hws_table *isolated_tbl; + u32 end_ft_id; + int i, ret; + + cdata = bwc_matcher->complex; + + /* We are rehashing the first submatcher. 
We need to update the + * subsequent submatchers to point to the end_ft of this new matcher. + * This needs to be done before moving any rules to prevent possible + * steering loops. + */ + end_ft_id = bwc_matcher->matcher->resize_dst->end_ft_id; + for (i = 1; i < cdata->num_submatchers; i++) { + isolated_tbl = cdata->submatchers[i].tbl; + ret = mlx5hws_matcher_update_end_ft_isolated(isolated_tbl, + end_ft_id); + if (ret) { + mlx5hws_err(ctx, + "Complex matcher: failed updating end_ft of isolated matcher (%d)\n", + ret); + return ret; + } + } + + return mlx5hws_bwc_matcher_complex_move(bwc_matcher); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc_complex.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc_complex.h index 340f0688e3..d07de631ce 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc_complex.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc_complex.h @@ -4,6 +4,62 @@ #ifndef HWS_BWC_COMPLEX_H_ #define HWS_BWC_COMPLEX_H_ +#define MLX5HWS_BWC_COMPLEX_MAX_SUBMATCHERS 4 + +/* A matcher can't contain two rules with the same match tag, but it is possible + * that two different complex rules' subrules have the same match tag. In that + * case, those subrules correspond to a single rule, and we need to refcount. + */ +struct mlx5hws_bwc_complex_subrule_data { + struct mlx5hws_rule_match_tag match_tag; + refcount_t refcount; + /* The chain_id is what glues individual subrules into larger complex + * rules. It is the value that this subrule writes to register C6, and + * that the next subrule matches against. + */ + u32 chain_id; + u32 rtc_0; + u32 rtc_1; + /* During rehash we iterate through all the subrules to move them. But + * two or more subrules can share the same physical rule in the + * submatcher, so we use `was_moved` to keep track if a given rule was + * already moved. 
+ */ + bool was_moved; + struct rhash_head hash_node; +}; + +struct mlx5hws_bwc_complex_submatcher { + /* Isolated table that the matcher lives in. Not set for the first + * matcher, which lives in the original table. + */ + struct mlx5hws_table *tbl; + /* Match a rule with this action to go to `tbl`. This is set in all + * submatchers but the first. + */ + struct mlx5hws_action *action_tbl; + /* This submatcher's simple matcher. The first submatcher points to the + * outer (complex) matcher. + */ + struct mlx5hws_bwc_matcher *bwc_matcher; + struct rhashtable rules_hash; + struct ida chain_ida; + struct mutex hash_lock; /* Protect the hash and ida. */ +}; + +struct mlx5hws_bwc_matcher_complex_data { + struct mlx5hws_bwc_complex_submatcher + submatchers[MLX5HWS_BWC_COMPLEX_MAX_SUBMATCHERS]; + int num_submatchers; + /* Actions used by all but the last submatcher to point to the next + * submatcher in the chain. The last submatcher uses the action template + * from the complex matcher, to perform the actions that the user + * originally requested. 
+ */ + struct mlx5hws_action *action_metadata; + struct mlx5hws_action *action_last; +}; + bool mlx5hws_bwc_match_params_is_complex(struct mlx5hws_context *ctx, u8 match_criteria_enable, struct mlx5hws_match_parameters *mask); @@ -16,7 +72,10 @@ int mlx5hws_bwc_matcher_create_complex(struct mlx5hws_bwc_matcher *bwc_matcher, void mlx5hws_bwc_matcher_destroy_complex(struct mlx5hws_bwc_matcher *bwc_matcher); -int mlx5hws_bwc_matcher_move_all_complex(struct mlx5hws_bwc_matcher *bwc_matcher); +int mlx5hws_bwc_matcher_complex_move(struct mlx5hws_bwc_matcher *bwc_matcher); + +int +mlx5hws_bwc_matcher_complex_move_first(struct mlx5hws_bwc_matcher *bwc_matcher); int mlx5hws_bwc_rule_create_complex(struct mlx5hws_bwc_rule *bwc_rule, struct mlx5hws_match_parameters *params, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c index e8f98c109b..f22eaf506d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c @@ -55,6 +55,7 @@ int mlx5hws_cmd_flow_table_create(struct mlx5_core_dev *mdev, MLX5_SET(create_flow_table_in, in, opcode, MLX5_CMD_OP_CREATE_FLOW_TABLE); MLX5_SET(create_flow_table_in, in, table_type, ft_attr->type); + MLX5_SET(create_flow_table_in, in, uid, ft_attr->uid); ft_ctx = MLX5_ADDR_OF(create_flow_table_in, in, flow_table_context); MLX5_SET(flow_table_context, ft_ctx, level, ft_attr->level); @@ -406,7 +407,6 @@ int mlx5hws_cmd_rtc_create(struct mlx5_core_dev *mdev, MLX5_SET(rtc, attr, match_definer_1, rtc_attr->match_definer_1); MLX5_SET(rtc, attr, stc_id, rtc_attr->stc_base); MLX5_SET(rtc, attr, ste_table_base_id, rtc_attr->ste_base); - MLX5_SET(rtc, attr, ste_table_offset, rtc_attr->ste_offset); MLX5_SET(rtc, attr, miss_flow_table_id, rtc_attr->miss_ft_id); MLX5_SET(rtc, attr, reparse_mode, rtc_attr->reparse_mode); @@ -1200,34 +1200,20 @@ out: int mlx5hws_cmd_query_gvmi(struct mlx5_core_dev *mdev, 
bool other_function, u16 vport_number, u16 *gvmi) { - bool ec_vf_func = other_function ? mlx5_core_is_ec_vf_vport(mdev, vport_number) : false; - u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {}; - int out_size; - void *out; int err; - out_size = MLX5_ST_SZ_BYTES(query_hca_cap_out); - out = kzalloc(out_size, GFP_KERNEL); - if (!out) - return -ENOMEM; + if (!other_function) { + /* self vhca_id */ + *gvmi = MLX5_CAP_GEN(mdev, vhca_id); + return 0; + } - MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); - MLX5_SET(query_hca_cap_in, in, other_function, other_function); - MLX5_SET(query_hca_cap_in, in, function_id, - mlx5_vport_to_func_id(mdev, vport_number, ec_vf_func)); - MLX5_SET(query_hca_cap_in, in, ec_vf_function, ec_vf_func); - MLX5_SET(query_hca_cap_in, in, op_mod, - MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE << 1 | HCA_CAP_OPMOD_GET_CUR); - - err = mlx5_cmd_exec_inout(mdev, query_hca_cap, in, out); + err = mlx5_vport_get_vhca_id(mdev, vport_number, gvmi); if (err) { - kfree(out); + mlx5_core_err(mdev, "Failed to get vport vhca id for vport %d\n", + vport_number); return err; } - *gvmi = MLX5_GET(query_hca_cap_out, out, capability.cmd_hca_cap.vhca_id); - - kfree(out); - return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h index 51d9e0291a..122ccc6716 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h @@ -36,6 +36,7 @@ struct mlx5hws_cmd_set_fte_attr { struct mlx5hws_cmd_ft_create_attr { u8 type; u8 level; + u16 uid; bool rtc_valid; bool decap_en; bool reformat_en; @@ -70,7 +71,6 @@ struct mlx5hws_cmd_rtc_create_attr { u32 pd; u32 stc_base; u32 ste_base; - u32 ste_offset; u32 miss_ft_id; bool fw_gen_wqe; u8 update_index_mode; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.c index 
9cda2774fd..428dae8697 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.c @@ -34,7 +34,6 @@ static int hws_context_pools_init(struct mlx5hws_context *ctx) /* Create an STC pool per FT type */ pool_attr.pool_type = MLX5HWS_POOL_TYPE_STC; - pool_attr.flags = MLX5HWS_POOL_FLAGS_FOR_STC_POOL; max_log_sz = min(MLX5HWS_POOL_STC_LOG_SZ, ctx->caps->stc_alloc_log_max); pool_attr.alloc_log_sz = max(max_log_sz, ctx->caps->stc_alloc_log_gran); @@ -159,10 +158,16 @@ static int hws_context_init_hws(struct mlx5hws_context *ctx, if (ret) goto pools_uninit; + ret = mlx5hws_action_ste_pool_init(ctx); + if (ret) + goto close_queues; + INIT_LIST_HEAD(&ctx->tbl_list); return 0; +close_queues: + mlx5hws_send_queues_close(ctx); pools_uninit: hws_context_pools_uninit(ctx); uninit_pd: @@ -175,6 +180,7 @@ static void hws_context_uninit_hws(struct mlx5hws_context *ctx) if (!(ctx->flags & MLX5HWS_CONTEXT_FLAG_HWS_SUPPORT)) return; + mlx5hws_action_ste_pool_uninit(ctx); mlx5hws_send_queues_close(ctx); hws_context_pools_uninit(ctx); hws_context_uninit_pd(ctx); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h index 38c3647444..3f8938c73d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h @@ -39,6 +39,8 @@ struct mlx5hws_context { struct mlx5hws_cmd_query_caps *caps; u32 pd_num; struct mlx5hws_pool *stc_pool; + struct mlx5hws_action_ste_pool *action_ste_pool; /* One per queue */ + struct delayed_work action_ste_cleanup; struct mlx5hws_context_common_res common_res; struct mlx5hws_pattern_cache *pattern_cache; struct mlx5hws_definer_cache *definer_cache; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c index 696275fd0c..2ec8cb1013 
100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c @@ -99,17 +99,19 @@ hws_debug_dump_matcher_attr(struct seq_file *f, struct mlx5hws_matcher *matcher) { struct mlx5hws_matcher_attr *attr = &matcher->attr; - seq_printf(f, "%d,0x%llx,%d,%d,%d,%d,%d,%d,%d,%d\n", + seq_printf(f, "%d,0x%llx,%d,%d,%d,%d,%d,%d,%d,%d,-1,-1,%d,%d\n", MLX5HWS_DEBUG_RES_TYPE_MATCHER_ATTR, HWS_PTR_TO_ID(matcher), attr->priority, attr->mode, - attr->table.sz_row_log, - attr->table.sz_col_log, + attr->size[MLX5HWS_MATCHER_SIZE_TYPE_RX].table.sz_row_log, + attr->size[MLX5HWS_MATCHER_SIZE_TYPE_RX].table.sz_col_log, attr->optimize_using_rule_idx, attr->optimize_flow_src, attr->insert_mode, - attr->distribute_mode); + attr->distribute_mode, + attr->size[MLX5HWS_MATCHER_SIZE_TYPE_TX].table.sz_row_log, + attr->size[MLX5HWS_MATCHER_SIZE_TYPE_TX].table.sz_col_log); return 0; } @@ -118,8 +120,6 @@ static int hws_debug_dump_matcher(struct seq_file *f, struct mlx5hws_matcher *ma { enum mlx5hws_table_type tbl_type = matcher->tbl->type; struct mlx5hws_cmd_ft_query_attr ft_attr = {0}; - struct mlx5hws_pool_chunk *ste; - struct mlx5hws_pool *ste_pool; u64 icm_addr_0 = 0; u64 icm_addr_1 = 0; u32 ste_0_id = -1; @@ -134,13 +134,9 @@ static int hws_debug_dump_matcher(struct seq_file *f, struct mlx5hws_matcher *ma matcher->end_ft_id, matcher->col_matcher ? 
HWS_PTR_TO_ID(matcher->col_matcher) : 0); - ste = &matcher->match_ste.ste; - ste_pool = matcher->match_ste.pool; - if (ste_pool) { - ste_0_id = mlx5hws_pool_chunk_get_base_id(ste_pool, ste); - if (tbl_type == MLX5HWS_TABLE_TYPE_FDB) - ste_1_id = mlx5hws_pool_chunk_get_base_mirror_id(ste_pool, ste); - } + ste_0_id = matcher->match_ste.ste_0_base; + if (tbl_type == MLX5HWS_TABLE_TYPE_FDB) + ste_1_id = matcher->match_ste.ste_1_base; seq_printf(f, ",%d,%d,%d,%d", matcher->match_ste.rtc_0_id, @@ -148,19 +144,6 @@ static int hws_debug_dump_matcher(struct seq_file *f, struct mlx5hws_matcher *ma matcher->match_ste.rtc_1_id, (int)ste_1_id); - ste = &matcher->action_ste.ste; - ste_pool = matcher->action_ste.pool; - if (ste_pool) { - ste_0_id = mlx5hws_pool_chunk_get_base_id(ste_pool, ste); - if (tbl_type == MLX5HWS_TABLE_TYPE_FDB) - ste_1_id = mlx5hws_pool_chunk_get_base_mirror_id(ste_pool, ste); - else - ste_1_id = -1; - } else { - ste_0_id = -1; - ste_1_id = -1; - } - ft_attr.type = matcher->tbl->fw_ft_type; ret = mlx5hws_cmd_flow_table_query(matcher->tbl->ctx->mdev, matcher->end_ft_id, @@ -170,10 +153,7 @@ static int hws_debug_dump_matcher(struct seq_file *f, struct mlx5hws_matcher *ma if (ret) return ret; - seq_printf(f, ",%d,%d,%d,%d,%d,0x%llx,0x%llx\n", - matcher->action_ste.rtc_0_id, (int)ste_0_id, - matcher->action_ste.rtc_1_id, (int)ste_1_id, - 0, + seq_printf(f, ",-1,-1,-1,-1,0,0x%llx,0x%llx\n", mlx5hws_debug_icm_to_idx(icm_addr_0), mlx5hws_debug_icm_to_idx(icm_addr_1)); @@ -387,14 +367,17 @@ static int hws_debug_dump_context_stc(struct seq_file *f, struct mlx5hws_context if (!stc_pool) return 0; - if (stc_pool->resource[0]) { - ret = hws_debug_dump_context_stc_resource(f, ctx, stc_pool->resource[0]); + if (stc_pool->resource) { + ret = hws_debug_dump_context_stc_resource(f, ctx, + stc_pool->resource); if (ret) return ret; } - if (stc_pool->mirror_resource[0]) { - ret = hws_debug_dump_context_stc_resource(f, ctx, stc_pool->mirror_resource[0]); + if 
(stc_pool->mirror_resource) { + struct mlx5hws_pool_resource *res = stc_pool->mirror_resource; + + ret = hws_debug_dump_context_stc_resource(f, ctx, res); if (ret) return ret; } @@ -402,10 +385,41 @@ static int hws_debug_dump_context_stc(struct seq_file *f, struct mlx5hws_context return 0; } +static void +hws_debug_dump_action_ste_table(struct seq_file *f, + struct mlx5hws_action_ste_table *action_tbl) +{ + int ste_0_id = mlx5hws_pool_get_base_id(action_tbl->pool); + int ste_1_id = mlx5hws_pool_get_base_mirror_id(action_tbl->pool); + + seq_printf(f, "%d,0x%llx,%d,%d,%d,%d\n", + MLX5HWS_DEBUG_RES_TYPE_ACTION_STE_TABLE, + HWS_PTR_TO_ID(action_tbl), + action_tbl->rtc_0_id, ste_0_id, + action_tbl->rtc_1_id, ste_1_id); +} + +static void hws_debug_dump_action_ste_pool(struct seq_file *f, + struct mlx5hws_action_ste_pool *pool) +{ + struct mlx5hws_action_ste_table *action_tbl; + enum mlx5hws_pool_optimize opt; + + mutex_lock(&pool->lock); + for (opt = MLX5HWS_POOL_OPTIMIZE_NONE; opt < MLX5HWS_POOL_OPTIMIZE_MAX; + opt++) { + list_for_each_entry(action_tbl, &pool->elems[opt].available, + list_node) { + hws_debug_dump_action_ste_table(f, action_tbl); + } + } + mutex_unlock(&pool->lock); +} + static int hws_debug_dump_context(struct seq_file *f, struct mlx5hws_context *ctx) { struct mlx5hws_table *tbl; - int ret; + int ret, i; ret = hws_debug_dump_context_info(f, ctx); if (ret) @@ -425,6 +439,9 @@ static int hws_debug_dump_context(struct seq_file *f, struct mlx5hws_context *ct return ret; } + for (i = 0; i < ctx->queues; i++) + hws_debug_dump_action_ste_pool(f, &ctx->action_ste_pool[i]); + return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.h index e44e7ae28f..89c396f9f2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.h @@ -26,6 +26,8 @@ enum mlx5hws_debug_res_type { 
MLX5HWS_DEBUG_RES_TYPE_MATCHER_TEMPLATE_HASH_DEFINER = 4205, MLX5HWS_DEBUG_RES_TYPE_MATCHER_TEMPLATE_RANGE_DEFINER = 4206, MLX5HWS_DEBUG_RES_TYPE_MATCHER_TEMPLATE_COMPARE_MATCH_DEFINER = 4207, + + MLX5HWS_DEBUG_RES_TYPE_ACTION_STE_TABLE = 4300, }; static inline u64 diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c index c8cc0c8115..82fd122d42 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c @@ -158,6 +158,218 @@ struct mlx5hws_definer_conv_data { u32 match_flags; }; +#define HWS_DEFINER_ENTRY(name)[MLX5HWS_DEFINER_FNAME_##name] = #name + +static const char * const hws_definer_fname_to_str[] = { + HWS_DEFINER_ENTRY(ETH_SMAC_47_16_O), + HWS_DEFINER_ENTRY(ETH_SMAC_47_16_I), + HWS_DEFINER_ENTRY(ETH_SMAC_15_0_O), + HWS_DEFINER_ENTRY(ETH_SMAC_15_0_I), + HWS_DEFINER_ENTRY(ETH_DMAC_47_16_O), + HWS_DEFINER_ENTRY(ETH_DMAC_47_16_I), + HWS_DEFINER_ENTRY(ETH_DMAC_15_0_O), + HWS_DEFINER_ENTRY(ETH_DMAC_15_0_I), + HWS_DEFINER_ENTRY(ETH_TYPE_O), + HWS_DEFINER_ENTRY(ETH_TYPE_I), + HWS_DEFINER_ENTRY(ETH_L3_TYPE_O), + HWS_DEFINER_ENTRY(ETH_L3_TYPE_I), + HWS_DEFINER_ENTRY(VLAN_TYPE_O), + HWS_DEFINER_ENTRY(VLAN_TYPE_I), + HWS_DEFINER_ENTRY(VLAN_FIRST_PRIO_O), + HWS_DEFINER_ENTRY(VLAN_FIRST_PRIO_I), + HWS_DEFINER_ENTRY(VLAN_CFI_O), + HWS_DEFINER_ENTRY(VLAN_CFI_I), + HWS_DEFINER_ENTRY(VLAN_ID_O), + HWS_DEFINER_ENTRY(VLAN_ID_I), + HWS_DEFINER_ENTRY(VLAN_SECOND_TYPE_O), + HWS_DEFINER_ENTRY(VLAN_SECOND_TYPE_I), + HWS_DEFINER_ENTRY(VLAN_SECOND_PRIO_O), + HWS_DEFINER_ENTRY(VLAN_SECOND_PRIO_I), + HWS_DEFINER_ENTRY(VLAN_SECOND_CFI_O), + HWS_DEFINER_ENTRY(VLAN_SECOND_CFI_I), + HWS_DEFINER_ENTRY(VLAN_SECOND_ID_O), + HWS_DEFINER_ENTRY(VLAN_SECOND_ID_I), + HWS_DEFINER_ENTRY(IPV4_IHL_O), + HWS_DEFINER_ENTRY(IPV4_IHL_I), + HWS_DEFINER_ENTRY(IP_DSCP_O), + HWS_DEFINER_ENTRY(IP_DSCP_I), + HWS_DEFINER_ENTRY(IP_ECN_O), + 
HWS_DEFINER_ENTRY(IP_ECN_I), + HWS_DEFINER_ENTRY(IP_TTL_O), + HWS_DEFINER_ENTRY(IP_TTL_I), + HWS_DEFINER_ENTRY(IPV4_DST_O), + HWS_DEFINER_ENTRY(IPV4_DST_I), + HWS_DEFINER_ENTRY(IPV4_SRC_O), + HWS_DEFINER_ENTRY(IPV4_SRC_I), + HWS_DEFINER_ENTRY(IP_VERSION_O), + HWS_DEFINER_ENTRY(IP_VERSION_I), + HWS_DEFINER_ENTRY(IP_FRAG_O), + HWS_DEFINER_ENTRY(IP_FRAG_I), + HWS_DEFINER_ENTRY(IP_LEN_O), + HWS_DEFINER_ENTRY(IP_LEN_I), + HWS_DEFINER_ENTRY(IP_TOS_O), + HWS_DEFINER_ENTRY(IP_TOS_I), + HWS_DEFINER_ENTRY(IPV6_FLOW_LABEL_O), + HWS_DEFINER_ENTRY(IPV6_FLOW_LABEL_I), + HWS_DEFINER_ENTRY(IPV6_DST_127_96_O), + HWS_DEFINER_ENTRY(IPV6_DST_95_64_O), + HWS_DEFINER_ENTRY(IPV6_DST_63_32_O), + HWS_DEFINER_ENTRY(IPV6_DST_31_0_O), + HWS_DEFINER_ENTRY(IPV6_DST_127_96_I), + HWS_DEFINER_ENTRY(IPV6_DST_95_64_I), + HWS_DEFINER_ENTRY(IPV6_DST_63_32_I), + HWS_DEFINER_ENTRY(IPV6_DST_31_0_I), + HWS_DEFINER_ENTRY(IPV6_SRC_127_96_O), + HWS_DEFINER_ENTRY(IPV6_SRC_95_64_O), + HWS_DEFINER_ENTRY(IPV6_SRC_63_32_O), + HWS_DEFINER_ENTRY(IPV6_SRC_31_0_O), + HWS_DEFINER_ENTRY(IPV6_SRC_127_96_I), + HWS_DEFINER_ENTRY(IPV6_SRC_95_64_I), + HWS_DEFINER_ENTRY(IPV6_SRC_63_32_I), + HWS_DEFINER_ENTRY(IPV6_SRC_31_0_I), + HWS_DEFINER_ENTRY(IP_PROTOCOL_O), + HWS_DEFINER_ENTRY(IP_PROTOCOL_I), + HWS_DEFINER_ENTRY(L4_SPORT_O), + HWS_DEFINER_ENTRY(L4_SPORT_I), + HWS_DEFINER_ENTRY(L4_DPORT_O), + HWS_DEFINER_ENTRY(L4_DPORT_I), + HWS_DEFINER_ENTRY(TCP_FLAGS_I), + HWS_DEFINER_ENTRY(TCP_FLAGS_O), + HWS_DEFINER_ENTRY(TCP_SEQ_NUM), + HWS_DEFINER_ENTRY(TCP_ACK_NUM), + HWS_DEFINER_ENTRY(GTP_TEID), + HWS_DEFINER_ENTRY(GTP_MSG_TYPE), + HWS_DEFINER_ENTRY(GTP_EXT_FLAG), + HWS_DEFINER_ENTRY(GTP_NEXT_EXT_HDR), + HWS_DEFINER_ENTRY(GTP_EXT_HDR_PDU), + HWS_DEFINER_ENTRY(GTP_EXT_HDR_QFI), + HWS_DEFINER_ENTRY(GTPU_DW0), + HWS_DEFINER_ENTRY(GTPU_FIRST_EXT_DW0), + HWS_DEFINER_ENTRY(GTPU_DW2), + HWS_DEFINER_ENTRY(FLEX_PARSER_0), + HWS_DEFINER_ENTRY(FLEX_PARSER_1), + HWS_DEFINER_ENTRY(FLEX_PARSER_2), + HWS_DEFINER_ENTRY(FLEX_PARSER_3), + 
HWS_DEFINER_ENTRY(FLEX_PARSER_4), + HWS_DEFINER_ENTRY(FLEX_PARSER_5), + HWS_DEFINER_ENTRY(FLEX_PARSER_6), + HWS_DEFINER_ENTRY(FLEX_PARSER_7), + HWS_DEFINER_ENTRY(VPORT_REG_C_0), + HWS_DEFINER_ENTRY(VXLAN_FLAGS), + HWS_DEFINER_ENTRY(VXLAN_VNI), + HWS_DEFINER_ENTRY(VXLAN_GPE_FLAGS), + HWS_DEFINER_ENTRY(VXLAN_GPE_RSVD0), + HWS_DEFINER_ENTRY(VXLAN_GPE_PROTO), + HWS_DEFINER_ENTRY(VXLAN_GPE_VNI), + HWS_DEFINER_ENTRY(VXLAN_GPE_RSVD1), + HWS_DEFINER_ENTRY(GENEVE_OPT_LEN), + HWS_DEFINER_ENTRY(GENEVE_OAM), + HWS_DEFINER_ENTRY(GENEVE_PROTO), + HWS_DEFINER_ENTRY(GENEVE_VNI), + HWS_DEFINER_ENTRY(SOURCE_QP), + HWS_DEFINER_ENTRY(SOURCE_GVMI), + HWS_DEFINER_ENTRY(REG_0), + HWS_DEFINER_ENTRY(REG_1), + HWS_DEFINER_ENTRY(REG_2), + HWS_DEFINER_ENTRY(REG_3), + HWS_DEFINER_ENTRY(REG_4), + HWS_DEFINER_ENTRY(REG_5), + HWS_DEFINER_ENTRY(REG_6), + HWS_DEFINER_ENTRY(REG_7), + HWS_DEFINER_ENTRY(REG_8), + HWS_DEFINER_ENTRY(REG_9), + HWS_DEFINER_ENTRY(REG_10), + HWS_DEFINER_ENTRY(REG_11), + HWS_DEFINER_ENTRY(REG_A), + HWS_DEFINER_ENTRY(REG_B), + HWS_DEFINER_ENTRY(GRE_KEY_PRESENT), + HWS_DEFINER_ENTRY(GRE_C), + HWS_DEFINER_ENTRY(GRE_K), + HWS_DEFINER_ENTRY(GRE_S), + HWS_DEFINER_ENTRY(GRE_PROTOCOL), + HWS_DEFINER_ENTRY(GRE_OPT_KEY), + HWS_DEFINER_ENTRY(GRE_OPT_SEQ), + HWS_DEFINER_ENTRY(GRE_OPT_CHECKSUM), + HWS_DEFINER_ENTRY(INTEGRITY_O), + HWS_DEFINER_ENTRY(INTEGRITY_I), + HWS_DEFINER_ENTRY(ICMP_DW1), + HWS_DEFINER_ENTRY(ICMP_DW2), + HWS_DEFINER_ENTRY(ICMP_DW3), + HWS_DEFINER_ENTRY(IPSEC_SPI), + HWS_DEFINER_ENTRY(IPSEC_SEQUENCE_NUMBER), + HWS_DEFINER_ENTRY(IPSEC_SYNDROME), + HWS_DEFINER_ENTRY(MPLS0_O), + HWS_DEFINER_ENTRY(MPLS1_O), + HWS_DEFINER_ENTRY(MPLS2_O), + HWS_DEFINER_ENTRY(MPLS3_O), + HWS_DEFINER_ENTRY(MPLS4_O), + HWS_DEFINER_ENTRY(MPLS0_I), + HWS_DEFINER_ENTRY(MPLS1_I), + HWS_DEFINER_ENTRY(MPLS2_I), + HWS_DEFINER_ENTRY(MPLS3_I), + HWS_DEFINER_ENTRY(MPLS4_I), + HWS_DEFINER_ENTRY(FLEX_PARSER0_OK), + HWS_DEFINER_ENTRY(FLEX_PARSER1_OK), + HWS_DEFINER_ENTRY(FLEX_PARSER2_OK), + 
HWS_DEFINER_ENTRY(FLEX_PARSER3_OK), + HWS_DEFINER_ENTRY(FLEX_PARSER4_OK), + HWS_DEFINER_ENTRY(FLEX_PARSER5_OK), + HWS_DEFINER_ENTRY(FLEX_PARSER6_OK), + HWS_DEFINER_ENTRY(FLEX_PARSER7_OK), + HWS_DEFINER_ENTRY(OKS2_MPLS0_O), + HWS_DEFINER_ENTRY(OKS2_MPLS1_O), + HWS_DEFINER_ENTRY(OKS2_MPLS2_O), + HWS_DEFINER_ENTRY(OKS2_MPLS3_O), + HWS_DEFINER_ENTRY(OKS2_MPLS4_O), + HWS_DEFINER_ENTRY(OKS2_MPLS0_I), + HWS_DEFINER_ENTRY(OKS2_MPLS1_I), + HWS_DEFINER_ENTRY(OKS2_MPLS2_I), + HWS_DEFINER_ENTRY(OKS2_MPLS3_I), + HWS_DEFINER_ENTRY(OKS2_MPLS4_I), + HWS_DEFINER_ENTRY(GENEVE_OPT_OK_0), + HWS_DEFINER_ENTRY(GENEVE_OPT_OK_1), + HWS_DEFINER_ENTRY(GENEVE_OPT_OK_2), + HWS_DEFINER_ENTRY(GENEVE_OPT_OK_3), + HWS_DEFINER_ENTRY(GENEVE_OPT_OK_4), + HWS_DEFINER_ENTRY(GENEVE_OPT_OK_5), + HWS_DEFINER_ENTRY(GENEVE_OPT_OK_6), + HWS_DEFINER_ENTRY(GENEVE_OPT_OK_7), + HWS_DEFINER_ENTRY(GENEVE_OPT_DW_0), + HWS_DEFINER_ENTRY(GENEVE_OPT_DW_1), + HWS_DEFINER_ENTRY(GENEVE_OPT_DW_2), + HWS_DEFINER_ENTRY(GENEVE_OPT_DW_3), + HWS_DEFINER_ENTRY(GENEVE_OPT_DW_4), + HWS_DEFINER_ENTRY(GENEVE_OPT_DW_5), + HWS_DEFINER_ENTRY(GENEVE_OPT_DW_6), + HWS_DEFINER_ENTRY(GENEVE_OPT_DW_7), + HWS_DEFINER_ENTRY(IB_L4_OPCODE), + HWS_DEFINER_ENTRY(IB_L4_QPN), + HWS_DEFINER_ENTRY(IB_L4_A), + HWS_DEFINER_ENTRY(RANDOM_NUM), + HWS_DEFINER_ENTRY(PTYPE_L2_O), + HWS_DEFINER_ENTRY(PTYPE_L2_I), + HWS_DEFINER_ENTRY(PTYPE_L3_O), + HWS_DEFINER_ENTRY(PTYPE_L3_I), + HWS_DEFINER_ENTRY(PTYPE_L4_O), + HWS_DEFINER_ENTRY(PTYPE_L4_I), + HWS_DEFINER_ENTRY(PTYPE_L4_EXT_O), + HWS_DEFINER_ENTRY(PTYPE_L4_EXT_I), + HWS_DEFINER_ENTRY(PTYPE_FRAG_O), + HWS_DEFINER_ENTRY(PTYPE_FRAG_I), + HWS_DEFINER_ENTRY(TNL_HDR_0), + HWS_DEFINER_ENTRY(TNL_HDR_1), + HWS_DEFINER_ENTRY(TNL_HDR_2), + HWS_DEFINER_ENTRY(TNL_HDR_3), + [MLX5HWS_DEFINER_FNAME_MAX] = "DEFINER_FNAME_UNKNOWN", +}; + +const char *mlx5hws_definer_fname_to_str(enum mlx5hws_definer_fname fname) +{ + if (fname > MLX5HWS_DEFINER_FNAME_MAX) + fname = MLX5HWS_DEFINER_FNAME_MAX; + return 
hws_definer_fname_to_str[fname]; +} + static void hws_definer_ones_set(struct mlx5hws_definer_fc *fc, void *match_param, @@ -509,18 +721,33 @@ static int hws_definer_conv_outer(struct mlx5hws_definer_conv_data *cd, u32 *match_param) { - bool is_s_ipv6, is_d_ipv6, smac_set, dmac_set; + bool is_ipv6, smac_set, dmac_set, ip_addr_set, ip_ver_set; struct mlx5hws_definer_fc *fc = cd->fc; struct mlx5hws_definer_fc *curr_fc; u32 *s_ipv6, *d_ipv6; if (HWS_IS_FLD_SET_SZ(match_param, outer_headers.l4_type, 0x2) || - HWS_IS_FLD_SET_SZ(match_param, outer_headers.reserved_at_c2, 0xe) || - HWS_IS_FLD_SET_SZ(match_param, outer_headers.reserved_at_c4, 0x4)) { + HWS_IS_FLD_SET_SZ(match_param, outer_headers.l4_type_ext, 0x4) || + HWS_IS_FLD_SET_SZ(match_param, outer_headers.reserved_at_c6, 0xa) || + HWS_IS_FLD_SET_SZ(match_param, outer_headers.reserved_at_d4, 0x4)) { mlx5hws_err(cd->ctx, "Unsupported outer parameters set\n"); return -EINVAL; } + ip_addr_set = HWS_IS_FLD_SET_SZ(match_param, + outer_headers.src_ipv4_src_ipv6, + 0x80) || + HWS_IS_FLD_SET_SZ(match_param, + outer_headers.dst_ipv4_dst_ipv6, 0x80); + ip_ver_set = HWS_IS_FLD_SET(match_param, outer_headers.ip_version) || + HWS_IS_FLD_SET(match_param, outer_headers.ethertype); + + if (ip_addr_set && !ip_ver_set) { + mlx5hws_err(cd->ctx, + "Unsupported match on IP address without version or ethertype\n"); + return -EINVAL; + } + /* L2 Check ethertype */ HWS_SET_HDR(fc, match_param, ETH_TYPE_O, outer_headers.ethertype, @@ -559,6 +786,9 @@ hws_definer_conv_outer(struct mlx5hws_definer_conv_data *cd, HWS_SET_HDR(fc, match_param, IP_PROTOCOL_O, outer_headers.ip_protocol, eth_l3_outer.protocol_next_header); + HWS_SET_HDR(fc, match_param, IP_VERSION_O, + outer_headers.ip_version, + eth_l3_outer.ip_version); HWS_SET_HDR(fc, match_param, IP_TTL_O, outer_headers.ttl_hoplimit, eth_l3_outer.time_to_live_hop_limit); @@ -570,10 +800,16 @@ hws_definer_conv_outer(struct mlx5hws_definer_conv_data *cd, 
outer_headers.dst_ipv4_dst_ipv6.ipv6_layout); /* Assume IPv6 is used if ipv6 bits are set */ - is_s_ipv6 = s_ipv6[0] || s_ipv6[1] || s_ipv6[2]; - is_d_ipv6 = d_ipv6[0] || d_ipv6[1] || d_ipv6[2]; + is_ipv6 = s_ipv6[0] || s_ipv6[1] || s_ipv6[2] || + d_ipv6[0] || d_ipv6[1] || d_ipv6[2]; - if (is_s_ipv6) { + /* IHL is an IPv4-specific field. */ + if (is_ipv6 && HWS_IS_FLD_SET(match_param, outer_headers.ipv4_ihl)) { + mlx5hws_err(cd->ctx, "Unsupported match on IPv6 address and IPv4 IHL\n"); + return -EINVAL; + } + + if (is_ipv6) { /* Handle IPv6 source address */ HWS_SET_HDR(fc, match_param, IPV6_SRC_127_96_O, outer_headers.src_ipv4_src_ipv6.ipv6_simple_layout.ipv6_127_96, @@ -587,13 +823,6 @@ hws_definer_conv_outer(struct mlx5hws_definer_conv_data *cd, HWS_SET_HDR(fc, match_param, IPV6_SRC_31_0_O, outer_headers.src_ipv4_src_ipv6.ipv6_simple_layout.ipv6_31_0, ipv6_src_outer.ipv6_address_31_0); - } else { - /* Handle IPv4 source address */ - HWS_SET_HDR(fc, match_param, IPV4_SRC_O, - outer_headers.src_ipv4_src_ipv6.ipv6_simple_layout.ipv6_31_0, - ipv4_src_dest_outer.source_address); - } - if (is_d_ipv6) { /* Handle IPv6 destination address */ HWS_SET_HDR(fc, match_param, IPV6_DST_127_96_O, outer_headers.dst_ipv4_dst_ipv6.ipv6_simple_layout.ipv6_127_96, @@ -608,6 +837,10 @@ hws_definer_conv_outer(struct mlx5hws_definer_conv_data *cd, outer_headers.dst_ipv4_dst_ipv6.ipv6_simple_layout.ipv6_31_0, ipv6_dst_outer.ipv6_address_31_0); } else { + /* Handle IPv4 source address */ + HWS_SET_HDR(fc, match_param, IPV4_SRC_O, + outer_headers.src_ipv4_src_ipv6.ipv6_simple_layout.ipv6_31_0, + ipv4_src_dest_outer.source_address); /* Handle IPv4 destination address */ HWS_SET_HDR(fc, match_param, IPV4_DST_O, outer_headers.dst_ipv4_dst_ipv6.ipv6_simple_layout.ipv6_31_0, @@ -665,18 +898,33 @@ static int hws_definer_conv_inner(struct mlx5hws_definer_conv_data *cd, u32 *match_param) { - bool is_s_ipv6, is_d_ipv6, smac_set, dmac_set; + bool is_ipv6, smac_set, dmac_set, ip_addr_set, 
ip_ver_set; struct mlx5hws_definer_fc *fc = cd->fc; struct mlx5hws_definer_fc *curr_fc; u32 *s_ipv6, *d_ipv6; if (HWS_IS_FLD_SET_SZ(match_param, inner_headers.l4_type, 0x2) || - HWS_IS_FLD_SET_SZ(match_param, inner_headers.reserved_at_c2, 0xe) || - HWS_IS_FLD_SET_SZ(match_param, inner_headers.reserved_at_c4, 0x4)) { + HWS_IS_FLD_SET_SZ(match_param, inner_headers.l4_type_ext, 0x4) || + HWS_IS_FLD_SET_SZ(match_param, inner_headers.reserved_at_c6, 0xa) || + HWS_IS_FLD_SET_SZ(match_param, inner_headers.reserved_at_d4, 0x4)) { mlx5hws_err(cd->ctx, "Unsupported inner parameters set\n"); return -EINVAL; } + ip_addr_set = HWS_IS_FLD_SET_SZ(match_param, + inner_headers.src_ipv4_src_ipv6, + 0x80) || + HWS_IS_FLD_SET_SZ(match_param, + inner_headers.dst_ipv4_dst_ipv6, 0x80); + ip_ver_set = HWS_IS_FLD_SET(match_param, inner_headers.ip_version) || + HWS_IS_FLD_SET(match_param, inner_headers.ethertype); + + if (ip_addr_set && !ip_ver_set) { + mlx5hws_err(cd->ctx, + "Unsupported match on IP address without version or ethertype\n"); + return -EINVAL; + } + /* L2 Check ethertype */ HWS_SET_HDR(fc, match_param, ETH_TYPE_I, inner_headers.ethertype, @@ -728,10 +976,16 @@ hws_definer_conv_inner(struct mlx5hws_definer_conv_data *cd, inner_headers.dst_ipv4_dst_ipv6.ipv6_layout); /* Assume IPv6 is used if ipv6 bits are set */ - is_s_ipv6 = s_ipv6[0] || s_ipv6[1] || s_ipv6[2]; - is_d_ipv6 = d_ipv6[0] || d_ipv6[1] || d_ipv6[2]; + is_ipv6 = s_ipv6[0] || s_ipv6[1] || s_ipv6[2] || + d_ipv6[0] || d_ipv6[1] || d_ipv6[2]; - if (is_s_ipv6) { + /* IHL is an IPv4-specific field. 
*/ + if (is_ipv6 && HWS_IS_FLD_SET(match_param, inner_headers.ipv4_ihl)) { + mlx5hws_err(cd->ctx, "Unsupported match on IPv6 address and IPv4 IHL\n"); + return -EINVAL; + } + + if (is_ipv6) { /* Handle IPv6 source address */ HWS_SET_HDR(fc, match_param, IPV6_SRC_127_96_I, inner_headers.src_ipv4_src_ipv6.ipv6_simple_layout.ipv6_127_96, @@ -745,13 +999,6 @@ hws_definer_conv_inner(struct mlx5hws_definer_conv_data *cd, HWS_SET_HDR(fc, match_param, IPV6_SRC_31_0_I, inner_headers.src_ipv4_src_ipv6.ipv6_simple_layout.ipv6_31_0, ipv6_src_inner.ipv6_address_31_0); - } else { - /* Handle IPv4 source address */ - HWS_SET_HDR(fc, match_param, IPV4_SRC_I, - inner_headers.src_ipv4_src_ipv6.ipv6_simple_layout.ipv6_31_0, - ipv4_src_dest_inner.source_address); - } - if (is_d_ipv6) { /* Handle IPv6 destination address */ HWS_SET_HDR(fc, match_param, IPV6_DST_127_96_I, inner_headers.dst_ipv4_dst_ipv6.ipv6_simple_layout.ipv6_127_96, @@ -766,6 +1013,10 @@ hws_definer_conv_inner(struct mlx5hws_definer_conv_data *cd, inner_headers.dst_ipv4_dst_ipv6.ipv6_simple_layout.ipv6_31_0, ipv6_dst_inner.ipv6_address_31_0); } else { + /* Handle IPv4 source address */ + HWS_SET_HDR(fc, match_param, IPV4_SRC_I, + inner_headers.src_ipv4_src_ipv6.ipv6_simple_layout.ipv6_31_0, + ipv4_src_dest_inner.source_address); /* Handle IPv4 destination address */ HWS_SET_HDR(fc, match_param, IPV4_DST_I, inner_headers.dst_ipv4_dst_ipv6.ipv6_simple_layout.ipv6_31_0, @@ -1029,8 +1280,9 @@ hws_definer_conv_misc2(struct mlx5hws_definer_conv_data *cd, struct mlx5hws_definer_fc *fc = cd->fc; struct mlx5hws_definer_fc *curr_fc; - if (HWS_IS_FLD_SET_SZ(match_param, misc_parameters_2.reserved_at_1a0, 0x8) || - HWS_IS_FLD_SET_SZ(match_param, misc_parameters_2.reserved_at_1b8, 0x8) || + if (HWS_IS_FLD_SET_SZ(match_param, misc_parameters_2.psp_syndrome, 0x8) || + HWS_IS_FLD_SET_SZ(match_param, + misc_parameters_2.ipsec_next_header, 0x8) || HWS_IS_FLD_SET_SZ(match_param, misc_parameters_2.reserved_at_1c0, 0x40) || 
HWS_IS_FLD_SET(match_param, misc_parameters_2.macsec_syndrome) || HWS_IS_FLD_SET(match_param, misc_parameters_2.ipsec_syndrome)) { @@ -1579,80 +1831,6 @@ err_free_fc: return ret; } -struct mlx5hws_definer_fc * -mlx5hws_definer_conv_match_params_to_compressed_fc(struct mlx5hws_context *ctx, - u8 match_criteria_enable, - u32 *match_param, - int *fc_sz) -{ - struct mlx5hws_definer_fc *compressed_fc = NULL; - struct mlx5hws_definer_conv_data cd = {0}; - struct mlx5hws_definer_fc *fc; - int ret; - - fc = hws_definer_alloc_fc(ctx, MLX5HWS_DEFINER_FNAME_MAX); - if (!fc) - return NULL; - - cd.fc = fc; - cd.ctx = ctx; - - if (match_criteria_enable & MLX5HWS_DEFINER_MATCH_CRITERIA_OUTER) { - ret = hws_definer_conv_outer(&cd, match_param); - if (ret) - goto err_free_fc; - } - - if (match_criteria_enable & MLX5HWS_DEFINER_MATCH_CRITERIA_INNER) { - ret = hws_definer_conv_inner(&cd, match_param); - if (ret) - goto err_free_fc; - } - - if (match_criteria_enable & MLX5HWS_DEFINER_MATCH_CRITERIA_MISC) { - ret = hws_definer_conv_misc(&cd, match_param); - if (ret) - goto err_free_fc; - } - - if (match_criteria_enable & MLX5HWS_DEFINER_MATCH_CRITERIA_MISC2) { - ret = hws_definer_conv_misc2(&cd, match_param); - if (ret) - goto err_free_fc; - } - - if (match_criteria_enable & MLX5HWS_DEFINER_MATCH_CRITERIA_MISC3) { - ret = hws_definer_conv_misc3(&cd, match_param); - if (ret) - goto err_free_fc; - } - - if (match_criteria_enable & MLX5HWS_DEFINER_MATCH_CRITERIA_MISC4) { - ret = hws_definer_conv_misc4(&cd, match_param); - if (ret) - goto err_free_fc; - } - - if (match_criteria_enable & MLX5HWS_DEFINER_MATCH_CRITERIA_MISC5) { - ret = hws_definer_conv_misc5(&cd, match_param); - if (ret) - goto err_free_fc; - } - - /* Allocate fc array on mt */ - compressed_fc = hws_definer_alloc_compressed_fc(fc); - if (!compressed_fc) { - mlx5hws_err(ctx, - "Convert to compressed fc: failed to set field copy to match template\n"); - goto err_free_fc; - } - *fc_sz = hws_definer_get_fc_size(fc); - 
-err_free_fc: - kfree(fc); - return compressed_fc; -} - static int hws_definer_find_byte_in_tag(struct mlx5hws_definer *definer, u32 hl_byte_off, @@ -1815,7 +1993,7 @@ hws_definer_copy_sel_ctrl(struct mlx5hws_definer_sel_ctrl *ctrl, static int hws_definer_find_best_match_fit(struct mlx5hws_context *ctx, struct mlx5hws_definer *definer, - u8 *hl) + u8 *hl, bool allow_jumbo) { struct mlx5hws_definer_sel_ctrl ctrl = {0}; bool found; @@ -1832,6 +2010,9 @@ hws_definer_find_best_match_fit(struct mlx5hws_context *ctx, return 0; } + if (!allow_jumbo) + return -E2BIG; + /* Try to create a full/limited jumbo definer */ ctrl.allowed_full_dw = ctx->caps->full_dw_jumbo_support ? DW_SELECTORS : DW_SELECTORS_MATCH; @@ -1908,7 +2089,8 @@ int mlx5hws_definer_compare(struct mlx5hws_definer *definer_a, int mlx5hws_definer_calc_layout(struct mlx5hws_context *ctx, struct mlx5hws_match_template *mt, - struct mlx5hws_definer *match_definer) + struct mlx5hws_definer *match_definer, + bool allow_jumbo) { u8 *match_hl; int ret; @@ -1930,7 +2112,8 @@ mlx5hws_definer_calc_layout(struct mlx5hws_context *ctx, } /* Find the match definer layout for header layout match union */ - ret = hws_definer_find_best_match_fit(ctx, match_definer, match_hl); + ret = hws_definer_find_best_match_fit(ctx, match_definer, match_hl, + allow_jumbo); if (ret) { if (ret == -E2BIG) mlx5hws_dbg(ctx, @@ -2118,7 +2301,7 @@ int mlx5hws_definer_mt_init(struct mlx5hws_context *ctx, struct mlx5hws_definer match_layout = {0}; int ret; - ret = mlx5hws_definer_calc_layout(ctx, mt, &match_layout); + ret = mlx5hws_definer_calc_layout(ctx, mt, &match_layout, true); if (ret) { mlx5hws_err(ctx, "Failed to calculate matcher definer layout\n"); return ret; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.h index 5c1a2086ef..141f3eb2e3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.h +++ 
b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.h @@ -823,12 +823,9 @@ void mlx5hws_definer_free(struct mlx5hws_context *ctx, int mlx5hws_definer_calc_layout(struct mlx5hws_context *ctx, struct mlx5hws_match_template *mt, - struct mlx5hws_definer *match_definer); + struct mlx5hws_definer *match_definer, + bool allow_jumbo); -struct mlx5hws_definer_fc * -mlx5hws_definer_conv_match_params_to_compressed_fc(struct mlx5hws_context *ctx, - u8 match_criteria_enable, - u32 *match_param, - int *fc_sz); +const char *mlx5hws_definer_fname_to_str(enum mlx5hws_definer_fname fname); #endif /* HWS_DEFINER_H_ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c index 1b787cd66e..6a4c4cccd6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c @@ -267,6 +267,7 @@ static int mlx5_cmd_hws_create_flow_table(struct mlx5_flow_root_namespace *ns, tbl_attr.type = MLX5HWS_TABLE_TYPE_FDB; tbl_attr.level = ft_attr->level; + tbl_attr.uid = ft_attr->uid; tbl = mlx5hws_table_create(ctx, &tbl_attr); if (!tbl) { mlx5_core_err(ns->dev, "Failed creating hws flow_table\n"); @@ -571,14 +572,12 @@ static void mlx5_fs_put_dest_action_sampler(struct mlx5_fs_hws_context *fs_ctx, static struct mlx5hws_action * mlx5_fs_create_action_dest_array(struct mlx5hws_context *ctx, struct mlx5hws_action_dest_attr *dests, - u32 num_of_dests, bool ignore_flow_level, - u32 flow_source) + u32 num_of_dests) { u32 flags = MLX5HWS_ACTION_FLAG_HWS_FDB | MLX5HWS_ACTION_FLAG_SHARED; return mlx5hws_action_create_dest_array(ctx, num_of_dests, dests, - ignore_flow_level, - flow_source, flags); + flags); } static struct mlx5hws_action * @@ -966,6 +965,9 @@ static int mlx5_fs_fte_get_hws_actions(struct mlx5_flow_root_namespace *ns, switch (attr->type) { case MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE: dest_action = 
mlx5_fs_get_dest_action_ft(fs_ctx, dst); + if (dst->dest_attr.ft->flags & + MLX5_FLOW_TABLE_UPLINK_VPORT) + dest_actions[num_dest_actions].is_wire_ft = true; break; case MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM: dest_action = mlx5_fs_get_dest_action_table_num(fs_ctx, @@ -1012,20 +1014,14 @@ static int mlx5_fs_fte_get_hws_actions(struct mlx5_flow_root_namespace *ns, } (*ractions)[num_actions++].action = dest_actions->dest; } else if (num_dest_actions > 1) { - u32 flow_source = fte->act_dests.flow_context.flow_source; - bool ignore_flow_level; - if (num_actions == MLX5_FLOW_CONTEXT_ACTION_MAX || num_fs_actions == MLX5_FLOW_CONTEXT_ACTION_MAX) { err = -EOPNOTSUPP; goto free_actions; } - ignore_flow_level = - !!(fte_action->flags & FLOW_ACT_IGNORE_FLOW_LEVEL); - tmp_action = mlx5_fs_create_action_dest_array(ctx, dest_actions, - num_dest_actions, - ignore_flow_level, - flow_source); + tmp_action = + mlx5_fs_create_action_dest_array(ctx, dest_actions, + num_dest_actions); if (!tmp_action) { err = -EOPNOTSUPP; goto free_actions; @@ -1081,13 +1077,8 @@ static int mlx5_cmd_hws_create_fte(struct mlx5_flow_root_namespace *ns, struct mlx5hws_bwc_rule *rule; int err = 0; - if (mlx5_fs_cmd_is_fw_term_table(ft)) { - /* Packet reformat on terminamtion table not supported yet */ - if (fte->act_dests.action.action & - MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT) - return -EOPNOTSUPP; + if (mlx5_fs_cmd_is_fw_term_table(ft)) return mlx5_fs_cmd_get_fw_cmds()->create_fte(ns, ft, group, fte); - } err = mlx5_fs_fte_get_hws_actions(ns, ft, group, fte, &ractions); if (err) @@ -1362,7 +1353,8 @@ mlx5_cmd_hws_packet_reformat_alloc(struct mlx5_flow_root_namespace *ns, pkt_reformat->fs_hws_action.pr_data = pr_data; } - pkt_reformat->owner = MLX5_FLOW_RESOURCE_OWNER_SW; + mutex_init(&pkt_reformat->fs_hws_action.lock); + pkt_reformat->owner = MLX5_FLOW_RESOURCE_OWNER_HWS; pkt_reformat->fs_hws_action.hws_action = hws_action; return 0; @@ -1380,6 +1372,15 @@ static void 
mlx5_cmd_hws_packet_reformat_dealloc(struct mlx5_flow_root_namespace struct mlx5_fs_hws_pr *pr_data; struct mlx5_fs_pool *pr_pool; + if (pkt_reformat->fs_hws_action.fw_reformat_id != 0) { + struct mlx5_pkt_reformat fw_pkt_reformat = { 0 }; + + fw_pkt_reformat.id = pkt_reformat->fs_hws_action.fw_reformat_id; + mlx5_fs_cmd_get_fw_cmds()-> + packet_reformat_dealloc(ns, &fw_pkt_reformat); + pkt_reformat->fs_hws_action.fw_reformat_id = 0; + } + if (pkt_reformat->reformat_type == MLX5_REFORMAT_TYPE_REMOVE_HDR) return; @@ -1532,6 +1533,58 @@ static void mlx5_cmd_hws_modify_header_dealloc(struct mlx5_flow_root_namespace * modify_hdr->fs_hws_action.mh_data = NULL; } +int +mlx5_fs_hws_action_get_pkt_reformat_id(struct mlx5_pkt_reformat *pkt_reformat, + u32 *reformat_id) +{ + enum mlx5_flow_namespace_type ns_type = pkt_reformat->ns_type; + struct mutex *lock = &pkt_reformat->fs_hws_action.lock; + u32 *id = &pkt_reformat->fs_hws_action.fw_reformat_id; + struct mlx5_pkt_reformat fw_pkt_reformat = { 0 }; + struct mlx5_pkt_reformat_params params = { 0 }; + struct mlx5_flow_root_namespace *ns; + struct mlx5_core_dev *dev; + int ret; + + mutex_lock(lock); + + if (*id != 0) { + *reformat_id = *id; + ret = 0; + goto unlock; + } + + dev = mlx5hws_action_get_dev(pkt_reformat->fs_hws_action.hws_action); + if (!dev) { + ret = -EINVAL; + goto unlock; + } + + ns = mlx5_get_root_namespace(dev, ns_type); + if (!ns) { + ret = -EINVAL; + goto unlock; + } + + params.type = pkt_reformat->reformat_type; + params.size = pkt_reformat->fs_hws_action.pr_data->data_size; + params.data = pkt_reformat->fs_hws_action.pr_data->data; + + ret = mlx5_fs_cmd_get_fw_cmds()-> + packet_reformat_alloc(ns, &params, ns_type, &fw_pkt_reformat); + if (ret) + goto unlock; + + *id = fw_pkt_reformat.id; + *reformat_id = *id; + ret = 0; + +unlock: + mutex_unlock(lock); + + return ret; +} + static int mlx5_cmd_hws_create_match_definer(struct mlx5_flow_root_namespace *ns, u16 format_id, u32 *match_mask) { diff --git
a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.h index 8b56298288..b92d55b2d1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.h @@ -41,6 +41,11 @@ struct mlx5_fs_hws_action { struct mlx5_fs_pool *fs_pool; struct mlx5_fs_hws_pr *pr_data; struct mlx5_fs_hws_mh *mh_data; + u32 fw_reformat_id; + /* Protect `fw_reformat_id` against being initialized from multiple + * threads. + */ + struct mutex lock; }; struct mlx5_fs_hws_matcher { @@ -84,12 +89,23 @@ void mlx5_fs_put_hws_action(struct mlx5_fs_hws_data *fs_hws_data); #ifdef CONFIG_MLX5_HW_STEERING +int +mlx5_fs_hws_action_get_pkt_reformat_id(struct mlx5_pkt_reformat *pkt_reformat, + u32 *reformat_id); + bool mlx5_fs_hws_is_supported(struct mlx5_core_dev *dev); const struct mlx5_flow_cmds *mlx5_fs_cmd_get_hws_cmds(void); #else +static inline int +mlx5_fs_hws_action_get_pkt_reformat_id(struct mlx5_pkt_reformat *pkt_reformat, + u32 *reformat_id) +{ + return -EOPNOTSUPP; +} + static inline bool mlx5_fs_hws_is_supported(struct mlx5_core_dev *dev) { return false; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/internal.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/internal.h index 30ccd635b5..21279d5031 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/internal.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/internal.h @@ -17,6 +17,7 @@ #include "context.h" #include "table.h" #include "send.h" +#include "action_ste_pool.h" #include "rule.h" #include "cmd.h" #include "action.h" diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c index b61864b320..32f87fdf32 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c @@ -3,25 +3,6 
@@ #include "internal.h" -enum mlx5hws_matcher_rtc_type { - HWS_MATCHER_RTC_TYPE_MATCH, - HWS_MATCHER_RTC_TYPE_STE_ARRAY, - HWS_MATCHER_RTC_TYPE_MAX, -}; - -static const char * const mlx5hws_matcher_rtc_type_str[] = { - [HWS_MATCHER_RTC_TYPE_MATCH] = "MATCH", - [HWS_MATCHER_RTC_TYPE_STE_ARRAY] = "STE_ARRAY", - [HWS_MATCHER_RTC_TYPE_MAX] = "UNKNOWN", -}; - -static const char *hws_matcher_rtc_type_to_str(enum mlx5hws_matcher_rtc_type rtc_type) -{ - if (rtc_type > HWS_MATCHER_RTC_TYPE_MAX) - rtc_type = HWS_MATCHER_RTC_TYPE_MAX; - return mlx5hws_matcher_rtc_type_str[rtc_type]; -} - static bool hws_matcher_requires_col_tbl(u8 log_num_of_rules) { /* Collision table concatenation is done only for large rule tables */ @@ -42,19 +23,202 @@ static void hws_matcher_destroy_end_ft(struct mlx5hws_matcher *matcher) mlx5hws_table_destroy_default_ft(matcher->tbl, matcher->end_ft_id); } +int mlx5hws_matcher_update_end_ft_isolated(struct mlx5hws_table *tbl, + u32 miss_ft_id) +{ + struct mlx5hws_matcher *tmp_matcher; + + if (list_empty(&tbl->matchers_list)) + return -EINVAL; + + /* Update isolated_matcher_end_ft_id attribute for all + * the matchers in isolated table. 
+ */ + list_for_each_entry(tmp_matcher, &tbl->matchers_list, list_node) + tmp_matcher->attr.isolated_matcher_end_ft_id = miss_ft_id; + + tmp_matcher = list_last_entry(&tbl->matchers_list, + struct mlx5hws_matcher, + list_node); + + return mlx5hws_table_ft_set_next_ft(tbl->ctx, + tmp_matcher->end_ft_id, + tbl->fw_ft_type, + miss_ft_id); +} + +static int hws_matcher_connect_end_ft_isolated(struct mlx5hws_matcher *matcher) +{ + struct mlx5hws_table *tbl = matcher->tbl; + u32 end_ft_id; + int ret; + + /* Reset end_ft next RTCs */ + ret = mlx5hws_table_ft_set_next_rtc(tbl->ctx, + matcher->end_ft_id, + matcher->tbl->fw_ft_type, + 0, 0); + if (ret) { + mlx5hws_err(tbl->ctx, "Isolated matcher: failed to reset FT's next RTCs\n"); + return ret; + } + + /* Connect isolated matcher's end_ft to the complex matcher's end FT */ + end_ft_id = matcher->attr.isolated_matcher_end_ft_id; + ret = mlx5hws_table_ft_set_next_ft(tbl->ctx, + matcher->end_ft_id, + matcher->tbl->fw_ft_type, + end_ft_id); + + if (ret) { + mlx5hws_err(tbl->ctx, "Isolated matcher: failed to set FT's miss_ft_id\n"); + return ret; + } + + return 0; +} + +static int hws_matcher_create_end_ft_isolated(struct mlx5hws_matcher *matcher) +{ + struct mlx5hws_table *tbl = matcher->tbl; + int ret; + + ret = mlx5hws_table_create_default_ft(tbl->ctx->mdev, + tbl, + 0, + &matcher->end_ft_id); + if (ret) { + mlx5hws_err(tbl->ctx, "Isolated matcher: failed to create end flow table\n"); + return ret; + } + + ret = hws_matcher_connect_end_ft_isolated(matcher); + if (ret) { + mlx5hws_err(tbl->ctx, "Isolated matcher: failed to connect end FT\n"); + goto destroy_default_ft; + } + + return 0; + +destroy_default_ft: + mlx5hws_table_destroy_default_ft(tbl, matcher->end_ft_id); + return ret; +} + static int hws_matcher_create_end_ft(struct mlx5hws_matcher *matcher) { struct mlx5hws_table *tbl = matcher->tbl; int ret; - ret = mlx5hws_table_create_default_ft(tbl->ctx->mdev, tbl, &matcher->end_ft_id); + if 
(mlx5hws_matcher_is_isolated(matcher)) + ret = hws_matcher_create_end_ft_isolated(matcher); + else + ret = mlx5hws_table_create_default_ft(tbl->ctx->mdev, + tbl, + 0, + &matcher->end_ft_id); + if (ret) { mlx5hws_err(tbl->ctx, "Failed to create matcher end flow table\n"); return ret; } + return 0; } +static int hws_matcher_connect_isolated_first(struct mlx5hws_matcher *matcher) +{ + struct mlx5hws_table *tbl = matcher->tbl; + struct mlx5hws_context *ctx = tbl->ctx; + int ret; + + /* Isolated matcher's end_ft is already pointing to the end_ft + * of the complex matcher - it was set at creation of end_ft, + * so no need to connect it. + * We still need to connect the isolated table's start FT to + * this matcher's RTC. + */ + ret = mlx5hws_table_ft_set_next_rtc(ctx, + tbl->ft_id, + tbl->fw_ft_type, + matcher->match_ste.rtc_0_id, + matcher->match_ste.rtc_1_id); + if (ret) { + mlx5hws_err(ctx, "Isolated matcher: failed to connect start FT to match RTC\n"); + return ret; + } + + /* Reset table's FT default miss (drop refcount) */ + ret = mlx5hws_table_ft_set_default_next_ft(tbl, tbl->ft_id); + if (ret) { + mlx5hws_err(ctx, "Isolated matcher: failed to reset table ft default miss\n"); + return ret; + } + + list_add(&matcher->list_node, &tbl->matchers_list); + + return ret; +} + +static int hws_matcher_connect_isolated_last(struct mlx5hws_matcher *matcher) +{ + struct mlx5hws_table *tbl = matcher->tbl; + struct mlx5hws_context *ctx = tbl->ctx; + struct mlx5hws_matcher *last; + int ret; + + last = list_last_entry(&tbl->matchers_list, + struct mlx5hws_matcher, + list_node); + + /* New matcher's end_ft is already pointing to the end_ft of + * the complex matcher. + * Connect previous matcher's end_ft to this new matcher RTC. 
+ */ + ret = mlx5hws_table_ft_set_next_rtc(ctx, + last->end_ft_id, + tbl->fw_ft_type, + matcher->match_ste.rtc_0_id, + matcher->match_ste.rtc_1_id); + if (ret) { + mlx5hws_err(ctx, + "Isolated matcher: failed to connect matcher end_ft to new match RTC\n"); + return ret; + } + + /* Reset prev matcher FT default miss (drop refcount) */ + ret = mlx5hws_table_ft_set_default_next_ft(tbl, last->end_ft_id); + if (ret) { + mlx5hws_err(ctx, "Isolated matcher: failed to reset matcher ft default miss\n"); + return ret; + } + + /* Insert after the last matcher */ + list_add(&matcher->list_node, &last->list_node); + + return 0; +} + +static int hws_matcher_connect_isolated(struct mlx5hws_matcher *matcher) +{ + /* Isolated matcher is expected to be the only one in its table. + * However, it can have a collision matcher, and it can go through + * rehash process, in which case we will temporary have both old and + * new matchers in the isolated table. + * Check if this is the first matcher in the isolated table. + */ + if (list_empty(&matcher->tbl->matchers_list)) + return hws_matcher_connect_isolated_first(matcher); + + /* If this wasn't the first matcher, then we have 3 possible cases: + * - this is a collision matcher for the first matcher + * - this is a new rehash dest matcher + * - this is a collision matcher for the new rehash dest matcher + * The logic to add new matcher is the same for all these cases. 
+ */ + return hws_matcher_connect_isolated_last(matcher); +} + static int hws_matcher_connect(struct mlx5hws_matcher *matcher) { struct mlx5hws_table *tbl = matcher->tbl; @@ -64,6 +228,9 @@ static int hws_matcher_connect(struct mlx5hws_matcher *matcher) struct mlx5hws_matcher *tmp_matcher; int ret; + if (mlx5hws_matcher_is_isolated(matcher)) + return hws_matcher_connect_isolated(matcher); + /* Find location in matcher list */ if (list_empty(&tbl->matchers_list)) { list_add(&matcher->list_node, &tbl->matchers_list); @@ -140,6 +307,92 @@ remove_from_list: return ret; } +static int hws_matcher_disconnect_isolated(struct mlx5hws_matcher *matcher) +{ + struct mlx5hws_matcher *first, *last, *prev, *next; + struct mlx5hws_table *tbl = matcher->tbl; + struct mlx5hws_context *ctx = tbl->ctx; + u32 end_ft_id; + int ret; + + first = list_first_entry(&tbl->matchers_list, + struct mlx5hws_matcher, + list_node); + last = list_last_entry(&tbl->matchers_list, + struct mlx5hws_matcher, + list_node); + prev = list_prev_entry(matcher, list_node); + next = list_next_entry(matcher, list_node); + + list_del_init(&matcher->list_node); + + if (first == last) { + /* This was the only matcher in the list. + * Reset isolated table FT next RTCs and connect it + * to the whole complex matcher end FT instead. + */ + ret = mlx5hws_table_ft_set_next_rtc(ctx, + tbl->ft_id, + tbl->fw_ft_type, + 0, 0); + if (ret) { + mlx5hws_err(tbl->ctx, "Isolated matcher: failed to reset FT's next RTCs\n"); + return ret; + } + + end_ft_id = matcher->attr.isolated_matcher_end_ft_id; + ret = mlx5hws_table_ft_set_next_ft(tbl->ctx, + tbl->ft_id, + tbl->fw_ft_type, + end_ft_id); + if (ret) { + mlx5hws_err(tbl->ctx, "Isolated matcher: failed to set FT's miss_ft_id\n"); + return ret; + } + + return 0; + } + + /* At this point we know that there are more matchers in the list */ + + if (matcher == first) { + /* We've disconnected the first matcher. + * Now update isolated table default FT. 
+ */ + if (!next) + return -EINVAL; + return mlx5hws_table_ft_set_next_rtc(ctx, + tbl->ft_id, + tbl->fw_ft_type, + next->match_ste.rtc_0_id, + next->match_ste.rtc_1_id); + } + + if (matcher == last) { + /* If we've disconnected the last matcher - update prev + * matcher's end_ft to point to the complex matcher end_ft. + */ + if (!prev) + return -EINVAL; + return hws_matcher_connect_end_ft_isolated(prev); + } + + /* This wasn't the first or the last matcher, which means that it has + * both prev and next matchers. Note that this only happens if we're + * disconnecting collision matcher of the old matcher during rehash. + */ + if (!prev || !next || + !(matcher->flags & MLX5HWS_MATCHER_FLAGS_COLLISION)) + return -EINVAL; + + /* Update prev end FT to point to next match RTC */ + return mlx5hws_table_ft_set_next_rtc(ctx, + prev->end_ft_id, + tbl->fw_ft_type, + next->match_ste.rtc_0_id, + next->match_ste.rtc_1_id); +} + static int hws_matcher_disconnect(struct mlx5hws_matcher *matcher) { struct mlx5hws_matcher *next = NULL, *prev = NULL; @@ -147,6 +400,9 @@ static int hws_matcher_disconnect(struct mlx5hws_matcher *matcher) u32 prev_ft_id = tbl->ft_id; int ret; + if (mlx5hws_matcher_is_isolated(matcher)) + return hws_matcher_disconnect_isolated(matcher); + if (!list_is_first(&matcher->list_node, &tbl->matchers_list)) { prev = list_prev_entry(matcher, list_node); prev_ft_id = prev->end_ft_id; @@ -197,149 +453,98 @@ static int hws_matcher_disconnect(struct mlx5hws_matcher *matcher) static void hws_matcher_set_rtc_attr_sz(struct mlx5hws_matcher *matcher, struct mlx5hws_cmd_rtc_create_attr *rtc_attr, - enum mlx5hws_matcher_rtc_type rtc_type, bool is_mirror) { - struct mlx5hws_pool_chunk *ste = &matcher->action_ste.ste; enum mlx5hws_matcher_flow_src flow_src = matcher->attr.optimize_flow_src; - bool is_match_rtc = rtc_type == HWS_MATCHER_RTC_TYPE_MATCH; if ((flow_src == MLX5HWS_MATCHER_FLOW_SRC_VPORT && !is_mirror) || (flow_src == MLX5HWS_MATCHER_FLOW_SRC_WIRE && is_mirror)) { 
/* Optimize FDB RTC */ rtc_attr->log_size = 0; rtc_attr->log_depth = 0; - } else { - /* Keep original values */ - rtc_attr->log_size = is_match_rtc ? matcher->attr.table.sz_row_log : ste->order; - rtc_attr->log_depth = is_match_rtc ? matcher->attr.table.sz_col_log : 0; } } -static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher, - enum mlx5hws_matcher_rtc_type rtc_type) +static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher) { struct mlx5hws_matcher_attr *attr = &matcher->attr; struct mlx5hws_cmd_rtc_create_attr rtc_attr = {0}; struct mlx5hws_match_template *mt = matcher->mt; struct mlx5hws_context *ctx = matcher->tbl->ctx; - struct mlx5hws_action_default_stc *default_stc; - struct mlx5hws_matcher_action_ste *action_ste; + union mlx5hws_matcher_size *size_rx, *size_tx; struct mlx5hws_table *tbl = matcher->tbl; - struct mlx5hws_pool *ste_pool, *stc_pool; - struct mlx5hws_pool_chunk *ste; - u32 *rtc_0_id, *rtc_1_id; u32 obj_id; int ret; - switch (rtc_type) { - case HWS_MATCHER_RTC_TYPE_MATCH: - rtc_0_id = &matcher->match_ste.rtc_0_id; - rtc_1_id = &matcher->match_ste.rtc_1_id; - ste_pool = matcher->match_ste.pool; - ste = &matcher->match_ste.ste; - ste->order = attr->table.sz_col_log + attr->table.sz_row_log; + size_rx = &attr->size[MLX5HWS_MATCHER_SIZE_TYPE_RX]; + size_tx = &attr->size[MLX5HWS_MATCHER_SIZE_TYPE_TX]; - rtc_attr.log_size = attr->table.sz_row_log; - rtc_attr.log_depth = attr->table.sz_col_log; - rtc_attr.is_frst_jumbo = mlx5hws_matcher_mt_is_jumbo(mt); - rtc_attr.is_scnd_range = 0; - rtc_attr.miss_ft_id = matcher->end_ft_id; + rtc_attr.log_size = size_rx->table.sz_row_log; + rtc_attr.log_depth = size_rx->table.sz_col_log; + rtc_attr.is_frst_jumbo = mlx5hws_matcher_mt_is_jumbo(mt); + rtc_attr.is_scnd_range = 0; + rtc_attr.miss_ft_id = matcher->end_ft_id; - if (attr->insert_mode == MLX5HWS_MATCHER_INSERT_BY_HASH) { - /* The usual Hash Table */ - rtc_attr.update_index_mode = MLX5_IFC_RTC_STE_UPDATE_MODE_BY_HASH; + if 
(attr->insert_mode == MLX5HWS_MATCHER_INSERT_BY_HASH) { + /* The usual Hash Table */ + rtc_attr.update_index_mode = + MLX5_IFC_RTC_STE_UPDATE_MODE_BY_HASH; - /* The first mt is used since all share the same definer */ - rtc_attr.match_definer_0 = mlx5hws_definer_get_id(mt->definer); - } else if (attr->insert_mode == MLX5HWS_MATCHER_INSERT_BY_INDEX) { - rtc_attr.update_index_mode = MLX5_IFC_RTC_STE_UPDATE_MODE_BY_OFFSET; - rtc_attr.num_hash_definer = 1; + /* The first mt is used since all share the same definer */ + rtc_attr.match_definer_0 = mlx5hws_definer_get_id(mt->definer); + } else if (attr->insert_mode == MLX5HWS_MATCHER_INSERT_BY_INDEX) { + rtc_attr.update_index_mode = + MLX5_IFC_RTC_STE_UPDATE_MODE_BY_OFFSET; + rtc_attr.num_hash_definer = 1; - if (attr->distribute_mode == MLX5HWS_MATCHER_DISTRIBUTE_BY_HASH) { - /* Hash Split Table */ - rtc_attr.access_index_mode = MLX5_IFC_RTC_STE_ACCESS_MODE_BY_HASH; - rtc_attr.match_definer_0 = mlx5hws_definer_get_id(mt->definer); - } else if (attr->distribute_mode == MLX5HWS_MATCHER_DISTRIBUTE_BY_LINEAR) { - /* Linear Lookup Table */ - rtc_attr.access_index_mode = MLX5_IFC_RTC_STE_ACCESS_MODE_LINEAR; - rtc_attr.match_definer_0 = ctx->caps->linear_match_definer; - } + if (attr->distribute_mode == + MLX5HWS_MATCHER_DISTRIBUTE_BY_HASH) { + /* Hash Split Table */ + rtc_attr.access_index_mode = + MLX5_IFC_RTC_STE_ACCESS_MODE_BY_HASH; + rtc_attr.match_definer_0 = + mlx5hws_definer_get_id(mt->definer); + } else if (attr->distribute_mode == + MLX5HWS_MATCHER_DISTRIBUTE_BY_LINEAR) { + /* Linear Lookup Table */ + rtc_attr.access_index_mode = + MLX5_IFC_RTC_STE_ACCESS_MODE_LINEAR; + rtc_attr.match_definer_0 = + ctx->caps->linear_match_definer; } - - /* Match pool requires implicit allocation */ - ret = mlx5hws_pool_chunk_alloc(ste_pool, ste); - if (ret) { - mlx5hws_err(ctx, "Failed to allocate STE for %s RTC", - hws_matcher_rtc_type_to_str(rtc_type)); - return ret; - } - break; - - case HWS_MATCHER_RTC_TYPE_STE_ARRAY: - action_ste 
= &matcher->action_ste; - - rtc_0_id = &action_ste->rtc_0_id; - rtc_1_id = &action_ste->rtc_1_id; - ste_pool = action_ste->pool; - ste = &action_ste->ste; - /* Action RTC size calculation: - * log((max number of rules in matcher) * - * (max number of action STEs per rule) * - * (2 to support writing new STEs for update rule)) - */ - ste->order = ilog2(roundup_pow_of_two(action_ste->max_stes)) + - attr->table.sz_row_log + - MLX5HWS_MATCHER_ACTION_RTC_UPDATE_MULT; - rtc_attr.log_size = ste->order; - rtc_attr.log_depth = 0; - rtc_attr.update_index_mode = MLX5_IFC_RTC_STE_UPDATE_MODE_BY_OFFSET; - /* The action STEs use the default always hit definer */ - rtc_attr.match_definer_0 = ctx->caps->trivial_match_definer; - rtc_attr.is_frst_jumbo = false; - rtc_attr.miss_ft_id = 0; - break; - - default: - mlx5hws_err(ctx, "HWS Invalid RTC type\n"); - return -EINVAL; } - obj_id = mlx5hws_pool_chunk_get_base_id(ste_pool, ste); - rtc_attr.pd = ctx->pd_num; - rtc_attr.ste_base = obj_id; - rtc_attr.ste_offset = ste->offset; + rtc_attr.ste_base = matcher->match_ste.ste_0_base; rtc_attr.reparse_mode = mlx5hws_context_get_reparse_mode(ctx); rtc_attr.table_type = mlx5hws_table_get_res_fw_ft_type(tbl->type, false); - hws_matcher_set_rtc_attr_sz(matcher, &rtc_attr, rtc_type, false); + hws_matcher_set_rtc_attr_sz(matcher, &rtc_attr, false); /* STC is a single resource (obj_id), use any STC for the ID */ - stc_pool = ctx->stc_pool; - default_stc = ctx->common_res.default_stc; - obj_id = mlx5hws_pool_chunk_get_base_id(stc_pool, &default_stc->default_hit); + obj_id = mlx5hws_pool_get_base_id(ctx->stc_pool); rtc_attr.stc_base = obj_id; - ret = mlx5hws_cmd_rtc_create(ctx->mdev, &rtc_attr, rtc_0_id); + ret = mlx5hws_cmd_rtc_create(ctx->mdev, &rtc_attr, + &matcher->match_ste.rtc_0_id); if (ret) { - mlx5hws_err(ctx, "Failed to create matcher RTC of type %s", - hws_matcher_rtc_type_to_str(rtc_type)); - goto free_ste; + mlx5hws_err(ctx, "Failed to create matcher RTC\n"); + return ret; } if 
(tbl->type == MLX5HWS_TABLE_TYPE_FDB) { - obj_id = mlx5hws_pool_chunk_get_base_mirror_id(ste_pool, ste); - rtc_attr.ste_base = obj_id; + rtc_attr.log_size = size_tx->table.sz_row_log; + rtc_attr.log_depth = size_tx->table.sz_col_log; + rtc_attr.ste_base = matcher->match_ste.ste_1_base; rtc_attr.table_type = mlx5hws_table_get_res_fw_ft_type(tbl->type, true); - obj_id = mlx5hws_pool_chunk_get_base_mirror_id(stc_pool, &default_stc->default_hit); + obj_id = mlx5hws_pool_get_base_mirror_id(ctx->stc_pool); rtc_attr.stc_base = obj_id; - hws_matcher_set_rtc_attr_sz(matcher, &rtc_attr, rtc_type, true); + hws_matcher_set_rtc_attr_sz(matcher, &rtc_attr, true); - ret = mlx5hws_cmd_rtc_create(ctx->mdev, &rtc_attr, rtc_1_id); + ret = mlx5hws_cmd_rtc_create(ctx->mdev, &rtc_attr, + &matcher->match_ste.rtc_1_id); if (ret) { - mlx5hws_err(ctx, "Failed to create peer matcher RTC of type %s", - hws_matcher_rtc_type_to_str(rtc_type)); + mlx5hws_err(ctx, "Failed to create mirror matcher RTC\n"); goto destroy_rtc_0; } } @@ -347,46 +552,18 @@ static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher, return 0; destroy_rtc_0: - mlx5hws_cmd_rtc_destroy(ctx->mdev, *rtc_0_id); -free_ste: - if (rtc_type == HWS_MATCHER_RTC_TYPE_MATCH) - mlx5hws_pool_chunk_free(ste_pool, ste); + mlx5hws_cmd_rtc_destroy(ctx->mdev, matcher->match_ste.rtc_0_id); return ret; } -static void hws_matcher_destroy_rtc(struct mlx5hws_matcher *matcher, - enum mlx5hws_matcher_rtc_type rtc_type) +static void hws_matcher_destroy_rtc(struct mlx5hws_matcher *matcher) { - struct mlx5hws_matcher_action_ste *action_ste; - struct mlx5hws_table *tbl = matcher->tbl; - struct mlx5hws_pool_chunk *ste; - struct mlx5hws_pool *ste_pool; - u32 rtc_0_id, rtc_1_id; + struct mlx5_core_dev *mdev = matcher->tbl->ctx->mdev; - switch (rtc_type) { - case HWS_MATCHER_RTC_TYPE_MATCH: - rtc_0_id = matcher->match_ste.rtc_0_id; - rtc_1_id = matcher->match_ste.rtc_1_id; - ste_pool = matcher->match_ste.pool; - ste = &matcher->match_ste.ste; - 
break; - case HWS_MATCHER_RTC_TYPE_STE_ARRAY: - action_ste = &matcher->action_ste; - rtc_0_id = action_ste->rtc_0_id; - rtc_1_id = action_ste->rtc_1_id; - ste_pool = action_ste->pool; - ste = &action_ste->ste; - break; - default: - return; - } + if (matcher->tbl->type == MLX5HWS_TABLE_TYPE_FDB) + mlx5hws_cmd_rtc_destroy(mdev, matcher->match_ste.rtc_1_id); - if (tbl->type == MLX5HWS_TABLE_TYPE_FDB) - mlx5hws_cmd_rtc_destroy(matcher->tbl->ctx->mdev, rtc_1_id); - - mlx5hws_cmd_rtc_destroy(matcher->tbl->ctx->mdev, rtc_0_id); - if (rtc_type == HWS_MATCHER_RTC_TYPE_MATCH) - mlx5hws_pool_chunk_free(ste_pool, ste); + mlx5hws_cmd_rtc_destroy(mdev, matcher->match_ste.rtc_0_id); } static int @@ -394,43 +571,38 @@ hws_matcher_check_attr_sz(struct mlx5hws_cmd_query_caps *caps, struct mlx5hws_matcher *matcher) { struct mlx5hws_matcher_attr *attr = &matcher->attr; + struct mlx5hws_context *ctx = matcher->tbl->ctx; + union mlx5hws_matcher_size *size; + int i; - if (attr->table.sz_col_log > caps->rtc_log_depth_max) { - mlx5hws_err(matcher->tbl->ctx, "Matcher depth exceeds limit %d\n", - caps->rtc_log_depth_max); - return -EOPNOTSUPP; - } + for (i = 0; i < 2; i++) { + size = &attr->size[i]; - if (attr->table.sz_col_log + attr->table.sz_row_log > caps->ste_alloc_log_max) { - mlx5hws_err(matcher->tbl->ctx, "Total matcher size exceeds limit %d\n", - caps->ste_alloc_log_max); - return -EOPNOTSUPP; - } + if (size->table.sz_col_log > caps->rtc_log_depth_max) { + mlx5hws_err(ctx, "Matcher depth exceeds limit %d\n", + caps->rtc_log_depth_max); + return -EOPNOTSUPP; + } - if (attr->table.sz_col_log + attr->table.sz_row_log < caps->ste_alloc_log_gran) { - mlx5hws_err(matcher->tbl->ctx, "Total matcher size below limit %d\n", - caps->ste_alloc_log_gran); - return -EOPNOTSUPP; + if (size->table.sz_col_log + size->table.sz_row_log > + caps->ste_alloc_log_max) { + mlx5hws_err(ctx, + "Total matcher size exceeds limit %d\n", + caps->ste_alloc_log_max); + return -EOPNOTSUPP; + } + + if 
(size->table.sz_col_log + size->table.sz_row_log < + caps->ste_alloc_log_gran) { + mlx5hws_err(ctx, "Total matcher size below limit %d\n", + caps->ste_alloc_log_gran); + return -EOPNOTSUPP; + } } return 0; } -static void hws_matcher_set_pool_attr(struct mlx5hws_pool_attr *attr, - struct mlx5hws_matcher *matcher) -{ - switch (matcher->attr.optimize_flow_src) { - case MLX5HWS_MATCHER_FLOW_SRC_VPORT: - attr->opt_type = MLX5HWS_POOL_OPTIMIZE_ORIG; - break; - case MLX5HWS_MATCHER_FLOW_SRC_WIRE: - attr->opt_type = MLX5HWS_POOL_OPTIMIZE_MIRROR; - break; - default: - break; - } -} - static int hws_matcher_check_and_process_at(struct mlx5hws_matcher *matcher, struct mlx5hws_action_template *at) { @@ -454,85 +626,17 @@ static int hws_matcher_check_and_process_at(struct mlx5hws_matcher *matcher, return 0; } -static int hws_matcher_resize_init(struct mlx5hws_matcher *src_matcher) -{ - struct mlx5hws_matcher_resize_data *resize_data; - - resize_data = kzalloc(sizeof(*resize_data), GFP_KERNEL); - if (!resize_data) - return -ENOMEM; - - resize_data->max_stes = src_matcher->action_ste.max_stes; - - resize_data->stc = src_matcher->action_ste.stc; - resize_data->rtc_0_id = src_matcher->action_ste.rtc_0_id; - resize_data->rtc_1_id = src_matcher->action_ste.rtc_1_id; - resize_data->pool = src_matcher->action_ste.max_stes ? 
- src_matcher->action_ste.pool : NULL; - - /* Place the new resized matcher on the dst matcher's list */ - list_add(&resize_data->list_node, &src_matcher->resize_dst->resize_data); - - /* Move all the previous resized matchers to the dst matcher's list */ - while (!list_empty(&src_matcher->resize_data)) { - resize_data = list_first_entry(&src_matcher->resize_data, - struct mlx5hws_matcher_resize_data, - list_node); - list_del_init(&resize_data->list_node); - list_add(&resize_data->list_node, &src_matcher->resize_dst->resize_data); - } - - return 0; -} - -static void hws_matcher_resize_uninit(struct mlx5hws_matcher *matcher) -{ - struct mlx5hws_matcher_resize_data *resize_data; - - if (!mlx5hws_matcher_is_resizable(matcher)) - return; - - while (!list_empty(&matcher->resize_data)) { - resize_data = list_first_entry(&matcher->resize_data, - struct mlx5hws_matcher_resize_data, - list_node); - list_del_init(&resize_data->list_node); - - if (resize_data->max_stes) { - mlx5hws_action_free_single_stc(matcher->tbl->ctx, - matcher->tbl->type, - &resize_data->stc); - - if (matcher->tbl->type == MLX5HWS_TABLE_TYPE_FDB) - mlx5hws_cmd_rtc_destroy(matcher->tbl->ctx->mdev, - resize_data->rtc_1_id); - - mlx5hws_cmd_rtc_destroy(matcher->tbl->ctx->mdev, - resize_data->rtc_0_id); - - if (resize_data->pool) - mlx5hws_pool_destroy(resize_data->pool); - } - - kfree(resize_data); - } -} - static int hws_matcher_bind_at(struct mlx5hws_matcher *matcher) { bool is_jumbo = mlx5hws_matcher_mt_is_jumbo(matcher->mt); - struct mlx5hws_cmd_stc_modify_attr stc_attr = {0}; - struct mlx5hws_matcher_action_ste *action_ste; - struct mlx5hws_table *tbl = matcher->tbl; - struct mlx5hws_pool_attr pool_attr = {0}; - struct mlx5hws_context *ctx = tbl->ctx; - u32 required_stes; - u8 max_stes = 0; + struct mlx5hws_context *ctx = matcher->tbl->ctx; + u8 required_stes, max_stes; int i, ret; if (matcher->flags & MLX5HWS_MATCHER_FLAGS_COLLISION) return 0; + max_stes = 0; for (i = 0; i < matcher->num_of_at; i++) 
{ struct mlx5hws_action_template *at = &matcher->at[i]; @@ -548,81 +652,40 @@ static int hws_matcher_bind_at(struct mlx5hws_matcher *matcher) /* Future: Optimize reparse */ } - /* There are no additional STEs required for matcher */ - if (!max_stes) - return 0; - - matcher->action_ste.max_stes = max_stes; - - action_ste = &matcher->action_ste; - - /* Allocate action STE mempool */ - pool_attr.table_type = tbl->type; - pool_attr.pool_type = MLX5HWS_POOL_TYPE_STE; - pool_attr.flags = MLX5HWS_POOL_FLAGS_FOR_STE_ACTION_POOL; - /* Pool size is similar to action RTC size */ - pool_attr.alloc_log_sz = ilog2(roundup_pow_of_two(action_ste->max_stes)) + - matcher->attr.table.sz_row_log + - MLX5HWS_MATCHER_ACTION_RTC_UPDATE_MULT; - hws_matcher_set_pool_attr(&pool_attr, matcher); - action_ste->pool = mlx5hws_pool_create(ctx, &pool_attr); - if (!action_ste->pool) { - mlx5hws_err(ctx, "Failed to create action ste pool\n"); - return -EINVAL; - } - - /* Allocate action RTC */ - ret = hws_matcher_create_rtc(matcher, HWS_MATCHER_RTC_TYPE_STE_ARRAY); - if (ret) { - mlx5hws_err(ctx, "Failed to create action RTC\n"); - goto free_ste_pool; - } - - /* Allocate STC for jumps to STE */ - stc_attr.action_offset = MLX5HWS_ACTION_OFFSET_HIT; - stc_attr.action_type = MLX5_IFC_STC_ACTION_TYPE_JUMP_TO_STE_TABLE; - stc_attr.reparse_mode = MLX5_IFC_STC_REPARSE_IGNORE; - stc_attr.ste_table.ste = action_ste->ste; - stc_attr.ste_table.ste_pool = action_ste->pool; - stc_attr.ste_table.match_definer_id = ctx->caps->trivial_match_definer; - - ret = mlx5hws_action_alloc_single_stc(ctx, &stc_attr, tbl->type, - &action_ste->stc); - if (ret) { - mlx5hws_err(ctx, "Failed to create action jump to table STC\n"); - goto free_rtc; - } + matcher->num_of_action_stes = max_stes; return 0; - -free_rtc: - hws_matcher_destroy_rtc(matcher, HWS_MATCHER_RTC_TYPE_STE_ARRAY); -free_ste_pool: - mlx5hws_pool_destroy(action_ste->pool); - return ret; } -static void hws_matcher_unbind_at(struct mlx5hws_matcher *matcher) +static 
void hws_matcher_set_ip_version_match(struct mlx5hws_matcher *matcher) { - struct mlx5hws_matcher_action_ste *action_ste; - struct mlx5hws_table *tbl = matcher->tbl; + int i; - action_ste = &matcher->action_ste; - - if (!action_ste->max_stes || - matcher->flags & MLX5HWS_MATCHER_FLAGS_COLLISION || - mlx5hws_matcher_is_in_resize(matcher)) - return; - - mlx5hws_action_free_single_stc(tbl->ctx, tbl->type, &action_ste->stc); - hws_matcher_destroy_rtc(matcher, HWS_MATCHER_RTC_TYPE_STE_ARRAY); - mlx5hws_pool_destroy(action_ste->pool); + for (i = 0; i < matcher->mt->fc_sz; i++) { + switch (matcher->mt->fc[i].fname) { + case MLX5HWS_DEFINER_FNAME_ETH_TYPE_O: + matcher->matches_outer_ethertype = 1; + break; + case MLX5HWS_DEFINER_FNAME_ETH_L3_TYPE_O: + matcher->matches_outer_ip_version = 1; + break; + case MLX5HWS_DEFINER_FNAME_ETH_TYPE_I: + matcher->matches_inner_ethertype = 1; + break; + case MLX5HWS_DEFINER_FNAME_ETH_L3_TYPE_I: + matcher->matches_inner_ip_version = 1; + break; + default: + break; + } + } } static int hws_matcher_bind_mt(struct mlx5hws_matcher *matcher) { + struct mlx5hws_cmd_ste_create_attr ste_attr = {}; struct mlx5hws_context *ctx = matcher->tbl->ctx; - struct mlx5hws_pool_attr pool_attr = {0}; + union mlx5hws_matcher_size *size; int ret; /* Calculate match, range and hash definers */ @@ -635,23 +698,41 @@ static int hws_matcher_bind_mt(struct mlx5hws_matcher *matcher) } } - /* Create an STE pool per matcher*/ - pool_attr.table_type = matcher->tbl->type; - pool_attr.pool_type = MLX5HWS_POOL_TYPE_STE; - pool_attr.flags = MLX5HWS_POOL_FLAGS_FOR_MATCHER_STE_POOL; - pool_attr.alloc_log_sz = matcher->attr.table.sz_col_log + - matcher->attr.table.sz_row_log; - hws_matcher_set_pool_attr(&pool_attr, matcher); + hws_matcher_set_ip_version_match(matcher); - matcher->match_ste.pool = mlx5hws_pool_create(ctx, &pool_attr); - if (!matcher->match_ste.pool) { - mlx5hws_err(ctx, "Failed to allocate matcher STE pool\n"); - ret = -EOPNOTSUPP; + /* Create an STE range 
each for RX and TX. */ + ste_attr.table_type = FS_FT_FDB_RX; + size = &matcher->attr.size[MLX5HWS_MATCHER_SIZE_TYPE_RX]; + ste_attr.log_obj_range = + matcher->attr.optimize_flow_src == + MLX5HWS_MATCHER_FLOW_SRC_VPORT ? + 0 : size->table.sz_col_log + size->table.sz_row_log; + + ret = mlx5hws_cmd_ste_create(ctx->mdev, &ste_attr, + &matcher->match_ste.ste_0_base); + if (ret) { + mlx5hws_err(ctx, "Failed to allocate RX STE range (%d)\n", ret); goto uninit_match_definer; } + ste_attr.table_type = FS_FT_FDB_TX; + size = &matcher->attr.size[MLX5HWS_MATCHER_SIZE_TYPE_TX]; + ste_attr.log_obj_range = + matcher->attr.optimize_flow_src == + MLX5HWS_MATCHER_FLOW_SRC_WIRE ? + 0 : size->table.sz_col_log + size->table.sz_row_log; + + ret = mlx5hws_cmd_ste_create(ctx->mdev, &ste_attr, + &matcher->match_ste.ste_1_base); + if (ret) { + mlx5hws_err(ctx, "Failed to allocate TX STE range (%d)\n", ret); + goto destroy_rx_ste_range; + } + return 0; +destroy_rx_ste_range: + mlx5hws_cmd_ste_destroy(ctx->mdev, matcher->match_ste.ste_0_base); uninit_match_definer: if (!(matcher->flags & MLX5HWS_MATCHER_FLAGS_COLLISION)) mlx5hws_definer_mt_uninit(ctx, matcher->mt); @@ -660,9 +741,12 @@ uninit_match_definer: static void hws_matcher_unbind_mt(struct mlx5hws_matcher *matcher) { - mlx5hws_pool_destroy(matcher->match_ste.pool); + struct mlx5hws_context *ctx = matcher->tbl->ctx; + + mlx5hws_cmd_ste_destroy(ctx->mdev, matcher->match_ste.ste_1_base); + mlx5hws_cmd_ste_destroy(ctx->mdev, matcher->match_ste.ste_0_base); if (!(matcher->flags & MLX5HWS_MATCHER_FLAGS_COLLISION)) - mlx5hws_definer_mt_uninit(matcher->tbl->ctx, matcher->mt); + mlx5hws_definer_mt_uninit(ctx, matcher->mt); } static int @@ -671,6 +755,10 @@ hws_matcher_validate_insert_mode(struct mlx5hws_cmd_query_caps *caps, { struct mlx5hws_matcher_attr *attr = &matcher->attr; struct mlx5hws_context *ctx = matcher->tbl->ctx; + union mlx5hws_matcher_size *size_rx, *size_tx; + + size_rx = &matcher->attr.size[MLX5HWS_MATCHER_SIZE_TYPE_RX]; + 
size_tx = &matcher->attr.size[MLX5HWS_MATCHER_SIZE_TYPE_TX]; switch (attr->insert_mode) { case MLX5HWS_MATCHER_INSERT_BY_HASH: @@ -681,7 +769,7 @@ hws_matcher_validate_insert_mode(struct mlx5hws_cmd_query_caps *caps, break; case MLX5HWS_MATCHER_INSERT_BY_INDEX: - if (attr->table.sz_col_log) { + if (size_rx->table.sz_col_log || size_tx->table.sz_col_log) { mlx5hws_err(ctx, "Matcher with INSERT_BY_INDEX supports only Nx1 table size\n"); return -EOPNOTSUPP; } @@ -701,7 +789,10 @@ hws_matcher_validate_insert_mode(struct mlx5hws_cmd_query_caps *caps, return -EOPNOTSUPP; } - if (attr->table.sz_row_log > MLX5_IFC_RTC_LINEAR_LOOKUP_TBL_LOG_MAX) { + if (size_rx->table.sz_row_log > + MLX5_IFC_RTC_LINEAR_LOOKUP_TBL_LOG_MAX || + size_tx->table.sz_row_log > + MLX5_IFC_RTC_LINEAR_LOOKUP_TBL_LOG_MAX) { mlx5hws_err(ctx, "Matcher with linear distribute: rows exceed limit %d", MLX5_IFC_RTC_LINEAR_LOOKUP_TBL_LOG_MAX); return -EOPNOTSUPP; @@ -725,6 +816,10 @@ hws_matcher_process_attr(struct mlx5hws_cmd_query_caps *caps, struct mlx5hws_matcher *matcher) { struct mlx5hws_matcher_attr *attr = &matcher->attr; + union mlx5hws_matcher_size *size_rx, *size_tx; + + size_rx = &attr->size[MLX5HWS_MATCHER_SIZE_TYPE_RX]; + size_tx = &attr->size[MLX5HWS_MATCHER_SIZE_TYPE_TX]; if (hws_matcher_validate_insert_mode(caps, matcher)) return -EOPNOTSUPP; @@ -736,10 +831,16 @@ hws_matcher_process_attr(struct mlx5hws_cmd_query_caps *caps, /* Convert number of rules to the required depth */ if (attr->mode == MLX5HWS_MATCHER_RESOURCE_MODE_RULE && - attr->insert_mode == MLX5HWS_MATCHER_INSERT_BY_HASH) - attr->table.sz_col_log = hws_matcher_rules_to_tbl_depth(attr->rule.num_log); + attr->insert_mode == MLX5HWS_MATCHER_INSERT_BY_HASH) { + size_rx->table.sz_col_log = + hws_matcher_rules_to_tbl_depth(size_rx->rule.num_log); + size_tx->table.sz_col_log = + hws_matcher_rules_to_tbl_depth(size_tx->rule.num_log); + } matcher->flags |= attr->resizable ? 
MLX5HWS_MATCHER_FLAGS_RESIZABLE : 0; + matcher->flags |= attr->isolated_matcher_end_ft_id ? + MLX5HWS_MATCHER_FLAGS_ISOLATED : 0; return hws_matcher_check_attr_sz(caps, matcher); } @@ -761,10 +862,10 @@ static int hws_matcher_create_and_connect(struct mlx5hws_matcher *matcher) /* Create matcher end flow table anchor */ ret = hws_matcher_create_end_ft(matcher); if (ret) - goto unbind_at; + goto unbind_mt; /* Allocate the RTC for the new matcher */ - ret = hws_matcher_create_rtc(matcher, HWS_MATCHER_RTC_TYPE_MATCH); + ret = hws_matcher_create_rtc(matcher); if (ret) goto destroy_end_ft; @@ -776,11 +877,9 @@ static int hws_matcher_create_and_connect(struct mlx5hws_matcher *matcher) return 0; destroy_rtc: - hws_matcher_destroy_rtc(matcher, HWS_MATCHER_RTC_TYPE_MATCH); + hws_matcher_destroy_rtc(matcher); destroy_end_ft: hws_matcher_destroy_end_ft(matcher); -unbind_at: - hws_matcher_unbind_at(matcher); unbind_mt: hws_matcher_unbind_mt(matcher); return ret; @@ -788,11 +887,9 @@ unbind_mt: static void hws_matcher_destroy_and_disconnect(struct mlx5hws_matcher *matcher) { - hws_matcher_resize_uninit(matcher); hws_matcher_disconnect(matcher); - hws_matcher_destroy_rtc(matcher, HWS_MATCHER_RTC_TYPE_MATCH); + hws_matcher_destroy_rtc(matcher); hws_matcher_destroy_end_ft(matcher); - hws_matcher_unbind_at(matcher); hws_matcher_unbind_mt(matcher); } @@ -800,22 +897,25 @@ static int hws_matcher_create_col_matcher(struct mlx5hws_matcher *matcher) { struct mlx5hws_context *ctx = matcher->tbl->ctx; + union mlx5hws_matcher_size *size_rx, *size_tx; struct mlx5hws_matcher *col_matcher; - int ret; + int i, ret; + + size_rx = &matcher->attr.size[MLX5HWS_MATCHER_SIZE_TYPE_RX]; + size_tx = &matcher->attr.size[MLX5HWS_MATCHER_SIZE_TYPE_TX]; if (matcher->attr.mode != MLX5HWS_MATCHER_RESOURCE_MODE_RULE || matcher->attr.insert_mode == MLX5HWS_MATCHER_INSERT_BY_INDEX) return 0; - if (!hws_matcher_requires_col_tbl(matcher->attr.rule.num_log)) + if 
(!hws_matcher_requires_col_tbl(size_rx->rule.num_log) && + !hws_matcher_requires_col_tbl(size_tx->rule.num_log)) return 0; col_matcher = kzalloc(sizeof(*matcher), GFP_KERNEL); if (!col_matcher) return -ENOMEM; - INIT_LIST_HEAD(&col_matcher->resize_data); - col_matcher->tbl = matcher->tbl; col_matcher->mt = matcher->mt; col_matcher->at = matcher->at; @@ -826,12 +926,20 @@ hws_matcher_create_col_matcher(struct mlx5hws_matcher *matcher) col_matcher->flags |= MLX5HWS_MATCHER_FLAGS_COLLISION; col_matcher->attr.mode = MLX5HWS_MATCHER_RESOURCE_MODE_HTABLE; col_matcher->attr.optimize_flow_src = matcher->attr.optimize_flow_src; - col_matcher->attr.table.sz_row_log = matcher->attr.rule.num_log; - col_matcher->attr.table.sz_col_log = MLX5HWS_MATCHER_ASSURED_COL_TBL_DEPTH; - if (col_matcher->attr.table.sz_row_log > MLX5HWS_MATCHER_ASSURED_ROW_RATIO) - col_matcher->attr.table.sz_row_log -= MLX5HWS_MATCHER_ASSURED_ROW_RATIO; + for (i = 0; i < 2; i++) { + union mlx5hws_matcher_size *dst = &col_matcher->attr.size[i]; + union mlx5hws_matcher_size *src = &matcher->attr.size[i]; + + dst->table.sz_row_log = src->rule.num_log; + dst->table.sz_col_log = MLX5HWS_MATCHER_ASSURED_COL_TBL_DEPTH; + if (dst->table.sz_row_log > MLX5HWS_MATCHER_ASSURED_ROW_RATIO) + dst->table.sz_row_log -= + MLX5HWS_MATCHER_ASSURED_ROW_RATIO; + } col_matcher->attr.max_num_of_at_attach = matcher->attr.max_num_of_at_attach; + col_matcher->attr.isolated_matcher_end_ft_id = + matcher->attr.isolated_matcher_end_ft_id; ret = hws_matcher_process_attr(ctx->caps, col_matcher); if (ret) @@ -869,8 +977,6 @@ static int hws_matcher_init(struct mlx5hws_matcher *matcher) struct mlx5hws_context *ctx = matcher->tbl->ctx; int ret; - INIT_LIST_HEAD(&matcher->resize_data); - mutex_lock(&ctx->ctrl_lock); /* Allocate matcher resource and connect to the packet pipe */ @@ -905,18 +1011,44 @@ static int hws_matcher_uninit(struct mlx5hws_matcher *matcher) return 0; } +static int hws_matcher_grow_at_array(struct mlx5hws_matcher *matcher) 
+{ + void *p; + + if (matcher->size_of_at_array >= MLX5HWS_MATCHER_MAX_AT) + return -ENOMEM; + + matcher->size_of_at_array *= 2; + p = krealloc(matcher->at, + matcher->size_of_at_array * sizeof(*matcher->at), + __GFP_ZERO | GFP_KERNEL); + if (!p) { + matcher->size_of_at_array /= 2; + return -ENOMEM; + } + + matcher->at = p; + + return 0; +} + int mlx5hws_matcher_attach_at(struct mlx5hws_matcher *matcher, struct mlx5hws_action_template *at) { bool is_jumbo = mlx5hws_matcher_mt_is_jumbo(matcher->mt); - struct mlx5hws_context *ctx = matcher->tbl->ctx; u32 required_stes; int ret; - if (!matcher->attr.max_num_of_at_attach) { - mlx5hws_dbg(ctx, "Num of current at (%d) exceed allowed value\n", - matcher->num_of_at); - return -EOPNOTSUPP; + if (unlikely(matcher->num_of_at >= matcher->size_of_at_array)) { + ret = hws_matcher_grow_at_array(matcher); + if (ret) + return ret; + + if (matcher->col_matcher) { + ret = hws_matcher_grow_at_array(matcher->col_matcher); + if (ret) + return ret; + } } ret = hws_matcher_check_and_process_at(matcher, at); @@ -924,15 +1056,11 @@ int mlx5hws_matcher_attach_at(struct mlx5hws_matcher *matcher, return ret; required_stes = at->num_of_action_stes - (!is_jumbo || at->only_term); - if (matcher->action_ste.max_stes < required_stes) { - mlx5hws_dbg(ctx, "Required STEs [%d] exceeds initial action template STE [%d]\n", - required_stes, matcher->action_ste.max_stes); - return -ENOMEM; - } + if (matcher->num_of_action_stes < required_stes) + matcher->num_of_action_stes = required_stes; matcher->at[matcher->num_of_at] = *at; matcher->num_of_at += 1; - matcher->attr.max_num_of_at_attach -= 1; if (matcher->col_matcher) matcher->col_matcher->num_of_at = matcher->num_of_at; @@ -960,8 +1088,9 @@ hws_matcher_set_templates(struct mlx5hws_matcher *matcher, if (!matcher->mt) return -ENOMEM; - matcher->at = kvcalloc(num_of_at + matcher->attr.max_num_of_at_attach, - sizeof(*matcher->at), + matcher->size_of_at_array = + num_of_at + 
matcher->attr.max_num_of_at_attach; + matcher->at = kvcalloc(matcher->size_of_at_array, sizeof(*matcher->at), GFP_KERNEL); if (!matcher->at) { mlx5hws_err(ctx, "Failed to allocate action template array\n"); @@ -1110,7 +1239,7 @@ static int hws_matcher_resize_precheck(struct mlx5hws_matcher *src_matcher, return -EINVAL; } - if (src_matcher->action_ste.max_stes > dst_matcher->action_ste.max_stes) { + if (src_matcher->num_of_action_stes > dst_matcher->num_of_action_stes) { mlx5hws_err(ctx, "Src/dst matcher max STEs mismatch\n"); return -EINVAL; } @@ -1139,10 +1268,6 @@ int mlx5hws_matcher_resize_set_target(struct mlx5hws_matcher *src_matcher, src_matcher->resize_dst = dst_matcher; - ret = hws_matcher_resize_init(src_matcher); - if (ret) - src_matcher->resize_dst = NULL; - out: mutex_unlock(&src_matcher->tbl->ctx->ctrl_lock); return ret; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h index 020de70270..ae20bcebfd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h @@ -23,6 +23,9 @@ */ #define MLX5HWS_MATCHER_ACTION_RTC_UPDATE_MULT 1 +/* Maximum number of action templates that can be attached to a matcher. 
*/ +#define MLX5HWS_MATCHER_MAX_AT 128 + enum mlx5hws_matcher_offset { MLX5HWS_MATCHER_OFFSET_TAG_DW1 = 12, MLX5HWS_MATCHER_OFFSET_TAG_DW0 = 13, @@ -31,6 +34,7 @@ enum mlx5hws_matcher_offset { enum mlx5hws_matcher_flags { MLX5HWS_MATCHER_FLAGS_COLLISION = 1 << 2, MLX5HWS_MATCHER_FLAGS_RESIZABLE = 1 << 3, + MLX5HWS_MATCHER_FLAGS_ISOLATED = 1 << 4, }; struct mlx5hws_match_template { @@ -42,28 +46,16 @@ struct mlx5hws_match_template { }; struct mlx5hws_matcher_match_ste { - struct mlx5hws_pool_chunk ste; u32 rtc_0_id; u32 rtc_1_id; - struct mlx5hws_pool *pool; + u32 ste_0_base; + u32 ste_1_base; }; -struct mlx5hws_matcher_action_ste { - struct mlx5hws_pool_chunk ste; - struct mlx5hws_pool_chunk stc; - u32 rtc_0_id; - u32 rtc_1_id; - struct mlx5hws_pool *pool; - u8 max_stes; -}; - -struct mlx5hws_matcher_resize_data { - struct mlx5hws_pool_chunk stc; - u32 rtc_0_id; - u32 rtc_1_id; - struct mlx5hws_pool *pool; - u8 max_stes; - struct list_head list_node; +enum { + MLX5HWS_MATCHER_IPV_UNSET = 0, + MLX5HWS_MATCHER_IPV_4 = 1, + MLX5HWS_MATCHER_IPV_6 = 2, }; struct mlx5hws_matcher { @@ -72,16 +64,22 @@ struct mlx5hws_matcher { struct mlx5hws_match_template *mt; struct mlx5hws_action_template *at; u8 num_of_at; + u8 size_of_at_array; u8 num_of_mt; + u8 num_of_action_stes; /* enum mlx5hws_matcher_flags */ u8 flags; + u8 matches_outer_ethertype:1; + u8 matches_outer_ip_version:1; + u8 matches_inner_ethertype:1; + u8 matches_inner_ip_version:1; + u8 outer_ip_version:2; + u8 inner_ip_version:2; u32 end_ft_id; struct mlx5hws_matcher *col_matcher; struct mlx5hws_matcher *resize_dst; struct mlx5hws_matcher_match_ste match_ste; - struct mlx5hws_matcher_action_ste action_ste; struct list_head list_node; - struct list_head resize_data; }; static inline bool @@ -100,9 +98,17 @@ static inline bool mlx5hws_matcher_is_in_resize(struct mlx5hws_matcher *matcher) return !!matcher->resize_dst; } +static inline bool mlx5hws_matcher_is_isolated(struct mlx5hws_matcher *matcher) +{ + return 
!!(matcher->flags & MLX5HWS_MATCHER_FLAGS_ISOLATED); +} + static inline bool mlx5hws_matcher_is_insert_by_idx(struct mlx5hws_matcher *matcher) { return matcher->attr.insert_mode == MLX5HWS_MATCHER_INSERT_BY_INDEX; } +int mlx5hws_matcher_update_end_ft_isolated(struct mlx5hws_table *tbl, + u32 miss_ft_id); + #endif /* HWS_MATCHER_H_ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h index 5121951f27..1ad7a50d93 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h @@ -75,6 +75,7 @@ struct mlx5hws_context_attr { struct mlx5hws_table_attr { enum mlx5hws_table_type type; u32 level; + u16 uid; }; enum mlx5hws_matcher_flow_src { @@ -93,6 +94,23 @@ enum mlx5hws_matcher_distribute_mode { MLX5HWS_MATCHER_DISTRIBUTE_BY_LINEAR = 0x1, }; +enum mlx5hws_matcher_size_type { + MLX5HWS_MATCHER_SIZE_TYPE_RX, + MLX5HWS_MATCHER_SIZE_TYPE_TX, + MLX5HWS_MATCHER_SIZE_TYPE_MAX, +}; + +union mlx5hws_matcher_size { + struct { + u8 sz_row_log; + u8 sz_col_log; + } table; + + struct { + u8 num_log; + } rule; +}; + struct mlx5hws_matcher_attr { /* Processing priority inside table */ u32 priority; @@ -107,18 +125,11 @@ struct mlx5hws_matcher_attr { enum mlx5hws_matcher_distribute_mode distribute_mode; /* Define whether the created matcher supports resizing into a bigger matcher */ bool resizable; - union { - struct { - u8 sz_row_log; - u8 sz_col_log; - } table; - - struct { - u8 num_log; - } rule; - }; + union mlx5hws_matcher_size size[MLX5HWS_MATCHER_SIZE_TYPE_MAX]; /* Optional AT attach configuration - Max number of additional AT */ u8 max_num_of_at_attach; + /* Optional end FT (miss FT ID) for match RTC (for isolated matcher) */ + u32 isolated_matcher_end_ft_id; }; struct mlx5hws_rule_attr { @@ -211,6 +222,7 @@ struct mlx5hws_action_dest_attr { struct mlx5hws_action *dest; /* Optional reformat action */ struct 
mlx5hws_action *reformat; + bool is_wire_ft; }; /** @@ -501,6 +513,15 @@ int mlx5hws_rule_action_update(struct mlx5hws_rule *rule, enum mlx5hws_action_type mlx5hws_action_get_type(struct mlx5hws_action *action); +/** + * mlx5hws_action_get_dev - Get mlx5 core device. + * + * @action: The action to get the device from. + * + * Return: mlx5 core device. + */ +struct mlx5_core_dev *mlx5hws_action_get_dev(struct mlx5hws_action *action); + /** * mlx5hws_action_create_dest_drop - Create a direct rule drop action. * @@ -714,18 +735,13 @@ mlx5hws_action_create_push_vlan(struct mlx5hws_context *ctx, u32 flags); * @num_dest: The number of dests attributes. * @dests: The destination array. Each contains a destination action and can * have additional actions. - * @ignore_flow_level: Whether to turn on 'ignore_flow_level' for this dest. - * @flow_source: Source port of the traffic for this actions. * @flags: Action creation flags (enum mlx5hws_action_flags). * * Return: pointer to mlx5hws_action on success NULL otherwise. 
*/ struct mlx5hws_action * -mlx5hws_action_create_dest_array(struct mlx5hws_context *ctx, - size_t num_dest, +mlx5hws_action_create_dest_array(struct mlx5hws_context *ctx, size_t num_dest, struct mlx5hws_action_dest_attr *dests, - bool ignore_flow_level, - u32 flow_source, u32 flags); /** diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.c index f51ed24526..d56271a9e4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.c @@ -279,7 +279,7 @@ int mlx5hws_pat_get_pattern(struct mlx5hws_context *ctx, return ret; clean_pattern: - mlx5hws_cmd_header_modify_pattern_destroy(ctx->mdev, *pattern_id); + mlx5hws_cmd_header_modify_pattern_destroy(ctx->mdev, ptrn_id); out_unlock: mutex_unlock(&ctx->pattern_cache->lock); return ret; @@ -490,8 +490,8 @@ hws_action_modify_get_target_fields(u8 action_type, __be64 *pattern, switch (action_type) { case MLX5_ACTION_TYPE_SET: case MLX5_ACTION_TYPE_ADD: - *src_field = MLX5_GET(set_action_in, pattern, field); - *dst_field = INVALID_FIELD; + *src_field = INVALID_FIELD; + *dst_field = MLX5_GET(set_action_in, pattern, field); break; case MLX5_ACTION_TYPE_COPY: *src_field = MLX5_GET(copy_action_in, pattern, src_field); @@ -522,57 +522,61 @@ bool mlx5hws_pat_verify_actions(struct mlx5hws_context *ctx, __be64 pattern[], s return true; } -void mlx5hws_pat_calc_nope(__be64 *pattern, size_t num_actions, - size_t max_actions, size_t *new_size, - u32 *nope_location, __be64 *new_pat) +int mlx5hws_pat_calc_nop(__be64 *pattern, size_t num_actions, + size_t max_actions, size_t *new_size, + u32 *nop_locations, __be64 *new_pat) { - u16 prev_src_field = 0, prev_dst_field = 0; - u16 src_field, dst_field; + u16 prev_src_field = INVALID_FIELD, prev_dst_field = INVALID_FIELD; u8 action_type; + bool dependent; size_t i, j; *new_size = num_actions; - *nope_location = 0; + 
*nop_locations = 0; if (num_actions == 1) - return; + return 0; for (i = 0, j = 0; i < num_actions; i++, j++) { - action_type = MLX5_GET(set_action_in, &pattern[i], action_type); + u16 src_field = INVALID_FIELD; + u16 dst_field = INVALID_FIELD; + if (j >= max_actions) + return -EINVAL; + + action_type = MLX5_GET(set_action_in, &pattern[i], action_type); hws_action_modify_get_target_fields(action_type, &pattern[i], &src_field, &dst_field); - if (i % 2) { - if (action_type == MLX5_ACTION_TYPE_COPY && - (prev_src_field == src_field || - prev_dst_field == dst_field)) { - /* need Nope */ - *new_size += 1; - *nope_location |= BIT(i); - memset(&new_pat[j], 0, MLX5HWS_MODIFY_ACTION_SIZE); - MLX5_SET(set_action_in, &new_pat[j], - action_type, - MLX5_MODIFICATION_TYPE_NOP); - j++; - } else if (prev_src_field == src_field) { - /* need Nope*/ - *new_size += 1; - *nope_location |= BIT(i); - MLX5_SET(set_action_in, &new_pat[j], - action_type, - MLX5_MODIFICATION_TYPE_NOP); - j++; - } - } - memcpy(&new_pat[j], &pattern[i], MLX5HWS_MODIFY_ACTION_SIZE); - /* check if no more space */ - if (j > max_actions) { - *new_size = num_actions; - *nope_location = 0; - return; + + /* For every action, look at it and the previous one. 
The two + * actions are dependent if: + */ + dependent = + (i > 0) && + /* At least one of the actions is a write and */ + (dst_field != INVALID_FIELD || + prev_dst_field != INVALID_FIELD) && + /* One reads from the other's source */ + (dst_field == prev_src_field || + src_field == prev_dst_field || + /* Or both write to the same destination */ + dst_field == prev_dst_field); + + if (dependent) { + *new_size += 1; + *nop_locations |= BIT(i); + memset(&new_pat[j], 0, MLX5HWS_MODIFY_ACTION_SIZE); + MLX5_SET(set_action_in, &new_pat[j], action_type, + MLX5_MODIFICATION_TYPE_NOP); + j++; + if (j >= max_actions) + return -EINVAL; } + memcpy(&new_pat[j], &pattern[i], MLX5HWS_MODIFY_ACTION_SIZE); prev_src_field = src_field; prev_dst_field = dst_field; } + + return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.h index 8ddb519800..7fbd8dc7aa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.h @@ -96,6 +96,7 @@ int mlx5hws_arg_write_inline_arg_data(struct mlx5hws_context *ctx, u8 *arg_data, size_t data_size); -void mlx5hws_pat_calc_nope(__be64 *pattern, size_t num_actions, size_t max_actions, - size_t *new_size, u32 *nope_location, __be64 *new_pat); +int mlx5hws_pat_calc_nop(__be64 *pattern, size_t num_actions, + size_t max_actions, size_t *new_size, + u32 *nop_locations, __be64 *new_pat); #endif /* MLX5HWS_PAT_ARG_H_ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c index 50a81d360b..7b5071c3df 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c @@ -20,15 +20,14 @@ static void hws_pool_free_one_resource(struct mlx5hws_pool_resource *resource) kfree(resource); } -static void hws_pool_resource_free(struct mlx5hws_pool 
*pool, - int resource_idx) +static void hws_pool_resource_free(struct mlx5hws_pool *pool) { - hws_pool_free_one_resource(pool->resource[resource_idx]); - pool->resource[resource_idx] = NULL; + hws_pool_free_one_resource(pool->resource); + pool->resource = NULL; if (pool->tbl_type == MLX5HWS_TABLE_TYPE_FDB) { - hws_pool_free_one_resource(pool->mirror_resource[resource_idx]); - pool->mirror_resource[resource_idx] = NULL; + hws_pool_free_one_resource(pool->mirror_resource); + pool->mirror_resource = NULL; } } @@ -61,10 +60,8 @@ hws_pool_create_one_resource(struct mlx5hws_pool *pool, u32 log_range, ret = -EINVAL; } - if (ret) { - mlx5hws_err(pool->ctx, "Failed to allocate resource objects\n"); + if (ret) goto free_resource; - } resource->pool = pool; resource->range = 1 << log_range; @@ -77,51 +74,80 @@ free_resource: return NULL; } -static int -hws_pool_resource_alloc(struct mlx5hws_pool *pool, u32 log_range, int idx) +static int hws_pool_resource_alloc(struct mlx5hws_pool *pool) { struct mlx5hws_pool_resource *resource; u32 fw_ft_type, opt_log_range; fw_ft_type = mlx5hws_table_get_res_fw_ft_type(pool->tbl_type, false); - opt_log_range = pool->opt_type == MLX5HWS_POOL_OPTIMIZE_ORIG ? 0 : log_range; + opt_log_range = pool->opt_type == MLX5HWS_POOL_OPTIMIZE_MIRROR ? + 0 : pool->alloc_log_sz; resource = hws_pool_create_one_resource(pool, opt_log_range, fw_ft_type); if (!resource) { - mlx5hws_err(pool->ctx, "Failed allocating resource\n"); + mlx5hws_err(pool->ctx, "Failed to allocate resource\n"); return -EINVAL; } - pool->resource[idx] = resource; + pool->resource = resource; if (pool->tbl_type == MLX5HWS_TABLE_TYPE_FDB) { struct mlx5hws_pool_resource *mirror_resource; fw_ft_type = mlx5hws_table_get_res_fw_ft_type(pool->tbl_type, true); - opt_log_range = pool->opt_type == MLX5HWS_POOL_OPTIMIZE_MIRROR ? 0 : log_range; + opt_log_range = pool->opt_type == MLX5HWS_POOL_OPTIMIZE_ORIG ? 
+ 0 : pool->alloc_log_sz; mirror_resource = hws_pool_create_one_resource(pool, opt_log_range, fw_ft_type); if (!mirror_resource) { - mlx5hws_err(pool->ctx, "Failed allocating mirrored resource\n"); + mlx5hws_err(pool->ctx, "Failed to allocate mirrored resource\n"); hws_pool_free_one_resource(resource); - pool->resource[idx] = NULL; + pool->resource = NULL; return -EINVAL; } - pool->mirror_resource[idx] = mirror_resource; + pool->mirror_resource = mirror_resource; } return 0; } -static unsigned long *hws_pool_create_and_init_bitmap(u32 log_range) +static int hws_pool_buddy_init(struct mlx5hws_pool *pool) { - unsigned long *cur_bmp; + struct mlx5hws_buddy_mem *buddy; - cur_bmp = bitmap_zalloc(1 << log_range, GFP_KERNEL); - if (!cur_bmp) - return NULL; + buddy = mlx5hws_buddy_create(pool->alloc_log_sz); + if (!buddy) { + mlx5hws_err(pool->ctx, "Failed to create buddy order: %zu\n", + pool->alloc_log_sz); + return -ENOMEM; + } - bitmap_fill(cur_bmp, 1 << log_range); + if (hws_pool_resource_alloc(pool) != 0) { + mlx5hws_err(pool->ctx, "Failed to create resource type: %d size %zu\n", + pool->type, pool->alloc_log_sz); + mlx5hws_buddy_cleanup(buddy); + kfree(buddy); + return -ENOMEM; + } - return cur_bmp; + pool->db.buddy = buddy; + + return 0; +} + +static int hws_pool_buddy_db_get_chunk(struct mlx5hws_pool *pool, + struct mlx5hws_pool_chunk *chunk) +{ + struct mlx5hws_buddy_mem *buddy = pool->db.buddy; + + if (!buddy) { + mlx5hws_err(pool->ctx, "Bad buddy state\n"); + return -EINVAL; + } + + chunk->offset = mlx5hws_buddy_alloc_mem(buddy, chunk->order); + if (chunk->offset >= 0) + return 0; + + return -ENOMEM; } static void hws_pool_buddy_db_put_chunk(struct mlx5hws_pool *pool, @@ -129,147 +155,34 @@ static void hws_pool_buddy_db_put_chunk(struct mlx5hws_pool *pool, { struct mlx5hws_buddy_mem *buddy; - buddy = pool->db.buddy_manager->buddies[chunk->resource_idx]; + buddy = pool->db.buddy; if (!buddy) { - mlx5hws_err(pool->ctx, "No such buddy (%d)\n", 
chunk->resource_idx); + mlx5hws_err(pool->ctx, "Bad buddy state\n"); return; } mlx5hws_buddy_free_mem(buddy, chunk->offset, chunk->order); } -static struct mlx5hws_buddy_mem * -hws_pool_buddy_get_next_buddy(struct mlx5hws_pool *pool, int idx, - u32 order, bool *is_new_buddy) -{ - static struct mlx5hws_buddy_mem *buddy; - u32 new_buddy_size; - - buddy = pool->db.buddy_manager->buddies[idx]; - if (buddy) - return buddy; - - new_buddy_size = max(pool->alloc_log_sz, order); - *is_new_buddy = true; - buddy = mlx5hws_buddy_create(new_buddy_size); - if (!buddy) { - mlx5hws_err(pool->ctx, "Failed to create buddy order: %d index: %d\n", - new_buddy_size, idx); - return NULL; - } - - if (hws_pool_resource_alloc(pool, new_buddy_size, idx) != 0) { - mlx5hws_err(pool->ctx, "Failed to create resource type: %d: size %d index: %d\n", - pool->type, new_buddy_size, idx); - mlx5hws_buddy_cleanup(buddy); - return NULL; - } - - pool->db.buddy_manager->buddies[idx] = buddy; - - return buddy; -} - -static int hws_pool_buddy_get_mem_chunk(struct mlx5hws_pool *pool, - int order, - u32 *buddy_idx, - int *seg) -{ - struct mlx5hws_buddy_mem *buddy; - bool new_mem = false; - int ret = 0; - int i; - - *seg = -1; - - /* Find the next free place from the buddy array */ - while (*seg < 0) { - for (i = 0; i < MLX5HWS_POOL_RESOURCE_ARR_SZ; i++) { - buddy = hws_pool_buddy_get_next_buddy(pool, i, - order, - &new_mem); - if (!buddy) { - ret = -ENOMEM; - goto out; - } - - *seg = mlx5hws_buddy_alloc_mem(buddy, order); - if (*seg >= 0) - goto found; - - if (pool->flags & MLX5HWS_POOL_FLAGS_ONE_RESOURCE) { - mlx5hws_err(pool->ctx, - "Fail to allocate seg for one resource pool\n"); - ret = -ENOMEM; - goto out; - } - - if (new_mem) { - /* We have new memory pool, should be place for us */ - mlx5hws_err(pool->ctx, - "No memory for order: %d with buddy no: %d\n", - order, i); - ret = -ENOMEM; - goto out; - } - } - } - -found: - *buddy_idx = i; -out: - return ret; -} - -static int 
hws_pool_buddy_db_get_chunk(struct mlx5hws_pool *pool, - struct mlx5hws_pool_chunk *chunk) -{ - int ret = 0; - - /* Go over the buddies and find next free slot */ - ret = hws_pool_buddy_get_mem_chunk(pool, chunk->order, - &chunk->resource_idx, - &chunk->offset); - if (ret) - mlx5hws_err(pool->ctx, "Failed to get free slot for chunk with order: %d\n", - chunk->order); - - return ret; -} - static void hws_pool_buddy_db_uninit(struct mlx5hws_pool *pool) { struct mlx5hws_buddy_mem *buddy; - int i; - for (i = 0; i < MLX5HWS_POOL_RESOURCE_ARR_SZ; i++) { - buddy = pool->db.buddy_manager->buddies[i]; - if (buddy) { - mlx5hws_buddy_cleanup(buddy); - kfree(buddy); - pool->db.buddy_manager->buddies[i] = NULL; - } + buddy = pool->db.buddy; + if (buddy) { + mlx5hws_buddy_cleanup(buddy); + kfree(buddy); + pool->db.buddy = NULL; } - - kfree(pool->db.buddy_manager); } -static int hws_pool_buddy_db_init(struct mlx5hws_pool *pool, u32 log_range) +static int hws_pool_buddy_db_init(struct mlx5hws_pool *pool) { - pool->db.buddy_manager = kzalloc(sizeof(*pool->db.buddy_manager), GFP_KERNEL); - if (!pool->db.buddy_manager) - return -ENOMEM; + int ret; - if (pool->flags & MLX5HWS_POOL_FLAGS_ALLOC_MEM_ON_CREATE) { - bool new_buddy; - - if (!hws_pool_buddy_get_next_buddy(pool, 0, log_range, &new_buddy)) { - mlx5hws_err(pool->ctx, - "Failed allocating memory on create log_sz: %d\n", log_range); - kfree(pool->db.buddy_manager); - return -ENOMEM; - } - } + ret = hws_pool_buddy_init(pool); + if (ret) + return ret; pool->p_db_uninit = &hws_pool_buddy_db_uninit; pool->p_get_chunk = &hws_pool_buddy_db_get_chunk; @@ -278,261 +191,105 @@ static int hws_pool_buddy_db_init(struct mlx5hws_pool *pool, u32 log_range) return 0; } -static int hws_pool_create_resource_on_index(struct mlx5hws_pool *pool, - u32 alloc_size, int idx) +static unsigned long *hws_pool_create_and_init_bitmap(u32 log_range) { - int ret = hws_pool_resource_alloc(pool, alloc_size, idx); + unsigned long *bitmap; - if (ret) { - 
mlx5hws_err(pool->ctx, "Failed to create resource type: %d: size %d index: %d\n", - pool->type, alloc_size, idx); - return ret; - } - - return 0; -} - -static struct mlx5hws_pool_elements * -hws_pool_element_create_new_elem(struct mlx5hws_pool *pool, u32 order, int idx) -{ - struct mlx5hws_pool_elements *elem; - u32 alloc_size; - - alloc_size = pool->alloc_log_sz; - - elem = kzalloc(sizeof(*elem), GFP_KERNEL); - if (!elem) + bitmap = bitmap_zalloc(1 << log_range, GFP_KERNEL); + if (!bitmap) return NULL; - /* Sharing the same resource, also means that all the elements are with size 1 */ - if ((pool->flags & MLX5HWS_POOL_FLAGS_FIXED_SIZE_OBJECTS) && - !(pool->flags & MLX5HWS_POOL_FLAGS_RESOURCE_PER_CHUNK)) { - /* Currently all chunks in size 1 */ - elem->bitmap = hws_pool_create_and_init_bitmap(alloc_size - order); - if (!elem->bitmap) { - mlx5hws_err(pool->ctx, - "Failed to create bitmap type: %d: size %d index: %d\n", - pool->type, alloc_size, idx); - goto free_elem; - } + bitmap_fill(bitmap, 1 << log_range); - elem->log_size = alloc_size - order; - } - - if (hws_pool_create_resource_on_index(pool, alloc_size, idx)) { - mlx5hws_err(pool->ctx, "Failed to create resource type: %d: size %d index: %d\n", - pool->type, alloc_size, idx); - goto free_db; - } - - pool->db.element_manager->elements[idx] = elem; - - return elem; - -free_db: - bitmap_free(elem->bitmap); -free_elem: - kfree(elem); - return NULL; + return bitmap; } -static int hws_pool_element_find_seg(struct mlx5hws_pool_elements *elem, int *seg) +static int hws_pool_bitmap_init(struct mlx5hws_pool *pool) { - unsigned int segment, size; + unsigned long *bitmap; - size = 1 << elem->log_size; - - segment = find_first_bit(elem->bitmap, size); - if (segment >= size) { - elem->is_full = true; + bitmap = hws_pool_create_and_init_bitmap(pool->alloc_log_sz); + if (!bitmap) { + mlx5hws_err(pool->ctx, "Failed to create bitmap order: %zu\n", + pool->alloc_log_sz); return -ENOMEM; } - bitmap_clear(elem->bitmap, segment, 
1); - *seg = segment; - return 0; -} - -static int -hws_pool_onesize_element_get_mem_chunk(struct mlx5hws_pool *pool, u32 order, - u32 *idx, int *seg) -{ - struct mlx5hws_pool_elements *elem; - - elem = pool->db.element_manager->elements[0]; - if (!elem) - elem = hws_pool_element_create_new_elem(pool, order, 0); - if (!elem) - goto err_no_elem; - - if (hws_pool_element_find_seg(elem, seg) != 0) { - mlx5hws_err(pool->ctx, "No more resources (last request order: %d)\n", order); + if (hws_pool_resource_alloc(pool) != 0) { + mlx5hws_err(pool->ctx, "Failed to create resource type: %d: size %zu\n", + pool->type, pool->alloc_log_sz); + bitmap_free(bitmap); return -ENOMEM; } - *idx = 0; - elem->num_of_elements++; - return 0; + pool->db.bitmap = bitmap; -err_no_elem: - mlx5hws_err(pool->ctx, "Failed to allocate element for order: %d\n", order); - return -ENOMEM; + return 0; } -static int -hws_pool_general_element_get_mem_chunk(struct mlx5hws_pool *pool, u32 order, - u32 *idx, int *seg) +static int hws_pool_bitmap_db_get_chunk(struct mlx5hws_pool *pool, + struct mlx5hws_pool_chunk *chunk) { - int ret, i; + unsigned long *bitmap, size; - for (i = 0; i < MLX5HWS_POOL_RESOURCE_ARR_SZ; i++) { - if (!pool->resource[i]) { - ret = hws_pool_create_resource_on_index(pool, order, i); - if (ret) - goto err_no_res; - *idx = i; - *seg = 0; /* One memory slot in that element */ - return 0; - } + if (chunk->order != 0) { + mlx5hws_err(pool->ctx, "Pool only supports order 0 allocs\n"); + return -EINVAL; } - mlx5hws_err(pool->ctx, "No more resources (last request order: %d)\n", order); - return -ENOMEM; + bitmap = pool->db.bitmap; + if (!bitmap) { + mlx5hws_err(pool->ctx, "Bad bitmap state\n"); + return -EINVAL; + } -err_no_res: - mlx5hws_err(pool->ctx, "Failed to allocate element for order: %d\n", order); - return -ENOMEM; -} + size = 1 << pool->alloc_log_sz; -static int hws_pool_general_element_db_get_chunk(struct mlx5hws_pool *pool, - struct mlx5hws_pool_chunk *chunk) -{ - int ret; + 
chunk->offset = find_first_bit(bitmap, size); + if (chunk->offset >= size) + return -ENOMEM; - /* Go over all memory elements and find/allocate free slot */ - ret = hws_pool_general_element_get_mem_chunk(pool, chunk->order, - &chunk->resource_idx, - &chunk->offset); - if (ret) - mlx5hws_err(pool->ctx, "Failed to get free slot for chunk with order: %d\n", - chunk->order); - - return ret; -} - -static void hws_pool_general_element_db_put_chunk(struct mlx5hws_pool *pool, - struct mlx5hws_pool_chunk *chunk) -{ - if (unlikely(!pool->resource[chunk->resource_idx])) - pr_warn("HWS: invalid resource with index %d\n", chunk->resource_idx); - - if (pool->flags & MLX5HWS_POOL_FLAGS_RELEASE_FREE_RESOURCE) - hws_pool_resource_free(pool, chunk->resource_idx); -} - -static void hws_pool_general_element_db_uninit(struct mlx5hws_pool *pool) -{ - (void)pool; -} - -/* This memory management works as the following: - * - At start doesn't allocate no mem at all. - * - When new request for chunk arrived: - * allocate resource and give it. - * - When free that chunk: - * the resource is freed. 
- */ -static int hws_pool_general_element_db_init(struct mlx5hws_pool *pool) -{ - pool->p_db_uninit = &hws_pool_general_element_db_uninit; - pool->p_get_chunk = &hws_pool_general_element_db_get_chunk; - pool->p_put_chunk = &hws_pool_general_element_db_put_chunk; + bitmap_clear(bitmap, chunk->offset, 1); return 0; } -static void hws_onesize_element_db_destroy_element(struct mlx5hws_pool *pool, - struct mlx5hws_pool_elements *elem, - struct mlx5hws_pool_chunk *chunk) +static void hws_pool_bitmap_db_put_chunk(struct mlx5hws_pool *pool, + struct mlx5hws_pool_chunk *chunk) { - if (unlikely(!pool->resource[chunk->resource_idx])) - pr_warn("HWS: invalid resource with index %d\n", chunk->resource_idx); + unsigned long *bitmap; - hws_pool_resource_free(pool, chunk->resource_idx); - kfree(elem); - pool->db.element_manager->elements[chunk->resource_idx] = NULL; -} - -static void hws_onesize_element_db_put_chunk(struct mlx5hws_pool *pool, - struct mlx5hws_pool_chunk *chunk) -{ - struct mlx5hws_pool_elements *elem; - - if (unlikely(chunk->resource_idx)) - pr_warn("HWS: invalid resource with index %d\n", chunk->resource_idx); - - elem = pool->db.element_manager->elements[chunk->resource_idx]; - if (!elem) { - mlx5hws_err(pool->ctx, "No such element (%d)\n", chunk->resource_idx); + bitmap = pool->db.bitmap; + if (!bitmap) { + mlx5hws_err(pool->ctx, "Bad bitmap state\n"); return; } - bitmap_set(elem->bitmap, chunk->offset, 1); - elem->is_full = false; - elem->num_of_elements--; - - if (pool->flags & MLX5HWS_POOL_FLAGS_RELEASE_FREE_RESOURCE && - !elem->num_of_elements) - hws_onesize_element_db_destroy_element(pool, elem, chunk); + bitmap_set(bitmap, chunk->offset, 1); } -static int hws_onesize_element_db_get_chunk(struct mlx5hws_pool *pool, - struct mlx5hws_pool_chunk *chunk) +static void hws_pool_bitmap_db_uninit(struct mlx5hws_pool *pool) { - int ret = 0; + unsigned long *bitmap; - /* Go over all memory elements and find/allocate free slot */ - ret = 
hws_pool_onesize_element_get_mem_chunk(pool, chunk->order, - &chunk->resource_idx, - &chunk->offset); - if (ret) - mlx5hws_err(pool->ctx, "Failed to get free slot for chunk with order: %d\n", - chunk->order); - - return ret; -} - -static void hws_onesize_element_db_uninit(struct mlx5hws_pool *pool) -{ - struct mlx5hws_pool_elements *elem; - int i; - - for (i = 0; i < MLX5HWS_POOL_RESOURCE_ARR_SZ; i++) { - elem = pool->db.element_manager->elements[i]; - if (elem) { - bitmap_free(elem->bitmap); - kfree(elem); - pool->db.element_manager->elements[i] = NULL; - } + bitmap = pool->db.bitmap; + if (bitmap) { + bitmap_free(bitmap); + pool->db.bitmap = NULL; } - kfree(pool->db.element_manager); } -/* This memory management works as the following: - * - At start doesn't allocate no mem at all. - * - When new request for chunk arrived: - * aloocate the first and only slot of memory/resource - * when it ended return error. - */ -static int hws_pool_onesize_element_db_init(struct mlx5hws_pool *pool) +static int hws_pool_bitmap_db_init(struct mlx5hws_pool *pool) { - pool->db.element_manager = kzalloc(sizeof(*pool->db.element_manager), GFP_KERNEL); - if (!pool->db.element_manager) - return -ENOMEM; + int ret; - pool->p_db_uninit = &hws_onesize_element_db_uninit; - pool->p_get_chunk = &hws_onesize_element_db_get_chunk; - pool->p_put_chunk = &hws_onesize_element_db_put_chunk; + ret = hws_pool_bitmap_init(pool); + if (ret) + return ret; + + pool->p_db_uninit = &hws_pool_bitmap_db_uninit; + pool->p_get_chunk = &hws_pool_bitmap_db_get_chunk; + pool->p_put_chunk = &hws_pool_bitmap_db_put_chunk; return 0; } @@ -542,15 +299,14 @@ static int hws_pool_db_init(struct mlx5hws_pool *pool, { int ret; - if (db_type == MLX5HWS_POOL_DB_TYPE_GENERAL_SIZE) - ret = hws_pool_general_element_db_init(pool); - else if (db_type == MLX5HWS_POOL_DB_TYPE_ONE_SIZE_RESOURCE) - ret = hws_pool_onesize_element_db_init(pool); + if (db_type == MLX5HWS_POOL_DB_TYPE_BITMAP) + ret = hws_pool_bitmap_db_init(pool); 
else - ret = hws_pool_buddy_db_init(pool, pool->alloc_log_sz); + ret = hws_pool_buddy_db_init(pool); if (ret) { - mlx5hws_err(pool->ctx, "Failed to init general db : %d (ret: %d)\n", db_type, ret); + mlx5hws_err(pool->ctx, "Failed to init pool type: %d (ret: %d)\n", + db_type, ret); return ret; } @@ -569,6 +325,8 @@ int mlx5hws_pool_chunk_alloc(struct mlx5hws_pool *pool, mutex_lock(&pool->lock); ret = pool->p_get_chunk(pool, chunk); + if (ret == 0) + pool->available_elems -= 1 << chunk->order; mutex_unlock(&pool->lock); return ret; @@ -579,6 +337,7 @@ void mlx5hws_pool_chunk_free(struct mlx5hws_pool *pool, { mutex_lock(&pool->lock); pool->p_put_chunk(pool, chunk); + pool->available_elems += 1 << chunk->order; mutex_unlock(&pool->lock); } @@ -599,17 +358,13 @@ mlx5hws_pool_create(struct mlx5hws_context *ctx, struct mlx5hws_pool_attr *pool_ pool->tbl_type = pool_attr->table_type; pool->opt_type = pool_attr->opt_type; - /* Support general db */ - if (pool->flags == (MLX5HWS_POOL_FLAGS_RELEASE_FREE_RESOURCE | - MLX5HWS_POOL_FLAGS_RESOURCE_PER_CHUNK)) - res_db_type = MLX5HWS_POOL_DB_TYPE_GENERAL_SIZE; - else if (pool->flags == (MLX5HWS_POOL_FLAGS_ONE_RESOURCE | - MLX5HWS_POOL_FLAGS_FIXED_SIZE_OBJECTS)) - res_db_type = MLX5HWS_POOL_DB_TYPE_ONE_SIZE_RESOURCE; - else + if (pool->flags & MLX5HWS_POOL_FLAG_BUDDY) res_db_type = MLX5HWS_POOL_DB_TYPE_BUDDY; + else + res_db_type = MLX5HWS_POOL_DB_TYPE_BITMAP; pool->alloc_log_sz = pool_attr->alloc_log_sz; + pool->available_elems = 1 << pool_attr->alloc_log_sz; if (hws_pool_db_init(pool, res_db_type)) goto free_pool; @@ -623,18 +378,17 @@ free_pool: return NULL; } -int mlx5hws_pool_destroy(struct mlx5hws_pool *pool) +void mlx5hws_pool_destroy(struct mlx5hws_pool *pool) { - int i; - mutex_destroy(&pool->lock); - for (i = 0; i < MLX5HWS_POOL_RESOURCE_ARR_SZ; i++) - if (pool->resource[i]) - hws_pool_resource_free(pool, i); + if (pool->available_elems != 1 << pool->alloc_log_sz) + mlx5hws_err(pool->ctx, "Attempting to destroy 
non-empty pool\n"); + + if (pool->resource) + hws_pool_resource_free(pool); hws_pool_db_unint(pool); kfree(pool); - return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h index 621298b352..33e33d5f1f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h @@ -6,16 +6,12 @@ #define MLX5HWS_POOL_STC_LOG_SZ 15 -#define MLX5HWS_POOL_RESOURCE_ARR_SZ 100 - enum mlx5hws_pool_type { MLX5HWS_POOL_TYPE_STE, MLX5HWS_POOL_TYPE_STC, }; struct mlx5hws_pool_chunk { - u32 resource_idx; - /* Internal offset, relative to base index */ int offset; int order; }; @@ -27,35 +23,17 @@ struct mlx5hws_pool_resource { }; enum mlx5hws_pool_flags { - /* Only a one resource in that pool */ - MLX5HWS_POOL_FLAGS_ONE_RESOURCE = 1 << 0, - MLX5HWS_POOL_FLAGS_RELEASE_FREE_RESOURCE = 1 << 1, - /* No sharing resources between chunks */ - MLX5HWS_POOL_FLAGS_RESOURCE_PER_CHUNK = 1 << 2, - /* All objects are in the same size */ - MLX5HWS_POOL_FLAGS_FIXED_SIZE_OBJECTS = 1 << 3, - /* Managed by buddy allocator */ - MLX5HWS_POOL_FLAGS_BUDDY_MANAGED = 1 << 4, - /* Allocate pool_type memory on pool creation */ - MLX5HWS_POOL_FLAGS_ALLOC_MEM_ON_CREATE = 1 << 5, - - /* These values should be used by the caller */ - MLX5HWS_POOL_FLAGS_FOR_STC_POOL = - MLX5HWS_POOL_FLAGS_ONE_RESOURCE | - MLX5HWS_POOL_FLAGS_FIXED_SIZE_OBJECTS, - MLX5HWS_POOL_FLAGS_FOR_MATCHER_STE_POOL = - MLX5HWS_POOL_FLAGS_RELEASE_FREE_RESOURCE | - MLX5HWS_POOL_FLAGS_RESOURCE_PER_CHUNK, - MLX5HWS_POOL_FLAGS_FOR_STE_ACTION_POOL = - MLX5HWS_POOL_FLAGS_ONE_RESOURCE | - MLX5HWS_POOL_FLAGS_BUDDY_MANAGED | - MLX5HWS_POOL_FLAGS_ALLOC_MEM_ON_CREATE, + /* Managed by a buddy allocator. If this is not set only allocations of + * order 0 are supported. 
+ */ + MLX5HWS_POOL_FLAG_BUDDY = BIT(0), }; enum mlx5hws_pool_optimize { MLX5HWS_POOL_OPTIMIZE_NONE = 0x0, MLX5HWS_POOL_OPTIMIZE_ORIG = 0x1, MLX5HWS_POOL_OPTIMIZE_MIRROR = 0x2, + MLX5HWS_POOL_OPTIMIZE_MAX = 0x3, }; struct mlx5hws_pool_attr { @@ -68,34 +46,17 @@ struct mlx5hws_pool_attr { }; enum mlx5hws_db_type { - /* Uses for allocating chunk of big memory, each element has its own resource in the FW*/ - MLX5HWS_POOL_DB_TYPE_GENERAL_SIZE, - /* One resource only, all the elements are with same one size */ - MLX5HWS_POOL_DB_TYPE_ONE_SIZE_RESOURCE, - /* Many resources, the memory allocated with buddy mechanism */ + /* Uses a bitmap, supports only allocations of order 0. */ + MLX5HWS_POOL_DB_TYPE_BITMAP, + /* Entries are managed using a buddy mechanism. */ MLX5HWS_POOL_DB_TYPE_BUDDY, }; -struct mlx5hws_buddy_manager { - struct mlx5hws_buddy_mem *buddies[MLX5HWS_POOL_RESOURCE_ARR_SZ]; -}; - -struct mlx5hws_pool_elements { - u32 num_of_elements; - unsigned long *bitmap; - u32 log_size; - bool is_full; -}; - -struct mlx5hws_element_manager { - struct mlx5hws_pool_elements *elements[MLX5HWS_POOL_RESOURCE_ARR_SZ]; -}; - struct mlx5hws_pool_db { enum mlx5hws_db_type type; union { - struct mlx5hws_element_manager *element_manager; - struct mlx5hws_buddy_manager *buddy_manager; + unsigned long *bitmap; + struct mlx5hws_buddy_mem *buddy; }; }; @@ -111,11 +72,11 @@ struct mlx5hws_pool { enum mlx5hws_pool_flags flags; struct mutex lock; /* protect the pool */ size_t alloc_log_sz; + size_t available_elems; enum mlx5hws_table_type tbl_type; enum mlx5hws_pool_optimize opt_type; - struct mlx5hws_pool_resource *resource[MLX5HWS_POOL_RESOURCE_ARR_SZ]; - struct mlx5hws_pool_resource *mirror_resource[MLX5HWS_POOL_RESOURCE_ARR_SZ]; - /* DB */ + struct mlx5hws_pool_resource *resource; + struct mlx5hws_pool_resource *mirror_resource; struct mlx5hws_pool_db db; /* Functions */ mlx5hws_pool_unint_db p_db_uninit; @@ -127,7 +88,7 @@ struct mlx5hws_pool * mlx5hws_pool_create(struct 
mlx5hws_context *ctx, struct mlx5hws_pool_attr *pool_attr); -int mlx5hws_pool_destroy(struct mlx5hws_pool *pool); +void mlx5hws_pool_destroy(struct mlx5hws_pool *pool); int mlx5hws_pool_chunk_alloc(struct mlx5hws_pool *pool, struct mlx5hws_pool_chunk *chunk); @@ -135,17 +96,37 @@ int mlx5hws_pool_chunk_alloc(struct mlx5hws_pool *pool, void mlx5hws_pool_chunk_free(struct mlx5hws_pool *pool, struct mlx5hws_pool_chunk *chunk); -static inline u32 -mlx5hws_pool_chunk_get_base_id(struct mlx5hws_pool *pool, - struct mlx5hws_pool_chunk *chunk) +static inline u32 mlx5hws_pool_get_base_id(struct mlx5hws_pool *pool) { - return pool->resource[chunk->resource_idx]->base_id; + return pool->resource->base_id; } -static inline u32 -mlx5hws_pool_chunk_get_base_mirror_id(struct mlx5hws_pool *pool, - struct mlx5hws_pool_chunk *chunk) +static inline u32 mlx5hws_pool_get_base_mirror_id(struct mlx5hws_pool *pool) { - return pool->mirror_resource[chunk->resource_idx]->base_id; + return pool->mirror_resource->base_id; +} + +static inline bool +mlx5hws_pool_empty(struct mlx5hws_pool *pool) +{ + bool ret; + + mutex_lock(&pool->lock); + ret = pool->available_elems == 0; + mutex_unlock(&pool->lock); + + return ret; +} + +static inline bool +mlx5hws_pool_full(struct mlx5hws_pool *pool) +{ + bool ret; + + mutex_lock(&pool->lock); + ret = pool->available_elems == (1 << pool->alloc_log_sz); + mutex_unlock(&pool->lock); + + return ret; } #endif /* MLX5HWS_POOL_H_ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c index a27a2d5ffc..a94f094e72 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c @@ -3,10 +3,8 @@ #include "internal.h" -static void hws_rule_skip(struct mlx5hws_matcher *matcher, - struct mlx5hws_match_template *mt, - u32 flow_source, - bool *skip_rx, bool *skip_tx) +void mlx5hws_rule_skip(struct mlx5hws_matcher 
*matcher, u32 flow_source, + bool *skip_rx, bool *skip_tx) { /* By default FDB rules are added to both RX and TX */ *skip_rx = false; @@ -14,20 +12,21 @@ static void hws_rule_skip(struct mlx5hws_matcher *matcher, if (flow_source == MLX5_FLOW_CONTEXT_FLOW_SOURCE_LOCAL_VPORT) { *skip_rx = true; - } else if (flow_source == MLX5_FLOW_CONTEXT_FLOW_SOURCE_UPLINK) { - *skip_tx = true; - } else { - /* If no flow source was set for current rule, - * check for flow source in matcher attributes. - */ - if (matcher->attr.optimize_flow_src) { - *skip_tx = - matcher->attr.optimize_flow_src == MLX5HWS_MATCHER_FLOW_SRC_WIRE; - *skip_rx = - matcher->attr.optimize_flow_src == MLX5HWS_MATCHER_FLOW_SRC_VPORT; - return; - } + return; } + + if (flow_source == MLX5_FLOW_CONTEXT_FLOW_SOURCE_UPLINK) { + *skip_tx = true; + return; + } + + /* If no flow source was set for current rule, + * check for flow source in matcher attributes. + */ + *skip_tx = matcher->attr.optimize_flow_src == + MLX5HWS_MATCHER_FLOW_SRC_WIRE; + *skip_rx = matcher->attr.optimize_flow_src == + MLX5HWS_MATCHER_FLOW_SRC_VPORT; } static void @@ -66,7 +65,8 @@ static void hws_rule_init_dep_wqe(struct mlx5hws_send_ring_dep_wqe *dep_wqe, attr->rule_idx : 0; if (tbl->type == MLX5HWS_TABLE_TYPE_FDB) { - hws_rule_skip(matcher, mt, attr->flow_source, &skip_rx, &skip_tx); + mlx5hws_rule_skip(matcher, attr->flow_source, + &skip_rx, &skip_tx); if (!skip_rx) { dep_wqe->rtc_0 = matcher->match_ste.rtc_0_id; @@ -195,44 +195,30 @@ hws_rule_load_delete_info(struct mlx5hws_rule *rule, } } -static int hws_rule_alloc_action_ste(struct mlx5hws_rule *rule) +static int mlx5hws_rule_alloc_action_ste(struct mlx5hws_rule *rule, + u16 queue_id, bool skip_rx, + bool skip_tx) { struct mlx5hws_matcher *matcher = rule->matcher; - struct mlx5hws_matcher_action_ste *action_ste; - struct mlx5hws_pool_chunk ste = {0}; - int ret; + struct mlx5hws_context *ctx = matcher->tbl->ctx; - action_ste = &matcher->action_ste; - ste.order = 
ilog2(roundup_pow_of_two(action_ste->max_stes)); - ret = mlx5hws_pool_chunk_alloc(action_ste->pool, &ste); - if (unlikely(ret)) { - mlx5hws_err(matcher->tbl->ctx, - "Failed to allocate STE for rule actions"); - return ret; - } - - rule->action_ste.pool = matcher->action_ste.pool; - rule->action_ste.num_stes = matcher->action_ste.max_stes; - rule->action_ste.index = ste.offset; - - return 0; + rule->action_ste.ste.order = + ilog2(roundup_pow_of_two(matcher->num_of_action_stes)); + return mlx5hws_action_ste_chunk_alloc(&ctx->action_ste_pool[queue_id], + skip_rx, skip_tx, + &rule->action_ste); } -void mlx5hws_rule_free_action_ste(struct mlx5hws_rule_action_ste_info *action_ste) +void mlx5hws_rule_free_action_ste(struct mlx5hws_action_ste_chunk *action_ste) { - struct mlx5hws_pool_chunk ste = {0}; - - if (!action_ste->num_stes) + if (!action_ste->action_tbl) return; - ste.order = ilog2(roundup_pow_of_two(action_ste->num_stes)); - ste.offset = action_ste->index; - /* This release is safe only when the rule match STE was deleted * (when the rule is being deleted) or replaced with the new STE that * isn't pointing to old action STEs (when the rule is being updated). */ - mlx5hws_pool_chunk_free(action_ste->pool, &ste); + mlx5hws_action_ste_chunk_free(action_ste); } static void hws_rule_create_init(struct mlx5hws_rule *rule, @@ -250,22 +236,15 @@ static void hws_rule_create_init(struct mlx5hws_rule *rule, rule->rtc_0 = 0; rule->rtc_1 = 0; - rule->action_ste.pool = NULL; - rule->action_ste.num_stes = 0; - rule->action_ste.index = -1; - rule->status = MLX5HWS_RULE_STATUS_CREATING; } else { rule->status = MLX5HWS_RULE_STATUS_UPDATING; + /* Save the old action STE info so we can free it after writing + * new action STEs and a corresponding match STE. + */ + rule->old_action_ste = rule->action_ste; } - /* Initialize the old action STE info - shallow-copy action_ste. - * In create flow this will set old_action_ste fields to initial values. 
- * In update flow this will save the existing action STE info, - * so that we will later use it to free old STEs. - */ - rule->old_action_ste = rule->action_ste; - rule->pending_wqes = 0; /* Init default send STE attributes */ @@ -277,7 +256,6 @@ static void hws_rule_create_init(struct mlx5hws_rule *rule, /* Init default action apply */ apply->tbl_type = tbl->type; apply->common_res = &ctx->common_res; - apply->jump_to_action_stc = matcher->action_ste.stc.offset; apply->require_dep = 0; } @@ -353,17 +331,24 @@ static int hws_rule_create_hws(struct mlx5hws_rule *rule, if (action_stes) { /* Allocate action STEs for rules that need more than match STE */ - ret = hws_rule_alloc_action_ste(rule); + ret = mlx5hws_rule_alloc_action_ste(rule, attr->queue_id, + !!ste_attr.rtc_0, + !!ste_attr.rtc_1); if (ret) { mlx5hws_err(ctx, "Failed to allocate action memory %d", ret); mlx5hws_send_abort_new_dep_wqe(queue); return ret; } + apply.jump_to_action_stc = + rule->action_ste.action_tbl->stc.offset; /* Skip RX/TX based on the dep_wqe init */ - ste_attr.rtc_0 = dep_wqe->rtc_0 ? matcher->action_ste.rtc_0_id : 0; - ste_attr.rtc_1 = dep_wqe->rtc_1 ? matcher->action_ste.rtc_1_id : 0; + ste_attr.rtc_0 = dep_wqe->rtc_0 ? + rule->action_ste.action_tbl->rtc_0_id : 0; + ste_attr.rtc_1 = dep_wqe->rtc_1 ? 
+ rule->action_ste.action_tbl->rtc_1_id : 0; /* Action STEs are written to a specific index last to first */ - ste_attr.direct_index = rule->action_ste.index + action_stes; + ste_attr.direct_index = + rule->action_ste.ste.offset + action_stes; apply.next_direct_idx = ste_attr.direct_index; } else { apply.next_direct_idx = 0; @@ -670,6 +655,124 @@ int mlx5hws_rule_move_hws_add(struct mlx5hws_rule *rule, return 0; } +static u8 hws_rule_ethertype_to_matcher_ipv(u32 ethertype) +{ + switch (ethertype) { + case ETH_P_IP: + return MLX5HWS_MATCHER_IPV_4; + case ETH_P_IPV6: + return MLX5HWS_MATCHER_IPV_6; + default: + return MLX5HWS_MATCHER_IPV_UNSET; + } +} + +static u8 hws_rule_ip_version_to_matcher_ipv(u32 ip_version) +{ + switch (ip_version) { + case 4: + return MLX5HWS_MATCHER_IPV_4; + case 6: + return MLX5HWS_MATCHER_IPV_6; + default: + return MLX5HWS_MATCHER_IPV_UNSET; + } +} + +static int hws_rule_check_outer_ip_version(struct mlx5hws_matcher *matcher, + u32 *match_param) +{ + struct mlx5hws_context *ctx = matcher->tbl->ctx; + u8 outer_ipv_ether = MLX5HWS_MATCHER_IPV_UNSET; + u8 outer_ipv_ip = MLX5HWS_MATCHER_IPV_UNSET; + u8 outer_ipv, ver; + + if (matcher->matches_outer_ethertype) { + ver = MLX5_GET(fte_match_param, match_param, + outer_headers.ethertype); + outer_ipv_ether = hws_rule_ethertype_to_matcher_ipv(ver); + } + if (matcher->matches_outer_ip_version) { + ver = MLX5_GET(fte_match_param, match_param, + outer_headers.ip_version); + outer_ipv_ip = hws_rule_ip_version_to_matcher_ipv(ver); + } + + if (outer_ipv_ether != MLX5HWS_MATCHER_IPV_UNSET && + outer_ipv_ip != MLX5HWS_MATCHER_IPV_UNSET && + outer_ipv_ether != outer_ipv_ip) { + mlx5hws_err(ctx, "Rule matches on inconsistent outer ethertype and ip version\n"); + return -EINVAL; + } + + outer_ipv = outer_ipv_ether != MLX5HWS_MATCHER_IPV_UNSET ? 
+ outer_ipv_ether : outer_ipv_ip; + if (outer_ipv != MLX5HWS_MATCHER_IPV_UNSET && + matcher->outer_ip_version != MLX5HWS_MATCHER_IPV_UNSET && + outer_ipv != matcher->outer_ip_version) { + mlx5hws_err(ctx, "Matcher and rule disagree on outer IP version\n"); + return -EINVAL; + } + matcher->outer_ip_version = outer_ipv; + + return 0; +} + +static int hws_rule_check_inner_ip_version(struct mlx5hws_matcher *matcher, + u32 *match_param) +{ + struct mlx5hws_context *ctx = matcher->tbl->ctx; + u8 inner_ipv_ether = MLX5HWS_MATCHER_IPV_UNSET; + u8 inner_ipv_ip = MLX5HWS_MATCHER_IPV_UNSET; + u8 inner_ipv, ver; + + if (matcher->matches_inner_ethertype) { + ver = MLX5_GET(fte_match_param, match_param, + inner_headers.ethertype); + inner_ipv_ether = hws_rule_ethertype_to_matcher_ipv(ver); + } + if (matcher->matches_inner_ip_version) { + ver = MLX5_GET(fte_match_param, match_param, + inner_headers.ip_version); + inner_ipv_ip = hws_rule_ip_version_to_matcher_ipv(ver); + } + + if (inner_ipv_ether != MLX5HWS_MATCHER_IPV_UNSET && + inner_ipv_ip != MLX5HWS_MATCHER_IPV_UNSET && + inner_ipv_ether != inner_ipv_ip) { + mlx5hws_err(ctx, "Rule matches on inconsistent inner ethertype and ip version\n"); + return -EINVAL; + } + + inner_ipv = inner_ipv_ether != MLX5HWS_MATCHER_IPV_UNSET ? 
+ inner_ipv_ether : inner_ipv_ip; + if (inner_ipv != MLX5HWS_MATCHER_IPV_UNSET && + matcher->inner_ip_version != MLX5HWS_MATCHER_IPV_UNSET && + inner_ipv != matcher->inner_ip_version) { + mlx5hws_err(ctx, "Matcher and rule disagree on inner IP version\n"); + return -EINVAL; + } + matcher->inner_ip_version = inner_ipv; + + return 0; +} + +static int hws_rule_check_ip_version(struct mlx5hws_matcher *matcher, + u32 *match_param) +{ + int ret; + + ret = hws_rule_check_outer_ip_version(matcher, match_param); + if (unlikely(ret)) + return ret; + + ret = hws_rule_check_inner_ip_version(matcher, match_param); + if (unlikely(ret)) + return ret; + + return 0; +} + int mlx5hws_rule_create(struct mlx5hws_matcher *matcher, u8 mt_idx, u32 *match_param, @@ -680,6 +783,10 @@ int mlx5hws_rule_create(struct mlx5hws_matcher *matcher, { int ret; + ret = hws_rule_check_ip_version(matcher, match_param); + if (unlikely(ret)) + return ret; + rule_handle->matcher = matcher; ret = hws_rule_enqueue_precheck_create(rule_handle, attr); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.h index b5ee94ac44..d0f082b8db 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.h @@ -43,12 +43,6 @@ struct mlx5hws_rule_match_tag { }; }; -struct mlx5hws_rule_action_ste_info { - struct mlx5hws_pool *pool; - int index; /* STE array index */ - u8 num_stes; -}; - struct mlx5hws_rule_resize_info { u32 rtc_0; u32 rtc_1; @@ -64,8 +58,8 @@ struct mlx5hws_rule { struct mlx5hws_rule_match_tag tag; struct mlx5hws_rule_resize_info *resize_info; }; - struct mlx5hws_rule_action_ste_info action_ste; - struct mlx5hws_rule_action_ste_info old_action_ste; + struct mlx5hws_action_ste_chunk action_ste; + struct mlx5hws_action_ste_chunk old_action_ste; u32 rtc_0; /* The RTC into which the STE was inserted */ u32 rtc_1; /* The RTC into which the STE was inserted 
*/ u8 status; /* enum mlx5hws_rule_status */ @@ -75,7 +69,10 @@ struct mlx5hws_rule { */ }; -void mlx5hws_rule_free_action_ste(struct mlx5hws_rule_action_ste_info *action_ste); +void mlx5hws_rule_skip(struct mlx5hws_matcher *matcher, u32 flow_source, + bool *skip_rx, bool *skip_tx); + +void mlx5hws_rule_free_action_ste(struct mlx5hws_action_ste_chunk *action_ste); int mlx5hws_rule_move_hws_remove(struct mlx5hws_rule *rule, void *queue, void *user_data); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c index cb6abc4ab7..7510c46e58 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c @@ -344,18 +344,133 @@ hws_send_engine_update_rule_resize(struct mlx5hws_send_engine *queue, } } +static void hws_send_engine_dump_error_cqe(struct mlx5hws_send_engine *queue, + struct mlx5hws_send_ring_priv *priv, + struct mlx5_cqe64 *cqe) +{ + u8 wqe_opcode = cqe ? be32_to_cpu(cqe->sop_drop_qpn) >> 24 : 0; + struct mlx5hws_context *ctx = priv->rule->matcher->tbl->ctx; + u32 opcode = cqe ? get_cqe_opcode(cqe) : 0; + struct mlx5hws_rule *rule = priv->rule; + + /* If something bad happens and lots of rules are failing, we don't + * want to pollute dmesg. Print only the first bad cqe per engine, + * the one that started the avalanche. + */ + if (queue->error_cqe_printed) + return; + + queue->error_cqe_printed = true; + + if (mlx5hws_rule_move_in_progress(rule)) + mlx5hws_err(ctx, + "--- rule 0x%08llx: error completion moving rule: phase %s, wqes left %d\n", + HWS_PTR_TO_ID(rule), + rule->resize_info->state == + MLX5HWS_RULE_RESIZE_STATE_WRITING ? "WRITING" : + rule->resize_info->state == + MLX5HWS_RULE_RESIZE_STATE_DELETING ? 
"DELETING" : + "UNKNOWN", + rule->pending_wqes); + else + mlx5hws_err(ctx, + "--- rule 0x%08llx: error completion %s (%d), wqes left %d\n", + HWS_PTR_TO_ID(rule), + rule->status == + MLX5HWS_RULE_STATUS_CREATING ? "CREATING" : + rule->status == + MLX5HWS_RULE_STATUS_DELETING ? "DELETING" : + rule->status == + MLX5HWS_RULE_STATUS_FAILING ? "FAILING" : + rule->status == + MLX5HWS_RULE_STATUS_UPDATING ? "UPDATING" : "NA", + rule->status, + rule->pending_wqes); + + mlx5hws_err(ctx, " rule 0x%08llx: matcher 0x%llx %s\n", + HWS_PTR_TO_ID(rule), + HWS_PTR_TO_ID(rule->matcher), + (rule->matcher->flags & MLX5HWS_MATCHER_FLAGS_ISOLATED) ? + "(isolated)" : ""); + + if (!cqe) { + mlx5hws_err(ctx, " rule 0x%08llx: no CQE\n", + HWS_PTR_TO_ID(rule)); + return; + } + + mlx5hws_err(ctx, " rule 0x%08llx: cqe->opcode = %d %s\n", + HWS_PTR_TO_ID(rule), opcode, + opcode == MLX5_CQE_REQ ? "(MLX5_CQE_REQ)" : + opcode == MLX5_CQE_REQ_ERR ? "(MLX5_CQE_REQ_ERR)" : " "); + + if (opcode == MLX5_CQE_REQ_ERR) { + struct mlx5_err_cqe *err_cqe = (struct mlx5_err_cqe *)cqe; + + mlx5hws_err(ctx, + " rule 0x%08llx: |--- hw_error_syndrome = 0x%x\n", + HWS_PTR_TO_ID(rule), + err_cqe->rsvd1[16]); + mlx5hws_err(ctx, + " rule 0x%08llx: |--- hw_syndrome_type = 0x%x\n", + HWS_PTR_TO_ID(rule), + err_cqe->rsvd1[17] >> 4); + mlx5hws_err(ctx, + " rule 0x%08llx: |--- vendor_err_synd = 0x%x\n", + HWS_PTR_TO_ID(rule), + err_cqe->vendor_err_synd); + mlx5hws_err(ctx, + " rule 0x%08llx: |--- syndrome = 0x%x\n", + HWS_PTR_TO_ID(rule), + err_cqe->syndrome); + } + + mlx5hws_err(ctx, + " rule 0x%08llx: cqe->byte_cnt = 0x%08x\n", + HWS_PTR_TO_ID(rule), be32_to_cpu(cqe->byte_cnt)); + mlx5hws_err(ctx, + " rule 0x%08llx: |-- UPDATE STATUS = %s\n", + HWS_PTR_TO_ID(rule), + (be32_to_cpu(cqe->byte_cnt) & 0x80000000) ? + "FAILURE" : "SUCCESS"); + mlx5hws_err(ctx, + " rule 0x%08llx: |------- SYNDROME = %s\n", + HWS_PTR_TO_ID(rule), + ((be32_to_cpu(cqe->byte_cnt) & 0x00000003) == 1) ? 
+ "SET_FLOW_FAIL" : + ((be32_to_cpu(cqe->byte_cnt) & 0x00000003) == 2) ? + "DISABLE_FLOW_FAIL" : "UNKNOWN"); + mlx5hws_err(ctx, + " rule 0x%08llx: cqe->sop_drop_qpn = 0x%08x\n", + HWS_PTR_TO_ID(rule), be32_to_cpu(cqe->sop_drop_qpn)); + mlx5hws_err(ctx, + " rule 0x%08llx: |-send wqe opcode = 0x%02x %s\n", + HWS_PTR_TO_ID(rule), wqe_opcode, + wqe_opcode == MLX5HWS_WQE_OPCODE_TBL_ACCESS ? + "(MLX5HWS_WQE_OPCODE_TBL_ACCESS)" : "(UNKNOWN)"); + mlx5hws_err(ctx, + " rule 0x%08llx: |------------ qpn = 0x%06x\n", + HWS_PTR_TO_ID(rule), + be32_to_cpu(cqe->sop_drop_qpn) & 0xffffff); +} + static void hws_send_engine_update_rule(struct mlx5hws_send_engine *queue, struct mlx5hws_send_ring_priv *priv, u16 wqe_cnt, - enum mlx5hws_flow_op_status *status) + enum mlx5hws_flow_op_status *status, + struct mlx5_cqe64 *cqe) { priv->rule->pending_wqes--; - if (*status == MLX5HWS_FLOW_OP_ERROR) { + if (unlikely(*status == MLX5HWS_FLOW_OP_ERROR)) { if (priv->retry_id) { + /* If there is a retry_id, then it's not an error yet, + * retry to insert this rule in the collision RTC. 
+ */ hws_send_engine_retry_post_send(queue, priv, wqe_cnt); return; } + hws_send_engine_dump_error_cqe(queue, priv, cqe); /* Some part of the rule failed */ priv->rule->status = MLX5HWS_RULE_STATUS_FAILING; *priv->used_id = 0; @@ -420,7 +535,8 @@ static void hws_send_engine_update(struct mlx5hws_send_engine *queue, if (priv->user_data) { if (priv->rule) { - hws_send_engine_update_rule(queue, priv, wqe_cnt, &status); + hws_send_engine_update_rule(queue, priv, wqe_cnt, + &status, cqe); /* Completion is provided on the last rule WQE */ if (priv->rule->pending_wqes) return; @@ -574,7 +690,7 @@ static int hws_send_ring_alloc_sq(struct mlx5_core_dev *mdev, size_t buf_sz; int err; - sq->uar_map = mdev->mlx5e_res.hw_objs.bfreg.map; + sq->uar_map = mdev->priv.bfreg.map; sq->mdev = mdev; param.db_numa_node = numa_node; @@ -648,7 +764,7 @@ static int hws_send_ring_create_sq(struct mlx5_core_dev *mdev, u32 pdn, MLX5_SET(sqc, sqc, ts_format, ts_format); MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); - MLX5_SET(wq, wq, uar_page, mdev->mlx5e_res.hw_objs.bfreg.index); + MLX5_SET(wq, wq, uar_page, mdev->priv.bfreg.index); MLX5_SET(wq, wq, log_wq_pg_sz, sq->wq_ctrl.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); MLX5_SET64(wq, wq, dbr_addr, sq->wq_ctrl.db.dma); @@ -757,12 +873,6 @@ err_free_sqc: return err; } -static void hws_cq_complete(struct mlx5_core_cq *mcq, - struct mlx5_eqe *eqe) -{ - pr_err("CQ completion CQ: #%u\n", mcq->cqn); -} - static int hws_send_ring_alloc_cq(struct mlx5_core_dev *mdev, int numa_node, struct mlx5hws_send_engine *queue, @@ -785,7 +895,6 @@ static int hws_send_ring_alloc_cq(struct mlx5_core_dev *mdev, mcq->cqe_sz = 64; mcq->set_ci_db = cq->wq_ctrl.db.db; mcq->arm_db = cq->wq_ctrl.db.db + 1; - mcq->comp = hws_cq_complete; for (i = 0; i < mlx5_cqwq_get_size(&cq->wq); i++) { cqe = mlx5_cqwq_get_wqe(&cq->wq, i); @@ -824,7 +933,7 @@ static int hws_send_ring_create_cq(struct mlx5_core_dev *mdev, (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas)); MLX5_SET(cqc, cqc, 
c_eqn_or_apu_element, eqn); - MLX5_SET(cqc, cqc, uar_page, mdev->priv.uar->index); + MLX5_SET(cqc, cqc, uar_page, mdev->priv.bfreg.up->index); MLX5_SET(cqc, cqc, log_page_size, cq->wq_ctrl.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); MLX5_SET64(cqc, cqc, dbr_addr, cq->wq_ctrl.db.dma); @@ -847,8 +956,7 @@ static int hws_send_ring_open_cq(struct mlx5_core_dev *mdev, if (!cqc_data) return -ENOMEM; - MLX5_SET(cqc, cqc_data, uar_page, mdev->priv.uar->index); - MLX5_SET(cqc, cqc_data, cqe_sz, queue->num_entries); + MLX5_SET(cqc, cqc_data, uar_page, mdev->priv.bfreg.up->index); MLX5_SET(cqc, cqc_data, log_cq_size, ilog2(queue->num_entries)); err = hws_send_ring_alloc_cq(mdev, numa_node, queue, cqc_data, cq); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.h index f833092235..3fb8e99309 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.h @@ -140,6 +140,7 @@ struct mlx5hws_send_engine { u16 used_entries; u16 num_entries; bool err; + bool error_cqe_printed; struct mutex lock; /* Protects the send engine */ }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c index ab12975312..6113383ae4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c @@ -9,6 +9,7 @@ u32 mlx5hws_table_get_id(struct mlx5hws_table *tbl) } static void hws_table_init_next_ft_attr(struct mlx5hws_table *tbl, + u16 uid, struct mlx5hws_cmd_ft_create_attr *ft_attr) { ft_attr->type = tbl->fw_ft_type; @@ -16,7 +17,9 @@ static void hws_table_init_next_ft_attr(struct mlx5hws_table *tbl, ft_attr->level = tbl->ctx->caps->fdb_ft.max_level - 1; else ft_attr->level = tbl->ctx->caps->nic_ft.max_level - 1; + ft_attr->rtc_valid = true; + ft_attr->uid = uid; } static void 
hws_table_set_cap_attr(struct mlx5hws_table *tbl, @@ -119,12 +122,12 @@ static int hws_table_connect_to_default_miss_tbl(struct mlx5hws_table *tbl, u32 int mlx5hws_table_create_default_ft(struct mlx5_core_dev *mdev, struct mlx5hws_table *tbl, - u32 *ft_id) + u16 uid, u32 *ft_id) { struct mlx5hws_cmd_ft_create_attr ft_attr = {0}; int ret; - hws_table_init_next_ft_attr(tbl, &ft_attr); + hws_table_init_next_ft_attr(tbl, uid, &ft_attr); hws_table_set_cap_attr(tbl, &ft_attr); ret = mlx5hws_cmd_flow_table_create(mdev, &ft_attr, ft_id); @@ -189,7 +192,10 @@ static int hws_table_init(struct mlx5hws_table *tbl) } mutex_lock(&ctx->ctrl_lock); - ret = mlx5hws_table_create_default_ft(tbl->ctx->mdev, tbl, &tbl->ft_id); + ret = mlx5hws_table_create_default_ft(tbl->ctx->mdev, + tbl, + tbl->uid, + &tbl->ft_id); if (ret) { mlx5hws_err(tbl->ctx, "Failed to create flow table object\n"); mutex_unlock(&ctx->ctrl_lock); @@ -239,6 +245,7 @@ struct mlx5hws_table *mlx5hws_table_create(struct mlx5hws_context *ctx, tbl->ctx = ctx; tbl->type = attr->type; tbl->level = attr->level; + tbl->uid = attr->uid; ret = hws_table_init(tbl); if (ret) { @@ -342,10 +349,10 @@ int mlx5hws_table_ft_set_next_rtc(struct mlx5hws_context *ctx, return mlx5hws_cmd_flow_table_modify(ctx->mdev, &ft_attr, ft_id); } -static int hws_table_ft_set_next_ft(struct mlx5hws_context *ctx, - u32 ft_id, - u32 fw_ft_type, - u32 next_ft_id) +int mlx5hws_table_ft_set_next_ft(struct mlx5hws_context *ctx, + u32 ft_id, + u32 fw_ft_type, + u32 next_ft_id) { struct mlx5hws_cmd_ft_modify_attr ft_attr = {0}; @@ -389,10 +396,10 @@ int mlx5hws_table_connect_to_miss_table(struct mlx5hws_table *src_tbl, if (dst_tbl) { if (list_empty(&dst_tbl->matchers_list)) { /* Connect src_tbl last_ft to dst_tbl start anchor */ - ret = hws_table_ft_set_next_ft(src_tbl->ctx, - last_ft_id, - src_tbl->fw_ft_type, - dst_tbl->ft_id); + ret = mlx5hws_table_ft_set_next_ft(src_tbl->ctx, + last_ft_id, + src_tbl->fw_ft_type, + dst_tbl->ft_id); if (ret) return ret; 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.h index dd50420eec..1246f9bd84 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.h @@ -18,6 +18,7 @@ struct mlx5hws_table { enum mlx5hws_table_type type; u32 fw_ft_type; u32 level; + u16 uid; struct list_head matchers_list; struct list_head tbl_list_node; struct mlx5hws_default_miss default_miss; @@ -47,7 +48,7 @@ u32 mlx5hws_table_get_res_fw_ft_type(enum mlx5hws_table_type tbl_type, int mlx5hws_table_create_default_ft(struct mlx5_core_dev *mdev, struct mlx5hws_table *tbl, - u32 *ft_id); + u16 uid, u32 *ft_id); void mlx5hws_table_destroy_default_ft(struct mlx5hws_table *tbl, u32 ft_id); @@ -65,4 +66,9 @@ int mlx5hws_table_ft_set_next_rtc(struct mlx5hws_context *ctx, u32 rtc_0_id, u32 rtc_1_id); +int mlx5hws_table_ft_set_next_ft(struct mlx5hws_context *ctx, + u32 ft_id, + u32 fw_ft_type, + u32 next_ft_id); + #endif /* MLX5HWS_TABLE_H_ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_cmd.c index baefb9a3fa..1ebb2b15c0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_cmd.c @@ -2,6 +2,7 @@ /* Copyright (c) 2019 Mellanox Technologies. */ #include "dr_types.h" +#include "eswitch.h" int mlx5dr_cmd_query_esw_vport_context(struct mlx5_core_dev *mdev, bool other_vport, @@ -34,34 +35,21 @@ int mlx5dr_cmd_query_esw_vport_context(struct mlx5_core_dev *mdev, int mlx5dr_cmd_query_gvmi(struct mlx5_core_dev *mdev, bool other_vport, u16 vport_number, u16 *gvmi) { - bool ec_vf_func = other_vport ? 
mlx5_core_is_ec_vf_vport(mdev, vport_number) : false; - u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {}; - int out_size; - void *out; int err; - out_size = MLX5_ST_SZ_BYTES(query_hca_cap_out); - out = kzalloc(out_size, GFP_KERNEL); - if (!out) - return -ENOMEM; + if (!other_vport) { + /* self vhca_id */ + *gvmi = MLX5_CAP_GEN(mdev, vhca_id); + return 0; + } - MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); - MLX5_SET(query_hca_cap_in, in, other_function, other_vport); - MLX5_SET(query_hca_cap_in, in, function_id, mlx5_vport_to_func_id(mdev, vport_number, ec_vf_func)); - MLX5_SET(query_hca_cap_in, in, ec_vf_function, ec_vf_func); - MLX5_SET(query_hca_cap_in, in, op_mod, - MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE << 1 | - HCA_CAP_OPMOD_GET_CUR); - - err = mlx5_cmd_exec_inout(mdev, query_hca_cap, in, out); + err = mlx5_vport_get_vhca_id(mdev, vport_number, gvmi); if (err) { - kfree(out); + mlx5_core_err(mdev, "Failed to get vport vhca id for vport %d\n", + vport_number); return err; } - *gvmi = MLX5_GET(query_hca_cap_out, out, capability.cmd_hca_cap.vhca_id); - - kfree(out); return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_domain.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_domain.c index 65740bb68b..e8c67ed9f7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_domain.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_domain.c @@ -410,7 +410,7 @@ static int dr_domain_caps_init(struct mlx5_core_dev *mdev, switch (dmn->type) { case MLX5DR_DOMAIN_TYPE_NIC_RX: if (!DR_DOMAIN_SW_STEERING_SUPPORTED(dmn, rx)) - return -ENOTSUPP; + return -EOPNOTSUPP; dmn->info.supp_sw_steering = true; dmn->info.rx.type = DR_DOMAIN_NIC_TYPE_RX; @@ -419,7 +419,7 @@ static int dr_domain_caps_init(struct mlx5_core_dev *mdev, break; case MLX5DR_DOMAIN_TYPE_NIC_TX: if (!DR_DOMAIN_SW_STEERING_SUPPORTED(dmn, tx)) - return -ENOTSUPP; + return -EOPNOTSUPP; dmn->info.supp_sw_steering = true; dmn->info.tx.type = 
DR_DOMAIN_NIC_TYPE_TX; @@ -428,10 +428,10 @@ static int dr_domain_caps_init(struct mlx5_core_dev *mdev, break; case MLX5DR_DOMAIN_TYPE_FDB: if (!dmn->info.caps.eswitch_manager) - return -ENOTSUPP; + return -EOPNOTSUPP; if (!DR_DOMAIN_SW_STEERING_SUPPORTED(dmn, fdb)) - return -ENOTSUPP; + return -EOPNOTSUPP; dmn->info.rx.type = DR_DOMAIN_NIC_TYPE_RX; dmn->info.tx.type = DR_DOMAIN_NIC_TYPE_TX; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c index 4fd4e84833..d034372fa0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c @@ -1049,12 +1049,6 @@ static int dr_prepare_qp_to_rts(struct mlx5dr_domain *dmn) return 0; } -static void dr_cq_complete(struct mlx5_core_cq *mcq, - struct mlx5_eqe *eqe) -{ - pr_err("CQ completion CQ: #%u\n", mcq->cqn); -} - static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev, struct mlx5_uars_page *uar, size_t ncqe) @@ -1089,6 +1083,13 @@ static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev, cqe->op_own = MLX5_CQE_INVALID << 4 | MLX5_CQE_OWNER_MASK; } + cq->mcq.cqe_sz = 64; + cq->mcq.set_ci_db = cq->wq_ctrl.db.db; + cq->mcq.arm_db = cq->wq_ctrl.db.db + 1; + *cq->mcq.set_ci_db = 0; + cq->mcq.vector = 0; + cq->mdev = mdev; + inlen = MLX5_ST_SZ_BYTES(create_cq_in) + sizeof(u64) * cq->wq_ctrl.buf.npages; in = kvzalloc(inlen, GFP_KERNEL); @@ -1112,28 +1113,12 @@ static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev, pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas); mlx5_fill_page_frag_array(&cq->wq_ctrl.buf, pas); - cq->mcq.comp = dr_cq_complete; - err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out)); kvfree(in); if (err) goto err_cqwq; - cq->mcq.cqe_sz = 64; - cq->mcq.set_ci_db = cq->wq_ctrl.db.db; - cq->mcq.arm_db = cq->wq_ctrl.db.db + 1; - *cq->mcq.set_ci_db = 0; - - /* set no-zero value, in order to avoid the 
HW to run db-recovery on - * CQ that used in polling mode. - */ - *cq->mcq.arm_db = cpu_to_be32(2 << 28); - - cq->mcq.vector = 0; - cq->mcq.uar = uar; - cq->mdev = mdev; - return cq; err_cqwq: diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/fs_dr.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/fs_dr.c index 8007d3f523..f367997ab6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/fs_dr.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/fs_dr.c @@ -833,15 +833,21 @@ static u32 mlx5_cmd_dr_get_capabilities(struct mlx5_flow_root_namespace *ns, return steering_caps; } -int mlx5_fs_dr_action_get_pkt_reformat_id(struct mlx5_pkt_reformat *pkt_reformat) +int +mlx5_fs_dr_action_get_pkt_reformat_id(struct mlx5_pkt_reformat *pkt_reformat, + u32 *reformat_id) { + struct mlx5dr_action *dr_action; + switch (pkt_reformat->reformat_type) { case MLX5_REFORMAT_TYPE_L2_TO_VXLAN: case MLX5_REFORMAT_TYPE_L2_TO_NVGRE: case MLX5_REFORMAT_TYPE_L2_TO_L2_TUNNEL: case MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL: case MLX5_REFORMAT_TYPE_INSERT_HDR: - return mlx5dr_action_get_pkt_reformat_id(pkt_reformat->fs_dr_action.dr_action); + dr_action = pkt_reformat->fs_dr_action.dr_action; + *reformat_id = mlx5dr_action_get_pkt_reformat_id(dr_action); + return 0; } return -EOPNOTSUPP; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/fs_dr.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/fs_dr.h index 99a3b2eff6..f869f2daef 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/fs_dr.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/fs_dr.h @@ -38,7 +38,9 @@ struct mlx5_fs_dr_table { bool mlx5_fs_dr_is_supported(struct mlx5_core_dev *dev); -int mlx5_fs_dr_action_get_pkt_reformat_id(struct mlx5_pkt_reformat *pkt_reformat); +int +mlx5_fs_dr_action_get_pkt_reformat_id(struct mlx5_pkt_reformat *pkt_reformat, + u32 *reformat_id); const struct mlx5_flow_cmds *mlx5_fs_cmd_get_dr_cmds(void); @@ -49,9 +51,11 @@ static 
inline const struct mlx5_flow_cmds *mlx5_fs_cmd_get_dr_cmds(void) return NULL; } -static inline u32 mlx5_fs_dr_action_get_pkt_reformat_id(struct mlx5_pkt_reformat *pkt_reformat) +static inline int +mlx5_fs_dr_action_get_pkt_reformat_id(struct mlx5_pkt_reformat *pkt_reformat, + u32 *reformat_id) { - return 0; + return -EOPNOTSUPP; } static inline bool mlx5_fs_dr_is_supported(struct mlx5_core_dev *dev) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index d10d4c3960..306affbcfd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -36,6 +36,7 @@ #include #include #include "mlx5_core.h" +#include "eswitch.h" #include "sf/sf.h" /* Mutex to hold while enabling or disabling RoCE */ @@ -77,15 +78,14 @@ int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, } static int mlx5_query_nic_vport_context(struct mlx5_core_dev *mdev, u16 vport, - u32 *out) + bool other_vport, u32 *out) { u32 in[MLX5_ST_SZ_DW(query_nic_vport_context_in)] = {}; MLX5_SET(query_nic_vport_context_in, in, opcode, MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT); MLX5_SET(query_nic_vport_context_in, in, vport_number, vport); - if (vport) - MLX5_SET(query_nic_vport_context_in, in, other_vport, 1); + MLX5_SET(query_nic_vport_context_in, in, other_vport, other_vport); return mlx5_cmd_exec_inout(mdev, query_nic_vport_context, in, out); } @@ -96,7 +96,7 @@ int mlx5_query_nic_vport_min_inline(struct mlx5_core_dev *mdev, u32 out[MLX5_ST_SZ_DW(query_nic_vport_context_out)] = {}; int err; - err = mlx5_query_nic_vport_context(mdev, vport, out); + err = mlx5_query_nic_vport_context(mdev, vport, vport > 0, out); if (!err) *min_inline = MLX5_GET(query_nic_vport_context_out, out, nic_vport_context.min_wqe_inline_mode); @@ -218,7 +218,7 @@ int mlx5_query_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 *mtu) if (!out) return -ENOMEM; - err = mlx5_query_nic_vport_context(mdev, 0, out); + err = 
mlx5_query_nic_vport_context(mdev, 0, false, out); if (!err) *mtu = MLX5_GET(query_nic_vport_context_out, out, nic_vport_context.mtu); @@ -428,7 +428,7 @@ int mlx5_query_nic_vport_system_image_guid(struct mlx5_core_dev *mdev, if (!out) return -ENOMEM; - err = mlx5_query_nic_vport_context(mdev, 0, out); + err = mlx5_query_nic_vport_context(mdev, 0, false, out); if (err) goto out; @@ -450,7 +450,7 @@ int mlx5_query_nic_vport_sd_group(struct mlx5_core_dev *mdev, u8 *sd_group) if (!out) return -ENOMEM; - err = mlx5_query_nic_vport_context(mdev, 0, out); + err = mlx5_query_nic_vport_context(mdev, 0, false, out); if (err) goto out; @@ -461,23 +461,27 @@ out: return err; } -int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid) +int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, + u16 vport, bool other_vport, u64 *node_guid) { u32 *out; int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out); + int err; out = kvzalloc(outlen, GFP_KERNEL); if (!out) return -ENOMEM; - mlx5_query_nic_vport_context(mdev, 0, out); + err = mlx5_query_nic_vport_context(mdev, vport, other_vport, out); + if (err) + goto out; *node_guid = MLX5_GET64(query_nic_vport_context_out, out, nic_vport_context.node_guid); - +out: kvfree(out); - return 0; + return err; } EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_node_guid); @@ -519,19 +523,22 @@ int mlx5_query_nic_vport_qkey_viol_cntr(struct mlx5_core_dev *mdev, { u32 *out; int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out); + int err; out = kvzalloc(outlen, GFP_KERNEL); if (!out) return -ENOMEM; - mlx5_query_nic_vport_context(mdev, 0, out); + err = mlx5_query_nic_vport_context(mdev, 0, false, out); + if (err) + goto out; *qkey_viol_cntr = MLX5_GET(query_nic_vport_context_out, out, nic_vport_context.qkey_violation_counter); - +out: kvfree(out); - return 0; + return err; } EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_qkey_viol_cntr); @@ -797,7 +804,7 @@ int mlx5_query_nic_vport_promisc(struct mlx5_core_dev *mdev, if (!out) 
return -ENOMEM; - err = mlx5_query_nic_vport_context(mdev, vport, out); + err = mlx5_query_nic_vport_context(mdev, vport, vport > 0, out); if (err) goto out; @@ -901,7 +908,7 @@ int mlx5_nic_vport_query_local_lb(struct mlx5_core_dev *mdev, bool *status) if (!out) return -ENOMEM; - err = mlx5_query_nic_vport_context(mdev, 0, out); + err = mlx5_query_nic_vport_context(mdev, 0, false, out); if (err) goto out; @@ -1183,18 +1190,63 @@ u64 mlx5_query_nic_system_image_guid(struct mlx5_core_dev *mdev) } EXPORT_SYMBOL_GPL(mlx5_query_nic_system_image_guid); +void mlx5_query_nic_sw_system_image_guid(struct mlx5_core_dev *mdev, u8 *buf, + u8 *len) +{ + u64 fw_system_image_guid; + + *len = 0; + + fw_system_image_guid = mlx5_query_nic_system_image_guid(mdev); + if (!fw_system_image_guid) + return; + + memcpy(buf, &fw_system_image_guid, sizeof(fw_system_image_guid)); + *len += sizeof(fw_system_image_guid); + + if (MLX5_CAP_GEN_2(mdev, load_balance_id) && + MLX5_CAP_GEN_2(mdev, lag_per_mp_group)) + buf[(*len)++] = MLX5_CAP_GEN_2(mdev, load_balance_id); +} + +static bool mlx5_vport_use_vhca_id_as_func_id(struct mlx5_core_dev *dev, + u16 vport_num, u16 *vhca_id) +{ + if (!MLX5_CAP_GEN_2(dev, function_id_type_vhca_id)) + return false; + + return mlx5_esw_vport_vhca_id(dev->priv.eswitch, vport_num, vhca_id); +} + int mlx5_vport_get_other_func_cap(struct mlx5_core_dev *dev, u16 vport, void *out, u16 opmod) { - bool ec_vf_func = mlx5_core_is_ec_vf_vport(dev, vport); u8 in[MLX5_ST_SZ_BYTES(query_hca_cap_in)] = {}; + u16 vhca_id = 0, function_id = 0; + bool ec_vf_func = false; + + /* if this vport is referring to a vport on the ec PF (embedded cpu ) + * let the FW know which domain we are querying since vport numbers or + * function_ids are not unique across the different PF domains, + * unless we use vhca_id as the function_id below. 
+ */ + ec_vf_func = mlx5_core_is_ec_vf_vport(dev, vport); + function_id = mlx5_vport_to_func_id(dev, vport, ec_vf_func); + + if (mlx5_vport_use_vhca_id_as_func_id(dev, vport, &vhca_id)) { + MLX5_SET(query_hca_cap_in, in, function_id_type, 1); + function_id = vhca_id; + ec_vf_func = false; + mlx5_core_dbg(dev, "%s using vhca_id as function_id for vport %d vhca_id 0x%x\n", + __func__, vport, vhca_id); + } opmod = (opmod << 1) | (HCA_CAP_OPMOD_GET_MAX & 0x01); MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); MLX5_SET(query_hca_cap_in, in, op_mod, opmod); - MLX5_SET(query_hca_cap_in, in, function_id, mlx5_vport_to_func_id(dev, vport, ec_vf_func)); MLX5_SET(query_hca_cap_in, in, other_function, true); MLX5_SET(query_hca_cap_in, in, ec_vf_function, ec_vf_func); + MLX5_SET(query_hca_cap_in, in, function_id, function_id); return mlx5_cmd_exec_inout(dev, query_hca_cap, in, out); } EXPORT_SYMBOL_GPL(mlx5_vport_get_other_func_cap); @@ -1206,7 +1258,9 @@ int mlx5_vport_get_vhca_id(struct mlx5_core_dev *dev, u16 vport, u16 *vhca_id) void *hca_caps; int err; - *vhca_id = 0; + /* try get vhca_id via eswitch */ + if (mlx5_esw_vport_vhca_id(dev->priv.eswitch, vport, vhca_id)) + return 0; query_ctx = kzalloc(query_out_sz, GFP_KERNEL); if (!query_ctx) @@ -1223,12 +1277,14 @@ out_free: kfree(query_ctx); return err; } +EXPORT_SYMBOL_GPL(mlx5_vport_get_vhca_id); int mlx5_vport_set_other_func_cap(struct mlx5_core_dev *dev, const void *hca_cap, u16 vport, u16 opmod) { - bool ec_vf_func = mlx5_core_is_ec_vf_vport(dev, vport); int set_sz = MLX5_ST_SZ_BYTES(set_hca_cap_in); + u16 vhca_id = 0, function_id = 0; + bool ec_vf_func = false; void *set_hca_cap; void *set_ctx; int ret; @@ -1237,14 +1293,29 @@ int mlx5_vport_set_other_func_cap(struct mlx5_core_dev *dev, const void *hca_cap if (!set_ctx) return -ENOMEM; + /* if this vport is referring to a vport on the ec PF (embedded cpu ) + * let the FW know which domain we are querying since vport numbers or + * function_ids are 
not unique across the different PF domains, + * unless we use vhca_id as the function_id below. + */ + ec_vf_func = mlx5_core_is_ec_vf_vport(dev, vport); + function_id = mlx5_vport_to_func_id(dev, vport, ec_vf_func); + + if (mlx5_vport_use_vhca_id_as_func_id(dev, vport, &vhca_id)) { + MLX5_SET(set_hca_cap_in, set_ctx, function_id_type, 1); + function_id = vhca_id; + ec_vf_func = false; + mlx5_core_dbg(dev, "%s using vhca_id as function_id for vport %d vhca_id 0x%x\n", + __func__, vport, vhca_id); + } + MLX5_SET(set_hca_cap_in, set_ctx, opcode, MLX5_CMD_OP_SET_HCA_CAP); MLX5_SET(set_hca_cap_in, set_ctx, op_mod, opmod << 1); set_hca_cap = MLX5_ADDR_OF(set_hca_cap_in, set_ctx, capability); memcpy(set_hca_cap, hca_cap, MLX5_ST_SZ_BYTES(cmd_hca_cap)); - MLX5_SET(set_hca_cap_in, set_ctx, function_id, - mlx5_vport_to_func_id(dev, vport, ec_vf_func)); MLX5_SET(set_hca_cap_in, set_ctx, other_function, true); MLX5_SET(set_hca_cap_in, set_ctx, ec_vf_function, ec_vf_func); + MLX5_SET(set_hca_cap_in, set_ctx, function_id, function_id); ret = mlx5_cmd_exec_in(dev, set_hca_cap, set_ctx); kfree(set_ctx); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wc.c b/drivers/net/ethernet/mellanox/mlx5/core/wc.c index 740b719e70..05e5fd777d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/wc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/wc.c @@ -7,6 +7,10 @@ #include "mlx5_core.h" #include "wq.h" +#if IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && IS_ENABLED(CONFIG_ARM64) +#include +#endif + #define TEST_WC_NUM_WQES 255 #define TEST_WC_LOG_CQ_SZ (order_base_2(TEST_WC_NUM_WQES)) #define TEST_WC_SQ_LOG_WQ_SZ TEST_WC_LOG_CQ_SZ @@ -94,7 +98,7 @@ static int create_wc_cq(struct mlx5_wc_cq *cq, void *cqc_data) MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE); MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn); - MLX5_SET(cqc, cqc, uar_page, mdev->priv.uar->index); + MLX5_SET(cqc, cqc, uar_page, mdev->priv.bfreg.up->index); MLX5_SET(cqc, cqc, log_page_size, 
cq->wq_ctrl.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); MLX5_SET64(cqc, cqc, dbr_addr, cq->wq_ctrl.db.dma); @@ -116,7 +120,7 @@ static int mlx5_wc_create_cq(struct mlx5_core_dev *mdev, struct mlx5_wc_cq *cq) return -ENOMEM; MLX5_SET(cqc, cqc, log_cq_size, TEST_WC_LOG_CQ_SZ); - MLX5_SET(cqc, cqc, uar_page, mdev->priv.uar->index); + MLX5_SET(cqc, cqc, uar_page, mdev->priv.bfreg.up->index); if (MLX5_CAP_GEN(mdev, cqe_128_always) && cache_line_size() >= 128) MLX5_SET(cqc, cqc, cqe_sz, CQE_STRIDE_128_PAD); @@ -255,7 +259,29 @@ static void mlx5_wc_destroy_sq(struct mlx5_wc_sq *sq) mlx5_wq_destroy(&sq->wq_ctrl); } -static void mlx5_wc_post_nop(struct mlx5_wc_sq *sq, bool signaled) +static void mlx5_iowrite64_copy(struct mlx5_wc_sq *sq, __be32 mmio_wqe[16], + size_t mmio_wqe_size, unsigned int offset) +{ +#if IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && IS_ENABLED(CONFIG_ARM64) + if (cpu_has_neon()) { + kernel_neon_begin(); + asm volatile + (".arch_extension simd\n\t" + "ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [%0]\n\t" + "st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [%1]" + : + : "r"(mmio_wqe), "r"(sq->bfreg.map + offset) + : "memory", "v0", "v1", "v2", "v3"); + kernel_neon_end(); + return; + } +#endif + __iowrite64_copy(sq->bfreg.map + offset, mmio_wqe, + mmio_wqe_size / 8); +} + +static void mlx5_wc_post_nop(struct mlx5_wc_sq *sq, unsigned int *offset, + bool signaled) { int buf_size = (1 << MLX5_CAP_GEN(sq->cq.mdev, log_bf_reg_size)) / 2; struct mlx5_wqe_ctrl_seg *ctrl; @@ -288,10 +314,9 @@ static void mlx5_wc_post_nop(struct mlx5_wc_sq *sq, bool signaled) */ wmb(); - __iowrite64_copy(sq->bfreg.map + sq->bfreg.offset, mmio_wqe, - sizeof(mmio_wqe) / 8); + mlx5_iowrite64_copy(sq, mmio_wqe, sizeof(mmio_wqe), *offset); - sq->bfreg.offset ^= buf_size; + *offset ^= buf_size; } static int mlx5_wc_poll_cq(struct mlx5_wc_sq *sq) @@ -332,6 +357,7 @@ static int mlx5_wc_poll_cq(struct mlx5_wc_sq *sq) static void mlx5_core_test_wc(struct mlx5_core_dev *mdev) { + unsigned int offset = 0; unsigned 
long expires; struct mlx5_wc_sq *sq; int i, err; @@ -358,9 +384,9 @@ static void mlx5_core_test_wc(struct mlx5_core_dev *mdev) goto err_create_sq; for (i = 0; i < TEST_WC_NUM_WQES - 1; i++) - mlx5_wc_post_nop(sq, false); + mlx5_wc_post_nop(sq, &offset, false); - mlx5_wc_post_nop(sq, true); + mlx5_wc_post_nop(sq, &offset, true); expires = jiffies + TEST_WC_POLLING_MAX_TIME_JIFFIES; do { @@ -378,6 +404,9 @@ err_create_cq: mlx5_free_bfreg(mdev, &sq->bfreg); err_alloc_bfreg: kfree(sq); + + if (mdev->wc_state == MLX5_WC_STATE_UNSUPPORTED) + mlx5_core_warn(mdev, "Write combining is not supported\n"); } bool mlx5_wc_support_get(struct mlx5_core_dev *mdev) diff --git a/drivers/pci/tph.c b/drivers/pci/tph.c index 07de59ca2e..47a160b399 100644 --- a/drivers/pci/tph.c +++ b/drivers/pci/tph.c @@ -155,7 +155,16 @@ static u8 get_st_modes(struct pci_dev *pdev) return reg; } -static u32 get_st_table_loc(struct pci_dev *pdev) +/** + * pcie_tph_get_st_table_loc - Return the device's ST table location + * @pdev: PCI device to query + * + * Return: + * PCI_TPH_LOC_NONE - Not present + * PCI_TPH_LOC_CAP - Located in the TPH Requester Extended Capability + * PCI_TPH_LOC_MSIX - Located in the MSI-X Table + */ +u32 pcie_tph_get_st_table_loc(struct pci_dev *pdev) { u32 reg; @@ -163,18 +172,19 @@ static u32 get_st_table_loc(struct pci_dev *pdev) return FIELD_GET(PCI_TPH_CAP_LOC_MASK, reg); } +EXPORT_SYMBOL(pcie_tph_get_st_table_loc); /* * Return the size of ST table. If ST table is not in TPH Requester Extended * Capability space, return 0. Otherwise return the ST Table Size + 1. 
*/ -static u16 get_st_table_size(struct pci_dev *pdev) +u16 pcie_tph_get_st_table_size(struct pci_dev *pdev) { u32 reg; u32 loc; /* Check ST table location first */ - loc = get_st_table_loc(pdev); + loc = pcie_tph_get_st_table_loc(pdev); /* Convert loc to match with PCI_TPH_LOC_* defined in pci_regs.h */ loc = FIELD_PREP(PCI_TPH_CAP_LOC_MASK, loc); @@ -185,6 +195,7 @@ static u16 get_st_table_size(struct pci_dev *pdev) return FIELD_GET(PCI_TPH_CAP_ST_MASK, reg) + 1; } +EXPORT_SYMBOL(pcie_tph_get_st_table_size); /* Return device's Root Port completer capability */ static u8 get_rp_completer_type(struct pci_dev *pdev) @@ -253,7 +264,7 @@ static int write_tag_to_st_table(struct pci_dev *pdev, int index, u16 tag) int offset; /* Check if index is out of bound */ - st_table_size = get_st_table_size(pdev); + st_table_size = pcie_tph_get_st_table_size(pdev); if (index >= st_table_size) return -ENXIO; @@ -340,7 +351,7 @@ int pcie_tph_set_st_entry(struct pci_dev *pdev, unsigned int index, u16 tag) */ set_ctrl_reg_req_en(pdev, PCI_TPH_REQ_DISABLE); - loc = get_st_table_loc(pdev); + loc = pcie_tph_get_st_table_loc(pdev); /* Convert loc to match with PCI_TPH_LOC_* */ loc = FIELD_PREP(PCI_TPH_CAP_LOC_MASK, loc); @@ -485,7 +496,7 @@ void pci_restore_tph_state(struct pci_dev *pdev) pci_write_config_dword(pdev, pdev->tph_cap + PCI_TPH_CTRL, *cap++); st_entry = (u16 *)cap; offset = PCI_TPH_BASE_SIZEOF; - num_entries = get_st_table_size(pdev); + num_entries = pcie_tph_get_st_table_size(pdev); for (i = 0; i < num_entries; i++) { pci_write_config_word(pdev, pdev->tph_cap + offset, *st_entry++); @@ -517,7 +528,7 @@ void pci_save_tph_state(struct pci_dev *pdev) /* Save all ST entries in extended capability structure */ st_entry = (u16 *)cap; offset = PCI_TPH_BASE_SIZEOF; - num_entries = get_st_table_size(pdev); + num_entries = pcie_tph_get_st_table_size(pdev); for (i = 0; i < num_entries; i++) { pci_read_config_word(pdev, pdev->tph_cap + offset, st_entry++); @@ -541,7 +552,7 @@ void 
pci_tph_init(struct pci_dev *pdev) if (!pdev->tph_cap) return; - num_entries = get_st_table_size(pdev); + num_entries = pcie_tph_get_st_table_size(pdev); save_size = sizeof(u32) + num_entries * sizeof(u16); pci_add_ext_cap_save_buffer(pdev, PCI_EXT_CAP_ID_TPH, save_size); } diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 7ea46522f2..3c05407449 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -552,6 +552,8 @@ static int cq_create(struct mlx5_vdpa_net *ndev, u16 idx, u32 num_ent) vcq->mcq.set_ci_db = vcq->db.db; vcq->mcq.arm_db = vcq->db.db + 1; vcq->mcq.cqe_sz = 64; + vcq->mcq.comp = mlx5_vdpa_cq_comp; + vcq->cqe = num_ent; err = cq_frag_buf_alloc(ndev, &vcq->buf, num_ent); if (err) @@ -591,10 +593,6 @@ static int cq_create(struct mlx5_vdpa_net *ndev, u16 idx, u32 num_ent) if (err) goto err_vec; - vcq->mcq.comp = mlx5_vdpa_cq_comp; - vcq->cqe = num_ent; - vcq->mcq.set_ci_db = vcq->db.db; - vcq->mcq.arm_db = vcq->db.db + 1; mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index); kfree(in); return 0; diff --git a/fs/Kconfig b/fs/Kconfig index 5378e55f87..a9c6fa9cff 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -18,6 +18,10 @@ config VALIDATE_FS_PARSER config FS_IOMAP bool +# Stackable filesystems +config FS_STACK + bool + config BUFFER_HEAD bool diff --git a/fs/Makefile b/fs/Makefile index 0da17ff145..716c9fe04d 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -41,6 +41,7 @@ obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o +obj-$(CONFIG_FS_STACK) += backing-file.o obj-$(CONFIG_FS_MBCACHE) += mbcache.o obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o obj-$(CONFIG_NFS_COMMON) += nfs_common/ diff --git a/fs/backing-file.c b/fs/backing-file.c new file mode 100644 index 0000000000..e6f4fe27b5 --- /dev/null +++ b/fs/backing-file.c @@ -0,0 +1,346 @@ +// SPDX-License-Identifier: 
GPL-2.0-only +/* + * Common helpers for stackable filesystems and backing files. + * + * Forked from fs/overlayfs/file.c. + * + * Copyright (C) 2017 Red Hat, Inc. + * Copyright (C) 2023 CTERA Networks. + */ + +#include +#include +#include +#include +#include + +#include "internal.h" + +/** + * backing_file_open - open a backing file for kernel internal use + * @user_file: file that the user requested to open + * @flags: open flags + * @real_path: path of the backing file + * @cred: credentials for open + * + * Open a backing file for a stackable filesystem (e.g., overlayfs). + * @user_file's path may be on the stackable filesystem and @real_path on the + * underlying filesystem. In this case, we want to be able to return the + * user path of the stackable filesystem. This is done by embedding the + * returned file into a container structure that also stores the stacked + * file's path, which can be retrieved using backing_file_user_path(). + */ +struct file *backing_file_open(const struct file *user_file, int flags, + const struct path *real_path, + const struct cred *cred) +{ + const struct path *user_path = &user_file->f_path; + struct file *f; + int error; + + f = alloc_empty_backing_file(flags, cred, user_file); + if (IS_ERR(f)) + return f; + + path_get(user_path); + backing_file_set_user_path(f, user_path); + error = vfs_open(real_path, f); + if (error) { + fput(f); + f = ERR_PTR(error); + } + + return f; +} +EXPORT_SYMBOL_GPL(backing_file_open); + +struct backing_aio { + struct kiocb iocb; + refcount_t ref; + struct kiocb *orig_iocb; + /* used for aio completion */ + void (*end_write)(struct file *); + struct work_struct work; + long res; +}; + +static struct kmem_cache *backing_aio_cachep; + +#define BACKING_IOCB_MASK \ + (IOCB_NOWAIT | IOCB_HIPRI | IOCB_DSYNC | IOCB_SYNC | IOCB_APPEND) + +static rwf_t iocb_to_rw_flags(int flags) +{ + return (__force rwf_t)(flags & BACKING_IOCB_MASK); +} + +static void backing_aio_put(struct backing_aio *aio) +{ + if 
(refcount_dec_and_test(&aio->ref)) { + fput(aio->iocb.ki_filp); + kmem_cache_free(backing_aio_cachep, aio); + } +} + +static void backing_aio_cleanup(struct backing_aio *aio, long res) +{ + struct kiocb *iocb = &aio->iocb; + struct kiocb *orig_iocb = aio->orig_iocb; + + if (iocb->ki_flags & IOCB_WRITE) + kiocb_end_write(iocb); + + if (aio->end_write) + aio->end_write(orig_iocb->ki_filp); + + orig_iocb->ki_pos = iocb->ki_pos; + backing_aio_put(aio); +} + +static void backing_aio_rw_complete(struct kiocb *iocb, long res) +{ + struct backing_aio *aio = container_of(iocb, struct backing_aio, iocb); + struct kiocb *orig_iocb = aio->orig_iocb; + + backing_aio_cleanup(aio, res); + orig_iocb->ki_complete(orig_iocb, res); +} + +static void backing_aio_complete_work(struct work_struct *work) +{ + struct backing_aio *aio = container_of(work, struct backing_aio, work); + + backing_aio_rw_complete(&aio->iocb, aio->res); +} + +static void backing_aio_queue_completion(struct kiocb *iocb, long res) +{ + struct backing_aio *aio = container_of(iocb, struct backing_aio, iocb); + + /* + * Punt to a work queue to serialize updates of mtime/size. 
+ */ + aio->res = res; + INIT_WORK(&aio->work, backing_aio_complete_work); + queue_work(file_inode(aio->orig_iocb->ki_filp)->i_sb->s_dio_done_wq, + &aio->work); +} + +static int backing_aio_init_wq(struct kiocb *iocb) +{ + struct super_block *sb = file_inode(iocb->ki_filp)->i_sb; + + if (sb->s_dio_done_wq) + return 0; + + return sb_init_dio_done_wq(sb); +} + + +ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter, + struct kiocb *iocb, int flags, + struct backing_file_ctx *ctx) +{ + struct backing_aio *aio = NULL; + const struct cred *old_cred; + ssize_t ret; + + if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING))) + return -EIO; + + if (!iov_iter_count(iter)) + return 0; + + if (iocb->ki_flags & IOCB_DIRECT && + !(file->f_mode & FMODE_CAN_ODIRECT)) + return -EINVAL; + + old_cred = override_creds(ctx->cred); + if (is_sync_kiocb(iocb)) { + rwf_t rwf = iocb_to_rw_flags(flags); + + ret = vfs_iter_read(file, iter, &iocb->ki_pos, rwf); + } else { + ret = -ENOMEM; + aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL); + if (!aio) + goto out; + + aio->orig_iocb = iocb; + kiocb_clone(&aio->iocb, iocb, get_file(file)); + aio->iocb.ki_complete = backing_aio_rw_complete; + refcount_set(&aio->ref, 2); + ret = vfs_iocb_iter_read(file, &aio->iocb, iter); + backing_aio_put(aio); + if (ret != -EIOCBQUEUED) + backing_aio_cleanup(aio, ret); + } +out: + revert_creds(old_cred); + + if (ctx->accessed) + ctx->accessed(ctx->user_file); + + return ret; +} +EXPORT_SYMBOL_GPL(backing_file_read_iter); + +ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter, + struct kiocb *iocb, int flags, + struct backing_file_ctx *ctx) +{ + const struct cred *old_cred; + ssize_t ret; + + if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING))) + return -EIO; + + if (!iov_iter_count(iter)) + return 0; + + ret = file_remove_privs(ctx->user_file); + if (ret) + return ret; + + if (iocb->ki_flags & IOCB_DIRECT && + !(file->f_mode & FMODE_CAN_ODIRECT)) + return -EINVAL; + + /* 
+ * Stacked filesystems don't support deferred completions, don't copy + * this property in case it is set by the issuer. + */ + flags &= ~IOCB_DIO_CALLER_COMP; + + old_cred = override_creds(ctx->cred); + if (is_sync_kiocb(iocb)) { + rwf_t rwf = iocb_to_rw_flags(flags); + + file_start_write(file); + ret = vfs_iter_write(file, iter, &iocb->ki_pos, rwf); + file_end_write(file); + if (ctx->end_write) + ctx->end_write(ctx->user_file); + } else { + struct backing_aio *aio; + + ret = backing_aio_init_wq(iocb); + if (ret) + goto out; + + ret = -ENOMEM; + aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL); + if (!aio) + goto out; + + aio->orig_iocb = iocb; + aio->end_write = ctx->end_write; + kiocb_clone(&aio->iocb, iocb, get_file(file)); + aio->iocb.ki_flags = flags; + aio->iocb.ki_complete = backing_aio_queue_completion; + refcount_set(&aio->ref, 2); + kiocb_start_write(&aio->iocb); + ret = vfs_iocb_iter_write(file, &aio->iocb, iter); + backing_aio_put(aio); + if (ret != -EIOCBQUEUED) + backing_aio_cleanup(aio, ret); + } +out: + revert_creds(old_cred); + + return ret; +} +EXPORT_SYMBOL_GPL(backing_file_write_iter); + +ssize_t backing_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags, + struct backing_file_ctx *ctx) +{ + const struct cred *old_cred; + ssize_t ret; + + if (WARN_ON_ONCE(!(in->f_mode & FMODE_BACKING))) + return -EIO; + + old_cred = override_creds(ctx->cred); + ret = vfs_splice_read(in, ppos, pipe, len, flags); + revert_creds(old_cred); + + if (ctx->accessed) + ctx->accessed(ctx->user_file); + + return ret; +} +EXPORT_SYMBOL_GPL(backing_file_splice_read); + +ssize_t backing_file_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, size_t len, + unsigned int flags, + struct backing_file_ctx *ctx) +{ + const struct cred *old_cred; + ssize_t ret; + + if (WARN_ON_ONCE(!(out->f_mode & FMODE_BACKING))) + return -EIO; + + ret = file_remove_privs(ctx->user_file); + if (ret) + 
return ret; + + old_cred = override_creds(ctx->cred); + file_start_write(out); + ret = iter_file_splice_write(pipe, out, ppos, len, flags); + file_end_write(out); + revert_creds(old_cred); + + if (ctx->end_write) + ctx->end_write(ctx->user_file); + + return ret; +} +EXPORT_SYMBOL_GPL(backing_file_splice_write); + +int backing_file_mmap(struct file *file, struct vm_area_struct *vma, + struct backing_file_ctx *ctx) +{ + const struct cred *old_cred; + int ret; + + if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)) || + WARN_ON_ONCE(ctx->user_file != vma->vm_file)) + return -EIO; + + if (!file->f_op->mmap) + return -ENODEV; + + vma_set_file(vma, file); + + old_cred = override_creds(ctx->cred); + ret = security_mmap_backing_file(vma, file, ctx->user_file); + if (ret) { + revert_creds(old_cred); + return ret; + } + ret = call_mmap(vma->vm_file, vma); + revert_creds(old_cred); + + if (ctx->accessed) + ctx->accessed(ctx->user_file); + + return ret; +} +EXPORT_SYMBOL_GPL(backing_file_mmap); + +static int __init backing_aio_init(void) +{ + backing_aio_cachep = kmem_cache_create("backing_aio", + sizeof(struct backing_aio), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!backing_aio_cachep) + return -ENOMEM; + + return 0; +} +fs_initcall(backing_aio_init); diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index 175a25fcad..009d23cd43 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -259,9 +259,7 @@ static void cachefiles_write_complete(struct kiocb *iocb, long ret) _enter("%ld", ret); - /* Tell lockdep we inherited freeze protection from submission thread */ - __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); - __sb_end_write(inode->i_sb, SB_FREEZE_WRITE); + kiocb_end_write(iocb); if (ret < 0) trace_cachefiles_io_error(object, inode, ret, @@ -286,7 +284,6 @@ int __cachefiles_write(struct cachefiles_object *object, { struct cachefiles_cache *cache; struct cachefiles_kiocb *ki; - struct inode *inode; unsigned int old_nofs; ssize_t ret; size_t len = iov_iter_count(iter); 
@@ -322,19 +319,12 @@ int __cachefiles_write(struct cachefiles_object *object, ki->iocb.ki_complete = cachefiles_write_complete; atomic_long_add(ki->b_writing, &cache->b_writing); - /* Open-code file_start_write here to grab freeze protection, which - * will be released by another thread in aio_complete_rw(). Fool - * lockdep by telling it the lock got released so that it doesn't - * complain about the held lock when we return to userspace. - */ - inode = file_inode(file); - __sb_start_write(inode->i_sb, SB_FREEZE_WRITE); - __sb_writers_release(inode->i_sb, SB_FREEZE_WRITE); + kiocb_start_write(&ki->iocb); get_file(ki->iocb.ki_filp); cachefiles_grab_object(object, cachefiles_obj_get_ioreq); - trace_cachefiles_write(object, inode, ki->iocb.ki_pos, len); + trace_cachefiles_write(object, file_inode(file), ki->iocb.ki_pos, len); old_nofs = memalloc_nofs_save(); ret = cachefiles_inject_write_error(); if (ret == 0) diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 6f5c59baec..bc2bb20013 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -560,8 +560,8 @@ static bool cachefiles_open_file(struct cachefiles_object *object, */ path.mnt = cache->mnt; path.dentry = dentry; - file = open_with_fake_path(&path, O_RDWR | O_LARGEFILE | O_DIRECT, - d_backing_inode(dentry), cache->cache_cred); + file = kernel_file_open(&path, O_RDWR | O_LARGEFILE | O_DIRECT, + d_backing_inode(dentry), cache->cache_cred); if (IS_ERR(file)) { trace_cachefiles_vfs_error(object, d_backing_inode(dentry), PTR_ERR(file), diff --git a/fs/file.c b/fs/file.c index 452cdb93a4..192aef54a8 100644 --- a/fs/file.c +++ b/fs/file.c @@ -649,7 +649,7 @@ EXPORT_SYMBOL(close_fd); /* for ksys_close() */ /** * last_fd - return last valid index into fd table - * @cur_fds: files struct + * @fdt: File descriptor table. * * Context: Either rcu read lock or files_lock must be held. 
* @@ -704,6 +704,7 @@ static inline void __range_close(struct files_struct *cur_fds, unsigned int fd, * * @fd: starting file descriptor to close * @max_fd: last file descriptor to close + * @flags: CLOSE_RANGE flags. * * This closes a range of file descriptors. All file descriptors * from @fd up to and including @max_fd are closed. diff --git a/fs/file_table.c b/fs/file_table.c index cdc1dea331..fc04eb48d5 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -44,18 +44,64 @@ static struct kmem_cache *filp_cachep __read_mostly; static struct percpu_counter nr_files __cacheline_aligned_in_smp; +/* Container for backing file with optional user path */ +struct backing_file { + struct file file; + struct path user_path; +#ifdef CONFIG_SECURITY + void *security; +#endif +}; + +#define backing_file(f) container_of(f, struct backing_file, file) + +struct path *backing_file_user_path(const struct file *f) +{ + return &backing_file(f)->user_path; +} +EXPORT_SYMBOL_GPL(backing_file_user_path); + +void backing_file_set_user_path(struct file *f, const struct path *path) +{ + backing_file(f)->user_path = *path; +} +EXPORT_SYMBOL_GPL(backing_file_set_user_path); + +#ifdef CONFIG_SECURITY +void *backing_file_security(const struct file *f) +{ + return backing_file(f)->security; +} + +void backing_file_set_security(struct file *f, void *security) +{ + backing_file(f)->security = security; +} +#endif /* CONFIG_SECURITY */ + static void file_free_rcu(struct rcu_head *head) { struct file *f = container_of(head, struct file, f_u.fu_rcuhead); put_cred(f->f_cred); - kmem_cache_free(filp_cachep, f); + if (unlikely(f->f_mode & FMODE_BACKING)) + kfree(backing_file(f)); + else + kmem_cache_free(filp_cachep, f); +} + +static inline void backing_file_free(struct backing_file *ff) +{ + security_backing_file_free(&ff->file); + path_put(&ff->user_path); } static inline void file_free(struct file *f) { security_file_free(f); - if (!(f->f_mode & FMODE_NOACCOUNT)) + if (unlikely(f->f_mode & 
FMODE_BACKING)) + backing_file_free(backing_file(f)); + if (likely(!(f->f_mode & FMODE_NOACCOUNT))) percpu_counter_dec(&nr_files); call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); } @@ -131,20 +177,15 @@ static int __init init_fs_stat_sysctls(void) fs_initcall(init_fs_stat_sysctls); #endif -static struct file *__alloc_file(int flags, const struct cred *cred) +static int init_file(struct file *f, int flags, const struct cred *cred) { - struct file *f; int error; - f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL); - if (unlikely(!f)) - return ERR_PTR(-ENOMEM); - f->f_cred = get_cred(cred); error = security_file_alloc(f); if (unlikely(error)) { - file_free_rcu(&f->f_u.fu_rcuhead); - return ERR_PTR(error); + put_cred(f->f_cred); + return error; } atomic_long_set(&f->f_count, 1); @@ -155,7 +196,7 @@ static struct file *__alloc_file(int flags, const struct cred *cred) f->f_mode = OPEN_FMODE(flags); /* f->f_version: 0 */ - return f; + return 0; } /* Find an unused file structure and return a pointer to it. @@ -172,6 +213,7 @@ struct file *alloc_empty_file(int flags, const struct cred *cred) { static long old_max; struct file *f; + int error; /* * Privileged users can go above max_files @@ -185,9 +227,17 @@ struct file *alloc_empty_file(int flags, const struct cred *cred) goto over; } - f = __alloc_file(flags, cred); - if (!IS_ERR(f)) - percpu_counter_inc(&nr_files); + f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL); + if (unlikely(!f)) + return ERR_PTR(-ENOMEM); + + error = init_file(f, flags, cred); + if (unlikely(error)) { + kmem_cache_free(filp_cachep, f); + return ERR_PTR(error); + } + + percpu_counter_inc(&nr_files); return f; @@ -203,18 +253,71 @@ over: /* * Variant of alloc_empty_file() that doesn't check and modify nr_files. * - * Should not be used unless there's a very good reason to do so. + * This is only for kernel internal use, and the allocated file must not be + * installed into file tables or such. 
*/ struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred) { - struct file *f = __alloc_file(flags, cred); + struct file *f; + int error; - if (!IS_ERR(f)) - f->f_mode |= FMODE_NOACCOUNT; + f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL); + if (unlikely(!f)) + return ERR_PTR(-ENOMEM); + + error = init_file(f, flags, cred); + if (unlikely(error)) { + kmem_cache_free(filp_cachep, f); + return ERR_PTR(error); + } + + f->f_mode |= FMODE_NOACCOUNT; return f; } +static int init_backing_file(struct backing_file *ff, + const struct file *user_file) +{ + memset(&ff->user_path, 0, sizeof(ff->user_path)); + backing_file_set_security(&ff->file, NULL); + return security_backing_file_alloc(&ff->file, user_file); +} + +/* + * Variant of alloc_empty_file() that allocates a backing_file container + * and doesn't check and modify nr_files. + * + * This is only for kernel internal use, and the allocated file must not be + * installed into file tables or such. + */ +struct file *alloc_empty_backing_file(int flags, const struct cred *cred, + const struct file *user_file) +{ + struct backing_file *ff; + int error; + + ff = kzalloc(sizeof(struct backing_file), GFP_KERNEL); + if (unlikely(!ff)) + return ERR_PTR(-ENOMEM); + + error = init_file(&ff->file, flags, cred); + if (unlikely(error)) { + kfree(ff); + return ERR_PTR(error); + } + + /* The f_mode flags must be set before fput(). 
*/ + ff->file.f_mode |= FMODE_BACKING | FMODE_NOACCOUNT; + error = init_backing_file(ff, user_file); + if (unlikely(error)) { + fput(&ff->file); + return ERR_PTR(error); + } + + return &ff->file; +} + /** * file_init_path - initialize a 'struct file' based on path * @@ -366,12 +469,7 @@ static void __fput(struct file *file) } fops_put(file->f_op); put_pid(file->f_owner.pid); - if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) - i_readcount_dec(inode); - if (mode & FMODE_WRITER) { - put_write_access(inode); - __mnt_drop_write(mnt); - } + put_file_access(file); dput(dentry); if (unlikely(mode & FMODE_NEED_UNMOUNT)) dissolve_on_fput(mnt); diff --git a/fs/fs_context.c b/fs/fs_context.c index 648d2ee9e5..3473e63e83 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -162,6 +162,10 @@ EXPORT_SYMBOL(vfs_parse_fs_param); /** * vfs_parse_fs_string - Convenience function to just parse a string. + * @fc: Filesystem context. + * @key: Parameter name. + * @value: Default value. + * @v_size: Maximum number of bytes in the value. */ int vfs_parse_fs_string(struct fs_context *fc, const char *key, const char *value, size_t v_size) @@ -357,7 +361,7 @@ void fc_drop_locked(struct fs_context *fc) static void legacy_fs_context_free(struct fs_context *fc); /** - * vfs_dup_fc_config: Duplicate a filesystem context. + * vfs_dup_fs_context - Duplicate a filesystem context. * @src_fc: The context to copy. */ struct fs_context *vfs_dup_fs_context(struct fs_context *src_fc) @@ -403,7 +407,9 @@ EXPORT_SYMBOL(vfs_dup_fs_context); /** * logfc - Log a message to a filesystem context - * @fc: The filesystem context to log to. + * @log: The filesystem context to log to, or NULL to use printk. + * @prefix: A string to prefix the output with, or NULL. + * @level: 'w' for a warning, 'e' for an error. Anything else is a notice. * @fmt: The format of the buffer. */ void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, ...) 
diff --git a/fs/inode.c b/fs/inode.c index 2bc233f8db..fc48477343 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1980,7 +1980,7 @@ void touch_atime(const struct path *path) if (!sb_start_write_trylock(inode->i_sb)) return; - if (__mnt_want_write(mnt) != 0) + if (mnt_get_write_access(mnt) != 0) goto skip_update; /* * File systems can error out when updating inodes if they need to @@ -1993,7 +1993,7 @@ void touch_atime(const struct path *path) */ now = current_time(inode); update_time(inode, &now, S_ATIME); - __mnt_drop_write(mnt); + mnt_put_write_access(mnt); skip_update: sb_end_write(inode->i_sb); } @@ -2110,9 +2110,9 @@ static int __file_update_time(struct file *file, struct timespec64 *now, struct inode *inode = file_inode(file); /* try to update time settings */ - if (!__mnt_want_write_file(file)) { + if (!mnt_get_write_access_file(file)) { ret = update_time(inode, now, sync_mode); - __mnt_drop_write_file(file); + mnt_put_write_access_file(file); } return ret; diff --git a/fs/internal.h b/fs/internal.h index 3e8dbf777c..f48f5fa349 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -76,8 +76,8 @@ extern int sb_prepare_remount_readonly(struct super_block *); extern void __init mnt_init(void); -extern int __mnt_want_write_file(struct file *); -extern void __mnt_drop_write_file(struct file *); +int mnt_get_write_access_file(struct file *file); +void mnt_put_write_access_file(struct file *file); extern void dissolve_on_fput(struct vfsmount *); @@ -93,8 +93,28 @@ extern void chroot_fs_refs(const struct path *, const struct path *); /* * file_table.c */ -extern struct file *alloc_empty_file(int, const struct cred *); -extern struct file *alloc_empty_file_noaccount(int, const struct cred *); +struct file *alloc_empty_file(int flags, const struct cred *cred); +struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred); +struct file *alloc_empty_backing_file(int flags, const struct cred *cred, + const struct file *user_file); +void 
backing_file_set_user_path(struct file *f, const struct path *path); + +static inline void file_put_write_access(struct file *file) +{ + put_write_access(file->f_inode); + mnt_put_write_access(file->f_path.mnt); + if (unlikely(file->f_mode & FMODE_BACKING)) + mnt_put_write_access(backing_file_user_path(file)->mnt); +} + +static inline void put_file_access(struct file *file) +{ + if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) { + i_readcount_dec(file->f_inode); + } else if (file->f_mode & FMODE_WRITER) { + file_put_write_access(file); + } +} /* * super.c diff --git a/fs/ioctl.c b/fs/ioctl.c index 088462ee5a..6477689112 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -109,9 +109,6 @@ static int ioctl_fibmap(struct file *filp, int __user *p) * Returns 0 on success, -errno on error, 1 if this was the last * extent that will fit in user array. */ -#define SET_UNKNOWN_FLAGS (FIEMAP_EXTENT_DELALLOC) -#define SET_NO_UNMOUNTED_IO_FLAGS (FIEMAP_EXTENT_DATA_ENCRYPTED) -#define SET_NOT_ALIGNED_FLAGS (FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE) int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical, u64 phys, u64 len, u32 flags) { @@ -127,6 +124,10 @@ int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical, if (fieinfo->fi_extents_mapped >= fieinfo->fi_extents_max) return 1; +#define SET_UNKNOWN_FLAGS (FIEMAP_EXTENT_DELALLOC) +#define SET_NO_UNMOUNTED_IO_FLAGS (FIEMAP_EXTENT_DATA_ENCRYPTED) +#define SET_NOT_ALIGNED_FLAGS (FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE) + if (flags & SET_UNKNOWN_FLAGS) flags |= FIEMAP_EXTENT_UNKNOWN; if (flags & SET_NO_UNMOUNTED_IO_FLAGS) @@ -913,6 +914,9 @@ out: #ifdef CONFIG_COMPAT /** * compat_ptr_ioctl - generic implementation of .compat_ioctl file operation + * @file: The file to operate on. + * @cmd: The ioctl command number. + * @arg: The argument to the ioctl. 
* * This is not normally called as a function, but instead set in struct * file_operations as diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c index 5d82627457..c429c42a68 100644 --- a/fs/kernel_read_file.c +++ b/fs/kernel_read_file.c @@ -8,16 +8,16 @@ /** * kernel_read_file() - read file contents into a kernel buffer * - * @file file to read from - * @offset where to start reading from (see below). - * @buf pointer to a "void *" buffer for reading into (if + * @file: file to read from + * @offset: where to start reading from (see below). + * @buf: pointer to a "void *" buffer for reading into (if * *@buf is NULL, a buffer will be allocated, and * @buf_size will be ignored) - * @buf_size size of buf, if already allocated. If @buf not + * @buf_size: size of buf, if already allocated. If @buf not * allocated, this is the largest size to allocate. - * @file_size if non-NULL, the full size of @file will be + * @file_size: if non-NULL, the full size of @file will be * written here. - * @id the kernel_read_file_id identifying the type of + * @id: the kernel_read_file_id identifying the type of * file contents being read (for LSMs to examine) * * @offset must be 0 unless both @buf and @file_size are non-NULL diff --git a/fs/namei.c b/fs/namei.c index 0a4b15d9a0..23c73afe57 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -644,6 +644,8 @@ static bool nd_alloc_stack(struct nameidata *nd) /** * path_connected - Verify that a dentry is below mnt.mnt_root + * @mnt: The mountpoint to check. + * @dentry: The dentry to check. * * Rename can sometimes move a file or directory outside of a bind * mount, path_connected allows those cases to be detected. @@ -1083,6 +1085,7 @@ fs_initcall(init_fs_namei_sysctls); /** * may_follow_link - Check symlink following for unsafe situations * @nd: nameidata pathwalk data + * @inode: Used for idmapping. 
* * In the case of the sysctl_protected_symlinks sysctl being enabled, * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is diff --git a/fs/namespace.c b/fs/namespace.c index d032d84d66..218f1b77bf 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -333,16 +333,16 @@ static int mnt_is_readonly(struct vfsmount *mnt) * can determine when writes are able to occur to a filesystem. */ /** - * __mnt_want_write - get write access to a mount without freeze protection + * mnt_get_write_access - get write access to a mount without freeze protection * @m: the mount on which to take a write * * This tells the low-level filesystem that a write is about to be performed to * it, and makes sure that writes are allowed (mnt it read-write) before * returning success. This operation does not protect against filesystem being - * frozen. When the write operation is finished, __mnt_drop_write() must be + * frozen. When the write operation is finished, mnt_put_write_access() must be * called. This is effectively a refcount. 
*/ -int __mnt_want_write(struct vfsmount *m) +int mnt_get_write_access(struct vfsmount *m) { struct mount *mnt = real_mount(m); int ret = 0; @@ -371,7 +371,7 @@ int __mnt_want_write(struct vfsmount *m) return ret; } -EXPORT_SYMBOL_GPL(__mnt_want_write); +EXPORT_SYMBOL_GPL(mnt_get_write_access); /** * mnt_want_write - get write access to a mount @@ -387,7 +387,7 @@ int mnt_want_write(struct vfsmount *m) int ret; sb_start_write(m->mnt_sb); - ret = __mnt_want_write(m); + ret = mnt_get_write_access(m); if (ret) sb_end_write(m->mnt_sb); return ret; @@ -395,15 +395,15 @@ int mnt_want_write(struct vfsmount *m) EXPORT_SYMBOL_GPL(mnt_want_write); /** - * __mnt_want_write_file - get write access to a file's mount + * mnt_get_write_access_file - get write access to a file's mount * @file: the file who's mount on which to take a write * - * This is like __mnt_want_write, but if the file is already open for writing it + * This is like mnt_get_write_access, but if @file is already open for write it * skips incrementing mnt_writers (since the open file already has a reference) * and instead only does the check for emergency r/o remounts. This must be - * paired with __mnt_drop_write_file. + * paired with mnt_put_write_access_file. 
*/ -int __mnt_want_write_file(struct file *file) +int mnt_get_write_access_file(struct file *file) { if (file->f_mode & FMODE_WRITER) { /* @@ -414,7 +414,7 @@ int __mnt_want_write_file(struct file *file) return -EROFS; return 0; } - return __mnt_want_write(file->f_path.mnt); + return mnt_get_write_access(file->f_path.mnt); } /** @@ -431,7 +431,7 @@ int mnt_want_write_file(struct file *file) int ret; sb_start_write(file_inode(file)->i_sb); - ret = __mnt_want_write_file(file); + ret = mnt_get_write_access_file(file); if (ret) sb_end_write(file_inode(file)->i_sb); return ret; @@ -439,20 +439,20 @@ int mnt_want_write_file(struct file *file) EXPORT_SYMBOL_GPL(mnt_want_write_file); /** - * __mnt_drop_write - give up write access to a mount + * mnt_put_write_access - give up write access to a mount * @mnt: the mount on which to give up write access * * Tells the low-level filesystem that we are done * performing writes to it. Must be matched with - * __mnt_want_write() call above. + * mnt_get_write_access() call above. 
*/ -void __mnt_drop_write(struct vfsmount *mnt) +void mnt_put_write_access(struct vfsmount *mnt) { preempt_disable(); mnt_dec_writers(real_mount(mnt)); preempt_enable(); } -EXPORT_SYMBOL_GPL(__mnt_drop_write); +EXPORT_SYMBOL_GPL(mnt_put_write_access); /** * mnt_drop_write - give up write access to a mount @@ -464,20 +464,20 @@ EXPORT_SYMBOL_GPL(__mnt_drop_write); */ void mnt_drop_write(struct vfsmount *mnt) { - __mnt_drop_write(mnt); + mnt_put_write_access(mnt); sb_end_write(mnt->mnt_sb); } EXPORT_SYMBOL_GPL(mnt_drop_write); -void __mnt_drop_write_file(struct file *file) +void mnt_put_write_access_file(struct file *file) { if (!(file->f_mode & FMODE_WRITER)) - __mnt_drop_write(file->f_path.mnt); + mnt_put_write_access(file->f_path.mnt); } void mnt_drop_write_file(struct file *file) { - __mnt_drop_write_file(file); + mnt_put_write_access_file(file); sb_end_write(file_inode(file)->i_sb); } EXPORT_SYMBOL(mnt_drop_write_file); diff --git a/fs/open.c b/fs/open.c index 51052202ec..4260d61560 100644 --- a/fs/open.c +++ b/fs/open.c @@ -842,6 +842,30 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) return ksys_fchown(fd, user, group); } +static inline int file_get_write_access(struct file *f) +{ + int error; + + error = get_write_access(f->f_inode); + if (unlikely(error)) + return error; + error = mnt_get_write_access(f->f_path.mnt); + if (unlikely(error)) + goto cleanup_inode; + if (unlikely(f->f_mode & FMODE_BACKING)) { + error = mnt_get_write_access(backing_file_user_path(f)->mnt); + if (unlikely(error)) + goto cleanup_mnt; + } + return 0; + +cleanup_mnt: + mnt_put_write_access(f->f_path.mnt); +cleanup_inode: + put_write_access(f->f_inode); + return error; +} + static int do_dentry_open(struct file *f, struct inode *inode, int (*open)(struct inode *, struct file *)) @@ -861,15 +885,12 @@ static int do_dentry_open(struct file *f, return 0; } - if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { - error = get_write_access(inode); + if 
((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) { + i_readcount_inc(inode); + } else if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { + error = file_get_write_access(f); if (unlikely(error)) goto cleanup_file; - error = __mnt_want_write(f->f_path.mnt); - if (unlikely(error)) { - put_write_access(inode); - goto cleanup_file; - } f->f_mode |= FMODE_WRITER; } @@ -901,8 +922,6 @@ static int do_dentry_open(struct file *f, goto cleanup_all; } f->f_mode |= FMODE_OPENED; - if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) - i_readcount_inc(inode); if ((f->f_mode & FMODE_READ) && likely(f->f_op->read || f->f_op->read_iter)) f->f_mode |= FMODE_CAN_READ; @@ -948,10 +967,7 @@ cleanup_all: if (WARN_ON_ONCE(error > 0)) error = -EINVAL; fops_put(f->f_op); - if (f->f_mode & FMODE_WRITER) { - put_write_access(inode); - __mnt_drop_write(f->f_path.mnt); - } + put_file_access(f); cleanup_file: path_put(&f->f_path); f->f_path.mnt = NULL; @@ -1092,23 +1108,38 @@ struct file *dentry_create(const struct path *path, int flags, umode_t mode, } EXPORT_SYMBOL(dentry_create); -struct file *open_with_fake_path(const struct path *path, int flags, +/** + * kernel_file_open - open a file for kernel internal use + * @path: path of the file to open + * @flags: open flags + * @inode: the inode + * @cred: credentials for open + * + * Open a file for use by in-kernel consumers. The file is not accounted + * against nr_files and must not be installed into the file descriptor + * table. + * + * Return: Opened file on success, an error pointer on failure. 
+ */ +struct file *kernel_file_open(const struct path *path, int flags, struct inode *inode, const struct cred *cred) { - struct file *f = alloc_empty_file_noaccount(flags, cred); - if (!IS_ERR(f)) { - int error; + struct file *f; + int error; - f->f_path = *path; - error = do_dentry_open(f, inode, NULL); - if (error) { - fput(f); - f = ERR_PTR(error); - } + f = alloc_empty_file_noaccount(flags, cred); + if (IS_ERR(f)) + return f; + + f->f_path = *path; + error = do_dentry_open(f, inode, NULL); + if (error) { + fput(f); + f = ERR_PTR(error); } return f; } -EXPORT_SYMBOL(open_with_fake_path); +EXPORT_SYMBOL_GPL(kernel_file_open); #define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE)) #define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC) @@ -1483,7 +1514,7 @@ SYSCALL_DEFINE1(close, unsigned int, fd) } /** - * close_range() - Close all file descriptors in a given range. + * sys_close_range() - Close all file descriptors in a given range. * * @fd: starting file descriptor to close * @max_fd: last file descriptor to close diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index 6708e54b0e..148d9567b5 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config OVERLAY_FS tristate "Overlay filesystem support" + select FS_STACK select EXPORTFS help An overlay filesystem combines two filesystems - an 'upper' filesystem diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 5db89c8de1..a5bc5dfc93 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -9,25 +9,11 @@ #include #include #include -#include #include -#include #include +#include #include "overlayfs.h" -#include "../internal.h" /* for sb_init_dio_done_wq */ - -struct ovl_aio_req { - struct kiocb iocb; - refcount_t ref; - struct kiocb *orig_iocb; - /* used for aio completion */ - struct work_struct work; - long res; -}; - -static struct kmem_cache *ovl_aio_request_cachep; - static char ovl_whatisit(struct inode 
*inode, struct inode *realinode) { if (realinode != ovl_inode_upper(inode)) @@ -38,8 +24,8 @@ static char ovl_whatisit(struct inode *inode, struct inode *realinode) return 'm'; } -/* No atime modification nor notify on underlying */ -#define OVL_OPEN_FLAGS (O_NOATIME | FMODE_NONOTIFY) +/* No atime modification on underlying */ +#define OVL_OPEN_FLAGS (O_NOATIME) static struct file *ovl_open_realfile(const struct file *file, const struct path *realpath) @@ -65,8 +51,8 @@ static struct file *ovl_open_realfile(const struct file *file, if (!inode_owner_or_capable(real_idmap, realinode)) flags &= ~O_NOATIME; - realfile = open_with_fake_path(&file->f_path, flags, realinode, - current_cred()); + realfile = backing_file_open(file, + flags, realpath, current_cred()); } revert_creds(old_cred); @@ -270,83 +256,16 @@ static void ovl_file_accessed(struct file *file) touch_atime(&file->f_path); } -#define OVL_IOCB_MASK \ - (IOCB_NOWAIT | IOCB_HIPRI | IOCB_DSYNC | IOCB_SYNC | IOCB_APPEND) - -static rwf_t iocb_to_rw_flags(int flags) -{ - return (__force rwf_t)(flags & OVL_IOCB_MASK); -} - -static inline void ovl_aio_put(struct ovl_aio_req *aio_req) -{ - if (refcount_dec_and_test(&aio_req->ref)) { - fput(aio_req->iocb.ki_filp); - kmem_cache_free(ovl_aio_request_cachep, aio_req); - } -} - -static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req) -{ - struct kiocb *iocb = &aio_req->iocb; - struct kiocb *orig_iocb = aio_req->orig_iocb; - - if (iocb->ki_flags & IOCB_WRITE) { - kiocb_end_write(iocb); - ovl_file_modified(orig_iocb->ki_filp); - } - - orig_iocb->ki_pos = iocb->ki_pos; - ovl_aio_put(aio_req); -} - -static void ovl_aio_rw_complete(struct kiocb *iocb, long res) -{ - struct ovl_aio_req *aio_req = container_of(iocb, - struct ovl_aio_req, iocb); - struct kiocb *orig_iocb = aio_req->orig_iocb; - - ovl_aio_cleanup_handler(aio_req); - orig_iocb->ki_complete(orig_iocb, res); -} - -static void ovl_aio_complete_work(struct work_struct *work) -{ - struct ovl_aio_req *aio_req = 
container_of(work, - struct ovl_aio_req, work); - - ovl_aio_rw_complete(&aio_req->iocb, aio_req->res); -} - -static void ovl_aio_queue_completion(struct kiocb *iocb, long res) -{ - struct ovl_aio_req *aio_req = container_of(iocb, - struct ovl_aio_req, iocb); - struct kiocb *orig_iocb = aio_req->orig_iocb; - - /* - * Punt to a work queue to serialize updates of mtime/size. - */ - aio_req->res = res; - INIT_WORK(&aio_req->work, ovl_aio_complete_work); - queue_work(file_inode(orig_iocb->ki_filp)->i_sb->s_dio_done_wq, - &aio_req->work); -} - -static int ovl_init_aio_done_wq(struct super_block *sb) -{ - if (sb->s_dio_done_wq) - return 0; - - return sb_init_dio_done_wq(sb); -} - static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct fd real; - const struct cred *old_cred; ssize_t ret; + struct backing_file_ctx ctx = { + .cred = ovl_creds(file_inode(file)->i_sb), + .user_file = file, + .accessed = ovl_file_accessed, + }; if (!iov_iter_count(iter)) return 0; @@ -355,37 +274,8 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) if (ret) return ret; - ret = -EINVAL; - if (iocb->ki_flags & IOCB_DIRECT && - !(real.file->f_mode & FMODE_CAN_ODIRECT)) - goto out_fdput; - - old_cred = ovl_override_creds(file_inode(file)->i_sb); - if (is_sync_kiocb(iocb)) { - rwf_t rwf = iocb_to_rw_flags(iocb->ki_flags); - - ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, rwf); - } else { - struct ovl_aio_req *aio_req; - - ret = -ENOMEM; - aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL); - if (!aio_req) - goto out; - - aio_req->orig_iocb = iocb; - kiocb_clone(&aio_req->iocb, iocb, get_file(real.file)); - aio_req->iocb.ki_complete = ovl_aio_rw_complete; - refcount_set(&aio_req->ref, 2); - ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter); - ovl_aio_put(aio_req); - if (ret != -EIOCBQUEUED) - ovl_aio_cleanup_handler(aio_req); - } -out: - revert_creds(old_cred); - ovl_file_accessed(file); 
-out_fdput: + ret = backing_file_read_iter(real.file, iter, iocb, iocb->ki_flags, + &ctx); fdput(real); return ret; @@ -396,9 +286,13 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct fd real; - const struct cred *old_cred; ssize_t ret; int ifl = iocb->ki_flags; + struct backing_file_ctx ctx = { + .cred = ovl_creds(inode->i_sb), + .user_file = file, + .end_write = ovl_file_modified, + }; if (!iov_iter_count(iter)) return 0; @@ -406,63 +300,15 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) inode_lock(inode); /* Update mode */ ovl_copyattr(inode); - ret = file_remove_privs(file); - if (ret) - goto out_unlock; ret = ovl_real_fdget(file, &real); if (ret) goto out_unlock; - ret = -EINVAL; - if (iocb->ki_flags & IOCB_DIRECT && - !(real.file->f_mode & FMODE_CAN_ODIRECT)) - goto out_fdput; - if (!ovl_should_sync(OVL_FS(inode->i_sb))) ifl &= ~(IOCB_DSYNC | IOCB_SYNC); - /* - * Overlayfs doesn't support deferred completions, don't copy - * this property in case it is set by the issuer. 
- */ - ifl &= ~IOCB_DIO_CALLER_COMP; - - old_cred = ovl_override_creds(file_inode(file)->i_sb); - if (is_sync_kiocb(iocb)) { - rwf_t rwf = iocb_to_rw_flags(ifl); - - file_start_write(real.file); - ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, rwf); - file_end_write(real.file); - /* Update size */ - ovl_file_modified(file); - } else { - struct ovl_aio_req *aio_req; - - ret = ovl_init_aio_done_wq(inode->i_sb); - if (ret) - goto out; - - ret = -ENOMEM; - aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL); - if (!aio_req) - goto out; - - aio_req->orig_iocb = iocb; - kiocb_clone(&aio_req->iocb, iocb, get_file(real.file)); - aio_req->iocb.ki_flags = ifl; - aio_req->iocb.ki_complete = ovl_aio_queue_completion; - refcount_set(&aio_req->ref, 2); - kiocb_start_write(&aio_req->iocb); - ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter); - ovl_aio_put(aio_req); - if (ret != -EIOCBQUEUED) - ovl_aio_cleanup_handler(aio_req); - } -out: - revert_creds(old_cred); -out_fdput: + ret = backing_file_write_iter(real.file, iter, iocb, ifl, &ctx); fdput(real); out_unlock: @@ -475,20 +321,21 @@ static ssize_t ovl_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { - const struct cred *old_cred; struct fd real; ssize_t ret; + struct backing_file_ctx ctx = { + .cred = ovl_creds(file_inode(in)->i_sb), + .user_file = in, + .accessed = ovl_file_accessed, + }; ret = ovl_real_fdget(in, &real); if (ret) return ret; - old_cred = ovl_override_creds(file_inode(in)->i_sb); - ret = vfs_splice_read(real.file, ppos, pipe, len, flags); - revert_creds(old_cred); - ovl_file_accessed(in); - + ret = backing_file_splice_read(real.file, ppos, pipe, len, flags, &ctx); fdput(real); + return ret; } @@ -504,30 +351,23 @@ static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { struct fd real; - const struct cred *old_cred; struct inode *inode = file_inode(out); 
ssize_t ret; + struct backing_file_ctx ctx = { + .cred = ovl_creds(inode->i_sb), + .user_file = out, + .end_write = ovl_file_modified, + }; inode_lock(inode); /* Update mode */ ovl_copyattr(inode); - ret = file_remove_privs(out); - if (ret) - goto out_unlock; ret = ovl_real_fdget(out, &real); if (ret) goto out_unlock; - old_cred = ovl_override_creds(inode->i_sb); - file_start_write(real.file); - - ret = iter_file_splice_write(pipe, real.file, ppos, len, flags); - - file_end_write(real.file); - /* Update size */ - ovl_file_modified(out); - revert_creds(old_cred); + ret = backing_file_splice_write(pipe, real.file, ppos, len, flags, &ctx); fdput(real); out_unlock: @@ -565,23 +405,13 @@ static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync) static int ovl_mmap(struct file *file, struct vm_area_struct *vma) { struct file *realfile = file->private_data; - const struct cred *old_cred; - int ret; + struct backing_file_ctx ctx = { + .cred = ovl_creds(file_inode(file)->i_sb), + .user_file = file, + .accessed = ovl_file_accessed, + }; - if (!realfile->f_op->mmap) - return -ENODEV; - - if (WARN_ON(file != vma->vm_file)) - return -EIO; - - vma_set_file(vma, realfile); - - old_cred = ovl_override_creds(file_inode(file)->i_sb); - ret = call_mmap(vma->vm_file, vma); - revert_creds(old_cred); - ovl_file_accessed(file); - - return ret; + return backing_file_mmap(realfile, vma, &ctx); } static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len) @@ -774,19 +604,3 @@ const struct file_operations ovl_file_operations = { .copy_file_range = ovl_copy_file_range, .remap_file_range = ovl_remap_file_range, }; - -int __init ovl_aio_request_cache_init(void) -{ - ovl_aio_request_cachep = kmem_cache_create("ovl_aio_req", - sizeof(struct ovl_aio_req), - 0, SLAB_HWCACHE_ALIGN, NULL); - if (!ovl_aio_request_cachep) - return -ENOMEM; - - return 0; -} - -void ovl_aio_request_cache_destroy(void) -{ - kmem_cache_destroy(ovl_aio_request_cachep); -} diff 
--git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index a4b94a74b8..8b31bc3ee7 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -417,6 +417,12 @@ int ovl_want_write(struct dentry *dentry); void ovl_drop_write(struct dentry *dentry); struct dentry *ovl_workdir(struct dentry *dentry); const struct cred *ovl_override_creds(struct super_block *sb); + +static inline const struct cred *ovl_creds(struct super_block *sb) +{ + return OVL_FS(sb)->creator_cred; +} + int ovl_can_decode_fh(struct super_block *sb); struct dentry *ovl_indexdir(struct super_block *sb); bool ovl_index_all(struct super_block *sb); @@ -835,8 +841,6 @@ struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir, /* file.c */ extern const struct file_operations ovl_file_operations; -int __init ovl_aio_request_cache_init(void); -void ovl_aio_request_cache_destroy(void); int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa); int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa); int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index c49b1e7575..37387fd98e 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -34,14 +34,22 @@ static struct dentry *ovl_d_real(struct dentry *dentry, struct dentry *real = NULL, *lower; int err; - /* It's an overlay file */ + /* + * vfs is only expected to call d_real() with NULL from d_real_inode() + * and with overlay inode from file_dentry() on an overlay file. + * + * TODO: remove @inode argument from d_real() API, remove code in this + * function that deals with non-NULL @inode and remove d_real() call + * from file_dentry(). 
+ */ if (inode && d_inode(dentry) == inode) return dentry; + else if (inode) + goto bug; if (!d_is_reg(dentry)) { - if (!inode || inode == d_inode(dentry)) - return dentry; - goto bug; + /* d_real_inode() is only relevant for regular files */ + return dentry; } real = ovl_dentry_upper(dentry); @@ -1514,14 +1522,10 @@ static int __init ovl_init(void) if (ovl_inode_cachep == NULL) return -ENOMEM; - err = ovl_aio_request_cache_init(); - if (!err) { - err = register_filesystem(&ovl_fs_type); - if (!err) - return 0; + err = register_filesystem(&ovl_fs_type); + if (!err) + return 0; - ovl_aio_request_cache_destroy(); - } kmem_cache_destroy(ovl_inode_cachep); return err; @@ -1537,7 +1541,6 @@ static void __exit ovl_exit(void) */ rcu_barrier(); kmem_cache_destroy(ovl_inode_cachep); - ovl_aio_request_cache_destroy(); } module_init(ovl_init); diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 2cdbb70d2b..81ef76c77c 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -21,7 +21,7 @@ int ovl_get_write_access(struct dentry *dentry) { struct ovl_fs *ofs = OVL_FS(dentry->d_sb); - return __mnt_want_write(ovl_upper_mnt(ofs)); + return mnt_get_write_access(ovl_upper_mnt(ofs)); } /* Get write access to upper sb - may block if upper sb is frozen */ @@ -40,7 +40,7 @@ int ovl_want_write(struct dentry *dentry) void ovl_put_write_access(struct dentry *dentry) { struct ovl_fs *ofs = OVL_FS(dentry->d_sb); - __mnt_drop_write(ovl_upper_mnt(ofs)); + mnt_put_write_access(ovl_upper_mnt(ofs)); } void ovl_end_write(struct dentry *dentry) @@ -1370,7 +1370,7 @@ int ovl_ensure_verity_loaded(struct path *datapath) * If this inode was not yet opened, the verity info hasn't been * loaded yet, so we need to do that here to force it into memory. 
*/ - filp = open_with_fake_path(datapath, O_RDONLY, inode, current_cred()); + filp = kernel_file_open(datapath, O_RDONLY, inode, current_cred()); if (IS_ERR(filp)) return PTR_ERR(filp); fput(filp); diff --git a/fs/proc/base.c b/fs/proc/base.c index 67d1afedaa..4f346b2982 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2218,7 +2218,7 @@ static int map_files_get_link(struct dentry *dentry, struct path *path) rc = -ENOENT; vma = find_exact_vma(mm, vm_start, vm_end); if (vma && vma->vm_file) { - *path = vma->vm_file->f_path; + *path = *file_user_path(vma->vm_file); path_get(path); rc = 0; } diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index 13452b32e2..b7e06be412 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -59,7 +59,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region) if (file) { seq_pad(m, ' '); - seq_file_path(m, file, ""); + seq_path(m, file_user_path(file), ""); } seq_putc(m, '\n'); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index e396e52ca0..6180dc9351 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -295,7 +295,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) if (anon_name) seq_printf(m, "[anon_shmem:%s]", anon_name->name); else - seq_file_path(m, file, "\n"); + seq_path(m, file_user_path(file), "\n"); goto done; } @@ -1952,7 +1952,7 @@ static int show_numa_map(struct seq_file *m, void *v) if (file) { seq_puts(m, " file="); - seq_file_path(m, file, "\n\t= "); + seq_path(m, file_user_path(file), "\n\t= "); } else if (vma_is_initial_heap(vma)) { seq_puts(m, " heap"); } else if (vma_is_initial_stack(vma)) { diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 4d52623e1b..a3822c149f 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -162,7 +162,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) if (file) { seq_pad(m, ' '); - seq_file_path(m, file, ""); + seq_path(m, file_user_path(file), ""); } else if (mm && vma_is_initial_stack(vma)) 
{ seq_pad(m, ' '); seq_puts(m, "[stack]"); diff --git a/include/config/FS_STACK b/include/config/FS_STACK new file mode 100644 index 0000000000..e69de29bb2 diff --git a/include/config/auto.conf b/include/config/auto.conf index 609942694f..53ff647a33 100644 --- a/include/config/auto.conf +++ b/include/config/auto.conf @@ -2913,6 +2913,7 @@ CONFIG_USB_SERIAL_F8153X=m CONFIG_VIDEO_CX2341X=m CONFIG_KASAN=y CONFIG_LOCALVERSION="" +CONFIG_FS_STACK=y CONFIG_PROVE_RCU=y CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU=y CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK=y diff --git a/include/generated/autoconf.h b/include/generated/autoconf.h index c17c5fdbf4..396ca6ad84 100644 --- a/include/generated/autoconf.h +++ b/include/generated/autoconf.h @@ -2915,6 +2915,7 @@ #define CONFIG_VIDEO_CX2341X_MODULE 1 #define CONFIG_KASAN 1 #define CONFIG_LOCALVERSION "" +#define CONFIG_FS_STACK 1 #define CONFIG_PROVE_RCU 1 #define CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU 1 #define CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK 1 diff --git a/include/linux/backing-file.h b/include/linux/backing-file.h new file mode 100644 index 0000000000..103b6992b8 --- /dev/null +++ b/include/linux/backing-file.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Common helpers for stackable filesystems and backing files. + * + * Copyright (C) 2023 CTERA Networks. 
+ */ + +#ifndef _LINUX_BACKING_FILE_H +#define _LINUX_BACKING_FILE_H + +#include +#include +#include + +struct backing_file_ctx { + const struct cred *cred; + struct file *user_file; + void (*accessed)(struct file *); + void (*end_write)(struct file *); +}; + +struct file *backing_file_open(const struct file *user_file, int flags, + const struct path *real_path, + const struct cred *cred); +ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter, + struct kiocb *iocb, int flags, + struct backing_file_ctx *ctx); +ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter, + struct kiocb *iocb, int flags, + struct backing_file_ctx *ctx); +ssize_t backing_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags, + struct backing_file_ctx *ctx); +ssize_t backing_file_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, size_t len, + unsigned int flags, + struct backing_file_ctx *ctx); +int backing_file_mmap(struct file *file, struct vm_area_struct *vma, + struct backing_file_ctx *ctx); + +#endif /* _LINUX_BACKING_FILE_H */ diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 589c5f4a3b..6fc88f0a05 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -59,17 +59,6 @@ */ #define barrier_before_unreachable() asm volatile("") -/* - * GCC 'asm goto' miscompiles certain code sequences: - * - * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670 - * - * Work it around via a compiler barrier quirk suggested by Jakub Jelinek. - * - * (asm goto is automatically volatile - the naming reflects this.) - */ -#define asm_volatile_goto(x...) 
do { asm goto(x); asm (""); } while (0) - #if defined(CONFIG_ARCH_USE_BUILTIN_BSWAP) #define __HAVE_BUILTIN_BSWAP32__ #define __HAVE_BUILTIN_BSWAP64__ diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 575898ac4d..1f405d334c 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -373,8 +373,15 @@ __no_sanitize_memory #define __member_size(p) __builtin_object_size(p, 1) #endif -#ifndef asm_volatile_goto -#define asm_volatile_goto(x...) asm goto(x) +/* + * Some versions of gcc do not mark 'asm goto' volatile: + * + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103979 + * + * We do it here by hand, because it doesn't hurt. + */ +#ifndef asm_goto_output +#define asm_goto_output(x...) asm volatile goto(x) #endif #ifdef CONFIG_CC_HAS_ASM_INLINE diff --git a/include/linux/fs.h b/include/linux/fs.h index e7bde505e5..5ca2cb505f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -167,6 +167,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* Supports IOCB_HAS_METADATA */ #define FMODE_HAS_METADATA ((__force fmode_t)0x800000) +/* File is embedded in backing_file object */ +#define FMODE_BACKING ((__force fmode_t)0x2000000) + /* File was opened by fanotify and shouldn't generate fanotify events */ #define FMODE_NONOTIFY ((__force fmode_t)0x4000000) @@ -1833,6 +1836,8 @@ static inline int vfs_whiteout(struct mnt_idmap *idmap, struct file *vfs_tmpfile_open(struct mnt_idmap *idmap, const struct path *parentpath, umode_t mode, int open_flag, const struct cred *cred); +struct file *kernel_file_open(const struct path *path, int flags, + struct inode *inode, const struct cred *cred); int vfs_mkobj(struct dentry *, umode_t, int (*f)(struct dentry *, umode_t, void *), @@ -2577,11 +2582,41 @@ static inline struct file *file_open_root_mnt(struct vfsmount *mnt, return file_open_root(&(struct path){.mnt = mnt, .dentry = mnt->mnt_root}, name, flags, mode); } -extern struct file * dentry_open(const 
struct path *, int, const struct cred *); -extern struct file *dentry_create(const struct path *path, int flags, - umode_t mode, const struct cred *cred); -extern struct file * open_with_fake_path(const struct path *, int, - struct inode*, const struct cred *); +struct file *dentry_open(const struct path *path, int flags, + const struct cred *creds); +struct file *dentry_create(const struct path *path, int flags, umode_t mode, + const struct cred *cred); +struct path *backing_file_user_path(const struct file *f); + +#ifdef CONFIG_SECURITY +void *backing_file_security(const struct file *f); +void backing_file_set_security(struct file *f, void *security); +#else +static inline void *backing_file_security(const struct file *f) +{ + return NULL; +} +static inline void backing_file_set_security(struct file *f, void *security) +{ +} +#endif /* CONFIG_SECURITY */ + +/* + * file_user_path - get the path to display for memory mapped file + * + * When mmapping a file on a stackable filesystem (e.g., overlayfs), the file + * stored in ->vm_file is a backing file whose f_inode is on the underlying + * filesystem. When the mapped file path is displayed to user (e.g. via + * /proc//maps), this helper should be used to get the path to display + * to the user, which is the path of the fd that user has requested to map. 
+ */ +static inline const struct path *file_user_path(const struct file *f) +{ + if (unlikely(f->f_mode & FMODE_BACKING)) + return backing_file_user_path(f); + return &f->f_path; +} + static inline struct file *file_clone_open(struct file *file) { return dentry_open(&file->f_path, file->f_flags, file->f_cred); diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index bb8467cd11..bcb6609b54 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -91,11 +91,12 @@ static inline void fsnotify_dentry(struct dentry *dentry, __u32 mask) static inline int fsnotify_file(struct file *file, __u32 mask) { - const struct path *path = &file->f_path; + const struct path *path; if (file->f_mode & FMODE_NONOTIFY) return 0; + path = &file->f_path; return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH); } diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h index 97a8b21eb0..c0a2839253 100644 --- a/include/linux/lsm_audit.h +++ b/include/linux/lsm_audit.h @@ -93,7 +93,7 @@ struct common_audit_data { #endif char *kmod_name; struct lsm_ioctlop_audit *op; - struct file *file; + const struct file *file; struct lsm_ibpkey_audit *ibpkey; struct lsm_ibendport_audit *ibendport; int reason; diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index fc89fae1ea..304da2a90b 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -26,13 +26,13 @@ * #undef LSM_HOOK * }; */ -LSM_HOOK(int, 0, binder_set_context_mgr, struct task_struct *mgr) -LSM_HOOK(int, 0, binder_transaction, struct task_struct *from, - struct task_struct *to) -LSM_HOOK(int, 0, binder_transfer_binder, struct task_struct *from, - struct task_struct *to) -LSM_HOOK(int, 0, binder_transfer_file, struct task_struct *from, - struct task_struct *to, struct file *file) +LSM_HOOK(int, 0, binder_set_context_mgr, const struct cred *mgr) +LSM_HOOK(int, 0, binder_transaction, const struct cred *from, + const struct cred *to) +LSM_HOOK(int, 
0, binder_transfer_binder, const struct cred *from, + const struct cred *to) +LSM_HOOK(int, 0, binder_transfer_file, const struct cred *from, + const struct cred *to, const struct file *file) LSM_HOOK(int, 0, ptrace_access_check, struct task_struct *child, unsigned int mode) LSM_HOOK(int, 0, ptrace_traceme, struct task_struct *parent) @@ -168,6 +168,9 @@ LSM_HOOK(int, 0, kernfs_init_security, struct kernfs_node *kn_dir, LSM_HOOK(int, 0, file_permission, struct file *file, int mask) LSM_HOOK(int, 0, file_alloc_security, struct file *file) LSM_HOOK(void, LSM_RET_VOID, file_free_security, struct file *file) +LSM_HOOK(int, 0, backing_file_alloc, struct file *backing_file, + const struct file *user_file) +LSM_HOOK(void, LSM_RET_VOID, backing_file_free, struct file *backing_file) LSM_HOOK(int, 0, file_ioctl, struct file *file, unsigned int cmd, unsigned long arg) LSM_HOOK(int, 0, file_ioctl_compat, struct file *file, unsigned int cmd, @@ -175,6 +178,8 @@ LSM_HOOK(int, 0, file_ioctl_compat, struct file *file, unsigned int cmd, LSM_HOOK(int, 0, mmap_addr, unsigned long addr) LSM_HOOK(int, 0, mmap_file, struct file *file, unsigned long reqprot, unsigned long prot, unsigned long flags) +LSM_HOOK(int, 0, mmap_backing_file, struct vm_area_struct *vma, + struct file *backing_file, struct file *user_file) LSM_HOOK(int, 0, file_mprotect, struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot) LSM_HOOK(int, 0, file_lock, struct file *file, unsigned int cmd) diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 3f04476cc6..a16571929f 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1330,22 +1330,22 @@ * * @binder_set_context_mgr: * Check whether @mgr is allowed to be the binder context manager. - * @mgr contains the task_struct for the task being registered. + * @mgr contains the struct cred for the current binder process. * Return 0 if permission is granted. 
* @binder_transaction: * Check whether @from is allowed to invoke a binder transaction call * to @to. - * @from contains the task_struct for the sending task. - * @to contains the task_struct for the receiving task. + * @from contains the struct cred for the sending process. + * @to contains the struct cred for the receiving process. * @binder_transfer_binder: * Check whether @from is allowed to transfer a binder reference to @to. - * @from contains the task_struct for the sending task. - * @to contains the task_struct for the receiving task. + * @from contains the struct cred for the sending process. + * @to contains the struct cred for the receiving process. * @binder_transfer_file: * Check whether @from is allowed to transfer @file to @to. - * @from contains the task_struct for the sending task. + * @from contains the struct cred for the sending process. * @file contains the struct file being transferred. - * @to contains the task_struct for the receiving task. + * @to contains the struct cred for the receiving process. 
* * @ptrace_access_check: * Check permission before allowing the current process to trace the @@ -1637,6 +1637,7 @@ struct security_hook_list { struct lsm_blob_sizes { int lbs_cred; int lbs_file; + int lbs_backing_file; int lbs_inode; int lbs_superblock; int lbs_ipc; diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h index 991526039c..9d47cdc727 100644 --- a/include/linux/mlx5/cq.h +++ b/include/linux/mlx5/cq.h @@ -41,7 +41,6 @@ struct mlx5_core_cq { int cqe_sz; __be32 *set_ci_db; __be32 *arm_db; - struct mlx5_uars_page *uar; refcount_t refcount; struct completion free; unsigned vector; @@ -184,6 +183,7 @@ static inline void mlx5_cq_put(struct mlx5_core_cq *cq) complete(&cq->free); } +void mlx5_add_cq_to_tasklet(struct mlx5_core_cq *cq, struct mlx5_eqe *eqe); int mlx5_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, u32 *in, int inlen, u32 *out, int outlen); int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index c9e0d0f437..6644864b19 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -279,6 +279,7 @@ enum { MLX5_MKEY_MASK_SMALL_FENCE = 1ull << 23, MLX5_MKEY_MASK_RELAXED_ORDERING_WRITE = 1ull << 25, MLX5_MKEY_MASK_FREE = 1ull << 29, + MLX5_MKEY_MASK_PAGE_SIZE_5 = 1ull << 42, MLX5_MKEY_MASK_RELAXED_ORDERING_READ = 1ull << 47, }; @@ -1237,6 +1238,7 @@ enum mlx5_cap_type { MLX5_CAP_IPSEC, MLX5_CAP_CRYPTO = 0x1a, MLX5_CAP_SHAMPO = 0x1d, + MLX5_CAP_PSP = 0x1e, MLX5_CAP_MACSEC = 0x1f, MLX5_CAP_GENERAL_2 = 0x20, MLX5_CAP_PORT_SELECTION = 0x25, @@ -1476,6 +1478,9 @@ enum mlx5_qcam_feature_groups { #define MLX5_CAP_SHAMPO(mdev, cap) \ MLX5_GET(shampo_cap, mdev->caps.hca[MLX5_CAP_SHAMPO]->cur, cap) +#define MLX5_CAP_PSP(mdev, cap)\ + MLX5_GET(psp_cap, (mdev)->caps.hca[MLX5_CAP_PSP]->cur, cap) + enum { MLX5_CMD_STAT_OK = 0x0, MLX5_CMD_STAT_INT_ERR = 0x1, @@ -1510,6 +1515,7 @@ enum { MLX5_PHYSICAL_LAYER_RECOVERY_GROUP = 0x1a, 
MLX5_INFINIBAND_PORT_COUNTERS_GROUP = 0x20, MLX5_INFINIBAND_EXTENDED_PORT_COUNTERS_GROUP = 0x21, + MLX5_RS_FEC_HISTOGRAM_GROUP = 0x23, }; enum { diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 04705078df..1c54aa6f74 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -129,12 +130,14 @@ enum { MLX5_REG_PDDR = 0x5031, MLX5_REG_PMLP = 0x5002, MLX5_REG_PPLM = 0x5023, + MLX5_REG_PPHCR = 0x503E, MLX5_REG_PCAM = 0x507f, MLX5_REG_NODE_DESC = 0x6001, MLX5_REG_HOST_ENDIANNESS = 0x7004, MLX5_REG_MTCAP = 0x9009, MLX5_REG_MTMP = 0x900A, MLX5_REG_MCIA = 0x9014, + MLX5_REG_MNVDA = 0x9024, MLX5_REG_MFRL = 0x9028, MLX5_REG_MLCR = 0x902b, MLX5_REG_MRTC = 0x902d, @@ -398,6 +401,7 @@ struct mlx5_core_rsc_common { enum mlx5_res_type res; refcount_t refcount; struct completion free; + bool invalid; }; struct mlx5_uars_page { @@ -430,7 +434,6 @@ struct mlx5_sq_bfreg { struct mlx5_uars_page *up; bool wc; u32 index; - unsigned int offset; }; struct mlx5_core_health { @@ -485,7 +488,6 @@ struct mlx5_devcom_dev; struct mlx5_fw_reset; struct mlx5_eq_table; struct mlx5_irq_table; -struct mlx5_vhca_state_notifier; struct mlx5_sf_dev_table; struct mlx5_sf_hw_table; struct mlx5_sf_table; @@ -596,6 +598,7 @@ struct mlx5_priv { struct mlx5_flow_steering *steering; struct mlx5_mpfs *mpfs; + struct blocking_notifier_head esw_n_head; struct mlx5_eswitch *eswitch; struct mlx5_core_sriov sriov; struct mlx5_lag *lag; @@ -609,14 +612,20 @@ struct mlx5_priv { struct mlx5_ft_pool *ft_pool; struct mlx5_bfreg_data bfregs; - struct mlx5_uars_page *uar; + struct mlx5_sq_bfreg bfreg; #ifdef CONFIG_MLX5_SF - struct mlx5_vhca_state_notifier *vhca_state_notifier; + struct mlx5_nb vhca_state_nb; + struct blocking_notifier_head vhca_state_n_head; + struct notifier_block sf_dev_nb; struct mlx5_sf_dev_table *sf_dev_table; struct mlx5_core_dev *parent_mdev; #endif #ifdef 
CONFIG_MLX5_SF_MANAGER + struct notifier_block sf_hw_table_vhca_nb; struct mlx5_sf_hw_table *sf_hw_table; + struct notifier_block sf_table_esw_nb; + struct notifier_block sf_table_vhca_nb; + struct notifier_block sf_table_mdev_nb; struct mlx5_sf_table *sf_table; #endif struct blocking_notifier_head lag_nh; @@ -655,12 +664,14 @@ struct mlx5e_resources { u32 pdn; struct mlx5_td td; u32 mkey; - struct mlx5_sq_bfreg bfreg; + struct mlx5_sq_bfreg *bfregs; + unsigned int num_bfregs; #define MLX5_MAX_NUM_TC 8 u32 tisn[MLX5_MAX_PORTS][MLX5_MAX_NUM_TC]; bool tisn_valid; } hw_objs; struct net_device *uplink_netdev; + netdevice_tracker tracker; struct mutex uplink_netdev_lock; struct mlx5_crypto_dek_priv *dek_priv; }; @@ -687,7 +698,7 @@ struct mlx5_fw_tracer; struct mlx5_vxlan; struct mlx5_geneve; struct mlx5_hv_vhca; -struct mlx5_thermal; +struct mlx5_st; #define MLX5_LOG_SW_ICM_BLOCK_SIZE(dev) (MLX5_CAP_DEV_MEM(dev, log_sw_icm_alloc_granularity)) #define MLX5_SW_ICM_BLOCK_SIZE(dev) (1 << MLX5_LOG_SW_ICM_BLOCK_SIZE(dev)) @@ -757,6 +768,7 @@ struct mlx5_core_dev { u32 issi; struct mlx5e_resources mlx5e_res; struct mlx5_dm *dm; + struct mlx5_st *st; struct mlx5_vxlan *vxlan; struct mlx5_geneve *geneve; struct { @@ -798,6 +810,8 @@ struct mlx5_db { int index; }; +#define MLX5_DEFAULT_NUM_DOORBELLS 8 + enum { MLX5_COMP_EQ_SIZE = 1024, }; @@ -811,6 +825,7 @@ typedef void (*mlx5_cmd_cbk_t)(int status, void *context); enum { MLX5_CMD_ENT_STATE_PENDING_COMP, + MLX5_CMD_ENT_STATE_TIMEDOUT, }; struct mlx5_cmd_work_ent { @@ -1160,6 +1175,23 @@ int mlx5_dm_sw_icm_alloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type, int mlx5_dm_sw_icm_dealloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type, u64 length, u16 uid, phys_addr_t addr, u32 obj_id); +#ifdef CONFIG_PCIE_TPH +int mlx5_st_alloc_index(struct mlx5_core_dev *dev, enum tph_mem_type mem_type, + unsigned int cpu_uid, u16 *st_index); +int mlx5_st_dealloc_index(struct mlx5_core_dev *dev, u16 st_index); +#else +static inline 
int mlx5_st_alloc_index(struct mlx5_core_dev *dev, + enum tph_mem_type mem_type, + unsigned int cpu_uid, u16 *st_index) +{ + return -EOPNOTSUPP; +} +static inline int mlx5_st_dealloc_index(struct mlx5_core_dev *dev, u16 st_index) +{ + return -EOPNOTSUPP; +} +#endif + struct mlx5_core_dev *mlx5_vf_get_core_dev(struct pci_dev *pdev); void mlx5_vf_put_core_dev(struct mlx5_core_dev *mdev); @@ -1349,4 +1381,12 @@ enum { }; bool mlx5_wc_support_get(struct mlx5_core_dev *mdev); + +static inline struct net *mlx5_core_net(struct mlx5_core_dev *dev) +{ + return devlink_net(priv_to_devlink(dev)); +} + +#define MLX5_SW_IMAGE_GUID_MAX_BYTES 9 + #endif /* MLX5_DRIVER_H */ diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index fb5f98fcc7..9cadb1d5e6 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -40,7 +40,7 @@ #define MLX5_SET_CFG(p, f, v) MLX5_SET(create_flow_group_in, p, f, v) -#define MLX5_RDMA_TRANSPORT_BYPASS_PRIO 0 +#define MLX5_RDMA_TRANSPORT_BYPASS_PRIO 16 #define MLX5_FS_MAX_POOL_SIZE BIT(30) enum mlx5_flow_destination_type { @@ -71,6 +71,7 @@ enum { MLX5_FLOW_TABLE_UNMANAGED = BIT(3), MLX5_FLOW_TABLE_OTHER_VPORT = BIT(4), MLX5_FLOW_TABLE_UPLINK_VPORT = BIT(5), + MLX5_FLOW_TABLE_OTHER_ESWITCH = BIT(6), }; #define LEFTOVERS_RULE_NUM 2 @@ -116,6 +117,7 @@ enum mlx5_flow_namespace_type { }; enum { + FDB_DROP_ROOT, FDB_BYPASS_PATH, FDB_CRYPTO_INGRESS, FDB_TC_OFFLOAD, @@ -127,6 +129,24 @@ enum { FDB_PER_VPORT, }; +enum fs_flow_table_type { + FS_FT_NIC_RX = 0x0, + FS_FT_NIC_TX = 0x1, + FS_FT_ESW_EGRESS_ACL = 0x2, + FS_FT_ESW_INGRESS_ACL = 0x3, + FS_FT_FDB = 0X4, + FS_FT_SNIFFER_RX = 0X5, + FS_FT_SNIFFER_TX = 0X6, + FS_FT_RDMA_RX = 0X7, + FS_FT_RDMA_TX = 0X8, + FS_FT_PORT_SEL = 0X9, + FS_FT_FDB_RX = 0xa, + FS_FT_FDB_TX = 0xb, + FS_FT_RDMA_TRANSPORT_RX = 0xd, + FS_FT_RDMA_TRANSPORT_TX = 0xe, + FS_FT_MAX_TYPE = FS_FT_RDMA_TRANSPORT_TX, +}; + struct mlx5_pkt_reformat; struct mlx5_modify_hdr; struct mlx5_flow_definer; @@ -208,6 +228,7 @@ struct 
mlx5_flow_table_attr { u32 flags; u16 uid; u16 vport; + u16 esw_owner_vhca_id; struct mlx5_flow_table *next_ft; struct { @@ -353,4 +374,8 @@ u32 mlx5_flow_table_id(struct mlx5_flow_table *ft); struct mlx5_flow_root_namespace * mlx5_get_root_namespace(struct mlx5_core_dev *dev, enum mlx5_flow_namespace_type ns_type); + +int mlx5_fs_set_root_dev(struct mlx5_core_dev *dev, + struct mlx5_core_dev *new_dev, + enum fs_flow_table_type table_type); #endif diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 9521159b08..53cba23d1f 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -189,6 +189,9 @@ enum { MLX5_CMD_OP_QUERY_XRQ_ERROR_PARAMS = 0x727, MLX5_CMD_OP_RELEASE_XRQ_ERROR = 0x729, MLX5_CMD_OP_MODIFY_XRQ = 0x72a, + MLX5_CMD_OPCODE_QUERY_DELEGATED_VHCA = 0x732, + MLX5_CMD_OPCODE_CREATE_ESW_VPORT = 0x733, + MLX5_CMD_OPCODE_DESTROY_ESW_VPORT = 0x734, MLX5_CMD_OP_QUERY_ESW_FUNCTIONS = 0x740, MLX5_CMD_OP_QUERY_VPORT_STATE = 0x750, MLX5_CMD_OP_MODIFY_VPORT_STATE = 0x751, @@ -311,6 +314,8 @@ enum { MLX5_CMD_OP_CREATE_UMEM = 0xa08, MLX5_CMD_OP_DESTROY_UMEM = 0xa0a, MLX5_CMD_OP_SYNC_STEERING = 0xb00, + MLX5_CMD_OP_PSP_GEN_SPI = 0xb10, + MLX5_CMD_OP_PSP_ROTATE_KEY = 0xb11, MLX5_CMD_OP_QUERY_VHCA_STATE = 0xb0d, MLX5_CMD_OP_MODIFY_VHCA_STATE = 0xb0e, MLX5_CMD_OP_SYNC_CRYPTO = 0xb12, @@ -420,7 +425,8 @@ struct mlx5_ifc_flow_table_fields_supported_bits { /* Table 2170 - Flow Table Fields Supported 2 Format */ struct mlx5_ifc_flow_table_fields_supported_2_bits { - u8 reserved_at_0[0x2]; + u8 inner_l4_type_ext[0x1]; + u8 outer_l4_type_ext[0x1]; u8 inner_l4_type[0x1]; u8 outer_l4_type[0x1]; u8 reserved_at_4[0xa]; @@ -429,7 +435,11 @@ struct mlx5_ifc_flow_table_fields_supported_2_bits { u8 tunnel_header_0_1[0x1]; u8 reserved_at_11[0xf]; - u8 reserved_at_20[0x60]; + u8 reserved_at_20[0xf]; + u8 ipsec_next_header[0x1]; + u8 reserved_at_30[0x10]; + + u8 reserved_at_40[0x40]; }; struct mlx5_ifc_flow_table_prop_layout_bits { @@ 
-481,12 +491,14 @@ struct mlx5_ifc_flow_table_prop_layout_bits { u8 execute_aso[0x1]; u8 reserved_at_47[0x19]; - u8 reserved_at_60[0x2]; + u8 reformat_l2_to_l3_psp_tunnel[0x1]; + u8 reformat_l3_psp_tunnel_to_l2[0x1]; u8 reformat_insert[0x1]; u8 reformat_remove[0x1]; u8 macsec_encrypt[0x1]; u8 macsec_decrypt[0x1]; - u8 reserved_at_66[0x2]; + u8 psp_encrypt[0x1]; + u8 psp_decrypt[0x1]; u8 reformat_add_macsec[0x1]; u8 reformat_remove_macsec[0x1]; u8 reparse[0x1]; @@ -552,6 +564,13 @@ enum { MLX5_PACKET_L4_TYPE_UDP, }; +enum { + MLX5_PACKET_L4_TYPE_EXT_NONE, + MLX5_PACKET_L4_TYPE_EXT_TCP, + MLX5_PACKET_L4_TYPE_EXT_UDP, + MLX5_PACKET_L4_TYPE_EXT_ICMP, +}; + struct mlx5_ifc_fte_match_set_lyr_2_4_bits { u8 smac_47_16[0x20]; @@ -578,10 +597,10 @@ struct mlx5_ifc_fte_match_set_lyr_2_4_bits { u8 tcp_dport[0x10]; u8 l4_type[0x2]; - u8 reserved_at_c2[0xe]; + u8 l4_type_ext[0x4]; + u8 reserved_at_c6[0xa]; u8 ipv4_ihl[0x4]; - u8 reserved_at_c4[0x4]; - + u8 reserved_at_d4[0x4]; u8 ttl_hoplimit[0x8]; u8 udp_sport[0x10]; @@ -688,11 +707,10 @@ struct mlx5_ifc_fte_match_set_misc2_bits { u8 metadata_reg_a[0x20]; - u8 reserved_at_1a0[0x8]; - + u8 psp_syndrome[0x8]; u8 macsec_syndrome[0x8]; u8 ipsec_syndrome[0x8]; - u8 reserved_at_1b8[0x8]; + u8 ipsec_next_header[0x8]; u8 reserved_at_1c0[0x40]; }; @@ -1496,6 +1514,21 @@ struct mlx5_ifc_macsec_cap_bits { u8 reserved_at_40[0x7c0]; }; +struct mlx5_ifc_psp_cap_bits { + u8 reserved_at_0[0x1]; + u8 psp_crypto_offload[0x1]; + u8 reserved_at_2[0x1]; + u8 psp_crypto_esp_aes_gcm_256_encrypt[0x1]; + u8 psp_crypto_esp_aes_gcm_128_encrypt[0x1]; + u8 psp_crypto_esp_aes_gcm_256_decrypt[0x1]; + u8 psp_crypto_esp_aes_gcm_128_decrypt[0x1]; + u8 reserved_at_7[0x4]; + u8 log_max_num_of_psp_spi[0x5]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x7e0]; +}; + enum { MLX5_WQ_TYPE_LINKED_LIST = 0x0, MLX5_WQ_TYPE_CYCLIC = 0x1, @@ -1845,7 +1878,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 log_bf_reg_size[0x5]; - u8 reserved_at_270[0x3]; + u8 
disciplined_fr_counter[0x1]; + u8 reserved_at_271[0x2]; u8 qp_error_syndrome[0x1]; u8 reserved_at_274[0x2]; u8 lag_dct[0x2]; @@ -1858,7 +1892,12 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_280[0x10]; u8 max_wqe_sz_sq[0x10]; - u8 reserved_at_2a0[0xb]; + u8 reserved_at_2a0[0x7]; + u8 mkey_pcie_tph[0x1]; + u8 reserved_at_2a8[0x1]; + u8 tis_tir_td_order[0x1]; + + u8 psp[0x1]; u8 shampo[0x1]; u8 reserved_at_2ac[0x4]; u8 max_wqe_sz_rq[0x10]; @@ -1919,7 +1958,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 log_max_rqt[0x5]; u8 reserved_at_390[0x3]; u8 log_max_rqt_size[0x5]; - u8 reserved_at_398[0x3]; + u8 reserved_at_398[0x1]; + u8 vnic_env_cnt_bar_uar_access[0x1]; + u8 vnic_env_cnt_odp_page_fault[0x1]; u8 log_max_tis_per_sq[0x5]; u8 ext_stride_num_range[0x1]; @@ -2170,7 +2211,9 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 min_mkey_log_entity_size_fixed_buffer[0x5]; u8 ec_vf_vport_base[0x10]; - u8 reserved_at_3a0[0xa]; + u8 reserved_at_3a0[0x2]; + u8 max_mkey_log_entity_size_fixed_buffer[0x6]; + u8 reserved_at_3a8[0x2]; u8 max_mkey_log_entity_size_mtt[0x6]; u8 max_rqt_vhca_id[0x10]; @@ -2190,7 +2233,23 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 reserved_at_440[0x8]; u8 max_num_eqs_24b[0x18]; - u8 reserved_at_460[0x3a0]; + + u8 reserved_at_460[0x144]; + u8 load_balance_id[0x4]; + u8 reserved_at_5a8[0x18]; + + u8 query_adjacent_functions_id[0x1]; + u8 ingress_egress_esw_vport_connect[0x1]; + u8 function_id_type_vhca_id[0x1]; + u8 reserved_at_5c3[0x1]; + u8 lag_per_mp_group[0x1]; + u8 reserved_at_5c5[0xb]; + u8 delegate_vhca_management_profiles[0x10]; + + u8 delegated_vhca_max[0x10]; + u8 delegate_vhca_max[0x10]; + + u8 reserved_at_600[0x200]; }; enum mlx5_ifc_flow_destination_type { @@ -3771,6 +3830,7 @@ union mlx5_ifc_hca_cap_union_bits { struct mlx5_ifc_macsec_cap_bits macsec_cap; struct mlx5_ifc_crypto_cap_bits crypto_cap; struct mlx5_ifc_ipsec_cap_bits ipsec_cap; + struct mlx5_ifc_psp_cap_bits psp_cap; u8 reserved_at_0[0x8000]; }; @@ -3800,6 +3860,7 @@ enum { enum { 
MLX5_FLOW_CONTEXT_ENCRYPT_DECRYPT_TYPE_IPSEC = 0x0, MLX5_FLOW_CONTEXT_ENCRYPT_DECRYPT_TYPE_MACSEC = 0x1, + MLX5_FLOW_CONTEXT_ENCRYPT_DECRYPT_TYPE_PSP = 0x2, }; struct mlx5_ifc_vlan_bits { @@ -3964,7 +4025,13 @@ struct mlx5_ifc_vnic_diagnostic_statistics_bits { u8 handled_pkt_steering_fail[0x40]; - u8 reserved_at_360[0xc80]; + u8 bar_uar_access[0x20]; + + u8 odp_local_triggered_page_fault[0x20]; + + u8 odp_remote_triggered_page_fault[0x20]; + + u8 reserved_at_3c0[0xc20]; }; struct mlx5_ifc_traffic_counter_bits { @@ -4403,6 +4470,10 @@ enum { MLX5_MKC_ACCESS_MODE_CROSSING = 0x6, }; +enum { + MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX = 0, +}; + struct mlx5_ifc_mkc_bits { u8 reserved_at_0[0x1]; u8 free[0x1]; @@ -4454,7 +4525,11 @@ struct mlx5_ifc_mkc_bits { u8 relaxed_ordering_read[0x1]; u8 log_page_size[0x6]; - u8 reserved_at_1e0[0x20]; + u8 reserved_at_1e0[0x5]; + u8 pcie_tph_en[0x1]; + u8 pcie_tph_ph[0x2]; + u8 pcie_tph_steering_tag_index[0x8]; + u8 reserved_at_1f0[0x10]; }; struct mlx5_ifc_pkey_bits { @@ -4838,6 +4913,11 @@ union mlx5_ifc_field_select_802_1_r_roce_auto_bits { u8 reserved_at_0[0x20]; }; +struct mlx5_ifc_rs_histogram_cntrs_bits { + u8 hist[16][0x40]; + u8 reserved_at_400[0x2c0]; +}; + union mlx5_ifc_eth_cntrs_grp_data_layout_auto_bits { struct mlx5_ifc_eth_802_3_cntrs_grp_data_layout_bits eth_802_3_cntrs_grp_data_layout; struct mlx5_ifc_eth_2863_cntrs_grp_data_layout_bits eth_2863_cntrs_grp_data_layout; @@ -4852,6 +4932,7 @@ union mlx5_ifc_eth_cntrs_grp_data_layout_auto_bits { struct mlx5_ifc_phys_layer_cntrs_bits phys_layer_cntrs; struct mlx5_ifc_phys_layer_statistical_cntrs_bits phys_layer_statistical_cntrs; struct mlx5_ifc_phys_layer_recovery_cntrs_bits phys_layer_recovery_cntrs; + struct mlx5_ifc_rs_histogram_cntrs_bits rs_histogram_cntrs; u8 reserved_at_0[0x7c0]; }; @@ -5134,7 +5215,9 @@ struct mlx5_ifc_set_hca_cap_in_bits { u8 other_function[0x1]; u8 ec_vf_function[0x1]; - u8 reserved_at_42[0xe]; + u8 reserved_at_42[0x1]; + u8 
function_id_type[0x1]; + u8 reserved_at_44[0xc]; u8 function_id[0x10]; u8 reserved_at_60[0x20]; @@ -5167,13 +5250,15 @@ struct mlx5_ifc_set_fte_in_bits { u8 op_mod[0x10]; u8 other_vport[0x1]; - u8 reserved_at_41[0xf]; + u8 other_eswitch[0x1]; + u8 reserved_at_42[0xe]; u8 vport_number[0x10]; u8 reserved_at_60[0x20]; u8 table_type[0x8]; - u8 reserved_at_88[0x18]; + u8 reserved_at_88[0x8]; + u8 eswitch_owner_vhca_id[0x10]; u8 reserved_at_a0[0x8]; u8 table_id[0x18]; @@ -6332,7 +6417,9 @@ struct mlx5_ifc_query_hca_cap_in_bits { u8 other_function[0x1]; u8 ec_vf_function[0x1]; - u8 reserved_at_42[0xe]; + u8 reserved_at_42[0x1]; + u8 function_id_type[0x1]; + u8 reserved_at_44[0xc]; u8 function_id[0x10]; u8 reserved_at_60[0x20]; @@ -6958,6 +7045,28 @@ struct mlx5_ifc_query_esw_vport_context_in_bits { u8 reserved_at_60[0x20]; }; +struct mlx5_ifc_destroy_esw_vport_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x20]; +}; + +struct mlx5_ifc_destroy_esw_vport_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x10]; + u8 vport_num[0x10]; + + u8 reserved_at_60[0x20]; +}; + struct mlx5_ifc_modify_esw_vport_context_out_bits { u8 status[0x8]; u8 reserved_at_8[0x18]; @@ -7093,6 +7202,8 @@ enum mlx5_reformat_ctx_type { MLX5_REFORMAT_TYPE_DEL_ESP_TRANSPORT_OVER_UDP = 0xa, MLX5_REFORMAT_TYPE_ADD_ESP_TRANSPORT_OVER_IPV6 = 0xb, MLX5_REFORMAT_TYPE_ADD_ESP_TRANSPORT_OVER_UDPV6 = 0xc, + MLX5_REFORMAT_TYPE_ADD_PSP_TUNNEL = 0xd, + MLX5_REFORMAT_TYPE_DEL_PSP_TUNNEL = 0xe, MLX5_REFORMAT_TYPE_INSERT_HDR = 0xf, MLX5_REFORMAT_TYPE_REMOVE_HDR = 0x10, MLX5_REFORMAT_TYPE_ADD_MACSEC = 0x11, @@ -7219,6 +7330,7 @@ enum { MLX5_ACTION_IN_FIELD_IPSEC_SYNDROME = 0x5D, MLX5_ACTION_IN_FIELD_OUT_EMD_47_32 = 0x6F, MLX5_ACTION_IN_FIELD_OUT_EMD_31_0 = 0x70, + MLX5_ACTION_IN_FIELD_PSP_SYNDROME = 0x71, }; struct mlx5_ifc_alloc_modify_header_context_out_bits { @@ -7459,6 +7571,85 @@ struct 
mlx5_ifc_query_adapter_in_bits { u8 reserved_at_40[0x40]; }; +struct mlx5_ifc_function_vhca_rid_info_reg_bits { + u8 host_number[0x8]; + u8 host_pci_device_function[0x8]; + u8 host_pci_bus[0x8]; + u8 reserved_at_18[0x3]; + u8 pci_bus_assigned[0x1]; + u8 function_type[0x4]; + + u8 parent_pci_device_function[0x8]; + u8 parent_pci_bus[0x8]; + u8 vhca_id[0x10]; + + u8 reserved_at_40[0x10]; + u8 function_id[0x10]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_delegated_function_vhca_rid_info_bits { + struct mlx5_ifc_function_vhca_rid_info_reg_bits function_vhca_rid_info; + + u8 reserved_at_80[0x18]; + u8 manage_profile[0x8]; + + u8 reserved_at_a0[0x60]; +}; + +struct mlx5_ifc_query_delegated_vhca_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x20]; + + u8 reserved_at_60[0x10]; + u8 functions_count[0x10]; + + u8 reserved_at_80[0x80]; + + struct mlx5_ifc_delegated_function_vhca_rid_info_bits + delegated_function_vhca_rid_info[]; +}; + +struct mlx5_ifc_query_delegated_vhca_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_create_esw_vport_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x20]; + + u8 reserved_at_60[0x10]; + u8 vport_num[0x10]; +}; + +struct mlx5_ifc_create_esw_vport_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x10]; + u8 managed_vhca_id[0x10]; + + u8 reserved_at_60[0x20]; +}; + struct mlx5_ifc_qp_2rst_out_bits { u8 status[0x8]; u8 reserved_at_8[0x18]; @@ -7586,7 +7777,12 @@ struct mlx5_ifc_modify_vport_state_in_bits { u8 reserved_at_41[0xf]; u8 vport_number[0x10]; - u8 reserved_at_60[0x18]; + u8 reserved_at_60[0x10]; + u8 ingress_connect[0x1]; + u8 egress_connect[0x1]; + u8 ingress_connect_valid[0x1]; + u8 egress_connect_valid[0x1]; + u8 reserved_at_74[0x4]; u8 
admin_state[0x4]; u8 reserved_at_7c[0x4]; }; @@ -8614,13 +8810,15 @@ struct mlx5_ifc_destroy_flow_table_in_bits { u8 op_mod[0x10]; u8 other_vport[0x1]; - u8 reserved_at_41[0xf]; + u8 other_eswitch[0x1]; + u8 reserved_at_42[0xe]; u8 vport_number[0x10]; u8 reserved_at_60[0x20]; u8 table_type[0x8]; - u8 reserved_at_88[0x18]; + u8 reserved_at_88[0x8]; + u8 eswitch_owner_vhca_id[0x10]; u8 reserved_at_a0[0x8]; u8 table_id[0x18]; @@ -8645,13 +8843,15 @@ struct mlx5_ifc_destroy_flow_group_in_bits { u8 op_mod[0x10]; u8 other_vport[0x1]; - u8 reserved_at_41[0xf]; + u8 other_eswitch[0x1]; + u8 reserved_at_42[0xe]; u8 vport_number[0x10]; u8 reserved_at_60[0x20]; u8 table_type[0x8]; - u8 reserved_at_88[0x18]; + u8 reserved_at_88[0x8]; + u8 eswitch_owner_vhca_id[0x10]; u8 reserved_at_a0[0x8]; u8 table_id[0x18]; @@ -8790,13 +8990,15 @@ struct mlx5_ifc_delete_fte_in_bits { u8 op_mod[0x10]; u8 other_vport[0x1]; - u8 reserved_at_41[0xf]; + u8 other_eswitch[0x1]; + u8 reserved_at_42[0xe]; u8 vport_number[0x10]; u8 reserved_at_60[0x20]; u8 table_type[0x8]; - u8 reserved_at_88[0x18]; + u8 reserved_at_88[0x8]; + u8 eswitch_owner_vhca_id[0x10]; u8 reserved_at_a0[0x8]; u8 table_id[0x18]; @@ -9340,13 +9542,15 @@ struct mlx5_ifc_create_flow_table_in_bits { u8 op_mod[0x10]; u8 other_vport[0x1]; - u8 reserved_at_41[0xf]; + u8 other_eswitch[0x1]; + u8 reserved_at_42[0xe]; u8 vport_number[0x10]; u8 reserved_at_60[0x20]; u8 table_type[0x8]; - u8 reserved_at_88[0x18]; + u8 reserved_at_88[0x8]; + u8 eswitch_owner_vhca_id[0x10]; u8 reserved_at_a0[0x20]; @@ -9385,7 +9589,8 @@ struct mlx5_ifc_create_flow_group_in_bits { u8 op_mod[0x10]; u8 other_vport[0x1]; - u8 reserved_at_41[0xf]; + u8 other_eswitch[0x1]; + u8 reserved_at_42[0xe]; u8 vport_number[0x10]; u8 reserved_at_60[0x20]; @@ -9393,7 +9598,7 @@ struct mlx5_ifc_create_flow_group_in_bits { u8 table_type[0x8]; u8 reserved_at_88[0x4]; u8 group_type[0x4]; - u8 reserved_at_90[0x10]; + u8 eswitch_owner_vhca_id[0x10]; u8 reserved_at_a0[0x8]; u8 
table_id[0x18]; @@ -9979,6 +10184,10 @@ struct mlx5_ifc_pude_reg_bits { u8 reserved_at_20[0x60]; }; +enum { + MLX5_PTYS_CONNECTOR_TYPE_PORT_DA = 0x7, +}; + struct mlx5_ifc_ptys_reg_bits { u8 reserved_at_0[0x1]; u8 an_disable_admin[0x1]; @@ -10015,7 +10224,8 @@ struct mlx5_ifc_ptys_reg_bits { u8 ib_link_width_oper[0x10]; u8 ib_proto_oper[0x10]; - u8 reserved_at_160[0x1c]; + u8 reserved_at_160[0x8]; + u8 lane_rate_oper[0x14]; u8 connector_type[0x4]; u8 eth_proto_lp_advertise[0x20]; @@ -10459,10 +10669,19 @@ struct mlx5_ifc_pifr_reg_bits { u8 port_filter_update_en[8][0x20]; }; +enum { + MLX5_BUF_OWNERSHIP_UNKNOWN = 0x0, + MLX5_BUF_OWNERSHIP_FW_OWNED = 0x1, + MLX5_BUF_OWNERSHIP_SW_OWNED = 0x2, +}; + struct mlx5_ifc_pfcc_reg_bits { - u8 reserved_at_0[0x8]; + u8 reserved_at_0[0x4]; + u8 buf_ownership[0x2]; + u8 reserved_at_6[0x2]; u8 local_port[0x8]; - u8 reserved_at_10[0xb]; + u8 reserved_at_10[0xa]; + u8 cable_length_mask[0x1]; u8 ppan_mask_n[0x1]; u8 minor_stall_mask[0x1]; u8 critical_stall_mask[0x1]; @@ -10491,7 +10710,10 @@ struct mlx5_ifc_pfcc_reg_bits { u8 device_stall_minor_watermark[0x10]; u8 device_stall_critical_watermark[0x10]; - u8 reserved_at_a0[0x60]; + u8 reserved_at_a0[0x18]; + u8 cable_length[0x8]; + + u8 reserved_at_c0[0x40]; }; struct mlx5_ifc_pelc_reg_bits { @@ -10592,11 +10814,15 @@ struct mlx5_ifc_mtutc_reg_bits { struct mlx5_ifc_pcam_enhanced_features_bits { u8 reserved_at_0[0x10]; u8 ppcnt_recovery_counters[0x1]; - u8 reserved_at_11[0xc]; + u8 reserved_at_11[0x7]; + u8 cable_length[0x1]; + u8 reserved_at_19[0x4]; u8 fec_200G_per_lane_in_pplm[0x1]; u8 reserved_at_1e[0x2a]; u8 fec_100G_per_lane_in_pplm[0x1]; - u8 reserved_at_49[0x1f]; + u8 reserved_at_49[0xa]; + u8 buffer_ownership[0x1]; + u8 resereved_at_54[0x14]; u8 fec_50G_per_lane_in_pplm[0x1]; u8 reserved_at_69[0x4]; u8 rx_icrc_encapsulated_counter[0x1]; @@ -10617,7 +10843,9 @@ struct mlx5_ifc_pcam_regs_5000_to_507f_bits { u8 port_access_reg_cap_mask_127_to_96[0x20]; u8 
port_access_reg_cap_mask_95_to_64[0x20]; - u8 port_access_reg_cap_mask_63_to_36[0x1c]; + u8 port_access_reg_cap_mask_63[0x1]; + u8 pphcr[0x1]; + u8 port_access_reg_cap_mask_61_to_36[0x1a]; u8 pplm[0x1]; u8 port_access_reg_cap_mask_34_to_32[0x3]; @@ -11541,6 +11769,28 @@ struct mlx5_ifc_mtctr_reg_bits { u8 second_clock_timestamp[0x40]; }; +struct mlx5_ifc_bin_range_layout_bits { + u8 reserved_at_0[0xa]; + u8 high_val[0x6]; + u8 reserved_at_10[0xa]; + u8 low_val[0x6]; +}; + +struct mlx5_ifc_pphcr_reg_bits { + u8 active_hist_type[0x4]; + u8 reserved_at_4[0x4]; + u8 local_port[0x8]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x8]; + u8 num_of_bins[0x8]; + u8 reserved_at_30[0x10]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_bin_range_layout_bits bin_range[16]; +}; + union mlx5_ifc_ports_control_registers_document_bits { struct mlx5_ifc_bufferx_reg_bits bufferx_reg; struct mlx5_ifc_eth_2819_cntrs_grp_data_layout_bits eth_2819_cntrs_grp_data_layout; @@ -11607,6 +11857,7 @@ union mlx5_ifc_ports_control_registers_document_bits { struct mlx5_ifc_mtmp_reg_bits mtmp_reg; struct mlx5_ifc_mtptm_reg_bits mtptm_reg; struct mlx5_ifc_mtctr_reg_bits mtctr_reg; + struct mlx5_ifc_pphcr_reg_bits pphcr_reg; u8 reserved_at_0[0x60e0]; }; @@ -11637,10 +11888,12 @@ struct mlx5_ifc_set_flow_table_root_in_bits { u8 op_mod[0x10]; u8 other_vport[0x1]; - u8 reserved_at_41[0xf]; + u8 other_eswitch[0x1]; + u8 reserved_at_42[0xe]; u8 vport_number[0x10]; - u8 reserved_at_60[0x20]; + u8 reserved_at_60[0x10]; + u8 eswitch_owner_vhca_id[0x10]; u8 table_type[0x8]; u8 reserved_at_88[0x7]; @@ -11680,14 +11933,16 @@ struct mlx5_ifc_modify_flow_table_in_bits { u8 op_mod[0x10]; u8 other_vport[0x1]; - u8 reserved_at_41[0xf]; + u8 other_eswitch[0x1]; + u8 reserved_at_42[0xe]; u8 vport_number[0x10]; u8 reserved_at_60[0x10]; u8 modify_field_select[0x10]; u8 table_type[0x8]; - u8 reserved_at_88[0x18]; + u8 reserved_at_88[0x8]; + u8 eswitch_owner_vhca_id[0x10]; u8 reserved_at_a0[0x8]; u8 table_id[0x18]; 
@@ -12379,7 +12634,9 @@ struct mlx5_ifc_mtrc_ctrl_bits { struct mlx5_ifc_host_params_context_bits { u8 host_number[0x8]; - u8 reserved_at_8[0x7]; + u8 reserved_at_8[0x5]; + u8 host_pf_not_exist[0x1]; + u8 reserved_at_14[0x1]; u8 host_pf_disabled[0x1]; u8 host_num_of_vfs[0x10]; @@ -12500,17 +12757,6 @@ struct mlx5_ifc_affiliated_event_header_bits { u8 obj_id[0x20]; }; -enum { - MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_ENCRYPTION_KEY = BIT_ULL(0xc), - MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_IPSEC = BIT_ULL(0x13), - MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_SAMPLER = BIT_ULL(0x20), - MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_FLOW_METER_ASO = BIT_ULL(0x24), -}; - -enum { - MLX5_HCA_CAP_2_GENERAL_OBJECT_TYPES_RDMA_CTRL = BIT_ULL(0x13), -}; - enum { MLX5_GENERAL_OBJECT_TYPES_ENCRYPTION_KEY = 0xc, MLX5_GENERAL_OBJECT_TYPES_IPSEC = 0x13, @@ -12519,9 +12765,28 @@ enum { MLX5_GENERAL_OBJECT_TYPES_MACSEC = 0x27, MLX5_GENERAL_OBJECT_TYPES_INT_KEK = 0x47, MLX5_GENERAL_OBJECT_TYPES_RDMA_CTRL = 0x53, + MLX5_GENERAL_OBJECT_TYPES_PCIE_CONG_EVENT = 0x58, MLX5_GENERAL_OBJECT_TYPES_FLOW_TABLE_ALIAS = 0xff15, }; +enum { + MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_ENCRYPTION_KEY = + BIT_ULL(MLX5_GENERAL_OBJECT_TYPES_ENCRYPTION_KEY), + MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_IPSEC = + BIT_ULL(MLX5_GENERAL_OBJECT_TYPES_IPSEC), + MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_SAMPLER = + BIT_ULL(MLX5_GENERAL_OBJECT_TYPES_SAMPLER), + MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_FLOW_METER_ASO = + BIT_ULL(MLX5_GENERAL_OBJECT_TYPES_FLOW_METER_ASO), +}; + +enum { + MLX5_HCA_CAP_2_GENERAL_OBJECT_TYPES_RDMA_CTRL = + BIT_ULL(MLX5_GENERAL_OBJECT_TYPES_RDMA_CTRL - 0x40), + MLX5_HCA_CAP_2_GENERAL_OBJECT_TYPES_PCIE_CONG_EVENT = + BIT_ULL(MLX5_GENERAL_OBJECT_TYPES_PCIE_CONG_EVENT - 0x40), +}; + enum { MLX5_IPSEC_OBJECT_ICV_LEN_16B, }; @@ -12898,6 +13163,7 @@ enum { MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_PURPOSE_TLS = 0x1, MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_PURPOSE_IPSEC = 0x2, MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_PURPOSE_MACSEC = 0x4, + 
MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_PURPOSE_PSP = 0x6, }; struct mlx5_ifc_tls_static_params_bits { @@ -13278,4 +13544,101 @@ struct mlx5_ifc_mrtcq_reg_bits { u8 reserved_at_80[0x180]; }; +struct mlx5_ifc_pcie_cong_event_obj_bits { + u8 modify_select_field[0x40]; + + u8 inbound_event_en[0x1]; + u8 outbound_event_en[0x1]; + u8 reserved_at_42[0x1e]; + + u8 reserved_at_60[0x1]; + u8 inbound_cong_state[0x3]; + u8 reserved_at_64[0x1]; + u8 outbound_cong_state[0x3]; + u8 reserved_at_68[0x18]; + + u8 inbound_cong_low_threshold[0x10]; + u8 inbound_cong_high_threshold[0x10]; + + u8 outbound_cong_low_threshold[0x10]; + u8 outbound_cong_high_threshold[0x10]; + + u8 reserved_at_e0[0x340]; +}; + +struct mlx5_ifc_pcie_cong_event_cmd_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr; + struct mlx5_ifc_pcie_cong_event_obj_bits cong_obj; +}; + +struct mlx5_ifc_pcie_cong_event_cmd_out_bits { + struct mlx5_ifc_general_obj_out_cmd_hdr_bits hdr; + struct mlx5_ifc_pcie_cong_event_obj_bits cong_obj; +}; + +enum mlx5e_pcie_cong_event_mod_field { + MLX5_PCIE_CONG_EVENT_MOD_EVENT_EN = BIT(0), + MLX5_PCIE_CONG_EVENT_MOD_THRESH = BIT(2), +}; + +struct mlx5_ifc_psp_rotate_key_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_psp_rotate_key_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +enum mlx5_psp_gen_spi_in_key_size { + MLX5_PSP_GEN_SPI_IN_KEY_SIZE_128 = 0x0, + MLX5_PSP_GEN_SPI_IN_KEY_SIZE_256 = 0x1, +}; + +struct mlx5_ifc_key_spi_bits { + u8 spi[0x20]; + + u8 reserved_at_20[0x60]; + + u8 key[8][0x20]; +}; + +struct mlx5_ifc_psp_gen_spi_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x20]; + + u8 key_size[0x2]; + u8 reserved_at_62[0xe]; + u8 num_of_spi[0x10]; +}; + +struct mlx5_ifc_psp_gen_spi_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + 
u8 syndrome[0x20]; + + u8 reserved_at_40[0x10]; + u8 num_of_spi[0x10]; + + u8 reserved_at_60[0x20]; + + struct mlx5_ifc_key_spi_bits key_spi[]; +}; + #endif /* MLX5_IFC_H */ diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h index 58770b86f7..1df9d9a57b 100644 --- a/include/linux/mlx5/port.h +++ b/include/linux/mlx5/port.h @@ -112,6 +112,7 @@ enum mlx5e_ext_link_mode { MLX5E_400GAUI_2_400GBASE_CR2_KR2 = 17, MLX5E_800GAUI_8_800GBASE_CR8_KR8 = 19, MLX5E_800GAUI_4_800GBASE_CR4_KR4 = 20, + MLX5E_1600TAUI_8_1600TBASE_CR8_KR8 = 23, MLX5E_EXT_LINK_MODES_NUMBER, }; diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index fc7eeff99a..d67aedc6ea 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -237,13 +237,11 @@ enum { }; enum { - MLX5_ETH_WQE_SVLAN = 1 << 0, MLX5_ETH_WQE_TRAILER_HDR_OUTER_IP_ASSOC = 1 << 26, MLX5_ETH_WQE_TRAILER_HDR_OUTER_L4_ASSOC = 1 << 27, MLX5_ETH_WQE_TRAILER_HDR_INNER_IP_ASSOC = 3 << 26, MLX5_ETH_WQE_TRAILER_HDR_INNER_L4_ASSOC = 1 << 28, MLX5_ETH_WQE_INSERT_TRAILER = 1 << 30, - MLX5_ETH_WQE_INSERT_VLAN = 1 << 15, }; enum { @@ -253,9 +251,15 @@ enum { MLX5_ETH_WQE_SWP_OUTER_L4_UDP = 1 << 5, }; +/* Metadata bits 0-7 are used by timestamping */ +/* Base shift for metadata bits used by IPsec and MACsec */ +#define MLX5_ETH_WQE_FT_META_SHIFT 8 + enum { - MLX5_ETH_WQE_FT_META_IPSEC = BIT(0), - MLX5_ETH_WQE_FT_META_MACSEC = BIT(1), + MLX5_ETH_WQE_FT_META_IPSEC = BIT(0) << MLX5_ETH_WQE_FT_META_SHIFT, + MLX5_ETH_WQE_FT_META_MACSEC = BIT(1) << MLX5_ETH_WQE_FT_META_SHIFT, + MLX5_ETH_WQE_FT_META_MACSEC_FS_ID_MASK = + GENMASK(5, 2) << MLX5_ETH_WQE_FT_META_SHIFT, }; struct mlx5_wqe_eth_seg { @@ -275,10 +279,6 @@ struct mlx5_wqe_eth_seg { DECLARE_FLEX_ARRAY(u8, data); }; } inline_hdr; - struct { - __be16 type; - __be16 vlan_tci; - } insert; __be32 trailer; }; }; diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index c36cc6d829..f876bfc066 100644 --- a/include/linux/mlx5/vport.h +++ 
b/include/linux/mlx5/vport.h @@ -73,7 +73,8 @@ int mlx5_modify_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 mtu); int mlx5_query_nic_vport_system_image_guid(struct mlx5_core_dev *mdev, u64 *system_image_guid); int mlx5_query_nic_vport_sd_group(struct mlx5_core_dev *mdev, u8 *sd_group); -int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid); +int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, + u16 vport, bool other_vport, u64 *node_guid); int mlx5_modify_nic_vport_node_guid(struct mlx5_core_dev *mdev, u16 vport, u64 node_guid); int mlx5_query_nic_vport_qkey_viol_cntr(struct mlx5_core_dev *mdev, @@ -135,4 +136,6 @@ int mlx5_nic_vport_unaffiliate_multiport(struct mlx5_core_dev *port_mdev); u64 mlx5_query_nic_system_image_guid(struct mlx5_core_dev *mdev); int mlx5_vport_get_other_func_cap(struct mlx5_core_dev *dev, u16 vport, void *out, u16 opmod); +int mlx5_vport_get_vhca_id(struct mlx5_core_dev *dev, u16 vport, u16 *vhca_id); + #endif /* __MLX5_VPORT_H__ */ diff --git a/include/linux/mount.h b/include/linux/mount.h index 37dc2a161f..0f214b0a09 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -92,8 +92,8 @@ extern bool __mnt_is_readonly(struct vfsmount *mnt); extern bool mnt_may_suid(struct vfsmount *mnt); extern struct vfsmount *clone_private_mount(const struct path *path); -extern int __mnt_want_write(struct vfsmount *); -extern void __mnt_drop_write(struct vfsmount *); +int mnt_get_write_access(struct vfsmount *mnt); +void mnt_put_write_access(struct vfsmount *mnt); extern struct vfsmount *fc_mount(struct fs_context *fc); extern struct vfsmount *fc_mount_longterm(struct fs_context *fc); diff --git a/include/linux/pci-tph.h b/include/linux/pci-tph.h index c3e806c13d..ba28140ce6 100644 --- a/include/linux/pci-tph.h +++ b/include/linux/pci-tph.h @@ -28,6 +28,8 @@ int pcie_tph_get_cpu_st(struct pci_dev *dev, unsigned int cpu_uid, u16 *tag); void pcie_disable_tph(struct pci_dev *pdev); int pcie_enable_tph(struct 
pci_dev *pdev, int mode); +u16 pcie_tph_get_st_table_size(struct pci_dev *pdev); +u32 pcie_tph_get_st_table_loc(struct pci_dev *pdev); #else static inline int pcie_tph_set_st_entry(struct pci_dev *pdev, unsigned int index, u16 tag) diff --git a/include/linux/security.h b/include/linux/security.h index 16f44e78b7..db02db9f62 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -263,13 +263,13 @@ extern int security_init(void); extern int early_security_init(void); /* Security operations */ -int security_binder_set_context_mgr(struct task_struct *mgr); -int security_binder_transaction(struct task_struct *from, - struct task_struct *to); -int security_binder_transfer_binder(struct task_struct *from, - struct task_struct *to); -int security_binder_transfer_file(struct task_struct *from, - struct task_struct *to, struct file *file); +int security_binder_set_context_mgr(const struct cred *mgr); +int security_binder_transaction(const struct cred *from, + const struct cred *to); +int security_binder_transfer_binder(const struct cred *from, + const struct cred *to); +int security_binder_transfer_file(const struct cred *from, + const struct cred *to, const struct file *file); int security_ptrace_access_check(struct task_struct *child, unsigned int mode); int security_ptrace_traceme(struct task_struct *parent); int security_capget(struct task_struct *target, @@ -387,11 +387,17 @@ int security_kernfs_init_security(struct kernfs_node *kn_dir, int security_file_permission(struct file *file, int mask); int security_file_alloc(struct file *file); void security_file_free(struct file *file); +int security_backing_file_alloc(struct file *backing_file, + const struct file *user_file); +void security_backing_file_free(struct file *backing_file); int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int security_file_ioctl_compat(struct file *file, unsigned int cmd, unsigned long arg); int security_mmap_file(struct file *file, unsigned long 
prot, unsigned long flags); +int security_mmap_backing_file(struct vm_area_struct *vma, + struct file *backing_file, + struct file *user_file); int security_mmap_addr(unsigned long addr); int security_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot); @@ -520,26 +526,26 @@ static inline int early_security_init(void) return 0; } -static inline int security_binder_set_context_mgr(struct task_struct *mgr) +static inline int security_binder_set_context_mgr(const struct cred *mgr) { return 0; } -static inline int security_binder_transaction(struct task_struct *from, - struct task_struct *to) +static inline int security_binder_transaction(const struct cred *from, + const struct cred *to) { return 0; } -static inline int security_binder_transfer_binder(struct task_struct *from, - struct task_struct *to) +static inline int security_binder_transfer_binder(const struct cred *from, + const struct cred *to) { return 0; } -static inline int security_binder_transfer_file(struct task_struct *from, - struct task_struct *to, - struct file *file) +static inline int security_binder_transfer_file(const struct cred *from, + const struct cred *to, + const struct file *file) { return 0; } @@ -976,6 +982,15 @@ static inline int security_file_alloc(struct file *file) static inline void security_file_free(struct file *file) { } +static inline int security_backing_file_alloc(struct file *backing_file, + const struct file *user_file) +{ + return 0; +} + +static inline void security_backing_file_free(struct file *backing_file) +{ } + static inline int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -995,6 +1010,13 @@ static inline int security_mmap_file(struct file *file, unsigned long prot, return 0; } +static inline int security_mmap_backing_file(struct vm_area_struct *vma, + struct file *backing_file, + struct file *user_file) +{ + return 0; +} + static inline int security_mmap_addr(unsigned long addr) { return 
cap_mmap_addr(addr); diff --git a/init/Kconfig b/init/Kconfig index c2b4633fb6..df5a34169f 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -70,13 +70,24 @@ config CC_CAN_LINK_STATIC default $(success,$(srctree)/scripts/cc-can-link.sh $(CC) $(CLANG_FLAGS) $(USERCFLAGS) $(USERLDFLAGS) $(m64-flag) -static) if 64BIT default $(success,$(srctree)/scripts/cc-can-link.sh $(CC) $(CLANG_FLAGS) $(USERCFLAGS) $(USERLDFLAGS) $(m32-flag) -static) +# Fixed in GCC 14, 13.3, 12.4 and 11.5 +# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113921 +config GCC_ASM_GOTO_OUTPUT_BROKEN + bool + depends on CC_IS_GCC + default y if GCC_VERSION < 110500 + default y if GCC_VERSION >= 120000 && GCC_VERSION < 120400 + default y if GCC_VERSION >= 130000 && GCC_VERSION < 130300 + config CC_HAS_ASM_GOTO_OUTPUT - def_bool $(success,echo 'int foo(int x) { asm goto ("": "=r"(x) ::: bar); return x; bar: return 0; }' | $(CC) -x c - -c -o /dev/null) + def_bool y + depends on !GCC_ASM_GOTO_OUTPUT_BROKEN + depends on $(success,echo 'int foo(int x) { asm goto ("": "=r"(x) ::: bar); return x; bar: return 0; }' | $(CC) -x c - -c -o /dev/null) config CC_HAS_ASM_GOTO_TIED_OUTPUT depends on CC_HAS_ASM_GOTO_OUTPUT # Detect buggy gcc and clang, fixed in gcc-11 clang-14. 
- def_bool $(success,echo 'int foo(int *x) { asm goto (".long (%l[bar]) - .\n": "+m"(*x) ::: bar); return *x; bar: return 0; }' | $CC -x c - -c -o /dev/null) + def_bool $(success,echo 'int foo(int *x) { asm goto (".long (%l[bar]) - .": "+m"(*x) ::: bar); return *x; bar: return 0; }' | $CC -x c - -c -o /dev/null) config TOOLS_SUPPORT_RELR def_bool $(success,env "CC=$(CC)" "LD=$(LD)" "NM=$(NM)" "OBJCOPY=$(OBJCOPY)" $(srctree)/scripts/tools-support-relr.sh) diff --git a/kernel.sbat b/kernel.sbat index 3432f3e0b5..761b3e8540 100644 --- a/kernel.sbat +++ b/kernel.sbat @@ -1,3 +1,3 @@ sbat,1,SBAT Version,sbat,1,https://github.com/rhboot/shim/blob/main/SBAT.md -kernel.rhel,1,Red Hat,kernel-core,5.14.0-687.17.1.el9.x86_64,mailto:secalert@redhat.com -kernel.almalinux,1,AlmaLinux,kernel-core,5.14.0-687.17.1.el9.x86_64,mailto:security@almalinux.org +kernel.rhel,1,Red Hat,kernel-core,5.14.0-687.19.1.el9.x86_64,mailto:secalert@redhat.com +kernel.almalinux,1,AlmaLinux,kernel-core,5.14.0-687.19.1.el9.x86_64,mailto:security@almalinux.org diff --git a/kernel/acct.c b/kernel/acct.c index bbea312b9d..b3e00389d4 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -235,7 +235,7 @@ static int acct_on(struct filename *pathname) filp_close(file, NULL); return PTR_ERR(internal); } - err = __mnt_want_write(internal); + err = mnt_get_write_access(internal); if (err) { mntput(internal); kfree(acct); @@ -260,7 +260,7 @@ static int acct_on(struct filename *pathname) old = xchg(&ns->bacct, &acct->pin); mutex_unlock(&acct->lock); pin_kill(old); - __mnt_drop_write(mnt); + mnt_put_write_access(mnt); mntput(mnt); return 0; } diff --git a/kernel/events/core.c b/kernel/events/core.c index 0d3bd850fe..5065087dd2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8953,7 +8953,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) * need to add enough zero bytes after the string to handle * the 64bit alignment we do later. 
*/ - name = file_path(file, buf, PATH_MAX - sizeof(u64)); + name = d_path(file_user_path(file), buf, PATH_MAX - sizeof(u64)); if (IS_ERR(name)) { name = "//toolong"; goto cpy_name; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 16b0970601..97dfac72f1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -11248,6 +11248,21 @@ static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd } } +/* + * This flag serializes load-balancing passes over large domains + * (above the NODE topology level) - only one load-balancing instance + * may run at a time, to reduce overhead on very large systems with + * lots of CPUs and large NUMA distances. + * + * - Note that load-balancing passes triggered while another one + * is executing are skipped and not re-tried. + * + * - Also note that this does not serialize rebalance_domains() + * execution, as non-SD_SERIALIZE domains will still be + * load-balanced in parallel. + */ +static atomic_t sched_balance_running = ATOMIC_INIT(0); + /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. 
@@ -11273,6 +11288,7 @@ static int sched_balance_rq(int this_cpu, struct rq *this_rq, .fbq_type = all, .tasks = LIST_HEAD_INIT(env.tasks), }; + bool need_unlock = false; cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask); @@ -11284,6 +11300,14 @@ redo: goto out_balanced; } + if (!need_unlock && (sd->flags & SD_SERIALIZE)) { + int zero = 0; + if (!atomic_try_cmpxchg_acquire(&sched_balance_running, &zero, 1)) + goto out_balanced; + + need_unlock = true; + } + group = sched_balance_find_src_group(&env); if (!group) { schedstat_inc(sd->lb_nobusyg[idle]); @@ -11524,6 +11548,9 @@ out_one_pinned: sd->balance_interval < sd->max_interval) sd->balance_interval *= 2; out: + if (need_unlock) + atomic_set_release(&sched_balance_running, 0); + return ld_moved; } @@ -11648,21 +11675,6 @@ out_unlock: return 0; } -/* - * This flag serializes load-balancing passes over large domains - * (above the NODE topology level) - only one load-balancing instance - * may run at a time, to reduce overhead on very large systems with - * lots of CPUs and large NUMA distances. - * - * - Note that load-balancing passes triggered while another one - * is executing are skipped and not re-tried. - * - * - Also note that this does not serialize rebalance_domains() - * execution, as non-SD_SERIALIZE domains will still be - * load-balanced in parallel. - */ -static atomic_t sched_balance_running = ATOMIC_INIT(0); - /* * Scale the max sched_balance_rq interval with the number of CPUs in the system. * This trades load-balance latency on larger machines for less cross talk. 
@@ -11718,7 +11730,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle) /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; - int need_serialize, need_decay = 0; + int need_decay = 0; u64 max_cost = 0; rcu_read_lock(); @@ -11742,13 +11754,6 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle) } interval = get_sd_balance_interval(sd, busy); - - need_serialize = sd->flags & SD_SERIALIZE; - if (need_serialize) { - if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1)) - goto out; - } - if (time_after_eq(jiffies, sd->last_balance + interval)) { if (sched_balance_rq(cpu, rq, sd, idle, &continue_balancing)) { /* @@ -11762,9 +11767,6 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle) sd->last_balance = jiffies; interval = get_sd_balance_interval(sd, busy); } - if (need_serialize) - atomic_set_release(&sched_balance_running, 0); -out: if (time_after(next_balance, sd->last_balance + interval)) { next_balance = sd->last_balance + interval; update_next_balance = 1; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 0fda3619c4..6d89bc793c 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -405,7 +405,7 @@ static int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, vmstart = vma->vm_start; } if (file) { - ret = trace_seq_path(s, &file->f_path); + ret = trace_seq_path(s, file_user_path(file)); if (ret) trace_seq_printf(s, "[+0x%lx]", ip - vmstart); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index d553b15a0c..c6aa1e50ed 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -685,6 +685,9 @@ int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type, if (!skb2) return 1; + /* Remove debris left by IPv4 stack. 
*/ + memset(IP6CB(skb2), 0, sizeof(*IP6CB(skb2))); + skb_dst_drop(skb2); skb_pull(skb2, nhs); skb_reset_network_header(skb2); diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index a9f11910a1..014d346851 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -470,9 +470,13 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, if (!ih) goto out_unlock; - if (ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir]) - ct->proto.sctp.init[!dir] = 0; - ct->proto.sctp.init[dir] = 1; + /* Do not record INIT matching peer vtag (stale or retransmitted INIT). */ + if (old_state == SCTP_CONNTRACK_NONE || + ct->proto.sctp.vtag[!dir] != ih->init_tag) { + if (ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir]) + ct->proto.sctp.init[!dir] = 0; + ct->proto.sctp.init[dir] = 1; + } pr_debug("Setting vtag %x for dir %d\n", ih->init_tag, !dir); ct->proto.sctp.vtag[!dir] = ih->init_tag; diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index e59fa3be40..0f3bccc69b 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -13,6 +13,8 @@ #include #include +#define NF_FLOW_RULE_ACTION_MAX 24 + static struct workqueue_struct *nf_flow_offload_add_wq; static struct workqueue_struct *nf_flow_offload_del_wq; static struct workqueue_struct *nf_flow_offload_stats_wq; @@ -208,7 +210,12 @@ static void flow_offload_mangle(struct flow_action_entry *entry, static inline struct flow_action_entry * flow_action_entry_next(struct nf_flow_rule *flow_rule) { - int i = flow_rule->rule->action.num_entries++; + int i; + + if (unlikely(flow_rule->rule->action.num_entries >= NF_FLOW_RULE_ACTION_MAX)) + return NULL; + + i = flow_rule->rule->action.num_entries++; return &flow_rule->rule->action.entries[i]; } @@ -226,6 +233,9 @@ static int flow_offload_eth_src(struct net *net, u32 mask, val; u16 val16; + if (!entry0 || !entry1) + return -E2BIG; 
+ this_tuple = &flow->tuplehash[dir].tuple; switch (this_tuple->xmit_type) { @@ -276,6 +286,9 @@ static int flow_offload_eth_dst(struct net *net, u8 nud_state; u16 val16; + if (!entry0 || !entry1) + return -E2BIG; + this_tuple = &flow->tuplehash[dir].tuple; switch (this_tuple->xmit_type) { @@ -317,16 +330,19 @@ static int flow_offload_eth_dst(struct net *net, return 0; } -static void flow_offload_ipv4_snat(struct net *net, - const struct flow_offload *flow, - enum flow_offload_tuple_dir dir, - struct nf_flow_rule *flow_rule) +static int flow_offload_ipv4_snat(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) { struct flow_action_entry *entry = flow_action_entry_next(flow_rule); u32 mask = ~htonl(0xffffffff); __be32 addr; u32 offset; + if (!entry) + return -E2BIG; + switch (dir) { case FLOW_OFFLOAD_DIR_ORIGINAL: addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4.s_addr; @@ -337,23 +353,27 @@ static void flow_offload_ipv4_snat(struct net *net, offset = offsetof(struct iphdr, daddr); break; default: - return; + return -EOPNOTSUPP; } flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP4, offset, &addr, &mask); + return 0; } -static void flow_offload_ipv4_dnat(struct net *net, - const struct flow_offload *flow, - enum flow_offload_tuple_dir dir, - struct nf_flow_rule *flow_rule) +static int flow_offload_ipv4_dnat(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) { struct flow_action_entry *entry = flow_action_entry_next(flow_rule); u32 mask = ~htonl(0xffffffff); __be32 addr; u32 offset; + if (!entry) + return -E2BIG; + switch (dir) { case FLOW_OFFLOAD_DIR_ORIGINAL: addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4.s_addr; @@ -364,14 +384,15 @@ static void flow_offload_ipv4_dnat(struct net *net, offset = offsetof(struct iphdr, saddr); break; default: - return; + return -EOPNOTSUPP; } flow_offload_mangle(entry, 
FLOW_ACT_MANGLE_HDR_TYPE_IP4, offset, &addr, &mask); + return 0; } -static void flow_offload_ipv6_mangle(struct nf_flow_rule *flow_rule, +static int flow_offload_ipv6_mangle(struct nf_flow_rule *flow_rule, unsigned int offset, const __be32 *addr, const __be32 *mask) { @@ -380,15 +401,20 @@ static void flow_offload_ipv6_mangle(struct nf_flow_rule *flow_rule, for (i = 0; i < sizeof(struct in6_addr) / sizeof(u32); i++) { entry = flow_action_entry_next(flow_rule); + if (!entry) + return -E2BIG; + flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP6, offset + i * sizeof(u32), &addr[i], mask); } + + return 0; } -static void flow_offload_ipv6_snat(struct net *net, - const struct flow_offload *flow, - enum flow_offload_tuple_dir dir, - struct nf_flow_rule *flow_rule) +static int flow_offload_ipv6_snat(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) { u32 mask = ~htonl(0xffffffff); const __be32 *addr; @@ -404,16 +430,16 @@ static void flow_offload_ipv6_snat(struct net *net, offset = offsetof(struct ipv6hdr, daddr); break; default: - return; + return -EOPNOTSUPP; } - flow_offload_ipv6_mangle(flow_rule, offset, addr, &mask); + return flow_offload_ipv6_mangle(flow_rule, offset, addr, &mask); } -static void flow_offload_ipv6_dnat(struct net *net, - const struct flow_offload *flow, - enum flow_offload_tuple_dir dir, - struct nf_flow_rule *flow_rule) +static int flow_offload_ipv6_dnat(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) { u32 mask = ~htonl(0xffffffff); const __be32 *addr; @@ -429,10 +455,10 @@ static void flow_offload_ipv6_dnat(struct net *net, offset = offsetof(struct ipv6hdr, saddr); break; default: - return; + return -EOPNOTSUPP; } - flow_offload_ipv6_mangle(flow_rule, offset, addr, &mask); + return flow_offload_ipv6_mangle(flow_rule, offset, addr, &mask); } static int flow_offload_l4proto(const struct flow_offload 
*flow) @@ -454,15 +480,18 @@ static int flow_offload_l4proto(const struct flow_offload *flow) return type; } -static void flow_offload_port_snat(struct net *net, - const struct flow_offload *flow, - enum flow_offload_tuple_dir dir, - struct nf_flow_rule *flow_rule) +static int flow_offload_port_snat(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) { struct flow_action_entry *entry = flow_action_entry_next(flow_rule); u32 mask, port; u32 offset; + if (!entry) + return -E2BIG; + switch (dir) { case FLOW_OFFLOAD_DIR_ORIGINAL: port = ntohs(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port); @@ -477,22 +506,26 @@ static void flow_offload_port_snat(struct net *net, mask = ~htonl(0xffff); break; default: - return; + return -EOPNOTSUPP; } flow_offload_mangle(entry, flow_offload_l4proto(flow), offset, &port, &mask); + return 0; } -static void flow_offload_port_dnat(struct net *net, - const struct flow_offload *flow, - enum flow_offload_tuple_dir dir, - struct nf_flow_rule *flow_rule) +static int flow_offload_port_dnat(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) { struct flow_action_entry *entry = flow_action_entry_next(flow_rule); u32 mask, port; u32 offset; + if (!entry) + return -E2BIG; + switch (dir) { case FLOW_OFFLOAD_DIR_ORIGINAL: port = ntohs(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port); @@ -507,20 +540,24 @@ static void flow_offload_port_dnat(struct net *net, mask = ~htonl(0xffff0000); break; default: - return; + return -EOPNOTSUPP; } flow_offload_mangle(entry, flow_offload_l4proto(flow), offset, &port, &mask); + return 0; } -static void flow_offload_ipv4_checksum(struct net *net, - const struct flow_offload *flow, - struct nf_flow_rule *flow_rule) +static int flow_offload_ipv4_checksum(struct net *net, + const struct flow_offload *flow, + struct nf_flow_rule *flow_rule) { u8 protonum = 
flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto; struct flow_action_entry *entry = flow_action_entry_next(flow_rule); + if (!entry) + return -E2BIG; + entry->id = FLOW_ACTION_CSUM; entry->csum_flags = TCA_CSUM_UPDATE_FLAG_IPV4HDR; @@ -532,12 +569,14 @@ static void flow_offload_ipv4_checksum(struct net *net, entry->csum_flags |= TCA_CSUM_UPDATE_FLAG_UDP; break; } + + return 0; } -static void flow_offload_redirect(struct net *net, - const struct flow_offload *flow, - enum flow_offload_tuple_dir dir, - struct nf_flow_rule *flow_rule) +static int flow_offload_redirect(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) { const struct flow_offload_tuple *this_tuple, *other_tuple; struct flow_action_entry *entry; @@ -555,21 +594,28 @@ static void flow_offload_redirect(struct net *net, ifindex = other_tuple->iifidx; break; default: - return; + return -EOPNOTSUPP; } dev = dev_get_by_index(net, ifindex); if (!dev) - return; + return -ENODEV; entry = flow_action_entry_next(flow_rule); + if (!entry) { + dev_put(dev); + return -E2BIG; + } + entry->id = FLOW_ACTION_REDIRECT; entry->dev = dev; + + return 0; } -static void flow_offload_encap_tunnel(const struct flow_offload *flow, - enum flow_offload_tuple_dir dir, - struct nf_flow_rule *flow_rule) +static int flow_offload_encap_tunnel(const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) { const struct flow_offload_tuple *this_tuple; struct flow_action_entry *entry; @@ -577,7 +623,7 @@ static void flow_offload_encap_tunnel(const struct flow_offload *flow, this_tuple = &flow->tuplehash[dir].tuple; if (this_tuple->xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) - return; + return 0; dst = this_tuple->dst_cache; if (dst && dst->lwtstate) { @@ -586,15 +632,19 @@ static void flow_offload_encap_tunnel(const struct flow_offload *flow, tun_info = lwt_tun_info(dst->lwtstate); if (tun_info && (tun_info->mode & 
IP_TUNNEL_INFO_TX)) { entry = flow_action_entry_next(flow_rule); + if (!entry) + return -E2BIG; entry->id = FLOW_ACTION_TUNNEL_ENCAP; entry->tunnel = tun_info; } } + + return 0; } -static void flow_offload_decap_tunnel(const struct flow_offload *flow, - enum flow_offload_tuple_dir dir, - struct nf_flow_rule *flow_rule) +static int flow_offload_decap_tunnel(const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) { const struct flow_offload_tuple *other_tuple; struct flow_action_entry *entry; @@ -602,7 +652,7 @@ static void flow_offload_decap_tunnel(const struct flow_offload *flow, other_tuple = &flow->tuplehash[!dir].tuple; if (other_tuple->xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) - return; + return 0; dst = other_tuple->dst_cache; if (dst && dst->lwtstate) { @@ -611,9 +661,13 @@ static void flow_offload_decap_tunnel(const struct flow_offload *flow, tun_info = lwt_tun_info(dst->lwtstate); if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX)) { entry = flow_action_entry_next(flow_rule); + if (!entry) + return -E2BIG; entry->id = FLOW_ACTION_TUNNEL_DECAP; } } + + return 0; } static int @@ -625,8 +679,9 @@ nf_flow_rule_route_common(struct net *net, const struct flow_offload *flow, const struct flow_offload_tuple *tuple; int i; - flow_offload_decap_tunnel(flow, dir, flow_rule); - flow_offload_encap_tunnel(flow, dir, flow_rule); + if (flow_offload_decap_tunnel(flow, dir, flow_rule) < 0 || + flow_offload_encap_tunnel(flow, dir, flow_rule) < 0) + return -1; if (flow_offload_eth_src(net, flow, dir, flow_rule) < 0 || flow_offload_eth_dst(net, flow, dir, flow_rule) < 0) @@ -642,6 +697,8 @@ nf_flow_rule_route_common(struct net *net, const struct flow_offload *flow, if (tuple->encap[i].proto == htons(ETH_P_8021Q)) { entry = flow_action_entry_next(flow_rule); + if (!entry) + return -1; entry->id = FLOW_ACTION_VLAN_POP; } } @@ -655,6 +712,8 @@ nf_flow_rule_route_common(struct net *net, const struct flow_offload *flow, continue; entry = 
flow_action_entry_next(flow_rule); + if (!entry) + return -1; switch (other_tuple->encap[i].proto) { case htons(ETH_P_PPP_SES): @@ -680,18 +739,22 @@ int nf_flow_rule_route_ipv4(struct net *net, struct flow_offload *flow, return -1; if (test_bit(NF_FLOW_SNAT, &flow->flags)) { - flow_offload_ipv4_snat(net, flow, dir, flow_rule); - flow_offload_port_snat(net, flow, dir, flow_rule); + if (flow_offload_ipv4_snat(net, flow, dir, flow_rule) < 0 || + flow_offload_port_snat(net, flow, dir, flow_rule) < 0) + return -1; } if (test_bit(NF_FLOW_DNAT, &flow->flags)) { - flow_offload_ipv4_dnat(net, flow, dir, flow_rule); - flow_offload_port_dnat(net, flow, dir, flow_rule); + if (flow_offload_ipv4_dnat(net, flow, dir, flow_rule) < 0 || + flow_offload_port_dnat(net, flow, dir, flow_rule) < 0) + return -1; } if (test_bit(NF_FLOW_SNAT, &flow->flags) || test_bit(NF_FLOW_DNAT, &flow->flags)) - flow_offload_ipv4_checksum(net, flow, flow_rule); + if (flow_offload_ipv4_checksum(net, flow, flow_rule) < 0) + return -1; - flow_offload_redirect(net, flow, dir, flow_rule); + if (flow_offload_redirect(net, flow, dir, flow_rule) < 0) + return -1; return 0; } @@ -705,22 +768,23 @@ int nf_flow_rule_route_ipv6(struct net *net, struct flow_offload *flow, return -1; if (test_bit(NF_FLOW_SNAT, &flow->flags)) { - flow_offload_ipv6_snat(net, flow, dir, flow_rule); - flow_offload_port_snat(net, flow, dir, flow_rule); + if (flow_offload_ipv6_snat(net, flow, dir, flow_rule) < 0 || + flow_offload_port_snat(net, flow, dir, flow_rule) < 0) + return -1; } if (test_bit(NF_FLOW_DNAT, &flow->flags)) { - flow_offload_ipv6_dnat(net, flow, dir, flow_rule); - flow_offload_port_dnat(net, flow, dir, flow_rule); + if (flow_offload_ipv6_dnat(net, flow, dir, flow_rule) < 0 || + flow_offload_port_dnat(net, flow, dir, flow_rule) < 0) + return -1; } - flow_offload_redirect(net, flow, dir, flow_rule); + if (flow_offload_redirect(net, flow, dir, flow_rule) < 0) + return -1; return 0; } 
EXPORT_SYMBOL_GPL(nf_flow_rule_route_ipv6); -#define NF_FLOW_RULE_ACTION_MAX 16 - static struct nf_flow_rule * nf_flow_offload_rule_alloc(struct net *net, const struct flow_offload_work *offload, diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index cf5683afaf..56a4deb276 100644 --- a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -57,7 +57,7 @@ /* Jump to label if @reg is zero */ #define NFT_PIPAPO_AVX2_NOMATCH_GOTO(reg, label) \ - asm_volatile_goto("vptest %%ymm" #reg ", %%ymm" #reg ";" \ + asm goto("vptest %%ymm" #reg ", %%ymm" #reg ";" \ "je %l[" #label "]" : : : : label) /* Store 256 bits from YMM register into memory. Contrary to bucket load diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 4848d5d50a..488a74d146 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -1554,6 +1554,12 @@ static enum sctp_disposition sctp_sf_do_unexpected_init( /* Tag the variable length parameters. */ chunk->param_hdr.v = skb_pull(chunk->skb, sizeof(struct sctp_inithdr)); + if (asoc->state >= SCTP_STATE_ESTABLISHED) { + /* Discard INIT matching peer vtag after handshake completion (stale INIT). */ + if (ntohl(chunk->subh.init_hdr->init_tag) == asoc->peer.i.init_tag) + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + } + /* Verify the INIT chunk before processing it. */ err_chunk = NULL; if (!sctp_verify_init(net, ep, asoc, chunk->chunk_hdr->type, diff --git a/samples/bpf/asm_goto_workaround.h b/samples/bpf/asm_goto_workaround.h index 7048bb3594..634e81d83e 100644 --- a/samples/bpf/asm_goto_workaround.h +++ b/samples/bpf/asm_goto_workaround.h @@ -4,14 +4,14 @@ #define __ASM_GOTO_WORKAROUND_H /* - * This will bring in asm_volatile_goto and asm_inline macro definitions + * This will bring in asm_goto_output and asm_inline macro definitions * if enabled by compiler and config options. 
*/ #include -#ifdef asm_volatile_goto -#undef asm_volatile_goto -#define asm_volatile_goto(x...) asm volatile("invalid use of asm_volatile_goto") +#ifdef asm_goto_output +#undef asm_goto_output +#define asm_goto_output(x...) asm volatile("invalid use of asm_goto_output") #endif /* diff --git a/security/security.c b/security/security.c index 5660bbab98..27a309ab0b 100644 --- a/security/security.c +++ b/security/security.c @@ -89,6 +89,7 @@ const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { static BLOCKING_NOTIFIER_HEAD(blocking_lsm_notifier_chain); static struct kmem_cache *lsm_file_cache; +static struct kmem_cache *lsm_backing_file_cache; static struct kmem_cache *lsm_inode_cache; char *lsm_names; @@ -260,6 +261,8 @@ static void __init lsm_set_blob_sizes(struct lsm_blob_sizes *needed) lsm_set_blob_size(&needed->lbs_cred, &blob_sizes.lbs_cred); lsm_set_blob_size(&needed->lbs_file, &blob_sizes.lbs_file); + lsm_set_blob_size(&needed->lbs_backing_file, + &blob_sizes.lbs_backing_file); /* * The inode blob gets an rcu_head in addition to * what the modules might need. 
@@ -447,14 +450,15 @@ static void __init ordered_lsm_init(void) report_lsm_order(); - init_debug("cred blob size = %d\n", blob_sizes.lbs_cred); - init_debug("file blob size = %d\n", blob_sizes.lbs_file); - init_debug("inode blob size = %d\n", blob_sizes.lbs_inode); - init_debug("ipc blob size = %d\n", blob_sizes.lbs_ipc); - init_debug("msg_msg blob size = %d\n", blob_sizes.lbs_msg_msg); - init_debug("superblock blob size = %d\n", blob_sizes.lbs_superblock); - init_debug("task blob size = %d\n", blob_sizes.lbs_task); - init_debug("xattr slots = %d\n", blob_sizes.lbs_xattr_count); + init_debug("cred blob size = %d\n", blob_sizes.lbs_cred); + init_debug("file blob size = %d\n", blob_sizes.lbs_file); + init_debug("backing_file blob size = %d\n", blob_sizes.lbs_backing_file); + init_debug("inode blob size = %d\n", blob_sizes.lbs_inode); + init_debug("ipc blob size = %d\n", blob_sizes.lbs_ipc); + init_debug("msg_msg blob size = %d\n", blob_sizes.lbs_msg_msg); + init_debug("superblock blob size = %d\n", blob_sizes.lbs_superblock); + init_debug("task blob size = %d\n", blob_sizes.lbs_task); + init_debug("xattr slots = %d\n", blob_sizes.lbs_xattr_count); /* * Create any kmem_caches needed for blobs @@ -463,6 +467,11 @@ static void __init ordered_lsm_init(void) lsm_file_cache = kmem_cache_create("lsm_file_cache", blob_sizes.lbs_file, 0, SLAB_PANIC, NULL); + if (blob_sizes.lbs_backing_file) + lsm_backing_file_cache = kmem_cache_create( + "lsm_backing_file_cache", + blob_sizes.lbs_backing_file, + 0, SLAB_PANIC, NULL); if (blob_sizes.lbs_inode) lsm_inode_cache = kmem_cache_create("lsm_inode_cache", blob_sizes.lbs_inode, 0, @@ -644,6 +653,53 @@ int unregister_blocking_lsm_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_blocking_lsm_notifier); +/** + * lsm_backing_file_alloc - allocate a composite backing file blob + * @backing_file: the backing file + * + * Allocate the backing file blob for all the modules. 
+ * + * Returns 0, or -ENOMEM if memory can't be allocated. + */ +static int lsm_backing_file_alloc(struct file *backing_file) +{ + void *blob; + + if (!lsm_backing_file_cache) { + backing_file_set_security(backing_file, NULL); + return 0; + } + + blob = kmem_cache_zalloc(lsm_backing_file_cache, GFP_KERNEL); + backing_file_set_security(backing_file, blob); + if (!blob) + return -ENOMEM; + return 0; +} + +/** + * lsm_blob_alloc - allocate a composite blob + * @dest: the destination for the blob + * @size: the size of the blob + * @gfp: allocation type + * + * Allocate a blob for all the modules + * + * Returns 0, or -ENOMEM if memory can't be allocated. + */ +static int lsm_blob_alloc(void **dest, size_t size, gfp_t gfp) +{ + if (size == 0) { + *dest = NULL; + return 0; + } + + *dest = kzalloc(size, gfp); + if (*dest == NULL) + return -ENOMEM; + return 0; +} + /** * lsm_cred_alloc - allocate a composite cred blob * @cred: the cred that needs a blob @@ -655,15 +711,7 @@ EXPORT_SYMBOL(unregister_blocking_lsm_notifier); */ static int lsm_cred_alloc(struct cred *cred, gfp_t gfp) { - if (blob_sizes.lbs_cred == 0) { - cred->security = NULL; - return 0; - } - - cred->security = kzalloc(blob_sizes.lbs_cred, gfp); - if (cred->security == NULL) - return -ENOMEM; - return 0; + return lsm_blob_alloc(&cred->security, blob_sizes.lbs_cred, gfp); } /** @@ -732,15 +780,7 @@ static int lsm_inode_alloc(struct inode *inode) */ static int lsm_task_alloc(struct task_struct *task) { - if (blob_sizes.lbs_task == 0) { - task->security = NULL; - return 0; - } - - task->security = kzalloc(blob_sizes.lbs_task, GFP_KERNEL); - if (task->security == NULL) - return -ENOMEM; - return 0; + return lsm_blob_alloc(&task->security, blob_sizes.lbs_task, GFP_KERNEL); } /** @@ -753,15 +793,7 @@ static int lsm_task_alloc(struct task_struct *task) */ static int lsm_ipc_alloc(struct kern_ipc_perm *kip) { - if (blob_sizes.lbs_ipc == 0) { - kip->security = NULL; - return 0; - } - - kip->security = 
kzalloc(blob_sizes.lbs_ipc, GFP_KERNEL); - if (kip->security == NULL) - return -ENOMEM; - return 0; + return lsm_blob_alloc(&kip->security, blob_sizes.lbs_ipc, GFP_KERNEL); } /** @@ -774,15 +806,8 @@ static int lsm_ipc_alloc(struct kern_ipc_perm *kip) */ static int lsm_msg_msg_alloc(struct msg_msg *mp) { - if (blob_sizes.lbs_msg_msg == 0) { - mp->security = NULL; - return 0; - } - - mp->security = kzalloc(blob_sizes.lbs_msg_msg, GFP_KERNEL); - if (mp->security == NULL) - return -ENOMEM; - return 0; + return lsm_blob_alloc(&mp->security, blob_sizes.lbs_msg_msg, + GFP_KERNEL); } /** @@ -809,15 +834,8 @@ static void __init lsm_early_task(struct task_struct *task) */ static int lsm_superblock_alloc(struct super_block *sb) { - if (blob_sizes.lbs_superblock == 0) { - sb->s_security = NULL; - return 0; - } - - sb->s_security = kzalloc(blob_sizes.lbs_superblock, GFP_KERNEL); - if (sb->s_security == NULL) - return -ENOMEM; - return 0; + return lsm_blob_alloc(&sb->s_security, blob_sizes.lbs_superblock, + GFP_KERNEL); } /* @@ -887,25 +905,25 @@ OUT: \ /* Security operations */ -int security_binder_set_context_mgr(struct task_struct *mgr) +int security_binder_set_context_mgr(const struct cred *mgr) { return call_int_hook(binder_set_context_mgr, mgr); } -int security_binder_transaction(struct task_struct *from, - struct task_struct *to) +int security_binder_transaction(const struct cred *from, + const struct cred *to) { return call_int_hook(binder_transaction, from, to); } -int security_binder_transfer_binder(struct task_struct *from, - struct task_struct *to) +int security_binder_transfer_binder(const struct cred *from, + const struct cred *to) { return call_int_hook(binder_transfer_binder, from, to); } -int security_binder_transfer_file(struct task_struct *from, - struct task_struct *to, struct file *file) +int security_binder_transfer_file(const struct cred *from, + const struct cred *to, const struct file *file) { return call_int_hook(binder_transfer_file, from, to, file); 
} @@ -1704,6 +1722,57 @@ void security_file_free(struct file *file) } } +/** + * security_backing_file_alloc() - Allocate and setup a backing file blob + * @backing_file: the backing file + * @user_file: the associated user visible file + * + * Allocate a backing file LSM blob and perform any necessary initialization of + * the LSM blob. There will be some operations where the LSM will not have + * access to @user_file after this point, so any important state associated + * with @user_file that is important to the LSM should be captured in the + * backing file's LSM blob. + * + * LSM's should avoid taking a reference to @user_file in this hook as it will + * result in problems later when the system attempts to drop/put the file + * references due to a circular dependency. + * + * Return: Return 0 if the hook is successful, negative values otherwise. + */ +int security_backing_file_alloc(struct file *backing_file, + const struct file *user_file) +{ + int rc; + + rc = lsm_backing_file_alloc(backing_file); + if (rc) + return rc; + rc = call_int_hook(backing_file_alloc, backing_file, user_file); + if (unlikely(rc)) + security_backing_file_free(backing_file); + + return rc; +} + +/** + * security_backing_file_free() - Free a backing file blob + * @backing_file: the backing file + * + * Free any LSM state associate with a backing file's LSM blob, including the + * blob itself. 
+ */ +void security_backing_file_free(struct file *backing_file) +{ + void *blob = backing_file_security(backing_file); + + call_void_hook(backing_file_free, backing_file); + + if (blob) { + backing_file_set_security(backing_file, NULL); + kmem_cache_free(lsm_backing_file_cache, blob); + } +} + int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return call_int_hook(file_ioctl, file, cmd, arg); @@ -1772,6 +1841,32 @@ int security_mmap_file(struct file *file, unsigned long prot, return ima_file_mmap(file, prot); } +/** + * security_mmap_backing_file - Check if mmap'ing a backing file is allowed + * @vma: the vm_area_struct for the mmap'd region + * @backing_file: the backing file being mmap'd + * @user_file: the user file being mmap'd + * + * Check permissions for a mmap operation on a stacked filesystem. This hook + * is called after the security_mmap_file() and is responsible for authorizing + * the mmap on @backing_file. It is important to note that the mmap operation + * on @user_file has already been authorized and the @vma->vm_file has been + * set to @backing_file. + * + * Return: Returns 0 if permission is granted. 
+ */ +int security_mmap_backing_file(struct vm_area_struct *vma, + struct file *backing_file, + struct file *user_file) +{ + /* recommended by the stackable filesystem devs */ + if (WARN_ON_ONCE(!(backing_file->f_mode & FMODE_BACKING))) + return -EIO; + + return call_int_hook(mmap_backing_file, vma, backing_file, user_file); +} +EXPORT_SYMBOL_GPL(security_mmap_backing_file); + int security_mmap_addr(unsigned long addr) { return call_int_hook(mmap_addr, addr); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 9a69a2a4b3..b31b06d444 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -245,29 +245,6 @@ static inline u32 task_sid_obj(const struct task_struct *task) return sid; } -/* - * get the security ID of a task for use with binder - */ -static inline u32 task_sid_binder(const struct task_struct *task) -{ - /* - * In many case where this function is used we should be using the - * task's subjective SID, but we can't reliably access the subjective - * creds of a task other than our own so we must use the objective - * creds/SID, which are safe to access. The downside is that if a task - * is temporarily overriding it's creds it will not be reflected here; - * however, it isn't clear that binder would handle that case well - * anyway. - * - * If this ever changes and we can safely reference the subjective - * creds/SID of another task, this function will make it easier to - * identify the various places where we make use of the task SIDs in - * the binder code. It is also likely that we will need to adjust - * the main drivers/android binder code as well. 
- */ - return task_sid_obj(task); -} - static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dentry); /* @@ -1726,9 +1703,67 @@ static inline int file_path_has_perm(const struct cred *cred, } #ifdef CONFIG_BPF_SYSCALL -static int bpf_fd_pass(struct file *file, u32 sid); +static int bpf_fd_pass(const struct file *file, u32 sid); #endif +static int __file_has_perm(const struct cred *cred, const struct file *file, + u32 av, bool bf_user_file) + +{ + struct common_audit_data ad; + struct inode *inode; + u32 ssid = cred_sid(cred); + u32 tsid_fd; + int rc; + + if (bf_user_file) { + struct backing_file_security_struct *bfsec; + const struct path *path; + + if (WARN_ON(!(file->f_mode & FMODE_BACKING))) + return -EIO; + + bfsec = selinux_backing_file(file); + path = backing_file_user_path(file); + tsid_fd = bfsec->uf_sid; + inode = d_inode(path->dentry); + + ad.type = LSM_AUDIT_DATA_PATH; + ad.u.path = *path; + } else { + struct file_security_struct *fsec = selinux_file(file); + + tsid_fd = fsec->sid; + inode = file_inode(file); + + ad.type = LSM_AUDIT_DATA_FILE; + ad.u.file = file; + } + + if (ssid != tsid_fd) { + rc = avc_has_perm(&selinux_state, + ssid, tsid_fd, + SECCLASS_FD, + FD__USE, + &ad); + if (rc) + return rc; + } + +#ifdef CONFIG_BPF_SYSCALL + /* regardless of backing vs user file, use the underlying file here */ + rc = bpf_fd_pass(file, ssid); + if (rc) + return rc; +#endif + + /* av is zero if only checking access to the descriptor. */ + if (av) + return inode_has_perm(cred, inode, av, &ad); + + return 0; +} + /* Check whether a task can use an open file descriptor to access an inode in a given way. Check access to the descriptor itself, and then use dentry_has_perm to @@ -1737,42 +1772,10 @@ static int bpf_fd_pass(struct file *file, u32 sid); has the same SID as the process. If av is zero, then access to the file is not checked, e.g. for cases where only the descriptor is affected like seek. 
*/ -static int file_has_perm(const struct cred *cred, - struct file *file, - u32 av) +static inline int file_has_perm(const struct cred *cred, + const struct file *file, u32 av) { - struct file_security_struct *fsec = selinux_file(file); - struct inode *inode = file_inode(file); - struct common_audit_data ad; - u32 sid = cred_sid(cred); - int rc; - - ad.type = LSM_AUDIT_DATA_FILE; - ad.u.file = file; - - if (sid != fsec->sid) { - rc = avc_has_perm(&selinux_state, - sid, fsec->sid, - SECCLASS_FD, - FD__USE, - &ad); - if (rc) - goto out; - } - -#ifdef CONFIG_BPF_SYSCALL - rc = bpf_fd_pass(file, cred_sid(cred)); - if (rc) - return rc; -#endif - - /* av is zero if only checking access to the descriptor. */ - rc = 0; - if (av) - rc = inode_has_perm(cred, inode, av, &ad); - -out: - return rc; + return __file_has_perm(cred, file, av, false); } /* @@ -1999,7 +2002,7 @@ static inline u32 file_mask_to_av(int mode, int mask) } /* Convert a Linux file to an access vector. */ -static inline u32 file_to_av(struct file *file) +static inline u32 file_to_av(const struct file *file) { u32 av = 0; @@ -2039,18 +2042,19 @@ static inline u32 open_file_to_av(struct file *file) /* Hook functions begin here. 
*/ -static int selinux_binder_set_context_mgr(struct task_struct *mgr) +static int selinux_binder_set_context_mgr(const struct cred *mgr) { return avc_has_perm(&selinux_state, - current_sid(), task_sid_binder(mgr), SECCLASS_BINDER, + current_sid(), cred_sid(mgr), SECCLASS_BINDER, BINDER__SET_CONTEXT_MGR, NULL); } -static int selinux_binder_transaction(struct task_struct *from, - struct task_struct *to) +static int selinux_binder_transaction(const struct cred *from, + const struct cred *to) { u32 mysid = current_sid(); - u32 fromsid = task_sid_binder(from); + u32 fromsid = cred_sid(from); + u32 tosid = cred_sid(to); int rc; if (mysid != fromsid) { @@ -2061,24 +2065,24 @@ static int selinux_binder_transaction(struct task_struct *from, return rc; } - return avc_has_perm(&selinux_state, fromsid, task_sid_binder(to), + return avc_has_perm(&selinux_state, fromsid, tosid, SECCLASS_BINDER, BINDER__CALL, NULL); } -static int selinux_binder_transfer_binder(struct task_struct *from, - struct task_struct *to) +static int selinux_binder_transfer_binder(const struct cred *from, + const struct cred *to) { return avc_has_perm(&selinux_state, - task_sid_binder(from), task_sid_binder(to), + cred_sid(from), cred_sid(to), SECCLASS_BINDER, BINDER__TRANSFER, NULL); } -static int selinux_binder_transfer_file(struct task_struct *from, - struct task_struct *to, - struct file *file) +static int selinux_binder_transfer_file(const struct cred *from, + const struct cred *to, + const struct file *file) { - u32 sid = task_sid_binder(to); + u32 sid = cred_sid(to); struct file_security_struct *fsec = selinux_file(file); struct dentry *dentry = file->f_path.dentry; struct inode_security_struct *isec; @@ -3668,6 +3672,17 @@ static int selinux_file_alloc_security(struct file *file) return 0; } +static int selinux_backing_file_alloc(struct file *backing_file, + const struct file *user_file) +{ + struct backing_file_security_struct *bfsec; + + bfsec = selinux_backing_file(backing_file); + bfsec->uf_sid 
= selinux_file(user_file)->sid; + + return 0; +} + /* * Check whether a task has the ioctl permission and cmd * operation to an inode. @@ -3781,43 +3796,56 @@ static int selinux_file_ioctl_compat(struct file *file, unsigned int cmd, static int default_noexec __ro_after_init; -static int file_map_prot_check(struct file *file, unsigned long prot, int shared) +static int __file_map_prot_check(const struct cred *cred, + const struct file *file, unsigned long prot, + bool shared, bool mounter, bool bf_user_file) { - const struct cred *cred = current_cred(); - u32 sid = cred_sid(cred); - int rc = 0; + struct inode *inode = NULL; + bool prot_exec = prot & PROT_EXEC; + bool prot_write = prot & PROT_WRITE; + + if (file) { + if (bf_user_file) + inode = d_inode(backing_file_user_path(file)->dentry); + else + inode = file_inode(file); + } + + if (default_noexec && prot_exec && + (!file || IS_PRIVATE(inode) || (!shared && prot_write)) && !mounter) { + int rc; + u32 sid = cred_sid(cred); - if (default_noexec && - (prot & PROT_EXEC) && (!file || IS_PRIVATE(file_inode(file)) || - (!shared && (prot & PROT_WRITE)))) { /* - * We are making executable an anonymous mapping or a - * private file mapping that will also be writable. - * This has an additional check. + * We are making executable an anonymous mapping or a private + * file mapping that will also be writable. 
*/ rc = avc_has_perm(&selinux_state, - sid, sid, SECCLASS_PROCESS, - PROCESS__EXECMEM, NULL); + sid, sid, SECCLASS_PROCESS, PROCESS__EXECMEM, + NULL); if (rc) - goto error; + return rc; } if (file) { - /* read access is always possible with a mapping */ + /* "read" always possible, "write" only if shared */ u32 av = FILE__READ; - - /* write access only matters if the mapping is shared */ - if (shared && (prot & PROT_WRITE)) + if (shared && prot_write) av |= FILE__WRITE; - - if (prot & PROT_EXEC) + if (prot_exec) av |= FILE__EXECUTE; - return file_has_perm(cred, file, av); + return __file_has_perm(cred, file, av, bf_user_file); } -error: - return rc; + return 0; +} + +static inline int file_map_prot_check(const struct cred *cred, + const struct file *file, + unsigned long prot, bool shared, bool mounter) +{ + return __file_map_prot_check(cred, file, prot, shared, mounter, false); } static int selinux_mmap_addr(unsigned long addr) @@ -3834,17 +3862,17 @@ static int selinux_mmap_addr(unsigned long addr) return rc; } -static int selinux_mmap_file(struct file *file, unsigned long reqprot, - unsigned long prot, unsigned long flags) +static int selinux_mmap_file_common(const struct cred *cred, struct file *file, + unsigned long reqprot, unsigned long prot, + bool shared, bool mounter) { - struct common_audit_data ad; - int rc; - if (file) { + int rc; + struct common_audit_data ad; + ad.type = LSM_AUDIT_DATA_FILE; ad.u.file = file; - rc = inode_has_perm(current_cred(), file_inode(file), - FILE__MAP, &ad); + rc = inode_has_perm(cred, file_inode(file), FILE__MAP, &ad); if (rc) return rc; } @@ -3852,23 +3880,70 @@ static int selinux_mmap_file(struct file *file, unsigned long reqprot, if (checkreqprot_get(&selinux_state)) prot = reqprot; - return file_map_prot_check(file, prot, - (flags & MAP_TYPE) == MAP_SHARED); + return file_map_prot_check(cred, file, prot, shared, mounter); +} + +static int selinux_mmap_file(struct file *file, unsigned long reqprot, + unsigned long prot, 
unsigned long flags) +{ + return selinux_mmap_file_common(current_cred(), file, reqprot, prot, + (flags & MAP_TYPE) == MAP_SHARED, + false); +} + +/** + * selinux_mmap_backing_file - Check mmap permissions on a backing file + * @vma: memory region + * @backing_file: stacked filesystem backing file + * @user_file: user visible file + * + * This is called after selinux_mmap_file() on stacked filesystems, and it + * is this function's responsibility to verify access to @backing_file and + * setup the SELinux state for possible later use in the mprotect() code path. + * + * By the time this function is called, mmap() access to @user_file has already + * been authorized and @vma->vm_file has been set to point to @backing_file. + * + * Return zero on success, negative values otherwise. + */ +static int selinux_mmap_backing_file(struct vm_area_struct *vma, + struct file *backing_file, + struct file *user_file __always_unused) +{ + unsigned long prot = 0; + + /* translate vma->vm_flags perms into PROT perms */ + if (vma->vm_flags & VM_READ) + prot |= PROT_READ; + if (vma->vm_flags & VM_WRITE) + prot |= PROT_WRITE; + if (vma->vm_flags & VM_EXEC) + prot |= PROT_EXEC; + + return selinux_mmap_file_common(backing_file->f_cred, backing_file, + prot, prot, vma->vm_flags & VM_SHARED, + true); } static int selinux_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot) { + int rc; const struct cred *cred = current_cred(); u32 sid = cred_sid(cred); + const struct file *file = vma->vm_file; + bool backing_file; + bool shared = vma->vm_flags & VM_SHARED; + + /* check if we need to trigger the "backing files are awful" mode */ + backing_file = file && (file->f_mode & FMODE_BACKING); if (checkreqprot_get(&selinux_state)) prot = reqprot; if (default_noexec && (prot & PROT_EXEC) && !(vma->vm_flags & VM_EXEC)) { - int rc = 0; /* * We don't use the vma_is_initial_heap() helper as it has * a history of problems and is currently broken on systems @@ -3883,12 
+3958,16 @@ static int selinux_file_mprotect(struct vm_area_struct *vma, rc = avc_has_perm(&selinux_state, sid, sid, SECCLASS_PROCESS, PROCESS__EXECHEAP, NULL); - } else if (!vma->vm_file && (vma_is_initial_stack(vma) || + if (rc) + return rc; + } else if (!file && (vma_is_initial_stack(vma) || vma_is_stack_for_current(vma))) { rc = avc_has_perm(&selinux_state, sid, sid, SECCLASS_PROCESS, PROCESS__EXECSTACK, NULL); - } else if (vma->vm_file && vma->anon_vma) { + if (rc) + return rc; + } else if (file && vma->anon_vma) { /* * We are making executable a file mapping that has * had some COW done. Since pages might have been @@ -3896,13 +3975,29 @@ static int selinux_file_mprotect(struct vm_area_struct *vma, * modified content. This typically should only * occur for text relocations. */ - rc = file_has_perm(cred, vma->vm_file, FILE__EXECMOD); + rc = __file_has_perm(cred, file, FILE__EXECMOD, + backing_file); + if (rc) + return rc; + if (backing_file) { + rc = file_has_perm(file->f_cred, file, + FILE__EXECMOD); + if (rc) + return rc; + } } + } + + rc = __file_map_prot_check(cred, file, prot, shared, false, backing_file); + if (rc) + return rc; + if (backing_file) { + rc = file_map_prot_check(file->f_cred, file, prot, shared, true); if (rc) return rc; } - return file_map_prot_check(vma->vm_file, prot, vma->vm_flags&VM_SHARED); + return 0; } static int selinux_file_lock(struct file *file, unsigned int cmd) @@ -6907,7 +7002,7 @@ static u32 bpf_map_fmode_to_av(fmode_t fmode) * access the bpf object and that's why we have to add this additional check in * selinux_file_receive and selinux_binder_transfer_files. 
*/ -static int bpf_fd_pass(struct file *file, u32 sid) +static int bpf_fd_pass(const struct file *file, u32 sid) { struct bpf_security_struct *bpfsec; struct bpf_prog *prog; @@ -7029,6 +7124,7 @@ static void selinux_bpf_token_free(struct bpf_token *token) struct lsm_blob_sizes selinux_blob_sizes __ro_after_init = { .lbs_cred = sizeof(struct task_security_struct), .lbs_file = sizeof(struct file_security_struct), + .lbs_backing_file = sizeof(struct backing_file_security_struct), .lbs_inode = sizeof(struct inode_security_struct), .lbs_ipc = sizeof(struct ipc_security_struct), .lbs_msg_msg = sizeof(struct msg_security_struct), @@ -7238,9 +7334,11 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = { LSM_HOOK_INIT(file_permission, selinux_file_permission), LSM_HOOK_INIT(file_alloc_security, selinux_file_alloc_security), + LSM_HOOK_INIT(backing_file_alloc, selinux_backing_file_alloc), LSM_HOOK_INIT(file_ioctl, selinux_file_ioctl), LSM_HOOK_INIT(file_ioctl_compat, selinux_file_ioctl_compat), LSM_HOOK_INIT(mmap_file, selinux_mmap_file), + LSM_HOOK_INIT(mmap_backing_file, selinux_mmap_backing_file), LSM_HOOK_INIT(mmap_addr, selinux_mmap_addr), LSM_HOOK_INIT(file_mprotect, selinux_file_mprotect), LSM_HOOK_INIT(file_lock, selinux_file_lock), diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h index 2953132408..b1c5a2877f 100644 --- a/security/selinux/include/objsec.h +++ b/security/selinux/include/objsec.h @@ -60,6 +60,10 @@ struct file_security_struct { u32 pseqno; /* Policy seqno at the time of file open */ }; +struct backing_file_security_struct { + u32 uf_sid; /* associated user file fsec->sid */ +}; + struct superblock_security_struct { u32 sid; /* SID of file system superblock */ u32 def_sid; /* default SID for labeling */ @@ -158,6 +162,13 @@ static inline struct file_security_struct *selinux_file(const struct file *file) return file->f_security + selinux_blob_sizes.lbs_file; } +static inline struct 
backing_file_security_struct * +selinux_backing_file(const struct file *backing_file) +{ + void *blob = backing_file_security(backing_file); + return blob + selinux_blob_sizes.lbs_backing_file; +} + static inline struct inode_security_struct *selinux_inode( const struct inode *inode) { diff --git a/sound/drivers/aloop.c b/sound/drivers/aloop.c index db137222d3..d2b9160a08 100644 --- a/sound/drivers/aloop.c +++ b/sound/drivers/aloop.c @@ -99,6 +99,9 @@ struct loopback_ops { struct loopback_cable { spinlock_t lock; struct loopback_pcm *streams[2]; + /* in-flight peer stops running outside cable->lock */ + atomic_t stop_count; + wait_queue_head_t stop_wait; struct snd_pcm_hardware hw; /* flags */ unsigned int valid; @@ -366,8 +369,11 @@ static int loopback_check_format(struct loopback_cable *cable, int stream) return 0; if (stream == SNDRV_PCM_STREAM_CAPTURE) return -EIO; - else if (cruntime->state == SNDRV_PCM_STATE_RUNNING) + else if (cruntime->state == SNDRV_PCM_STATE_RUNNING) { + /* close must not free the peer runtime below */ + atomic_inc(&cable->stop_count); stop_capture = true; + } } setup = get_setup(dpcm_play); @@ -396,8 +402,11 @@ static int loopback_check_format(struct loopback_cable *cable, int stream) } } - if (stop_capture) + if (stop_capture) { snd_pcm_stop(dpcm_capt->substream, SNDRV_PCM_STATE_DRAINING); + if (atomic_dec_and_test(&cable->stop_count)) + wake_up(&cable->stop_wait); + } return 0; } @@ -1049,23 +1058,29 @@ static void free_cable(struct snd_pcm_substream *substream) struct loopback *loopback = substream->private_data; int dev = get_cable_index(substream); struct loopback_cable *cable; + struct loopback_pcm *dpcm; + bool other_alive; cable = loopback->cables[substream->number][dev]; if (!cable) return; - if (cable->streams[!substream->stream]) { - /* other stream is still alive */ - guard(spinlock_irq)(&cable->lock); - cable->streams[substream->stream] = NULL; - } else { - struct loopback_pcm *dpcm = substream->runtime->private_data; - if 
(cable->ops && cable->ops->close_cable && dpcm) - cable->ops->close_cable(dpcm); - /* free the cable */ - loopback->cables[substream->number][dev] = NULL; - kfree(cable); + scoped_guard(spinlock_irq, &cable->lock) { + cable->streams[substream->stream] = NULL; + other_alive = cable->streams[!substream->stream]; } + + /* Pair with the stop_count increment in loopback_check_format(). */ + wait_event(cable->stop_wait, !atomic_read(&cable->stop_count)); + if (other_alive) + return; + + dpcm = substream->runtime->private_data; + if (cable->ops && cable->ops->close_cable && dpcm) + cable->ops->close_cable(dpcm); + /* free the cable */ + loopback->cables[substream->number][dev] = NULL; + kfree(cable); } static int loopback_jiffies_timer_open(struct loopback_pcm *dpcm) @@ -1260,6 +1275,8 @@ static int loopback_open(struct snd_pcm_substream *substream) goto unlock; } spin_lock_init(&cable->lock); + atomic_set(&cable->stop_count, 0); + init_waitqueue_head(&cable->stop_wait); cable->hw = loopback_pcm_hardware; if (loopback->timer_source) cable->ops = &loopback_snd_timer_ops; diff --git a/tools/arch/x86/include/asm/rmwcc.h b/tools/arch/x86/include/asm/rmwcc.h index 11ff975242..e2ff22b379 100644 --- a/tools/arch/x86/include/asm/rmwcc.h +++ b/tools/arch/x86/include/asm/rmwcc.h @@ -4,7 +4,7 @@ #define __GEN_RMWcc(fullop, var, cc, ...) \ do { \ - asm_volatile_goto (fullop "; j" cc " %l[cc_label]" \ + asm goto (fullop "; j" cc " %l[cc_label]" \ : : "m" (var), ## __VA_ARGS__ \ : "memory" : cc_label); \ return 0; \ diff --git a/tools/include/linux/compiler_types.h b/tools/include/linux/compiler_types.h index 1bdd834bdd..d09f9dc172 100644 --- a/tools/include/linux/compiler_types.h +++ b/tools/include/linux/compiler_types.h @@ -36,8 +36,8 @@ #include #endif -#ifndef asm_volatile_goto -#define asm_volatile_goto(x...) asm goto(x) +#ifndef asm_goto_output +#define asm_goto_output(x...) asm goto(x) #endif #endif /* __LINUX_COMPILER_TYPES_H */