51ea11f059
This is an automated DistroBaker update from upstream sources. If you do not know what this is about or would like to opt out, contact the OSCI team. Source: https://src.fedoraproject.org/rpms/openmpi.git#8fc779b59b264ddd141f811bbb22d2e83ee3e517
From 31068e063b8795ae11f3a59d4080db1fe111cfaf Mon Sep 17 00:00:00 2001
From: George Bosilca <bosilca@icl.utk.edu>
Date: Mon, 28 Dec 2020 15:36:05 -0500
Subject: [PATCH 1/3] Major update to the AVX* detection and support

1. Consistent march flag order between configure and make.

2. op/avx: give the option to skip some tests

It is possible to skip individual intrinsic tests by setting the corresponding environment variable to "no" before invoking configure (an example follows the list):
- ompi_cv_op_avx_check_avx512
- ompi_cv_op_avx_check_avx2
- ompi_cv_op_avx_check_avx
- ompi_cv_op_avx_check_sse41
- ompi_cv_op_avx_check_sse3
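
[Editorial sketch, not part of the patch: a hypothetical configure invocation that pre-seeds the cache to skip the AVX512 and AVX2 probes; the prefix and any other options are placeholders.]

# Skip the AVX512 and AVX2 intrinsic probes; the remaining checks run as usual.
ompi_cv_op_avx_check_avx512=no \
ompi_cv_op_avx_check_avx2=no \
./configure --prefix=/opt/openmpi

Because these are AC_CACHE_CHECK cache variables, setting them to "no" in the environment makes configure take the cached answer instead of running the corresponding link tests.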

3. op/avx: update AVX512 flags

try
-mavx512f -mavx512bw -mavx512vl -mavx512dq
instead of
-march=skylake-avx512

since the former is less likely to conflict with user-provided CFLAGS
(e.g. -march=...); a rough illustration follows.
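
[Editorial sketch, assuming gcc-style flag handling; the file name is a placeholder.]

# Two -march values compete; the later, user-provided one typically wins,
# silently replacing the ISA selection made by configure:
gcc -march=skylake-avx512 -march=x86-64 -c avx_test.c

# Explicit feature flags typically remain in effect alongside a generic
# user-provided -march instead of being replaced by it:
gcc -mavx512f -mavx512bw -mavx512vl -mavx512dq -march=x86-64 -c avx_test.c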

Thanks to Bart Oldeman for pointing this out.

4. op/avx: have the op/avx library depend on libmpi.so

Refs. open-mpi/ompi#8323

Signed-off-by: Gilles Gouaillardet <gilles@rist.or.jp>
Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
---
ompi/mca/op/avx/Makefile.am  |   4 +-
ompi/mca/op/avx/configure.m4 | 325 ++++++++++++++++++-----------------
2 files changed, 174 insertions(+), 155 deletions(-)

diff --git a/ompi/mca/op/avx/Makefile.am b/ompi/mca/op/avx/Makefile.am
index 41dcf2e1834..b1d84d90b33 100644
--- a/ompi/mca/op/avx/Makefile.am
+++ b/ompi/mca/op/avx/Makefile.am
@@ -2,7 +2,7 @@
# Copyright (c) 2019-2020 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
-# Copyright (c) 2020 Research Organization for Information Science
+# Copyright (c) 2020-2021 Research Organization for Information Science
# and Technology (RIST). All rights reserved.
# $COPYRIGHT$
#
@@ -86,7 +86,7 @@ mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_op_avx_la_SOURCES = $(sources)
mca_op_avx_la_LIBADD = $(specialized_op_libs)
-mca_op_avx_la_LDFLAGS = -module -avoid-version
+mca_op_avx_la_LDFLAGS = -module -avoid-version $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la


# Specific information for static builds.
diff --git a/ompi/mca/op/avx/configure.m4 b/ompi/mca/op/avx/configure.m4
index 09d8b374c8e..f61b7100ef4 100644
--- a/ompi/mca/op/avx/configure.m4
+++ b/ompi/mca/op/avx/configure.m4
@@ -29,6 +29,13 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
op_avx_support=0
op_avx2_support=0
op_avx512_support=0
+
+ AS_VAR_PUSHDEF([op_avx_check_sse3], [ompi_cv_op_avx_check_sse3])
+ AS_VAR_PUSHDEF([op_avx_check_sse41], [ompi_cv_op_avx_check_sse41])
+ AS_VAR_PUSHDEF([op_avx_check_avx], [ompi_cv_op_avx_check_avx])
+ AS_VAR_PUSHDEF([op_avx_check_avx2], [ompi_cv_op_avx_check_avx2])
+ AS_VAR_PUSHDEF([op_avx_check_avx512], [ompi_cv_op_avx_check_avx512])
+
OPAL_VAR_SCOPE_PUSH([op_avx_cflags_save])

AS_IF([test "$opal_cv_asm_arch" = "X86_64"],
@@ -37,21 +44,9 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
#
# Check for AVX512 support
#
- AC_MSG_CHECKING([for AVX512 support (no additional flags)])
- AC_LINK_IFELSE(
- [AC_LANG_PROGRAM([[#include <immintrin.h>]],
- [[
- __m512 vA, vB;
- _mm512_add_ps(vA, vB)
- ]])],
- [op_avx512_support=1
- AC_MSG_RESULT([yes])],
- [AC_MSG_RESULT([no])])
-
- AS_IF([test $op_avx512_support -eq 0],
- [AC_MSG_CHECKING([for AVX512 support (with -march=skylake-avx512)])
- op_avx_cflags_save="$CFLAGS"
- CFLAGS="$CFLAGS -march=skylake-avx512"
+ AC_CACHE_CHECK([if we are checking for AVX512 support], op_avx_check_avx512, AS_VAR_SET(op_avx_check_avx512, yes))
+ AS_IF([test "$op_avx_check_avx512" = "yes"],
+ [AC_MSG_CHECKING([for AVX512 support (no additional flags)])
AC_LINK_IFELSE(
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
[[
@@ -59,99 +54,115 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
_mm512_add_ps(vA, vB)
]])],
[op_avx512_support=1
- MCA_BUILD_OP_AVX512_FLAGS="-march=skylake-avx512"
AC_MSG_RESULT([yes])],
[AC_MSG_RESULT([no])])
- CFLAGS="$op_avx_cflags_save"
- ])
- #
- # Some combination of gcc and older as would not correctly build the code generated by
- # _mm256_loadu_si256. Screen them out.
- #
- AS_IF([test $op_avx512_support -eq 1],
- [AC_MSG_CHECKING([if _mm512_loadu_si512 generates code that can be compiled])
- op_avx_cflags_save="$CFLAGS"
- CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX512_FLAGS"
- AC_LINK_IFELSE(
- [AC_LANG_PROGRAM([[#include <immintrin.h>]],
- [[
+
+ AS_IF([test $op_avx512_support -eq 0],
+ [AC_MSG_CHECKING([for AVX512 support (with -mavx512f -mavx512bw -mavx512vl -mavx512dq)])
+ op_avx_cflags_save="$CFLAGS"
+ CFLAGS="-mavx512f -mavx512bw -mavx512vl -mavx512dq $CFLAGS"
+ AC_LINK_IFELSE(
+ [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+ [[
+ __m512 vA, vB;
+ _mm512_add_ps(vA, vB)
+ ]])],
+ [op_avx512_support=1
+ MCA_BUILD_OP_AVX512_FLAGS="-mavx512f -mavx512bw -mavx512vl -mavx512dq"
+ AC_MSG_RESULT([yes])],
+ [AC_MSG_RESULT([no])])
+ CFLAGS="$op_avx_cflags_save"
+ ])
+ #
+ # Some combination of gcc and older as would not correctly build the code generated by
+ # _mm256_loadu_si256. Screen them out.
+ #
+ AS_IF([test $op_avx512_support -eq 1],
+ [AC_MSG_CHECKING([if _mm512_loadu_si512 generates code that can be compiled])
+ op_avx_cflags_save="$CFLAGS"
+ CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX512_FLAGS"
+ AC_LINK_IFELSE(
+ [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+ [[
int A[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
__m512i vA = _mm512_loadu_si512((__m512i*)&(A[1]))
- ]])],
- [AC_MSG_RESULT([yes])],
- [op_avx512_support=0
- MCA_BUILD_OP_AVX512_FLAGS=""
- AC_MSG_RESULT([no])])
- CFLAGS="$op_avx_cflags_save"
- ])
- #
- # Some PGI compilers do not define _mm512_mullo_epi64. Screen them out.
- #
- AS_IF([test $op_avx512_support -eq 1],
- [AC_MSG_CHECKING([if _mm512_mullo_epi64 generates code that can be compiled])
- op_avx_cflags_save="$CFLAGS"
- CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX512_FLAGS"
- AC_LINK_IFELSE(
- [AC_LANG_PROGRAM([[#include <immintrin.h>]],
- [[
+ ]])],
+ [AC_MSG_RESULT([yes])],
+ [op_avx512_support=0
+ MCA_BUILD_OP_AVX512_FLAGS=""
+ AC_MSG_RESULT([no])])
+ CFLAGS="$op_avx_cflags_save"
+ ])
+ #
+ # Some PGI compilers do not define _mm512_mullo_epi64. Screen them out.
+ #
+ AS_IF([test $op_avx512_support -eq 1],
+ [AC_MSG_CHECKING([if _mm512_mullo_epi64 generates code that can be compiled])
+ op_avx_cflags_save="$CFLAGS"
+ CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX512_FLAGS"
+ AC_LINK_IFELSE(
+ [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+ [[
__m512i vA, vB;
_mm512_mullo_epi64(vA, vB)
- ]])],
- [AC_MSG_RESULT([yes])],
- [op_avx512_support=0
- MCA_BUILD_OP_AVX512_FLAGS=""
- AC_MSG_RESULT([no])])
- CFLAGS="$op_avx_cflags_save"
- ])
+ ]])],
+ [AC_MSG_RESULT([yes])],
+ [op_avx512_support=0
+ MCA_BUILD_OP_AVX512_FLAGS=""
+ AC_MSG_RESULT([no])])
+ CFLAGS="$op_avx_cflags_save"
+ ])])
#
# Check support for AVX2
#
- AC_MSG_CHECKING([for AVX2 support (no additional flags)])
- AC_LINK_IFELSE(
- [AC_LANG_PROGRAM([[#include <immintrin.h>]],
- [[
+ AC_CACHE_CHECK([if we are checking for AVX2 support], op_avx_check_avx2, AS_VAR_SET(op_avx_check_avx2, yes))
+ AS_IF([test "$op_avx_check_avx2" = "yes"],
+ [AC_MSG_CHECKING([for AVX2 support (no additional flags)])
+ AC_LINK_IFELSE(
+ [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+ [[
__m256 vA, vB;
_mm256_add_ps(vA, vB)
- ]])],
- [op_avx2_support=1
- AC_MSG_RESULT([yes])],
- [AC_MSG_RESULT([no])])
- AS_IF([test $op_avx2_support -eq 0],
- [AC_MSG_CHECKING([for AVX2 support (with -mavx2)])
- op_avx_cflags_save="$CFLAGS"
- CFLAGS="$CFLAGS -mavx2"
- AC_LINK_IFELSE(
- [AC_LANG_PROGRAM([[#include <immintrin.h>]],
- [[
+ ]])],
+ [op_avx2_support=1
+ AC_MSG_RESULT([yes])],
+ [AC_MSG_RESULT([no])])
+ AS_IF([test $op_avx2_support -eq 0],
+ [AC_MSG_CHECKING([for AVX2 support (with -mavx2)])
+ op_avx_cflags_save="$CFLAGS"
+ CFLAGS="-mavx2 $CFLAGS"
+ AC_LINK_IFELSE(
+ [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+ [[
__m256 vA, vB;
_mm256_add_ps(vA, vB)
- ]])],
- [op_avx2_support=1
- MCA_BUILD_OP_AVX2_FLAGS="-mavx2"
- AC_MSG_RESULT([yes])],
- [AC_MSG_RESULT([no])])
- CFLAGS="$op_avx_cflags_save"
- ])
- #
- # Some combination of gcc and older as would not correctly build the code generated by
- # _mm256_loadu_si256. Screen them out.
- #
- AS_IF([test $op_avx2_support -eq 1],
- [AC_MSG_CHECKING([if _mm256_loadu_si256 generates code that can be compiled])
- op_avx_cflags_save="$CFLAGS"
- CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX2_FLAGS"
- AC_LINK_IFELSE(
- [AC_LANG_PROGRAM([[#include <immintrin.h>]],
- [[
+ ]])],
+ [op_avx2_support=1
+ MCA_BUILD_OP_AVX2_FLAGS="-mavx2"
+ AC_MSG_RESULT([yes])],
+ [AC_MSG_RESULT([no])])
+ CFLAGS="$op_avx_cflags_save"
+ ])
+ #
+ # Some combination of gcc and older as would not correctly build the code generated by
+ # _mm256_loadu_si256. Screen them out.
+ #
+ AS_IF([test $op_avx2_support -eq 1],
+ [AC_MSG_CHECKING([if _mm256_loadu_si256 generates code that can be compiled])
+ op_avx_cflags_save="$CFLAGS"
+ CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX2_FLAGS"
+ AC_LINK_IFELSE(
+ [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+ [[
int A[8] = {0, 1, 2, 3, 4, 5, 6, 7};
__m256i vA = _mm256_loadu_si256((__m256i*)&A)
- ]])],
- [AC_MSG_RESULT([yes])],
- [op_avx2_support=0
- MCA_BUILD_OP_AVX2_FLAGS=""
- AC_MSG_RESULT([no])])
- CFLAGS="$op_avx_cflags_save"
- ])
+ ]])],
+ [AC_MSG_RESULT([yes])],
+ [op_avx2_support=0
+ MCA_BUILD_OP_AVX2_FLAGS=""
+ AC_MSG_RESULT([no])])
+ CFLAGS="$op_avx_cflags_save"
+ ])])
#
# What about early AVX support. The rest of the logic is slightly different as
# we need to include some of the SSE4.1 and SSE3 instructions. So, we first check
@@ -160,90 +171,92 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
# the AVX flag, and then recheck if we have support for the SSE4.1 and SSE3
# instructions.
#
- AC_MSG_CHECKING([for AVX support (no additional flags)])
- AC_LINK_IFELSE(
- [AC_LANG_PROGRAM([[#include <immintrin.h>]],
- [[
+ AC_CACHE_CHECK([if we are checking for AVX support], op_avx_check_avx, AS_VAR_SET(op_avx_check_avx, yes))
+ AS_IF([test "$op_avx_check_avx" = "yes"],
+ [AC_MSG_CHECKING([for AVX support (no additional flags)])
+ AC_LINK_IFELSE(
+ [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+ [[
__m128 vA, vB;
_mm_add_ps(vA, vB)
- ]])],
- [op_avx_support=1
- AC_MSG_RESULT([yes])],
- [AC_MSG_RESULT([no])])
+ ]])],
+ [op_avx_support=1
+ AC_MSG_RESULT([yes])],
+ [AC_MSG_RESULT([no])])])
#
# Check for SSE4.1 support
#
- AS_IF([test $op_avx_support -eq 1],
- [AC_MSG_CHECKING([for SSE4.1 support])
- AC_LINK_IFELSE(
- [AC_LANG_PROGRAM([[#include <immintrin.h>]],
- [[
+ AC_CACHE_CHECK([if we are checking for SSE4.1 support], op_avx_check_sse41, AS_VAR_SET(op_avx_check_sse41, yes))
+ AS_IF([test $op_avx_support -eq 1 && test "$op_avx_check_sse41" = "yes"],
+ [AC_MSG_CHECKING([for SSE4.1 support])
+ AC_LINK_IFELSE(
+ [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+ [[
__m128i vA, vB;
(void)_mm_max_epi8(vA, vB)
- ]])],
- [op_sse41_support=1
- AC_MSG_RESULT([yes])],
- [AC_MSG_RESULT([no])])
- ])
+ ]])],
+ [op_sse41_support=1
+ AC_MSG_RESULT([yes])],
+ [AC_MSG_RESULT([no])])
+ ])
#
# Check for SSE3 support
#
- AS_IF([test $op_avx_support -eq 1],
- [AC_MSG_CHECKING([for SSE3 support])
- AC_LINK_IFELSE(
- [AC_LANG_PROGRAM([[#include <immintrin.h>]],
- [[
+ AC_CACHE_CHECK([if we are checking for SSE3 support], op_avx_check_sse3, AS_VAR_SET(op_avx_check_sse3, yes))
+ AS_IF([test $op_avx_support -eq 1 && test "$op_avx_check_sse3" = "yes"],
+ [AC_MSG_CHECKING([for SSE3 support])
+ AC_LINK_IFELSE(
+ [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+ [[
int A[4] = {0, 1, 2, 3};
__m128i vA = _mm_lddqu_si128((__m128i*)&A)
- ]])],
- [op_sse3_support=1
- AC_MSG_RESULT([yes])],
- [AC_MSG_RESULT([no])])
- ])
+ ]])],
+ [op_sse3_support=1
+ AC_MSG_RESULT([yes])],
+ [AC_MSG_RESULT([no])])
+ ])
# Second pass, do we need to add the AVX flag ?
AS_IF([test $op_avx_support -eq 0 || test $op_sse41_support -eq 0 || test $op_sse3_support -eq 0],
- [AC_MSG_CHECKING([for AVX support (with -mavx)])
- op_avx_cflags_save="$CFLAGS"
- CFLAGS="$CFLAGS -mavx"
- AC_LINK_IFELSE(
- [AC_LANG_PROGRAM([[#include <immintrin.h>]],
- [[
+ [AS_IF([test "$op_avx_check_avx" = "yes"],
+ [AC_MSG_CHECKING([for AVX support (with -mavx)])
+ op_avx_cflags_save="$CFLAGS"
+ CFLAGS="-mavx $CFLAGS"
+ AC_LINK_IFELSE(
+ [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+ [[
__m128 vA, vB;
_mm_add_ps(vA, vB)
]])],
- [op_avx_support=1
- MCA_BUILD_OP_AVX_FLAGS="-mavx"
- op_sse41_support=0
- op_sse3_support=0
- AC_MSG_RESULT([yes])],
- [AC_MSG_RESULT([no])])
+ [op_avx_support=1
+ MCA_BUILD_OP_AVX_FLAGS="-mavx"
+ op_sse41_support=0
+ op_sse3_support=0
+ AC_MSG_RESULT([yes])],
+ [AC_MSG_RESULT([no])])])

- AS_IF([test $op_sse41_support -eq 0],
- [AC_MSG_CHECKING([for SSE4.1 support])
- AC_LINK_IFELSE(
- [AC_LANG_PROGRAM([[#include <immintrin.h>]],
- [[
+ AS_IF([test "$op_avx_check_sse41" = "yes" && test $op_sse41_support -eq 0],
+ [AC_MSG_CHECKING([for SSE4.1 support])
+ AC_LINK_IFELSE(
+ [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+ [[
__m128i vA, vB;
(void)_mm_max_epi8(vA, vB)
- ]])],
- [op_sse41_support=1
- AC_MSG_RESULT([yes])],
- [AC_MSG_RESULT([no])])
- ])
- AS_IF([test $op_sse3_support -eq 0],
- [AC_MSG_CHECKING([for SSE3 support])
- AC_LINK_IFELSE(
- [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+ ]])],
+ [op_sse41_support=1
+ AC_MSG_RESULT([yes])],
+ [AC_MSG_RESULT([no])])])
+ AS_IF([test "$op_avx_check_sse3" = "yes" && test $op_sse3_support -eq 0],
+ [AC_MSG_CHECKING([for SSE3 support])
+ AC_LINK_IFELSE(
+ [AC_LANG_PROGRAM([[#include <immintrin.h>]],
[[
int A[4] = {0, 1, 2, 3};
__m128i vA = _mm_lddqu_si128((__m128i*)&A)
]])],
- [op_sse3_support=1
- AC_MSG_RESULT([yes])],
- [AC_MSG_RESULT([no])])
- ])
- CFLAGS="$op_avx_cflags_save"
- ])
+ [op_sse3_support=1
+ AC_MSG_RESULT([yes])],
+ [AC_MSG_RESULT([no])])])
+ CFLAGS="$op_avx_cflags_save"])

AC_LANG_POP([C])
])
@@ -276,6 +289,12 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
AC_SUBST(MCA_BUILD_OP_AVX2_FLAGS)
AC_SUBST(MCA_BUILD_OP_AVX_FLAGS)

+ AS_VAR_POPDEF([op_avx_check_avx512])
+ AS_VAR_POPDEF([op_avx_check_avx2])
+ AS_VAR_POPDEF([op_avx_check_avx])
+ AS_VAR_POPDEF([op_avx_check_sse41])
+ AS_VAR_POPDEF([op_avx_check_sse3])
+
OPAL_VAR_SCOPE_POP
# Enable this component iff we have at least the most basic form of support
# for vectorial ISA

From fcf2766a03e3c2a1001679013878209bcddd50ae Mon Sep 17 00:00:00 2001
From: George Bosilca <bosilca@icl.utk.edu>
Date: Mon, 28 Dec 2020 12:18:07 -0500
Subject: [PATCH 2/3] AVX code generation improvements

1. Allow fallback to a lesser AVX support during make

Because some distros restrict the compile architecture during make
(while not setting any restrictions during configure), we need to
detect the target architecture during make as well, in order to
restrict the code we generate. A sketch of such a scenario follows.
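
[Editorial sketch of a hypothetical distro build, not taken from the patch:]

# configure probes the build machine with its default flags and may
# conclude that AVX512 code can be generated...
./configure

# ...while the package build then narrows the target ISA only at make
# time, so the AVX512 paths detected above can no longer be compiled:
make CFLAGS="-O2 -march=x86-64"

The preprocessor guards added below detect this at compile time via the compiler's __AVX512F__/__AVX2__/__AVX__ macros and fall back to the widest ISA the make-time flags still allow.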

2. Add comments and better protect the arch specific code.

Identify all the vectorial functions used and classify them according to
the necessary hardware capabilities.
Use these requirements to protect the code for loads and stores (the rest
of the code, being automatically generated, is more difficult to
protect).

3. Correctly check for AVX* support.

Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
---
ompi/mca/op/avx/configure.m4       |  28 +--
ompi/mca/op/avx/op_avx_functions.c | 322 ++++++++++++++++++++++++-----
2 files changed, 288 insertions(+), 62 deletions(-)

diff --git a/ompi/mca/op/avx/configure.m4 b/ompi/mca/op/avx/configure.m4
index f61b7100ef4..f3651f09d43 100644
--- a/ompi/mca/op/avx/configure.m4
+++ b/ompi/mca/op/avx/configure.m4
@@ -44,7 +44,7 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
#
# Check for AVX512 support
#
- AC_CACHE_CHECK([if we are checking for AVX512 support], op_avx_check_avx512, AS_VAR_SET(op_avx_check_avx512, yes))
+ AC_CACHE_CHECK([for AVX512 support], op_avx_check_avx512, AS_VAR_SET(op_avx_check_avx512, yes))
AS_IF([test "$op_avx_check_avx512" = "yes"],
[AC_MSG_CHECKING([for AVX512 support (no additional flags)])
AC_LINK_IFELSE(
@@ -115,14 +115,14 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
#
# Check support for AVX2
#
- AC_CACHE_CHECK([if we are checking for AVX2 support], op_avx_check_avx2, AS_VAR_SET(op_avx_check_avx2, yes))
+ AC_CACHE_CHECK([for AVX2 support], op_avx_check_avx2, AS_VAR_SET(op_avx_check_avx2, yes))
AS_IF([test "$op_avx_check_avx2" = "yes"],
[AC_MSG_CHECKING([for AVX2 support (no additional flags)])
AC_LINK_IFELSE(
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
[[
- __m256 vA, vB;
- _mm256_add_ps(vA, vB)
+ __m256i vA, vB, vC;
+ vC = _mm256_and_si256(vA, vB)
]])],
[op_avx2_support=1
AC_MSG_RESULT([yes])],
@@ -134,8 +134,8 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
AC_LINK_IFELSE(
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
[[
- __m256 vA, vB;
- _mm256_add_ps(vA, vB)
+ __m256i vA, vB, vC;
+ vC = _mm256_and_si256(vA, vB)
]])],
[op_avx2_support=1
MCA_BUILD_OP_AVX2_FLAGS="-mavx2"
@@ -164,21 +164,21 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
CFLAGS="$op_avx_cflags_save"
])])
#
- # What about early AVX support. The rest of the logic is slightly different as
+ # What about early AVX support? The rest of the logic is slightly different as
# we need to include some of the SSE4.1 and SSE3 instructions. So, we first check
# if we can compile AVX code without a flag, then we validate that we have support
# for the SSE4.1 and SSE3 instructions we need. If not, we check for the usage of
# the AVX flag, and then recheck if we have support for the SSE4.1 and SSE3
# instructions.
#
- AC_CACHE_CHECK([if we are checking for AVX support], op_avx_check_avx, AS_VAR_SET(op_avx_check_avx, yes))
+ AC_CACHE_CHECK([for AVX support], op_avx_check_avx, AS_VAR_SET(op_avx_check_avx, yes))
AS_IF([test "$op_avx_check_avx" = "yes"],
[AC_MSG_CHECKING([for AVX support (no additional flags)])
AC_LINK_IFELSE(
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
[[
- __m128 vA, vB;
- _mm_add_ps(vA, vB)
+ __m256 vA, vB, vC;
+ vC = _mm256_add_ps(vA, vB)
]])],
[op_avx_support=1
AC_MSG_RESULT([yes])],
@@ -186,7 +186,7 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
#
# Check for SSE4.1 support
#
- AC_CACHE_CHECK([if we are checking for SSE4.1 support], op_avx_check_sse41, AS_VAR_SET(op_avx_check_sse41, yes))
+ AC_CACHE_CHECK([for SSE4.1 support], op_avx_check_sse41, AS_VAR_SET(op_avx_check_sse41, yes))
AS_IF([test $op_avx_support -eq 1 && test "$op_avx_check_sse41" = "yes"],
[AC_MSG_CHECKING([for SSE4.1 support])
AC_LINK_IFELSE(
@@ -202,7 +202,7 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
#
# Check for SSE3 support
#
- AC_CACHE_CHECK([if we are checking for SSE3 support], op_avx_check_sse3, AS_VAR_SET(op_avx_check_sse3, yes))
+ AC_CACHE_CHECK([for SSE3 support], op_avx_check_sse3, AS_VAR_SET(op_avx_check_sse3, yes))
AS_IF([test $op_avx_support -eq 1 && test "$op_avx_check_sse3" = "yes"],
[AC_MSG_CHECKING([for SSE3 support])
AC_LINK_IFELSE(
@@ -224,8 +224,8 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
AC_LINK_IFELSE(
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
[[
- __m128 vA, vB;
- _mm_add_ps(vA, vB)
+ __m256 vA, vB, vC;
+ vC = _mm256_add_ps(vA, vB)
]])],
[op_avx_support=1
MCA_BUILD_OP_AVX_FLAGS="-mavx"
diff --git a/ompi/mca/op/avx/op_avx_functions.c b/ompi/mca/op/avx/op_avx_functions.c
index 95a9c9ab84e..ef3f0932906 100644
--- a/ompi/mca/op/avx/op_avx_functions.c
+++ b/ompi/mca/op/avx/op_avx_functions.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 The University of Tennessee and The University
+ * Copyright (c) 2019-2021 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2020 Research Organization for Information Science
@@ -24,16 +24,42 @@
#include "ompi/mca/op/avx/op_avx.h"

#include <immintrin.h>
-
+/**
+ * The following logic is necessary to cope with distro maintainers' desire to change the compilation
+ * flags after the configure step, leading to inconsistencies between what OMPI has detected and what
+ * code can be generated during make. If we detect that the current code generation architecture has
+ * been changed from our own setting and cannot generate the code we need (AVX512, AVX2) we fall back
+ * to a lesser support (AVX512 -> AVX2, AVX2 -> AVX, AVX -> error out).
+ */
#if defined(GENERATE_AVX512_CODE)
-#define PREPEND _avx512
-#elif defined(GENERATE_AVX2_CODE)
-#define PREPEND _avx2
-#elif defined(GENERATE_AVX_CODE)
-#define PREPEND _avx
-#else
-#error This file should not be compiled in this conditions
-#endif
+# if defined(__AVX512BW__) && defined(__AVX512F__) && defined(__AVX512VL__)
+# define PREPEND _avx512
+# else
+# undef GENERATE_AVX512_CODE
+# endif /* defined(__AVX512BW__) && defined(__AVX512F__) && defined(__AVX512VL__) */
+#endif /* defined(GENERATE_AVX512_CODE) */
+
+#if !defined(PREPEND) && defined(GENERATE_AVX2_CODE)
+# if defined(__AVX2__)
+# define PREPEND _avx2
+# else
+# undef GENERATE_AVX2_CODE
+# endif /* defined(__AVX2__) */
+#endif /* !defined(PREPEND) && defined(GENERATE_AVX2_CODE) */
+
+#if !defined(PREPEND) && defined(GENERATE_AVX_CODE)
+# if defined(__AVX__)
+# define PREPEND _avx
+# endif
+#endif /* !defined(PREPEND) && defined(GENERATE_AVX_CODE) */
+
+#if !defined(PREPEND)
+# if OMPI_MCA_OP_HAVE_AVX512 || OMPI_MCA_OP_HAVE_AVX2
+# error The configure step has detected possible support for AVX512 and/or AVX2 but the compiler flags during make are too restrictive. Please disable the AVX component by adding --enable-mca-no-build=op-avx to your configure step.
+# else
+# error This file should not be compiled in these conditions. Please provide the config.log file to the OMPI developers.
+# endif /* OMPI_MCA_OP_HAVE_AVX512 || OMPI_MCA_OP_HAVE_AVX2 */
+#endif /* !defined(PREPEND) */

/*
* Concatenate preprocessor tokens A and B without expanding macro definitions
@@ -46,6 +72,102 @@
*/
#define OP_CONCAT(A, B) OP_CONCAT_NX(A, B)

+/*
+ * grep -e "_mm[125][251][862]_.*(" avx512.c -o | sed 's/(//g' | sort | uniq
+ *
+ * https://software.intel.com/sites/landingpage/IntrinsicsGuide
+ *
+ * _mm_add_epi[8,16,32,64] SSE2
+ * _mm_add_pd SSE2
+ * _mm_add_ps SSE
+ * _mm_adds_epi[8,16] SSE2
+ * _mm_adds_epu[8,16] SSE2
+ * _mm_and_si128 SSE2
+ * _mm_lddqu_si128 SSE3
+ * _mm_loadu_pd SSE2
+ * _mm_loadu_ps SSE
+ * _mm_max_epi8 SSE4.1
+ * _mm_max_epi16 SSE2
+ * _mm_max_epi32 SSE4.1
+ * _mm_max_epi64 AVX512VL + AVX512F
+ * _mm_max_epu8 SSE2
+ * _mm_max_epu[16,32] SSE4.1
+ * _mm_max_epu64 AVX512VL + AVX512F
+ * _mm_max_pd SSE2
+ * _mm_max_ps SSE
+ * _mm_min_epi8 SSE4.1
+ * _mm_min_epi16 SSE2
+ * _mm_min_epi32 SSE4.1
+ * _mm_min_epi64 AVX512VL + AVX512F
+ * _mm_min_epu8 SSE2
+ * _mm_min_epu[16,32] SSE4.1
+ * _mm_min_epu64 AVX512VL + AVX512F
+ * _mm_min_pd SSE2
+ * _mm_min_ps SSE
+ * _mm_mul_pd SSE2
+ * _mm_mul_ps SSE
+ * _mm_mullo_epi16 SSE2
+ * _mm_mullo_epi32 SSE4.1
+ * _mm_mullo_epi64 AVX512VL + AVX512DQ
+ * _mm_or_si128 SSE2
+ * _mm_storeu_pd SSE2
+ * _mm_storeu_ps SSE
+ * _mm_storeu_si128 SSE2
+ * _mm_xor_si128 SSE2
+ * _mm256_add_epi[8,16,32,64] AVX2
+ * _mm256_add_p[s,d] AVX
+ * _mm256_adds_epi[8,16] AVX2
+ * _mm256_adds_epu[8,16] AVX2
+ * _mm256_and_si256 AVX2
+ * _mm256_loadu_p[s,d] AVX
+ * _mm256_loadu_si256 AVX
+ * _mm256_max_epi[8,16,32] AVX2
+ * _mm256_max_epi64 AVX512VL + AVX512F
+ * _mm256_max_epu[8,16,32] AVX2
+ * _mm256_max_epu64 AVX512VL + AVX512F
+ * _mm256_max_p[s,d] AVX
+ * _mm256_min_epi[8,16,32] AVX2
+ * _mm256_min_epi64 AVX512VL + AVX512F
+ * _mm256_min_epu[8,16,32] AVX2
+ * _mm256_min_epu64 AVX512VL + AVX512F
+ * _mm256_min_p[s,d] AVX
+ * _mm256_mul_p[s,d] AVX
+ * _mm256_mullo_epi[16,32] AVX2
+ * _mm256_mullo_epi64 AVX512VL + AVX512DQ
+ * _mm256_or_si256 AVX2
+ * _mm256_storeu_p[s,d] AVX
+ * _mm256_storeu_si256 AVX
+ * _mm256_xor_si256 AVX2
+ * _mm512_add_epi[8,16] AVX512BW
+ * _mm512_add_epi[32,64] AVX512F
+ * _mm512_add_p[s,d] AVX512F
+ * _mm512_adds_epi[8,16] AVX512BW
+ * _mm512_adds_epu[8,16] AVX512BW
+ * _mm512_and_si512 AVX512F
+ * _mm512_cvtepi16_epi8 AVX512BW
+ * _mm512_cvtepi8_epi16 AVX512BW
+ * _mm512_loadu_p[s,d] AVX512F
+ * _mm512_loadu_si512 AVX512F
+ * _mm512_max_epi[8,16] AVX512BW
+ * _mm512_max_epi[32,64] AVX512F
+ * _mm512_max_epu[8,16] AVX512BW
+ * _mm512_max_epu[32,64] AVX512F
+ * _mm512_max_p[s,d] AVX512F
+ * _mm512_min_epi[8,16] AVX512BW
+ * _mm512_min_epi[32,64] AVX512F
+ * _mm512_min_epu[8,16] AVX512BW
+ * _mm512_min_epu[32,64] AVX512F
+ * _mm512_min_p[s,d] AVX512F
+ * _mm512_mul_p[s,d] AVX512F
+ * _mm512_mullo_epi16 AVX512BW
+ * _mm512_mullo_epi32 AVX512F
+ * _mm512_mullo_epi64 AVX512DQ
+ * _mm512_or_si512 AVX512F
+ * _mm512_storeu_p[s,d] AVX512F
+ * _mm512_storeu_si512 AVX512F
+ * _mm512_xor_si512 AVX512F
+ */
+
/*
* Since all the functions in this file are essentially identical, we
* use a macro to substitute in names and types. The core operation
@@ -62,13 +184,14 @@
(((_flag) & mca_op_avx_component.flags) == (_flag))

#if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
+#if __AVX512F__
#define OP_AVX_AVX512_FUNC(name, type_sign, type_size, type, op) \
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG|OMPI_OP_AVX_HAS_AVX512BW_FLAG) ) { \
int types_per_step = (512 / 8) / sizeof(type); \
for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
- __m512i vecA = _mm512_loadu_si512((__m512*)in); \
+ __m512i vecA = _mm512_loadu_si512((__m512*)in); \
in += types_per_step; \
- __m512i vecB = _mm512_loadu_si512((__m512*)out); \
+ __m512i vecB = _mm512_loadu_si512((__m512*)out); \
__m512i res = _mm512_##op##_ep##type_sign##type_size(vecA, vecB); \
_mm512_storeu_si512((__m512*)out, res); \
out += types_per_step; \
@@ -76,10 +199,14 @@
if( 0 == left_over ) return; \
}
#else
+#error Target architecture lacks AVX512F support needed for _mm512_loadu_si512 and _mm512_storeu_si512
+#endif /* __AVX512F__ */
+#else
#define OP_AVX_AVX512_FUNC(name, type_sign, type_size, type, op) {}
#endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */

#if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
+#if __AVX__
#define OP_AVX_AVX2_FUNC(name, type_sign, type_size, type, op) \
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX2_FLAG | OMPI_OP_AVX_HAS_AVX_FLAG) ) { \
int types_per_step = (256 / 8) / sizeof(type); /* AVX2 */ \
@@ -87,30 +214,37 @@
__m256i vecA = _mm256_loadu_si256((__m256i*)in); \
in += types_per_step; \
__m256i vecB = _mm256_loadu_si256((__m256i*)out); \
- __m256i res = _mm256_##op##_ep##type_sign##type_size(vecA, vecB); \
+ __m256i res = _mm256_##op##_ep##type_sign##type_size(vecA, vecB); \
_mm256_storeu_si256((__m256i*)out, res); \
out += types_per_step; \
} \
if( 0 == left_over ) return; \
}
#else
+#error Target architecture lacks AVX support needed for _mm256_loadu_si256 and _mm256_storeu_si256
+#endif /* __AVX__ */
+#else
#define OP_AVX_AVX2_FUNC(name, type_sign, type_size, type, op) {}
#endif /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */

#if defined(GENERATE_SSE3_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
+#if __SSE3__
#define OP_AVX_SSE4_1_FUNC(name, type_sign, type_size, type, op) \
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE3_FLAG | OMPI_OP_AVX_HAS_SSE4_1_FLAG) ) { \
- int types_per_step = (128 / 8) / sizeof(type); /* AVX */ \
+ int types_per_step = (128 / 8) / sizeof(type); \
for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
__m128i vecA = _mm_lddqu_si128((__m128i*)in); \
in += types_per_step; \
__m128i vecB = _mm_lddqu_si128((__m128i*)out); \
- __m128i res = _mm_##op##_ep##type_sign##type_size(vecA, vecB); \
+ __m128i res = _mm_##op##_ep##type_sign##type_size(vecA, vecB); \
_mm_storeu_si128((__m128i*)out, res); \
out += types_per_step; \
} \
}
#else
+#error Target architecture lacks SSE3 support needed for _mm_lddqu_si128 and _mm_storeu_si128
+#endif /* __SSE3__ */
+#else
#define OP_AVX_SSE4_1_FUNC(name, type_sign, type_size, type, op) {}
#endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */

@@ -143,12 +277,13 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in
}

#if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
+#if __AVX512BW__ && __AVX__
#define OP_AVX_AVX512_MUL(name, type_sign, type_size, type, op) \
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG | OMPI_OP_AVX_HAS_AVX512BW_FLAG) ) { \
int types_per_step = (256 / 8) / sizeof(type); \
for (; left_over >= types_per_step; left_over -= types_per_step) { \
- __m256i vecA_tmp = _mm256_loadu_si256((__m256i*)in); \
- __m256i vecB_tmp = _mm256_loadu_si256((__m256i*)out); \
+ __m256i vecA_tmp = _mm256_loadu_si256((__m256i*)in); \
+ __m256i vecB_tmp = _mm256_loadu_si256((__m256i*)out); \
in += types_per_step; \
__m512i vecA = _mm512_cvtepi8_epi16(vecA_tmp); \
__m512i vecB = _mm512_cvtepi8_epi16(vecB_tmp); \
@@ -160,6 +295,9 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in
if( 0 == left_over ) return; \
}
#else
+#error Target architecture lacks AVX512BW and AVX support needed for _mm256_loadu_si256, _mm256_storeu_si256 and _mm512_cvtepi8_epi16
+#endif /* __AVX512BW__ && __AVX__ */
+#else
#define OP_AVX_AVX512_MUL(name, type_sign, type_size, type, op) {}
#endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
/**
@@ -201,13 +339,14 @@ static void OP_CONCAT( ompi_op_avx_2buff_##name##_##type, PREPEND)(const void *_
*
*/
#if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
+#if __AVX512F__
#define OP_AVX_AVX512_BIT_FUNC(name, type_size, type, op) \
if( OMPI_OP_AVX_HAS_FLAGS( OMPI_OP_AVX_HAS_AVX512F_FLAG) ) { \
types_per_step = (512 / 8) / sizeof(type); \
for (; left_over >= types_per_step; left_over -= types_per_step) { \
- __m512i vecA = _mm512_loadu_si512((__m512i*)in); \
+ __m512i vecA = _mm512_loadu_si512((__m512i*)in); \
in += types_per_step; \
- __m512i vecB = _mm512_loadu_si512((__m512i*)out); \
+ __m512i vecB = _mm512_loadu_si512((__m512i*)out); \
__m512i res = _mm512_##op##_si512(vecA, vecB); \
_mm512_storeu_si512((__m512i*)out, res); \
out += types_per_step; \
@@ -215,10 +354,14 @@ static void OP_CONCAT( ompi_op_avx_2buff_##name##_##type, PREPEND)(const void *_
if( 0 == left_over ) return; \
}
#else
+#error Target architecture lacks AVX512F support needed for _mm512_loadu_si512 and _mm512_storeu_si512
+#endif /* __AVX512F__ */
+#else
#define OP_AVX_AVX512_BIT_FUNC(name, type_size, type, op) {}
#endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */

#if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
+#if __AVX__
#define OP_AVX_AVX2_BIT_FUNC(name, type_size, type, op) \
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX2_FLAG | OMPI_OP_AVX_HAS_AVX_FLAG) ) { \
types_per_step = (256 / 8) / sizeof(type); \
@@ -226,17 +369,21 @@ static void OP_CONCAT( ompi_op_avx_2buff_##name##_##type, PREPEND)(const void *_
__m256i vecA = _mm256_loadu_si256((__m256i*)in); \
in += types_per_step; \
__m256i vecB = _mm256_loadu_si256((__m256i*)out); \
- __m256i res = _mm256_##op##_si256(vecA, vecB); \
+ __m256i res = _mm256_##op##_si256(vecA, vecB); \
_mm256_storeu_si256((__m256i*)out, res); \
out += types_per_step; \
} \
if( 0 == left_over ) return; \
}
#else
+#error Target architecture lacks AVX support needed for _mm256_loadu_si256 and _mm256_storeu_si256
+#endif /* __AVX__ */
+#else
#define OP_AVX_AVX2_BIT_FUNC(name, type_size, type, op) {}
#endif /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */

#if defined(GENERATE_SSE3_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
+#if __SSE3__ && __SSE2__
#define OP_AVX_SSE3_BIT_FUNC(name, type_size, type, op) \
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE3_FLAG) ) { \
types_per_step = (128 / 8) / sizeof(type); \
@@ -244,12 +391,15 @@ static void OP_CONCAT( ompi_op_avx_2buff_##name##_##type, PREPEND)(const void *_
__m128i vecA = _mm_lddqu_si128((__m128i*)in); \
in += types_per_step; \
__m128i vecB = _mm_lddqu_si128((__m128i*)out); \
- __m128i res = _mm_##op##_si128(vecA, vecB); \
+ __m128i res = _mm_##op##_si128(vecA, vecB); \
_mm_storeu_si128((__m128i*)out, res); \
out += types_per_step; \
} \
}
#else
+#error Target architecture lacks SSE2 and SSE3 support needed for _mm_lddqu_si128 and _mm_storeu_si128
+#endif /* __SSE3__ && __SSE2__ */
+#else
#define OP_AVX_SSE3_BIT_FUNC(name, type_size, type, op) {}
#endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */

@@ -282,12 +432,13 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in
}

#if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
+#if __AVX512F__
#define OP_AVX_AVX512_FLOAT_FUNC(op) \
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) { \
types_per_step = (512 / 8) / sizeof(float); \
for (; left_over >= types_per_step; left_over -= types_per_step) { \
- __m512 vecA = _mm512_loadu_ps((__m512*)in); \
- __m512 vecB = _mm512_loadu_ps((__m512*)out); \
+ __m512 vecA = _mm512_loadu_ps((__m512*)in); \
+ __m512 vecB = _mm512_loadu_ps((__m512*)out); \
in += types_per_step; \
__m512 res = _mm512_##op##_ps(vecA, vecB); \
_mm512_storeu_ps((__m512*)out, res); \
@@ -296,28 +447,36 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in
if( 0 == left_over ) return; \
}
#else
+#error Target architecture lacks AVX512F support needed for _mm512_loadu_ps and _mm512_storeu_ps
+#endif /* __AVX512F__ */
+#else
#define OP_AVX_AVX512_FLOAT_FUNC(op) {}
#endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */

#if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
+#if __AVX__
#define OP_AVX_AVX_FLOAT_FUNC(op) \
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) { \
types_per_step = (256 / 8) / sizeof(float); \
for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
- __m256 vecA = _mm256_loadu_ps(in); \
+ __m256 vecA = _mm256_loadu_ps(in); \
in += types_per_step; \
- __m256 vecB = _mm256_loadu_ps(out); \
+ __m256 vecB = _mm256_loadu_ps(out); \
__m256 res = _mm256_##op##_ps(vecA, vecB); \
- _mm256_storeu_ps(out, res); \
+ _mm256_storeu_ps(out, res); \
out += types_per_step; \
} \
if( 0 == left_over ) return; \
}
#else
+#error Target architecture lacks AVX support needed for _mm256_loadu_ps and _mm256_storeu_ps
+#endif /* __AVX__ */
+#else
#define OP_AVX_AVX_FLOAT_FUNC(op) {}
#endif /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */

#if defined(GENERATE_AVX_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
+#if __SSE__
#define OP_AVX_SSE_FLOAT_FUNC(op) \
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE_FLAG) ) { \
types_per_step = (128 / 8) / sizeof(float); \
@@ -331,6 +490,9 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in
} \
}
#else
+#error Target architecture lacks SSE support needed for _mm_loadu_ps and _mm_storeu_ps
+#endif /* __SSE__ */
+#else
#define OP_AVX_SSE_FLOAT_FUNC(op) {}
#endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */

@@ -363,13 +525,14 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_float,PREPEND)(const void *_in, v
}

#if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
+#if __AVX512F__
#define OP_AVX_AVX512_DOUBLE_FUNC(op) \
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) { \
types_per_step = (512 / 8) / sizeof(double); \
for (; left_over >= types_per_step; left_over -= types_per_step) { \
- __m512d vecA = _mm512_loadu_pd(in); \
+ __m512d vecA = _mm512_loadu_pd(in); \
in += types_per_step; \
- __m512d vecB = _mm512_loadu_pd(out); \
+ __m512d vecB = _mm512_loadu_pd(out); \
__m512d res = _mm512_##op##_pd(vecA, vecB); \
_mm512_storeu_pd((out), res); \
out += types_per_step; \
@@ -377,17 +540,21 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_float,PREPEND)(const void *_in, v
if( 0 == left_over ) return; \
}
#else
+#error Target architecture lacks AVX512F support needed for _mm512_loadu_pd and _mm512_storeu_pd
+#endif /* __AVX512F__ */
+#else
#define OP_AVX_AVX512_DOUBLE_FUNC(op) {}
#endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */

#if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
+#if __AVX__
#define OP_AVX_AVX_DOUBLE_FUNC(op) \
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) { \
types_per_step = (256 / 8) / sizeof(double); \
for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
- __m256d vecA = _mm256_loadu_pd(in); \
+ __m256d vecA = _mm256_loadu_pd(in); \
in += types_per_step; \
- __m256d vecB = _mm256_loadu_pd(out); \
+ __m256d vecB = _mm256_loadu_pd(out); \
__m256d res = _mm256_##op##_pd(vecA, vecB); \
_mm256_storeu_pd(out, res); \
out += types_per_step; \
@@ -395,10 +562,14 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_float,PREPEND)(const void *_in, v
if( 0 == left_over ) return; \
}
#else
+#error Target architecture lacks AVX support needed for _mm256_loadu_pd and _mm256_storeu_pd
+#endif /* __AVX__ */
+#else
#define OP_AVX_AVX_DOUBLE_FUNC(op) {}
#endif /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */

#if defined(GENERATE_AVX_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
+#if __SSE2__
#define OP_AVX_SSE2_DOUBLE_FUNC(op) \
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE2_FLAG) ) { \
types_per_step = (128 / 8) / sizeof(double); \
@@ -412,6 +583,9 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_float,PREPEND)(const void *_in, v
} \
}
#else
+#error Target architecture lacks SSE2 support needed for _mm_loadu_pd and _mm_storeu_pd
+#endif /* __SSE2__ */
+#else
#define OP_AVX_SSE2_DOUBLE_FUNC(op) {}
#endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */

@@ -580,12 +754,13 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_double,PREPEND)(const void *_in,
* routines, needed for some optimizations.
*/
#if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
+#if __AVX512F__
#define OP_AVX_AVX512_FUNC_3(name, type_sign, type_size, type, op) \
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG|OMPI_OP_AVX_HAS_AVX512BW_FLAG) ) { \
int types_per_step = (512 / 8) / sizeof(type); \
for (; left_over >= types_per_step; left_over -= types_per_step) { \
- __m512i vecA = _mm512_loadu_si512(in1); \
- __m512i vecB = _mm512_loadu_si512(in2); \
+ __m512i vecA = _mm512_loadu_si512(in1); \
+ __m512i vecB = _mm512_loadu_si512(in2); \
in1 += types_per_step; \
in2 += types_per_step; \
__m512i res = _mm512_##op##_ep##type_sign##type_size(vecA, vecB); \
@@ -595,10 +770,14 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_double,PREPEND)(const void *_in,
if( 0 == left_over ) return; \
}
#else
+#error Target architecture lacks AVX512F support needed for _mm512_loadu_si512 and _mm512_storeu_si512
+#endif /* __AVX512F__ */
+#else
#define OP_AVX_AVX512_FUNC_3(name, type_sign, type_size, type, op) {}
#endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */

#if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
+#if __AVX__
#define OP_AVX_AVX2_FUNC_3(name, type_sign, type_size, type, op) \
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX2_FLAG | OMPI_OP_AVX_HAS_AVX_FLAG) ) { \
int types_per_step = (256 / 8) / sizeof(type); \
@@ -607,17 +786,21 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_double,PREPEND)(const void *_in,
__m256i vecB = _mm256_loadu_si256((__m256i*)in2); \
in1 += types_per_step; \
in2 += types_per_step; \
- __m256i res = _mm256_##op##_ep##type_sign##type_size(vecA, vecB); \
+ __m256i res = _mm256_##op##_ep##type_sign##type_size(vecA, vecB); \
_mm256_storeu_si256((__m256i*)out, res); \
out += types_per_step; \
} \
if( 0 == left_over ) return; \
}
#else
+#error Target architecture lacks AVX support needed for _mm256_loadu_si256 and _mm256_storeu_si256
+#endif /* __AVX__ */
+#else
#define OP_AVX_AVX2_FUNC_3(name, type_sign, type_size, type, op) {}
#endif /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */

#if defined(GENERATE_SSE3_CODE) && defined(OMPI_MCA_OP_HAVE_SSE41) && (1 == OMPI_MCA_OP_HAVE_SSE41) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
+#if __SSE3__ && __SSE2__
#define OP_AVX_SSE4_1_FUNC_3(name, type_sign, type_size, type, op) \
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE3_FLAG | OMPI_OP_AVX_HAS_SSE4_1_FLAG) ) { \
int types_per_step = (128 / 8) / sizeof(type); \
@@ -626,12 +809,15 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_double,PREPEND)(const void *_in,
__m128i vecB = _mm_lddqu_si128((__m128i*)in2); \
in1 += types_per_step; \
in2 += types_per_step; \
- __m128i res = _mm_##op##_ep##type_sign##type_size(vecA, vecB); \
+ __m128i res = _mm_##op##_ep##type_sign##type_size(vecA, vecB); \
_mm_storeu_si128((__m128i*)out, res); \
out += types_per_step; \
} \
}
#else
+#error Target architecture lacks SSE2 and SSE3 support needed for _mm_lddqu_si128 and _mm_storeu_si128
+#endif /* __SSE3__ && __SSE2__ */
+#else
#define OP_AVX_SSE4_1_FUNC_3(name, type_sign, type_size, type, op) {}
#endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */

@@ -667,12 +853,13 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re
}

#if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
+#if __AVX512BW__ && __AVX__
#define OP_AVX_AVX512_MUL_3(name, type_sign, type_size, type, op) \
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG | OMPI_OP_AVX_HAS_AVX512BW_FLAG) ) { \
int types_per_step = (256 / 8) / sizeof(type); \
for (; left_over >= types_per_step; left_over -= types_per_step) { \
- __m256i vecA_tmp = _mm256_loadu_si256((__m256i*)in1); \
- __m256i vecB_tmp = _mm256_loadu_si256((__m256i*)in2); \
+ __m256i vecA_tmp = _mm256_loadu_si256((__m256i*)in1); \
+ __m256i vecB_tmp = _mm256_loadu_si256((__m256i*)in2); \
in1 += types_per_step; \
in2 += types_per_step; \
__m512i vecA = _mm512_cvtepi8_epi16(vecA_tmp); \
@@ -685,6 +872,9 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re
if( 0 == left_over ) return; \
}
#else
+#error Target architecture lacks AVX512BW and AVX support needed for _mm256_loadu_si256, _mm256_storeu_si256 and _mm512_cvtepi8_epi16
+#endif /* __AVX512BW__ && __AVX__ */
+#else
#define OP_AVX_AVX512_MUL_3(name, type_sign, type_size, type, op) {}
#endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */
/**
@@ -723,12 +913,13 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re
}

#if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
+#if __AVX512F__
#define OP_AVX_AVX512_BIT_FUNC_3(name, type_size, type, op) \
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) { \
types_per_step = (512 / 8) / sizeof(type); \
for (; left_over >= types_per_step; left_over -= types_per_step) { \
- __m512i vecA = _mm512_loadu_si512(in1); \
- __m512i vecB = _mm512_loadu_si512(in2); \
+ __m512i vecA = _mm512_loadu_si512(in1); \
+ __m512i vecB = _mm512_loadu_si512(in2); \
in1 += types_per_step; \
in2 += types_per_step; \
__m512i res = _mm512_##op##_si512(vecA, vecB); \
@@ -738,10 +929,14 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re
if( 0 == left_over ) return; \
}
#else
+#error Target architecture lacks AVX512F support needed for _mm512_loadu_si512 and _mm512_storeu_si512
+#endif /* __AVX512F__ */
+#else
#define OP_AVX_AVX512_BIT_FUNC_3(name, type_size, type, op) {}
#endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */

#if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
+#if __AVX__
#define OP_AVX_AVX2_BIT_FUNC_3(name, type_size, type, op) \
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX2_FLAG | OMPI_OP_AVX_HAS_AVX_FLAG) ) { \
types_per_step = (256 / 8) / sizeof(type); \
@@ -750,17 +945,21 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re
__m256i vecB = _mm256_loadu_si256((__m256i*)in2); \
in1 += types_per_step; \
in2 += types_per_step; \
- __m256i res = _mm256_##op##_si256(vecA, vecB); \
+ __m256i res = _mm256_##op##_si256(vecA, vecB); \
_mm256_storeu_si256((__m256i*)out, res); \
out += types_per_step; \
} \
if( 0 == left_over ) return; \
}
#else
+#error Target architecture lacks AVX support needed for _mm256_loadu_si256 and _mm256_storeu_si256
+#endif /* __AVX__ */
+#else
#define OP_AVX_AVX2_BIT_FUNC_3(name, type_size, type, op) {}
#endif /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */

#if defined(GENERATE_SSE3_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
+#if __SSE3__ && __SSE2__
#define OP_AVX_SSE3_BIT_FUNC_3(name, type_size, type, op) \
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE3_FLAG) ) { \
types_per_step = (128 / 8) / sizeof(type); \
@@ -769,12 +968,15 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re
__m128i vecB = _mm_lddqu_si128((__m128i*)in2); \
in1 += types_per_step; \
in2 += types_per_step; \
- __m128i res = _mm_##op##_si128(vecA, vecB); \
+ __m128i res = _mm_##op##_si128(vecA, vecB); \
_mm_storeu_si128((__m128i*)out, res); \
out += types_per_step; \
} \
}
#else
+#error Target architecture lacks SSE2 and SSE3 support needed for _mm_lddqu_si128 and _mm_storeu_si128
+#endif /* __SSE3__ && __SSE2__ */
+#else
#define OP_AVX_SSE3_BIT_FUNC_3(name, type_size, type, op) {}
#endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */

@@ -809,12 +1011,13 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_##type,PREPEND)(const void *_in1,
}

#if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
+#if __AVX512F__
#define OP_AVX_AVX512_FLOAT_FUNC_3(op) \
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) { \
types_per_step = (512 / 8) / sizeof(float); \
for (; left_over >= types_per_step; left_over -= types_per_step) { \
- __m512 vecA = _mm512_loadu_ps(in1); \
- __m512 vecB = _mm512_loadu_ps(in2); \
+ __m512 vecA = _mm512_loadu_ps(in1); \
+ __m512 vecB = _mm512_loadu_ps(in2); \
in1 += types_per_step; \
in2 += types_per_step; \
__m512 res = _mm512_##op##_ps(vecA, vecB); \
@@ -824,16 +1027,20 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_##type,PREPEND)(const void *_in1,
if( 0 == left_over ) return; \
}
#else
+#error Target architecture lacks AVX512F support needed for _mm512_loadu_ps and _mm512_storeu_ps
+#endif /* __AVX512F__ */
+#else
#define OP_AVX_AVX512_FLOAT_FUNC_3(op) {}
#endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */

#if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
+#if __AVX__
#define OP_AVX_AVX_FLOAT_FUNC_3(op) \
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) { \
types_per_step = (256 / 8) / sizeof(float); \
for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
- __m256 vecA = _mm256_loadu_ps(in1); \
- __m256 vecB = _mm256_loadu_ps(in2); \
+ __m256 vecA = _mm256_loadu_ps(in1); \
+ __m256 vecB = _mm256_loadu_ps(in2); \
in1 += types_per_step; \
in2 += types_per_step; \
__m256 res = _mm256_##op##_ps(vecA, vecB); \
@@ -843,10 +1050,14 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_##type,PREPEND)(const void *_in1,
if( 0 == left_over ) return; \
}
#else
+#error Target architecture lacks AVX support needed for _mm256_loadu_ps and _mm256_storeu_ps
+#endif /* __AVX__ */
+#else
#define OP_AVX_AVX_FLOAT_FUNC_3(op) {}
#endif /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */

#if defined(GENERATE_AVX_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
+#if __SSE__
#define OP_AVX_SSE_FLOAT_FUNC_3(op) \
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE_FLAG) ) { \
types_per_step = (128 / 8) / sizeof(float); \
@@ -861,6 +1072,9 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_##type,PREPEND)(const void *_in1,
} \
}
#else
+#error Target architecture lacks SSE support needed for _mm_loadu_ps and _mm_storeu_ps
+#endif /* __SSE__ */
+#else
#define OP_AVX_SSE_FLOAT_FUNC_3(op) {}
#endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */

@@ -895,12 +1109,13 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_float,PREPEND)(const void *_in1,
}

#if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512)
+#if __AVX512F__
#define OP_AVX_AVX512_DOUBLE_FUNC_3(op) \
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) { \
types_per_step = (512 / 8) / sizeof(double); \
for (; left_over >= types_per_step; left_over -= types_per_step) { \
- __m512d vecA = _mm512_loadu_pd((in1)); \
- __m512d vecB = _mm512_loadu_pd((in2)); \
+ __m512d vecA = _mm512_loadu_pd((in1)); \
+ __m512d vecB = _mm512_loadu_pd((in2)); \
in1 += types_per_step; \
in2 += types_per_step; \
__m512d res = _mm512_##op##_pd(vecA, vecB); \
@@ -910,16 +1125,20 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_float,PREPEND)(const void *_in1,
if( 0 == left_over ) return; \
}
#else
+#error Target architecture lacks AVX512F support needed for _mm512_loadu_pd and _mm512_storeu_pd
+#endif /* __AVX512F__ */
+#else
#define OP_AVX_AVX512_DOUBLE_FUNC_3(op) {}
#endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */

#if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2)
+#if __AVX__
#define OP_AVX_AVX_DOUBLE_FUNC_3(op) \
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) { \
types_per_step = (256 / 8) / sizeof(double); \
for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
- __m256d vecA = _mm256_loadu_pd(in1); \
- __m256d vecB = _mm256_loadu_pd(in2); \
+ __m256d vecA = _mm256_loadu_pd(in1); \
+ __m256d vecB = _mm256_loadu_pd(in2); \
in1 += types_per_step; \
in2 += types_per_step; \
__m256d res = _mm256_##op##_pd(vecA, vecB); \
@@ -929,10 +1148,14 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_float,PREPEND)(const void *_in1,
if( 0 == left_over ) return; \
}
#else
+#error Target architecture lacks AVX support needed for _mm256_loadu_pd and _mm256_storeu_pd
+#endif /* __AVX__ */
+#else
#define OP_AVX_AVX_DOUBLE_FUNC_3(op) {}
#endif /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */

#if defined(GENERATE_AVX_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX)
+#if __SSE2__
#define OP_AVX_SSE2_DOUBLE_FUNC_3(op) \
if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE2_FLAG) ) { \
types_per_step = (128 / 8) / sizeof(double); \
@@ -947,6 +1170,9 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_float,PREPEND)(const void *_in1,
} \
}
#else
+#error Target architecture lacks SSE2 support needed for _mm_loadu_pd and _mm_storeu_pd
+#endif /* __SSE2__ */
+#else
#define OP_AVX_SSE2_DOUBLE_FUNC_3(op) {}
#endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */

From 20be3fc25713ac2de3eb4d77b85248d7fe2bc28b Mon Sep 17 00:00:00 2001
From: George Bosilca <bosilca@icl.utk.edu>
Date: Tue, 5 Jan 2021 22:40:26 -0500
Subject: [PATCH 3/3] A better test for MPI_OP performance.

The test can now add a shift to any or all of the input and output
buffers, to assess the impact of unaligned operations. A minimal
sketch of the idea follows.
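
[Editorial sketch, not the test itself: a self-contained program timing MPI_Reduce_local at a few buffer shifts; element type, count, and shift range are arbitrary choices, and unlike the real test it does not reset the inout buffer between runs or verify results.]

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char *argv[])
{
    enum { COUNT = 1 << 20, MAX_SHIFT = 4 };
    MPI_Init(&argc, &argv);

    int *in  = malloc(COUNT * sizeof(int));
    int *out = malloc(COUNT * sizeof(int));
    for (int i = 0; i < COUNT; i++) { in[i] = i; out[i] = 2 * i; }

    /* Shifting the start pointers by k elements changes the alignment of
     * the first element, exercising the unaligned load/store paths. */
    for (int k = 0; k < MAX_SHIFT; k++) {
        double t = MPI_Wtime();
        MPI_Reduce_local(in + k, out + k, COUNT - k, MPI_INT, MPI_SUM);
        printf("shift %d: %.8f seconds\n", k, MPI_Wtime() - t);
    }

    free(in); free(out);
    MPI_Finalize();
    return 0;
}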
|
|
|
|
Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
|
|
---
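What the shift measures, as a sketch under assumptions (illustrative names, not code from this test): stepping a pointer a few elements away from a 64-byte-aligned base forces the unaligned paths of the vector kernels, and timing each offset separately exposes their cost.

#define _POSIX_C_SOURCE 200112L
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define N (1 << 20)
#define MAX_SHIFT 4

int main(void)
{
    void *base = NULL;
    if (posix_memalign(&base, 64, (N + MAX_SHIFT) * sizeof(double)) != 0)
        return 1;
    double *buf = (double *)base;
    for (size_t i = 0; i < N + MAX_SHIFT; i++)
        buf[i] = 1.0;
    for (int shift = 0; shift < MAX_SHIFT; shift++) {   /* like the -i option */
        double *v = buf + shift;                        /* deliberate misalignment */
        struct timespec t0, t1;
        clock_gettime(CLOCK_MONOTONIC, &t0);
        double acc = 0.0;
        for (size_t i = 0; i < N; i++)
            acc += v[i];
        clock_gettime(CLOCK_MONOTONIC, &t1);
        double sec = (t1.tv_sec - t0.tv_sec) + 1e-9 * (t1.tv_nsec - t0.tv_nsec);
        printf("shift %d: %.8f s (acc=%g)\n", shift, sec, acc);
    }
    free(base);
    return 0;
}

With this patch applied, a comparable sweep in the real test would be something like ./reduce_local -o sum -t d -u 1048576 -i 4 -r 10, which times shifts 0 through 3 and prints one averaged duration per shift.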
 test/datatype/reduce_local.c | 161 ++++++++++++++++++++++-------------
 1 file changed, 104 insertions(+), 57 deletions(-)

diff --git a/test/datatype/reduce_local.c b/test/datatype/reduce_local.c
index 97890f94227..f227439b714 100644
--- a/test/datatype/reduce_local.c
+++ b/test/datatype/reduce_local.c
@@ -59,7 +59,7 @@ static int total_errors = 0;
            _a < _b ? _a : _b; })
 
 static void print_status(char* op, char* type, int type_size,
-                         int count, double duration,
+                         int count, int max_shift, double *duration, int repeats,
                          int correct )
 {
     if(correct) {
@@ -68,7 +68,15 @@ static void print_status(char* op, char* type, int type_size,
         printf("%-10s %s [\033[1;31mfail\033[0m]", op, type);
         total_errors++;
     }
-    printf(" count  %-10d  time %.6f seconds\n", count, duration);
+    if( 1 == max_shift ) {
+        printf(" count  %-10d  time (seconds) %.8f\n", count, duration[0] / repeats);
+    } else {
+        printf(" count  %-10d  time (seconds / shifts) ", count);
+        for( int i = 0; i < max_shift; i++ ) {
+            printf("%.8f ", duration[i] / repeats );
+        }
+        printf("\n");
+    }
 }
 
 static int do_ops_built = 0;
@@ -115,19 +123,23 @@ do { \
         const TYPE *_p1 = ((TYPE*)(INBUF)), *_p3 = ((TYPE*)(CHECK_BUF)); \
         TYPE *_p2 = ((TYPE*)(INOUT_BUF)); \
         skip_op_type = 0; \
-        for(int _k = 0; _k < min((COUNT), 4); +_k++ ) { \
-            memcpy(_p2, _p3, sizeof(TYPE) * (COUNT)); \
-            tstart = MPI_Wtime(); \
-            MPI_Reduce_local(_p1+_k, _p2+_k, (COUNT)-_k, (MPITYPE), (MPIOP)); \
-            tend = MPI_Wtime(); \
-            if( check ) { \
-                for( i = 0; i < (COUNT)-_k; i++ ) { \
-                    if(((_p2+_k)[i]) == (((_p1+_k)[i]) OPNAME ((_p3+_k)[i]))) \
-                        continue; \
-                    printf("First error at alignment %d position %d (%" TYPE_PREFIX " %s %" TYPE_PREFIX " != %" TYPE_PREFIX ")\n", \
-                           _k, i, (_p1+_k)[i], (#OPNAME), (_p3+_k)[i], (_p2+_k)[i]); \
-                    correctness = 0; \
-                    break; \
+        for(int _k = 0; _k < min((COUNT), max_shift); +_k++ ) { \
+            duration[_k] = 0.0; \
+            for(int _r = repeats; _r > 0; _r--) { \
+                memcpy(_p2, _p3, sizeof(TYPE) * (COUNT)); \
+                tstart = MPI_Wtime(); \
+                MPI_Reduce_local(_p1+_k, _p2+_k, (COUNT)-_k, (MPITYPE), (MPIOP)); \
+                tend = MPI_Wtime(); \
+                duration[_k] += (tend - tstart); \
+                if( check ) { \
+                    for( i = 0; i < (COUNT)-_k; i++ ) { \
+                        if(((_p2+_k)[i]) == (((_p1+_k)[i]) OPNAME ((_p3+_k)[i]))) \
+                            continue; \
+                        printf("First error at alignment %d position %d (%" TYPE_PREFIX " %s %" TYPE_PREFIX " != %" TYPE_PREFIX ")\n", \
+                               _k, i, (_p1+_k)[i], (#OPNAME), (_p3+_k)[i], (_p2+_k)[i]); \
+                        correctness = 0; \
+                        break; \
+                    } \
                 } \
             } \
         } \
@@ -139,20 +151,24 @@ do { \
         const TYPE *_p1 = ((TYPE*)(INBUF)), *_p3 = ((TYPE*)(CHECK_BUF)); \
         TYPE *_p2 = ((TYPE*)(INOUT_BUF)); \
         skip_op_type = 0; \
-        for(int _k = 0; _k < min((COUNT), 4); +_k++ ) { \
-            memcpy(_p2, _p3, sizeof(TYPE) * (COUNT)); \
-            tstart = MPI_Wtime(); \
-            MPI_Reduce_local(_p1+_k, _p2+_k, (COUNT), (MPITYPE), (MPIOP)); \
-            tend = MPI_Wtime(); \
-            if( check ) { \
-                for( i = 0; i < (COUNT); i++ ) { \
-                    TYPE _v1 = *(_p1+_k), _v2 = *(_p2+_k), _v3 = *(_p3+_k); \
-                    if(_v2 == OPNAME(_v1, _v3)) \
-                        continue; \
-                    printf("First error at alignment %d position %d (%" TYPE_PREFIX " != %s(%" TYPE_PREFIX ", %" TYPE_PREFIX ")\n", \
-                           _k, i, _v1, (#OPNAME), _v3, _v2); \
-                    correctness = 0; \
-                    break; \
+        for(int _k = 0; _k < min((COUNT), max_shift); +_k++ ) { \
+            duration[_k] = 0.0; \
+            for(int _r = repeats; _r > 0; _r--) { \
+                memcpy(_p2, _p3, sizeof(TYPE) * (COUNT)); \
+                tstart = MPI_Wtime(); \
+                MPI_Reduce_local(_p1+_k, _p2+_k, (COUNT), (MPITYPE), (MPIOP)); \
+                tend = MPI_Wtime(); \
+                duration[_k] += (tend - tstart); \
+                if( check ) { \
+                    for( i = 0; i < (COUNT); i++ ) { \
+                        TYPE _v1 = *(_p1+_k), _v2 = *(_p2+_k), _v3 = *(_p3+_k); \
+                        if(_v2 == OPNAME(_v1, _v3)) \
+                            continue; \
+                        printf("First error at alignment %d position %d (%" TYPE_PREFIX " != %s(%" TYPE_PREFIX ", %" TYPE_PREFIX ")\n", \
+                               _k, i, _v1, (#OPNAME), _v3, _v2); \
+                        correctness = 0; \
+                        break; \
+                    } \
                 } \
             } \
         } \
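
Both macros now time each shift as a sum over `repeats` back-to-back reductions (print_status later divides by `repeats`), re-seeding the inout buffer with memcpy so every repetition reduces identical operands. A stripped-down sketch of that measurement loop without the macro machinery (illustrative names; assumes MPI is already initialized):

#include <string.h>
#include <mpi.h>

/* Accumulate the wall time of `repeats` reductions per shift, re-seeding
 * `inout` from `seed` before each run; the caller divides by `repeats`. */
static void time_shifts(const double *in, double *inout, const double *seed,
                        int count, int max_shift, int repeats, double *duration)
{
    for (int k = 0; k < max_shift; k++) {
        duration[k] = 0.0;
        for (int r = 0; r < repeats; r++) {
            memcpy(inout, seed, sizeof(double) * count);
            double t0 = MPI_Wtime();
            MPI_Reduce_local(in + k, inout + k, count - k, MPI_DOUBLE, MPI_SUM);
            duration[k] += MPI_Wtime() - t0;
        }
    }
}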
@@ -163,24 +179,36 @@ int main(int argc, char **argv)
 {
     static void *in_buf = NULL, *inout_buf = NULL, *inout_check_buf = NULL;
     int count, type_size = 8, rank, size, provided, correctness = 1;
-    int repeats = 1, i, c;
-    double tstart, tend;
+    int repeats = 1, i, c, op1_alignment = 0, res_alignment = 0;
+    int max_shift = 4;
+    double *duration, tstart, tend;
     bool check = true;
     char type[5] = "uifd", *op = "sum", *mpi_type;
     int lower = 1, upper = 1000000, skip_op_type;
     MPI_Op mpi_op;
 
-    while( -1 != (c = getopt(argc, argv, "l:u:t:o:s:n:vfh")) ) {
+    while( -1 != (c = getopt(argc, argv, "l:u:r:t:o:i:s:n:1:2:vfh")) ) {
         switch(c) {
         case 'l':
             lower = atoi(optarg);
             if( lower <= 0 ) {
-                fprintf(stderr, "The number of elements must be positive\n");
+                fprintf(stderr, "The lower number of elements must be positive\n");
                 exit(-1);
             }
             break;
         case 'u':
             upper = atoi(optarg);
+            if( upper <= 0 ) {
+                fprintf(stderr, "The upper number of elements must be positive\n");
+                exit(-1);
+            }
+            break;
+        case 'i':
+            max_shift = atoi(optarg);
+            if( max_shift <= 0 ) {
+                fprintf(stderr, "The max shift must be positive\n");
+                exit(-1);
+            }
             break;
         case 'f':
             check = false;
@@ -216,14 +244,32 @@ int main(int argc, char **argv)
                 exit(-1);
             }
             break;
+        case '1':
+            op1_alignment = atoi(optarg);
+            if( op1_alignment < 0 ) {
+                fprintf(stderr, "alignment for the first operand must be non-negative\n");
+                exit(-1);
+            }
+            break;
+        case '2':
+            res_alignment = atoi(optarg);
+            if( res_alignment < 0 ) {
+                fprintf(stderr, "alignment for the result must be non-negative\n");
+                exit(-1);
+            }
+            break;
         case 'h':
             fprintf(stdout, "%s options are:\n"
                     " -l <number> : lower number of elements\n"
                     " -u <number> : upper number of elements\n"
                     " -s <type_size> : 8, 16, 32 or 64 bits elements\n"
                     " -t [i,u,f,d] : type of the elements to apply the operations on\n"
+                    " -r <number> : number of repetitions for each test\n"
                    " -o <op> : comma separated list of operations to execute among\n"
                    "           sum, min, max, prod, bor, bxor, band\n"
+                    " -i <number> : shift on all buffers to check alignment\n"
+                    " -1 <number> : (mis)alignment in elements for the first op\n"
+                    " -2 <number> : (mis)alignment in elements for the result\n"
                     " -v: increase the verbosity level\n"
                     " -h: this help message\n", argv[0]);
             exit(0);
@@ -233,9 +279,10 @@ int main(int argc, char **argv)
     if( !do_ops_built ) {  /* not yet done, take the default */
         build_do_ops( "all", do_ops);
     }
-    in_buf          = malloc(upper * sizeof(double));
-    inout_buf       = malloc(upper * sizeof(double));
-    inout_check_buf = malloc(upper * sizeof(double));
+    posix_memalign( &in_buf, 64, (upper + op1_alignment) * sizeof(double));
+    posix_memalign( &inout_buf, 64, (upper + res_alignment) * sizeof(double));
+    posix_memalign( &inout_check_buf, 64, upper * sizeof(double));
+    duration = (double*)malloc(max_shift * sizeof(double));
 
     ompi_mpi_init(argc, argv, MPI_THREAD_SERIALIZED, &provided, false);
 
@@ -253,8 +300,8 @@ int main(int argc, char **argv)
             correctness = 1;
             if('i' == type[type_idx]) {
                 if( 8 == type_size ) {
-                    int8_t *in_int8 = (int8_t*)in_buf,
-                        *inout_int8 = (int8_t*)inout_buf,
+                    int8_t *in_int8 = (int8_t*)((char*)in_buf + op1_alignment * sizeof(int8_t)),
+                        *inout_int8 = (int8_t*)((char*)inout_buf + res_alignment * sizeof(int8_t)),
                         *inout_int8_for_check = (int8_t*)inout_check_buf;
                     for( i = 0; i < count; i++ ) {
                         in_int8[i] = 5;
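
The same idiom repeats for every type below: the requested (mis)alignment is counted in elements, so the byte offset applied to the 64-byte-aligned base scales with sizeof(type). A compact sketch of the idiom (illustrative name, not code from the patch):

#include <stdint.h>

/* `alignment` counts elements: for int8_t this is a byte offset, while the
 * analogous double version would advance in 8-byte steps. */
static inline int8_t *shift_int8(void *aligned_base, int alignment)
{
    return (int8_t *)((char *)aligned_base + alignment * sizeof(int8_t));
}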
@@ -299,8 +346,8 @@ int main(int argc, char **argv)
                     }
                 }
                 if( 16 == type_size ) {
-                    int16_t *in_int16 = (int16_t*)in_buf,
-                        *inout_int16 = (int16_t*)inout_buf,
+                    int16_t *in_int16 = (int16_t*)((char*)in_buf + op1_alignment * sizeof(int16_t)),
+                        *inout_int16 = (int16_t*)((char*)inout_buf + res_alignment * sizeof(int16_t)),
                         *inout_int16_for_check = (int16_t*)inout_check_buf;
                     for( i = 0; i < count; i++ ) {
                         in_int16[i] = 5;
@@ -345,8 +392,8 @@ int main(int argc, char **argv)
                     }
                 }
                 if( 32 == type_size ) {
-                    int32_t *in_int32 = (int32_t*)in_buf,
-                        *inout_int32 = (int32_t*)inout_buf,
+                    int32_t *in_int32 = (int32_t*)((char*)in_buf + op1_alignment * sizeof(int32_t)),
+                        *inout_int32 = (int32_t*)((char*)inout_buf + res_alignment * sizeof(int32_t)),
                         *inout_int32_for_check = (int32_t*)inout_check_buf;
                     for( i = 0; i < count; i++ ) {
                         in_int32[i] = 5;
@@ -391,8 +438,8 @@ int main(int argc, char **argv)
                     }
                 }
                 if( 64 == type_size ) {
-                    int64_t *in_int64 = (int64_t*)in_buf,
-                        *inout_int64 = (int64_t*)inout_buf,
+                    int64_t *in_int64 = (int64_t*)((char*)in_buf + op1_alignment * sizeof(int64_t)),
+                        *inout_int64 = (int64_t*)((char*)inout_buf + res_alignment * sizeof(int64_t)),
                         *inout_int64_for_check = (int64_t*)inout_check_buf;
                     for( i = 0; i < count; i++ ) {
                         in_int64[i] = 5;
@@ -440,8 +487,8 @@ int main(int argc, char **argv)
 
             if( 'u' == type[type_idx] ) {
                 if( 8 == type_size ) {
-                    uint8_t *in_uint8 = (uint8_t*)in_buf,
-                        *inout_uint8 = (uint8_t*)inout_buf,
+                    uint8_t *in_uint8 = (uint8_t*)((char*)in_buf + op1_alignment * sizeof(uint8_t)),
+                        *inout_uint8 = (uint8_t*)((char*)inout_buf + res_alignment * sizeof(uint8_t)),
                         *inout_uint8_for_check = (uint8_t*)inout_check_buf;
                     for( i = 0; i < count; i++ ) {
                         in_uint8[i] = 5;
@@ -486,8 +533,8 @@ int main(int argc, char **argv)
                     }
                 }
                 if( 16 == type_size ) {
-                    uint16_t *in_uint16 = (uint16_t*)in_buf,
-                        *inout_uint16 = (uint16_t*)inout_buf,
+                    uint16_t *in_uint16 = (uint16_t*)((char*)in_buf + op1_alignment * sizeof(uint16_t)),
+                        *inout_uint16 = (uint16_t*)((char*)inout_buf + res_alignment * sizeof(uint16_t)),
                         *inout_uint16_for_check = (uint16_t*)inout_check_buf;
                     for( i = 0; i < count; i++ ) {
                         in_uint16[i] = 5;
@@ -532,8 +579,8 @@ int main(int argc, char **argv)
                     }
                 }
                 if( 32 == type_size ) {
-                    uint32_t *in_uint32 = (uint32_t*)in_buf,
-                        *inout_uint32 = (uint32_t*)inout_buf,
+                    uint32_t *in_uint32 = (uint32_t*)((char*)in_buf + op1_alignment * sizeof(uint32_t)),
+                        *inout_uint32 = (uint32_t*)((char*)inout_buf + res_alignment * sizeof(uint32_t)),
                         *inout_uint32_for_check = (uint32_t*)inout_check_buf;
                     for( i = 0; i < count; i++ ) {
                         in_uint32[i] = 5;
@@ -578,8 +625,8 @@ int main(int argc, char **argv)
                     }
                 }
                 if( 64 == type_size ) {
-                    uint64_t *in_uint64 = (uint64_t*)in_buf,
-                        *inout_uint64 = (uint64_t*)inout_buf,
+                    uint64_t *in_uint64 = (uint64_t*)((char*)in_buf + op1_alignment * sizeof(uint64_t)),
+                        *inout_uint64 = (uint64_t*)((char*)inout_buf + res_alignment * sizeof(uint64_t)),
                         *inout_uint64_for_check = (uint64_t*)inout_check_buf;
                     for( i = 0; i < count; i++ ) {
                         in_uint64[i] = 5;
@@ -626,8 +673,8 @@ int main(int argc, char **argv)
             }
 
             if( 'f' == type[type_idx] ) {
-                float *in_float = (float*)in_buf,
-                    *inout_float = (float*)inout_buf,
+                float *in_float = (float*)((char*)in_buf + op1_alignment * sizeof(float)),
+                    *inout_float = (float*)((char*)inout_buf + res_alignment * sizeof(float)),
                     *inout_float_for_check = (float*)inout_check_buf;
                 for( i = 0; i < count; i++ ) {
                     in_float[i] = 1000.0+1;
@@ -658,8 +705,8 @@ int main(int argc, char **argv)
             }
 
             if( 'd' == type[type_idx] ) {
-                double *in_double = (double*)in_buf,
-                    *inout_double = (double*)inout_buf,
+                double *in_double = (double*)((char*)in_buf + op1_alignment * sizeof(double)),
+                    *inout_double = (double*)((char*)inout_buf + res_alignment * sizeof(double)),
                     *inout_double_for_check = (double*)inout_check_buf;
                 for( i = 0; i < count; i++ ) {
                     in_double[i] = 10.0+1;
@@ -691,7 +738,7 @@ int main(int argc, char **argv)
 check_and_continue:
             if( !skip_op_type )
                 print_status(array_of_ops[do_ops[op_idx]].mpi_op_name,
-                             mpi_type, type_size, count, tend-tstart, correctness);
+                             mpi_type, type_size, count, max_shift, duration, repeats, correctness);
             }
         if( !skip_op_type )
             printf("\n");