diff --git a/.gitignore b/.gitignore
index a4e3e90..056c3c7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1 @@
-SOURCES/mesa-20.3.3.tar.xz
+SOURCES/mesa-21.1.5.tar.xz
diff --git a/.mesa.metadata b/.mesa.metadata
index 3d34979..b800049 100644
--- a/.mesa.metadata
+++ b/.mesa.metadata
@@ -1 +1 @@
-c0e42fada2b306a6d9740376398c0d8b0a130427 SOURCES/mesa-20.3.3.tar.xz
+6962198a822b83195065611e253cde98f627e904 SOURCES/mesa-21.1.5.tar.xz
diff --git a/SOURCES/Makefile b/SOURCES/Makefile
index eea9f33..f17e9fd 100644
--- a/SOURCES/Makefile
+++ b/SOURCES/Makefile
@@ -1,4 +1,4 @@
-VERSION ?= 20.3.3
+VERSION ?= 21.1.5
 SANITIZE ?= 1
 
 DIRNAME = mesa-${VERSION}
diff --git a/SOURCES/anv-remove-warning.patch b/SOURCES/anv-remove-warning.patch
deleted file mode 100644
index 130a050..0000000
--- a/SOURCES/anv-remove-warning.patch
+++ /dev/null
@@ -1,13 +0,0 @@
-diff -up mesa-20.3.3/src/intel/vulkan/anv_perf.c.dma mesa-20.3.3/src/intel/vulkan/anv_perf.c
---- mesa-20.3.3/src/intel/vulkan/anv_perf.c.dma	2021-02-16 12:56:09.881084752 +1000
-+++ mesa-20.3.3/src/intel/vulkan/anv_perf.c	2021-02-16 12:56:14.626213956 +1000
-@@ -47,9 +47,6 @@ anv_get_perf(const struct gen_device_inf
-    gen_perf_init_metrics(perf, devinfo, fd, false /* pipeline statistics */);
- 
-    if (!perf->n_queries) {
--      if (perf->platform_supported)
--         mesa_logw("Performance support disabled, "
--                   "consider sysctl dev.i915.perf_stream_paranoid=0\n");
-       goto err;
-    }
- 
diff --git a/SOURCES/cpu-affinity-fixes-20.3.3.patch b/SOURCES/cpu-affinity-fixes-20.3.3.patch
deleted file mode 100644
index d11f5c4..0000000
--- a/SOURCES/cpu-affinity-fixes-20.3.3.patch
+++ /dev/null
@@ -1,1583 +0,0 @@
-diff --git a/src/amd/compiler/tests/main.cpp b/src/amd/compiler/tests/main.cpp
-index cb646e2dd30..eac0a244adf 100644
---- a/src/amd/compiler/tests/main.cpp
-+++ b/src/amd/compiler/tests/main.cpp
-@@ -34,6 +34,8 @@
- #include "aco_ir.h"
- #include "framework.h"
- 
-+#include "util/u_cpu_detect.h"
-+
- static const char *help_message =
-    "Usage: %s [-h] [-l --list] [--no-check] [TEST [TEST ...]]\n"
-    "\n"
-@@ -227,6 +229,8 @@ int main(int argc, char **argv)
-       return 99;
-    }
- 
-+   util_cpu_detect();
-+
-    if (do_list) {
-       for (auto test : tests)
-          printf("%s\n", test.first.c_str());
-diff --git a/src/compiler/glsl/standalone.cpp b/src/compiler/glsl/standalone.cpp
-index ca187001186..2714d8b95ed 100644
---- a/src/compiler/glsl/standalone.cpp
-+++ b/src/compiler/glsl/standalone.cpp
-@@ -401,6 +401,8 @@ standalone_compile_shader(const struct standalone_options *_options,
-    int status = EXIT_SUCCESS;
-    bool glsl_es = false;
- 
-+   util_cpu_detect();
-+
-    options = _options;
- 
-    switch (options->glsl_version) {
-diff --git a/src/compiler/nir/tests/negative_equal_tests.cpp b/src/compiler/nir/tests/negative_equal_tests.cpp
-index f83041a4fbf..76472e48309 100644
---- a/src/compiler/nir/tests/negative_equal_tests.cpp
-+++ b/src/compiler/nir/tests/negative_equal_tests.cpp
-@@ -36,6 +36,7 @@ protected:
-    const_value_negative_equal_test()
-    {
-       glsl_type_singleton_init_or_ref();
-+      util_cpu_detect();
- 
-       memset(c1, 0, sizeof(c1));
-       memset(c2, 0, sizeof(c2));
-@@ -55,6 +56,7 @@ protected:
-    alu_srcs_negative_equal_test()
-    {
-       glsl_type_singleton_init_or_ref();
-+      util_cpu_detect();
- 
-       static const nir_shader_compiler_options options = { };
-       nir_builder_init_simple_shader(&bld, NULL, MESA_SHADER_VERTEX, &options);
-diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
-index 165d73d94fc..33269e528fe 100644
---- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
-+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
-@@ -104,13 +104,13 @@ lp_build_min_simple(struct lp_build_context *bld,
- 
-    /* TODO: optimize the constant case */
- 
--   if (type.floating && util_cpu_caps.has_sse) {
-+   if (type.floating && util_get_cpu_caps()->has_sse) {
-       if (type.width == 32) {
-          if (type.length == 1) {
-             intrinsic = "llvm.x86.sse.min.ss";
-             intr_size = 128;
-          }
--         else if (type.length <= 4 || !util_cpu_caps.has_avx) {
-+         else if (type.length <= 4 || !util_get_cpu_caps()->has_avx) {
-             intrinsic = "llvm.x86.sse.min.ps";
-             intr_size = 128;
-          }
-@@ -119,12 +119,12 @@ lp_build_min_simple(struct lp_build_context *bld,
-             intr_size = 256;
-          }
-       }
--      if (type.width == 64 && util_cpu_caps.has_sse2) {
-+      if (type.width == 64 && util_get_cpu_caps()->has_sse2) {
-          if (type.length == 1) {
-             intrinsic = "llvm.x86.sse2.min.sd";
-             intr_size = 128;
-          }
--         else if (type.length == 2 || !util_cpu_caps.has_avx) {
-+         else if (type.length == 2 || !util_get_cpu_caps()->has_avx) {
-             intrinsic = "llvm.x86.sse2.min.pd";
-             intr_size = 128;
-          }
-@@ -134,7 +134,7 @@ lp_build_min_simple(struct lp_build_context *bld,
-          }
-       }
-    }
--   else if (type.floating && util_cpu_caps.has_altivec) {
-+   else if (type.floating && util_get_cpu_caps()->has_altivec) {
-       if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
-           nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
-          debug_printf("%s: altivec doesn't support nan return nan behavior\n",
-@@ -144,7 +144,7 @@ lp_build_min_simple(struct lp_build_context *bld,
-          intrinsic = "llvm.ppc.altivec.vminfp";
-          intr_size = 128;
-       }
--   } else if (util_cpu_caps.has_altivec) {
-+   } else if (util_get_cpu_caps()->has_altivec) {
-       intr_size = 128;
-       if (type.width == 8) {
-          if (!type.sign) {
-@@ -174,7 +174,7 @@ lp_build_min_simple(struct lp_build_context *bld,
-        * The sse intrinsics return the second operator in case of nan by
-        * default so we need to special code to handle those.
-        */
--      if (util_cpu_caps.has_sse && type.floating &&
-+      if (util_get_cpu_caps()->has_sse && type.floating &&
-           nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
-           nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
-           nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
-@@ -274,13 +274,13 @@ lp_build_max_simple(struct lp_build_context *bld,
- 
-    /* TODO: optimize the constant case */
- 
--   if (type.floating && util_cpu_caps.has_sse) {
-+   if (type.floating && util_get_cpu_caps()->has_sse) {
-       if (type.width == 32) {
-          if (type.length == 1) {
-             intrinsic = "llvm.x86.sse.max.ss";
-             intr_size = 128;
-          }
--         else if (type.length <= 4 || !util_cpu_caps.has_avx) {
-+         else if (type.length <= 4 || !util_get_cpu_caps()->has_avx) {
-             intrinsic = "llvm.x86.sse.max.ps";
-             intr_size = 128;
-          }
-@@ -289,12 +289,12 @@ lp_build_max_simple(struct lp_build_context *bld,
-             intr_size = 256;
-          }
-       }
--      if (type.width == 64 && util_cpu_caps.has_sse2) {
-+      if (type.width == 64 && util_get_cpu_caps()->has_sse2) {
-          if (type.length == 1) {
-             intrinsic = "llvm.x86.sse2.max.sd";
-             intr_size = 128;
-          }
--         else if (type.length == 2 || !util_cpu_caps.has_avx) {
-+         else if (type.length == 2 || !util_get_cpu_caps()->has_avx) {
-             intrinsic = "llvm.x86.sse2.max.pd";
-             intr_size = 128;
-          }
-@@ -304,7 +304,7 @@ lp_build_max_simple(struct lp_build_context *bld,
-          }
-       }
-    }
--   else if (type.floating && util_cpu_caps.has_altivec) {
-+   else if (type.floating && util_get_cpu_caps()->has_altivec) {
-       if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
-           nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
-          debug_printf("%s: altivec doesn't support nan return nan behavior\n",
-@@ -314,7 +314,7 @@ lp_build_max_simple(struct lp_build_context *bld,
-          intrinsic = "llvm.ppc.altivec.vmaxfp";
-          intr_size = 128;
-       }
--   } else if (util_cpu_caps.has_altivec) {
-+   } else if (util_get_cpu_caps()->has_altivec) {
-      intr_size = 128;
-      if (type.width == 8) {
-        if (!type.sign) {
-@@ -338,7 +338,7 @@ lp_build_max_simple(struct lp_build_context *bld,
-    }
- 
-    if (intrinsic) {
--      if (util_cpu_caps.has_sse && type.floating &&
-+      if (util_get_cpu_caps()->has_sse && type.floating &&
-           nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
-           nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
-           nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
-@@ -472,12 +472,12 @@ lp_build_add(struct lp_build_context *bld,
-             return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
-          }
-          if (type.width * type.length == 128) {
--            if (util_cpu_caps.has_sse2) {
-+            if (util_get_cpu_caps()->has_sse2) {
-                if (type.width == 8)
-                  intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
-                if (type.width == 16)
-                  intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
--            } else if (util_cpu_caps.has_altivec) {
-+            } else if (util_get_cpu_caps()->has_altivec) {
-                if (type.width == 8)
-                   intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
-                if (type.width == 16)
-@@ -485,7 +485,7 @@ lp_build_add(struct lp_build_context *bld,
-             }
-          }
-          if (type.width * type.length == 256) {
--            if (util_cpu_caps.has_avx2) {
-+            if (util_get_cpu_caps()->has_avx2) {
-                if (type.width == 8)
-                   intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
-                if (type.width == 16)
-@@ -713,11 +713,11 @@ lp_build_hadd_partial4(struct lp_build_context *bld,
-    tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
-    tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
- 
--   if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
-+   if (util_get_cpu_caps()->has_sse3 && bld->type.width == 32 &&
-        bld->type.length == 4) {
-       intrinsic = "llvm.x86.sse3.hadd.ps";
-    }
--   else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
-+   else if (util_get_cpu_caps()->has_avx && bld->type.width == 32 &&
-             bld->type.length == 8) {
-       intrinsic = "llvm.x86.avx.hadd.ps.256";
-    }
-@@ -796,12 +796,12 @@ lp_build_sub(struct lp_build_context *bld,
-             return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
-          }
-          if (type.width * type.length == 128) {
--            if (util_cpu_caps.has_sse2) {
-+            if (util_get_cpu_caps()->has_sse2) {
-                if (type.width == 8)
-                   intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
-                if (type.width == 16)
-                   intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
--            } else if (util_cpu_caps.has_altivec) {
-+            } else if (util_get_cpu_caps()->has_altivec) {
-                if (type.width == 8)
-                   intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
-                if (type.width == 16)
-@@ -809,7 +809,7 @@ lp_build_sub(struct lp_build_context *bld,
-             }
-          }
-          if (type.width * type.length == 256) {
--            if (util_cpu_caps.has_avx2) {
-+            if (util_get_cpu_caps()->has_avx2) {
-                if (type.width == 8)
-                   intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
-                if (type.width == 16)
-@@ -1078,8 +1078,8 @@ lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
-     */
-    if (LLVM_VERSION_MAJOR < 7 &&
-        (bld->type.length == 4 || bld->type.length == 8) &&
--       ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
--        util_cpu_caps.has_sse4_1)) {
-+       ((util_get_cpu_caps()->has_sse2 && (bld->type.sign == 0)) ||
-+        util_get_cpu_caps()->has_sse4_1)) {
-       const char *intrinsic = NULL;
-       LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
-       LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
-@@ -1096,7 +1096,7 @@ lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
-       aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
-       bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");
- 
--      if (util_cpu_caps.has_avx2 && bld->type.length == 8) {
-+      if (util_get_cpu_caps()->has_avx2 && bld->type.length == 8) {
-          if (bld->type.sign) {
-             intrinsic = "llvm.x86.avx2.pmul.dq";
-          } else {
-@@ -1331,8 +1331,8 @@ lp_build_div(struct lp_build_context *bld,
- 
-    /* fast rcp is disabled (just uses div), so makes no sense to try that */
-    if(FALSE &&
--      ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
--       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
-+      ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
-+       (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) &&
-       type.floating)
-       return lp_build_mul(bld, a, lp_build_rcp(bld, b));
- 
-@@ -1745,7 +1745,7 @@ lp_build_abs(struct lp_build_context *bld,
-       return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
-    }
- 
--   if(type.width*type.length == 128 && util_cpu_caps.has_ssse3 && LLVM_VERSION_MAJOR < 6) {
-+   if(type.width*type.length == 128 && util_get_cpu_caps()->has_ssse3 && LLVM_VERSION_MAJOR < 6) {
-       switch(type.width) {
-       case 8:
-          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
-@@ -1755,7 +1755,7 @@ lp_build_abs(struct lp_build_context *bld,
-          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
-       }
-    }
--   else if (type.width*type.length == 256 && util_cpu_caps.has_avx2 && LLVM_VERSION_MAJOR < 6) {
-+   else if (type.width*type.length == 256 && util_get_cpu_caps()->has_avx2 && LLVM_VERSION_MAJOR < 6) {
-       switch(type.width) {
-       case 8:
-          return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
-@@ -1897,15 +1897,15 @@ lp_build_int_to_float(struct lp_build_context *bld,
- static boolean
- arch_rounding_available(const struct lp_type type)
- {
--   if ((util_cpu_caps.has_sse4_1 &&
-+   if ((util_get_cpu_caps()->has_sse4_1 &&
-        (type.length == 1 || type.width*type.length == 128)) ||
--       (util_cpu_caps.has_avx && type.width*type.length == 256) ||
--       (util_cpu_caps.has_avx512f && type.width*type.length == 512))
-+       (util_get_cpu_caps()->has_avx && type.width*type.length == 256) ||
-+       (util_get_cpu_caps()->has_avx512f && type.width*type.length == 512))
-       return TRUE;
--   else if ((util_cpu_caps.has_altivec &&
-+   else if ((util_get_cpu_caps()->has_altivec &&
-             (type.width == 32 && type.length == 4)))
-       return TRUE;
--   else if (util_cpu_caps.has_neon)
-+   else if (util_get_cpu_caps()->has_neon)
-       return TRUE;
- 
-    return FALSE;
-@@ -1935,7 +1935,7 @@ lp_build_iround_nearest_sse2(struct lp_build_context *bld,
-    assert(type.width == 32);
- 
-    assert(lp_check_value(type, a));
--   assert(util_cpu_caps.has_sse2);
-+   assert(util_get_cpu_caps()->has_sse2);
- 
-    /* This is relying on MXCSR rounding mode, which should always be nearest. */
-    if (type.length == 1) {
-@@ -1961,7 +1961,7 @@ lp_build_iround_nearest_sse2(struct lp_build_context *bld,
-       }
-       else {
-          assert(type.width*type.length == 256);
--         assert(util_cpu_caps.has_avx);
-+         assert(util_get_cpu_caps()->has_avx);
- 
-          intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
-       }
-@@ -1987,7 +1987,7 @@ lp_build_round_altivec(struct lp_build_context *bld,
-    assert(type.floating);
- 
-    assert(lp_check_value(type, a));
--   assert(util_cpu_caps.has_altivec);
-+   assert(util_get_cpu_caps()->has_altivec);
- 
-    (void)type;
- 
-@@ -2014,7 +2014,7 @@ lp_build_round_arch(struct lp_build_context *bld,
-                     LLVMValueRef a,
-                     enum lp_build_round_mode mode)
- {
--   if (util_cpu_caps.has_sse4_1 || util_cpu_caps.has_neon) {
-+   if (util_get_cpu_caps()->has_sse4_1 || util_get_cpu_caps()->has_neon) {
-       LLVMBuilderRef builder = bld->gallivm->builder;
-       const struct lp_type type = bld->type;
-       const char *intrinsic_root;
-@@ -2042,7 +2042,7 @@ lp_build_round_arch(struct lp_build_context *bld,
-       lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
-       return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
-    }
--   else /* (util_cpu_caps.has_altivec) */
-+   else /* (util_get_cpu_caps()->has_altivec) */
-      return lp_build_round_altivec(bld, a, mode);
- }
- 
-@@ -2377,9 +2377,9 @@ lp_build_iround(struct lp_build_context *bld,
- 
-    assert(lp_check_value(type, a));
- 
--   if ((util_cpu_caps.has_sse2 &&
-+   if ((util_get_cpu_caps()->has_sse2 &&
-        ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
--       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
-+       (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) {
-       return lp_build_iround_nearest_sse2(bld, a);
-    }
-    if (arch_rounding_available(type)) {
-@@ -2664,8 +2664,8 @@ lp_build_rcp(struct lp_build_context *bld,
-     * particular uses that require less workarounds.
-     */
- 
--   if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
--         (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
-+   if (FALSE && ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
-+         (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8))){
-       const unsigned num_iterations = 0;
-       LLVMValueRef res;
-       unsigned i;
-@@ -2784,8 +2784,8 @@ lp_build_fast_rsqrt_available(struct lp_type type)
- {
-    assert(type.floating);
- 
--   if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
--       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
-+   if ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
-+       (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) {
-       return true;
-    }
-    return false;
-@@ -3694,7 +3694,7 @@ lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
- LLVMValueRef
- lp_build_fpstate_get(struct gallivm_state *gallivm)
- {
--   if (util_cpu_caps.has_sse) {
-+   if (util_get_cpu_caps()->has_sse) {
-       LLVMBuilderRef builder = gallivm->builder;
-       LLVMValueRef mxcsr_ptr = lp_build_alloca(
-          gallivm,
-@@ -3715,7 +3715,7 @@ void
- lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
-                                   boolean zero)
- {
--   if (util_cpu_caps.has_sse) {
-+   if (util_get_cpu_caps()->has_sse) {
-       /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
-       int daz_ftz = _MM_FLUSH_ZERO_MASK;
- 
-@@ -3724,7 +3724,7 @@ lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
-       LLVMValueRef mxcsr =
-          LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
- 
--      if (util_cpu_caps.has_daz) {
-+      if (util_get_cpu_caps()->has_daz) {
-          /* Enable denormals are zero mode */
-          daz_ftz |= _MM_DENORMALS_ZERO_MASK;
-       }
-@@ -3745,7 +3745,7 @@ void
- lp_build_fpstate_set(struct gallivm_state *gallivm,
-                      LLVMValueRef mxcsr_ptr)
- {
--   if (util_cpu_caps.has_sse) {
-+   if (util_get_cpu_caps()->has_sse) {
-       LLVMBuilderRef builder = gallivm->builder;
-       mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
-                      LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
-diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
-index c68b8850473..af445b00c1a 100644
---- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
-+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
-@@ -101,7 +101,7 @@ lp_build_half_to_float(struct gallivm_state *gallivm,
-    LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type);
-    LLVMValueRef h;
- 
--   if (util_cpu_caps.has_f16c &&
-+   if (util_get_cpu_caps()->has_f16c &&
-        (src_length == 4 || src_length == 8)) {
-       if (LLVM_VERSION_MAJOR < 11) {
-          const char *intrinsic = NULL;
-@@ -167,7 +167,7 @@ lp_build_float_to_half(struct gallivm_state *gallivm,
-     * useless.
-     */
- 
--   if (util_cpu_caps.has_f16c &&
-+   if (util_get_cpu_caps()->has_f16c &&
-        (length == 4 || length == 8)) {
-       struct lp_type i168_type = lp_type_int_vec(16, 16 * 8);
-       unsigned mode = 3; /* same as LP_BUILD_ROUND_TRUNCATE */
-@@ -489,7 +489,7 @@ int lp_build_conv_auto(struct gallivm_state *gallivm,
- 
-       /* Special case 4x4x32 --> 1x16x8 */
-       if (src_type.length == 4 &&
--            (util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec))
-+            (util_get_cpu_caps()->has_sse2 || util_get_cpu_caps()->has_altivec))
-       {
-          num_dsts = (num_srcs + 3) / 4;
-          dst_type->length = num_srcs * 4 >= 16 ? 16 : num_srcs * 4;
-@@ -500,7 +500,7 @@ int lp_build_conv_auto(struct gallivm_state *gallivm,
- 
-       /* Special case 2x8x32 --> 1x16x8 */
-       if (src_type.length == 8 &&
--          util_cpu_caps.has_avx)
-+          util_get_cpu_caps()->has_avx)
-       {
-          num_dsts = (num_srcs + 1) / 2;
-          dst_type->length = num_srcs * 8 >= 16 ? 16 : num_srcs * 8;
-@@ -597,7 +597,7 @@ lp_build_conv(struct gallivm_state *gallivm,
-        ((dst_type.length == 16 && 4 * num_dsts == num_srcs) ||
-         (num_dsts == 1 && dst_type.length * num_srcs == 16 && num_srcs != 3)) &&
- 
--       (util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec))
-+       (util_get_cpu_caps()->has_sse2 || util_get_cpu_caps()->has_altivec))
-    {
-       struct lp_build_context bld;
-       struct lp_type int16_type, int32_type;
-@@ -710,7 +710,7 @@ lp_build_conv(struct gallivm_state *gallivm,
-       ((dst_type.length == 16 && 2 * num_dsts == num_srcs) ||
-        (num_dsts == 1 && dst_type.length * num_srcs == 8)) &&
- 
--      util_cpu_caps.has_avx) {
-+      util_get_cpu_caps()->has_avx) {
- 
-       struct lp_build_context bld;
-       struct lp_type int16_type, int32_type;
-diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c b/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c
-index 174857e06d9..e17c7881e7d 100644
---- a/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c
-+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c
-@@ -642,8 +642,8 @@ s3tc_dxt1_full_to_rgba_aos(struct gallivm_state *gallivm,
-        * XXX with sse2 and 16x8 vectors, should use pavgb even when n == 1.
-        * Much cheaper (but we don't care that much if n == 1).
-        */
--      if ((util_cpu_caps.has_sse2 && n == 4) ||
--          (util_cpu_caps.has_avx2 && n == 8)) {
-+      if ((util_get_cpu_caps()->has_sse2 && n == 4) ||
-+          (util_get_cpu_caps()->has_avx2 && n == 8)) {
-          color2_2 = lp_build_pavgb(&bld8, colors0, colors1);
-          color2_2 = LLVMBuildBitCast(builder, color2_2, bld32.vec_type, "");
-       }
-@@ -1350,7 +1350,7 @@ s3tc_decode_block_dxt1(struct gallivm_state *gallivm,
-    if (is_dxt1_variant) {
-       LLVMValueRef color23_2, color2_2;
- 
--      if (util_cpu_caps.has_sse2) {
-+      if (util_get_cpu_caps()->has_sse2) {
-          LLVMValueRef intrargs[2];
-          intrargs[0] = LLVMBuildBitCast(builder, color01, bld8.vec_type, "");
-          /* same interleave as for lerp23 - correct result in 2nd element */
-@@ -1389,7 +1389,7 @@ s3tc_decode_block_dxt1(struct gallivm_state *gallivm,
-       color23 = lp_build_select(&bld32, sel_mask, color23, color23_2);
-    }
- 
--   if (util_cpu_caps.has_ssse3) {
-+   if (util_get_cpu_caps()->has_ssse3) {
-       /*
-        * Use pshufb as mini-lut. (Only doable with intrinsics as the
-        * final shuffles are non-constant. pshufb is awesome!)
-@@ -1689,7 +1689,7 @@ s3tc_decode_block_dxt5(struct gallivm_state *gallivm,
-    type16.sign = FALSE;
-    sel_mask = LLVMBuildBitCast(builder, sel_mask, bld8.vec_type, "");
- 
--   if (!util_cpu_caps.has_ssse3) {
-+   if (!util_get_cpu_caps()->has_ssse3) {
-       LLVMValueRef acodeg, mask1, acode0, acode1;
- 
-       /* extraction of the 3 bit values into something more useful is HARD */
-diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
-index 121452d7596..97deffe1de0 100644
---- a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
-+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
-@@ -90,7 +90,7 @@ uyvy_to_yuv_soa(struct gallivm_state *gallivm,
-     * per element. Didn't measure performance but cuts shader size
-     * by quite a bit (less difference if cpu has no sse4.1 support).
-     */
--   if (util_cpu_caps.has_sse2 && n > 1) {
-+   if (util_get_cpu_caps()->has_sse2 && n > 1) {
-       LLVMValueRef sel, tmp, tmp2;
-       struct lp_build_context bld32;
- 
-@@ -174,7 +174,7 @@ yuyv_to_yuv_soa(struct gallivm_state *gallivm,
-     * per element. Didn't measure performance but cuts shader size
-     * by quite a bit (less difference if cpu has no sse4.1 support).
-     */
--   if (util_cpu_caps.has_sse2 && n > 1) {
-+   if (util_get_cpu_caps()->has_sse2 && n > 1) {
-       LLVMValueRef sel, tmp;
-       struct lp_build_context bld32;
- 
-diff --git a/src/gallium/auxiliary/gallivm/lp_bld_gather.c b/src/gallium/auxiliary/gallivm/lp_bld_gather.c
-index e991b0dc375..42cc17371a0 100644
---- a/src/gallium/auxiliary/gallivm/lp_bld_gather.c
-+++ b/src/gallium/auxiliary/gallivm/lp_bld_gather.c
-@@ -488,7 +488,7 @@ lp_build_gather(struct gallivm_state *gallivm,
-        * 32bit/64bit fetches you're doing it wrong (this is gather, not
-        * conversion) and it would be awkward for floats.
-        */
--   } else if (util_cpu_caps.has_avx2 && !need_expansion &&
-+   } else if (util_get_cpu_caps()->has_avx2 && !need_expansion &&
-               src_width == 32 && (length == 4 || length == 8)) {
-       return lp_build_gather_avx2(gallivm, length, src_width, dst_type,
-                                   base_ptr, offsets);
-@@ -500,7 +500,7 @@ lp_build_gather(struct gallivm_state *gallivm,
-     * (In general, should be more of a win if the fetch is 256bit wide -
-     * this is true for the 32bit case above too.)
-     */
--   } else if (0 && util_cpu_caps.has_avx2 && !need_expansion &&
-+   } else if (0 && util_get_cpu_caps()->has_avx2 && !need_expansion &&
-               src_width == 64 && (length == 2 || length == 4)) {
-       return lp_build_gather_avx2(gallivm, length, src_width, dst_type,
-                                   base_ptr, offsets);
-diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c
-index 685ed0e58aa..dd428242cb9 100644
---- a/src/gallium/auxiliary/gallivm/lp_bld_init.c
-+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
-@@ -433,6 +433,7 @@ lp_build_init(void)
-    /* For simulating less capable machines */
- #ifdef DEBUG
-    if (debug_get_bool_option("LP_FORCE_SSE2", FALSE)) {
-+      extern struct util_cpu_caps_t util_cpu_caps;
-       assert(util_cpu_caps.has_sse2);
-       util_cpu_caps.has_sse3 = 0;
-       util_cpu_caps.has_ssse3 = 0;
-@@ -445,7 +446,7 @@ lp_build_init(void)
-    }
- #endif
- 
--   if (util_cpu_caps.has_avx2 || util_cpu_caps.has_avx) {
-+   if (util_get_cpu_caps()->has_avx2 || util_get_cpu_caps()->has_avx) {
-       lp_native_vector_width = 256;
-    } else {
-       /* Leave it at 128, even when no SIMD extensions are available.
-@@ -460,16 +461,16 @@ lp_build_init(void)
- #if LLVM_VERSION_MAJOR < 4
-    if (lp_native_vector_width <= 128) {
-       /* Hide AVX support, as often LLVM AVX intrinsics are only guarded by
--       * "util_cpu_caps.has_avx" predicate, and lack the
-+       * "util_get_cpu_caps()->has_avx" predicate, and lack the
-        * "lp_native_vector_width > 128" predicate. And also to ensure a more
-        * consistent behavior, allowing one to test SSE2 on AVX machines.
-        * XXX: should not play games with util_cpu_caps directly as it might
-        * get used for other things outside llvm too.
-        */
--      util_cpu_caps.has_avx = 0;
--      util_cpu_caps.has_avx2 = 0;
--      util_cpu_caps.has_f16c = 0;
--      util_cpu_caps.has_fma = 0;
-+      util_get_cpu_caps()->has_avx = 0;
-+      util_get_cpu_caps()->has_avx2 = 0;
-+      util_get_cpu_caps()->has_f16c = 0;
-+      util_get_cpu_caps()->has_fma = 0;
-    }
- #endif
- 
-@@ -482,7 +483,7 @@ lp_build_init(void)
-     * Right now denorms get explicitly disabled (but elsewhere) for x86,
-     * whereas ppc64 explicitly enables them...
-     */
--   if (util_cpu_caps.has_altivec) {
-+   if (util_get_cpu_caps()->has_altivec) {
-       unsigned short mask[] = { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
-                                 0xFFFF, 0xFFFF, 0xFFFE, 0xFFFF };
-       __asm (
-diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
-index 315977ae745..3ed3b5a74b1 100644
---- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c
-+++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
-@@ -196,7 +196,7 @@ lp_build_compare(struct gallivm_state *gallivm,
- 
-    if (!type.floating && !type.sign &&
-        type.width * type.length == 128 &&
--       util_cpu_caps.has_sse2 &&
-+       util_get_cpu_caps()->has_sse2 &&
-        (func == PIPE_FUNC_LESS ||
-         func == PIPE_FUNC_LEQUAL ||
-         func == PIPE_FUNC_GREATER ||
-@@ -348,11 +348,11 @@ lp_build_select(struct lp_build_context *bld,
- 
-       res = LLVMBuildSelect(builder, mask, a, b, "");
-    }
--   else if (((util_cpu_caps.has_sse4_1 &&
-+   else if (((util_get_cpu_caps()->has_sse4_1 &&
-               type.width * type.length == 128) ||
--             (util_cpu_caps.has_avx &&
-+             (util_get_cpu_caps()->has_avx &&
-               type.width * type.length == 256 && type.width >= 32) ||
--             (util_cpu_caps.has_avx2 &&
-+             (util_get_cpu_caps()->has_avx2 &&
-               type.width * type.length == 256)) &&
-             !LLVMIsConstant(a) &&
-             !LLVMIsConstant(b) &&
-@@ -379,7 +379,7 @@ lp_build_select(struct lp_build_context *bld,
-             intrinsic = "llvm.x86.avx.blendv.ps.256";
-             arg_type = LLVMVectorType(LLVMFloatTypeInContext(lc), 8);
-          } else {
--            assert(util_cpu_caps.has_avx2);
-+            assert(util_get_cpu_caps()->has_avx2);
-             intrinsic = "llvm.x86.avx2.pblendvb";
-             arg_type = LLVMVectorType(LLVMInt8TypeInContext(lc), 32);
-          }
-diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
-index 9b75676a4e2..4f3e696816c 100644
---- a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
-+++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
-@@ -400,22 +400,22 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT,
-     * http://llvm.org/PR19429
-     * http://llvm.org/PR16721
-     */
--   MAttrs.push_back(util_cpu_caps.has_sse    ? "+sse"    : "-sse"   );
--   MAttrs.push_back(util_cpu_caps.has_sse2   ? "+sse2"   : "-sse2"  );
--   MAttrs.push_back(util_cpu_caps.has_sse3   ? "+sse3"   : "-sse3"  );
--   MAttrs.push_back(util_cpu_caps.has_ssse3  ? "+ssse3"  : "-ssse3" );
--   MAttrs.push_back(util_cpu_caps.has_sse4_1 ? "+sse4.1" : "-sse4.1");
--   MAttrs.push_back(util_cpu_caps.has_sse4_2 ? "+sse4.2" : "-sse4.2");
-+   MAttrs.push_back(util_get_cpu_caps()->has_sse    ? "+sse"    : "-sse"   );
-+   MAttrs.push_back(util_get_cpu_caps()->has_sse2   ? "+sse2"   : "-sse2"  );
-+   MAttrs.push_back(util_get_cpu_caps()->has_sse3   ? "+sse3"   : "-sse3"  );
-+   MAttrs.push_back(util_get_cpu_caps()->has_ssse3  ? "+ssse3"  : "-ssse3" );
-+   MAttrs.push_back(util_get_cpu_caps()->has_sse4_1 ? "+sse4.1" : "-sse4.1");
-+   MAttrs.push_back(util_get_cpu_caps()->has_sse4_2 ? "+sse4.2" : "-sse4.2");
-    /*
-     * AVX feature is not automatically detected from CPUID by the X86 target
-     * yet, because the old (yet default) JIT engine is not capable of
-     * emitting the opcodes. On newer llvm versions it is and at least some
-     * versions (tested with 3.3) will emit avx opcodes without this anyway.
-     */
--   MAttrs.push_back(util_cpu_caps.has_avx  ? "+avx"  : "-avx");
--   MAttrs.push_back(util_cpu_caps.has_f16c ? "+f16c" : "-f16c");
--   MAttrs.push_back(util_cpu_caps.has_fma  ? "+fma"  : "-fma");
--   MAttrs.push_back(util_cpu_caps.has_avx2 ? "+avx2" : "-avx2");
-+   MAttrs.push_back(util_get_cpu_caps()->has_avx  ? "+avx"  : "-avx");
-+   MAttrs.push_back(util_get_cpu_caps()->has_f16c ? "+f16c" : "-f16c");
-+   MAttrs.push_back(util_get_cpu_caps()->has_fma  ? "+fma"  : "-fma");
-+   MAttrs.push_back(util_get_cpu_caps()->has_avx2 ? "+avx2" : "-avx2");
-    /* disable avx512 and all subvariants */
-    MAttrs.push_back("-avx512cd");
-    MAttrs.push_back("-avx512er");
-@@ -426,7 +426,7 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT,
-    MAttrs.push_back("-avx512vl");
- #endif
- #if defined(PIPE_ARCH_ARM)
--   if (!util_cpu_caps.has_neon) {
-+   if (!util_get_cpu_caps()->has_neon) {
-       MAttrs.push_back("-neon");
-       MAttrs.push_back("-crypto");
-       MAttrs.push_back("-vfp2");
-@@ -434,7 +434,7 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT,
- #endif
- 
- #if defined(PIPE_ARCH_PPC)
--   MAttrs.push_back(util_cpu_caps.has_altivec ? "+altivec" : "-altivec");
-+   MAttrs.push_back(util_get_cpu_caps()->has_altivec ? "+altivec" : "-altivec");
- #if (LLVM_VERSION_MAJOR < 4)
-    /*
-     * Make sure VSX instructions are disabled
-@@ -444,7 +444,7 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT,
-     * https://llvm.org/bugs/show_bug.cgi?id=33531 (fixed in 4.0)
-     * https://llvm.org/bugs/show_bug.cgi?id=34647 (llc performance on certain unusual shader IR; intro'd in 4.0, pending as of 5.0)
-     */
--   if (util_cpu_caps.has_altivec) {
-+   if (util_get_cpu_caps()->has_altivec) {
-       MAttrs.push_back("-vsx");
-    }
- #else
-@@ -458,8 +458,8 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT,
-     * Make sure VSX instructions are ENABLED (if supported), unless
-     * VSX instructions are explicitly enabled/disabled via GALLIVM_VSX=1 or 0.
-     */
--   if (util_cpu_caps.has_altivec) {
--      MAttrs.push_back(util_cpu_caps.has_vsx ? "+vsx" : "-vsx");
-+   if (util_get_cpu_caps()->has_altivec) {
-+      MAttrs.push_back(util_get_cpu_caps()->has_vsx ? "+vsx" : "-vsx");
-    }
- #endif
- #endif
-diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
-index e1f652a9342..76e57c52f80 100644
---- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c
-+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
-@@ -322,7 +322,7 @@ lp_build_interleave2(struct gallivm_state *gallivm,
- {
-    LLVMValueRef shuffle;
- 
--   if (type.length == 2 && type.width == 128 && util_cpu_caps.has_avx) {
-+   if (type.length == 2 && type.width == 128 && util_get_cpu_caps()->has_avx) {
-       /*
-        * XXX: This is a workaround for llvm code generation deficiency. Strangely
-        * enough, while this needs vinsertf128/vextractf128 instructions (hence
-@@ -484,7 +484,7 @@ lp_build_unpack2_native(struct gallivm_state *gallivm,
- 
-    /* Interleave bits */
- #if UTIL_ARCH_LITTLE_ENDIAN
--   if (src_type.length * src_type.width == 256 && util_cpu_caps.has_avx2) {
-+   if (src_type.length * src_type.width == 256 && util_get_cpu_caps()->has_avx2) {
-       *dst_lo = lp_build_interleave2_half(gallivm, src_type, src, msb, 0);
-       *dst_hi = lp_build_interleave2_half(gallivm, src_type, src, msb, 1);
-    } else {
-@@ -585,22 +585,22 @@ lp_build_pack2(struct gallivm_state *gallivm,
-    assert(src_type.length * 2 == dst_type.length);
- 
-    /* Check for special cases first */
--   if ((util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec) &&
-+   if ((util_get_cpu_caps()->has_sse2 || util_get_cpu_caps()->has_altivec) &&
-         src_type.width * src_type.length >= 128) {
-       const char *intrinsic = NULL;
-       boolean swap_intrinsic_operands = FALSE;
- 
-       switch(src_type.width) {
-       case 32:
--         if (util_cpu_caps.has_sse2) {
-+         if (util_get_cpu_caps()->has_sse2) {
-            if (dst_type.sign) {
-               intrinsic = "llvm.x86.sse2.packssdw.128";
-            } else {
--              if (util_cpu_caps.has_sse4_1) {
-+              if (util_get_cpu_caps()->has_sse4_1) {
-                  intrinsic = "llvm.x86.sse41.packusdw";
-               }
-            }
--         } else if (util_cpu_caps.has_altivec) {
-+         } else if (util_get_cpu_caps()->has_altivec) {
-             if (dst_type.sign) {
-                intrinsic = "llvm.ppc.altivec.vpkswss";
-             } else {
-@@ -613,18 +613,18 @@ lp_build_pack2(struct gallivm_state *gallivm,
-          break;
-       case 16:
-          if (dst_type.sign) {
--            if (util_cpu_caps.has_sse2) {
-+            if (util_get_cpu_caps()->has_sse2) {
-                intrinsic = "llvm.x86.sse2.packsswb.128";
--            } else if (util_cpu_caps.has_altivec) {
-+            } else if (util_get_cpu_caps()->has_altivec) {
-                intrinsic = "llvm.ppc.altivec.vpkshss";
- #if UTIL_ARCH_LITTLE_ENDIAN
-                swap_intrinsic_operands = TRUE;
- #endif
-             }
-          } else {
--            if (util_cpu_caps.has_sse2) {
-+            if (util_get_cpu_caps()->has_sse2) {
-                intrinsic = "llvm.x86.sse2.packuswb.128";
--            } else if (util_cpu_caps.has_altivec) {
-+            } else if (util_get_cpu_caps()->has_altivec) {
-                intrinsic = "llvm.ppc.altivec.vpkshus";
- #if UTIL_ARCH_LITTLE_ENDIAN
-                swap_intrinsic_operands = TRUE;
-@@ -740,7 +740,7 @@ lp_build_pack2_native(struct gallivm_state *gallivm,
- 
-    /* At this point only have special case for avx2 */
-    if (src_type.length * src_type.width == 256 &&
--       util_cpu_caps.has_avx2) {
-+       util_get_cpu_caps()->has_avx2) {
-       switch(src_type.width) {
-       case 32:
-          if (dst_type.sign) {
-@@ -793,7 +793,7 @@ lp_build_packs2(struct gallivm_state *gallivm,
- 
-    /* All X86 SSE non-interleaved pack instructions take signed inputs and
-     * saturate them, so no need to clamp for those cases. */
--   if(util_cpu_caps.has_sse2 &&
-+   if(util_get_cpu_caps()->has_sse2 &&
-       src_type.width * src_type.length >= 128 &&
-       src_type.sign &&
-       (src_type.width == 32 || src_type.width == 16))
-diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
-index 686abc08620..98dcde912b5 100644
---- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
-+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
-@@ -1152,7 +1152,7 @@ lp_build_minify(struct lp_build_context *bld,
-       LLVMValueRef size;
-       assert(bld->type.sign);
-       if (lod_scalar ||
--         (util_cpu_caps.has_avx2 || !util_cpu_caps.has_sse)) {
-+         (util_get_cpu_caps()->has_avx2 || !util_get_cpu_caps()->has_sse)) {
-          size = LLVMBuildLShr(builder, base_size, level, "minify");
-          size = lp_build_max(bld, size, bld->one);
-       }
-diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
-index 2b91edd37c7..6e47640e70d 100644
---- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
-+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
-@@ -3234,7 +3234,7 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
-        * as it appears to be a loss with just AVX)
-        */
-       if (num_quads == 1 || !use_aos ||
--          (util_cpu_caps.has_avx2 &&
-+          (util_get_cpu_caps()->has_avx2 &&
-            (bld.num_lods == 1 ||
-             derived_sampler_state.min_img_filter == derived_sampler_state.mag_img_filter))) {
-          if (use_aos) {
-diff --git a/src/gallium/auxiliary/rtasm/rtasm_cpu.c b/src/gallium/auxiliary/rtasm/rtasm_cpu.c
-index b1c8b990ef1..03b11f914b4 100644
---- a/src/gallium/auxiliary/rtasm/rtasm_cpu.c
-+++ b/src/gallium/auxiliary/rtasm/rtasm_cpu.c
-@@ -35,10 +35,10 @@
- 
- DEBUG_GET_ONCE_BOOL_OPTION(nosse, "GALLIUM_NOSSE", false);
- 
--static struct util_cpu_caps *get_cpu_caps(void)
-+static const struct util_cpu_caps_t *get_cpu_caps(void)
- {
-    util_cpu_detect();
--   return &util_cpu_caps;
-+   return util_get_cpu_caps();
- }
- 
- int rtasm_cpu_has_sse(void)
-diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
-index ad687f32853..ddd65fb6a08 100644
---- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
-+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
-@@ -2152,17 +2152,17 @@ static void x86_init_func_common( struct x86_function *p )
- {
-    util_cpu_detect();
-    p->caps = 0;
--   if(util_cpu_caps.has_mmx)
-+   if(util_get_cpu_caps()->has_mmx)
-       p->caps |= X86_MMX;
--   if(util_cpu_caps.has_mmx2)
-+   if(util_get_cpu_caps()->has_mmx2)
-       p->caps |= X86_MMX2;
--   if(util_cpu_caps.has_sse)
-+   if(util_get_cpu_caps()->has_sse)
-       p->caps |= X86_SSE;
--   if(util_cpu_caps.has_sse2)
-+   if(util_get_cpu_caps()->has_sse2)
-       p->caps |= X86_SSE2;
--   if(util_cpu_caps.has_sse3)
-+   if(util_get_cpu_caps()->has_sse3)
-       p->caps |= X86_SSE3;
--   if(util_cpu_caps.has_sse4_1)
-+   if(util_get_cpu_caps()->has_sse4_1)
-       p->caps |= X86_SSE4_1;
-    p->csr = p->store;
- #if defined(PIPE_ARCH_X86)
-diff --git a/src/gallium/auxiliary/util/u_threaded_context.c b/src/gallium/auxiliary/util/u_threaded_context.c
-index 1eaff77724e..bf56993db09 100644
---- a/src/gallium/auxiliary/util/u_threaded_context.c
-+++ b/src/gallium/auxiliary/util/u_threaded_context.c
-@@ -2071,8 +2071,8 @@ tc_set_context_param(struct pipe_context *_pipe,
-    if (param == PIPE_CONTEXT_PARAM_PIN_THREADS_TO_L3_CACHE) {
-       /* Pin the gallium thread as requested. */
-       util_set_thread_affinity(tc->queue.threads[0],
--                               util_cpu_caps.L3_affinity_mask[value],
--                               NULL, UTIL_MAX_CPUS);
-+                               util_get_cpu_caps()->L3_affinity_mask[value],
-+                               NULL, util_get_cpu_caps()->num_cpu_mask_bits);
- 
-       /* Execute this immediately (without enqueuing).
-        * It's required to be thread-safe.
-@@ -2720,7 +2720,7 @@ threaded_context_create(struct pipe_context *pipe,
- 
-    util_cpu_detect();
- 
--   if (!debug_get_bool_option("GALLIUM_THREAD", util_cpu_caps.nr_cpus > 1))
-+   if (!debug_get_bool_option("GALLIUM_THREAD", util_get_cpu_caps()->nr_cpus > 1))
-       return pipe;
- 
-    tc = os_malloc_aligned(sizeof(struct threaded_context), 16);
-diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
-index 64cf72ae101..913c1bd2462 100644
---- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c
-+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
-@@ -435,7 +435,7 @@ lp_build_occlusion_count(struct gallivm_state *gallivm,
-    assert(type.length <= 16);
-    assert(type.floating);
- 
--   if(util_cpu_caps.has_sse && type.length == 4) {
-+   if(util_get_cpu_caps()->has_sse && type.length == 4) {
-       const char *movmskintr = "llvm.x86.sse.movmsk.ps";
-       const char *popcntintr = "llvm.ctpop.i32";
-       LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue,
-@@ -446,7 +446,7 @@ lp_build_occlusion_count(struct gallivm_state *gallivm,
-                                        LLVMInt32TypeInContext(context), bits);
-       count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), "");
-    }
--   else if(util_cpu_caps.has_avx && type.length == 8) {
-+   else if(util_get_cpu_caps()->has_avx && type.length == 8) {
-       const char *movmskintr = "llvm.x86.avx.movmsk.ps.256";
-       const char *popcntintr = "llvm.ctpop.i32";
-       LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue,
-diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
-index f133bbf8a4d..628a4338c1e 100644
---- a/src/gallium/drivers/llvmpipe/lp_screen.c
-+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
-@@ -915,7 +915,7 @@ llvmpipe_create_screen(struct sw_winsys *winsys)
- 
-    screen->allow_cl = !!getenv("LP_CL");
-    screen->use_tgsi = (LP_DEBUG & DEBUG_TGSI_IR);
--   screen->num_threads = util_cpu_caps.nr_cpus > 1 ? util_cpu_caps.nr_cpus : 0;
-+   screen->num_threads = util_get_cpu_caps()->nr_cpus > 1 ? util_get_cpu_caps()->nr_cpus : 0;
- #ifdef EMBEDDED_DEVICE
-    screen->num_threads = 0;
- #endif
-diff --git a/src/gallium/drivers/llvmpipe/lp_test_arit.c b/src/gallium/drivers/llvmpipe/lp_test_arit.c
-index 873dcf37fac..725854cc25c 100644
---- a/src/gallium/drivers/llvmpipe/lp_test_arit.c
-+++ b/src/gallium/drivers/llvmpipe/lp_test_arit.c
-@@ -382,7 +382,7 @@ flush_denorm_to_zero(float val)
-    fi_val.f = val;
- 
- #if defined(PIPE_ARCH_SSE)
--   if (util_cpu_caps.has_sse) {
-+   if (util_get_cpu_caps()->has_sse) {
-       if ((fi_val.ui & 0x7f800000) == 0) {
-          fi_val.ui &= 0xff800000;
-       }
-@@ -458,7 +458,7 @@ test_unary(unsigned verbose, FILE *fp, const struct unary_test_t *test, unsigned
-             continue;
-          }
- 
--         if (!util_cpu_caps.has_neon &&
-+         if (!util_get_cpu_caps()->has_neon &&
-              test->ref == &nearbyintf && length == 2 &&
-              ref != roundf(testval)) {
-             /* FIXME: The generic (non SSE) path in lp_build_iround, which is
-diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c b/src/gallium/drivers/llvmpipe/lp_texture.c
-index 2bf223d66f9..815736166d5 100644
---- a/src/gallium/drivers/llvmpipe/lp_texture.c
-+++ b/src/gallium/drivers/llvmpipe/lp_texture.c
-@@ -85,7 +85,7 @@ llvmpipe_texture_layout(struct llvmpipe_screen *screen,
-     * of a block for all formats) though this should not be strictly necessary
-     * neither. In any case it can only affect compressed or 1d textures.
-     */
--   unsigned mip_align = MAX2(64, util_cpu_caps.cacheline);
-+   unsigned mip_align = MAX2(64, util_get_cpu_caps()->cacheline);
- 
-    assert(LP_MAX_TEXTURE_2D_LEVELS <= LP_MAX_TEXTURE_LEVELS);
-    assert(LP_MAX_TEXTURE_3D_LEVELS <= LP_MAX_TEXTURE_LEVELS);
-@@ -123,7 +123,7 @@ llvmpipe_texture_layout(struct llvmpipe_screen *screen,
-       if (util_format_is_compressed(pt->format))
-          lpr->row_stride[level] = nblocksx * block_size;
-       else
--         lpr->row_stride[level] = align(nblocksx * block_size, util_cpu_caps.cacheline);
-+         lpr->row_stride[level] = align(nblocksx * block_size, util_get_cpu_caps()->cacheline);
- 
-       /* if row_stride * height > LP_MAX_TEXTURE_SIZE */
-       if ((uint64_t)lpr->row_stride[level] * nblocksy > LP_MAX_TEXTURE_SIZE) {
-diff --git a/src/gallium/drivers/swr/swr_loader.cpp b/src/gallium/drivers/swr/swr_loader.cpp
-index 97db7ca3e8b..d891b6b14e8 100644
---- a/src/gallium/drivers/swr/swr_loader.cpp
-+++ b/src/gallium/drivers/swr/swr_loader.cpp
-@@ -91,7 +91,7 @@ swr_create_screen(struct sw_winsys *winsys)
- 
-    util_cpu_detect();
- 
--   if (util_cpu_caps.has_avx512f && util_cpu_caps.has_avx512er) {
-+   if (util_get_cpu_caps()->has_avx512f && util_get_cpu_caps()->has_avx512er) {
-       swr_print_info("SWR detected KNL instruction support ");
- #ifndef HAVE_SWR_KNL
-       swr_print_info("(skipping: not built).\n");
-@@ -103,7 +103,7 @@ swr_create_screen(struct sw_winsys *winsys)
- #endif
-    }
- 
--   if (util_cpu_caps.has_avx512f && util_cpu_caps.has_avx512bw) {
-+   if (util_get_cpu_caps()->has_avx512f && util_get_cpu_caps()->has_avx512bw) {
-       swr_print_info("SWR detected SKX instruction support ");
- #ifndef HAVE_SWR_SKX
-       swr_print_info("(skipping not built).\n");
-@@ -113,7 +113,7 @@ swr_create_screen(struct sw_winsys *winsys)
- #endif
-    }
- 
--   if (util_cpu_caps.has_avx2) {
-+   if (util_get_cpu_caps()->has_avx2) {
-       swr_print_info("SWR detected AVX2 instruction support ");
- #ifndef HAVE_SWR_AVX2
-       swr_print_info("(skipping not built).\n");
-@@ -123,7 +123,7 @@ swr_create_screen(struct sw_winsys *winsys)
- #endif
-    }
- 
--   if (util_cpu_caps.has_avx) {
-+   if (util_get_cpu_caps()->has_avx) {
-       swr_print_info("SWR detected AVX instruction support ");
- #ifndef HAVE_SWR_AVX
-       swr_print_info("(skipping not built).\n");
-diff --git a/src/gallium/drivers/vc4/vc4_tiling.h b/src/gallium/drivers/vc4/vc4_tiling.h
-index 66767e7f1f8..5afe32939a8 100644
---- a/src/gallium/drivers/vc4/vc4_tiling.h
-+++ b/src/gallium/drivers/vc4/vc4_tiling.h
-@@ -90,7 +90,7 @@ vc4_load_lt_image(void *dst, uint32_t dst_stride,
-                   int cpp, const struct pipe_box *box)
- {
- #ifdef USE_ARM_ASM
--        if (util_cpu_caps.has_neon) {
-+        if (util_get_cpu_caps()->has_neon) {
-                 vc4_load_lt_image_neon(dst, dst_stride, src, src_stride,
-                                        cpp, box);
-                 return;
-@@ -106,7 +106,7 @@ vc4_store_lt_image(void *dst, uint32_t dst_stride,
-                    int cpp, const struct pipe_box *box)
- {
- #ifdef USE_ARM_ASM
--        if (util_cpu_caps.has_neon) {
-+        if (util_get_cpu_caps()->has_neon) {
-                 vc4_store_lt_image_neon(dst, dst_stride, src, src_stride,
-                                         cpp, box);
-                 return;
-diff --git a/src/gallium/tests/unit/translate_test.c b/src/gallium/tests/unit/translate_test.c
-index 4d9c4e27ebf..782f16e7f78 100644
---- a/src/gallium/tests/unit/translate_test.c
-+++ b/src/gallium/tests/unit/translate_test.c
-@@ -50,6 +50,7 @@ int main(int argc, char** argv)
- {
-    struct translate *(*create_fn)(const struct translate_key *key) = 0;
- 
-+   extern struct util_cpu_caps_t util_cpu_caps;
-    struct translate_key key;
-    unsigned output_format;
-    unsigned input_format;
-@@ -87,7 +88,7 @@ int main(int argc, char** argv)
-    }
-    else if (!strcmp(argv[1], "sse"))
-    {
--      if(!util_cpu_caps.has_sse || !rtasm_cpu_has_sse())
-+      if(!util_get_cpu_caps()->has_sse || !rtasm_cpu_has_sse())
-       {
-          printf("Error: CPU doesn't support SSE (test with qemu)\n");
-          return 2;
-@@ -99,7 +100,7 @@ int main(int argc, char** argv)
-    }
-    else if (!strcmp(argv[1], "sse2"))
-    {
--      if(!util_cpu_caps.has_sse2 || !rtasm_cpu_has_sse())
-+      if(!util_get_cpu_caps()->has_sse2 || !rtasm_cpu_has_sse())
-       {
-          printf("Error: CPU doesn't support SSE2 (test with qemu)\n");
-          return 2;
-@@ -110,7 +111,7 @@ int main(int argc, char** argv)
-    }
-    else if (!strcmp(argv[1], "sse3"))
-    {
--      if(!util_cpu_caps.has_sse3 || !rtasm_cpu_has_sse())
-+      if(!util_get_cpu_caps()->has_sse3 || !rtasm_cpu_has_sse())
-       {
-          printf("Error: CPU doesn't support SSE3 (test with qemu)\n");
-          return 2;
-@@ -120,7 +121,7 @@ int main(int argc, char** argv)
-    }
-    else if (!strcmp(argv[1], "sse4.1"))
-    {
--      if(!util_cpu_caps.has_sse4_1 || !rtasm_cpu_has_sse())
-+      if(!util_get_cpu_caps()->has_sse4_1 || !rtasm_cpu_has_sse())
-       {
-          printf("Error: CPU doesn't support SSE4.1 (test with qemu)\n");
-          return 2;
-diff --git a/src/gallium/tests/unit/u_half_test.c b/src/gallium/tests/unit/u_half_test.c
-index 7f2eba9382b..4474cfb82b0 100644
---- a/src/gallium/tests/unit/u_half_test.c
-+++ b/src/gallium/tests/unit/u_half_test.c
-@@ -36,13 +36,14 @@ test(void)
- int
- main(int argc, char **argv)
- {
--   assert(!util_cpu_caps.has_f16c);
-+   util_cpu_detect();
-    test();
- 
--   /* Test f16c. */
--   util_cpu_detect();
--   if (util_cpu_caps.has_f16c)
-+   /* Test non-f16c. */
-+   if (util_get_cpu_caps()->has_f16c) {
-+      ((struct util_cpu_caps_t *)util_get_cpu_caps())->has_f16c = false;
-       test();
-+   }
- 
-    printf("Success!\n");
-    return 0;
-diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
-index 8a0aedfed64..a18362ce6ea 100644
---- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
-+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
-@@ -312,8 +312,8 @@ static void amdgpu_pin_threads_to_L3_cache(struct radeon_winsys *rws,
-    struct amdgpu_winsys *ws = amdgpu_winsys(rws);
- 
-    util_set_thread_affinity(ws->cs_queue.threads[0],
--                            util_cpu_caps.L3_affinity_mask[cache],
--                            NULL, UTIL_MAX_CPUS);
-+                            util_get_cpu_caps()->L3_affinity_mask[cache],
-+                            NULL, util_get_cpu_caps()->num_cpu_mask_bits);
- }
- 
- static uint32_t kms_handle_hash(const void *key)
-diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
-index f0e1b9f7df3..4430ce50466 100644
---- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
-+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
-@@ -801,8 +801,8 @@ static void radeon_pin_threads_to_L3_cache(struct radeon_winsys *ws,
- 
-    if (util_queue_is_initialized(&rws->cs_queue)) {
-       util_set_thread_affinity(rws->cs_queue.threads[0],
--                               util_cpu_caps.L3_affinity_mask[cache],
--                               NULL, UTIL_MAX_CPUS);
-+                               util_get_cpu_caps()->L3_affinity_mask[cache],
-+                               NULL, util_get_cpu_caps()->num_cpu_mask_bits);
-    }
- }
- 
-diff --git a/src/mesa/main/glthread.c b/src/mesa/main/glthread.c
-index eb8eb30cabc..c9dfef541fc 100644
---- a/src/mesa/main/glthread.c
-+++ b/src/mesa/main/glthread.c
-@@ -199,19 +199,20 @@ _mesa_glthread_flush_batch(struct gl_context *ctx)
-    /* Pin threads regularly to the same Zen CCX that the main thread is
-     * running on. The main thread can move between CCXs.
-     */
--   if (util_cpu_caps.nr_cpus != util_cpu_caps.cores_per_L3 &&
-+   if (util_get_cpu_caps()->nr_cpus != util_get_cpu_caps()->cores_per_L3 &&
-        /* driver support */
-        ctx->Driver.PinDriverToL3Cache &&
-        ++glthread->pin_thread_counter % 128 == 0) {
-       int cpu = util_get_current_cpu();
- 
-       if (cpu >= 0) {
--         unsigned L3_cache = util_cpu_caps.cpu_to_L3[cpu];
--
--         util_set_thread_affinity(glthread->queue.threads[0],
--                                  util_cpu_caps.L3_affinity_mask[L3_cache],
--                                  NULL, UTIL_MAX_CPUS);
--         ctx->Driver.PinDriverToL3Cache(ctx, L3_cache);
-+         uint16_t L3_cache = util_get_cpu_caps()->cpu_to_L3[cpu];
-+         if (L3_cache != U_CPU_INVALID_L3) {
-+            util_set_thread_affinity(glthread->queue.threads[0],
-+                                     util_get_cpu_caps()->L3_affinity_mask[L3_cache],
-+                                     NULL, util_get_cpu_caps()->num_cpu_mask_bits);
-+            ctx->Driver.PinDriverToL3Cache(ctx, L3_cache);
-+         }
-       }
-    }
- 
-diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
-index 40364296664..f27fa7ff29c 100644
---- a/src/mesa/state_tracker/st_context.c
-+++ b/src/mesa/state_tracker/st_context.c
-@@ -815,6 +815,10 @@ st_create_context_priv(struct gl_context *ctx, struct pipe_context *pipe,
-          !st->lower_ucp;
-    st->shader_has_one_variant[MESA_SHADER_COMPUTE] = st->has_shareable_shaders;
- 
-+   if (util_get_cpu_caps()->cores_per_L3 == util_get_cpu_caps()->nr_cpus ||
-+       !st->pipe->set_context_param)
-+      st->pin_thread_counter = ST_L3_PINNING_DISABLED;
-+
-    st->bitmap.cache.empty = true;
- 
-    if (ctx->Const.ForceGLNamesReuse && ctx->Shared->RefCount == 1) {
-diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h
-index b1fda06ff3e..9ab6969de62 100644
---- a/src/mesa/state_tracker/st_context.h
-+++ b/src/mesa/state_tracker/st_context.h
-@@ -55,6 +55,7 @@ struct st_program;
- struct st_perf_monitor_group;
- struct u_upload_mgr;
- 
-+#define ST_L3_PINNING_DISABLED 0xffffffff
- 
- struct st_bitmap_cache
- {
-@@ -130,6 +131,9 @@ struct st_context
-    struct draw_stage *feedback_stage;  /**< For GL_FEEDBACK rendermode */
-    struct draw_stage *selection_stage;  /**< For GL_SELECT rendermode */
-    struct draw_stage *rastpos_stage;  /**< For glRasterPos */
-+
-+   unsigned pin_thread_counter; /* for L3 thread pinning on AMD Zen */
-+
-    GLboolean clamp_frag_color_in_shader;
-    GLboolean clamp_vert_color_in_shader;
-    boolean clamp_frag_depth_in_shader;
-@@ -235,8 +239,6 @@ struct st_context
-    /** This masks out unused shader resources. Only valid in draw calls. */
-    uint64_t active_states;
- 
--   unsigned pin_thread_counter; /* for L3 thread pinning on AMD Zen */
--
-    /* If true, further analysis of states is required to know if something
-     * has changed. Used mainly for shaders.
-     */
-diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c
-index 996d985510c..159d7017b07 100644
---- a/src/mesa/state_tracker/st_draw.c
-+++ b/src/mesa/state_tracker/st_draw.c
-@@ -124,26 +124,26 @@ prepare_draw(struct st_context *st, struct gl_context *ctx)
-       st_validate_state(st, ST_PIPELINE_RENDER);
-    }
- 
--   struct pipe_context *pipe = st->pipe;
--
-    /* Pin threads regularly to the same Zen CCX that the main thread is
-     * running on. The main thread can move between CCXs.
-     */
--   if (unlikely(/* AMD Zen */
--                util_cpu_caps.nr_cpus != util_cpu_caps.cores_per_L3 &&
-+   if (unlikely(st->pin_thread_counter != ST_L3_PINNING_DISABLED &&
-                 /* no glthread */
-                 ctx->CurrentClientDispatch != ctx->MarshalExec &&
--                /* driver support */
--                pipe->set_context_param &&
-                 /* do it occasionally */
-                 ++st->pin_thread_counter % 512 == 0)) {
-+      st->pin_thread_counter = 0;
-+
-       int cpu = util_get_current_cpu();
-       if (cpu >= 0) {
--         unsigned L3_cache = util_cpu_caps.cpu_to_L3[cpu];
--
--         pipe->set_context_param(pipe,
--                                 PIPE_CONTEXT_PARAM_PIN_THREADS_TO_L3_CACHE,
--                                 L3_cache);
-+         struct pipe_context *pipe = st->pipe;
-+         uint16_t L3_cache = util_get_cpu_caps()->cpu_to_L3[cpu];
-+
-+         if (L3_cache != U_CPU_INVALID_L3) {
-+            pipe->set_context_param(pipe,
-+                                    PIPE_CONTEXT_PARAM_PIN_THREADS_TO_L3_CACHE,
-+                                    L3_cache);
-+         }
-       }
-    }
- }
-diff --git a/src/util/half_float.h b/src/util/half_float.h
-index c52bccf8d1e..8f1a1dbf11d 100644
---- a/src/util/half_float.h
-+++ b/src/util/half_float.h
-@@ -59,7 +59,7 @@ static inline uint16_t
- _mesa_float_to_half(float val)
- {
- #if defined(USE_X86_64_ASM)
--   if (util_cpu_caps.has_f16c) {
-+   if (util_get_cpu_caps()->has_f16c) {
-       __m128 in = {val};
-       __m128i out;
- 
-@@ -75,7 +75,7 @@ static inline float
- _mesa_half_to_float(uint16_t val)
- {
- #if defined(USE_X86_64_ASM)
--   if (util_cpu_caps.has_f16c) {
-+   if (util_get_cpu_caps()->has_f16c) {
-       __m128i in = {val};
-       __m128 out;
- 
-@@ -90,7 +90,7 @@ static inline uint16_t
- _mesa_float_to_float16_rtz(float val)
- {
- #if defined(USE_X86_64_ASM)
--   if (util_cpu_caps.has_f16c) {
-+   if (util_get_cpu_caps()->has_f16c) {
-       __m128 in = {val};
-       __m128i out;
- 
-diff --git a/src/util/tests/format/u_format_test.c b/src/util/tests/format/u_format_test.c
-index f4a62a5c6a8..e6473c2bf6d 100644
---- a/src/util/tests/format/u_format_test.c
-+++ b/src/util/tests/format/u_format_test.c
-@@ -850,6 +850,8 @@ int main(int argc, char **argv)
- {
-    boolean success;
- 
-+   util_cpu_detect();
-+
-    success = test_all();
- 
-    return success ? 0 : 1;
-diff --git a/src/util/u_cpu_detect.c b/src/util/u_cpu_detect.c
-index 025f2f30156..4a4b06e1bc6 100644
---- a/src/util/u_cpu_detect.c
-+++ b/src/util/u_cpu_detect.c
-@@ -90,7 +90,7 @@
- DEBUG_GET_ONCE_BOOL_OPTION(dump_cpu, "GALLIUM_DUMP_CPU", false)
- 
- 
--struct util_cpu_caps util_cpu_caps;
-+struct util_cpu_caps_t util_cpu_caps;
- 
- #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
- static int has_cpuid(void);
-@@ -438,26 +438,22 @@ get_cpu_topology(void)
-    util_cpu_caps.cores_per_L3 = util_cpu_caps.nr_cpus;
-    util_cpu_caps.num_L3_caches = 1;
- 
-+   memset(util_cpu_caps.cpu_to_L3, 0xff, sizeof(util_cpu_caps.cpu_to_L3));
-+
- #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-    /* AMD Zen */
-    if (util_cpu_caps.family >= CPU_AMD_ZEN1_ZEN2 &&
-        util_cpu_caps.family < CPU_AMD_LAST) {
-       uint32_t regs[4];
- 
--      /* Query the L3 cache count. */
--      cpuid_count(0x8000001D, 3, regs);
--      unsigned cache_level = (regs[0] >> 5) & 0x7;
--      unsigned cores_per_L3 = ((regs[0] >> 14) & 0xfff) + 1;
--
--      if (cache_level != 3 || cores_per_L3 == util_cpu_caps.nr_cpus)
--         return;
--
-       uint32_t saved_mask[UTIL_MAX_CPUS / 32] = {0};
-       uint32_t mask[UTIL_MAX_CPUS / 32] = {0};
--      uint32_t allowed_mask[UTIL_MAX_CPUS / 32] = {0};
--      uint32_t apic_id[UTIL_MAX_CPUS];
-       bool saved = false;
- 
-+      uint32_t L3_found[UTIL_MAX_CPUS] = {0};
-+      uint32_t num_L3_caches = 0;
-+      util_affinity_mask *L3_affinity_masks = NULL;
-+
-       /* Query APIC IDs from each CPU core.
-        *
-        * An APIC ID is a logical ID of the CPU with respect to the cache
-@@ -482,41 +478,60 @@ get_cpu_topology(void)
- 
-          if (util_set_current_thread_affinity(mask,
-                                               !saved ? saved_mask : NULL,
--                                              UTIL_MAX_CPUS)) {
-+                                              util_cpu_caps.num_cpu_mask_bits)) {
-             saved = true;
--            allowed_mask[i / 32] |= cpu_bit;
- 
-             /* Query the APIC ID of the current core. */
-             cpuid(0x00000001, regs);
--            apic_id[i] = regs[1] >> 24;
-+            unsigned apic_id = regs[1] >> 24;
-+
-+            /* Query the total core count for the CPU */
-+            uint32_t core_count = 1;
-+            if (regs[3] & (1 << 28))
-+               core_count = (regs[1] >> 16) & 0xff;
-+
-+            core_count = util_next_power_of_two(core_count);
-+
-+            /* Query the L3 cache count. */
-+            cpuid_count(0x8000001D, 3, regs);
-+            unsigned cache_level = (regs[0] >> 5) & 0x7;
-+            unsigned cores_per_L3 = ((regs[0] >> 14) & 0xfff) + 1;
-+
-+            if (cache_level != 3)
-+               continue;
-+
-+            unsigned local_core_id = apic_id & (core_count - 1);
-+            unsigned phys_id = (apic_id & ~(core_count - 1)) >> util_logbase2(core_count);
-+            unsigned local_l3_cache_index = local_core_id / util_next_power_of_two(cores_per_L3);
-+#define L3_ID(p, i) (p << 16 | i << 1 | 1);
-+
-+            unsigned l3_id = L3_ID(phys_id, local_l3_cache_index);
-+            int idx = -1;
-+            for (unsigned c = 0; c < num_L3_caches; c++) {
-+               if (L3_found[c] == l3_id) {
-+                  idx = c;
-+                  break;
-+               }
-+            }
-+            if (idx == -1) {
-+               idx = num_L3_caches;
-+               L3_found[num_L3_caches++] = l3_id;
-+               L3_affinity_masks = realloc(L3_affinity_masks, sizeof(util_affinity_mask) * num_L3_caches);
-+               if (!L3_affinity_masks)
-+                  return;
-+               memset(&L3_affinity_masks[num_L3_caches - 1], 0, sizeof(util_affinity_mask));
-+            }
-+            util_cpu_caps.cpu_to_L3[i] = idx;
-+            L3_affinity_masks[idx][i / 32] |= cpu_bit;
-+
-          }
-          mask[i / 32] = 0;
-       }
- 
--      if (saved) {
--
--         /* We succeeded in using at least one CPU. */
--         util_cpu_caps.num_L3_caches = util_cpu_caps.nr_cpus / cores_per_L3;
--         util_cpu_caps.cores_per_L3 = cores_per_L3;
--         util_cpu_caps.L3_affinity_mask = calloc(sizeof(util_affinity_mask),
--                                                 util_cpu_caps.num_L3_caches);
--
--         for (unsigned i = 0; i < util_cpu_caps.nr_cpus && i < UTIL_MAX_CPUS;
--              i++) {
--            uint32_t cpu_bit = 1u << (i % 32);
--
--            if (allowed_mask[i / 32] & cpu_bit) {
--               /* Each APIC ID bit represents a topology level, so we need
--                * to round up to the next power of two.
--                */
--               unsigned L3_index = apic_id[i] /
--                                   util_next_power_of_two(cores_per_L3);
--
--               util_cpu_caps.L3_affinity_mask[L3_index][i / 32] |= cpu_bit;
--               util_cpu_caps.cpu_to_L3[i] = L3_index;
--            }
--         }
-+      util_cpu_caps.num_L3_caches = num_L3_caches;
-+      util_cpu_caps.L3_affinity_mask = L3_affinity_masks;
- 
-+      if (saved) {
-          if (debug_get_option_dump_cpu()) {
-             fprintf(stderr, "CPU <-> L3 cache mapping:\n");
-             for (unsigned i = 0; i < util_cpu_caps.num_L3_caches; i++) {
-@@ -528,7 +543,8 @@ get_cpu_topology(void)
-          }
- 
-          /* Restore the original affinity mask. */
--         util_set_current_thread_affinity(saved_mask, NULL, UTIL_MAX_CPUS);
-+         util_set_current_thread_affinity(saved_mask, NULL,
-+                                          util_cpu_caps.num_cpu_mask_bits);
-       } else {
-          if (debug_get_option_dump_cpu())
-             fprintf(stderr, "Cannot set thread affinity for any thread.\n");
-@@ -547,7 +563,7 @@ util_cpu_detect_once(void)
-    {
-       SYSTEM_INFO system_info;
-       GetSystemInfo(&system_info);
--      util_cpu_caps.nr_cpus = system_info.dwNumberOfProcessors;
-+      util_cpu_caps.nr_cpus = MAX2(1, system_info.dwNumberOfProcessors);
-    }
- #elif defined(PIPE_OS_UNIX) && defined(_SC_NPROCESSORS_ONLN)
-    util_cpu_caps.nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
-@@ -569,6 +585,8 @@ util_cpu_detect_once(void)
-    util_cpu_caps.nr_cpus = 1;
- #endif
- 
-+   util_cpu_caps.num_cpu_mask_bits = align(util_cpu_caps.nr_cpus, 32);
-+
-    /* Make the fallback cacheline size nonzero so that it can be
-     * safely passed to align().
-     */
-diff --git a/src/util/u_cpu_detect.h b/src/util/u_cpu_detect.h
-index a76fd912910..1c7239b2ec7 100644
---- a/src/util/u_cpu_detect.h
-+++ b/src/util/u_cpu_detect.h
-@@ -55,7 +55,7 @@ enum cpu_family {
- 
- typedef uint32_t util_affinity_mask[UTIL_MAX_CPUS / 32];
- 
--struct util_cpu_caps {
-+struct util_cpu_caps_t {
-    int nr_cpus;
-    enum cpu_family family;
- 
-@@ -98,14 +98,27 @@ struct util_cpu_caps {
- 
-    unsigned num_L3_caches;
-    unsigned cores_per_L3;
-+   unsigned num_cpu_mask_bits;
- 
-    uint16_t cpu_to_L3[UTIL_MAX_CPUS];
-    /* Affinity masks for each L3 cache. */
-    util_affinity_mask *L3_affinity_mask;
- };
- 
--extern struct util_cpu_caps
--util_cpu_caps;
-+#define U_CPU_INVALID_L3 0xffff
-+
-+static inline const struct util_cpu_caps_t *
-+util_get_cpu_caps(void)
-+{
-+	extern struct util_cpu_caps_t util_cpu_caps;
-+
-+	/* If you hit this assert, it means that something is using the
-+	 * cpu-caps without having first called util_cpu_detect()
-+	 */
-+	assert(util_cpu_caps.nr_cpus >= 1);
-+
-+	return &util_cpu_caps;
-+}
- 
- void util_cpu_detect(void);
- 
-diff --git a/src/util/u_math.c b/src/util/u_math.c
-index 9a8a9ecbbde..41e7f599eb0 100644
---- a/src/util/u_math.c
-+++ b/src/util/u_math.c
-@@ -92,7 +92,7 @@ util_fpstate_get(void)
-    unsigned mxcsr = 0;
- 
- #if defined(PIPE_ARCH_SSE)
--   if (util_cpu_caps.has_sse) {
-+   if (util_get_cpu_caps()->has_sse) {
-       mxcsr = _mm_getcsr();
-    }
- #endif
-@@ -110,10 +110,10 @@ unsigned
- util_fpstate_set_denorms_to_zero(unsigned current_mxcsr)
- {
- #if defined(PIPE_ARCH_SSE)
--   if (util_cpu_caps.has_sse) {
-+   if (util_get_cpu_caps()->has_sse) {
-       /* Enable flush to zero mode */
-       current_mxcsr |= _MM_FLUSH_ZERO_MASK;
--      if (util_cpu_caps.has_daz) {
-+      if (util_get_cpu_caps()->has_daz) {
-          /* Enable denormals are zero mode */
-          current_mxcsr |= _MM_DENORMALS_ZERO_MASK;
-       }
-@@ -132,7 +132,7 @@ void
- util_fpstate_set(unsigned mxcsr)
- {
- #if defined(PIPE_ARCH_SSE)
--   if (util_cpu_caps.has_sse) {
-+   if (util_get_cpu_caps()->has_sse) {
-       _mm_setcsr(mxcsr);
-    }
- #endif
-diff --git a/src/util/u_queue.c b/src/util/u_queue.c
-index b11b297a45c..8f21f0667c6 100644
---- a/src/util/u_queue.c
-+++ b/src/util/u_queue.c
-@@ -27,7 +27,7 @@
- #include "u_queue.h"
- 
- #include "c11/threads.h"
--
-+#include "util/u_cpu_detect.h"
- #include "util/os_time.h"
- #include "util/u_string.h"
- #include "util/u_thread.h"
-@@ -258,7 +258,8 @@ util_queue_thread_func(void *input)
-       uint32_t mask[UTIL_MAX_CPUS / 32];
- 
-       memset(mask, 0xff, sizeof(mask));
--      util_set_current_thread_affinity(mask, NULL, UTIL_MAX_CPUS);
-+      util_set_current_thread_affinity(mask, NULL,
-+                                       util_get_cpu_caps()->num_cpu_mask_bits);
-    }
- 
- #if defined(__linux__)
diff --git a/SOURCES/lavapipe-disable-env-var.patch b/SOURCES/lavapipe-disable-env-var.patch
index 9b59577..ba50bee 100644
--- a/SOURCES/lavapipe-disable-env-var.patch
+++ b/SOURCES/lavapipe-disable-env-var.patch
@@ -1,13 +1,13 @@
-diff -up mesa-20.3.0-rc1/src/gallium/frontends/lavapipe/lvp_device.c.dma mesa-20.3.0-rc1/src/gallium/frontends/lavapipe/lvp_device.c
---- mesa-20.3.0-rc1/src/gallium/frontends/lavapipe/lvp_device.c.dma	2020-11-19 15:11:42.483134826 +1000
-+++ mesa-20.3.0-rc1/src/gallium/frontends/lavapipe/lvp_device.c	2020-11-19 15:13:08.556425782 +1000
-@@ -118,6 +118,9 @@ VkResult lvp_CreateInstance(
-       client_version = VK_API_VERSION_1_0;
-    }
+diff -up mesa-21.1.1/src/gallium/frontends/lavapipe/lvp_device.c.dma mesa-21.1.1/src/gallium/frontends/lavapipe/lvp_device.c
+--- mesa-21.1.1/src/gallium/frontends/lavapipe/lvp_device.c.dma	2021-05-20 13:08:02.207217380 +1000
++++ mesa-21.1.1/src/gallium/frontends/lavapipe/lvp_device.c	2021-05-20 13:08:35.868127094 +1000
+@@ -224,6 +224,9 @@ VKAPI_ATTR VkResult VKAPI_CALL lvp_Creat
+    if (pAllocator == NULL)
+       pAllocator = &default_alloc;
  
 +   if (!getenv("RH_SW_VULKAN"))
 +      return VK_ERROR_INITIALIZATION_FAILED;
 +
-    instance = vk_zalloc2(&default_alloc, pAllocator, sizeof(*instance), 8,
-                          VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
+    instance = vk_zalloc(pAllocator, sizeof(*instance), 8,
+                         VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
     if (!instance)
diff --git a/SOURCES/mesa-20.3.3-stable-fixes.patch b/SOURCES/mesa-20.3.3-stable-fixes.patch
deleted file mode 100644
index 231e20b..0000000
--- a/SOURCES/mesa-20.3.3-stable-fixes.patch
+++ /dev/null
@@ -1,930 +0,0 @@
-diff --git a/src/amd/vulkan/radv_query.c b/src/amd/vulkan/radv_query.c
-index d49bc0f0564..90512d4f276 100644
---- a/src/amd/vulkan/radv_query.c
-+++ b/src/amd/vulkan/radv_query.c
-@@ -1679,13 +1679,14 @@ static void emit_begin_query(struct radv_cmd_buffer *cmd_buffer,
- 
- 			va += 8 * idx;
- 
--			si_cs_emit_write_event_eop(cs,
--						   cmd_buffer->device->physical_device->rad_info.chip_class,
--						   radv_cmd_buffer_uses_mec(cmd_buffer),
--						   V_028A90_PS_DONE, 0,
--						   EOP_DST_SEL_TC_L2,
--						   EOP_DATA_SEL_GDS,
--						   va, EOP_DATA_GDS(0, 1), 0);
-+			radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
-+			radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_GDS) |
-+					COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
-+					COPY_DATA_WR_CONFIRM);
-+			radeon_emit(cs, 0);
-+			radeon_emit(cs, 0);
-+			radeon_emit(cs, va);
-+			radeon_emit(cs, va >> 32);
- 
- 			/* Record that the command buffer needs GDS. */
- 			cmd_buffer->gds_needed = true;
-@@ -1769,13 +1770,14 @@ static void emit_end_query(struct radv_cmd_buffer *cmd_buffer,
- 
- 			va += 8 * idx;
- 
--			si_cs_emit_write_event_eop(cs,
--						   cmd_buffer->device->physical_device->rad_info.chip_class,
--						   radv_cmd_buffer_uses_mec(cmd_buffer),
--						   V_028A90_PS_DONE, 0,
--						   EOP_DST_SEL_TC_L2,
--						   EOP_DATA_SEL_GDS,
--						   va, EOP_DATA_GDS(0, 1), 0);
-+			radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
-+			radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_GDS) |
-+					COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
-+					COPY_DATA_WR_CONFIRM);
-+			radeon_emit(cs, 0);
-+			radeon_emit(cs, 0);
-+			radeon_emit(cs, va);
-+			radeon_emit(cs, va >> 32);
- 
- 			cmd_buffer->state.active_pipeline_gds_queries--;
- 		}
-diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
-index 9d9491d4361..2eb3ba4e64e 100644
---- a/src/amd/vulkan/radv_shader.h
-+++ b/src/amd/vulkan/radv_shader.h
-@@ -573,9 +573,11 @@ get_tcs_num_patches(unsigned tcs_num_input_vertices,
- 	if (chip_class >= GFX7 && family != CHIP_STONEY)
- 		hardware_lds_size = 65536;
- 
--	num_patches = MIN2(num_patches, hardware_lds_size / (input_patch_size + output_patch_size));
-+	if (input_patch_size + output_patch_size)
-+		num_patches = MIN2(num_patches, hardware_lds_size / (input_patch_size + output_patch_size));
- 	/* Make sure the output data fits in the offchip buffer */
--	num_patches = MIN2(num_patches, (tess_offchip_block_dw_size * 4) / output_patch_size);
-+	if (output_patch_size)
-+		num_patches = MIN2(num_patches, (tess_offchip_block_dw_size * 4) / output_patch_size);
- 	/* Not necessary for correctness, but improves performance. The
- 	 * specific value is taken from the proprietary driver.
- 	 */
-diff --git a/src/gallium/auxiliary/cso_cache/cso_context.c b/src/gallium/auxiliary/cso_cache/cso_context.c
-index 1eef6aac70c..a6a663d97a6 100644
---- a/src/gallium/auxiliary/cso_cache/cso_context.c
-+++ b/src/gallium/auxiliary/cso_cache/cso_context.c
-@@ -402,10 +402,13 @@ void cso_destroy_context( struct cso_context *ctx )
-                                                 PIPE_SHADER_CAP_MAX_SHADER_BUFFERS);
-             int maxcb = scr->get_shader_param(scr, sh,
-                                               PIPE_SHADER_CAP_MAX_CONST_BUFFERS);
-+            int maximg = scr->get_shader_param(scr, sh,
-+                                              PIPE_SHADER_CAP_MAX_SHADER_IMAGES);
-             assert(maxsam <= PIPE_MAX_SAMPLERS);
-             assert(maxview <= PIPE_MAX_SHADER_SAMPLER_VIEWS);
-             assert(maxssbo <= PIPE_MAX_SHADER_BUFFERS);
-             assert(maxcb <= PIPE_MAX_CONSTANT_BUFFERS);
-+            assert(maximg <= PIPE_MAX_SHADER_IMAGES);
-             if (maxsam > 0) {
-                ctx->pipe->bind_sampler_states(ctx->pipe, sh, 0, maxsam, zeros);
-             }
-@@ -415,6 +418,9 @@ void cso_destroy_context( struct cso_context *ctx )
-             if (maxssbo > 0) {
-                ctx->pipe->set_shader_buffers(ctx->pipe, sh, 0, maxssbo, ssbos, 0);
-             }
-+            if (maximg > 0) {
-+               ctx->pipe->set_shader_images(ctx->pipe, sh, 0, maximg, NULL);
-+            }
-             for (int i = 0; i < maxcb; i++) {
-                ctx->pipe->set_constant_buffer(ctx->pipe, sh, i, NULL);
-             }
-diff --git a/src/gallium/drivers/iris/iris_program.c b/src/gallium/drivers/iris/iris_program.c
-index 8157e921850..971fc80b5ac 100644
---- a/src/gallium/drivers/iris/iris_program.c
-+++ b/src/gallium/drivers/iris/iris_program.c
-@@ -2109,8 +2109,8 @@ iris_get_scratch_space(struct iris_context *ice,
-     * in the base configuration.
-     */
-    unsigned subslice_total = screen->subslice_total;
--   if (devinfo->gen >= 12)
--      subslice_total = devinfo->num_subslices[0];
-+   if (devinfo->gen == 12)
-+      subslice_total = (devinfo->is_dg1 || devinfo->gt == 2 ? 6 : 2);
-    else if (devinfo->gen == 11)
-       subslice_total = 8;
-    else if (devinfo->gen < 11)
-diff --git a/src/gallium/drivers/iris/iris_resolve.c b/src/gallium/drivers/iris/iris_resolve.c
-index 276ad62b1dd..045f43ed8c0 100644
---- a/src/gallium/drivers/iris/iris_resolve.c
-+++ b/src/gallium/drivers/iris/iris_resolve.c
-@@ -793,7 +793,9 @@ iris_resource_set_aux_state(struct iris_context *ice,
-       if (res->aux.state[level][start_layer + a] != aux_state) {
-          res->aux.state[level][start_layer + a] = aux_state;
-          /* XXX: Need to track which bindings to make dirty */
--         ice->state.dirty |= IRIS_DIRTY_RENDER_BUFFER;
-+         ice->state.dirty |= IRIS_DIRTY_RENDER_BUFFER |
-+                             IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES |
-+                             IRIS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES;
-          ice->state.stage_dirty |= IRIS_ALL_STAGE_DIRTY_BINDINGS;
-       }
-    }
-diff --git a/src/gallium/drivers/iris/iris_resource.c b/src/gallium/drivers/iris/iris_resource.c
-index 8747ef4aa8a..3b34e32cd21 100644
---- a/src/gallium/drivers/iris/iris_resource.c
-+++ b/src/gallium/drivers/iris/iris_resource.c
-@@ -1125,6 +1125,20 @@ iris_flush_resource(struct pipe_context *ctx, struct pipe_resource *resource)
-                                 0, INTEL_REMAINING_LAYERS,
-                                 mod ? mod->aux_usage : ISL_AUX_USAGE_NONE,
-                                 mod ? mod->supports_clear_color : false);
-+
-+   if (!res->mod_info && res->aux.usage != ISL_AUX_USAGE_NONE) {
-+      /* flush_resource may be used to prepare an image for sharing external
-+       * to the driver (e.g. via eglCreateImage). To account for this, make
-+       * sure to get rid of any compression that a consumer wouldn't know how
-+       * to handle.
-+       */
-+      for (int i = 0; i < IRIS_BATCH_COUNT; i++) {
-+         if (iris_batch_references(&ice->batches[i], res->bo))
-+            iris_batch_flush(&ice->batches[i]);
-+      }
-+
-+      iris_resource_disable_aux(res);
-+   }
- }
- 
- static void
-diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c
-index 59a63f7bbab..b9ddb863a16 100644
---- a/src/gallium/drivers/iris/iris_state.c
-+++ b/src/gallium/drivers/iris/iris_state.c
-@@ -1666,6 +1666,8 @@ struct iris_rasterizer_state {
-    bool multisample;
-    bool force_persample_interp;
-    bool conservative_rasterization;
-+   bool fill_mode_point;
-+   bool fill_mode_line;
-    bool fill_mode_point_or_line;
-    enum pipe_sprite_coord_mode sprite_coord_mode; /* PIPE_SPRITE_* */
-    uint16_t sprite_coord_enable;
-@@ -1729,11 +1731,15 @@ iris_create_rasterizer_state(struct pipe_context *ctx,
-    cso->conservative_rasterization =
-       state->conservative_raster_mode == PIPE_CONSERVATIVE_RASTER_POST_SNAP;
- 
--   cso->fill_mode_point_or_line =
--      state->fill_front == PIPE_POLYGON_MODE_LINE ||
-+   cso->fill_mode_point =
-       state->fill_front == PIPE_POLYGON_MODE_POINT ||
--      state->fill_back == PIPE_POLYGON_MODE_LINE ||
-       state->fill_back == PIPE_POLYGON_MODE_POINT;
-+   cso->fill_mode_line =
-+      state->fill_front == PIPE_POLYGON_MODE_LINE ||
-+      state->fill_back == PIPE_POLYGON_MODE_LINE;
-+   cso->fill_mode_point_or_line =
-+      cso->fill_mode_point ||
-+      cso->fill_mode_line;
- 
-    if (state->clip_plane_enable != 0)
-       cso->num_clip_plane_consts = util_logbase2(state->clip_plane_enable) + 1;
-@@ -4059,6 +4065,28 @@ iris_emit_sbe_swiz(struct iris_batch *batch,
-    }
- }
- 
-+static bool
-+iris_is_drawing_points(const struct iris_context *ice)
-+{
-+   const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
-+
-+   if (cso_rast->fill_mode_point) {
-+      return true;
-+   }
-+
-+   if (ice->shaders.prog[MESA_SHADER_GEOMETRY]) {
-+      const struct brw_gs_prog_data *gs_prog_data =
-+         (void *) ice->shaders.prog[MESA_SHADER_GEOMETRY]->prog_data;
-+      return gs_prog_data->output_topology == _3DPRIM_POINTLIST;
-+   } else if (ice->shaders.prog[MESA_SHADER_TESS_EVAL]) {
-+      const struct brw_tes_prog_data *tes_data =
-+         (void *) ice->shaders.prog[MESA_SHADER_TESS_EVAL]->prog_data;
-+      return tes_data->output_topology == BRW_TESS_OUTPUT_TOPOLOGY_POINT;
-+   } else {
-+      return ice->state.prim_mode == PIPE_PRIM_POINTS;
-+   }
-+}
-+
- static unsigned
- iris_calculate_point_sprite_overrides(const struct brw_wm_prog_data *prog_data,
-                                       const struct iris_rasterizer_state *cso)
-@@ -4093,7 +4121,8 @@ iris_emit_sbe(struct iris_batch *batch, const struct iris_context *ice)
-                                       &urb_read_offset, &urb_read_length);
- 
-    unsigned sprite_coord_overrides =
--      iris_calculate_point_sprite_overrides(wm_prog_data, cso_rast);
-+      iris_is_drawing_points(ice) ?
-+      iris_calculate_point_sprite_overrides(wm_prog_data, cso_rast) : 0;
- 
-    iris_emit_cmd(batch, GENX(3DSTATE_SBE), sbe) {
-       sbe.AttributeSwizzleEnable = true;
-diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
-index 8f688fa3650..ef35f86b05f 100644
---- a/src/gallium/drivers/radeonsi/si_descriptors.c
-+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
-@@ -1482,11 +1482,12 @@ void si_update_needs_color_decompress_masks(struct si_context *sctx)
- /* Reset descriptors of buffer resources after \p buf has been invalidated.
-  * If buf == NULL, reset all descriptors.
-  */
--static void si_reset_buffer_resources(struct si_context *sctx, struct si_buffer_resources *buffers,
-+static bool si_reset_buffer_resources(struct si_context *sctx, struct si_buffer_resources *buffers,
-                                       unsigned descriptors_idx, uint64_t slot_mask,
-                                       struct pipe_resource *buf, enum radeon_bo_priority priority)
- {
-    struct si_descriptors *descs = &sctx->descriptors[descriptors_idx];
-+   bool noop = true;
-    uint64_t mask = buffers->enabled_mask & slot_mask;
- 
-    while (mask) {
-@@ -1501,8 +1502,10 @@ static void si_reset_buffer_resources(struct si_context *sctx, struct si_buffer_
-             sctx, si_resource(buffer),
-             buffers->writable_mask & (1llu << i) ? RADEON_USAGE_READWRITE : RADEON_USAGE_READ,
-             priority, true);
-+         noop = false;
-       }
-    }
-+   return !noop;
- }
- 
- /* Update all buffer bindings where the buffer is bound, including
-@@ -1577,11 +1580,15 @@ void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf)
-    }
- 
-    if (!buffer || buffer->bind_history & PIPE_BIND_SHADER_BUFFER) {
--      for (shader = 0; shader < SI_NUM_SHADERS; shader++)
--         si_reset_buffer_resources(sctx, &sctx->const_and_shader_buffers[shader],
--                                   si_const_and_shader_buffer_descriptors_idx(shader),
--                                   u_bit_consecutive64(0, SI_NUM_SHADER_BUFFERS), buf,
--                                   sctx->const_and_shader_buffers[shader].priority);
-+      for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
-+         if (si_reset_buffer_resources(sctx, &sctx->const_and_shader_buffers[shader],
-+                                       si_const_and_shader_buffer_descriptors_idx(shader),
-+                                       u_bit_consecutive64(0, SI_NUM_SHADER_BUFFERS), buf,
-+                                       sctx->const_and_shader_buffers[shader].priority) &&
-+             shader == PIPE_SHADER_COMPUTE) {
-+            sctx->compute_shaderbuf_sgprs_dirty = true;
-+         }
-+      }
-    }
- 
-    if (!buffer || buffer->bind_history & PIPE_BIND_SAMPLER_VIEW) {
-@@ -1633,6 +1640,9 @@ void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf)
-                radeon_add_to_gfx_buffer_list_check_mem(sctx, si_resource(buffer),
-                                                        RADEON_USAGE_READWRITE,
-                                                        RADEON_PRIO_SAMPLER_BUFFER, true);
-+
-+               if (shader == PIPE_SHADER_COMPUTE)
-+                  sctx->compute_image_sgprs_dirty = true;
-             }
-          }
-       }
-diff --git a/src/gallium/frontends/dri/dri_helpers.c b/src/gallium/frontends/dri/dri_helpers.c
-index 01a1fb3d96c..5e87df35a55 100644
---- a/src/gallium/frontends/dri/dri_helpers.c
-+++ b/src/gallium/frontends/dri/dri_helpers.c
-@@ -258,7 +258,9 @@ dri2_create_image_from_renderbuffer2(__DRIcontext *context,
- 				     int renderbuffer, void *loaderPrivate,
-                                      unsigned *error)
- {
--   struct gl_context *ctx = ((struct st_context *)dri_context(context)->st)->ctx;
-+   struct st_context *st_ctx = (struct st_context *)dri_context(context)->st;
-+   struct gl_context *ctx = st_ctx->ctx;
-+   struct pipe_context *p_ctx = st_ctx->pipe;
-    struct gl_renderbuffer *rb;
-    struct pipe_resource *tex;
-    __DRIimage *img;
-@@ -299,6 +301,13 @@ dri2_create_image_from_renderbuffer2(__DRIcontext *context,
- 
-    pipe_resource_reference(&img->texture, tex);
- 
-+   /* If the resource supports EGL_MESA_image_dma_buf_export, make sure that
-+    * it's in a shareable state. Do this now while we still have the access to
-+    * the context.
-+    */
-+   if (dri2_get_mapping_by_format(img->dri_format))
-+      p_ctx->flush_resource(p_ctx, tex);
-+
-    *error = __DRI_IMAGE_ERROR_SUCCESS;
-    return img;
- }
-@@ -326,7 +335,9 @@ dri2_create_from_texture(__DRIcontext *context, int target, unsigned texture,
-                          void *loaderPrivate)
- {
-    __DRIimage *img;
--   struct gl_context *ctx = ((struct st_context *)dri_context(context)->st)->ctx;
-+   struct st_context *st_ctx = (struct st_context *)dri_context(context)->st;
-+   struct gl_context *ctx = st_ctx->ctx;
-+   struct pipe_context *p_ctx = st_ctx->pipe;
-    struct gl_texture_object *obj;
-    struct pipe_resource *tex;
-    GLuint face = 0;
-@@ -376,6 +387,13 @@ dri2_create_from_texture(__DRIcontext *context, int target, unsigned texture,
- 
-    pipe_resource_reference(&img->texture, tex);
- 
-+   /* If the resource supports EGL_MESA_image_dma_buf_export, make sure that
-+    * it's in a shareable state. Do this now while we still have the access to
-+    * the context.
-+    */
-+   if (dri2_get_mapping_by_format(img->dri_format))
-+      p_ctx->flush_resource(p_ctx, tex);
-+
-    *error = __DRI_IMAGE_ERROR_SUCCESS;
-    return img;
- }
-@@ -547,6 +565,9 @@ dri2_get_mapping_by_fourcc(int fourcc)
- const struct dri2_format_mapping *
- dri2_get_mapping_by_format(int format)
- {
-+   if (format == __DRI_IMAGE_FORMAT_NONE)
-+      return NULL;
-+
-    for (unsigned i = 0; i < ARRAY_SIZE(dri2_format_table); i++) {
-       if (dri2_format_table[i].dri_format == format)
-          return &dri2_format_table[i];
-diff --git a/src/gallium/frontends/lavapipe/lvp_device.c b/src/gallium/frontends/lavapipe/lvp_device.c
-index 45734f95880..187aecde1f8 100644
---- a/src/gallium/frontends/lavapipe/lvp_device.c
-+++ b/src/gallium/frontends/lavapipe/lvp_device.c
-@@ -52,8 +52,6 @@ lvp_physical_device_init(struct lvp_physical_device *device,
-    if (!device->pscreen)
-       return vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY);
- 
--   fprintf(stderr, "WARNING: lavapipe is not a conformant vulkan implementation, testing use only.\n");
--
-    device->max_images = device->pscreen->get_shader_param(device->pscreen, PIPE_SHADER_FRAGMENT, PIPE_SHADER_CAP_MAX_SHADER_IMAGES);
-    lvp_physical_device_get_supported_extensions(device, &device->supported_extensions);
-    result = lvp_init_wsi(device);
-@@ -575,6 +573,19 @@ void lvp_GetPhysicalDeviceProperties2(
-    }
- }
- 
-+static void lvp_get_physical_device_queue_family_properties(
-+   VkQueueFamilyProperties*                    pQueueFamilyProperties)
-+{
-+   *pQueueFamilyProperties = (VkQueueFamilyProperties) {
-+      .queueFlags = VK_QUEUE_GRAPHICS_BIT |
-+      VK_QUEUE_COMPUTE_BIT |
-+      VK_QUEUE_TRANSFER_BIT,
-+      .queueCount = 1,
-+      .timestampValidBits = 64,
-+      .minImageTransferGranularity = (VkExtent3D) { 1, 1, 1 },
-+   };
-+}
-+
- void lvp_GetPhysicalDeviceQueueFamilyProperties(
-    VkPhysicalDevice                            physicalDevice,
-    uint32_t*                                   pCount,
-@@ -586,15 +597,21 @@ void lvp_GetPhysicalDeviceQueueFamilyProperties(
-    }
- 
-    assert(*pCount >= 1);
-+   lvp_get_physical_device_queue_family_properties(pQueueFamilyProperties);
-+}
- 
--   *pQueueFamilyProperties = (VkQueueFamilyProperties) {
--      .queueFlags = VK_QUEUE_GRAPHICS_BIT |
--      VK_QUEUE_COMPUTE_BIT |
--      VK_QUEUE_TRANSFER_BIT,
--      .queueCount = 1,
--      .timestampValidBits = 64,
--      .minImageTransferGranularity = (VkExtent3D) { 1, 1, 1 },
--   };
-+void lvp_GetPhysicalDeviceQueueFamilyProperties2(
-+   VkPhysicalDevice                            physicalDevice,
-+   uint32_t*                                   pCount,
-+   VkQueueFamilyProperties2                   *pQueueFamilyProperties)
-+{
-+   if (pQueueFamilyProperties == NULL) {
-+      *pCount = 1;
-+      return;
-+   }
-+
-+   assert(*pCount >= 1);
-+   lvp_get_physical_device_queue_family_properties(&pQueueFamilyProperties->queueFamilyProperties);
- }
- 
- void lvp_GetPhysicalDeviceMemoryProperties(
-@@ -617,6 +634,14 @@ void lvp_GetPhysicalDeviceMemoryProperties(
-    };
- }
- 
-+void lvp_GetPhysicalDeviceMemoryProperties2(
-+   VkPhysicalDevice                            physicalDevice,
-+   VkPhysicalDeviceMemoryProperties2          *pMemoryProperties)
-+{
-+   lvp_GetPhysicalDeviceMemoryProperties(physicalDevice,
-+                                         &pMemoryProperties->memoryProperties);
-+}
-+
- PFN_vkVoidFunction lvp_GetInstanceProcAddr(
-    VkInstance                                  _instance,
-    const char*                                 pName)
-@@ -822,6 +847,8 @@ VkResult lvp_CreateDevice(
-    const VkAllocationCallbacks*                pAllocator,
-    VkDevice*                                   pDevice)
- {
-+   fprintf(stderr, "WARNING: lavapipe is not a conformant vulkan implementation, testing use only.\n");
-+
-    LVP_FROM_HANDLE(lvp_physical_device, physical_device, physicalDevice);
-    struct lvp_device *device;
- 
-diff --git a/src/glx/g_glxglvnddispatchfuncs.c b/src/glx/g_glxglvnddispatchfuncs.c
-index 0f02ed2d321..e0ea27c0b18 100644
---- a/src/glx/g_glxglvnddispatchfuncs.c
-+++ b/src/glx/g_glxglvnddispatchfuncs.c
-@@ -87,6 +87,7 @@ const char * const __glXDispatchTableStrings[DI_LAST_INDEX] = {
-     __ATTRIB(SelectEventSGIX),
-     // glXSwapBuffers implemented by libglvnd
-     __ATTRIB(SwapBuffersMscOML),
-+    __ATTRIB(SwapIntervalEXT),
-     __ATTRIB(SwapIntervalMESA),
-     __ATTRIB(SwapIntervalSGI),
-     // glXUseXFont implemented by libglvnd
-@@ -893,6 +894,24 @@ static int dispatch_SwapIntervalMESA(unsigned int interval)
- 
- 
- 
-+static void dispatch_SwapIntervalEXT(Display *dpy, GLXDrawable drawable, int interval)
-+{
-+    PFNGLXSWAPINTERVALEXTPROC pSwapIntervalEXT;
-+    __GLXvendorInfo *dd;
-+
-+    dd = GetDispatchFromDrawable(dpy, drawable);
-+    if (dd == NULL)
-+        return;
-+
-+    __FETCH_FUNCTION_PTR(SwapIntervalEXT);
-+    if (pSwapIntervalEXT == NULL)
-+        return;
-+
-+    pSwapIntervalEXT(dpy, drawable, interval);
-+}
-+
-+
-+
- static Bool dispatch_WaitForMscOML(Display *dpy, GLXDrawable drawable,
-                                       int64_t target_msc, int64_t divisor,
-                                       int64_t remainder, int64_t *ust,
-@@ -974,6 +993,7 @@ const void * const __glXDispatchFunctions[DI_LAST_INDEX + 1] = {
-     __ATTRIB(ReleaseTexImageEXT),
-     __ATTRIB(SelectEventSGIX),
-     __ATTRIB(SwapBuffersMscOML),
-+    __ATTRIB(SwapIntervalEXT),
-     __ATTRIB(SwapIntervalMESA),
-     __ATTRIB(SwapIntervalSGI),
-     __ATTRIB(WaitForMscOML),
-diff --git a/src/glx/g_glxglvnddispatchindices.h b/src/glx/g_glxglvnddispatchindices.h
-index 3ba50a74abb..b65d078098f 100644
---- a/src/glx/g_glxglvnddispatchindices.h
-+++ b/src/glx/g_glxglvnddispatchindices.h
-@@ -79,6 +79,7 @@ typedef enum __GLXdispatchIndex {
-     DI_SelectEventSGIX,
-     // SwapBuffers implemented by libglvnd
-     DI_SwapBuffersMscOML,
-+    DI_SwapIntervalEXT,
-     DI_SwapIntervalMESA,
-     DI_SwapIntervalSGI,
-     // UseXFont implemented by libglvnd
-diff --git a/src/intel/common/gen_mi_builder.h b/src/intel/common/gen_mi_builder.h
-index ddd8459ef07..47fb98e99f7 100644
---- a/src/intel/common/gen_mi_builder.h
-+++ b/src/intel/common/gen_mi_builder.h
-@@ -932,6 +932,13 @@ gen_mi_store_address(struct gen_mi_builder *b,
- static inline void
- gen_mi_self_mod_barrier(struct gen_mi_builder *b)
- {
-+   /* First make sure all the memory writes from previous modifying commands
-+    * have landed. We want to do this before going through the CS cache,
-+    * otherwise we could be fetching memory that hasn't been written to yet.
-+    */
-+   gen_mi_builder_emit(b, GENX(PIPE_CONTROL), pc) {
-+      pc.CommandStreamerStallEnable = true;
-+   }
-    /* Documentation says Gen11+ should be able to invalidate the command cache
-     * but experiment show it doesn't work properly, so for now just get over
-     * the CS prefetch.
-diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp b/src/intel/compiler/brw_fs_copy_propagation.cpp
-index 917c3abfe9e..6896987055f 100644
---- a/src/intel/compiler/brw_fs_copy_propagation.cpp
-+++ b/src/intel/compiler/brw_fs_copy_propagation.cpp
-@@ -437,6 +437,7 @@ instruction_requires_packed_data(fs_inst *inst)
-    case FS_OPCODE_DDX_COARSE:
-    case FS_OPCODE_DDY_FINE:
-    case FS_OPCODE_DDY_COARSE:
-+   case SHADER_OPCODE_QUAD_SWIZZLE:
-       return true;
-    default:
-       return false;
-diff --git a/src/intel/compiler/brw_ir_fs.h b/src/intel/compiler/brw_ir_fs.h
-index 6ba3a6ca97e..3a4acc1834a 100644
---- a/src/intel/compiler/brw_ir_fs.h
-+++ b/src/intel/compiler/brw_ir_fs.h
-@@ -451,13 +451,15 @@ regs_written(const fs_inst *inst)
-  * Return the number of dataflow registers read by the instruction (either
-  * fully or partially) counted from 'floor(reg_offset(inst->src[i]) /
-  * register_size)'.  The somewhat arbitrary register size unit is 4B for the
-- * UNIFORM and IMM files and 32B for all other files.
-+ * UNIFORM files and 32B for all other files.
-  */
- inline unsigned
- regs_read(const fs_inst *inst, unsigned i)
- {
--   const unsigned reg_size =
--      inst->src[i].file == UNIFORM || inst->src[i].file == IMM ? 4 : REG_SIZE;
-+   if (inst->src[i].file == IMM)
-+      return 1;
-+
-+   const unsigned reg_size = inst->src[i].file == UNIFORM ? 4 : REG_SIZE;
-    return DIV_ROUND_UP(reg_offset(inst->src[i]) % reg_size +
-                        inst->size_read(i) -
-                        MIN2(inst->size_read(i), reg_padding(inst->src[i])),
-diff --git a/src/intel/vulkan/anv_allocator.c b/src/intel/vulkan/anv_allocator.c
-index 9007cd00e85..48811912e95 100644
---- a/src/intel/vulkan/anv_allocator.c
-+++ b/src/intel/vulkan/anv_allocator.c
-@@ -1447,8 +1447,8 @@ anv_scratch_pool_alloc(struct anv_device *device, struct anv_scratch_pool *pool,
-     * For, Gen11+, scratch space allocation is based on the number of threads
-     * in the base configuration.
-     */
--   if (devinfo->gen >= 12)
--      subslices = devinfo->num_subslices[0];
-+   if (devinfo->gen == 12)
-+      subslices = (devinfo->is_dg1 || devinfo->gt == 2 ? 6 : 2);
-    else if (devinfo->gen == 11)
-       subslices = 8;
-    else if (devinfo->gen >= 9)
-diff --git a/src/intel/vulkan/anv_image.c b/src/intel/vulkan/anv_image.c
-index 0290431f145..80307cd612f 100644
---- a/src/intel/vulkan/anv_image.c
-+++ b/src/intel/vulkan/anv_image.c
-@@ -684,6 +684,25 @@ choose_drm_format_mod(const struct anv_physical_device *device,
-       return NULL;
- }
- 
-+static VkImageUsageFlags
-+anv_image_create_usage(const VkImageCreateInfo *pCreateInfo,
-+                       VkImageUsageFlags usage)
-+{
-+   /* Add TRANSFER_SRC usage for multisample attachment images. This is
-+    * because we might internally use the TRANSFER_SRC layout on them for
-+    * blorp operations associated with resolving those into other attachments
-+    * at the end of a subpass.
-+    *
-+    * Without this additional usage, we compute an incorrect AUX state in
-+    * anv_layout_to_aux_state().
-+    */
-+   if (pCreateInfo->samples > VK_SAMPLE_COUNT_1_BIT &&
-+       (usage & (VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT |
-+                 VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT)))
-+      usage |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT;
-+   return usage;
-+}
-+
- VkResult
- anv_image_create(VkDevice _device,
-                  const struct anv_image_create_info *create_info,
-@@ -732,7 +751,7 @@ anv_image_create(VkDevice _device,
-    image->levels = pCreateInfo->mipLevels;
-    image->array_size = pCreateInfo->arrayLayers;
-    image->samples = pCreateInfo->samples;
--   image->usage = pCreateInfo->usage;
-+   image->usage = anv_image_create_usage(pCreateInfo, pCreateInfo->usage);
-    image->create_flags = pCreateInfo->flags;
-    image->tiling = pCreateInfo->tiling;
-    image->disjoint = pCreateInfo->flags & VK_IMAGE_CREATE_DISJOINT_BIT;
-@@ -745,8 +764,11 @@ anv_image_create(VkDevice _device,
-       const VkImageStencilUsageCreateInfoEXT *stencil_usage_info =
-          vk_find_struct_const(pCreateInfo->pNext,
-                               IMAGE_STENCIL_USAGE_CREATE_INFO_EXT);
--      if (stencil_usage_info)
--         image->stencil_usage = stencil_usage_info->stencilUsage;
-+      if (stencil_usage_info) {
-+         image->stencil_usage =
-+            anv_image_create_usage(pCreateInfo,
-+                                   stencil_usage_info->stencilUsage);
-+      }
-    }
- 
-    /* In case of external format, We don't know format yet,
-diff --git a/src/intel/vulkan/anv_pass.c b/src/intel/vulkan/anv_pass.c
-index af23b87969d..1818f6c587b 100644
---- a/src/intel/vulkan/anv_pass.c
-+++ b/src/intel/vulkan/anv_pass.c
-@@ -23,6 +23,7 @@
- 
- #include "anv_private.h"
- 
-+#include "vk_format_info.h"
- #include "vk_util.h"
- 
- static void
-@@ -406,6 +407,70 @@ num_subpass_attachments2(const VkSubpassDescription2KHR *desc)
-           (ds_resolve && ds_resolve->pDepthStencilResolveAttachment);
- }
- 
-+static bool
-+vk_image_layout_depth_only(VkImageLayout layout)
-+{
-+   switch (layout) {
-+   case VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_OPTIMAL:
-+   case VK_IMAGE_LAYOUT_DEPTH_ATTACHMENT_OPTIMAL:
-+      return true;
-+
-+   default:
-+      return false;
-+   }
-+}
-+
-+/* From the Vulkan Specification 1.2.166 - VkAttachmentReference2:
-+ *
-+ *   "If layout only specifies the layout of the depth aspect of the
-+ *    attachment, the layout of the stencil aspect is specified by the
-+ *    stencilLayout member of a VkAttachmentReferenceStencilLayout structure
-+ *    included in the pNext chain. Otherwise, layout describes the layout for
-+ *    all relevant image aspects."
-+ */
-+static VkImageLayout
-+stencil_ref_layout(const VkAttachmentReference2KHR *att_ref)
-+{
-+   if (!vk_image_layout_depth_only(att_ref->layout))
-+      return att_ref->layout;
-+
-+   const VkAttachmentReferenceStencilLayoutKHR *stencil_ref =
-+      vk_find_struct_const(att_ref->pNext,
-+                           ATTACHMENT_REFERENCE_STENCIL_LAYOUT_KHR);
-+   if (!stencil_ref)
-+      return VK_IMAGE_LAYOUT_UNDEFINED;
-+   return stencil_ref->stencilLayout;
-+}
-+
-+/* From the Vulkan Specification 1.2.166 - VkAttachmentDescription2:
-+ *
-+ *   "If format is a depth/stencil format, and initialLayout only specifies
-+ *    the initial layout of the depth aspect of the attachment, the initial
-+ *    layout of the stencil aspect is specified by the stencilInitialLayout
-+ *    member of a VkAttachmentDescriptionStencilLayout structure included in
-+ *    the pNext chain. Otherwise, initialLayout describes the initial layout
-+ *    for all relevant image aspects."
-+ */
-+static VkImageLayout
-+stencil_desc_layout(const VkAttachmentDescription2KHR *att_desc, bool final)
-+{
-+   if (!vk_format_has_stencil(att_desc->format))
-+      return VK_IMAGE_LAYOUT_UNDEFINED;
-+
-+   const VkImageLayout main_layout =
-+      final ? att_desc->finalLayout : att_desc->initialLayout;
-+   if (!vk_image_layout_depth_only(main_layout))
-+      return main_layout;
-+
-+   const VkAttachmentDescriptionStencilLayoutKHR *stencil_desc =
-+      vk_find_struct_const(att_desc->pNext,
-+                           ATTACHMENT_DESCRIPTION_STENCIL_LAYOUT_KHR);
-+   assert(stencil_desc);
-+   return final ?
-+      stencil_desc->stencilFinalLayout :
-+      stencil_desc->stencilInitialLayout;
-+}
-+
- VkResult anv_CreateRenderPass2(
-     VkDevice                                    _device,
-     const VkRenderPassCreateInfo2KHR*           pCreateInfo,
-@@ -450,10 +515,6 @@ VkResult anv_CreateRenderPass2(
-    pass->subpass_flushes = subpass_flushes;
- 
-    for (uint32_t i = 0; i < pCreateInfo->attachmentCount; i++) {
--      const VkAttachmentDescriptionStencilLayoutKHR *stencil_layout =
--         vk_find_struct_const(pCreateInfo->pAttachments[i].pNext,
--                              ATTACHMENT_DESCRIPTION_STENCIL_LAYOUT_KHR);
--
-       pass->attachments[i] = (struct anv_render_pass_attachment) {
-          .format                 = pCreateInfo->pAttachments[i].format,
-          .samples                = pCreateInfo->pAttachments[i].samples,
-@@ -463,12 +524,10 @@ VkResult anv_CreateRenderPass2(
-          .initial_layout         = pCreateInfo->pAttachments[i].initialLayout,
-          .final_layout           = pCreateInfo->pAttachments[i].finalLayout,
- 
--         .stencil_initial_layout = (stencil_layout ?
--                                    stencil_layout->stencilInitialLayout :
--                                    pCreateInfo->pAttachments[i].initialLayout),
--         .stencil_final_layout   = (stencil_layout ?
--                                    stencil_layout->stencilFinalLayout :
--                                    pCreateInfo->pAttachments[i].finalLayout),
-+         .stencil_initial_layout = stencil_desc_layout(&pCreateInfo->pAttachments[i],
-+                                                       false),
-+         .stencil_final_layout   = stencil_desc_layout(&pCreateInfo->pAttachments[i],
-+                                                       true),
-       };
-    }
- 
-@@ -487,17 +546,11 @@ VkResult anv_CreateRenderPass2(
-          subpass_attachments += desc->inputAttachmentCount;
- 
-          for (uint32_t j = 0; j < desc->inputAttachmentCount; j++) {
--            const VkAttachmentReferenceStencilLayoutKHR *stencil_layout =
--               vk_find_struct_const(desc->pInputAttachments[j].pNext,
--                                    ATTACHMENT_REFERENCE_STENCIL_LAYOUT_KHR);
--
-             subpass->input_attachments[j] = (struct anv_subpass_attachment) {
-                .usage =          VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT,
-                .attachment =     desc->pInputAttachments[j].attachment,
-                .layout =         desc->pInputAttachments[j].layout,
--               .stencil_layout = (stencil_layout ?
--                                  stencil_layout->stencilLayout :
--                                  desc->pInputAttachments[j].layout),
-+               .stencil_layout = stencil_ref_layout(&desc->pInputAttachments[j]),
-             };
-          }
-       }
-@@ -531,17 +584,11 @@ VkResult anv_CreateRenderPass2(
-       if (desc->pDepthStencilAttachment) {
-          subpass->depth_stencil_attachment = subpass_attachments++;
- 
--         const VkAttachmentReferenceStencilLayoutKHR *stencil_attachment =
--            vk_find_struct_const(desc->pDepthStencilAttachment->pNext,
--                                 ATTACHMENT_REFERENCE_STENCIL_LAYOUT_KHR);
--
-          *subpass->depth_stencil_attachment = (struct anv_subpass_attachment) {
-             .usage =          VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
-             .attachment =     desc->pDepthStencilAttachment->attachment,
-             .layout =         desc->pDepthStencilAttachment->layout,
--            .stencil_layout = stencil_attachment ?
--                              stencil_attachment->stencilLayout :
--                              desc->pDepthStencilAttachment->layout,
-+            .stencil_layout = stencil_ref_layout(desc->pDepthStencilAttachment),
-          };
-       }
- 
-@@ -552,17 +599,11 @@ VkResult anv_CreateRenderPass2(
-       if (ds_resolve && ds_resolve->pDepthStencilResolveAttachment) {
-          subpass->ds_resolve_attachment = subpass_attachments++;
- 
--         const VkAttachmentReferenceStencilLayoutKHR *stencil_resolve_attachment =
--            vk_find_struct_const(ds_resolve->pDepthStencilResolveAttachment->pNext,
--                                 ATTACHMENT_REFERENCE_STENCIL_LAYOUT_KHR);
--
-          *subpass->ds_resolve_attachment = (struct anv_subpass_attachment) {
-             .usage =          VK_IMAGE_USAGE_TRANSFER_DST_BIT,
-             .attachment =     ds_resolve->pDepthStencilResolveAttachment->attachment,
-             .layout =         ds_resolve->pDepthStencilResolveAttachment->layout,
--            .stencil_layout = stencil_resolve_attachment ?
--                              stencil_resolve_attachment->stencilLayout :
--                              ds_resolve->pDepthStencilResolveAttachment->layout,
-+            .stencil_layout = stencil_ref_layout(ds_resolve->pDepthStencilResolveAttachment),
-          };
-          subpass->depth_resolve_mode = ds_resolve->depthResolveMode;
-          subpass->stencil_resolve_mode = ds_resolve->stencilResolveMode;
-diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
-index a9c49e0f592..e3eb376fa5a 100644
---- a/src/intel/vulkan/genX_cmd_buffer.c
-+++ b/src/intel/vulkan/genX_cmd_buffer.c
-@@ -462,8 +462,10 @@ anv_image_init_aux_tt(struct anv_cmd_buffer *cmd_buffer,
- {
-    uint32_t plane = anv_image_aspect_to_plane(image->aspects, aspect);
- 
-+   const struct anv_surface *surface = &image->planes[plane].surface;
-    uint64_t base_address =
--      anv_address_physical(image->planes[plane].address);
-+      anv_address_physical(anv_address_add(image->planes[plane].address,
-+                                           surface->offset));
- 
-    const struct isl_surf *isl_surf = &image->planes[plane].surface.isl;
-    uint64_t format_bits = gen_aux_map_format_bits_for_isl_surf(isl_surf);
-@@ -1231,6 +1233,17 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
-             uint32_t level_layer_count =
-                MIN2(layer_count, aux_layers - base_layer);
- 
-+            /* If will_full_fast_clear is set, the caller promises to
-+             * fast-clear the largest portion of the specified range as it can.
-+             * For color images, that means only the first LOD and array slice.
-+             */
-+            if (level == 0 && base_layer == 0 && will_full_fast_clear) {
-+               base_layer++;
-+               level_layer_count--;
-+               if (level_layer_count == 0)
-+                  continue;
-+            }
-+
-             anv_image_ccs_op(cmd_buffer, image,
-                              image->planes[plane].surface.isl.format,
-                              ISL_SWIZZLE_IDENTITY,
-@@ -1250,6 +1263,12 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
-                           "define an MCS buffer.");
-          }
- 
-+         /* If will_full_fast_clear is set, the caller promises to fast-clear
-+          * the largest portion of the specified range as it can.
-+          */
-+         if (will_full_fast_clear)
-+            return;
-+
-          assert(base_level == 0 && level_count == 1);
-          anv_image_mcs_op(cmd_buffer, image,
-                           image->planes[plane].surface.isl.format,
-diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c
-index 205e8677f19..33f071019b7 100644
---- a/src/intel/vulkan/genX_pipeline.c
-+++ b/src/intel/vulkan/genX_pipeline.c
-@@ -1180,7 +1180,22 @@ emit_cb_state(struct anv_graphics_pipeline *pipeline,
- #endif
-          .LogicOpEnable = info->logicOpEnable,
-          .LogicOpFunction = vk_to_gen_logic_op[info->logicOp],
--         .ColorBufferBlendEnable = a->blendEnable,
-+         /* Vulkan specification 1.2.168, VkLogicOp:
-+          *
-+          *   "Logical operations are controlled by the logicOpEnable and
-+          *    logicOp members of VkPipelineColorBlendStateCreateInfo. If
-+          *    logicOpEnable is VK_TRUE, then a logical operation selected by
-+          *    logicOp is applied between each color attachment and the
-+          *    fragment’s corresponding output value, and blending of all
-+          *    attachments is treated as if it were disabled."
-+          *
-+          * From the Broadwell PRM Volume 2d: Command Reference: Structures:
-+          * BLEND_STATE_ENTRY:
-+          *
-+          *   "Enabling LogicOp and Color Buffer Blending at the same time is
-+          *    UNDEFINED"
-+          */
-+         .ColorBufferBlendEnable = !info->logicOpEnable && a->blendEnable,
-          .ColorClampRange = COLORCLAMP_RTFORMAT,
-          .PreBlendColorClampEnable = true,
-          .PostBlendColorClampEnable = true,
-diff --git a/src/intel/vulkan/vk_format_info.h b/src/intel/vulkan/vk_format_info.h
-index 006e1f4a6ad..4e72c244742 100644
---- a/src/intel/vulkan/vk_format_info.h
-+++ b/src/intel/vulkan/vk_format_info.h
-@@ -164,4 +164,11 @@ vk_format_has_depth(VkFormat format)
-    return aspects & VK_IMAGE_ASPECT_DEPTH_BIT;
- }
- 
-+static inline bool
-+vk_format_has_stencil(VkFormat format)
-+{
-+   const VkImageAspectFlags aspects = vk_format_aspects(format);
-+   return aspects & VK_IMAGE_ASPECT_STENCIL_BIT;
-+}
-+
- #endif /* VK_FORMAT_INFO_H */
-diff --git a/src/mesa/state_tracker/st_pbo.c b/src/mesa/state_tracker/st_pbo.c
-index 65a1ce8862a..b03921c1be6 100644
---- a/src/mesa/state_tracker/st_pbo.c
-+++ b/src/mesa/state_tracker/st_pbo.c
-@@ -431,16 +431,21 @@ create_fs(struct st_context *st, bool download,
-    nir_ssa_def *coord = nir_load_var(&b, fragcoord);
- 
-    nir_ssa_def *layer = NULL;
--   if (st->pbo.layers && need_layer && (!download || target == PIPE_TEXTURE_1D_ARRAY ||
--                                                     target == PIPE_TEXTURE_2D_ARRAY ||
--                                                     target == PIPE_TEXTURE_3D ||
--                                                     target == PIPE_TEXTURE_CUBE ||
--                                                     target == PIPE_TEXTURE_CUBE_ARRAY)) {
--      nir_variable *var = nir_variable_create(b.shader, nir_var_shader_in,
--                                              glsl_int_type(), "gl_Layer");
--      var->data.location = VARYING_SLOT_LAYER;
--      var->data.interpolation = INTERP_MODE_FLAT;
--      layer = nir_load_var(&b, var);
-+   if (st->pbo.layers && (!download || target == PIPE_TEXTURE_1D_ARRAY ||
-+                                       target == PIPE_TEXTURE_2D_ARRAY ||
-+                                       target == PIPE_TEXTURE_3D ||
-+                                       target == PIPE_TEXTURE_CUBE ||
-+                                       target == PIPE_TEXTURE_CUBE_ARRAY)) {
-+      if (need_layer) {
-+         nir_variable *var = nir_variable_create(b.shader, nir_var_shader_in,
-+                                                glsl_int_type(), "gl_Layer");
-+         var->data.location = VARYING_SLOT_LAYER;
-+         var->data.interpolation = INTERP_MODE_FLAT;
-+         layer = nir_load_var(&b, var);
-+      }
-+      else {
-+         layer = zero;
-+      }
-    }
- 
-    /* offset_pos = param.xy + f2i(coord.xy) */
-diff --git a/src/util/format/u_format.csv b/src/util/format/u_format.csv
-index 8acfb869bdb..237c4c95475 100644
---- a/src/util/format/u_format.csv
-+++ b/src/util/format/u_format.csv
-@@ -500,7 +500,7 @@ PIPE_FORMAT_R4G4B4A4_UINT           , plain, 1, 1, 1, up4 , up4 , up4 , up4 , xy
- PIPE_FORMAT_B4G4R4A4_UINT           , plain, 1, 1, 1, up4 , up4 , up4 , up4 , zyxw, rgb, up4 , up4 , up4 , up4 , yzwx
- PIPE_FORMAT_A4R4G4B4_UINT           , plain, 1, 1, 1, up4 , up4 , up4 , up4 , yzwx, rgb, up4 , up4 , up4 , up4 , zyxw
- PIPE_FORMAT_A4B4G4R4_UINT           , plain, 1, 1, 1, up4 , up4 , up4 , up4 , wzyx, rgb, up4 , up4 , up4 , up4 , xyzw
--PIPE_FORMAT_A1R5G5B5_UINT           , plain, 1, 1, 1, up1 , up5 , up5 , up5 , wzyx, rgb, up5 , up5 , up5 , up1 , zyxw
-+PIPE_FORMAT_A1R5G5B5_UINT           , plain, 1, 1, 1, up1 , up5 , up5 , up5 , yzwx, rgb, up5 , up5 , up5 , up1 , zyxw
- PIPE_FORMAT_A1B5G5R5_UINT           , plain, 1, 1, 1, up1 , up5 , up5 , up5 , wzyx, rgb, up5 , up5 , up5 , up1 , xyzw
- PIPE_FORMAT_R5G5B5A1_UINT           , plain, 1, 1, 1, up5 , up5 , up5 , up1 , xyzw, rgb, up5 , up5 , up5 , up1 , wzyx
- PIPE_FORMAT_B5G5R5A1_UINT           , plain, 1, 1, 1, up5 , up5 , up5 , up1 , zyxw, rgb, up1 , up5 , up5 , up5 , yzwx
-diff --git a/src/vulkan/device-select-layer/VkLayer_MESA_device_select.json b/src/vulkan/device-select-layer/VkLayer_MESA_device_select.json
-index 1d5fffd0135..361ae9fe74e 100644
---- a/src/vulkan/device-select-layer/VkLayer_MESA_device_select.json
-+++ b/src/vulkan/device-select-layer/VkLayer_MESA_device_select.json
-@@ -4,7 +4,7 @@
-     "name": "VK_LAYER_MESA_device_select",
-     "type": "GLOBAL",
-     "library_path": "libVkLayer_MESA_device_select.so",
--    "api_version": "1.1.73",
-+    "api_version": "1.2.73",
-     "implementation_version": "1",
-     "description": "Linux device selection layer",
-     "functions": {
diff --git a/SOURCES/mesa-vk-wsi-sw-fixes.patch b/SOURCES/mesa-vk-wsi-sw-fixes.patch
new file mode 100644
index 0000000..a72e411
--- /dev/null
+++ b/SOURCES/mesa-vk-wsi-sw-fixes.patch
@@ -0,0 +1,403 @@
+diff --git a/src/vulkan/wsi/wsi_common_wayland.c b/src/vulkan/wsi/wsi_common_wayland.c
+index e2a7d337ecf..bc4d87611e0 100644
+--- a/src/vulkan/wsi/wsi_common_wayland.c
++++ b/src/vulkan/wsi/wsi_common_wayland.c
+@@ -31,6 +31,7 @@
+ #include <string.h>
+ #include <pthread.h>
+ #include <poll.h>
++#include <sys/mman.h>
+ 
+ #include "drm-uapi/drm_fourcc.h"
+ 
+@@ -44,9 +45,15 @@
+ #include <util/hash_table.h>
+ #include <util/timespec.h>
+ #include <util/u_vector.h>
++#include <util/anon_file.h>
+ 
+ struct wsi_wayland;
+ 
++struct wsi_wl_display_swrast {
++   struct wl_shm *                              wl_shm;
++   struct u_vector                              formats;
++};
++
+ struct wsi_wl_display_drm {
+    struct wl_drm *                              wl_drm;
+    struct u_vector                              formats;
+@@ -69,6 +76,7 @@ struct wsi_wl_display {
+    struct wl_display *                          wl_display_wrapper;
+    struct wl_event_queue *                      queue;
+ 
++   struct wsi_wl_display_swrast                 swrast;
+    struct wsi_wl_display_drm                    drm;
+    struct wsi_wl_display_dmabuf                 dmabuf;
+ 
+@@ -79,6 +87,8 @@ struct wsi_wl_display {
+ 
+    /* Only used for displays created by wsi_wl_display_create */
+    uint32_t                                     refcount;
++
++   bool sw;
+ };
+ 
+ struct wsi_wayland {
+@@ -183,6 +193,40 @@ wsi_wl_display_add_wl_format(struct wsi_wl_display *display,
+    }
+ }
+ 
++static void
++wsi_wl_display_add_wl_shm_format(struct wsi_wl_display *display,
++                                 struct u_vector *formats,
++                                 uint32_t wl_shm_format)
++{
++   switch (wl_shm_format) {
++   case WL_SHM_FORMAT_XBGR8888:
++      wsi_wl_display_add_vk_format(display, formats,
++                                   VK_FORMAT_R8G8B8_SRGB);
++      wsi_wl_display_add_vk_format(display, formats,
++                                   VK_FORMAT_R8G8B8_UNORM);
++      FALLTHROUGH;
++   case WL_SHM_FORMAT_ABGR8888:
++      wsi_wl_display_add_vk_format(display, formats,
++                                   VK_FORMAT_R8G8B8A8_SRGB);
++      wsi_wl_display_add_vk_format(display, formats,
++                                   VK_FORMAT_R8G8B8A8_UNORM);
++      break;
++   case WL_SHM_FORMAT_XRGB8888:
++      wsi_wl_display_add_vk_format(display, formats,
++                                   VK_FORMAT_B8G8R8_SRGB);
++      wsi_wl_display_add_vk_format(display, formats,
++                                   VK_FORMAT_B8G8R8_UNORM);
++      FALLTHROUGH;
++   case WL_SHM_FORMAT_ARGB8888:
++      wsi_wl_display_add_vk_format(display, formats,
++                                   VK_FORMAT_B8G8R8A8_SRGB);
++      wsi_wl_display_add_vk_format(display, formats,
++                                   VK_FORMAT_B8G8R8A8_UNORM);
++      break;
++   }
++}
++
++
+ static void
+ drm_handle_device(void *data, struct wl_drm *drm, const char *name)
+ {
+@@ -232,6 +276,23 @@ wl_drm_format_for_vk_format(VkFormat vk_format, bool alpha)
+    }
+ }
+ 
++static uint32_t
++wl_shm_format_for_vk_format(VkFormat vk_format, bool alpha)
++{
++   switch (vk_format) {
++   case VK_FORMAT_R8G8B8A8_UNORM:
++   case VK_FORMAT_R8G8B8A8_SRGB:
++      return alpha ? WL_SHM_FORMAT_ABGR8888 : WL_SHM_FORMAT_XBGR8888;
++   case VK_FORMAT_B8G8R8A8_UNORM:
++   case VK_FORMAT_B8G8R8A8_SRGB:
++      return alpha ? WL_SHM_FORMAT_ARGB8888 : WL_SHM_FORMAT_XRGB8888;
++
++   default:
++      assert(!"Unsupported Vulkan format");
++      return 0;
++   }
++}
++
+ static void
+ drm_handle_format(void *data, struct wl_drm *drm, uint32_t wl_format)
+ {
+@@ -311,12 +372,34 @@ static const struct zwp_linux_dmabuf_v1_listener dmabuf_listener = {
+    dmabuf_handle_modifier,
+ };
+ 
++static void
++shm_handle_format(void *data, struct wl_shm *shm, uint32_t format)
++{
++   struct wsi_wl_display *display = data;
++   if (display->swrast.formats.element_size == 0)
++      return;
++
++   wsi_wl_display_add_wl_shm_format(display, &display->swrast.formats, format);
++}
++
++static const struct wl_shm_listener shm_listener = {
++   .format = shm_handle_format
++};
++
+ static void
+ registry_handle_global(void *data, struct wl_registry *registry,
+                        uint32_t name, const char *interface, uint32_t version)
+ {
+    struct wsi_wl_display *display = data;
+ 
++   if (display->sw) {
++      if (strcmp(interface, "wl_shm") == 0) {
++         display->swrast.wl_shm = wl_registry_bind(registry, name, &wl_shm_interface, 1);
++         wl_shm_add_listener(display->swrast.wl_shm, &shm_listener, display);
++      }
++      return;
++   }
++
+    if (strcmp(interface, "wl_drm") == 0) {
+       assert(display->drm.wl_drm == NULL);
+ 
+@@ -348,10 +431,13 @@ wsi_wl_display_finish(struct wsi_wl_display *display)
+ {
+    assert(display->refcount == 0);
+ 
++   u_vector_finish(&display->swrast.formats);
+    u_vector_finish(&display->drm.formats);
+    u_vector_finish(&display->dmabuf.formats);
+    u_vector_finish(&display->dmabuf.modifiers.argb8888);
+    u_vector_finish(&display->dmabuf.modifiers.xrgb8888);
++   if (display->swrast.wl_shm)
++      wl_shm_destroy(display->swrast.wl_shm);
+    if (display->drm.wl_drm)
+       wl_drm_destroy(display->drm.wl_drm);
+    if (display->dmabuf.wl_dmabuf)
+@@ -366,16 +452,18 @@ static VkResult
+ wsi_wl_display_init(struct wsi_wayland *wsi_wl,
+                     struct wsi_wl_display *display,
+                     struct wl_display *wl_display,
+-                    bool get_format_list)
++                    bool get_format_list, bool sw)
+ {
+    VkResult result = VK_SUCCESS;
+    memset(display, 0, sizeof(*display));
+ 
+    display->wsi_wl = wsi_wl;
+    display->wl_display = wl_display;
++   display->sw = sw;
+ 
+    if (get_format_list) {
+-      if (!u_vector_init(&display->drm.formats, sizeof(VkFormat), 8) ||
++      if (!u_vector_init(&display->swrast.formats, sizeof(VkFormat), 8) ||
++          !u_vector_init(&display->drm.formats, sizeof(VkFormat), 8) ||
+           !u_vector_init(&display->dmabuf.formats, sizeof(VkFormat), 8) ||
+           !u_vector_init(&display->dmabuf.modifiers.argb8888,
+                          sizeof(uint64_t), 32) ||
+@@ -414,7 +502,7 @@ wsi_wl_display_init(struct wsi_wayland *wsi_wl,
+    wl_display_roundtrip_queue(display->wl_display, display->queue);
+ 
+    /* Round-trip again to get formats, modifiers and capabilities */
+-   if (display->drm.wl_drm || display->dmabuf.wl_dmabuf)
++   if (display->drm.wl_drm || display->dmabuf.wl_dmabuf || display->swrast.wl_shm)
+       wl_display_roundtrip_queue(display->wl_display, display->queue);
+ 
+    if (wsi_wl->wsi->force_bgra8_unorm_first) {
+@@ -432,8 +520,10 @@ wsi_wl_display_init(struct wsi_wayland *wsi_wl,
+       }
+    }
+ 
++   if (display->sw)
++      display->formats = &display->swrast.formats;
+    /* We need prime support for wl_drm */
+-   if (display->drm.wl_drm &&
++   else if (display->drm.wl_drm &&
+        (display->drm.capabilities & WL_DRM_CAPABILITY_PRIME)) {
+       display->formats = &display->drm.formats;
+    } else if (display->dmabuf.wl_dmabuf) {
+@@ -463,6 +553,7 @@ fail:
+ 
+ static VkResult
+ wsi_wl_display_create(struct wsi_wayland *wsi, struct wl_display *wl_display,
++                      bool sw,
+                       struct wsi_wl_display **display_out)
+ {
+    struct wsi_wl_display *display =
+@@ -471,7 +562,8 @@ wsi_wl_display_create(struct wsi_wayland *wsi, struct wl_display *wl_display,
+    if (!display)
+       return VK_ERROR_OUT_OF_HOST_MEMORY;
+ 
+-   VkResult result = wsi_wl_display_init(wsi, display, wl_display, true);
++   VkResult result = wsi_wl_display_init(wsi, display, wl_display, true,
++                                         sw);
+    if (result != VK_SUCCESS) {
+       vk_free(wsi->alloc, display);
+       return result;
+@@ -509,7 +601,8 @@ wsi_wl_get_presentation_support(struct wsi_device *wsi_device,
+       (struct wsi_wayland *)wsi_device->wsi[VK_ICD_WSI_PLATFORM_WAYLAND];
+ 
+    struct wsi_wl_display display;
+-   VkResult ret = wsi_wl_display_init(wsi, &display, wl_display, false);
++   VkResult ret = wsi_wl_display_init(wsi, &display, wl_display, false,
++                                      wsi_device->sw);
+    if (ret == VK_SUCCESS)
+       wsi_wl_display_finish(&display);
+ 
+@@ -612,7 +705,8 @@ wsi_wl_surface_get_formats(VkIcdSurfaceBase *icd_surface,
+       (struct wsi_wayland *)wsi_device->wsi[VK_ICD_WSI_PLATFORM_WAYLAND];
+ 
+    struct wsi_wl_display display;
+-   if (wsi_wl_display_init(wsi, &display, surface->display, true))
++   if (wsi_wl_display_init(wsi, &display, surface->display, true,
++                           wsi_device->sw))
+       return VK_ERROR_SURFACE_LOST_KHR;
+ 
+    VK_OUTARRAY_MAKE(out, pSurfaceFormats, pSurfaceFormatCount);
+@@ -642,7 +736,8 @@ wsi_wl_surface_get_formats2(VkIcdSurfaceBase *icd_surface,
+       (struct wsi_wayland *)wsi_device->wsi[VK_ICD_WSI_PLATFORM_WAYLAND];
+ 
+    struct wsi_wl_display display;
+-   if (wsi_wl_display_init(wsi, &display, surface->display, true))
++   if (wsi_wl_display_init(wsi, &display, surface->display, true,
++                           wsi_device->sw))
+       return VK_ERROR_SURFACE_LOST_KHR;
+ 
+    VK_OUTARRAY_MAKE(out, pSurfaceFormats, pSurfaceFormatCount);
+@@ -722,10 +817,12 @@ struct wsi_wl_image {
+    struct wsi_image                             base;
+    struct wl_buffer *                           buffer;
+    bool                                         busy;
++   void *                                       data_ptr;
++   uint32_t                                     data_size;
+ };
+ 
+ struct wsi_wl_swapchain {
+-   struct wsi_swapchain                        base;
++   struct wsi_swapchain                         base;
+ 
+    struct wsi_wl_display                        *display;
+ 
+@@ -742,6 +839,7 @@ struct wsi_wl_swapchain {
+    VkExtent2D                                   extent;
+    VkFormat                                     vk_format;
+    uint32_t                                     drm_format;
++   uint32_t                                     shm_format;
+ 
+    uint32_t                                     num_drm_modifiers;
+    const uint64_t *                             drm_modifiers;
+@@ -859,6 +957,23 @@ wsi_wl_swapchain_queue_present(struct wsi_swapchain *wsi_chain,
+ {
+    struct wsi_wl_swapchain *chain = (struct wsi_wl_swapchain *)wsi_chain;
+ 
++   if (chain->display->sw) {
++      struct wsi_wl_image *image = &chain->images[image_index];
++      void *dptr = image->data_ptr;
++      void *sptr;
++      chain->base.wsi->MapMemory(chain->base.device,
++                                 image->base.memory,
++                                 0, 0, 0, &sptr);
++
++      for (unsigned r = 0; r < chain->extent.height; r++) {
++         memcpy(dptr, sptr, image->base.row_pitches[0]);
++         dptr += image->base.row_pitches[0];
++         sptr += image->base.row_pitches[0];
++      }
++      chain->base.wsi->UnmapMemory(chain->base.device,
++                                   image->base.memory);
++
++   }
+    if (chain->base.present_mode == VK_PRESENT_MODE_FIFO_KHR) {
+       while (!chain->fifo_ready) {
+          int ret = wl_display_dispatch_queue(chain->display->wl_display,
+@@ -928,7 +1043,31 @@ wsi_wl_image_init(struct wsi_wl_swapchain *chain,
+    if (result != VK_SUCCESS)
+       return result;
+ 
+-   if (!chain->drm_wrapper) {
++   if (display->sw) {
++      int fd, stride;
++
++      stride = image->base.row_pitches[0];
++      image->data_size = stride * chain->extent.height;
++
++      /* Create a shareable buffer */
++      fd = os_create_anonymous_file(image->data_size, NULL);
++      if (fd < 0)
++         goto fail_image;
++
++      image->data_ptr = mmap(NULL, image->data_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
++      if (image->data_ptr == MAP_FAILED) {
++         close(fd);
++         goto fail_image;
++      }
++      /* Share it in a wl_buffer */
++      struct wl_shm_pool *pool = wl_shm_create_pool(display->swrast.wl_shm, fd, image->data_size);
++      wl_proxy_set_queue((struct wl_proxy *)pool, display->queue);
++      image->buffer = wl_shm_pool_create_buffer(pool, 0, chain->extent.width,
++                                                chain->extent.height, stride,
++                                                chain->shm_format);
++      wl_shm_pool_destroy(pool);
++      close(fd);
++   } else if (!chain->drm_wrapper) {
+       /* Only request modifiers if we have dmabuf, else it must be implicit. */
+       assert(display->dmabuf.wl_dmabuf);
+       assert(image->base.drm_modifier != DRM_FORMAT_MOD_INVALID);
+@@ -995,6 +1134,8 @@ wsi_wl_swapchain_destroy(struct wsi_swapchain *wsi_chain,
+       if (chain->images[i].buffer) {
+          wl_buffer_destroy(chain->images[i].buffer);
+          wsi_destroy_image(&chain->base, &chain->images[i].base);
++         if (chain->images[i].data_ptr)
++            munmap(chain->images[i].data_ptr, chain->images[i].data_size);
+       }
+    }
+ 
+@@ -1049,8 +1190,10 @@ wsi_wl_surface_create_swapchain(VkIcdSurfaceBase *icd_surface,
+    /* Mark a bunch of stuff as NULL.  This way we can just call
+     * destroy_swapchain for cleanup.
+     */
+-   for (uint32_t i = 0; i < num_images; i++)
++   for (uint32_t i = 0; i < num_images; i++) {
+       chain->images[i].buffer = NULL;
++      chain->images[i].data_ptr = NULL;
++   }
+    chain->surface = NULL;
+    chain->drm_wrapper = NULL;
+    chain->frame = NULL;
+@@ -1066,7 +1209,10 @@ wsi_wl_surface_create_swapchain(VkIcdSurfaceBase *icd_surface,
+    chain->base.image_count = num_images;
+    chain->extent = pCreateInfo->imageExtent;
+    chain->vk_format = pCreateInfo->imageFormat;
+-   chain->drm_format = wl_drm_format_for_vk_format(chain->vk_format, alpha);
++   if (wsi_device->sw)
++      chain->shm_format = wl_shm_format_for_vk_format(chain->vk_format, alpha);
++   else
++      chain->drm_format = wl_drm_format_for_vk_format(chain->vk_format, alpha);
+ 
+    if (pCreateInfo->oldSwapchain) {
+       /* If we have an oldSwapchain parameter, copy the display struct over
+@@ -1076,7 +1222,8 @@ wsi_wl_surface_create_swapchain(VkIcdSurfaceBase *icd_surface,
+       chain->display = wsi_wl_display_ref(old_chain->display);
+    } else {
+       chain->display = NULL;
+-      result = wsi_wl_display_create(wsi, surface->display, &chain->display);
++      result = wsi_wl_display_create(wsi, surface->display,
++                                     wsi_device->sw, &chain->display);
+       if (result != VK_SUCCESS)
+          goto fail;
+    }
+diff --git a/src/vulkan/wsi/wsi_common_x11.c b/src/vulkan/wsi/wsi_common_x11.c
+index 54769b81ccc..fa0c3d997dc 100644
+--- a/src/vulkan/wsi/wsi_common_x11.c
++++ b/src/vulkan/wsi/wsi_common_x11.c
+@@ -439,8 +439,10 @@ VkBool32 wsi_get_physical_device_xcb_presentation_support(
+    if (!wsi_conn)
+       return false;
+ 
+-   if (!wsi_x11_check_for_dri3(wsi_conn))
+-      return false;
++   if (!wsi_device->sw) {
++      if (!wsi_x11_check_for_dri3(wsi_conn))
++         return false;
++   }
+ 
+    unsigned visual_depth;
+    if (!connection_get_visualtype(connection, visual_id, &visual_depth))
+@@ -484,9 +486,11 @@ x11_surface_get_support(VkIcdSurfaceBase *icd_surface,
+    if (!wsi_conn)
+       return VK_ERROR_OUT_OF_HOST_MEMORY;
+ 
+-   if (!wsi_x11_check_for_dri3(wsi_conn)) {
+-      *pSupported = false;
+-      return VK_SUCCESS;
++   if (!wsi_device->sw) {
++      if (!wsi_x11_check_for_dri3(wsi_conn)) {
++         *pSupported = false;
++         return VK_SUCCESS;
++      }
+    }
+ 
+    unsigned visual_depth;
diff --git a/SPECS/mesa.spec b/SPECS/mesa.spec
index 4c1ccd7..64bb68e 100644
--- a/SPECS/mesa.spec
+++ b/SPECS/mesa.spec
@@ -25,10 +25,6 @@
 %define with_xa        1
 %endif
 
-%ifnarch %{x86}
-%global with_asm 1
-%endif
-
 %global dri_drivers %{?platform_drivers}
 
 %if 0%{?with_vulkan_hw}
@@ -43,8 +39,8 @@
 
 Name:           mesa
 Summary:        Mesa graphics libraries
-Version:        20.3.3
-Release:        2%{?rctag:.%{rctag}}%{?dist}
+Version:        21.1.5
+Release:        1%{?rctag:.%{rctag}}%{?dist}
 
 License:        MIT
 URL:            http://www.mesa3d.org
@@ -59,10 +55,7 @@ Source3:        Makefile
 Source4:        Mesa-MLAA-License-Clarification-Email.txt
 
 Patch0:	lavapipe-disable-env-var.patch
-Patch1: mesa-20.3.3-stable-fixes.patch
-Patch2: anv-remove-warning.patch
-
-Patch10: cpu-affinity-fixes-20.3.3.patch
+Patch1: mesa-vk-wsi-sw-fixes.patch
 
 BuildRequires:  gcc
 BuildRequires:  gcc-c++
@@ -329,8 +322,9 @@ export ASFLAGS="--generate-missing-build-notes=yes"
 %meson -Dcpp_std=gnu++14 \
   -Db_ndebug=true \
   -Dplatforms=x11,wayland \
-  -Ddri3=true \
+  -Ddri3=enabled \
   -Ddri-drivers=%{?dri_drivers} \
+  -Dosmesa=true \
 %if 0%{?with_hardware}
   -Dgallium-drivers=swrast%{?with_iris:,iris},virgl,nouveau%{?with_vmware:,svga},radeonsi,r600%{?with_freedreno:,freedreno}%{?with_etnaviv:,etnaviv}%{?with_tegra:,tegra}%{?with_vc4:,vc4}%{?with_kmsro:,kmsro} \
 %else
@@ -344,22 +338,21 @@ export ASFLAGS="--generate-missing-build-notes=yes"
   -Dgallium-nine=%{?with_nine:true}%{!?with_nine:false} \
   -Dgallium-opencl=%{?with_opencl:icd}%{!?with_opencl:disabled} \
   -Dvulkan-drivers=%{?vulkan_drivers} \
-  -Dshared-glapi=true \
-  -Dgles1=false \
-  -Dgles2=true \
+  -Dvulkan-layers=device-select \
+  -Dshared-glapi=enabled \
+  -Dgles1=disabled \
+  -Dgles2=enabled \
   -Dopengl=true \
-  -Dgbm=true \
+  -Dgbm=enabled \
   -Dglx=dri \
   -Degl=true \
   -Dglvnd=true \
-  -Dasm=%{?with_asm:true}%{!?with_asm:false} \
+  -Dmicrosoft-clc=disabled \
   -Dllvm=true \
   -Dshared-llvm=true \
   -Dvalgrind=%{?with_valgrind:true}%{!?with_valgrind:false} \
   -Dbuild-tests=false \
   -Dselinux=true \
-  -Dosmesa=gallium \
-  -Dvulkan-device-select-layer=true \
   %{nil}
 %meson_build
 
@@ -551,12 +544,20 @@ done
 
 %if 0%{?with_vulkan_hw}
 %files vulkan-devel
-%{_includedir}/vulkan/
 %endif
 
 %changelog
-* Fri Mar 26 2021 Dave Airlie <airlied@redhat.com> - 20.3.3-2
-- Fix CPU affinity memory corruption crash (#1938788)
+* Thu Jul 22 2021 Dave Airlie <airlied@redhat.com> - 21.1.5-1
+- Fix Vulkan software rendering (sw WSI) on Wayland; pull in the 21.1.4 and 21.1.5 fixes
+
+* Sat Jun 19 2021 Dave Airlie <airlied@redhat.com> - 21.1.3-1
+- Rebase to 21.1.3
+
+* Tue Jun 01 2021 Dave Airlie <airlied@redhat.com> - 21.1.1-2
+- Rebuild against LLVM 12
+
+* Thu May 20 2021 Dave Airlie <airlied@redhat.com> - 21.1.1-1
+- Update to 21.1.1
 
 * Tue Feb 16 2021 Dave Airlie <airlied@redhat.com> - 20.3.3-1
 - Update to 20.3.3 + upstream fixes for qemu regression