diff --git a/.gitignore b/.gitignore index 94c9bc6..a4e3e90 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1 @@ -SOURCES/mesa-20.1.4.tar.xz +SOURCES/mesa-20.3.3.tar.xz diff --git a/.mesa.metadata b/.mesa.metadata index ca12abc..3d34979 100644 --- a/.mesa.metadata +++ b/.mesa.metadata @@ -1 +1 @@ -78243cd7152a8ba759f8f2bdfcf0a877b455e351 SOURCES/mesa-20.1.4.tar.xz +c0e42fada2b306a6d9740376398c0d8b0a130427 SOURCES/mesa-20.3.3.tar.xz diff --git a/SOURCES/0001-nir-use-bitfield_insert-instead-of-bfi-in-nir_lower_.patch b/SOURCES/0001-nir-use-bitfield_insert-instead-of-bfi-in-nir_lower_.patch deleted file mode 100644 index 0daf825..0000000 --- a/SOURCES/0001-nir-use-bitfield_insert-instead-of-bfi-in-nir_lower_.patch +++ /dev/null @@ -1,34 +0,0 @@ -From d3ec950f0d8492b980a91844ffd744d7e7824277 Mon Sep 17 00:00:00 2001 -From: Ben Skeggs -Date: Sat, 6 Jun 2020 16:58:00 +1000 -Subject: [PATCH] nir: use bitfield_insert instead of bfi in - nir_lower_double_ops - -NVIDIA hardware doesn't have an equivilant to bfi, but we do already have -a lowering for bitfield_insert->bfi. - -Signed-off-by: Ben Skeggs -Reviewed-by: Jason Ekstrand -Part-of: ---- - src/compiler/nir/nir_lower_double_ops.c | 4 +++- - 1 file changed, 3 insertions(+), 1 deletion(-) - -diff --git a/src/compiler/nir/nir_lower_double_ops.c b/src/compiler/nir/nir_lower_double_ops.c -index f9c93a910a5..73226fd62ef 100644 ---- a/src/compiler/nir/nir_lower_double_ops.c -+++ b/src/compiler/nir/nir_lower_double_ops.c -@@ -49,7 +49,9 @@ set_exponent(nir_builder *b, nir_ssa_def *src, nir_ssa_def *exp) - /* The exponent is bits 52-62, or 20-30 of the high word, so set the exponent - * to 1023 - */ -- nir_ssa_def *new_hi = nir_bfi(b, nir_imm_int(b, 0x7ff00000), exp, hi); -+ nir_ssa_def *new_hi = nir_bitfield_insert(b, hi, exp, -+ nir_imm_int(b, 20), -+ nir_imm_int(b, 11)); - /* recombine */ - return nir_pack_64_2x32_split(b, lo, new_hi); - } --- -2.26.2 - diff --git a/SOURCES/Makefile b/SOURCES/Makefile index 8396596..eea9f33 100644 --- a/SOURCES/Makefile +++ b/SOURCES/Makefile @@ -1,4 +1,4 @@ -VERSION ?= 20.1.4 +VERSION ?= 20.3.3 SANITIZE ?= 1 DIRNAME = mesa-${VERSION} diff --git a/SOURCES/anv-remove-warning.patch b/SOURCES/anv-remove-warning.patch new file mode 100644 index 0000000..130a050 --- /dev/null +++ b/SOURCES/anv-remove-warning.patch @@ -0,0 +1,13 @@ +diff -up mesa-20.3.3/src/intel/vulkan/anv_perf.c.dma mesa-20.3.3/src/intel/vulkan/anv_perf.c +--- mesa-20.3.3/src/intel/vulkan/anv_perf.c.dma 2021-02-16 12:56:09.881084752 +1000 ++++ mesa-20.3.3/src/intel/vulkan/anv_perf.c 2021-02-16 12:56:14.626213956 +1000 +@@ -47,9 +47,6 @@ anv_get_perf(const struct gen_device_inf + gen_perf_init_metrics(perf, devinfo, fd, false /* pipeline statistics */); + + if (!perf->n_queries) { +- if (perf->platform_supported) +- mesa_logw("Performance support disabled, " +- "consider sysctl dev.i915.perf_stream_paranoid=0\n"); + goto err; + } + diff --git a/SOURCES/cpu-affinity-fixes-20.3.3.patch b/SOURCES/cpu-affinity-fixes-20.3.3.patch new file mode 100644 index 0000000..d11f5c4 --- /dev/null +++ b/SOURCES/cpu-affinity-fixes-20.3.3.patch @@ -0,0 +1,1583 @@ +diff --git a/src/amd/compiler/tests/main.cpp b/src/amd/compiler/tests/main.cpp +index cb646e2dd30..eac0a244adf 100644 +--- a/src/amd/compiler/tests/main.cpp ++++ b/src/amd/compiler/tests/main.cpp +@@ -34,6 +34,8 @@ + #include "aco_ir.h" + #include "framework.h" + ++#include "util/u_cpu_detect.h" ++ + static const char *help_message = + "Usage: %s [-h] [-l --list] [--no-check] [TEST [TEST ...]]\n" + "\n" +@@ -227,6 +229,8 @@ int main(int argc, char **argv) + return 99; + } + ++ util_cpu_detect(); ++ + if (do_list) { + for (auto test : tests) + printf("%s\n", test.first.c_str()); +diff --git a/src/compiler/glsl/standalone.cpp b/src/compiler/glsl/standalone.cpp +index ca187001186..2714d8b95ed 100644 +--- a/src/compiler/glsl/standalone.cpp ++++ b/src/compiler/glsl/standalone.cpp +@@ -401,6 +401,8 @@ standalone_compile_shader(const struct standalone_options *_options, + int status = EXIT_SUCCESS; + bool glsl_es = false; + ++ util_cpu_detect(); ++ + options = _options; + + switch (options->glsl_version) { +diff --git a/src/compiler/nir/tests/negative_equal_tests.cpp b/src/compiler/nir/tests/negative_equal_tests.cpp +index f83041a4fbf..76472e48309 100644 +--- a/src/compiler/nir/tests/negative_equal_tests.cpp ++++ b/src/compiler/nir/tests/negative_equal_tests.cpp +@@ -36,6 +36,7 @@ protected: + const_value_negative_equal_test() + { + glsl_type_singleton_init_or_ref(); ++ util_cpu_detect(); + + memset(c1, 0, sizeof(c1)); + memset(c2, 0, sizeof(c2)); +@@ -55,6 +56,7 @@ protected: + alu_srcs_negative_equal_test() + { + glsl_type_singleton_init_or_ref(); ++ util_cpu_detect(); + + static const nir_shader_compiler_options options = { }; + nir_builder_init_simple_shader(&bld, NULL, MESA_SHADER_VERTEX, &options); +diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c +index 165d73d94fc..33269e528fe 100644 +--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c ++++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c +@@ -104,13 +104,13 @@ lp_build_min_simple(struct lp_build_context *bld, + + /* TODO: optimize the constant case */ + +- if (type.floating && util_cpu_caps.has_sse) { ++ if (type.floating && util_get_cpu_caps()->has_sse) { + if (type.width == 32) { + if (type.length == 1) { + intrinsic = "llvm.x86.sse.min.ss"; + intr_size = 128; + } +- else if (type.length <= 4 || !util_cpu_caps.has_avx) { ++ else if (type.length <= 4 || !util_get_cpu_caps()->has_avx) { + intrinsic = "llvm.x86.sse.min.ps"; + intr_size = 128; + } +@@ -119,12 +119,12 @@ lp_build_min_simple(struct lp_build_context *bld, + intr_size = 256; + } + } +- if (type.width == 64 && util_cpu_caps.has_sse2) { ++ if (type.width == 64 && util_get_cpu_caps()->has_sse2) { + if (type.length == 1) { + intrinsic = "llvm.x86.sse2.min.sd"; + intr_size = 128; + } +- else if (type.length == 2 || !util_cpu_caps.has_avx) { ++ else if (type.length == 2 || !util_get_cpu_caps()->has_avx) { + intrinsic = "llvm.x86.sse2.min.pd"; + intr_size = 128; + } +@@ -134,7 +134,7 @@ lp_build_min_simple(struct lp_build_context *bld, + } + } + } +- else if (type.floating && util_cpu_caps.has_altivec) { ++ else if (type.floating && util_get_cpu_caps()->has_altivec) { + if (nan_behavior == GALLIVM_NAN_RETURN_NAN || + nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) { + debug_printf("%s: altivec doesn't support nan return nan behavior\n", +@@ -144,7 +144,7 @@ lp_build_min_simple(struct lp_build_context *bld, + intrinsic = "llvm.ppc.altivec.vminfp"; + intr_size = 128; + } +- } else if (util_cpu_caps.has_altivec) { ++ } else if (util_get_cpu_caps()->has_altivec) { + intr_size = 128; + if (type.width == 8) { + if (!type.sign) { +@@ -174,7 +174,7 @@ lp_build_min_simple(struct lp_build_context *bld, + * The sse intrinsics return the second operator in case of nan by + * default so we need to special code to handle those. + */ +- if (util_cpu_caps.has_sse && type.floating && ++ if (util_get_cpu_caps()->has_sse && type.floating && + nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED && + nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN && + nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) { +@@ -274,13 +274,13 @@ lp_build_max_simple(struct lp_build_context *bld, + + /* TODO: optimize the constant case */ + +- if (type.floating && util_cpu_caps.has_sse) { ++ if (type.floating && util_get_cpu_caps()->has_sse) { + if (type.width == 32) { + if (type.length == 1) { + intrinsic = "llvm.x86.sse.max.ss"; + intr_size = 128; + } +- else if (type.length <= 4 || !util_cpu_caps.has_avx) { ++ else if (type.length <= 4 || !util_get_cpu_caps()->has_avx) { + intrinsic = "llvm.x86.sse.max.ps"; + intr_size = 128; + } +@@ -289,12 +289,12 @@ lp_build_max_simple(struct lp_build_context *bld, + intr_size = 256; + } + } +- if (type.width == 64 && util_cpu_caps.has_sse2) { ++ if (type.width == 64 && util_get_cpu_caps()->has_sse2) { + if (type.length == 1) { + intrinsic = "llvm.x86.sse2.max.sd"; + intr_size = 128; + } +- else if (type.length == 2 || !util_cpu_caps.has_avx) { ++ else if (type.length == 2 || !util_get_cpu_caps()->has_avx) { + intrinsic = "llvm.x86.sse2.max.pd"; + intr_size = 128; + } +@@ -304,7 +304,7 @@ lp_build_max_simple(struct lp_build_context *bld, + } + } + } +- else if (type.floating && util_cpu_caps.has_altivec) { ++ else if (type.floating && util_get_cpu_caps()->has_altivec) { + if (nan_behavior == GALLIVM_NAN_RETURN_NAN || + nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) { + debug_printf("%s: altivec doesn't support nan return nan behavior\n", +@@ -314,7 +314,7 @@ lp_build_max_simple(struct lp_build_context *bld, + intrinsic = "llvm.ppc.altivec.vmaxfp"; + intr_size = 128; + } +- } else if (util_cpu_caps.has_altivec) { ++ } else if (util_get_cpu_caps()->has_altivec) { + intr_size = 128; + if (type.width == 8) { + if (!type.sign) { +@@ -338,7 +338,7 @@ lp_build_max_simple(struct lp_build_context *bld, + } + + if (intrinsic) { +- if (util_cpu_caps.has_sse && type.floating && ++ if (util_get_cpu_caps()->has_sse && type.floating && + nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED && + nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN && + nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) { +@@ -472,12 +472,12 @@ lp_build_add(struct lp_build_context *bld, + return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b); + } + if (type.width * type.length == 128) { +- if (util_cpu_caps.has_sse2) { ++ if (util_get_cpu_caps()->has_sse2) { + if (type.width == 8) + intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b"; + if (type.width == 16) + intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w"; +- } else if (util_cpu_caps.has_altivec) { ++ } else if (util_get_cpu_caps()->has_altivec) { + if (type.width == 8) + intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs"; + if (type.width == 16) +@@ -485,7 +485,7 @@ lp_build_add(struct lp_build_context *bld, + } + } + if (type.width * type.length == 256) { +- if (util_cpu_caps.has_avx2) { ++ if (util_get_cpu_caps()->has_avx2) { + if (type.width == 8) + intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b"; + if (type.width == 16) +@@ -713,11 +713,11 @@ lp_build_hadd_partial4(struct lp_build_context *bld, + tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0]; + tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0]; + +- if (util_cpu_caps.has_sse3 && bld->type.width == 32 && ++ if (util_get_cpu_caps()->has_sse3 && bld->type.width == 32 && + bld->type.length == 4) { + intrinsic = "llvm.x86.sse3.hadd.ps"; + } +- else if (util_cpu_caps.has_avx && bld->type.width == 32 && ++ else if (util_get_cpu_caps()->has_avx && bld->type.width == 32 && + bld->type.length == 8) { + intrinsic = "llvm.x86.avx.hadd.ps.256"; + } +@@ -796,12 +796,12 @@ lp_build_sub(struct lp_build_context *bld, + return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b); + } + if (type.width * type.length == 128) { +- if (util_cpu_caps.has_sse2) { ++ if (util_get_cpu_caps()->has_sse2) { + if (type.width == 8) + intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b"; + if (type.width == 16) + intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w"; +- } else if (util_cpu_caps.has_altivec) { ++ } else if (util_get_cpu_caps()->has_altivec) { + if (type.width == 8) + intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs"; + if (type.width == 16) +@@ -809,7 +809,7 @@ lp_build_sub(struct lp_build_context *bld, + } + } + if (type.width * type.length == 256) { +- if (util_cpu_caps.has_avx2) { ++ if (util_get_cpu_caps()->has_avx2) { + if (type.width == 8) + intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b"; + if (type.width == 16) +@@ -1078,8 +1078,8 @@ lp_build_mul_32_lohi_cpu(struct lp_build_context *bld, + */ + if (LLVM_VERSION_MAJOR < 7 && + (bld->type.length == 4 || bld->type.length == 8) && +- ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) || +- util_cpu_caps.has_sse4_1)) { ++ ((util_get_cpu_caps()->has_sse2 && (bld->type.sign == 0)) || ++ util_get_cpu_caps()->has_sse4_1)) { + const char *intrinsic = NULL; + LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd; + LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec; +@@ -1096,7 +1096,7 @@ lp_build_mul_32_lohi_cpu(struct lp_build_context *bld, + aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, ""); + bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, ""); + +- if (util_cpu_caps.has_avx2 && bld->type.length == 8) { ++ if (util_get_cpu_caps()->has_avx2 && bld->type.length == 8) { + if (bld->type.sign) { + intrinsic = "llvm.x86.avx2.pmul.dq"; + } else { +@@ -1331,8 +1331,8 @@ lp_build_div(struct lp_build_context *bld, + + /* fast rcp is disabled (just uses div), so makes no sense to try that */ + if(FALSE && +- ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) || +- (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) && ++ ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) || ++ (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) && + type.floating) + return lp_build_mul(bld, a, lp_build_rcp(bld, b)); + +@@ -1745,7 +1745,7 @@ lp_build_abs(struct lp_build_context *bld, + return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a); + } + +- if(type.width*type.length == 128 && util_cpu_caps.has_ssse3 && LLVM_VERSION_MAJOR < 6) { ++ if(type.width*type.length == 128 && util_get_cpu_caps()->has_ssse3 && LLVM_VERSION_MAJOR < 6) { + switch(type.width) { + case 8: + return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a); +@@ -1755,7 +1755,7 @@ lp_build_abs(struct lp_build_context *bld, + return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a); + } + } +- else if (type.width*type.length == 256 && util_cpu_caps.has_avx2 && LLVM_VERSION_MAJOR < 6) { ++ else if (type.width*type.length == 256 && util_get_cpu_caps()->has_avx2 && LLVM_VERSION_MAJOR < 6) { + switch(type.width) { + case 8: + return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a); +@@ -1897,15 +1897,15 @@ lp_build_int_to_float(struct lp_build_context *bld, + static boolean + arch_rounding_available(const struct lp_type type) + { +- if ((util_cpu_caps.has_sse4_1 && ++ if ((util_get_cpu_caps()->has_sse4_1 && + (type.length == 1 || type.width*type.length == 128)) || +- (util_cpu_caps.has_avx && type.width*type.length == 256) || +- (util_cpu_caps.has_avx512f && type.width*type.length == 512)) ++ (util_get_cpu_caps()->has_avx && type.width*type.length == 256) || ++ (util_get_cpu_caps()->has_avx512f && type.width*type.length == 512)) + return TRUE; +- else if ((util_cpu_caps.has_altivec && ++ else if ((util_get_cpu_caps()->has_altivec && + (type.width == 32 && type.length == 4))) + return TRUE; +- else if (util_cpu_caps.has_neon) ++ else if (util_get_cpu_caps()->has_neon) + return TRUE; + + return FALSE; +@@ -1935,7 +1935,7 @@ lp_build_iround_nearest_sse2(struct lp_build_context *bld, + assert(type.width == 32); + + assert(lp_check_value(type, a)); +- assert(util_cpu_caps.has_sse2); ++ assert(util_get_cpu_caps()->has_sse2); + + /* This is relying on MXCSR rounding mode, which should always be nearest. */ + if (type.length == 1) { +@@ -1961,7 +1961,7 @@ lp_build_iround_nearest_sse2(struct lp_build_context *bld, + } + else { + assert(type.width*type.length == 256); +- assert(util_cpu_caps.has_avx); ++ assert(util_get_cpu_caps()->has_avx); + + intrinsic = "llvm.x86.avx.cvt.ps2dq.256"; + } +@@ -1987,7 +1987,7 @@ lp_build_round_altivec(struct lp_build_context *bld, + assert(type.floating); + + assert(lp_check_value(type, a)); +- assert(util_cpu_caps.has_altivec); ++ assert(util_get_cpu_caps()->has_altivec); + + (void)type; + +@@ -2014,7 +2014,7 @@ lp_build_round_arch(struct lp_build_context *bld, + LLVMValueRef a, + enum lp_build_round_mode mode) + { +- if (util_cpu_caps.has_sse4_1 || util_cpu_caps.has_neon) { ++ if (util_get_cpu_caps()->has_sse4_1 || util_get_cpu_caps()->has_neon) { + LLVMBuilderRef builder = bld->gallivm->builder; + const struct lp_type type = bld->type; + const char *intrinsic_root; +@@ -2042,7 +2042,7 @@ lp_build_round_arch(struct lp_build_context *bld, + lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type); + return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a); + } +- else /* (util_cpu_caps.has_altivec) */ ++ else /* (util_get_cpu_caps()->has_altivec) */ + return lp_build_round_altivec(bld, a, mode); + } + +@@ -2377,9 +2377,9 @@ lp_build_iround(struct lp_build_context *bld, + + assert(lp_check_value(type, a)); + +- if ((util_cpu_caps.has_sse2 && ++ if ((util_get_cpu_caps()->has_sse2 && + ((type.width == 32) && (type.length == 1 || type.length == 4))) || +- (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) { ++ (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) { + return lp_build_iround_nearest_sse2(bld, a); + } + if (arch_rounding_available(type)) { +@@ -2664,8 +2664,8 @@ lp_build_rcp(struct lp_build_context *bld, + * particular uses that require less workarounds. + */ + +- if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) || +- (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){ ++ if (FALSE && ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) || ++ (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8))){ + const unsigned num_iterations = 0; + LLVMValueRef res; + unsigned i; +@@ -2784,8 +2784,8 @@ lp_build_fast_rsqrt_available(struct lp_type type) + { + assert(type.floating); + +- if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) || +- (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) { ++ if ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) || ++ (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) { + return true; + } + return false; +@@ -3694,7 +3694,7 @@ lp_build_is_inf_or_nan(struct gallivm_state *gallivm, + LLVMValueRef + lp_build_fpstate_get(struct gallivm_state *gallivm) + { +- if (util_cpu_caps.has_sse) { ++ if (util_get_cpu_caps()->has_sse) { + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef mxcsr_ptr = lp_build_alloca( + gallivm, +@@ -3715,7 +3715,7 @@ void + lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm, + boolean zero) + { +- if (util_cpu_caps.has_sse) { ++ if (util_get_cpu_caps()->has_sse) { + /* turn on DAZ (64) | FTZ (32768) = 32832 if available */ + int daz_ftz = _MM_FLUSH_ZERO_MASK; + +@@ -3724,7 +3724,7 @@ lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm, + LLVMValueRef mxcsr = + LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr"); + +- if (util_cpu_caps.has_daz) { ++ if (util_get_cpu_caps()->has_daz) { + /* Enable denormals are zero mode */ + daz_ftz |= _MM_DENORMALS_ZERO_MASK; + } +@@ -3745,7 +3745,7 @@ void + lp_build_fpstate_set(struct gallivm_state *gallivm, + LLVMValueRef mxcsr_ptr) + { +- if (util_cpu_caps.has_sse) { ++ if (util_get_cpu_caps()->has_sse) { + LLVMBuilderRef builder = gallivm->builder; + mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr, + LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), ""); +diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c +index c68b8850473..af445b00c1a 100644 +--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c ++++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c +@@ -101,7 +101,7 @@ lp_build_half_to_float(struct gallivm_state *gallivm, + LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type); + LLVMValueRef h; + +- if (util_cpu_caps.has_f16c && ++ if (util_get_cpu_caps()->has_f16c && + (src_length == 4 || src_length == 8)) { + if (LLVM_VERSION_MAJOR < 11) { + const char *intrinsic = NULL; +@@ -167,7 +167,7 @@ lp_build_float_to_half(struct gallivm_state *gallivm, + * useless. + */ + +- if (util_cpu_caps.has_f16c && ++ if (util_get_cpu_caps()->has_f16c && + (length == 4 || length == 8)) { + struct lp_type i168_type = lp_type_int_vec(16, 16 * 8); + unsigned mode = 3; /* same as LP_BUILD_ROUND_TRUNCATE */ +@@ -489,7 +489,7 @@ int lp_build_conv_auto(struct gallivm_state *gallivm, + + /* Special case 4x4x32 --> 1x16x8 */ + if (src_type.length == 4 && +- (util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec)) ++ (util_get_cpu_caps()->has_sse2 || util_get_cpu_caps()->has_altivec)) + { + num_dsts = (num_srcs + 3) / 4; + dst_type->length = num_srcs * 4 >= 16 ? 16 : num_srcs * 4; +@@ -500,7 +500,7 @@ int lp_build_conv_auto(struct gallivm_state *gallivm, + + /* Special case 2x8x32 --> 1x16x8 */ + if (src_type.length == 8 && +- util_cpu_caps.has_avx) ++ util_get_cpu_caps()->has_avx) + { + num_dsts = (num_srcs + 1) / 2; + dst_type->length = num_srcs * 8 >= 16 ? 16 : num_srcs * 8; +@@ -597,7 +597,7 @@ lp_build_conv(struct gallivm_state *gallivm, + ((dst_type.length == 16 && 4 * num_dsts == num_srcs) || + (num_dsts == 1 && dst_type.length * num_srcs == 16 && num_srcs != 3)) && + +- (util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec)) ++ (util_get_cpu_caps()->has_sse2 || util_get_cpu_caps()->has_altivec)) + { + struct lp_build_context bld; + struct lp_type int16_type, int32_type; +@@ -710,7 +710,7 @@ lp_build_conv(struct gallivm_state *gallivm, + ((dst_type.length == 16 && 2 * num_dsts == num_srcs) || + (num_dsts == 1 && dst_type.length * num_srcs == 8)) && + +- util_cpu_caps.has_avx) { ++ util_get_cpu_caps()->has_avx) { + + struct lp_build_context bld; + struct lp_type int16_type, int32_type; +diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c b/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c +index 174857e06d9..e17c7881e7d 100644 +--- a/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c ++++ b/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c +@@ -642,8 +642,8 @@ s3tc_dxt1_full_to_rgba_aos(struct gallivm_state *gallivm, + * XXX with sse2 and 16x8 vectors, should use pavgb even when n == 1. + * Much cheaper (but we don't care that much if n == 1). + */ +- if ((util_cpu_caps.has_sse2 && n == 4) || +- (util_cpu_caps.has_avx2 && n == 8)) { ++ if ((util_get_cpu_caps()->has_sse2 && n == 4) || ++ (util_get_cpu_caps()->has_avx2 && n == 8)) { + color2_2 = lp_build_pavgb(&bld8, colors0, colors1); + color2_2 = LLVMBuildBitCast(builder, color2_2, bld32.vec_type, ""); + } +@@ -1350,7 +1350,7 @@ s3tc_decode_block_dxt1(struct gallivm_state *gallivm, + if (is_dxt1_variant) { + LLVMValueRef color23_2, color2_2; + +- if (util_cpu_caps.has_sse2) { ++ if (util_get_cpu_caps()->has_sse2) { + LLVMValueRef intrargs[2]; + intrargs[0] = LLVMBuildBitCast(builder, color01, bld8.vec_type, ""); + /* same interleave as for lerp23 - correct result in 2nd element */ +@@ -1389,7 +1389,7 @@ s3tc_decode_block_dxt1(struct gallivm_state *gallivm, + color23 = lp_build_select(&bld32, sel_mask, color23, color23_2); + } + +- if (util_cpu_caps.has_ssse3) { ++ if (util_get_cpu_caps()->has_ssse3) { + /* + * Use pshufb as mini-lut. (Only doable with intrinsics as the + * final shuffles are non-constant. pshufb is awesome!) +@@ -1689,7 +1689,7 @@ s3tc_decode_block_dxt5(struct gallivm_state *gallivm, + type16.sign = FALSE; + sel_mask = LLVMBuildBitCast(builder, sel_mask, bld8.vec_type, ""); + +- if (!util_cpu_caps.has_ssse3) { ++ if (!util_get_cpu_caps()->has_ssse3) { + LLVMValueRef acodeg, mask1, acode0, acode1; + + /* extraction of the 3 bit values into something more useful is HARD */ +diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c +index 121452d7596..97deffe1de0 100644 +--- a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c ++++ b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c +@@ -90,7 +90,7 @@ uyvy_to_yuv_soa(struct gallivm_state *gallivm, + * per element. Didn't measure performance but cuts shader size + * by quite a bit (less difference if cpu has no sse4.1 support). + */ +- if (util_cpu_caps.has_sse2 && n > 1) { ++ if (util_get_cpu_caps()->has_sse2 && n > 1) { + LLVMValueRef sel, tmp, tmp2; + struct lp_build_context bld32; + +@@ -174,7 +174,7 @@ yuyv_to_yuv_soa(struct gallivm_state *gallivm, + * per element. Didn't measure performance but cuts shader size + * by quite a bit (less difference if cpu has no sse4.1 support). + */ +- if (util_cpu_caps.has_sse2 && n > 1) { ++ if (util_get_cpu_caps()->has_sse2 && n > 1) { + LLVMValueRef sel, tmp; + struct lp_build_context bld32; + +diff --git a/src/gallium/auxiliary/gallivm/lp_bld_gather.c b/src/gallium/auxiliary/gallivm/lp_bld_gather.c +index e991b0dc375..42cc17371a0 100644 +--- a/src/gallium/auxiliary/gallivm/lp_bld_gather.c ++++ b/src/gallium/auxiliary/gallivm/lp_bld_gather.c +@@ -488,7 +488,7 @@ lp_build_gather(struct gallivm_state *gallivm, + * 32bit/64bit fetches you're doing it wrong (this is gather, not + * conversion) and it would be awkward for floats. + */ +- } else if (util_cpu_caps.has_avx2 && !need_expansion && ++ } else if (util_get_cpu_caps()->has_avx2 && !need_expansion && + src_width == 32 && (length == 4 || length == 8)) { + return lp_build_gather_avx2(gallivm, length, src_width, dst_type, + base_ptr, offsets); +@@ -500,7 +500,7 @@ lp_build_gather(struct gallivm_state *gallivm, + * (In general, should be more of a win if the fetch is 256bit wide - + * this is true for the 32bit case above too.) + */ +- } else if (0 && util_cpu_caps.has_avx2 && !need_expansion && ++ } else if (0 && util_get_cpu_caps()->has_avx2 && !need_expansion && + src_width == 64 && (length == 2 || length == 4)) { + return lp_build_gather_avx2(gallivm, length, src_width, dst_type, + base_ptr, offsets); +diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c +index 685ed0e58aa..dd428242cb9 100644 +--- a/src/gallium/auxiliary/gallivm/lp_bld_init.c ++++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c +@@ -433,6 +433,7 @@ lp_build_init(void) + /* For simulating less capable machines */ + #ifdef DEBUG + if (debug_get_bool_option("LP_FORCE_SSE2", FALSE)) { ++ extern struct util_cpu_caps_t util_cpu_caps; + assert(util_cpu_caps.has_sse2); + util_cpu_caps.has_sse3 = 0; + util_cpu_caps.has_ssse3 = 0; +@@ -445,7 +446,7 @@ lp_build_init(void) + } + #endif + +- if (util_cpu_caps.has_avx2 || util_cpu_caps.has_avx) { ++ if (util_get_cpu_caps()->has_avx2 || util_get_cpu_caps()->has_avx) { + lp_native_vector_width = 256; + } else { + /* Leave it at 128, even when no SIMD extensions are available. +@@ -460,16 +461,16 @@ lp_build_init(void) + #if LLVM_VERSION_MAJOR < 4 + if (lp_native_vector_width <= 128) { + /* Hide AVX support, as often LLVM AVX intrinsics are only guarded by +- * "util_cpu_caps.has_avx" predicate, and lack the ++ * "util_get_cpu_caps()->has_avx" predicate, and lack the + * "lp_native_vector_width > 128" predicate. And also to ensure a more + * consistent behavior, allowing one to test SSE2 on AVX machines. + * XXX: should not play games with util_cpu_caps directly as it might + * get used for other things outside llvm too. + */ +- util_cpu_caps.has_avx = 0; +- util_cpu_caps.has_avx2 = 0; +- util_cpu_caps.has_f16c = 0; +- util_cpu_caps.has_fma = 0; ++ util_get_cpu_caps()->has_avx = 0; ++ util_get_cpu_caps()->has_avx2 = 0; ++ util_get_cpu_caps()->has_f16c = 0; ++ util_get_cpu_caps()->has_fma = 0; + } + #endif + +@@ -482,7 +483,7 @@ lp_build_init(void) + * Right now denorms get explicitly disabled (but elsewhere) for x86, + * whereas ppc64 explicitly enables them... + */ +- if (util_cpu_caps.has_altivec) { ++ if (util_get_cpu_caps()->has_altivec) { + unsigned short mask[] = { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFE, 0xFFFF }; + __asm ( +diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/src/gallium/auxiliary/gallivm/lp_bld_logic.c +index 315977ae745..3ed3b5a74b1 100644 +--- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c ++++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c +@@ -196,7 +196,7 @@ lp_build_compare(struct gallivm_state *gallivm, + + if (!type.floating && !type.sign && + type.width * type.length == 128 && +- util_cpu_caps.has_sse2 && ++ util_get_cpu_caps()->has_sse2 && + (func == PIPE_FUNC_LESS || + func == PIPE_FUNC_LEQUAL || + func == PIPE_FUNC_GREATER || +@@ -348,11 +348,11 @@ lp_build_select(struct lp_build_context *bld, + + res = LLVMBuildSelect(builder, mask, a, b, ""); + } +- else if (((util_cpu_caps.has_sse4_1 && ++ else if (((util_get_cpu_caps()->has_sse4_1 && + type.width * type.length == 128) || +- (util_cpu_caps.has_avx && ++ (util_get_cpu_caps()->has_avx && + type.width * type.length == 256 && type.width >= 32) || +- (util_cpu_caps.has_avx2 && ++ (util_get_cpu_caps()->has_avx2 && + type.width * type.length == 256)) && + !LLVMIsConstant(a) && + !LLVMIsConstant(b) && +@@ -379,7 +379,7 @@ lp_build_select(struct lp_build_context *bld, + intrinsic = "llvm.x86.avx.blendv.ps.256"; + arg_type = LLVMVectorType(LLVMFloatTypeInContext(lc), 8); + } else { +- assert(util_cpu_caps.has_avx2); ++ assert(util_get_cpu_caps()->has_avx2); + intrinsic = "llvm.x86.avx2.pblendvb"; + arg_type = LLVMVectorType(LLVMInt8TypeInContext(lc), 32); + } +diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp +index 9b75676a4e2..4f3e696816c 100644 +--- a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp ++++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp +@@ -400,22 +400,22 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT, + * http://llvm.org/PR19429 + * http://llvm.org/PR16721 + */ +- MAttrs.push_back(util_cpu_caps.has_sse ? "+sse" : "-sse" ); +- MAttrs.push_back(util_cpu_caps.has_sse2 ? "+sse2" : "-sse2" ); +- MAttrs.push_back(util_cpu_caps.has_sse3 ? "+sse3" : "-sse3" ); +- MAttrs.push_back(util_cpu_caps.has_ssse3 ? "+ssse3" : "-ssse3" ); +- MAttrs.push_back(util_cpu_caps.has_sse4_1 ? "+sse4.1" : "-sse4.1"); +- MAttrs.push_back(util_cpu_caps.has_sse4_2 ? "+sse4.2" : "-sse4.2"); ++ MAttrs.push_back(util_get_cpu_caps()->has_sse ? "+sse" : "-sse" ); ++ MAttrs.push_back(util_get_cpu_caps()->has_sse2 ? "+sse2" : "-sse2" ); ++ MAttrs.push_back(util_get_cpu_caps()->has_sse3 ? "+sse3" : "-sse3" ); ++ MAttrs.push_back(util_get_cpu_caps()->has_ssse3 ? "+ssse3" : "-ssse3" ); ++ MAttrs.push_back(util_get_cpu_caps()->has_sse4_1 ? "+sse4.1" : "-sse4.1"); ++ MAttrs.push_back(util_get_cpu_caps()->has_sse4_2 ? "+sse4.2" : "-sse4.2"); + /* + * AVX feature is not automatically detected from CPUID by the X86 target + * yet, because the old (yet default) JIT engine is not capable of + * emitting the opcodes. On newer llvm versions it is and at least some + * versions (tested with 3.3) will emit avx opcodes without this anyway. + */ +- MAttrs.push_back(util_cpu_caps.has_avx ? "+avx" : "-avx"); +- MAttrs.push_back(util_cpu_caps.has_f16c ? "+f16c" : "-f16c"); +- MAttrs.push_back(util_cpu_caps.has_fma ? "+fma" : "-fma"); +- MAttrs.push_back(util_cpu_caps.has_avx2 ? "+avx2" : "-avx2"); ++ MAttrs.push_back(util_get_cpu_caps()->has_avx ? "+avx" : "-avx"); ++ MAttrs.push_back(util_get_cpu_caps()->has_f16c ? "+f16c" : "-f16c"); ++ MAttrs.push_back(util_get_cpu_caps()->has_fma ? "+fma" : "-fma"); ++ MAttrs.push_back(util_get_cpu_caps()->has_avx2 ? "+avx2" : "-avx2"); + /* disable avx512 and all subvariants */ + MAttrs.push_back("-avx512cd"); + MAttrs.push_back("-avx512er"); +@@ -426,7 +426,7 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT, + MAttrs.push_back("-avx512vl"); + #endif + #if defined(PIPE_ARCH_ARM) +- if (!util_cpu_caps.has_neon) { ++ if (!util_get_cpu_caps()->has_neon) { + MAttrs.push_back("-neon"); + MAttrs.push_back("-crypto"); + MAttrs.push_back("-vfp2"); +@@ -434,7 +434,7 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT, + #endif + + #if defined(PIPE_ARCH_PPC) +- MAttrs.push_back(util_cpu_caps.has_altivec ? "+altivec" : "-altivec"); ++ MAttrs.push_back(util_get_cpu_caps()->has_altivec ? "+altivec" : "-altivec"); + #if (LLVM_VERSION_MAJOR < 4) + /* + * Make sure VSX instructions are disabled +@@ -444,7 +444,7 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT, + * https://llvm.org/bugs/show_bug.cgi?id=33531 (fixed in 4.0) + * https://llvm.org/bugs/show_bug.cgi?id=34647 (llc performance on certain unusual shader IR; intro'd in 4.0, pending as of 5.0) + */ +- if (util_cpu_caps.has_altivec) { ++ if (util_get_cpu_caps()->has_altivec) { + MAttrs.push_back("-vsx"); + } + #else +@@ -458,8 +458,8 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT, + * Make sure VSX instructions are ENABLED (if supported), unless + * VSX instructions are explicitly enabled/disabled via GALLIVM_VSX=1 or 0. + */ +- if (util_cpu_caps.has_altivec) { +- MAttrs.push_back(util_cpu_caps.has_vsx ? "+vsx" : "-vsx"); ++ if (util_get_cpu_caps()->has_altivec) { ++ MAttrs.push_back(util_get_cpu_caps()->has_vsx ? "+vsx" : "-vsx"); + } + #endif + #endif +diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c b/src/gallium/auxiliary/gallivm/lp_bld_pack.c +index e1f652a9342..76e57c52f80 100644 +--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c ++++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c +@@ -322,7 +322,7 @@ lp_build_interleave2(struct gallivm_state *gallivm, + { + LLVMValueRef shuffle; + +- if (type.length == 2 && type.width == 128 && util_cpu_caps.has_avx) { ++ if (type.length == 2 && type.width == 128 && util_get_cpu_caps()->has_avx) { + /* + * XXX: This is a workaround for llvm code generation deficiency. Strangely + * enough, while this needs vinsertf128/vextractf128 instructions (hence +@@ -484,7 +484,7 @@ lp_build_unpack2_native(struct gallivm_state *gallivm, + + /* Interleave bits */ + #if UTIL_ARCH_LITTLE_ENDIAN +- if (src_type.length * src_type.width == 256 && util_cpu_caps.has_avx2) { ++ if (src_type.length * src_type.width == 256 && util_get_cpu_caps()->has_avx2) { + *dst_lo = lp_build_interleave2_half(gallivm, src_type, src, msb, 0); + *dst_hi = lp_build_interleave2_half(gallivm, src_type, src, msb, 1); + } else { +@@ -585,22 +585,22 @@ lp_build_pack2(struct gallivm_state *gallivm, + assert(src_type.length * 2 == dst_type.length); + + /* Check for special cases first */ +- if ((util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec) && ++ if ((util_get_cpu_caps()->has_sse2 || util_get_cpu_caps()->has_altivec) && + src_type.width * src_type.length >= 128) { + const char *intrinsic = NULL; + boolean swap_intrinsic_operands = FALSE; + + switch(src_type.width) { + case 32: +- if (util_cpu_caps.has_sse2) { ++ if (util_get_cpu_caps()->has_sse2) { + if (dst_type.sign) { + intrinsic = "llvm.x86.sse2.packssdw.128"; + } else { +- if (util_cpu_caps.has_sse4_1) { ++ if (util_get_cpu_caps()->has_sse4_1) { + intrinsic = "llvm.x86.sse41.packusdw"; + } + } +- } else if (util_cpu_caps.has_altivec) { ++ } else if (util_get_cpu_caps()->has_altivec) { + if (dst_type.sign) { + intrinsic = "llvm.ppc.altivec.vpkswss"; + } else { +@@ -613,18 +613,18 @@ lp_build_pack2(struct gallivm_state *gallivm, + break; + case 16: + if (dst_type.sign) { +- if (util_cpu_caps.has_sse2) { ++ if (util_get_cpu_caps()->has_sse2) { + intrinsic = "llvm.x86.sse2.packsswb.128"; +- } else if (util_cpu_caps.has_altivec) { ++ } else if (util_get_cpu_caps()->has_altivec) { + intrinsic = "llvm.ppc.altivec.vpkshss"; + #if UTIL_ARCH_LITTLE_ENDIAN + swap_intrinsic_operands = TRUE; + #endif + } + } else { +- if (util_cpu_caps.has_sse2) { ++ if (util_get_cpu_caps()->has_sse2) { + intrinsic = "llvm.x86.sse2.packuswb.128"; +- } else if (util_cpu_caps.has_altivec) { ++ } else if (util_get_cpu_caps()->has_altivec) { + intrinsic = "llvm.ppc.altivec.vpkshus"; + #if UTIL_ARCH_LITTLE_ENDIAN + swap_intrinsic_operands = TRUE; +@@ -740,7 +740,7 @@ lp_build_pack2_native(struct gallivm_state *gallivm, + + /* At this point only have special case for avx2 */ + if (src_type.length * src_type.width == 256 && +- util_cpu_caps.has_avx2) { ++ util_get_cpu_caps()->has_avx2) { + switch(src_type.width) { + case 32: + if (dst_type.sign) { +@@ -793,7 +793,7 @@ lp_build_packs2(struct gallivm_state *gallivm, + + /* All X86 SSE non-interleaved pack instructions take signed inputs and + * saturate them, so no need to clamp for those cases. */ +- if(util_cpu_caps.has_sse2 && ++ if(util_get_cpu_caps()->has_sse2 && + src_type.width * src_type.length >= 128 && + src_type.sign && + (src_type.width == 32 || src_type.width == 16)) +diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c +index 686abc08620..98dcde912b5 100644 +--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c ++++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c +@@ -1152,7 +1152,7 @@ lp_build_minify(struct lp_build_context *bld, + LLVMValueRef size; + assert(bld->type.sign); + if (lod_scalar || +- (util_cpu_caps.has_avx2 || !util_cpu_caps.has_sse)) { ++ (util_get_cpu_caps()->has_avx2 || !util_get_cpu_caps()->has_sse)) { + size = LLVMBuildLShr(builder, base_size, level, "minify"); + size = lp_build_max(bld, size, bld->one); + } +diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c +index 2b91edd37c7..6e47640e70d 100644 +--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c ++++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c +@@ -3234,7 +3234,7 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm, + * as it appears to be a loss with just AVX) + */ + if (num_quads == 1 || !use_aos || +- (util_cpu_caps.has_avx2 && ++ (util_get_cpu_caps()->has_avx2 && + (bld.num_lods == 1 || + derived_sampler_state.min_img_filter == derived_sampler_state.mag_img_filter))) { + if (use_aos) { +diff --git a/src/gallium/auxiliary/rtasm/rtasm_cpu.c b/src/gallium/auxiliary/rtasm/rtasm_cpu.c +index b1c8b990ef1..03b11f914b4 100644 +--- a/src/gallium/auxiliary/rtasm/rtasm_cpu.c ++++ b/src/gallium/auxiliary/rtasm/rtasm_cpu.c +@@ -35,10 +35,10 @@ + + DEBUG_GET_ONCE_BOOL_OPTION(nosse, "GALLIUM_NOSSE", false); + +-static struct util_cpu_caps *get_cpu_caps(void) ++static const struct util_cpu_caps_t *get_cpu_caps(void) + { + util_cpu_detect(); +- return &util_cpu_caps; ++ return util_get_cpu_caps(); + } + + int rtasm_cpu_has_sse(void) +diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c +index ad687f32853..ddd65fb6a08 100644 +--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c ++++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c +@@ -2152,17 +2152,17 @@ static void x86_init_func_common( struct x86_function *p ) + { + util_cpu_detect(); + p->caps = 0; +- if(util_cpu_caps.has_mmx) ++ if(util_get_cpu_caps()->has_mmx) + p->caps |= X86_MMX; +- if(util_cpu_caps.has_mmx2) ++ if(util_get_cpu_caps()->has_mmx2) + p->caps |= X86_MMX2; +- if(util_cpu_caps.has_sse) ++ if(util_get_cpu_caps()->has_sse) + p->caps |= X86_SSE; +- if(util_cpu_caps.has_sse2) ++ if(util_get_cpu_caps()->has_sse2) + p->caps |= X86_SSE2; +- if(util_cpu_caps.has_sse3) ++ if(util_get_cpu_caps()->has_sse3) + p->caps |= X86_SSE3; +- if(util_cpu_caps.has_sse4_1) ++ if(util_get_cpu_caps()->has_sse4_1) + p->caps |= X86_SSE4_1; + p->csr = p->store; + #if defined(PIPE_ARCH_X86) +diff --git a/src/gallium/auxiliary/util/u_threaded_context.c b/src/gallium/auxiliary/util/u_threaded_context.c +index 1eaff77724e..bf56993db09 100644 +--- a/src/gallium/auxiliary/util/u_threaded_context.c ++++ b/src/gallium/auxiliary/util/u_threaded_context.c +@@ -2071,8 +2071,8 @@ tc_set_context_param(struct pipe_context *_pipe, + if (param == PIPE_CONTEXT_PARAM_PIN_THREADS_TO_L3_CACHE) { + /* Pin the gallium thread as requested. */ + util_set_thread_affinity(tc->queue.threads[0], +- util_cpu_caps.L3_affinity_mask[value], +- NULL, UTIL_MAX_CPUS); ++ util_get_cpu_caps()->L3_affinity_mask[value], ++ NULL, util_get_cpu_caps()->num_cpu_mask_bits); + + /* Execute this immediately (without enqueuing). + * It's required to be thread-safe. +@@ -2720,7 +2720,7 @@ threaded_context_create(struct pipe_context *pipe, + + util_cpu_detect(); + +- if (!debug_get_bool_option("GALLIUM_THREAD", util_cpu_caps.nr_cpus > 1)) ++ if (!debug_get_bool_option("GALLIUM_THREAD", util_get_cpu_caps()->nr_cpus > 1)) + return pipe; + + tc = os_malloc_aligned(sizeof(struct threaded_context), 16); +diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c b/src/gallium/drivers/llvmpipe/lp_bld_depth.c +index 64cf72ae101..913c1bd2462 100644 +--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c ++++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c +@@ -435,7 +435,7 @@ lp_build_occlusion_count(struct gallivm_state *gallivm, + assert(type.length <= 16); + assert(type.floating); + +- if(util_cpu_caps.has_sse && type.length == 4) { ++ if(util_get_cpu_caps()->has_sse && type.length == 4) { + const char *movmskintr = "llvm.x86.sse.movmsk.ps"; + const char *popcntintr = "llvm.ctpop.i32"; + LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue, +@@ -446,7 +446,7 @@ lp_build_occlusion_count(struct gallivm_state *gallivm, + LLVMInt32TypeInContext(context), bits); + count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), ""); + } +- else if(util_cpu_caps.has_avx && type.length == 8) { ++ else if(util_get_cpu_caps()->has_avx && type.length == 8) { + const char *movmskintr = "llvm.x86.avx.movmsk.ps.256"; + const char *popcntintr = "llvm.ctpop.i32"; + LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue, +diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c +index f133bbf8a4d..628a4338c1e 100644 +--- a/src/gallium/drivers/llvmpipe/lp_screen.c ++++ b/src/gallium/drivers/llvmpipe/lp_screen.c +@@ -915,7 +915,7 @@ llvmpipe_create_screen(struct sw_winsys *winsys) + + screen->allow_cl = !!getenv("LP_CL"); + screen->use_tgsi = (LP_DEBUG & DEBUG_TGSI_IR); +- screen->num_threads = util_cpu_caps.nr_cpus > 1 ? util_cpu_caps.nr_cpus : 0; ++ screen->num_threads = util_get_cpu_caps()->nr_cpus > 1 ? util_get_cpu_caps()->nr_cpus : 0; + #ifdef EMBEDDED_DEVICE + screen->num_threads = 0; + #endif +diff --git a/src/gallium/drivers/llvmpipe/lp_test_arit.c b/src/gallium/drivers/llvmpipe/lp_test_arit.c +index 873dcf37fac..725854cc25c 100644 +--- a/src/gallium/drivers/llvmpipe/lp_test_arit.c ++++ b/src/gallium/drivers/llvmpipe/lp_test_arit.c +@@ -382,7 +382,7 @@ flush_denorm_to_zero(float val) + fi_val.f = val; + + #if defined(PIPE_ARCH_SSE) +- if (util_cpu_caps.has_sse) { ++ if (util_get_cpu_caps()->has_sse) { + if ((fi_val.ui & 0x7f800000) == 0) { + fi_val.ui &= 0xff800000; + } +@@ -458,7 +458,7 @@ test_unary(unsigned verbose, FILE *fp, const struct unary_test_t *test, unsigned + continue; + } + +- if (!util_cpu_caps.has_neon && ++ if (!util_get_cpu_caps()->has_neon && + test->ref == &nearbyintf && length == 2 && + ref != roundf(testval)) { + /* FIXME: The generic (non SSE) path in lp_build_iround, which is +diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c b/src/gallium/drivers/llvmpipe/lp_texture.c +index 2bf223d66f9..815736166d5 100644 +--- a/src/gallium/drivers/llvmpipe/lp_texture.c ++++ b/src/gallium/drivers/llvmpipe/lp_texture.c +@@ -85,7 +85,7 @@ llvmpipe_texture_layout(struct llvmpipe_screen *screen, + * of a block for all formats) though this should not be strictly necessary + * neither. In any case it can only affect compressed or 1d textures. + */ +- unsigned mip_align = MAX2(64, util_cpu_caps.cacheline); ++ unsigned mip_align = MAX2(64, util_get_cpu_caps()->cacheline); + + assert(LP_MAX_TEXTURE_2D_LEVELS <= LP_MAX_TEXTURE_LEVELS); + assert(LP_MAX_TEXTURE_3D_LEVELS <= LP_MAX_TEXTURE_LEVELS); +@@ -123,7 +123,7 @@ llvmpipe_texture_layout(struct llvmpipe_screen *screen, + if (util_format_is_compressed(pt->format)) + lpr->row_stride[level] = nblocksx * block_size; + else +- lpr->row_stride[level] = align(nblocksx * block_size, util_cpu_caps.cacheline); ++ lpr->row_stride[level] = align(nblocksx * block_size, util_get_cpu_caps()->cacheline); + + /* if row_stride * height > LP_MAX_TEXTURE_SIZE */ + if ((uint64_t)lpr->row_stride[level] * nblocksy > LP_MAX_TEXTURE_SIZE) { +diff --git a/src/gallium/drivers/swr/swr_loader.cpp b/src/gallium/drivers/swr/swr_loader.cpp +index 97db7ca3e8b..d891b6b14e8 100644 +--- a/src/gallium/drivers/swr/swr_loader.cpp ++++ b/src/gallium/drivers/swr/swr_loader.cpp +@@ -91,7 +91,7 @@ swr_create_screen(struct sw_winsys *winsys) + + util_cpu_detect(); + +- if (util_cpu_caps.has_avx512f && util_cpu_caps.has_avx512er) { ++ if (util_get_cpu_caps()->has_avx512f && util_get_cpu_caps()->has_avx512er) { + swr_print_info("SWR detected KNL instruction support "); + #ifndef HAVE_SWR_KNL + swr_print_info("(skipping: not built).\n"); +@@ -103,7 +103,7 @@ swr_create_screen(struct sw_winsys *winsys) + #endif + } + +- if (util_cpu_caps.has_avx512f && util_cpu_caps.has_avx512bw) { ++ if (util_get_cpu_caps()->has_avx512f && util_get_cpu_caps()->has_avx512bw) { + swr_print_info("SWR detected SKX instruction support "); + #ifndef HAVE_SWR_SKX + swr_print_info("(skipping not built).\n"); +@@ -113,7 +113,7 @@ swr_create_screen(struct sw_winsys *winsys) + #endif + } + +- if (util_cpu_caps.has_avx2) { ++ if (util_get_cpu_caps()->has_avx2) { + swr_print_info("SWR detected AVX2 instruction support "); + #ifndef HAVE_SWR_AVX2 + swr_print_info("(skipping not built).\n"); +@@ -123,7 +123,7 @@ swr_create_screen(struct sw_winsys *winsys) + #endif + } + +- if (util_cpu_caps.has_avx) { ++ if (util_get_cpu_caps()->has_avx) { + swr_print_info("SWR detected AVX instruction support "); + #ifndef HAVE_SWR_AVX + swr_print_info("(skipping not built).\n"); +diff --git a/src/gallium/drivers/vc4/vc4_tiling.h b/src/gallium/drivers/vc4/vc4_tiling.h +index 66767e7f1f8..5afe32939a8 100644 +--- a/src/gallium/drivers/vc4/vc4_tiling.h ++++ b/src/gallium/drivers/vc4/vc4_tiling.h +@@ -90,7 +90,7 @@ vc4_load_lt_image(void *dst, uint32_t dst_stride, + int cpp, const struct pipe_box *box) + { + #ifdef USE_ARM_ASM +- if (util_cpu_caps.has_neon) { ++ if (util_get_cpu_caps()->has_neon) { + vc4_load_lt_image_neon(dst, dst_stride, src, src_stride, + cpp, box); + return; +@@ -106,7 +106,7 @@ vc4_store_lt_image(void *dst, uint32_t dst_stride, + int cpp, const struct pipe_box *box) + { + #ifdef USE_ARM_ASM +- if (util_cpu_caps.has_neon) { ++ if (util_get_cpu_caps()->has_neon) { + vc4_store_lt_image_neon(dst, dst_stride, src, src_stride, + cpp, box); + return; +diff --git a/src/gallium/tests/unit/translate_test.c b/src/gallium/tests/unit/translate_test.c +index 4d9c4e27ebf..782f16e7f78 100644 +--- a/src/gallium/tests/unit/translate_test.c ++++ b/src/gallium/tests/unit/translate_test.c +@@ -50,6 +50,7 @@ int main(int argc, char** argv) + { + struct translate *(*create_fn)(const struct translate_key *key) = 0; + ++ extern struct util_cpu_caps_t util_cpu_caps; + struct translate_key key; + unsigned output_format; + unsigned input_format; +@@ -87,7 +88,7 @@ int main(int argc, char** argv) + } + else if (!strcmp(argv[1], "sse")) + { +- if(!util_cpu_caps.has_sse || !rtasm_cpu_has_sse()) ++ if(!util_get_cpu_caps()->has_sse || !rtasm_cpu_has_sse()) + { + printf("Error: CPU doesn't support SSE (test with qemu)\n"); + return 2; +@@ -99,7 +100,7 @@ int main(int argc, char** argv) + } + else if (!strcmp(argv[1], "sse2")) + { +- if(!util_cpu_caps.has_sse2 || !rtasm_cpu_has_sse()) ++ if(!util_get_cpu_caps()->has_sse2 || !rtasm_cpu_has_sse()) + { + printf("Error: CPU doesn't support SSE2 (test with qemu)\n"); + return 2; +@@ -110,7 +111,7 @@ int main(int argc, char** argv) + } + else if (!strcmp(argv[1], "sse3")) + { +- if(!util_cpu_caps.has_sse3 || !rtasm_cpu_has_sse()) ++ if(!util_get_cpu_caps()->has_sse3 || !rtasm_cpu_has_sse()) + { + printf("Error: CPU doesn't support SSE3 (test with qemu)\n"); + return 2; +@@ -120,7 +121,7 @@ int main(int argc, char** argv) + } + else if (!strcmp(argv[1], "sse4.1")) + { +- if(!util_cpu_caps.has_sse4_1 || !rtasm_cpu_has_sse()) ++ if(!util_get_cpu_caps()->has_sse4_1 || !rtasm_cpu_has_sse()) + { + printf("Error: CPU doesn't support SSE4.1 (test with qemu)\n"); + return 2; +diff --git a/src/gallium/tests/unit/u_half_test.c b/src/gallium/tests/unit/u_half_test.c +index 7f2eba9382b..4474cfb82b0 100644 +--- a/src/gallium/tests/unit/u_half_test.c ++++ b/src/gallium/tests/unit/u_half_test.c +@@ -36,13 +36,14 @@ test(void) + int + main(int argc, char **argv) + { +- assert(!util_cpu_caps.has_f16c); ++ util_cpu_detect(); + test(); + +- /* Test f16c. */ +- util_cpu_detect(); +- if (util_cpu_caps.has_f16c) ++ /* Test non-f16c. */ ++ if (util_get_cpu_caps()->has_f16c) { ++ ((struct util_cpu_caps_t *)util_get_cpu_caps())->has_f16c = false; + test(); ++ } + + printf("Success!\n"); + return 0; +diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c +index 8a0aedfed64..a18362ce6ea 100644 +--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c ++++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c +@@ -312,8 +312,8 @@ static void amdgpu_pin_threads_to_L3_cache(struct radeon_winsys *rws, + struct amdgpu_winsys *ws = amdgpu_winsys(rws); + + util_set_thread_affinity(ws->cs_queue.threads[0], +- util_cpu_caps.L3_affinity_mask[cache], +- NULL, UTIL_MAX_CPUS); ++ util_get_cpu_caps()->L3_affinity_mask[cache], ++ NULL, util_get_cpu_caps()->num_cpu_mask_bits); + } + + static uint32_t kms_handle_hash(const void *key) +diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c +index f0e1b9f7df3..4430ce50466 100644 +--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c ++++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c +@@ -801,8 +801,8 @@ static void radeon_pin_threads_to_L3_cache(struct radeon_winsys *ws, + + if (util_queue_is_initialized(&rws->cs_queue)) { + util_set_thread_affinity(rws->cs_queue.threads[0], +- util_cpu_caps.L3_affinity_mask[cache], +- NULL, UTIL_MAX_CPUS); ++ util_get_cpu_caps()->L3_affinity_mask[cache], ++ NULL, util_get_cpu_caps()->num_cpu_mask_bits); + } + } + +diff --git a/src/mesa/main/glthread.c b/src/mesa/main/glthread.c +index eb8eb30cabc..c9dfef541fc 100644 +--- a/src/mesa/main/glthread.c ++++ b/src/mesa/main/glthread.c +@@ -199,19 +199,20 @@ _mesa_glthread_flush_batch(struct gl_context *ctx) + /* Pin threads regularly to the same Zen CCX that the main thread is + * running on. The main thread can move between CCXs. + */ +- if (util_cpu_caps.nr_cpus != util_cpu_caps.cores_per_L3 && ++ if (util_get_cpu_caps()->nr_cpus != util_get_cpu_caps()->cores_per_L3 && + /* driver support */ + ctx->Driver.PinDriverToL3Cache && + ++glthread->pin_thread_counter % 128 == 0) { + int cpu = util_get_current_cpu(); + + if (cpu >= 0) { +- unsigned L3_cache = util_cpu_caps.cpu_to_L3[cpu]; +- +- util_set_thread_affinity(glthread->queue.threads[0], +- util_cpu_caps.L3_affinity_mask[L3_cache], +- NULL, UTIL_MAX_CPUS); +- ctx->Driver.PinDriverToL3Cache(ctx, L3_cache); ++ uint16_t L3_cache = util_get_cpu_caps()->cpu_to_L3[cpu]; ++ if (L3_cache != U_CPU_INVALID_L3) { ++ util_set_thread_affinity(glthread->queue.threads[0], ++ util_get_cpu_caps()->L3_affinity_mask[L3_cache], ++ NULL, util_get_cpu_caps()->num_cpu_mask_bits); ++ ctx->Driver.PinDriverToL3Cache(ctx, L3_cache); ++ } + } + } + +diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c +index 40364296664..f27fa7ff29c 100644 +--- a/src/mesa/state_tracker/st_context.c ++++ b/src/mesa/state_tracker/st_context.c +@@ -815,6 +815,10 @@ st_create_context_priv(struct gl_context *ctx, struct pipe_context *pipe, + !st->lower_ucp; + st->shader_has_one_variant[MESA_SHADER_COMPUTE] = st->has_shareable_shaders; + ++ if (util_get_cpu_caps()->cores_per_L3 == util_get_cpu_caps()->nr_cpus || ++ !st->pipe->set_context_param) ++ st->pin_thread_counter = ST_L3_PINNING_DISABLED; ++ + st->bitmap.cache.empty = true; + + if (ctx->Const.ForceGLNamesReuse && ctx->Shared->RefCount == 1) { +diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h +index b1fda06ff3e..9ab6969de62 100644 +--- a/src/mesa/state_tracker/st_context.h ++++ b/src/mesa/state_tracker/st_context.h +@@ -55,6 +55,7 @@ struct st_program; + struct st_perf_monitor_group; + struct u_upload_mgr; + ++#define ST_L3_PINNING_DISABLED 0xffffffff + + struct st_bitmap_cache + { +@@ -130,6 +131,9 @@ struct st_context + struct draw_stage *feedback_stage; /**< For GL_FEEDBACK rendermode */ + struct draw_stage *selection_stage; /**< For GL_SELECT rendermode */ + struct draw_stage *rastpos_stage; /**< For glRasterPos */ ++ ++ unsigned pin_thread_counter; /* for L3 thread pinning on AMD Zen */ ++ + GLboolean clamp_frag_color_in_shader; + GLboolean clamp_vert_color_in_shader; + boolean clamp_frag_depth_in_shader; +@@ -235,8 +239,6 @@ struct st_context + /** This masks out unused shader resources. Only valid in draw calls. */ + uint64_t active_states; + +- unsigned pin_thread_counter; /* for L3 thread pinning on AMD Zen */ +- + /* If true, further analysis of states is required to know if something + * has changed. Used mainly for shaders. + */ +diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c +index 996d985510c..159d7017b07 100644 +--- a/src/mesa/state_tracker/st_draw.c ++++ b/src/mesa/state_tracker/st_draw.c +@@ -124,26 +124,26 @@ prepare_draw(struct st_context *st, struct gl_context *ctx) + st_validate_state(st, ST_PIPELINE_RENDER); + } + +- struct pipe_context *pipe = st->pipe; +- + /* Pin threads regularly to the same Zen CCX that the main thread is + * running on. The main thread can move between CCXs. + */ +- if (unlikely(/* AMD Zen */ +- util_cpu_caps.nr_cpus != util_cpu_caps.cores_per_L3 && ++ if (unlikely(st->pin_thread_counter != ST_L3_PINNING_DISABLED && + /* no glthread */ + ctx->CurrentClientDispatch != ctx->MarshalExec && +- /* driver support */ +- pipe->set_context_param && + /* do it occasionally */ + ++st->pin_thread_counter % 512 == 0)) { ++ st->pin_thread_counter = 0; ++ + int cpu = util_get_current_cpu(); + if (cpu >= 0) { +- unsigned L3_cache = util_cpu_caps.cpu_to_L3[cpu]; +- +- pipe->set_context_param(pipe, +- PIPE_CONTEXT_PARAM_PIN_THREADS_TO_L3_CACHE, +- L3_cache); ++ struct pipe_context *pipe = st->pipe; ++ uint16_t L3_cache = util_get_cpu_caps()->cpu_to_L3[cpu]; ++ ++ if (L3_cache != U_CPU_INVALID_L3) { ++ pipe->set_context_param(pipe, ++ PIPE_CONTEXT_PARAM_PIN_THREADS_TO_L3_CACHE, ++ L3_cache); ++ } + } + } + } +diff --git a/src/util/half_float.h b/src/util/half_float.h +index c52bccf8d1e..8f1a1dbf11d 100644 +--- a/src/util/half_float.h ++++ b/src/util/half_float.h +@@ -59,7 +59,7 @@ static inline uint16_t + _mesa_float_to_half(float val) + { + #if defined(USE_X86_64_ASM) +- if (util_cpu_caps.has_f16c) { ++ if (util_get_cpu_caps()->has_f16c) { + __m128 in = {val}; + __m128i out; + +@@ -75,7 +75,7 @@ static inline float + _mesa_half_to_float(uint16_t val) + { + #if defined(USE_X86_64_ASM) +- if (util_cpu_caps.has_f16c) { ++ if (util_get_cpu_caps()->has_f16c) { + __m128i in = {val}; + __m128 out; + +@@ -90,7 +90,7 @@ static inline uint16_t + _mesa_float_to_float16_rtz(float val) + { + #if defined(USE_X86_64_ASM) +- if (util_cpu_caps.has_f16c) { ++ if (util_get_cpu_caps()->has_f16c) { + __m128 in = {val}; + __m128i out; + +diff --git a/src/util/tests/format/u_format_test.c b/src/util/tests/format/u_format_test.c +index f4a62a5c6a8..e6473c2bf6d 100644 +--- a/src/util/tests/format/u_format_test.c ++++ b/src/util/tests/format/u_format_test.c +@@ -850,6 +850,8 @@ int main(int argc, char **argv) + { + boolean success; + ++ util_cpu_detect(); ++ + success = test_all(); + + return success ? 0 : 1; +diff --git a/src/util/u_cpu_detect.c b/src/util/u_cpu_detect.c +index 025f2f30156..4a4b06e1bc6 100644 +--- a/src/util/u_cpu_detect.c ++++ b/src/util/u_cpu_detect.c +@@ -90,7 +90,7 @@ + DEBUG_GET_ONCE_BOOL_OPTION(dump_cpu, "GALLIUM_DUMP_CPU", false) + + +-struct util_cpu_caps util_cpu_caps; ++struct util_cpu_caps_t util_cpu_caps; + + #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) + static int has_cpuid(void); +@@ -438,26 +438,22 @@ get_cpu_topology(void) + util_cpu_caps.cores_per_L3 = util_cpu_caps.nr_cpus; + util_cpu_caps.num_L3_caches = 1; + ++ memset(util_cpu_caps.cpu_to_L3, 0xff, sizeof(util_cpu_caps.cpu_to_L3)); ++ + #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) + /* AMD Zen */ + if (util_cpu_caps.family >= CPU_AMD_ZEN1_ZEN2 && + util_cpu_caps.family < CPU_AMD_LAST) { + uint32_t regs[4]; + +- /* Query the L3 cache count. */ +- cpuid_count(0x8000001D, 3, regs); +- unsigned cache_level = (regs[0] >> 5) & 0x7; +- unsigned cores_per_L3 = ((regs[0] >> 14) & 0xfff) + 1; +- +- if (cache_level != 3 || cores_per_L3 == util_cpu_caps.nr_cpus) +- return; +- + uint32_t saved_mask[UTIL_MAX_CPUS / 32] = {0}; + uint32_t mask[UTIL_MAX_CPUS / 32] = {0}; +- uint32_t allowed_mask[UTIL_MAX_CPUS / 32] = {0}; +- uint32_t apic_id[UTIL_MAX_CPUS]; + bool saved = false; + ++ uint32_t L3_found[UTIL_MAX_CPUS] = {0}; ++ uint32_t num_L3_caches = 0; ++ util_affinity_mask *L3_affinity_masks = NULL; ++ + /* Query APIC IDs from each CPU core. + * + * An APIC ID is a logical ID of the CPU with respect to the cache +@@ -482,41 +478,60 @@ get_cpu_topology(void) + + if (util_set_current_thread_affinity(mask, + !saved ? saved_mask : NULL, +- UTIL_MAX_CPUS)) { ++ util_cpu_caps.num_cpu_mask_bits)) { + saved = true; +- allowed_mask[i / 32] |= cpu_bit; + + /* Query the APIC ID of the current core. */ + cpuid(0x00000001, regs); +- apic_id[i] = regs[1] >> 24; ++ unsigned apic_id = regs[1] >> 24; ++ ++ /* Query the total core count for the CPU */ ++ uint32_t core_count = 1; ++ if (regs[3] & (1 << 28)) ++ core_count = (regs[1] >> 16) & 0xff; ++ ++ core_count = util_next_power_of_two(core_count); ++ ++ /* Query the L3 cache count. */ ++ cpuid_count(0x8000001D, 3, regs); ++ unsigned cache_level = (regs[0] >> 5) & 0x7; ++ unsigned cores_per_L3 = ((regs[0] >> 14) & 0xfff) + 1; ++ ++ if (cache_level != 3) ++ continue; ++ ++ unsigned local_core_id = apic_id & (core_count - 1); ++ unsigned phys_id = (apic_id & ~(core_count - 1)) >> util_logbase2(core_count); ++ unsigned local_l3_cache_index = local_core_id / util_next_power_of_two(cores_per_L3); ++#define L3_ID(p, i) (p << 16 | i << 1 | 1); ++ ++ unsigned l3_id = L3_ID(phys_id, local_l3_cache_index); ++ int idx = -1; ++ for (unsigned c = 0; c < num_L3_caches; c++) { ++ if (L3_found[c] == l3_id) { ++ idx = c; ++ break; ++ } ++ } ++ if (idx == -1) { ++ idx = num_L3_caches; ++ L3_found[num_L3_caches++] = l3_id; ++ L3_affinity_masks = realloc(L3_affinity_masks, sizeof(util_affinity_mask) * num_L3_caches); ++ if (!L3_affinity_masks) ++ return; ++ memset(&L3_affinity_masks[num_L3_caches - 1], 0, sizeof(util_affinity_mask)); ++ } ++ util_cpu_caps.cpu_to_L3[i] = idx; ++ L3_affinity_masks[idx][i / 32] |= cpu_bit; ++ + } + mask[i / 32] = 0; + } + +- if (saved) { +- +- /* We succeeded in using at least one CPU. */ +- util_cpu_caps.num_L3_caches = util_cpu_caps.nr_cpus / cores_per_L3; +- util_cpu_caps.cores_per_L3 = cores_per_L3; +- util_cpu_caps.L3_affinity_mask = calloc(sizeof(util_affinity_mask), +- util_cpu_caps.num_L3_caches); +- +- for (unsigned i = 0; i < util_cpu_caps.nr_cpus && i < UTIL_MAX_CPUS; +- i++) { +- uint32_t cpu_bit = 1u << (i % 32); +- +- if (allowed_mask[i / 32] & cpu_bit) { +- /* Each APIC ID bit represents a topology level, so we need +- * to round up to the next power of two. +- */ +- unsigned L3_index = apic_id[i] / +- util_next_power_of_two(cores_per_L3); +- +- util_cpu_caps.L3_affinity_mask[L3_index][i / 32] |= cpu_bit; +- util_cpu_caps.cpu_to_L3[i] = L3_index; +- } +- } ++ util_cpu_caps.num_L3_caches = num_L3_caches; ++ util_cpu_caps.L3_affinity_mask = L3_affinity_masks; + ++ if (saved) { + if (debug_get_option_dump_cpu()) { + fprintf(stderr, "CPU <-> L3 cache mapping:\n"); + for (unsigned i = 0; i < util_cpu_caps.num_L3_caches; i++) { +@@ -528,7 +543,8 @@ get_cpu_topology(void) + } + + /* Restore the original affinity mask. */ +- util_set_current_thread_affinity(saved_mask, NULL, UTIL_MAX_CPUS); ++ util_set_current_thread_affinity(saved_mask, NULL, ++ util_cpu_caps.num_cpu_mask_bits); + } else { + if (debug_get_option_dump_cpu()) + fprintf(stderr, "Cannot set thread affinity for any thread.\n"); +@@ -547,7 +563,7 @@ util_cpu_detect_once(void) + { + SYSTEM_INFO system_info; + GetSystemInfo(&system_info); +- util_cpu_caps.nr_cpus = system_info.dwNumberOfProcessors; ++ util_cpu_caps.nr_cpus = MAX2(1, system_info.dwNumberOfProcessors); + } + #elif defined(PIPE_OS_UNIX) && defined(_SC_NPROCESSORS_ONLN) + util_cpu_caps.nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); +@@ -569,6 +585,8 @@ util_cpu_detect_once(void) + util_cpu_caps.nr_cpus = 1; + #endif + ++ util_cpu_caps.num_cpu_mask_bits = align(util_cpu_caps.nr_cpus, 32); ++ + /* Make the fallback cacheline size nonzero so that it can be + * safely passed to align(). + */ +diff --git a/src/util/u_cpu_detect.h b/src/util/u_cpu_detect.h +index a76fd912910..1c7239b2ec7 100644 +--- a/src/util/u_cpu_detect.h ++++ b/src/util/u_cpu_detect.h +@@ -55,7 +55,7 @@ enum cpu_family { + + typedef uint32_t util_affinity_mask[UTIL_MAX_CPUS / 32]; + +-struct util_cpu_caps { ++struct util_cpu_caps_t { + int nr_cpus; + enum cpu_family family; + +@@ -98,14 +98,27 @@ struct util_cpu_caps { + + unsigned num_L3_caches; + unsigned cores_per_L3; ++ unsigned num_cpu_mask_bits; + + uint16_t cpu_to_L3[UTIL_MAX_CPUS]; + /* Affinity masks for each L3 cache. */ + util_affinity_mask *L3_affinity_mask; + }; + +-extern struct util_cpu_caps +-util_cpu_caps; ++#define U_CPU_INVALID_L3 0xffff ++ ++static inline const struct util_cpu_caps_t * ++util_get_cpu_caps(void) ++{ ++ extern struct util_cpu_caps_t util_cpu_caps; ++ ++ /* If you hit this assert, it means that something is using the ++ * cpu-caps without having first called util_cpu_detect() ++ */ ++ assert(util_cpu_caps.nr_cpus >= 1); ++ ++ return &util_cpu_caps; ++} + + void util_cpu_detect(void); + +diff --git a/src/util/u_math.c b/src/util/u_math.c +index 9a8a9ecbbde..41e7f599eb0 100644 +--- a/src/util/u_math.c ++++ b/src/util/u_math.c +@@ -92,7 +92,7 @@ util_fpstate_get(void) + unsigned mxcsr = 0; + + #if defined(PIPE_ARCH_SSE) +- if (util_cpu_caps.has_sse) { ++ if (util_get_cpu_caps()->has_sse) { + mxcsr = _mm_getcsr(); + } + #endif +@@ -110,10 +110,10 @@ unsigned + util_fpstate_set_denorms_to_zero(unsigned current_mxcsr) + { + #if defined(PIPE_ARCH_SSE) +- if (util_cpu_caps.has_sse) { ++ if (util_get_cpu_caps()->has_sse) { + /* Enable flush to zero mode */ + current_mxcsr |= _MM_FLUSH_ZERO_MASK; +- if (util_cpu_caps.has_daz) { ++ if (util_get_cpu_caps()->has_daz) { + /* Enable denormals are zero mode */ + current_mxcsr |= _MM_DENORMALS_ZERO_MASK; + } +@@ -132,7 +132,7 @@ void + util_fpstate_set(unsigned mxcsr) + { + #if defined(PIPE_ARCH_SSE) +- if (util_cpu_caps.has_sse) { ++ if (util_get_cpu_caps()->has_sse) { + _mm_setcsr(mxcsr); + } + #endif +diff --git a/src/util/u_queue.c b/src/util/u_queue.c +index b11b297a45c..8f21f0667c6 100644 +--- a/src/util/u_queue.c ++++ b/src/util/u_queue.c +@@ -27,7 +27,7 @@ + #include "u_queue.h" + + #include "c11/threads.h" +- ++#include "util/u_cpu_detect.h" + #include "util/os_time.h" + #include "util/u_string.h" + #include "util/u_thread.h" +@@ -258,7 +258,8 @@ util_queue_thread_func(void *input) + uint32_t mask[UTIL_MAX_CPUS / 32]; + + memset(mask, 0xff, sizeof(mask)); +- util_set_current_thread_affinity(mask, NULL, UTIL_MAX_CPUS); ++ util_set_current_thread_affinity(mask, NULL, ++ util_get_cpu_caps()->num_cpu_mask_bits); + } + + #if defined(__linux__) diff --git a/SOURCES/lavapipe-disable-env-var.patch b/SOURCES/lavapipe-disable-env-var.patch new file mode 100644 index 0000000..9b59577 --- /dev/null +++ b/SOURCES/lavapipe-disable-env-var.patch @@ -0,0 +1,13 @@ +diff -up mesa-20.3.0-rc1/src/gallium/frontends/lavapipe/lvp_device.c.dma mesa-20.3.0-rc1/src/gallium/frontends/lavapipe/lvp_device.c +--- mesa-20.3.0-rc1/src/gallium/frontends/lavapipe/lvp_device.c.dma 2020-11-19 15:11:42.483134826 +1000 ++++ mesa-20.3.0-rc1/src/gallium/frontends/lavapipe/lvp_device.c 2020-11-19 15:13:08.556425782 +1000 +@@ -118,6 +118,9 @@ VkResult lvp_CreateInstance( + client_version = VK_API_VERSION_1_0; + } + ++ if (!getenv("RH_SW_VULKAN")) ++ return VK_ERROR_INITIALIZATION_FAILED; ++ + instance = vk_zalloc2(&default_alloc, pAllocator, sizeof(*instance), 8, + VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); + if (!instance) diff --git a/SOURCES/mesa-20.3.3-stable-fixes.patch b/SOURCES/mesa-20.3.3-stable-fixes.patch new file mode 100644 index 0000000..231e20b --- /dev/null +++ b/SOURCES/mesa-20.3.3-stable-fixes.patch @@ -0,0 +1,930 @@ +diff --git a/src/amd/vulkan/radv_query.c b/src/amd/vulkan/radv_query.c +index d49bc0f0564..90512d4f276 100644 +--- a/src/amd/vulkan/radv_query.c ++++ b/src/amd/vulkan/radv_query.c +@@ -1679,13 +1679,14 @@ static void emit_begin_query(struct radv_cmd_buffer *cmd_buffer, + + va += 8 * idx; + +- si_cs_emit_write_event_eop(cs, +- cmd_buffer->device->physical_device->rad_info.chip_class, +- radv_cmd_buffer_uses_mec(cmd_buffer), +- V_028A90_PS_DONE, 0, +- EOP_DST_SEL_TC_L2, +- EOP_DATA_SEL_GDS, +- va, EOP_DATA_GDS(0, 1), 0); ++ radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); ++ radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_GDS) | ++ COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | ++ COPY_DATA_WR_CONFIRM); ++ radeon_emit(cs, 0); ++ radeon_emit(cs, 0); ++ radeon_emit(cs, va); ++ radeon_emit(cs, va >> 32); + + /* Record that the command buffer needs GDS. */ + cmd_buffer->gds_needed = true; +@@ -1769,13 +1770,14 @@ static void emit_end_query(struct radv_cmd_buffer *cmd_buffer, + + va += 8 * idx; + +- si_cs_emit_write_event_eop(cs, +- cmd_buffer->device->physical_device->rad_info.chip_class, +- radv_cmd_buffer_uses_mec(cmd_buffer), +- V_028A90_PS_DONE, 0, +- EOP_DST_SEL_TC_L2, +- EOP_DATA_SEL_GDS, +- va, EOP_DATA_GDS(0, 1), 0); ++ radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); ++ radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_GDS) | ++ COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | ++ COPY_DATA_WR_CONFIRM); ++ radeon_emit(cs, 0); ++ radeon_emit(cs, 0); ++ radeon_emit(cs, va); ++ radeon_emit(cs, va >> 32); + + cmd_buffer->state.active_pipeline_gds_queries--; + } +diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h +index 9d9491d4361..2eb3ba4e64e 100644 +--- a/src/amd/vulkan/radv_shader.h ++++ b/src/amd/vulkan/radv_shader.h +@@ -573,9 +573,11 @@ get_tcs_num_patches(unsigned tcs_num_input_vertices, + if (chip_class >= GFX7 && family != CHIP_STONEY) + hardware_lds_size = 65536; + +- num_patches = MIN2(num_patches, hardware_lds_size / (input_patch_size + output_patch_size)); ++ if (input_patch_size + output_patch_size) ++ num_patches = MIN2(num_patches, hardware_lds_size / (input_patch_size + output_patch_size)); + /* Make sure the output data fits in the offchip buffer */ +- num_patches = MIN2(num_patches, (tess_offchip_block_dw_size * 4) / output_patch_size); ++ if (output_patch_size) ++ num_patches = MIN2(num_patches, (tess_offchip_block_dw_size * 4) / output_patch_size); + /* Not necessary for correctness, but improves performance. The + * specific value is taken from the proprietary driver. + */ +diff --git a/src/gallium/auxiliary/cso_cache/cso_context.c b/src/gallium/auxiliary/cso_cache/cso_context.c +index 1eef6aac70c..a6a663d97a6 100644 +--- a/src/gallium/auxiliary/cso_cache/cso_context.c ++++ b/src/gallium/auxiliary/cso_cache/cso_context.c +@@ -402,10 +402,13 @@ void cso_destroy_context( struct cso_context *ctx ) + PIPE_SHADER_CAP_MAX_SHADER_BUFFERS); + int maxcb = scr->get_shader_param(scr, sh, + PIPE_SHADER_CAP_MAX_CONST_BUFFERS); ++ int maximg = scr->get_shader_param(scr, sh, ++ PIPE_SHADER_CAP_MAX_SHADER_IMAGES); + assert(maxsam <= PIPE_MAX_SAMPLERS); + assert(maxview <= PIPE_MAX_SHADER_SAMPLER_VIEWS); + assert(maxssbo <= PIPE_MAX_SHADER_BUFFERS); + assert(maxcb <= PIPE_MAX_CONSTANT_BUFFERS); ++ assert(maximg <= PIPE_MAX_SHADER_IMAGES); + if (maxsam > 0) { + ctx->pipe->bind_sampler_states(ctx->pipe, sh, 0, maxsam, zeros); + } +@@ -415,6 +418,9 @@ void cso_destroy_context( struct cso_context *ctx ) + if (maxssbo > 0) { + ctx->pipe->set_shader_buffers(ctx->pipe, sh, 0, maxssbo, ssbos, 0); + } ++ if (maximg > 0) { ++ ctx->pipe->set_shader_images(ctx->pipe, sh, 0, maximg, NULL); ++ } + for (int i = 0; i < maxcb; i++) { + ctx->pipe->set_constant_buffer(ctx->pipe, sh, i, NULL); + } +diff --git a/src/gallium/drivers/iris/iris_program.c b/src/gallium/drivers/iris/iris_program.c +index 8157e921850..971fc80b5ac 100644 +--- a/src/gallium/drivers/iris/iris_program.c ++++ b/src/gallium/drivers/iris/iris_program.c +@@ -2109,8 +2109,8 @@ iris_get_scratch_space(struct iris_context *ice, + * in the base configuration. + */ + unsigned subslice_total = screen->subslice_total; +- if (devinfo->gen >= 12) +- subslice_total = devinfo->num_subslices[0]; ++ if (devinfo->gen == 12) ++ subslice_total = (devinfo->is_dg1 || devinfo->gt == 2 ? 6 : 2); + else if (devinfo->gen == 11) + subslice_total = 8; + else if (devinfo->gen < 11) +diff --git a/src/gallium/drivers/iris/iris_resolve.c b/src/gallium/drivers/iris/iris_resolve.c +index 276ad62b1dd..045f43ed8c0 100644 +--- a/src/gallium/drivers/iris/iris_resolve.c ++++ b/src/gallium/drivers/iris/iris_resolve.c +@@ -793,7 +793,9 @@ iris_resource_set_aux_state(struct iris_context *ice, + if (res->aux.state[level][start_layer + a] != aux_state) { + res->aux.state[level][start_layer + a] = aux_state; + /* XXX: Need to track which bindings to make dirty */ +- ice->state.dirty |= IRIS_DIRTY_RENDER_BUFFER; ++ ice->state.dirty |= IRIS_DIRTY_RENDER_BUFFER | ++ IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES | ++ IRIS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES; + ice->state.stage_dirty |= IRIS_ALL_STAGE_DIRTY_BINDINGS; + } + } +diff --git a/src/gallium/drivers/iris/iris_resource.c b/src/gallium/drivers/iris/iris_resource.c +index 8747ef4aa8a..3b34e32cd21 100644 +--- a/src/gallium/drivers/iris/iris_resource.c ++++ b/src/gallium/drivers/iris/iris_resource.c +@@ -1125,6 +1125,20 @@ iris_flush_resource(struct pipe_context *ctx, struct pipe_resource *resource) + 0, INTEL_REMAINING_LAYERS, + mod ? mod->aux_usage : ISL_AUX_USAGE_NONE, + mod ? mod->supports_clear_color : false); ++ ++ if (!res->mod_info && res->aux.usage != ISL_AUX_USAGE_NONE) { ++ /* flush_resource may be used to prepare an image for sharing external ++ * to the driver (e.g. via eglCreateImage). To account for this, make ++ * sure to get rid of any compression that a consumer wouldn't know how ++ * to handle. ++ */ ++ for (int i = 0; i < IRIS_BATCH_COUNT; i++) { ++ if (iris_batch_references(&ice->batches[i], res->bo)) ++ iris_batch_flush(&ice->batches[i]); ++ } ++ ++ iris_resource_disable_aux(res); ++ } + } + + static void +diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c +index 59a63f7bbab..b9ddb863a16 100644 +--- a/src/gallium/drivers/iris/iris_state.c ++++ b/src/gallium/drivers/iris/iris_state.c +@@ -1666,6 +1666,8 @@ struct iris_rasterizer_state { + bool multisample; + bool force_persample_interp; + bool conservative_rasterization; ++ bool fill_mode_point; ++ bool fill_mode_line; + bool fill_mode_point_or_line; + enum pipe_sprite_coord_mode sprite_coord_mode; /* PIPE_SPRITE_* */ + uint16_t sprite_coord_enable; +@@ -1729,11 +1731,15 @@ iris_create_rasterizer_state(struct pipe_context *ctx, + cso->conservative_rasterization = + state->conservative_raster_mode == PIPE_CONSERVATIVE_RASTER_POST_SNAP; + +- cso->fill_mode_point_or_line = +- state->fill_front == PIPE_POLYGON_MODE_LINE || ++ cso->fill_mode_point = + state->fill_front == PIPE_POLYGON_MODE_POINT || +- state->fill_back == PIPE_POLYGON_MODE_LINE || + state->fill_back == PIPE_POLYGON_MODE_POINT; ++ cso->fill_mode_line = ++ state->fill_front == PIPE_POLYGON_MODE_LINE || ++ state->fill_back == PIPE_POLYGON_MODE_LINE; ++ cso->fill_mode_point_or_line = ++ cso->fill_mode_point || ++ cso->fill_mode_line; + + if (state->clip_plane_enable != 0) + cso->num_clip_plane_consts = util_logbase2(state->clip_plane_enable) + 1; +@@ -4059,6 +4065,28 @@ iris_emit_sbe_swiz(struct iris_batch *batch, + } + } + ++static bool ++iris_is_drawing_points(const struct iris_context *ice) ++{ ++ const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast; ++ ++ if (cso_rast->fill_mode_point) { ++ return true; ++ } ++ ++ if (ice->shaders.prog[MESA_SHADER_GEOMETRY]) { ++ const struct brw_gs_prog_data *gs_prog_data = ++ (void *) ice->shaders.prog[MESA_SHADER_GEOMETRY]->prog_data; ++ return gs_prog_data->output_topology == _3DPRIM_POINTLIST; ++ } else if (ice->shaders.prog[MESA_SHADER_TESS_EVAL]) { ++ const struct brw_tes_prog_data *tes_data = ++ (void *) ice->shaders.prog[MESA_SHADER_TESS_EVAL]->prog_data; ++ return tes_data->output_topology == BRW_TESS_OUTPUT_TOPOLOGY_POINT; ++ } else { ++ return ice->state.prim_mode == PIPE_PRIM_POINTS; ++ } ++} ++ + static unsigned + iris_calculate_point_sprite_overrides(const struct brw_wm_prog_data *prog_data, + const struct iris_rasterizer_state *cso) +@@ -4093,7 +4121,8 @@ iris_emit_sbe(struct iris_batch *batch, const struct iris_context *ice) + &urb_read_offset, &urb_read_length); + + unsigned sprite_coord_overrides = +- iris_calculate_point_sprite_overrides(wm_prog_data, cso_rast); ++ iris_is_drawing_points(ice) ? ++ iris_calculate_point_sprite_overrides(wm_prog_data, cso_rast) : 0; + + iris_emit_cmd(batch, GENX(3DSTATE_SBE), sbe) { + sbe.AttributeSwizzleEnable = true; +diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c +index 8f688fa3650..ef35f86b05f 100644 +--- a/src/gallium/drivers/radeonsi/si_descriptors.c ++++ b/src/gallium/drivers/radeonsi/si_descriptors.c +@@ -1482,11 +1482,12 @@ void si_update_needs_color_decompress_masks(struct si_context *sctx) + /* Reset descriptors of buffer resources after \p buf has been invalidated. + * If buf == NULL, reset all descriptors. + */ +-static void si_reset_buffer_resources(struct si_context *sctx, struct si_buffer_resources *buffers, ++static bool si_reset_buffer_resources(struct si_context *sctx, struct si_buffer_resources *buffers, + unsigned descriptors_idx, uint64_t slot_mask, + struct pipe_resource *buf, enum radeon_bo_priority priority) + { + struct si_descriptors *descs = &sctx->descriptors[descriptors_idx]; ++ bool noop = true; + uint64_t mask = buffers->enabled_mask & slot_mask; + + while (mask) { +@@ -1501,8 +1502,10 @@ static void si_reset_buffer_resources(struct si_context *sctx, struct si_buffer_ + sctx, si_resource(buffer), + buffers->writable_mask & (1llu << i) ? RADEON_USAGE_READWRITE : RADEON_USAGE_READ, + priority, true); ++ noop = false; + } + } ++ return !noop; + } + + /* Update all buffer bindings where the buffer is bound, including +@@ -1577,11 +1580,15 @@ void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf) + } + + if (!buffer || buffer->bind_history & PIPE_BIND_SHADER_BUFFER) { +- for (shader = 0; shader < SI_NUM_SHADERS; shader++) +- si_reset_buffer_resources(sctx, &sctx->const_and_shader_buffers[shader], +- si_const_and_shader_buffer_descriptors_idx(shader), +- u_bit_consecutive64(0, SI_NUM_SHADER_BUFFERS), buf, +- sctx->const_and_shader_buffers[shader].priority); ++ for (shader = 0; shader < SI_NUM_SHADERS; shader++) { ++ if (si_reset_buffer_resources(sctx, &sctx->const_and_shader_buffers[shader], ++ si_const_and_shader_buffer_descriptors_idx(shader), ++ u_bit_consecutive64(0, SI_NUM_SHADER_BUFFERS), buf, ++ sctx->const_and_shader_buffers[shader].priority) && ++ shader == PIPE_SHADER_COMPUTE) { ++ sctx->compute_shaderbuf_sgprs_dirty = true; ++ } ++ } + } + + if (!buffer || buffer->bind_history & PIPE_BIND_SAMPLER_VIEW) { +@@ -1633,6 +1640,9 @@ void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf) + radeon_add_to_gfx_buffer_list_check_mem(sctx, si_resource(buffer), + RADEON_USAGE_READWRITE, + RADEON_PRIO_SAMPLER_BUFFER, true); ++ ++ if (shader == PIPE_SHADER_COMPUTE) ++ sctx->compute_image_sgprs_dirty = true; + } + } + } +diff --git a/src/gallium/frontends/dri/dri_helpers.c b/src/gallium/frontends/dri/dri_helpers.c +index 01a1fb3d96c..5e87df35a55 100644 +--- a/src/gallium/frontends/dri/dri_helpers.c ++++ b/src/gallium/frontends/dri/dri_helpers.c +@@ -258,7 +258,9 @@ dri2_create_image_from_renderbuffer2(__DRIcontext *context, + int renderbuffer, void *loaderPrivate, + unsigned *error) + { +- struct gl_context *ctx = ((struct st_context *)dri_context(context)->st)->ctx; ++ struct st_context *st_ctx = (struct st_context *)dri_context(context)->st; ++ struct gl_context *ctx = st_ctx->ctx; ++ struct pipe_context *p_ctx = st_ctx->pipe; + struct gl_renderbuffer *rb; + struct pipe_resource *tex; + __DRIimage *img; +@@ -299,6 +301,13 @@ dri2_create_image_from_renderbuffer2(__DRIcontext *context, + + pipe_resource_reference(&img->texture, tex); + ++ /* If the resource supports EGL_MESA_image_dma_buf_export, make sure that ++ * it's in a shareable state. Do this now while we still have the access to ++ * the context. ++ */ ++ if (dri2_get_mapping_by_format(img->dri_format)) ++ p_ctx->flush_resource(p_ctx, tex); ++ + *error = __DRI_IMAGE_ERROR_SUCCESS; + return img; + } +@@ -326,7 +335,9 @@ dri2_create_from_texture(__DRIcontext *context, int target, unsigned texture, + void *loaderPrivate) + { + __DRIimage *img; +- struct gl_context *ctx = ((struct st_context *)dri_context(context)->st)->ctx; ++ struct st_context *st_ctx = (struct st_context *)dri_context(context)->st; ++ struct gl_context *ctx = st_ctx->ctx; ++ struct pipe_context *p_ctx = st_ctx->pipe; + struct gl_texture_object *obj; + struct pipe_resource *tex; + GLuint face = 0; +@@ -376,6 +387,13 @@ dri2_create_from_texture(__DRIcontext *context, int target, unsigned texture, + + pipe_resource_reference(&img->texture, tex); + ++ /* If the resource supports EGL_MESA_image_dma_buf_export, make sure that ++ * it's in a shareable state. Do this now while we still have the access to ++ * the context. ++ */ ++ if (dri2_get_mapping_by_format(img->dri_format)) ++ p_ctx->flush_resource(p_ctx, tex); ++ + *error = __DRI_IMAGE_ERROR_SUCCESS; + return img; + } +@@ -547,6 +565,9 @@ dri2_get_mapping_by_fourcc(int fourcc) + const struct dri2_format_mapping * + dri2_get_mapping_by_format(int format) + { ++ if (format == __DRI_IMAGE_FORMAT_NONE) ++ return NULL; ++ + for (unsigned i = 0; i < ARRAY_SIZE(dri2_format_table); i++) { + if (dri2_format_table[i].dri_format == format) + return &dri2_format_table[i]; +diff --git a/src/gallium/frontends/lavapipe/lvp_device.c b/src/gallium/frontends/lavapipe/lvp_device.c +index 45734f95880..187aecde1f8 100644 +--- a/src/gallium/frontends/lavapipe/lvp_device.c ++++ b/src/gallium/frontends/lavapipe/lvp_device.c +@@ -52,8 +52,6 @@ lvp_physical_device_init(struct lvp_physical_device *device, + if (!device->pscreen) + return vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY); + +- fprintf(stderr, "WARNING: lavapipe is not a conformant vulkan implementation, testing use only.\n"); +- + device->max_images = device->pscreen->get_shader_param(device->pscreen, PIPE_SHADER_FRAGMENT, PIPE_SHADER_CAP_MAX_SHADER_IMAGES); + lvp_physical_device_get_supported_extensions(device, &device->supported_extensions); + result = lvp_init_wsi(device); +@@ -575,6 +573,19 @@ void lvp_GetPhysicalDeviceProperties2( + } + } + ++static void lvp_get_physical_device_queue_family_properties( ++ VkQueueFamilyProperties* pQueueFamilyProperties) ++{ ++ *pQueueFamilyProperties = (VkQueueFamilyProperties) { ++ .queueFlags = VK_QUEUE_GRAPHICS_BIT | ++ VK_QUEUE_COMPUTE_BIT | ++ VK_QUEUE_TRANSFER_BIT, ++ .queueCount = 1, ++ .timestampValidBits = 64, ++ .minImageTransferGranularity = (VkExtent3D) { 1, 1, 1 }, ++ }; ++} ++ + void lvp_GetPhysicalDeviceQueueFamilyProperties( + VkPhysicalDevice physicalDevice, + uint32_t* pCount, +@@ -586,15 +597,21 @@ void lvp_GetPhysicalDeviceQueueFamilyProperties( + } + + assert(*pCount >= 1); ++ lvp_get_physical_device_queue_family_properties(pQueueFamilyProperties); ++} + +- *pQueueFamilyProperties = (VkQueueFamilyProperties) { +- .queueFlags = VK_QUEUE_GRAPHICS_BIT | +- VK_QUEUE_COMPUTE_BIT | +- VK_QUEUE_TRANSFER_BIT, +- .queueCount = 1, +- .timestampValidBits = 64, +- .minImageTransferGranularity = (VkExtent3D) { 1, 1, 1 }, +- }; ++void lvp_GetPhysicalDeviceQueueFamilyProperties2( ++ VkPhysicalDevice physicalDevice, ++ uint32_t* pCount, ++ VkQueueFamilyProperties2 *pQueueFamilyProperties) ++{ ++ if (pQueueFamilyProperties == NULL) { ++ *pCount = 1; ++ return; ++ } ++ ++ assert(*pCount >= 1); ++ lvp_get_physical_device_queue_family_properties(&pQueueFamilyProperties->queueFamilyProperties); + } + + void lvp_GetPhysicalDeviceMemoryProperties( +@@ -617,6 +634,14 @@ void lvp_GetPhysicalDeviceMemoryProperties( + }; + } + ++void lvp_GetPhysicalDeviceMemoryProperties2( ++ VkPhysicalDevice physicalDevice, ++ VkPhysicalDeviceMemoryProperties2 *pMemoryProperties) ++{ ++ lvp_GetPhysicalDeviceMemoryProperties(physicalDevice, ++ &pMemoryProperties->memoryProperties); ++} ++ + PFN_vkVoidFunction lvp_GetInstanceProcAddr( + VkInstance _instance, + const char* pName) +@@ -822,6 +847,8 @@ VkResult lvp_CreateDevice( + const VkAllocationCallbacks* pAllocator, + VkDevice* pDevice) + { ++ fprintf(stderr, "WARNING: lavapipe is not a conformant vulkan implementation, testing use only.\n"); ++ + LVP_FROM_HANDLE(lvp_physical_device, physical_device, physicalDevice); + struct lvp_device *device; + +diff --git a/src/glx/g_glxglvnddispatchfuncs.c b/src/glx/g_glxglvnddispatchfuncs.c +index 0f02ed2d321..e0ea27c0b18 100644 +--- a/src/glx/g_glxglvnddispatchfuncs.c ++++ b/src/glx/g_glxglvnddispatchfuncs.c +@@ -87,6 +87,7 @@ const char * const __glXDispatchTableStrings[DI_LAST_INDEX] = { + __ATTRIB(SelectEventSGIX), + // glXSwapBuffers implemented by libglvnd + __ATTRIB(SwapBuffersMscOML), ++ __ATTRIB(SwapIntervalEXT), + __ATTRIB(SwapIntervalMESA), + __ATTRIB(SwapIntervalSGI), + // glXUseXFont implemented by libglvnd +@@ -893,6 +894,24 @@ static int dispatch_SwapIntervalMESA(unsigned int interval) + + + ++static void dispatch_SwapIntervalEXT(Display *dpy, GLXDrawable drawable, int interval) ++{ ++ PFNGLXSWAPINTERVALEXTPROC pSwapIntervalEXT; ++ __GLXvendorInfo *dd; ++ ++ dd = GetDispatchFromDrawable(dpy, drawable); ++ if (dd == NULL) ++ return; ++ ++ __FETCH_FUNCTION_PTR(SwapIntervalEXT); ++ if (pSwapIntervalEXT == NULL) ++ return; ++ ++ pSwapIntervalEXT(dpy, drawable, interval); ++} ++ ++ ++ + static Bool dispatch_WaitForMscOML(Display *dpy, GLXDrawable drawable, + int64_t target_msc, int64_t divisor, + int64_t remainder, int64_t *ust, +@@ -974,6 +993,7 @@ const void * const __glXDispatchFunctions[DI_LAST_INDEX + 1] = { + __ATTRIB(ReleaseTexImageEXT), + __ATTRIB(SelectEventSGIX), + __ATTRIB(SwapBuffersMscOML), ++ __ATTRIB(SwapIntervalEXT), + __ATTRIB(SwapIntervalMESA), + __ATTRIB(SwapIntervalSGI), + __ATTRIB(WaitForMscOML), +diff --git a/src/glx/g_glxglvnddispatchindices.h b/src/glx/g_glxglvnddispatchindices.h +index 3ba50a74abb..b65d078098f 100644 +--- a/src/glx/g_glxglvnddispatchindices.h ++++ b/src/glx/g_glxglvnddispatchindices.h +@@ -79,6 +79,7 @@ typedef enum __GLXdispatchIndex { + DI_SelectEventSGIX, + // SwapBuffers implemented by libglvnd + DI_SwapBuffersMscOML, ++ DI_SwapIntervalEXT, + DI_SwapIntervalMESA, + DI_SwapIntervalSGI, + // UseXFont implemented by libglvnd +diff --git a/src/intel/common/gen_mi_builder.h b/src/intel/common/gen_mi_builder.h +index ddd8459ef07..47fb98e99f7 100644 +--- a/src/intel/common/gen_mi_builder.h ++++ b/src/intel/common/gen_mi_builder.h +@@ -932,6 +932,13 @@ gen_mi_store_address(struct gen_mi_builder *b, + static inline void + gen_mi_self_mod_barrier(struct gen_mi_builder *b) + { ++ /* First make sure all the memory writes from previous modifying commands ++ * have landed. We want to do this before going through the CS cache, ++ * otherwise we could be fetching memory that hasn't been written to yet. ++ */ ++ gen_mi_builder_emit(b, GENX(PIPE_CONTROL), pc) { ++ pc.CommandStreamerStallEnable = true; ++ } + /* Documentation says Gen11+ should be able to invalidate the command cache + * but experiment show it doesn't work properly, so for now just get over + * the CS prefetch. +diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp b/src/intel/compiler/brw_fs_copy_propagation.cpp +index 917c3abfe9e..6896987055f 100644 +--- a/src/intel/compiler/brw_fs_copy_propagation.cpp ++++ b/src/intel/compiler/brw_fs_copy_propagation.cpp +@@ -437,6 +437,7 @@ instruction_requires_packed_data(fs_inst *inst) + case FS_OPCODE_DDX_COARSE: + case FS_OPCODE_DDY_FINE: + case FS_OPCODE_DDY_COARSE: ++ case SHADER_OPCODE_QUAD_SWIZZLE: + return true; + default: + return false; +diff --git a/src/intel/compiler/brw_ir_fs.h b/src/intel/compiler/brw_ir_fs.h +index 6ba3a6ca97e..3a4acc1834a 100644 +--- a/src/intel/compiler/brw_ir_fs.h ++++ b/src/intel/compiler/brw_ir_fs.h +@@ -451,13 +451,15 @@ regs_written(const fs_inst *inst) + * Return the number of dataflow registers read by the instruction (either + * fully or partially) counted from 'floor(reg_offset(inst->src[i]) / + * register_size)'. The somewhat arbitrary register size unit is 4B for the +- * UNIFORM and IMM files and 32B for all other files. ++ * UNIFORM files and 32B for all other files. + */ + inline unsigned + regs_read(const fs_inst *inst, unsigned i) + { +- const unsigned reg_size = +- inst->src[i].file == UNIFORM || inst->src[i].file == IMM ? 4 : REG_SIZE; ++ if (inst->src[i].file == IMM) ++ return 1; ++ ++ const unsigned reg_size = inst->src[i].file == UNIFORM ? 4 : REG_SIZE; + return DIV_ROUND_UP(reg_offset(inst->src[i]) % reg_size + + inst->size_read(i) - + MIN2(inst->size_read(i), reg_padding(inst->src[i])), +diff --git a/src/intel/vulkan/anv_allocator.c b/src/intel/vulkan/anv_allocator.c +index 9007cd00e85..48811912e95 100644 +--- a/src/intel/vulkan/anv_allocator.c ++++ b/src/intel/vulkan/anv_allocator.c +@@ -1447,8 +1447,8 @@ anv_scratch_pool_alloc(struct anv_device *device, struct anv_scratch_pool *pool, + * For, Gen11+, scratch space allocation is based on the number of threads + * in the base configuration. + */ +- if (devinfo->gen >= 12) +- subslices = devinfo->num_subslices[0]; ++ if (devinfo->gen == 12) ++ subslices = (devinfo->is_dg1 || devinfo->gt == 2 ? 6 : 2); + else if (devinfo->gen == 11) + subslices = 8; + else if (devinfo->gen >= 9) +diff --git a/src/intel/vulkan/anv_image.c b/src/intel/vulkan/anv_image.c +index 0290431f145..80307cd612f 100644 +--- a/src/intel/vulkan/anv_image.c ++++ b/src/intel/vulkan/anv_image.c +@@ -684,6 +684,25 @@ choose_drm_format_mod(const struct anv_physical_device *device, + return NULL; + } + ++static VkImageUsageFlags ++anv_image_create_usage(const VkImageCreateInfo *pCreateInfo, ++ VkImageUsageFlags usage) ++{ ++ /* Add TRANSFER_SRC usage for multisample attachment images. This is ++ * because we might internally use the TRANSFER_SRC layout on them for ++ * blorp operations associated with resolving those into other attachments ++ * at the end of a subpass. ++ * ++ * Without this additional usage, we compute an incorrect AUX state in ++ * anv_layout_to_aux_state(). ++ */ ++ if (pCreateInfo->samples > VK_SAMPLE_COUNT_1_BIT && ++ (usage & (VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | ++ VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT))) ++ usage |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT; ++ return usage; ++} ++ + VkResult + anv_image_create(VkDevice _device, + const struct anv_image_create_info *create_info, +@@ -732,7 +751,7 @@ anv_image_create(VkDevice _device, + image->levels = pCreateInfo->mipLevels; + image->array_size = pCreateInfo->arrayLayers; + image->samples = pCreateInfo->samples; +- image->usage = pCreateInfo->usage; ++ image->usage = anv_image_create_usage(pCreateInfo, pCreateInfo->usage); + image->create_flags = pCreateInfo->flags; + image->tiling = pCreateInfo->tiling; + image->disjoint = pCreateInfo->flags & VK_IMAGE_CREATE_DISJOINT_BIT; +@@ -745,8 +764,11 @@ anv_image_create(VkDevice _device, + const VkImageStencilUsageCreateInfoEXT *stencil_usage_info = + vk_find_struct_const(pCreateInfo->pNext, + IMAGE_STENCIL_USAGE_CREATE_INFO_EXT); +- if (stencil_usage_info) +- image->stencil_usage = stencil_usage_info->stencilUsage; ++ if (stencil_usage_info) { ++ image->stencil_usage = ++ anv_image_create_usage(pCreateInfo, ++ stencil_usage_info->stencilUsage); ++ } + } + + /* In case of external format, We don't know format yet, +diff --git a/src/intel/vulkan/anv_pass.c b/src/intel/vulkan/anv_pass.c +index af23b87969d..1818f6c587b 100644 +--- a/src/intel/vulkan/anv_pass.c ++++ b/src/intel/vulkan/anv_pass.c +@@ -23,6 +23,7 @@ + + #include "anv_private.h" + ++#include "vk_format_info.h" + #include "vk_util.h" + + static void +@@ -406,6 +407,70 @@ num_subpass_attachments2(const VkSubpassDescription2KHR *desc) + (ds_resolve && ds_resolve->pDepthStencilResolveAttachment); + } + ++static bool ++vk_image_layout_depth_only(VkImageLayout layout) ++{ ++ switch (layout) { ++ case VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_OPTIMAL: ++ case VK_IMAGE_LAYOUT_DEPTH_ATTACHMENT_OPTIMAL: ++ return true; ++ ++ default: ++ return false; ++ } ++} ++ ++/* From the Vulkan Specification 1.2.166 - VkAttachmentReference2: ++ * ++ * "If layout only specifies the layout of the depth aspect of the ++ * attachment, the layout of the stencil aspect is specified by the ++ * stencilLayout member of a VkAttachmentReferenceStencilLayout structure ++ * included in the pNext chain. Otherwise, layout describes the layout for ++ * all relevant image aspects." ++ */ ++static VkImageLayout ++stencil_ref_layout(const VkAttachmentReference2KHR *att_ref) ++{ ++ if (!vk_image_layout_depth_only(att_ref->layout)) ++ return att_ref->layout; ++ ++ const VkAttachmentReferenceStencilLayoutKHR *stencil_ref = ++ vk_find_struct_const(att_ref->pNext, ++ ATTACHMENT_REFERENCE_STENCIL_LAYOUT_KHR); ++ if (!stencil_ref) ++ return VK_IMAGE_LAYOUT_UNDEFINED; ++ return stencil_ref->stencilLayout; ++} ++ ++/* From the Vulkan Specification 1.2.166 - VkAttachmentDescription2: ++ * ++ * "If format is a depth/stencil format, and initialLayout only specifies ++ * the initial layout of the depth aspect of the attachment, the initial ++ * layout of the stencil aspect is specified by the stencilInitialLayout ++ * member of a VkAttachmentDescriptionStencilLayout structure included in ++ * the pNext chain. Otherwise, initialLayout describes the initial layout ++ * for all relevant image aspects." ++ */ ++static VkImageLayout ++stencil_desc_layout(const VkAttachmentDescription2KHR *att_desc, bool final) ++{ ++ if (!vk_format_has_stencil(att_desc->format)) ++ return VK_IMAGE_LAYOUT_UNDEFINED; ++ ++ const VkImageLayout main_layout = ++ final ? att_desc->finalLayout : att_desc->initialLayout; ++ if (!vk_image_layout_depth_only(main_layout)) ++ return main_layout; ++ ++ const VkAttachmentDescriptionStencilLayoutKHR *stencil_desc = ++ vk_find_struct_const(att_desc->pNext, ++ ATTACHMENT_DESCRIPTION_STENCIL_LAYOUT_KHR); ++ assert(stencil_desc); ++ return final ? ++ stencil_desc->stencilFinalLayout : ++ stencil_desc->stencilInitialLayout; ++} ++ + VkResult anv_CreateRenderPass2( + VkDevice _device, + const VkRenderPassCreateInfo2KHR* pCreateInfo, +@@ -450,10 +515,6 @@ VkResult anv_CreateRenderPass2( + pass->subpass_flushes = subpass_flushes; + + for (uint32_t i = 0; i < pCreateInfo->attachmentCount; i++) { +- const VkAttachmentDescriptionStencilLayoutKHR *stencil_layout = +- vk_find_struct_const(pCreateInfo->pAttachments[i].pNext, +- ATTACHMENT_DESCRIPTION_STENCIL_LAYOUT_KHR); +- + pass->attachments[i] = (struct anv_render_pass_attachment) { + .format = pCreateInfo->pAttachments[i].format, + .samples = pCreateInfo->pAttachments[i].samples, +@@ -463,12 +524,10 @@ VkResult anv_CreateRenderPass2( + .initial_layout = pCreateInfo->pAttachments[i].initialLayout, + .final_layout = pCreateInfo->pAttachments[i].finalLayout, + +- .stencil_initial_layout = (stencil_layout ? +- stencil_layout->stencilInitialLayout : +- pCreateInfo->pAttachments[i].initialLayout), +- .stencil_final_layout = (stencil_layout ? +- stencil_layout->stencilFinalLayout : +- pCreateInfo->pAttachments[i].finalLayout), ++ .stencil_initial_layout = stencil_desc_layout(&pCreateInfo->pAttachments[i], ++ false), ++ .stencil_final_layout = stencil_desc_layout(&pCreateInfo->pAttachments[i], ++ true), + }; + } + +@@ -487,17 +546,11 @@ VkResult anv_CreateRenderPass2( + subpass_attachments += desc->inputAttachmentCount; + + for (uint32_t j = 0; j < desc->inputAttachmentCount; j++) { +- const VkAttachmentReferenceStencilLayoutKHR *stencil_layout = +- vk_find_struct_const(desc->pInputAttachments[j].pNext, +- ATTACHMENT_REFERENCE_STENCIL_LAYOUT_KHR); +- + subpass->input_attachments[j] = (struct anv_subpass_attachment) { + .usage = VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT, + .attachment = desc->pInputAttachments[j].attachment, + .layout = desc->pInputAttachments[j].layout, +- .stencil_layout = (stencil_layout ? +- stencil_layout->stencilLayout : +- desc->pInputAttachments[j].layout), ++ .stencil_layout = stencil_ref_layout(&desc->pInputAttachments[j]), + }; + } + } +@@ -531,17 +584,11 @@ VkResult anv_CreateRenderPass2( + if (desc->pDepthStencilAttachment) { + subpass->depth_stencil_attachment = subpass_attachments++; + +- const VkAttachmentReferenceStencilLayoutKHR *stencil_attachment = +- vk_find_struct_const(desc->pDepthStencilAttachment->pNext, +- ATTACHMENT_REFERENCE_STENCIL_LAYOUT_KHR); +- + *subpass->depth_stencil_attachment = (struct anv_subpass_attachment) { + .usage = VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT, + .attachment = desc->pDepthStencilAttachment->attachment, + .layout = desc->pDepthStencilAttachment->layout, +- .stencil_layout = stencil_attachment ? +- stencil_attachment->stencilLayout : +- desc->pDepthStencilAttachment->layout, ++ .stencil_layout = stencil_ref_layout(desc->pDepthStencilAttachment), + }; + } + +@@ -552,17 +599,11 @@ VkResult anv_CreateRenderPass2( + if (ds_resolve && ds_resolve->pDepthStencilResolveAttachment) { + subpass->ds_resolve_attachment = subpass_attachments++; + +- const VkAttachmentReferenceStencilLayoutKHR *stencil_resolve_attachment = +- vk_find_struct_const(ds_resolve->pDepthStencilResolveAttachment->pNext, +- ATTACHMENT_REFERENCE_STENCIL_LAYOUT_KHR); +- + *subpass->ds_resolve_attachment = (struct anv_subpass_attachment) { + .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT, + .attachment = ds_resolve->pDepthStencilResolveAttachment->attachment, + .layout = ds_resolve->pDepthStencilResolveAttachment->layout, +- .stencil_layout = stencil_resolve_attachment ? +- stencil_resolve_attachment->stencilLayout : +- ds_resolve->pDepthStencilResolveAttachment->layout, ++ .stencil_layout = stencil_ref_layout(ds_resolve->pDepthStencilResolveAttachment), + }; + subpass->depth_resolve_mode = ds_resolve->depthResolveMode; + subpass->stencil_resolve_mode = ds_resolve->stencilResolveMode; +diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c +index a9c49e0f592..e3eb376fa5a 100644 +--- a/src/intel/vulkan/genX_cmd_buffer.c ++++ b/src/intel/vulkan/genX_cmd_buffer.c +@@ -462,8 +462,10 @@ anv_image_init_aux_tt(struct anv_cmd_buffer *cmd_buffer, + { + uint32_t plane = anv_image_aspect_to_plane(image->aspects, aspect); + ++ const struct anv_surface *surface = &image->planes[plane].surface; + uint64_t base_address = +- anv_address_physical(image->planes[plane].address); ++ anv_address_physical(anv_address_add(image->planes[plane].address, ++ surface->offset)); + + const struct isl_surf *isl_surf = &image->planes[plane].surface.isl; + uint64_t format_bits = gen_aux_map_format_bits_for_isl_surf(isl_surf); +@@ -1231,6 +1233,17 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer, + uint32_t level_layer_count = + MIN2(layer_count, aux_layers - base_layer); + ++ /* If will_full_fast_clear is set, the caller promises to ++ * fast-clear the largest portion of the specified range as it can. ++ * For color images, that means only the first LOD and array slice. ++ */ ++ if (level == 0 && base_layer == 0 && will_full_fast_clear) { ++ base_layer++; ++ level_layer_count--; ++ if (level_layer_count == 0) ++ continue; ++ } ++ + anv_image_ccs_op(cmd_buffer, image, + image->planes[plane].surface.isl.format, + ISL_SWIZZLE_IDENTITY, +@@ -1250,6 +1263,12 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer, + "define an MCS buffer."); + } + ++ /* If will_full_fast_clear is set, the caller promises to fast-clear ++ * the largest portion of the specified range as it can. ++ */ ++ if (will_full_fast_clear) ++ return; ++ + assert(base_level == 0 && level_count == 1); + anv_image_mcs_op(cmd_buffer, image, + image->planes[plane].surface.isl.format, +diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c +index 205e8677f19..33f071019b7 100644 +--- a/src/intel/vulkan/genX_pipeline.c ++++ b/src/intel/vulkan/genX_pipeline.c +@@ -1180,7 +1180,22 @@ emit_cb_state(struct anv_graphics_pipeline *pipeline, + #endif + .LogicOpEnable = info->logicOpEnable, + .LogicOpFunction = vk_to_gen_logic_op[info->logicOp], +- .ColorBufferBlendEnable = a->blendEnable, ++ /* Vulkan specification 1.2.168, VkLogicOp: ++ * ++ * "Logical operations are controlled by the logicOpEnable and ++ * logicOp members of VkPipelineColorBlendStateCreateInfo. If ++ * logicOpEnable is VK_TRUE, then a logical operation selected by ++ * logicOp is applied between each color attachment and the ++ * fragment’s corresponding output value, and blending of all ++ * attachments is treated as if it were disabled." ++ * ++ * From the Broadwell PRM Volume 2d: Command Reference: Structures: ++ * BLEND_STATE_ENTRY: ++ * ++ * "Enabling LogicOp and Color Buffer Blending at the same time is ++ * UNDEFINED" ++ */ ++ .ColorBufferBlendEnable = !info->logicOpEnable && a->blendEnable, + .ColorClampRange = COLORCLAMP_RTFORMAT, + .PreBlendColorClampEnable = true, + .PostBlendColorClampEnable = true, +diff --git a/src/intel/vulkan/vk_format_info.h b/src/intel/vulkan/vk_format_info.h +index 006e1f4a6ad..4e72c244742 100644 +--- a/src/intel/vulkan/vk_format_info.h ++++ b/src/intel/vulkan/vk_format_info.h +@@ -164,4 +164,11 @@ vk_format_has_depth(VkFormat format) + return aspects & VK_IMAGE_ASPECT_DEPTH_BIT; + } + ++static inline bool ++vk_format_has_stencil(VkFormat format) ++{ ++ const VkImageAspectFlags aspects = vk_format_aspects(format); ++ return aspects & VK_IMAGE_ASPECT_STENCIL_BIT; ++} ++ + #endif /* VK_FORMAT_INFO_H */ +diff --git a/src/mesa/state_tracker/st_pbo.c b/src/mesa/state_tracker/st_pbo.c +index 65a1ce8862a..b03921c1be6 100644 +--- a/src/mesa/state_tracker/st_pbo.c ++++ b/src/mesa/state_tracker/st_pbo.c +@@ -431,16 +431,21 @@ create_fs(struct st_context *st, bool download, + nir_ssa_def *coord = nir_load_var(&b, fragcoord); + + nir_ssa_def *layer = NULL; +- if (st->pbo.layers && need_layer && (!download || target == PIPE_TEXTURE_1D_ARRAY || +- target == PIPE_TEXTURE_2D_ARRAY || +- target == PIPE_TEXTURE_3D || +- target == PIPE_TEXTURE_CUBE || +- target == PIPE_TEXTURE_CUBE_ARRAY)) { +- nir_variable *var = nir_variable_create(b.shader, nir_var_shader_in, +- glsl_int_type(), "gl_Layer"); +- var->data.location = VARYING_SLOT_LAYER; +- var->data.interpolation = INTERP_MODE_FLAT; +- layer = nir_load_var(&b, var); ++ if (st->pbo.layers && (!download || target == PIPE_TEXTURE_1D_ARRAY || ++ target == PIPE_TEXTURE_2D_ARRAY || ++ target == PIPE_TEXTURE_3D || ++ target == PIPE_TEXTURE_CUBE || ++ target == PIPE_TEXTURE_CUBE_ARRAY)) { ++ if (need_layer) { ++ nir_variable *var = nir_variable_create(b.shader, nir_var_shader_in, ++ glsl_int_type(), "gl_Layer"); ++ var->data.location = VARYING_SLOT_LAYER; ++ var->data.interpolation = INTERP_MODE_FLAT; ++ layer = nir_load_var(&b, var); ++ } ++ else { ++ layer = zero; ++ } + } + + /* offset_pos = param.xy + f2i(coord.xy) */ +diff --git a/src/util/format/u_format.csv b/src/util/format/u_format.csv +index 8acfb869bdb..237c4c95475 100644 +--- a/src/util/format/u_format.csv ++++ b/src/util/format/u_format.csv +@@ -500,7 +500,7 @@ PIPE_FORMAT_R4G4B4A4_UINT , plain, 1, 1, 1, up4 , up4 , up4 , up4 , xy + PIPE_FORMAT_B4G4R4A4_UINT , plain, 1, 1, 1, up4 , up4 , up4 , up4 , zyxw, rgb, up4 , up4 , up4 , up4 , yzwx + PIPE_FORMAT_A4R4G4B4_UINT , plain, 1, 1, 1, up4 , up4 , up4 , up4 , yzwx, rgb, up4 , up4 , up4 , up4 , zyxw + PIPE_FORMAT_A4B4G4R4_UINT , plain, 1, 1, 1, up4 , up4 , up4 , up4 , wzyx, rgb, up4 , up4 , up4 , up4 , xyzw +-PIPE_FORMAT_A1R5G5B5_UINT , plain, 1, 1, 1, up1 , up5 , up5 , up5 , wzyx, rgb, up5 , up5 , up5 , up1 , zyxw ++PIPE_FORMAT_A1R5G5B5_UINT , plain, 1, 1, 1, up1 , up5 , up5 , up5 , yzwx, rgb, up5 , up5 , up5 , up1 , zyxw + PIPE_FORMAT_A1B5G5R5_UINT , plain, 1, 1, 1, up1 , up5 , up5 , up5 , wzyx, rgb, up5 , up5 , up5 , up1 , xyzw + PIPE_FORMAT_R5G5B5A1_UINT , plain, 1, 1, 1, up5 , up5 , up5 , up1 , xyzw, rgb, up5 , up5 , up5 , up1 , wzyx + PIPE_FORMAT_B5G5R5A1_UINT , plain, 1, 1, 1, up5 , up5 , up5 , up1 , zyxw, rgb, up1 , up5 , up5 , up5 , yzwx +diff --git a/src/vulkan/device-select-layer/VkLayer_MESA_device_select.json b/src/vulkan/device-select-layer/VkLayer_MESA_device_select.json +index 1d5fffd0135..361ae9fe74e 100644 +--- a/src/vulkan/device-select-layer/VkLayer_MESA_device_select.json ++++ b/src/vulkan/device-select-layer/VkLayer_MESA_device_select.json +@@ -4,7 +4,7 @@ + "name": "VK_LAYER_MESA_device_select", + "type": "GLOBAL", + "library_path": "libVkLayer_MESA_device_select.so", +- "api_version": "1.1.73", ++ "api_version": "1.2.73", + "implementation_version": "1", + "description": "Linux device selection layer", + "functions": { diff --git a/SOURCES/nouveau-tu1xx-support.patch b/SOURCES/nouveau-tu1xx-support.patch deleted file mode 100644 index 3254466..0000000 --- a/SOURCES/nouveau-tu1xx-support.patch +++ /dev/null @@ -1,10387 +0,0 @@ -diff --git a/src/gallium/drivers/nouveau/Makefile.sources b/src/gallium/drivers/nouveau/Makefile.sources -index 6c360992a53..9de8168fbd9 100644 ---- a/src/gallium/drivers/nouveau/Makefile.sources -+++ b/src/gallium/drivers/nouveau/Makefile.sources -@@ -151,6 +151,14 @@ NVC0_CODEGEN_SOURCES := \ - codegen/nv50_ir_target_nvc0.h - - NVC0_C_SOURCES := \ -+ nvc0/cla0c0qmd.h \ -+ nvc0/clc0c0qmd.h \ -+ nvc0/clc3c0qmd.h \ -+ nvc0/drf.h \ -+ nvc0/qmd.h \ -+ nvc0/qmda0c0.c \ -+ nvc0/qmdc0c0.c \ -+ nvc0/qmdc3c0.c \ - nvc0/gm107_texture.xml.h \ - nvc0/nvc0_3d.xml.h \ - nvc0/nvc0_compute.c \ -diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h b/src/gallium/drivers/nouveau/codegen/nv50_ir.h -index 42ee969c66b..d58c0d206ec 100644 ---- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h -+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h -@@ -67,8 +67,10 @@ enum operation - OP_AND, - OP_OR, - OP_XOR, -+ OP_LOP3_LUT, - OP_SHL, - OP_SHR, -+ OP_SHF, - OP_MAX, - OP_MIN, - OP_SAT, // CLAMP(f32, 0.0, 1.0) -@@ -116,6 +118,7 @@ enum operation - OP_PINTERP, - OP_EMIT, // emit vertex - OP_RESTART, // restart primitive -+ OP_FINAL, // finish emitting primitives - OP_TEX, - OP_TXB, // texture bias - OP_TXL, // texure lod -@@ -151,7 +154,10 @@ enum operation - OP_INSBF, // insert first src1[8:15] bits of src0 into src2 at src1[0:7] - OP_EXTBF, // place bits [K,K+N) of src0 into dst, src1 = 0xNNKK - OP_BFIND, // find highest/lowest set bit -+ OP_BREV, // bitfield reverse -+ OP_BMSK, // bitfield mask - OP_PERMT, // dst = bytes from src2,src0 selected by src1 (nvc0's src order) -+ OP_SGXT, - OP_ATOM, - OP_BAR, // execution barrier, sources = { id, thread count, predicate } - OP_VADD, // byte/word vector operations -@@ -167,6 +173,7 @@ enum operation - OP_SHFL, // warp shuffle - OP_VOTE, - OP_BUFQ, // buffer query -+ OP_WARPSYNC, - OP_LAST - }; - -@@ -254,11 +261,29 @@ enum operation - #define NV50_IR_SUBOP_VOTE_ALL 0 - #define NV50_IR_SUBOP_VOTE_ANY 1 - #define NV50_IR_SUBOP_VOTE_UNI 2 -+#define NV50_IR_SUBOP_LOP3_LUT_SRC0 0xf0 -+#define NV50_IR_SUBOP_LOP3_LUT_SRC1 0xcc -+#define NV50_IR_SUBOP_LOP3_LUT_SRC2 0xaa -+#define NV50_IR_SUBOP_LOP3_LUT(exp) ({ \ -+ uint8_t a = NV50_IR_SUBOP_LOP3_LUT_SRC0; \ -+ uint8_t b = NV50_IR_SUBOP_LOP3_LUT_SRC1; \ -+ uint8_t c = NV50_IR_SUBOP_LOP3_LUT_SRC2; \ -+ (uint8_t)(exp); \ -+}) -+#define NV50_IR_SUBOP_BMSK_C (0 << 0) -+#define NV50_IR_SUBOP_BMSK_W (1 << 0) - - #define NV50_IR_SUBOP_MINMAX_LOW 1 - #define NV50_IR_SUBOP_MINMAX_MED 2 - #define NV50_IR_SUBOP_MINMAX_HIGH 3 - -+#define NV50_IR_SUBOP_SHF_L (0 << 0) -+#define NV50_IR_SUBOP_SHF_R (1 << 0) -+#define NV50_IR_SUBOP_SHF_LO (0 << 1) -+#define NV50_IR_SUBOP_SHF_HI (1 << 1) -+#define NV50_IR_SUBOP_SHF_C (0 << 2) -+#define NV50_IR_SUBOP_SHF_W (1 << 2) -+ - // xmad(src0, src1, 0) << 16 + src2 - #define NV50_IR_SUBOP_XMAD_PSL (1 << 0) - // (xmad(src0, src1, src2) & 0xffff) | (src1 << 16) -@@ -900,7 +925,7 @@ public: - - uint16_t subOp; // quadop, 1 for mul-high, etc. - -- unsigned encSize : 4; // encoding size in bytes -+ unsigned encSize : 5; // encoding size in bytes - unsigned saturate : 1; // to [0.0f, 1.0f] - unsigned join : 1; // converge control flow (use OP_JOIN until end) - unsigned fixed : 1; // prevent dead code elimination -diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h -index 5dc0e24c5dc..63ea7f5e7e8 100644 ---- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h -+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h -@@ -29,6 +29,8 @@ - #include "tgsi/tgsi_parse.h" - #include "tgsi/tgsi_scan.h" - -+struct nir_shader_compiler_options; -+ - /* - * This struct constitutes linkage information in TGSI terminology. - * -@@ -70,10 +72,12 @@ struct nv50_ir_prog_symbol - uint32_t offset; - }; - -+#define NVISA_GF100_CHIPSET 0xc0 - #define NVISA_GK104_CHIPSET 0xe0 - #define NVISA_GK20A_CHIPSET 0xea - #define NVISA_GM107_CHIPSET 0x110 - #define NVISA_GM200_CHIPSET 0x120 -+#define NVISA_GV100_CHIPSET 0x140 - - struct nv50_ir_prog_info - { -@@ -200,6 +204,9 @@ struct nv50_ir_prog_info - extern "C" { - #endif - -+const struct nir_shader_compiler_options * -+nv50_ir_nir_shader_compiler_options(int chipset); -+ - extern int nv50_ir_generate_code(struct nv50_ir_prog_info *); - - extern void nv50_ir_relocate_code(void *relocData, uint32_t *code, -diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp -index e244bd0d610..dd8e1ab86c4 100644 ---- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp -+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp -@@ -23,6 +23,7 @@ - */ - - #include "codegen/nv50_ir_target_gm107.h" -+#include "codegen/nv50_ir_sched_gm107.h" - - //#define GM107_DEBUG_SCHED_DATA - -@@ -170,6 +171,7 @@ private: - void emitBFI(); - void emitBFE(); - void emitFLO(); -+ void emitPRMT(); - - void emitLDSTs(int, DataType); - void emitLDSTc(int); -@@ -2371,6 +2373,33 @@ CodeEmitterGM107::emitFLO() - emitGPR (0x00, insn->def(0)); - } - -+void -+CodeEmitterGM107::emitPRMT() -+{ -+ switch (insn->src(1).getFile()) { -+ case FILE_GPR: -+ emitInsn(0x5bc00000); -+ emitGPR (0x14, insn->src(1)); -+ break; -+ case FILE_MEMORY_CONST: -+ emitInsn(0x4bc00000); -+ emitCBUF(0x22, -1, 0x14, 16, 2, insn->src(1)); -+ break; -+ case FILE_IMMEDIATE: -+ emitInsn(0x36c00000); -+ emitIMMD(0x14, 19, insn->src(1)); -+ break; -+ default: -+ assert(!"bad src1 file"); -+ break; -+ } -+ -+ emitField(0x30, 3, insn->subOp); -+ emitGPR (0x27, insn->src(2)); -+ emitGPR (0x08, insn->src(0)); -+ emitGPR (0x00, insn->def(0)); -+} -+ - /******************************************************************************* - * memory - ******************************************************************************/ -@@ -3537,6 +3566,9 @@ CodeEmitterGM107::emitInstruction(Instruction *i) - case OP_BFIND: - emitFLO(); - break; -+ case OP_PERMT: -+ emitPRMT(); -+ break; - case OP_SLCT: - if (isFloatType(insn->dType)) - emitFCMP(); -@@ -3742,156 +3774,6 @@ CodeEmitterGM107::getMinEncodingSize(const Instruction *i) const - * sched data calculator - ******************************************************************************/ - --class SchedDataCalculatorGM107 : public Pass --{ --public: -- SchedDataCalculatorGM107(const TargetGM107 *targ) : targ(targ) {} -- --private: -- struct RegScores -- { -- struct ScoreData { -- int r[256]; -- int p[8]; -- int c; -- } rd, wr; -- int base; -- -- void rebase(const int base) -- { -- const int delta = this->base - base; -- if (!delta) -- return; -- this->base = 0; -- -- for (int i = 0; i < 256; ++i) { -- rd.r[i] += delta; -- wr.r[i] += delta; -- } -- for (int i = 0; i < 8; ++i) { -- rd.p[i] += delta; -- wr.p[i] += delta; -- } -- rd.c += delta; -- wr.c += delta; -- } -- void wipe() -- { -- memset(&rd, 0, sizeof(rd)); -- memset(&wr, 0, sizeof(wr)); -- } -- int getLatest(const ScoreData& d) const -- { -- int max = 0; -- for (int i = 0; i < 256; ++i) -- if (d.r[i] > max) -- max = d.r[i]; -- for (int i = 0; i < 8; ++i) -- if (d.p[i] > max) -- max = d.p[i]; -- if (d.c > max) -- max = d.c; -- return max; -- } -- inline int getLatestRd() const -- { -- return getLatest(rd); -- } -- inline int getLatestWr() const -- { -- return getLatest(wr); -- } -- inline int getLatest() const -- { -- return MAX2(getLatestRd(), getLatestWr()); -- } -- void setMax(const RegScores *that) -- { -- for (int i = 0; i < 256; ++i) { -- rd.r[i] = MAX2(rd.r[i], that->rd.r[i]); -- wr.r[i] = MAX2(wr.r[i], that->wr.r[i]); -- } -- for (int i = 0; i < 8; ++i) { -- rd.p[i] = MAX2(rd.p[i], that->rd.p[i]); -- wr.p[i] = MAX2(wr.p[i], that->wr.p[i]); -- } -- rd.c = MAX2(rd.c, that->rd.c); -- wr.c = MAX2(wr.c, that->wr.c); -- } -- void print(int cycle) -- { -- for (int i = 0; i < 256; ++i) { -- if (rd.r[i] > cycle) -- INFO("rd $r%i @ %i\n", i, rd.r[i]); -- if (wr.r[i] > cycle) -- INFO("wr $r%i @ %i\n", i, wr.r[i]); -- } -- for (int i = 0; i < 8; ++i) { -- if (rd.p[i] > cycle) -- INFO("rd $p%i @ %i\n", i, rd.p[i]); -- if (wr.p[i] > cycle) -- INFO("wr $p%i @ %i\n", i, wr.p[i]); -- } -- if (rd.c > cycle) -- INFO("rd $c @ %i\n", rd.c); -- if (wr.c > cycle) -- INFO("wr $c @ %i\n", wr.c); -- } -- }; -- -- RegScores *score; // for current BB -- std::vector scoreBoards; -- -- const TargetGM107 *targ; -- bool visit(Function *); -- bool visit(BasicBlock *); -- -- void commitInsn(const Instruction *, int); -- int calcDelay(const Instruction *, int) const; -- void setDelay(Instruction *, int, const Instruction *); -- void recordWr(const Value *, int, int); -- void checkRd(const Value *, int, int&) const; -- -- inline void emitYield(Instruction *); -- inline void emitStall(Instruction *, uint8_t); -- inline void emitReuse(Instruction *, uint8_t); -- inline void emitWrDepBar(Instruction *, uint8_t); -- inline void emitRdDepBar(Instruction *, uint8_t); -- inline void emitWtDepBar(Instruction *, uint8_t); -- -- inline int getStall(const Instruction *) const; -- inline int getWrDepBar(const Instruction *) const; -- inline int getRdDepBar(const Instruction *) const; -- inline int getWtDepBar(const Instruction *) const; -- -- void setReuseFlag(Instruction *); -- -- inline void printSchedInfo(int, const Instruction *) const; -- -- struct LiveBarUse { -- LiveBarUse(Instruction *insn, Instruction *usei) -- : insn(insn), usei(usei) { } -- Instruction *insn; -- Instruction *usei; -- }; -- -- struct LiveBarDef { -- LiveBarDef(Instruction *insn, Instruction *defi) -- : insn(insn), defi(defi) { } -- Instruction *insn; -- Instruction *defi; -- }; -- -- bool insertBarriers(BasicBlock *); -- -- bool doesInsnWriteTo(const Instruction *insn, const Value *val) const; -- Instruction *findFirstUse(const Instruction *) const; -- Instruction *findFirstDef(const Instruction *) const; -- -- bool needRdDepBar(const Instruction *) const; -- bool needWrDepBar(const Instruction *) const; --}; -- - inline void - SchedDataCalculatorGM107::emitStall(Instruction *insn, uint8_t cnt) - { -diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gv100.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gv100.cpp -new file mode 100644 -index 00000000000..ef33743e610 ---- /dev/null -+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gv100.cpp -@@ -0,0 +1,2052 @@ -+/* -+ * Copyright 2020 Red Hat Inc. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice shall be included in -+ * all copies or substantial portions of the Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -+ * OTHER DEALINGS IN THE SOFTWARE. -+ */ -+#include "codegen/nv50_ir_emit_gv100.h" -+#include "codegen/nv50_ir_sched_gm107.h" -+ -+namespace nv50_ir { -+ -+/******************************************************************************* -+ * instruction format helpers -+ ******************************************************************************/ -+ -+#define FA_NODEF (1 << 0) -+#define FA_RRR (1 << 1) -+#define FA_RRI (1 << 2) -+#define FA_RRC (1 << 3) -+#define FA_RIR (1 << 4) -+#define FA_RCR (1 << 5) -+ -+#define FA_SRC_MASK 0x0ff -+#define FA_SRC_NEG 0x100 -+#define FA_SRC_ABS 0x200 -+ -+#define EMPTY -1 -+#define __(a) (a) // no source modifiers -+#define _A(a) ((a) | FA_SRC_ABS) -+#define N_(a) ((a) | FA_SRC_NEG) -+#define NA(a) ((a) | FA_SRC_NEG | FA_SRC_ABS) -+ -+void -+CodeEmitterGV100::emitFormA_I32(int src) -+{ -+ emitIMMD(32, 32, insn->src(src)); -+ if (insn->src(src).mod.abs()) -+ code[1] &= 0x7fffffff; -+ if (insn->src(src).mod.neg()) -+ code[1] ^= 0x80000000; -+} -+ -+void -+CodeEmitterGV100::emitFormA_RRC(uint16_t op, int src1, int src2) -+{ -+ emitInsn(op); -+ if (src1 >= 0) { -+ emitNEG (75, (src1 & FA_SRC_MASK), (src1 & FA_SRC_NEG)); -+ emitABS (74, (src1 & FA_SRC_MASK), (src1 & FA_SRC_ABS)); -+ emitGPR (64, insn->src(src1 & FA_SRC_MASK)); -+ } -+ if (src2 >= 0) { -+ emitNEG (63, (src2 & FA_SRC_MASK), (src2 & FA_SRC_NEG)); -+ emitABS (62, (src2 & FA_SRC_MASK), (src2 & FA_SRC_ABS)); -+ emitCBUF(54, -1, 38, 0, 2, insn->src(src2 & FA_SRC_MASK)); -+ } -+} -+ -+void -+CodeEmitterGV100::emitFormA_RRI(uint16_t op, int src1, int src2) -+{ -+ emitInsn(op); -+ if (src1 >= 0) { -+ emitNEG (75, (src1 & FA_SRC_MASK), (src1 & FA_SRC_NEG)); -+ emitABS (74, (src1 & FA_SRC_MASK), (src1 & FA_SRC_ABS)); -+ emitGPR (64, insn->src(src1 & FA_SRC_MASK)); -+ } -+ if (src2 >= 0) -+ emitFormA_I32(src2 & FA_SRC_MASK); -+} -+ -+void -+CodeEmitterGV100::emitFormA_RRR(uint16_t op, int src1, int src2) -+{ -+ emitInsn(op); -+ if (src2 >= 0) { -+ emitNEG (75, (src2 & FA_SRC_MASK), (src2 & FA_SRC_NEG)); -+ emitABS (74, (src2 & FA_SRC_MASK), (src2 & FA_SRC_ABS)); -+ emitGPR (64, insn->src(src2 & FA_SRC_MASK)); -+ } -+ -+ if (src1 >= 0) { -+ emitNEG (63, (src1 & FA_SRC_MASK), (src1 & FA_SRC_NEG)); -+ emitABS (62, (src1 & FA_SRC_MASK), (src1 & FA_SRC_ABS)); -+ emitGPR (32, insn->src(src1 & FA_SRC_MASK)); -+ } -+} -+ -+void -+CodeEmitterGV100::emitFormA(uint16_t op, uint8_t forms, -+ int src0, int src1, int src2) -+{ -+ switch ((src1 < 0) ? FILE_GPR : insn->src(src1 & FA_SRC_MASK).getFile()) { -+ case FILE_GPR: -+ switch ((src2 < 0) ? FILE_GPR : insn->src(src2 & FA_SRC_MASK).getFile()) { -+ case FILE_GPR: -+ assert(forms & FA_RRR); -+ emitFormA_RRR((1 << 9) | op, src1, src2); -+ break; -+ case FILE_IMMEDIATE: -+ assert(forms & FA_RRI); -+ emitFormA_RRI((2 << 9) | op, src1, src2); -+ break; -+ case FILE_MEMORY_CONST: -+ assert(forms & FA_RRC); -+ emitFormA_RRC((3 << 9) | op, src1, src2); -+ break; -+ default: -+ assert(!"bad src2 file"); -+ break; -+ } -+ break; -+ case FILE_IMMEDIATE: -+ assert((src2 < 0) || insn->src(src2 & FA_SRC_MASK).getFile() == FILE_GPR); -+ assert(forms & FA_RIR); -+ emitFormA_RRI((4 << 9) | op, src2, src1); -+ break; -+ case FILE_MEMORY_CONST: -+ assert((src2 < 0) || insn->src(src2 & FA_SRC_MASK).getFile() == FILE_GPR); -+ assert(forms & FA_RCR); -+ emitFormA_RRC((5 << 9) | op, src2, src1); -+ break; -+ default: -+ assert(!"bad src1 file"); -+ break; -+ } -+ -+ if (src0 >= 0) { -+ assert(insn->src(src0 & FA_SRC_MASK).getFile() == FILE_GPR); -+ emitABS(73, (src0 & FA_SRC_MASK), (src0 & FA_SRC_ABS)); -+ emitNEG(72, (src0 & FA_SRC_MASK), (src0 & FA_SRC_NEG)); -+ emitGPR(24, insn->src(src0 & FA_SRC_MASK)); -+ } -+ -+ if (!(forms & FA_NODEF)) -+ emitGPR(16, insn->def(0)); -+} -+ -+/******************************************************************************* -+ * control -+ ******************************************************************************/ -+ -+void -+CodeEmitterGV100::emitBRA() -+{ -+ const FlowInstruction *insn = this->insn->asFlow(); -+ int64_t target = ((int64_t)insn->target.bb->binPos - (codeSize + 0x10)) / 4; -+ -+ assert(!insn->indirect && !insn->absolute); -+ -+ emitInsn (0x947); -+ emitField(34, 48, target); -+ emitPRED (87); -+ emitField(86, 2, 0); // ./.INC/.DEC -+} -+ -+void -+CodeEmitterGV100::emitEXIT() -+{ -+ emitInsn (0x94d); -+ emitNOT (90); -+ emitPRED (87); -+ emitField(85, 1, 0); // .NO_ATEXIT -+ emitField(84, 2, 0); // ./.KEEPREFCOUNT/.PREEMPTED/.INVALID3 -+} -+ -+void -+CodeEmitterGV100::emitKILL() -+{ -+ emitInsn(0x95b); -+ emitPRED(87); -+} -+ -+void -+CodeEmitterGV100::emitNOP() -+{ -+ emitInsn(0x918); -+} -+ -+void -+CodeEmitterGV100::emitWARPSYNC() -+{ -+ emitFormA(0x148, FA_NODEF | FA_RRR | FA_RIR | FA_RCR, EMPTY, __(0), EMPTY); -+ emitNOT (90); -+ emitPRED (87); -+} -+ -+/******************************************************************************* -+ * movement / conversion -+ ******************************************************************************/ -+ -+void -+CodeEmitterGV100::emitCS2R() -+{ -+ emitInsn(0x805); -+ emitSYS (72, insn->src(0)); -+ emitGPR (16, insn->def(0)); -+} -+ -+void -+CodeEmitterGV100::emitF2F() -+{ -+ if (typeSizeof(insn->sType) != 8 && typeSizeof(insn->dType) != 8) -+ emitFormA(0x104, FA_RRR | FA_RIR | FA_RCR, EMPTY, NA(0), EMPTY); -+ else -+ emitFormA(0x110, FA_RRR | FA_RIR | FA_RCR, EMPTY, NA(0), EMPTY); -+ emitField(84, 2, util_logbase2(typeSizeof(insn->sType))); -+ emitFMZ (80, 1); -+ emitRND (78); -+ emitField(75, 2, util_logbase2(typeSizeof(insn->dType))); -+ emitField(60, 2, insn->subOp); // ./.H1/.INVALID2/.INVALID3 -+} -+ -+void -+CodeEmitterGV100::emitF2I() -+{ -+ if (typeSizeof(insn->sType) != 8 && typeSizeof(insn->dType) != 8) -+ emitFormA(0x105, FA_RRR | FA_RIR | FA_RCR, EMPTY, NA(0), EMPTY); -+ else -+ emitFormA(0x111, FA_RRR | FA_RIR | FA_RCR, EMPTY, NA(0), EMPTY); -+ emitField(84, 2, util_logbase2(typeSizeof(insn->sType))); -+ emitFMZ (80, 1); -+ emitRND (78); -+ emitField(77, 1, 0); // .NTZ -+ emitField(75, 2, util_logbase2(typeSizeof(insn->dType))); -+ emitField(72, 1, isSignedType(insn->dType)); -+} -+ -+void -+CodeEmitterGV100::emitFRND() -+{ -+ int subop = 0; -+ -+ switch (insn->op) { -+ case OP_CVT: -+ switch (insn->rnd) { -+ case ROUND_NI: subop = 0; break; -+ case ROUND_MI: subop = 1; break; -+ case ROUND_PI: subop = 2; break; -+ case ROUND_ZI: subop = 3; break; -+ default: -+ assert(!"invalid FRND mode"); -+ break; -+ } -+ break; -+ case OP_FLOOR: subop = 1; break; -+ case OP_CEIL : subop = 2; break; -+ case OP_TRUNC: subop = 3; break; -+ default: -+ assert(!"invalid FRND opcode"); -+ break; -+ } -+ -+ if (typeSizeof(insn->sType) != 8 && typeSizeof(insn->dType) != 8) -+ emitFormA(0x107, FA_RRR | FA_RIR | FA_RCR, EMPTY, NA(0), EMPTY); -+ else -+ emitFormA(0x113, FA_RRR | FA_RIR | FA_RCR, EMPTY, NA(0), EMPTY); -+ emitField(84, 2, util_logbase2(typeSizeof(insn->sType))); -+ emitFMZ (80, 1); -+ emitField(78, 2, subop); -+ emitField(75, 2, util_logbase2(typeSizeof(insn->dType))); -+} -+ -+void -+CodeEmitterGV100::emitI2F() -+{ -+ if (typeSizeof(insn->sType) != 8 && typeSizeof(insn->dType) != 8) -+ emitFormA(0x106, FA_RRR | FA_RIR | FA_RCR, EMPTY, __(0), EMPTY); -+ else -+ emitFormA(0x112, FA_RRR | FA_RIR | FA_RCR, EMPTY, __(0), EMPTY); -+ emitField(84, 2, util_logbase2(typeSizeof(insn->sType))); -+ emitRND (78); -+ emitField(75, 2, util_logbase2(typeSizeof(insn->dType))); -+ emitField(74, 1, isSignedType(insn->sType)); -+ if (typeSizeof(insn->sType) == 2) -+ emitField(60, 2, insn->subOp >> 1); -+ else -+ emitField(60, 2, insn->subOp); // ./.B1/.B2/.B3 -+} -+ -+void -+CodeEmitterGV100::emitMOV() -+{ -+ switch (insn->def(0).getFile()) { -+ case FILE_GPR: -+ switch (insn->src(0).getFile()) { -+ case FILE_GPR: -+ case FILE_MEMORY_CONST: -+ case FILE_IMMEDIATE: -+ emitFormA(0x002, FA_RRR | FA_RIR | FA_RCR, EMPTY, __(0), EMPTY); -+ emitField(72, 4, insn->lanes); -+ break; -+ case FILE_PREDICATE: -+ emitInsn (0x807); -+ emitGPR (16, insn->def(0)); -+ emitGPR (24); -+ emitField(32, 32, 0xffffffff); -+ emitField(90, 1, 1); -+ emitPRED (87, insn->src(0)); -+ break; -+ default: -+ assert(!"bad src file"); -+ break; -+ } -+ break; -+ case FILE_PREDICATE: -+ emitInsn (0x20c); -+ emitPRED (87); -+ emitPRED (84); -+ emitNOT (71); -+ emitPRED (68); -+ emitPRED (81, insn->def(0)); -+ emitCond3(76, CC_NE); -+ emitGPR (24, insn->src(0)); -+ emitGPR (32); -+ break; -+ default: -+ assert(!"bad dst file"); -+ break; -+ } -+} -+ -+void -+CodeEmitterGV100::emitPRMT() -+{ -+ emitFormA(0x016, FA_RRR | FA_RRI | FA_RRC | FA_RIR | FA_RCR, __(0), __(1), __(2)); -+ emitField(72, 3, insn->subOp); -+} -+ -+void -+CodeEmitterGV100::emitS2R() -+{ -+ emitInsn(0x919); -+ emitSYS (72, insn->src(0)); -+ emitGPR (16, insn->def(0)); -+} -+ -+static void -+selpFlip(const FixupEntry *entry, uint32_t *code, const FixupData& data) -+{ -+ int loc = entry->loc; -+ if (data.force_persample_interp) -+ code[loc + 2] |= 1 << 26; -+ else -+ code[loc + 2] &= ~(1 << 26); -+} -+ -+void -+CodeEmitterGV100::emitSEL() -+{ -+ emitFormA(0x007, FA_RRR | FA_RIR | FA_RCR, __(0), __(1), EMPTY); -+ emitNOT (90, insn->src(2)); -+ emitPRED (87, insn->src(2)); -+ if (insn->subOp == 1) -+ addInterp(0, 0, selpFlip); -+} -+ -+void -+CodeEmitterGV100::emitSHFL() -+{ -+ switch (insn->src(1).getFile()) { -+ case FILE_GPR: -+ switch (insn->src(2).getFile()) { -+ case FILE_GPR: -+ emitInsn(0x389); -+ emitGPR (64, insn->src(2)); -+ break; -+ case FILE_IMMEDIATE: -+ emitInsn(0x589); -+ emitIMMD(40, 13, insn->src(2)); -+ break; -+ default: -+ assert(!"bad src2 file"); -+ break; -+ } -+ emitGPR(32, insn->src(1)); -+ break; -+ case FILE_IMMEDIATE: -+ switch (insn->src(2).getFile()) { -+ case FILE_GPR: -+ emitInsn(0x989); -+ emitGPR (64, insn->src(2)); -+ break; -+ case FILE_IMMEDIATE: -+ emitInsn(0xf89); -+ emitIMMD(40, 13, insn->src(2)); -+ break; -+ default: -+ assert(!"bad src2 file"); -+ break; -+ } -+ emitIMMD(53, 5, insn->src(1)); -+ break; -+ default: -+ assert(!"bad src1 file"); -+ break; -+ } -+ -+ if (insn->defExists(1)) -+ emitPRED(81, insn->def(1)); -+ else -+ emitPRED(81); -+ -+ emitField(58, 2, insn->subOp); -+ emitGPR (24, insn->src(0)); -+ emitGPR (16, insn->def(0)); -+} -+ -+/******************************************************************************* -+ * fp32 -+ ******************************************************************************/ -+ -+void -+CodeEmitterGV100::emitFADD() -+{ -+ if (insn->src(1).getFile() == FILE_GPR) -+ emitFormA(0x021, FA_RRR , NA(0), NA(1), EMPTY); -+ else -+ emitFormA(0x021, FA_RRI | FA_RRC, NA(0), EMPTY, NA(1)); -+ emitFMZ (80, 1); -+ emitRND (78); -+ emitSAT (77); -+} -+ -+void -+CodeEmitterGV100::emitFFMA() -+{ -+ emitFormA(0x023, FA_RRR | FA_RRI | FA_RRC | FA_RIR | FA_RCR, NA(0), NA(1), NA(2)); -+ emitField(80, 1, insn->ftz); -+ emitRND (78); -+ emitSAT (77); -+ emitField(76, 1, insn->dnz); -+} -+ -+void -+CodeEmitterGV100::emitFMNMX() -+{ -+ emitFormA(0x009, FA_RRR | FA_RIR | FA_RCR, NA(0), NA(1), EMPTY); -+ emitField(90, 1, insn->op == OP_MAX); -+ emitPRED (87); -+ emitFMZ (80, 1); -+} -+ -+void -+CodeEmitterGV100::emitFMUL() -+{ -+ emitFormA(0x020, FA_RRR | FA_RIR | FA_RCR, NA(0), NA(1), EMPTY); -+ emitField(80, 1, insn->ftz); -+ emitPDIV (84); -+ emitRND (78); -+ emitSAT (77); -+ emitField(76, 1, insn->dnz); -+} -+ -+void -+CodeEmitterGV100::emitFSET_BF() -+{ -+ const CmpInstruction *insn = this->insn->asCmp(); -+ -+ emitFormA(0x00a, FA_RRR | FA_RIR | FA_RCR, NA(0), NA(1), EMPTY); -+ emitFMZ (80, 1); -+ emitCond4(76, insn->setCond); -+ -+ if (insn->op != OP_SET) { -+ switch (insn->op) { -+ case OP_SET_AND: emitField(74, 2, 0); break; -+ case OP_SET_OR : emitField(74, 2, 1); break; -+ case OP_SET_XOR: emitField(74, 2, 2); break; -+ default: -+ assert(!"invalid set op"); -+ break; -+ } -+ emitNOT (90, insn->src(2)); -+ emitPRED(87, insn->src(2)); -+ } else { -+ emitPRED(87); -+ } -+} -+ -+void -+CodeEmitterGV100::emitFSETP() -+{ -+ const CmpInstruction *insn = this->insn->asCmp(); -+ -+ emitFormA(0x00b, FA_NODEF | FA_RRR | FA_RIR | FA_RCR, NA(0), NA(1), EMPTY); -+ emitFMZ (80, 1); -+ emitCond4(76, insn->setCond); -+ -+ if (insn->op != OP_SET) { -+ switch (insn->op) { -+ case OP_SET_AND: emitField(74, 2, 0); break; -+ case OP_SET_OR : emitField(74, 2, 1); break; -+ case OP_SET_XOR: emitField(74, 2, 2); break; -+ default: -+ assert(!"invalid set op"); -+ break; -+ } -+ emitNOT (90, insn->src(2)); -+ emitPRED(87, insn->src(2)); -+ } else { -+ emitPRED(87); -+ } -+ -+ if (insn->defExists(1)) -+ emitPRED(84, insn->def(1)); -+ else -+ emitPRED(84); -+ emitPRED(81, insn->def(0)); -+} -+ -+void -+CodeEmitterGV100::emitFSWZADD() -+{ -+ uint8_t subOp = 0; -+ -+ // NP/PN swapped vs SM60 -+ for (int i = 0; i < 4; i++) { -+ uint8_t p = ((insn->subOp >> (i * 2)) & 3); -+ if (p == 1 || p == 2) -+ p ^= 3; -+ subOp |= p << (i * 2); -+ } -+ -+ emitInsn (0x822); -+ emitFMZ (80, 1); -+ emitRND (78); -+ emitField(77, 1, insn->lanes); /* abused for .ndv */ -+ emitGPR (64, insn->src(1)); -+ emitField(32, 8, subOp); -+ emitGPR (24, insn->src(0)); -+ emitGPR (16, insn->def(0)); -+} -+ -+void -+CodeEmitterGV100::emitMUFU() -+{ -+ int mufu = 0; -+ -+ switch (insn->op) { -+ case OP_COS : mufu = 0; break; -+ case OP_SIN : mufu = 1; break; -+ case OP_EX2 : mufu = 2; break; -+ case OP_LG2 : mufu = 3; break; -+ case OP_RCP : mufu = 4 + 2 * insn->subOp; break; -+ case OP_RSQ : mufu = 5 + 2 * insn->subOp; break; -+ case OP_SQRT: mufu = 8; break; -+ default: -+ assert(!"invalid mufu"); -+ break; -+ } -+ -+ emitFormA(0x108, FA_RRR | FA_RIR | FA_RCR, EMPTY, NA(0), EMPTY); -+ emitField(74, 4, mufu); -+} -+ -+/******************************************************************************* -+ * fp64 -+ ******************************************************************************/ -+ -+void -+CodeEmitterGV100::emitDADD() -+{ -+ emitFormA(0x029, FA_RRR | FA_RRI | FA_RRC, NA(0), EMPTY, NA(1)); -+ emitRND(78); -+} -+ -+void -+CodeEmitterGV100::emitDFMA() -+{ -+ emitFormA(0x02b, FA_RRR | FA_RRI | FA_RRC | FA_RIR | FA_RCR, NA(0), NA(1), NA(2)); -+ emitRND(78); -+} -+ -+void -+CodeEmitterGV100::emitDMUL() -+{ -+ emitFormA(0x028, FA_RRR | FA_RIR | FA_RCR, NA(0), NA(1), EMPTY); -+ emitRND(78); -+} -+ -+void -+CodeEmitterGV100::emitDSETP() -+{ -+ const CmpInstruction *insn = this->insn->asCmp(); -+ -+ if (insn->src(1).getFile() == FILE_GPR) -+ emitFormA(0x02a, FA_NODEF | FA_RRR , NA(0), NA(1), EMPTY); -+ else -+ emitFormA(0x02a, FA_NODEF | FA_RRI | FA_RRC, NA(0), EMPTY, NA(1)); -+ -+ if (insn->op != OP_SET) { -+ switch (insn->op) { -+ case OP_SET_AND: emitField(74, 2, 0); break; -+ case OP_SET_OR : emitField(74, 2, 1); break; -+ case OP_SET_XOR: emitField(74, 2, 2); break; -+ default: -+ assert(!"invalid set op"); -+ break; -+ } -+ emitNOT (90, insn->src(2)); -+ emitPRED(87, insn->src(2)); -+ } else { -+ emitPRED(87); -+ } -+ -+ if (insn->defExists(1)) -+ emitPRED(84, insn->def(1)); -+ else -+ emitPRED(84); -+ emitPRED (81, insn->def(0)); -+ emitCond4(76, insn->setCond); -+} -+ -+/******************************************************************************* -+ * integer -+ ******************************************************************************/ -+ -+void -+CodeEmitterGV100::emitBMSK() -+{ -+ emitFormA(0x01b, FA_RRR | FA_RIR | FA_RCR, __(0), __(1), EMPTY); -+ emitField(75, 1, insn->subOp); // .C/.W -+} -+ -+void -+CodeEmitterGV100::emitBREV() -+{ -+ emitFormA(0x101, FA_RRR | FA_RIR | FA_RCR, EMPTY, __(0), EMPTY); -+} -+ -+void -+CodeEmitterGV100::emitFLO() -+{ -+ emitFormA(0x100, FA_RRR | FA_RIR | FA_RCR, EMPTY, __(0), EMPTY); -+ emitPRED (81); -+ emitField(74, 1, insn->subOp == NV50_IR_SUBOP_BFIND_SAMT); -+ emitField(73, 1, isSignedType(insn->dType)); -+ emitNOT (63, insn->src(0)); -+} -+ -+void -+CodeEmitterGV100::emitIABS() -+{ -+ emitFormA(0x013, FA_RRR | FA_RIR | FA_RCR, EMPTY, __(0), EMPTY); -+} -+ -+void -+CodeEmitterGV100::emitIADD3() -+{ -+// emitFormA(0x010, FA_RRR | FA_RIR | FA_RCR, N_(0), N_(1), N_(2)); -+ emitFormA(0x010, FA_RRR | FA_RIR | FA_RCR, N_(0), N_(1), EMPTY); -+ emitGPR (64); //XXX: fix when switching back to N_(2) -+ emitPRED (84, NULL); // .CC1 -+ emitPRED (81, insn->flagsDef >= 0 ? insn->getDef(insn->flagsDef) : NULL); -+ if (insn->flagsSrc >= 0) { -+ emitField(74, 1, 1); // .X -+ emitPRED (87, insn->getSrc(insn->flagsSrc)); -+ emitField(77, 4, 0xf); // .X1 -+ } -+} -+ -+void -+CodeEmitterGV100::emitIMAD() -+{ -+ emitFormA(0x024, FA_RRR | FA_RRI | FA_RRC | FA_RIR | FA_RCR, __(0), __(1), N_(2)); -+ emitField(73, 1, isSignedType(insn->sType)); -+} -+ -+void -+CodeEmitterGV100::emitIMAD_WIDE() -+{ -+ emitFormA(0x025, FA_RRR | FA_RRC | FA_RIR | FA_RCR, __(0), __(1), N_(2)); -+ emitPRED (81); -+ emitField(73, 1, isSignedType(insn->sType)); -+} -+ -+void -+CodeEmitterGV100::emitISETP() -+{ -+ const CmpInstruction *insn = this->insn->asCmp(); -+ -+ emitFormA(0x00c, FA_NODEF | FA_RRR | FA_RIR | FA_RCR, __(0), __(1), EMPTY); -+ -+ if (insn->op != OP_SET) { -+ switch (insn->op) { -+ case OP_SET_AND: emitField(74, 2, 0); break; -+ case OP_SET_OR : emitField(74, 2, 1); break; -+ case OP_SET_XOR: emitField(74, 2, 2); break; -+ default: -+ assert(!"invalid set op"); -+ break; -+ } -+ emitNOT (90, insn->src(2)); -+ emitPRED(87, insn->src(2)); -+ } else { -+ emitPRED(87); -+ } -+ -+ //XXX: CC->pred -+ if (insn->flagsSrc >= 0) { -+ assert(0); -+ emitField(68, 4, 6); -+ } else { -+ emitNOT (71); -+ if (!insn->subOp) -+ emitPRED(68); -+ } -+ -+ if (insn->defExists(1)) -+ emitPRED(84, insn->def(1)); -+ else -+ emitPRED(84); -+ emitPRED (81, insn->def(0)); -+ emitCond3(76, insn->setCond); -+ emitField(73, 1, isSignedType(insn->sType)); -+ -+ if (insn->subOp) { // .EX -+ assert(0); -+ emitField(72, 1, 1); -+ emitPRED (68, insn->srcExists(3) ? insn->src(3) : insn->src(2)); -+ } -+} -+ -+void -+CodeEmitterGV100::emitLEA() -+{ -+ assert(insn->src(1).get()->asImm()); -+ -+ emitFormA(0x011, FA_RRR | FA_RIR | FA_RCR, N_(0), N_(2), EMPTY); -+ emitPRED (81); -+ emitIMMD (75, 5, insn->src(1)); -+ emitGPR (64); -+} -+ -+void -+CodeEmitterGV100::emitLOP3_LUT() -+{ -+ emitFormA(0x012, FA_RRR | FA_RIR | FA_RCR, __(0), __(1), __(2)); -+ emitField(90, 1, 1); -+ emitPRED (87); -+ emitPRED (81); -+ emitField(80, 1, 0); // .PAND -+ emitField(72, 8, insn->subOp); -+} -+ -+void -+CodeEmitterGV100::emitPOPC() -+{ -+ emitFormA(0x109, FA_RRR | FA_RIR | FA_RCR, EMPTY, __(0), EMPTY); -+ emitNOT (63, insn->src(0)); -+} -+ -+void -+CodeEmitterGV100::emitSGXT() -+{ -+ emitFormA(0x01a, FA_RRR | FA_RIR | FA_RCR, __(0), __(1), EMPTY); -+ emitField(75, 1, 0); // .W -+ emitField(73, 1, 1); // /.U32 -+} -+ -+void -+CodeEmitterGV100::emitSHF() -+{ -+ emitFormA(0x019, FA_RRR | FA_RRI | FA_RRC | FA_RIR | FA_RCR, __(0), __(1), __(2)); -+ emitField(80, 1, !!(insn->subOp & NV50_IR_SUBOP_SHF_HI)); -+ emitField(76, 1, !!(insn->subOp & NV50_IR_SUBOP_SHF_R)); -+ emitField(75, 1, !!(insn->subOp & NV50_IR_SUBOP_SHF_W)); -+ -+ switch (insn->sType) { -+ case TYPE_S64: emitField(73, 2, 0); break; -+ case TYPE_U64: emitField(73, 2, 1); break; -+ case TYPE_S32: emitField(73, 2, 2); break; -+ case TYPE_U32: -+ default: -+ emitField(73, 2, 3); -+ break; -+ } -+} -+ -+/******************************************************************************* -+ * load/stores -+ ******************************************************************************/ -+ -+void -+CodeEmitterGV100::emitALD() -+{ -+ emitInsn (0x321); -+ emitField(74, 2, (insn->getDef(0)->reg.size / 4) - 1); -+ emitGPR (32, insn->src(0).getIndirect(1)); -+ emitO (79); -+ emitP (76); -+ emitADDR (24, 40, 10, 0, insn->src(0)); -+ emitGPR (16, insn->def(0)); -+} -+ -+void -+CodeEmitterGV100::emitAST() -+{ -+ emitInsn (0x322); -+ emitField(74, 2, (typeSizeof(insn->dType) / 4) - 1); -+ emitGPR (64, insn->src(0).getIndirect(1)); -+ emitP (76); -+ emitADDR (24, 40, 10, 0, insn->src(0)); -+ emitGPR (32, insn->src(1)); -+} -+ -+void -+CodeEmitterGV100::emitATOM() -+{ -+ unsigned subOp, dType; -+ -+ if (insn->subOp != NV50_IR_SUBOP_ATOM_CAS) { -+ emitInsn(0x38a); -+ -+ if (insn->subOp == NV50_IR_SUBOP_ATOM_EXCH) -+ subOp = 8; -+ else -+ subOp = insn->subOp; -+ emitField(87, 4, subOp); -+ -+ switch (insn->dType) { -+ case TYPE_U32 : dType = 0; break; -+ case TYPE_S32 : dType = 1; break; -+ case TYPE_U64 : dType = 2; break; -+ case TYPE_F32 : dType = 3; break; -+ case TYPE_B128: dType = 4; break; -+ case TYPE_S64 : dType = 5; break; -+ default: -+ assert(!"unexpected dType"); -+ dType = 0; -+ break; -+ } -+ emitField(73, 3, dType); -+ } else { -+ emitInsn(0x38b); -+ -+ switch (insn->dType) { -+ case TYPE_U32: dType = 0; break; -+ case TYPE_U64: dType = 2; break; -+ default: -+ assert(!"unexpected dType"); -+ dType = 0; -+ break; -+ } -+ emitField(73, 3, dType); -+ emitGPR (64, insn->src(2)); -+ } -+ -+ emitPRED (81); -+ emitField(79, 2, 1); -+ emitField(72, 1, insn->src(0).getIndirect(0)->getSize() == 8); -+ emitGPR (32, insn->src(1)); -+ emitADDR (24, 40, 24, 0, insn->src(0)); -+ emitGPR (16, insn->def(0)); -+} -+ -+void -+CodeEmitterGV100::emitATOMS() -+{ -+ unsigned dType, subOp; -+ -+ if (insn->subOp == NV50_IR_SUBOP_ATOM_CAS) { -+ switch (insn->dType) { -+ case TYPE_U32: dType = 0; break; -+ case TYPE_S32: dType = 1; break; -+ case TYPE_U64: dType = 2; break; -+ default: assert(!"unexpected dType"); dType = 0; break; -+ } -+ -+ emitInsn (0x38d); -+ emitField(87, 1, 0); // ATOMS.CAS/ATOMS.CAST -+ emitField(73, 2, dType); -+ emitGPR (64, insn->src(2)); -+ } else { -+ emitInsn(0x38c); -+ -+ if (insn->subOp == NV50_IR_SUBOP_ATOM_EXCH) -+ subOp = 8; -+ else -+ subOp = insn->subOp; -+ emitField(87, 4, subOp); -+ -+ switch (insn->dType) { -+ case TYPE_U32: dType = 0; break; -+ case TYPE_S32: dType = 1; break; -+ case TYPE_U64: dType = 2; break; -+ default: assert(!"unexpected dType"); dType = 0; break; -+ } -+ -+ emitField(73, 2, dType); -+ } -+ -+ emitGPR (32, insn->src(1)); -+ emitADDR (24, 40, 24, 0, insn->src(0)); -+ emitGPR (16, insn->def(0)); -+} -+ -+static void -+interpApply(const FixupEntry *entry, uint32_t *code, const FixupData& data) -+{ -+ int ipa = entry->ipa; -+ int loc = entry->loc; -+ -+ if (data.force_persample_interp && -+ (ipa & NV50_IR_INTERP_SAMPLE_MASK) == NV50_IR_INTERP_DEFAULT && -+ (ipa & NV50_IR_INTERP_MODE_MASK) != NV50_IR_INTERP_FLAT) { -+ ipa |= NV50_IR_INTERP_CENTROID; -+ } -+ -+ int sample; -+ switch (ipa & NV50_IR_INTERP_SAMPLE_MASK) { -+ case NV50_IR_INTERP_DEFAULT : sample = 0; break; -+ case NV50_IR_INTERP_CENTROID: sample = 1; break; -+ case NV50_IR_INTERP_OFFSET : sample = 2; break; -+ default: assert(!"invalid sample mode"); -+ } -+ -+ int interp; -+ switch (ipa & NV50_IR_INTERP_MODE_MASK) { -+ case NV50_IR_INTERP_LINEAR : -+ case NV50_IR_INTERP_PERSPECTIVE: interp = 0; break; -+ case NV50_IR_INTERP_FLAT : interp = 1; break; -+ case NV50_IR_INTERP_SC : interp = 2; break; -+ default: assert(!"invalid ipa mode"); -+ } -+ -+ code[loc + 2] &= ~(0xf << 12); -+ code[loc + 2] |= sample << 12; -+ code[loc + 2] |= interp << 14; -+} -+ -+void -+CodeEmitterGV100::emitIPA() -+{ -+ emitInsn (0x326); -+ emitPRED (81, insn->defExists(1) ? insn->def(1) : NULL); -+ -+ switch (insn->getInterpMode()) { -+ case NV50_IR_INTERP_LINEAR : -+ case NV50_IR_INTERP_PERSPECTIVE: emitField(78, 2, 0); break; -+ case NV50_IR_INTERP_FLAT : emitField(78, 2, 1); break; -+ case NV50_IR_INTERP_SC : emitField(78, 2, 2); break; -+ default: -+ assert(!"invalid ipa mode"); -+ break; -+ } -+ -+ switch (insn->getSampleMode()) { -+ case NV50_IR_INTERP_DEFAULT : emitField(76, 2, 0); break; -+ case NV50_IR_INTERP_CENTROID: emitField(76, 2, 1); break; -+ case NV50_IR_INTERP_OFFSET : emitField(76, 2, 2); break; -+ default: -+ assert(!"invalid sample mode"); -+ break; -+ } -+ -+ if (insn->getSampleMode() != NV50_IR_INTERP_OFFSET) { -+ emitGPR (32); -+ addInterp(insn->ipa, 0xff, interpApply); -+ } else { -+ emitGPR (32, insn->src(1)); -+ addInterp(insn->ipa, insn->getSrc(1)->reg.data.id, interpApply); -+ } -+ -+ assert(!insn->src(0).isIndirect(0)); -+ emitADDR (-1, 64, 8, 2, insn->src(0)); -+ emitGPR (16, insn->def(0)); -+} -+ -+void -+CodeEmitterGV100::emitISBERD() -+{ -+ emitInsn(0x923); -+ emitGPR (24, insn->src(0)); -+ emitGPR (16, insn->def(0)); -+} -+ -+void -+CodeEmitterGV100::emitLDSTc(int posm, int poso) -+{ -+ int mode = 0; -+ int order = 1; -+ -+ switch (insn->cache) { -+ case CACHE_CA: mode = 0; order = 1; break; -+ case CACHE_CG: mode = 2; order = 2; break; -+ case CACHE_CV: mode = 3; order = 2; break; -+ default: -+ assert(!"invalid caching mode"); -+ break; -+ } -+ -+ emitField(poso, 2, order); -+ emitField(posm, 2, mode); -+} -+ -+void -+CodeEmitterGV100::emitLDSTs(int pos, DataType type) -+{ -+ int data = 0; -+ -+ switch (typeSizeof(type)) { -+ case 1: data = isSignedType(type) ? 1 : 0; break; -+ case 2: data = isSignedType(type) ? 3 : 2; break; -+ case 4: data = 4; break; -+ case 8: data = 5; break; -+ case 16: data = 6; break; -+ default: -+ assert(!"bad type"); -+ break; -+ } -+ -+ emitField(pos, 3, data); -+} -+ -+void -+CodeEmitterGV100::emitLD() -+{ -+ emitInsn (0x980); -+ emitField(79, 2, 2); // .CONSTANT/./.STRONG/.MMIO -+ emitField(77, 2, 2); // .CTA/.SM/.GPU/.SYS -+ emitLDSTs(73, insn->dType); -+ emitField(72, 1, insn->src(0).getIndirect(0)->getSize() == 8); -+ emitADDR (24, 32, 32, 0, insn->src(0)); -+ emitGPR (16, insn->def(0)); -+} -+ -+void -+CodeEmitterGV100::emitLDC() -+{ -+ emitFormA(0x182, FA_RCR, EMPTY, __(0), EMPTY); -+ emitField(78, 2, insn->subOp); -+ emitLDSTs(73, insn->dType); -+ emitGPR (24, insn->src(0).getIndirect(0)); -+} -+ -+void -+CodeEmitterGV100::emitLDL() -+{ -+ emitInsn (0x983); -+ emitField(84, 3, 1); // .EF/./.EL/.LU/.EU/.NA/.INVALID6/.INVALID7 -+ emitLDSTs(73, insn->dType); -+ emitADDR (24, 40, 24, 0, insn->src(0)); -+ emitGPR (16, insn->def(0)); -+} -+ -+void -+CodeEmitterGV100::emitLDS() -+{ -+ emitInsn (0x984); -+ emitLDSTs(73, insn->dType); -+ emitADDR (24, 40, 24, 0, insn->src(0)); -+ emitGPR (16, insn->def(0)); -+} -+ -+void -+CodeEmitterGV100::emitOUT() -+{ -+ const int cut = insn->op == OP_RESTART || insn->subOp; -+ const int emit = insn->op == OP_EMIT; -+ -+ if (insn->op != OP_FINAL) -+ emitFormA(0x124, FA_RRR | FA_RIR, __(0), __(1), EMPTY); -+ else -+ emitFormA(0x124, FA_RRR | FA_RIR, __(0), EMPTY, EMPTY); -+ emitField(78, 2, (cut << 1) | emit); -+} -+ -+void -+CodeEmitterGV100::emitRED() -+{ -+ unsigned dType; -+ -+ switch (insn->dType) { -+ case TYPE_U32: dType = 0; break; -+ case TYPE_S32: dType = 1; break; -+ case TYPE_U64: dType = 2; break; -+ case TYPE_F32: dType = 3; break; -+ case TYPE_B128: dType = 4; break; -+ case TYPE_S64: dType = 5; break; -+ default: assert(!"unexpected dType"); dType = 0; break; -+ } -+ -+ emitInsn (0x98e); -+ emitField(87, 3, insn->subOp); -+ emitField(84, 3, 1); // 0=.EF, 1=, 2=.EL, 3=.LU, 4=.EU, 5=.NA -+ emitField(79, 2, 2); // .INVALID0/./.STRONG/.INVALID3 -+ emitField(77, 2, 2); // .CTA/.SM/.GPU/.SYS -+ emitField(73, 3, dType); -+ emitField(72, 1, insn->src(0).getIndirect(0)->getSize() == 8); -+ emitGPR (32, insn->src(1)); -+ emitADDR (24, 40, 24, 0, insn->src(0)); -+} -+ -+void -+CodeEmitterGV100::emitST() -+{ -+ emitInsn (0x385); -+ emitField(79, 2, 2); // .INVALID0/./.STRONG/.MMIO -+ emitField(77, 2, 2); // .CTA/.SM/.GPU/.SYS -+ emitLDSTs(73, insn->dType); -+ emitField(72, 1, insn->src(0).getIndirect(0)->getSize() == 8); -+ emitGPR (64, insn->src(1)); -+ emitADDR (24, 32, 32, 0, insn->src(0)); -+} -+ -+void -+CodeEmitterGV100::emitSTL() -+{ -+ emitInsn (0x387); -+ emitField(84, 3, 1); // .EF/./.EL/.LU/.EU/.NA/.INVALID6/.INVALID7 -+ emitLDSTs(73, insn->dType); -+ emitADDR (24, 40, 24, 0, insn->src(0)); -+ emitGPR (32, insn->src(1)); -+} -+ -+void -+CodeEmitterGV100::emitSTS() -+{ -+ emitInsn (0x388); -+ emitLDSTs(73, insn->dType); -+ emitADDR (24, 40, 24, 0, insn->src(0)); -+ emitGPR (32, insn->src(1)); -+} -+ -+/******************************************************************************* -+ * texture -+ ******************************************************************************/ -+ -+void -+CodeEmitterGV100::emitTEXs(int pos) -+{ -+ int src1 = insn->predSrc == 1 ? 2 : 1; -+ if (insn->srcExists(src1)) -+ emitGPR(pos, insn->src(src1)); -+ else -+ emitGPR(pos); -+} -+ -+void -+CodeEmitterGV100::emitTEX() -+{ -+ const TexInstruction *insn = this->insn->asTex(); -+ int lodm = 0; -+ -+ if (!insn->tex.levelZero) { -+ switch (insn->op) { -+ case OP_TEX: lodm = 0; break; -+ case OP_TXB: lodm = 2; break; -+ case OP_TXL: lodm = 3; break; -+ default: -+ assert(!"invalid tex op"); -+ break; -+ } -+ } else { -+ lodm = 1; -+ } -+ -+ if (insn->tex.rIndirectSrc < 0) { -+ emitInsn (0xb60); -+ emitField(54, 5, prog->driver->io.auxCBSlot); -+ emitField(40, 14, insn->tex.r); -+ } else { -+ emitInsn (0x361); -+ emitField(59, 1, 1); // .B -+ } -+ emitField(90, 1, insn->tex.liveOnly); // .NODEP -+ emitField(87, 3, lodm); -+ emitField(84, 3, 1); // 0=.EF, 1=, 2=.EL, 3=.LU, 4=.EU, 5=.NA -+ emitField(78, 1, insn->tex.target.isShadow()); // .DC -+ emitField(77, 1, insn->tex.derivAll); // .NDV -+ emitField(76, 1, insn->tex.useOffsets == 1); // .AOFFI -+ emitPRED (81); -+ emitGPR (64, insn->def(1)); -+ emitGPR (16, insn->def(0)); -+ emitGPR (24, insn->src(0)); -+ emitTEXs (32); -+ emitField(63, 1, insn->tex.target.isArray()); -+ emitField(61, 2, insn->tex.target.isCube() ? 3 : -+ insn->tex.target.getDim() - 1); -+ emitField(72, 4, insn->tex.mask); -+} -+ -+void -+CodeEmitterGV100::emitTLD() -+{ -+ const TexInstruction *insn = this->insn->asTex(); -+ -+ if (insn->tex.rIndirectSrc < 0) { -+ emitInsn (0xb66); -+ emitField(54, 5, prog->driver->io.auxCBSlot); -+ emitField(40, 14, insn->tex.r); -+ } else { -+ emitInsn (0x367); -+ emitField(59, 1, 1); // .B -+ } -+ emitField(90, 1, insn->tex.liveOnly); -+ emitField(87, 3, insn->tex.levelZero ? 1 /* .LZ */ : 3 /* .LL */); -+ emitPRED (81); -+ emitField(78, 1, insn->tex.target.isMS()); -+ emitField(76, 1, insn->tex.useOffsets == 1); -+ emitField(72, 4, insn->tex.mask); -+ emitGPR (64, insn->def(1)); -+ emitField(63, 1, insn->tex.target.isArray()); -+ emitField(61, 2, insn->tex.target.isCube() ? 3 : -+ insn->tex.target.getDim() - 1); -+ emitTEXs (32); -+ emitGPR (24, insn->src(0)); -+ emitGPR (16, insn->def(0)); -+} -+ -+void -+CodeEmitterGV100::emitTLD4() -+{ -+ const TexInstruction *insn = this->insn->asTex(); -+ -+ int offsets = 0; -+ switch (insn->tex.useOffsets) { -+ case 4: offsets = 2; break; -+ case 1: offsets = 1; break; -+ case 0: offsets = 0; break; -+ default: assert(!"invalid offsets count"); break; -+ } -+ -+ if (insn->tex.rIndirectSrc < 0) { -+ emitInsn (0xb63); -+ emitField(54, 5, prog->driver->io.auxCBSlot); -+ emitField(40, 14, insn->tex.r); -+ } else { -+ emitInsn (0x364); -+ emitField(59, 1, 1); // .B -+ } -+ emitField(90, 1, insn->tex.liveOnly); -+ emitField(87, 2, insn->tex.gatherComp); -+ emitField(84, 1, 1); // !.EF -+ emitPRED (81); -+ emitField(78, 1, insn->tex.target.isShadow()); -+ emitField(76, 2, offsets); -+ emitField(72, 4, insn->tex.mask); -+ emitGPR (64, insn->def(1)); -+ emitField(63, 1, insn->tex.target.isArray()); -+ emitField(61, 2, insn->tex.target.isCube() ? 3 : -+ insn->tex.target.getDim() - 1); -+ emitTEXs (32); -+ emitGPR (24, insn->src(0)); -+ emitGPR (16, insn->def(0)); -+} -+ -+void -+CodeEmitterGV100::emitTMML() -+{ -+ const TexInstruction *insn = this->insn->asTex(); -+ -+ if (insn->tex.rIndirectSrc < 0) { -+ emitInsn (0xb69); -+ emitField(54, 5, prog->driver->io.auxCBSlot); -+ emitField(40, 14, insn->tex.r); -+ } else { -+ emitInsn (0x36a); -+ emitField(59, 1, 1); // .B -+ } -+ emitField(90, 1, insn->tex.liveOnly); -+ emitField(77, 1, insn->tex.derivAll); -+ emitField(72, 4, insn->tex.mask); -+ emitGPR (64, insn->def(1)); -+ emitField(63, 1, insn->tex.target.isArray()); -+ emitField(61, 2, insn->tex.target.isCube() ? 3 : -+ insn->tex.target.getDim() - 1); -+ emitTEXs (32); -+ emitGPR (24, insn->src(0)); -+ emitGPR (16, insn->def(0)); -+} -+ -+void -+CodeEmitterGV100::emitTXD() -+{ -+ const TexInstruction *insn = this->insn->asTex(); -+ -+ if (insn->tex.rIndirectSrc < 0) { -+ emitInsn (0xb6c); -+ emitField(54, 5, prog->driver->io.auxCBSlot); -+ emitField(40, 14, insn->tex.r); -+ } else { -+ emitInsn (0x36d); -+ emitField(59, 1, 1); // .B -+ } -+ emitField(90, 1, insn->tex.liveOnly); -+ emitPRED (81); -+ emitField(76, 1, insn->tex.useOffsets == 1); -+ emitField(72, 4, insn->tex.mask); -+ emitGPR (64, insn->def(1)); -+ emitField(63, 1, insn->tex.target.isArray()); -+ emitField(61, 2, insn->tex.target.isCube() ? 3 : -+ insn->tex.target.getDim() - 1); -+ emitTEXs (32); -+ emitGPR (24, insn->src(0)); -+ emitGPR (16, insn->def(0)); -+} -+ -+void -+CodeEmitterGV100::emitTXQ() -+{ -+ const TexInstruction *insn = this->insn->asTex(); -+ int type = 0; -+ -+ switch (insn->tex.query) { -+ case TXQ_DIMS : type = 0x00; break; -+ case TXQ_TYPE : type = 0x01; break; -+ case TXQ_SAMPLE_POSITION: type = 0x02; break; -+ default: -+ assert(!"invalid txq query"); -+ break; -+ } -+ -+ if (insn->tex.rIndirectSrc < 0) { -+ emitInsn (0xb6f); -+ emitField(54, 5, prog->driver->io.auxCBSlot); -+ emitField(40, 14, insn->tex.r); -+ } else { -+ emitInsn (0x370); -+ emitField(59, 1, 1); // .B -+ } -+ emitField(90, 1, insn->tex.liveOnly); -+ emitField(72, 4, insn->tex.mask); -+ emitGPR (64, insn->def(1)); -+ emitField(62, 2, type); -+ emitGPR (24, insn->src(0)); -+ emitGPR (16, insn->def(0)); -+} -+ -+/******************************************************************************* -+ * surface -+ ******************************************************************************/ -+ -+void -+CodeEmitterGV100::emitSUHandle(const int s) -+{ -+ const TexInstruction *insn = this->insn->asTex(); -+ -+ assert(insn->op >= OP_SULDB && insn->op <= OP_SUREDP); -+ -+ if (insn->src(s).getFile() == FILE_GPR) { -+ emitGPR(64, insn->src(s)); -+ } else { -+ assert(0); -+ //XXX: not done -+ ImmediateValue *imm = insn->getSrc(s)->asImm(); -+ assert(imm); -+ emitField(0x33, 1, 1); -+ emitField(0x24, 13, imm->reg.data.u32); -+ } -+} -+ -+void -+CodeEmitterGV100::emitSUTarget() -+{ -+ const TexInstruction *insn = this->insn->asTex(); -+ int target = 0; -+ -+ assert(insn->op >= OP_SULDB && insn->op <= OP_SUREDP); -+ -+ if (insn->tex.target == TEX_TARGET_BUFFER) { -+ target = 1; -+ } else if (insn->tex.target == TEX_TARGET_1D_ARRAY) { -+ target = 2; -+ } else if (insn->tex.target == TEX_TARGET_2D || -+ insn->tex.target == TEX_TARGET_RECT) { -+ target = 3; -+ } else if (insn->tex.target == TEX_TARGET_2D_ARRAY || -+ insn->tex.target == TEX_TARGET_CUBE || -+ insn->tex.target == TEX_TARGET_CUBE_ARRAY) { -+ target = 4; -+ } else if (insn->tex.target == TEX_TARGET_3D) { -+ target = 5; -+ } else { -+ assert(insn->tex.target == TEX_TARGET_1D); -+ } -+ emitField(61, 3, target); -+} -+ -+void -+CodeEmitterGV100::emitSUATOM() -+{ -+ const TexInstruction *insn = this->insn->asTex(); -+ uint8_t type = 0, subOp; -+ -+ if (insn->subOp == NV50_IR_SUBOP_ATOM_CAS) -+ emitInsn(0x396); // SUATOM.D.CAS -+ else -+ emitInsn(0x394); // SUATOM.D -+ -+ emitSUTarget(); -+ -+ // destination type -+ switch (insn->dType) { -+ case TYPE_S32: type = 1; break; -+ case TYPE_U64: type = 2; break; -+ case TYPE_F32: type = 3; break; -+ case TYPE_S64: type = 5; break; -+ default: -+ assert(insn->dType == TYPE_U32); -+ break; -+ } -+ -+ // atomic operation -+ if (insn->subOp == NV50_IR_SUBOP_ATOM_CAS) { -+ subOp = 0; -+ } else if (insn->subOp == NV50_IR_SUBOP_ATOM_EXCH) { -+ subOp = 8; -+ } else { -+ subOp = insn->subOp; -+ } -+ -+ emitField(87, 4, subOp); -+ emitPRED (81); -+ emitField(79, 2, 1); -+ emitField(73, 3, type); -+ emitField(72, 1, 0); // .BA -+ emitGPR (32, insn->src(1)); -+ emitGPR (24, insn->src(0)); -+ emitGPR (16, insn->def(0)); -+ -+ emitSUHandle(2); -+} -+ -+void -+CodeEmitterGV100::emitSULD() -+{ -+ const TexInstruction *insn = this->insn->asTex(); -+ int type = 0; -+ -+ if (insn->op == OP_SULDB) { -+ emitInsn(0x99a); -+ emitSUTarget(); -+ -+ switch (insn->dType) { -+ case TYPE_U8: type = 0; break; -+ case TYPE_S8: type = 1; break; -+ case TYPE_U16: type = 2; break; -+ case TYPE_S16: type = 3; break; -+ case TYPE_U32: type = 4; break; -+ case TYPE_U64: type = 5; break; -+ case TYPE_B128: type = 6; break; -+ default: -+ assert(0); -+ break; -+ } -+ emitField(73, 3, type); -+ } else { -+ emitInsn(0x998); -+ emitSUTarget(); -+ emitField(72, 4, 0xf); // rgba -+ } -+ -+ emitPRED (81); -+ emitLDSTc(77, 79); -+ -+ emitGPR (16, insn->def(0)); -+ emitGPR (24, insn->src(0)); -+ -+ emitSUHandle(1); -+} -+ -+void -+CodeEmitterGV100::emitSUST() -+{ -+ const TexInstruction *insn = this->insn->asTex(); -+ -+ emitInsn(0x99c); // SUST.P -+#if 0 -+ if (insn->op == OP_SUSTB) -+ emitField(0x34, 1, 1); -+#endif -+ emitSUTarget(); -+ -+ emitLDSTc(77, 79); -+ emitField(72, 4, 0xf); // rgba -+ emitGPR(32, insn->src(1)); -+ emitGPR(24, insn->src(0)); -+ emitSUHandle(2); -+} -+ -+/******************************************************************************* -+ * misc -+ ******************************************************************************/ -+ -+void -+CodeEmitterGV100::emitAL2P() -+{ -+ emitInsn (0x920); -+ emitO (79); -+ emitField(74, 2, (insn->getDef(0)->reg.size / 4) - 1); -+ emitField(40, 11, insn->src(0).get()->reg.data.offset); -+ emitGPR (24, insn->src(0).getIndirect(0)); -+ emitGPR (16, insn->def(0)); -+} -+ -+void -+CodeEmitterGV100::emitBAR() -+{ -+ uint8_t subop, redop = 0x00; -+ -+ // 80 -+ // 01: DEFER_BLOCKING -+ // 78:77 -+ // 00: SYNC -+ // 01: ARV -+ // 02: RED -+ // 03: SCAN -+ // 75:74 -+ // 00: RED.POPC -+ // 01: RED.AND -+ // 02: RED.OR -+ -+ switch (insn->subOp) { -+ case NV50_IR_SUBOP_BAR_RED_POPC: subop = 0x02; redop = 0x00; break; -+ case NV50_IR_SUBOP_BAR_RED_AND : subop = 0x02; redop = 0x01; break; -+ case NV50_IR_SUBOP_BAR_RED_OR : subop = 0x02; redop = 0x02; break; -+ case NV50_IR_SUBOP_BAR_ARRIVE : subop = 0x01; break; -+ default: -+ subop = 0x00; -+ assert(insn->subOp == NV50_IR_SUBOP_BAR_SYNC); -+ break; -+ } -+ -+ if (insn->src(0).getFile() == FILE_GPR) { -+ emitInsn ((1 << 9) | 0x11d); -+ emitGPR (32, insn->src(0)); //XXX: nvdisasm shows src0==src1 -+ } else { -+ ImmediateValue *imm = insn->getSrc(0)->asImm(); -+ assert(imm); -+ if (insn->src(1).getFile() == FILE_GPR) { -+ emitInsn ((4 << 9) | 0x11d); -+ emitGPR (32, insn->src(1)); -+ } else { -+ emitInsn ((5 << 9) | 0x11d); -+ } -+ emitField(54, 4, imm->reg.data.u32); -+ } -+ -+ emitField(77, 2, subop); -+ emitField(74, 2, redop); -+ -+ if (insn->srcExists(2) && (insn->predSrc != 2)) { -+ emitField(90, 1, insn->src(2).mod == Modifier(NV50_IR_MOD_NOT)); -+ emitPRED (87, insn->src(2)); -+ } else { -+ emitField(87, 3, 7); -+ } -+} -+ -+void -+CodeEmitterGV100::emitCCTL() -+{ -+ if (insn->src(0).getFile() == FILE_MEMORY_GLOBAL) -+ emitInsn(0x98f); -+ else -+ emitInsn(0x990); -+ emitField(87, 4, insn->subOp); -+ emitField(72, 1, insn->src(0).getIndirect(0)->getSize() == 8); -+ emitADDR (24, 32, 32, 0, insn->src(0)); -+} -+ -+void -+CodeEmitterGV100::emitMEMBAR() -+{ -+ emitInsn (0x992); -+ switch (NV50_IR_SUBOP_MEMBAR_SCOPE(insn->subOp)) { -+ case NV50_IR_SUBOP_MEMBAR_CTA: emitField(76, 3, 0); break; -+ case NV50_IR_SUBOP_MEMBAR_GL : emitField(76, 3, 2); break; -+ case NV50_IR_SUBOP_MEMBAR_SYS: emitField(76, 3, 3); break; -+ default: -+ assert(!"invalid scope"); -+ break; -+ } -+} -+ -+void -+CodeEmitterGV100::emitPIXLD() -+{ -+ emitInsn (0x925); -+ switch (insn->subOp) { -+ case NV50_IR_SUBOP_PIXLD_COVMASK : emitField(78, 3, 1); break; // .COVMASK -+ case NV50_IR_SUBOP_PIXLD_SAMPLEID: emitField(78, 3, 3); break; // .MY_INDEX -+ default: -+ assert(0); -+ break; -+ } -+ emitPRED (71); -+ emitGPR (16, insn->def(0)); -+} -+ -+void -+CodeEmitterGV100::emitPLOP3_LUT() -+{ -+ uint8_t op[2] = {}; -+ -+ switch (insn->op) { -+ case OP_AND: op[0] = 0xf0 & 0xcc; break; -+ case OP_OR : op[0] = 0xf0 | 0xcc; break; -+ case OP_XOR: op[0] = 0xf0 ^ 0xcc; break; -+ default: -+ assert(!"invalid PLOP3"); -+ break; -+ } -+ -+ emitInsn(0x81c); -+ emitNOT (90, insn->src(0)); -+ emitPRED(87, insn->src(0)); -+ emitPRED(84); // def(1) -+ emitPRED(81, insn->def(0)); -+ emitNOT (80, insn->src(1)); -+ emitPRED(77, insn->src(1)); -+ emitField(72, 5, op[0] >> 3); -+ emitNOT (71); // src(2) -+ emitPRED(68); // src(2) -+ emitField(64, 3, op[0] & 7); -+ emitField(16, 8, op[1]); -+} -+ -+void -+CodeEmitterGV100::emitVOTE() -+{ -+ const ImmediateValue *imm; -+ uint32_t u32; -+ -+ int r = -1, p = -1; -+ for (int i = 0; insn->defExists(i); i++) { -+ if (insn->def(i).getFile() == FILE_GPR) -+ r = i; -+ else if (insn->def(i).getFile() == FILE_PREDICATE) -+ p = i; -+ } -+ -+ emitInsn (0x806); -+ emitField(72, 2, insn->subOp); -+ if (r >= 0) -+ emitGPR (16, insn->def(r)); -+ else -+ emitGPR (16); -+ if (p >= 0) -+ emitPRED (81, insn->def(p)); -+ else -+ emitPRED (81); -+ -+ switch (insn->src(0).getFile()) { -+ case FILE_PREDICATE: -+ emitField(90, 1, insn->src(0).mod == Modifier(NV50_IR_MOD_NOT)); -+ emitPRED (87, insn->src(0)); -+ break; -+ case FILE_IMMEDIATE: -+ imm = insn->getSrc(0)->asImm(); -+ assert(imm); -+ u32 = imm->reg.data.u32; -+ assert(u32 == 0 || u32 == 1); -+ emitField(90, 1, u32 == 0); -+ emitPRED (87); -+ break; -+ default: -+ assert(!"Unhandled src"); -+ break; -+ } -+} -+ -+bool -+CodeEmitterGV100::emitInstruction(Instruction *i) -+{ -+ insn = i; -+ -+ switch (insn->op) { -+ case OP_ABS: -+ assert(!isFloatType(insn->dType)); -+ emitIABS(); -+ break; -+ case OP_ADD: -+ if (isFloatType(insn->dType)) { -+ if (insn->dType == TYPE_F32) -+ emitFADD(); -+ else -+ emitDADD(); -+ } else { -+ emitIADD3(); -+ } -+ break; -+ case OP_AFETCH: -+ emitAL2P(); -+ break; -+ case OP_AND: -+ case OP_OR: -+ case OP_XOR: -+ if (insn->def(0).getFile() == FILE_PREDICATE) { -+ emitPLOP3_LUT(); -+ } else { -+ assert(!"invalid logop"); -+ emitNOP(); -+ } -+ break; -+ case OP_ATOM: -+ if (insn->src(0).getFile() == FILE_MEMORY_SHARED) -+ emitATOMS(); -+ else -+ if (!insn->defExists(0) && insn->subOp < NV50_IR_SUBOP_ATOM_CAS) -+ emitRED(); -+ else -+ emitATOM(); -+ break; -+ case OP_BAR: -+ emitBAR(); -+ break; -+ case OP_BFIND: -+ emitFLO(); -+ break; -+ case OP_BMSK: -+ emitBMSK(); -+ break; -+ case OP_BREV: -+ emitBREV(); -+ break; -+ case OP_BRA: -+ case OP_JOIN: //XXX -+ emitBRA(); -+ break; -+ case OP_CCTL: -+ emitCCTL(); -+ break; -+ case OP_CEIL: -+ case OP_CVT: -+ case OP_FLOOR: -+ case OP_TRUNC: -+ if (insn->op == OP_CVT && (insn->def(0).getFile() == FILE_PREDICATE || -+ insn->src(0).getFile() == FILE_PREDICATE)) { -+ emitMOV(); -+ } else if (isFloatType(insn->dType)) { -+ if (isFloatType(insn->sType)) { -+ if (insn->sType == insn->dType) -+ emitFRND(); -+ else -+ emitF2F(); -+ } else { -+ emitI2F(); -+ } -+ } else { -+ if (isFloatType(insn->sType)) { -+ emitF2I(); -+ } else { -+ assert(!"I2I"); -+ emitNOP(); -+ } -+ } -+ break; -+ case OP_COS: -+ case OP_EX2: -+ case OP_LG2: -+ case OP_RCP: -+ case OP_RSQ: -+ case OP_SIN: -+ case OP_SQRT: -+ emitMUFU(); -+ break; -+ case OP_DISCARD: -+ emitKILL(); -+ break; -+ case OP_EMIT: -+ case OP_FINAL: -+ case OP_RESTART: -+ emitOUT(); -+ break; -+ case OP_EXIT: -+ emitEXIT(); -+ break; -+ case OP_EXPORT: -+ emitAST(); -+ break; -+ case OP_FMA: -+ case OP_MAD: -+ if (isFloatType(insn->dType)) { -+ if (insn->dType == TYPE_F32) -+ emitFFMA(); -+ else -+ emitDFMA(); -+ } else { -+ if (typeSizeof(insn->dType) != 8) -+ emitIMAD(); -+ else -+ emitIMAD_WIDE(); -+ } -+ break; -+ case OP_JOINAT: //XXX -+ emitNOP(); -+ break; -+ case OP_LINTERP: -+ emitIPA(); -+ break; -+ case OP_LOAD: -+ switch (insn->src(0).getFile()) { -+ case FILE_MEMORY_CONST : emitLDC(); break; -+ case FILE_MEMORY_LOCAL : emitLDL(); break; -+ case FILE_MEMORY_SHARED: emitLDS(); break; -+ case FILE_MEMORY_GLOBAL: emitLD(); break; -+ default: -+ assert(!"invalid load"); -+ emitNOP(); -+ break; -+ } -+ break; -+ case OP_LOP3_LUT: -+ emitLOP3_LUT(); -+ break; -+ case OP_MAX: -+ case OP_MIN: -+ if (isFloatType(insn->dType)) { -+ if (insn->dType == TYPE_F32) { -+ emitFMNMX(); -+ } else { -+ assert(!"invalid FMNMX"); -+ emitNOP(); -+ } -+ } else { -+ assert(!"invalid MNMX"); -+ emitNOP(); -+ } -+ break; -+ case OP_MEMBAR: -+ emitMEMBAR(); -+ break; -+ case OP_MOV: -+ emitMOV(); -+ break; -+ case OP_MUL: -+ if (isFloatType(insn->dType)) { -+ if (insn->dType == TYPE_F32) -+ emitFMUL(); -+ else -+ emitDMUL(); -+ } else { -+ assert(!"invalid IMUL"); -+ emitNOP(); -+ } -+ break; -+ case OP_PERMT: -+ emitPRMT(); -+ break; -+ case OP_PFETCH: -+ emitISBERD(); -+ break; -+ case OP_PIXLD: -+ emitPIXLD(); -+ break; -+ case OP_POPCNT: -+ emitPOPC(); -+ break; -+ case OP_QUADOP: -+ emitFSWZADD(); -+ break; -+ case OP_RDSV: -+ if (targ->isCS2RSV(insn->getSrc(0)->reg.data.sv.sv)) -+ emitCS2R(); -+ else -+ emitS2R(); -+ break; -+ case OP_SELP: -+ emitSEL(); -+ break; -+ case OP_SET: -+ case OP_SET_AND: -+ case OP_SET_OR: -+ case OP_SET_XOR: -+ if (insn->def(0).getFile() != FILE_PREDICATE) { -+ if (isFloatType(insn->dType)) { -+ if (insn->dType == TYPE_F32) { -+ emitFSET_BF(); -+ } else { -+ assert(!"invalid FSET"); -+ emitNOP(); -+ } -+ } else { -+ assert(!"invalid SET"); -+ emitNOP(); -+ } -+ } else { -+ if (isFloatType(insn->sType)) -+ if (insn->sType == TYPE_F64) -+ emitDSETP(); -+ else -+ emitFSETP(); -+ else -+ emitISETP(); -+ } -+ break; -+ case OP_SGXT: -+ emitSGXT(); -+ break; -+ case OP_SHF: -+ emitSHF(); -+ break; -+ case OP_SHFL: -+ emitSHFL(); -+ break; -+ case OP_SHLADD: -+ emitLEA(); -+ break; -+ case OP_STORE: -+ switch (insn->src(0).getFile()) { -+ case FILE_MEMORY_LOCAL : emitSTL(); break; -+ case FILE_MEMORY_SHARED: emitSTS(); break; -+ case FILE_MEMORY_GLOBAL: emitST(); break; -+ default: -+ assert(!"invalid store"); -+ emitNOP(); -+ break; -+ } -+ break; -+ case OP_SULDB: -+ case OP_SULDP: -+ emitSULD(); -+ break; -+ case OP_SUREDB: -+ case OP_SUREDP: -+ emitSUATOM(); -+ break; -+ case OP_SUSTB: -+ case OP_SUSTP: -+ emitSUST(); -+ break; -+ case OP_TEX: -+ case OP_TXB: -+ case OP_TXL: -+ emitTEX(); -+ break; -+ case OP_TXD: -+ emitTXD(); -+ break; -+ case OP_TXF: -+ emitTLD(); -+ break; -+ case OP_TXG: -+ emitTLD4(); -+ break; -+ case OP_TXLQ: -+ emitTMML(); -+ break; -+ case OP_TXQ: -+ emitTXQ(); -+ break; -+ case OP_VFETCH: -+ emitALD(); -+ break; -+ case OP_VOTE: -+ emitVOTE(); -+ break; -+ case OP_WARPSYNC: -+ emitWARPSYNC(); -+ break; -+ default: -+ assert(!"invalid opcode"); -+ emitNOP(); -+ break; -+ } -+ -+ code[3] &= 0x000001ff; -+ code[3] |= insn->sched << 9; -+ code += 4; -+ codeSize += 16; -+ return true; -+} -+ -+void -+CodeEmitterGV100::prepareEmission(BasicBlock *bb) -+{ -+ Function *func = bb->getFunction(); -+ Instruction *i; -+ int j; -+ -+ for (j = func->bbCount - 1; j >= 0 && !func->bbArray[j]->binSize; --j); -+ -+ for (; j >= 0; --j) { -+ BasicBlock *in = func->bbArray[j]; -+ Instruction *exit = in->getExit(); -+ -+ if (exit && exit->op == OP_BRA && exit->asFlow()->target.bb == bb) { -+ in->binSize -= 16; -+ func->binSize -= 16; -+ -+ for (++j; j < func->bbCount; ++j) -+ func->bbArray[j]->binPos -= 16; -+ -+ in->remove(exit); -+ } -+ bb->binPos = in->binPos + in->binSize; -+ if (in->binSize) // no more no-op branches to bb -+ break; -+ } -+ func->bbArray[func->bbCount++] = bb; -+ -+ if (!bb->getExit()) -+ return; -+ -+ for (i = bb->getEntry(); i; i = i->next) { -+ i->encSize = getMinEncodingSize(i); -+ bb->binSize += i->encSize; -+ } -+ -+ assert(!bb->getEntry() || (bb->getExit() && bb->getExit()->encSize == 16)); -+ -+ func->binSize += bb->binSize; -+} -+ -+void -+CodeEmitterGV100::prepareEmission(Function *func) -+{ -+ SchedDataCalculatorGM107 sched(targ); -+ CodeEmitter::prepareEmission(func); -+ sched.run(func, true, true); -+} -+ -+void -+CodeEmitterGV100::prepareEmission(Program *prog) -+{ -+ for (ArrayList::Iterator fi = prog->allFuncs.iterator(); -+ !fi.end(); fi.next()) { -+ Function *func = reinterpret_cast(fi.get()); -+ func->binPos = prog->binSize; -+ prepareEmission(func); -+ prog->binSize += func->binSize; -+ } -+ -+ this->prog = prog; -+} -+ -+CodeEmitterGV100::CodeEmitterGV100(TargetGV100 *target) -+ : CodeEmitter(target), targ(target) -+{ -+ code = NULL; -+ codeSize = codeSizeLimit = 0; -+ relocInfo = NULL; -+} -+}; -diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gv100.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gv100.h -new file mode 100644 -index 00000000000..15ab717e460 ---- /dev/null -+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gv100.h -@@ -0,0 +1,403 @@ -+/* -+ * Copyright 2020 Red Hat Inc. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice shall be included in -+ * all copies or substantial portions of the Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -+ * OTHER DEALINGS IN THE SOFTWARE. -+ */ -+#ifndef __NV50_IR_EMIT_GV100_H__ -+#define __NV50_IR_EMIT_GV100_H__ -+#include "codegen/nv50_ir_target_gv100.h" -+ -+namespace nv50_ir { -+ -+class CodeEmitterGV100 : public CodeEmitter { -+public: -+ CodeEmitterGV100(TargetGV100 *target); -+ -+ virtual bool emitInstruction(Instruction *); -+ virtual uint32_t getMinEncodingSize(const Instruction *) const { return 16; } -+ -+private: -+ const Program *prog; -+ const TargetGV100 *targ; -+ const Instruction *insn; -+ -+ virtual void prepareEmission(Program *); -+ virtual void prepareEmission(Function *); -+ virtual void prepareEmission(BasicBlock *); -+ -+ inline void emitInsn(uint32_t op) { -+ code[0] = op; -+ code[1] = 0; -+ code[2] = 0; -+ code[3] = 0; -+ if (insn->predSrc >= 0) { -+ emitField(12, 3, insn->getSrc(insn->predSrc)->rep()->reg.data.id); -+ emitField(15, 1, insn->cc == CC_NOT_P); -+ } else { -+ emitField(12, 3, 7); -+ } -+ }; -+ -+ inline void emitField(int b, int s, uint64_t v) { -+ if (b >= 0) { -+ uint64_t m = ~0ULL >> (64 - s); -+ uint64_t d = v & m; -+ assert(!(v & ~m) || (v & ~m) == ~m); -+ if (b < 64 && b + s > 64) { -+ *(uint64_t *)&code[0] |= d << b; -+ *(uint64_t *)&code[2] |= d >> (64 - b); -+ } else { -+ *(uint64_t *)&code[(b/64*2)] |= d << (b & 0x3f); -+ } -+ } -+ }; -+ -+ inline void emitABS(int pos, int src, bool supported) -+ { -+ if (insn->src(src).mod.abs()) { -+ assert(supported); -+ emitField(pos, 1, 1); -+ } -+ } -+ -+ inline void emitABS(int pos, int src) -+ { -+ emitABS(pos, src, true); -+ } -+ -+ inline void emitNEG(int pos, int src, bool supported) { -+ if (insn->src(src).mod.neg()) { -+ assert(supported); -+ emitField(pos, 1, 1); -+ } -+ } -+ -+ inline void emitNEG(int pos, int src) { -+ emitNEG(pos, src, true); -+ } -+ -+ inline void emitNOT(int pos) { -+ emitField(pos, 1, 0); -+ }; -+ -+ inline void emitNOT(int pos, const ValueRef &ref) { -+ emitField(pos, 1, !!(ref.mod & Modifier(NV50_IR_MOD_NOT))); -+ } -+ -+ inline void emitSAT(int pos) { -+ emitField(pos, 1, insn->saturate); -+ } -+ -+ inline void emitRND(int rmp, RoundMode rnd, int rip) { -+ int rm = 0, ri = 0; -+ switch (rnd) { -+ case ROUND_NI: ri = 1; -+ case ROUND_N : rm = 0; break; -+ case ROUND_MI: ri = 1; -+ case ROUND_M : rm = 1; break; -+ case ROUND_PI: ri = 1; -+ case ROUND_P : rm = 2; break; -+ case ROUND_ZI: ri = 1; -+ case ROUND_Z : rm = 3; break; -+ default: -+ assert(!"invalid round mode"); -+ break; -+ } -+ emitField(rip, 1, ri); -+ emitField(rmp, 2, rm); -+ } -+ -+ inline void emitRND(int pos) { -+ emitRND(pos, insn->rnd, -1); -+ } -+ -+ inline void emitFMZ(int pos, int len) { -+ emitField(pos, len, insn->dnz << 1 | insn->ftz); -+ } -+ -+ inline void emitPDIV(int pos) { -+ emitField(pos, 3, insn->postFactor + 4); -+ } -+ -+ inline void emitO(int pos) { -+ emitField(pos, 1, insn->getSrc(0)->reg.file == FILE_SHADER_OUTPUT); -+ } -+ -+ inline void emitP(int pos) { -+ emitField(pos, 1, insn->perPatch); -+ } -+ -+ inline void emitCond3(int pos, CondCode code) { -+ int data = 0; -+ -+ switch (code) { -+ case CC_FL : data = 0x00; break; -+ case CC_LTU: -+ case CC_LT : data = 0x01; break; -+ case CC_EQU: -+ case CC_EQ : data = 0x02; break; -+ case CC_LEU: -+ case CC_LE : data = 0x03; break; -+ case CC_GTU: -+ case CC_GT : data = 0x04; break; -+ case CC_NEU: -+ case CC_NE : data = 0x05; break; -+ case CC_GEU: -+ case CC_GE : data = 0x06; break; -+ case CC_TR : data = 0x07; break; -+ default: -+ assert(!"invalid cond3"); -+ break; -+ } -+ -+ emitField(pos, 3, data); -+ } -+ -+ inline void emitCond4(int pos, CondCode code) { -+ int data = 0; -+ -+ switch (code) { -+ case CC_FL: data = 0x00; break; -+ case CC_LT: data = 0x01; break; -+ case CC_EQ: data = 0x02; break; -+ case CC_LE: data = 0x03; break; -+ case CC_GT: data = 0x04; break; -+ case CC_NE: data = 0x05; break; -+ case CC_GE: data = 0x06; break; -+ // case CC_NUM: data = 0x07; break; -+ // case CC_NAN: data = 0x08; break; -+ case CC_LTU: data = 0x09; break; -+ case CC_EQU: data = 0x0a; break; -+ case CC_LEU: data = 0x0b; break; -+ case CC_GTU: data = 0x0c; break; -+ case CC_NEU: data = 0x0d; break; -+ case CC_GEU: data = 0x0e; break; -+ case CC_TR: data = 0x0f; break; -+ default: -+ assert(!"invalid cond4"); -+ break; -+ } -+ -+ emitField(pos, 4, data); -+ } -+ -+ inline void emitSYS(int pos, const Value *val) { -+ int id = val ? val->reg.data.id : -1; -+ -+ switch (id) { -+ case SV_LANEID : id = 0x00; break; -+ case SV_VERTEX_COUNT : id = 0x10; break; -+ case SV_INVOCATION_ID : id = 0x11; break; -+ case SV_THREAD_KILL : id = 0x13; break; -+ case SV_INVOCATION_INFO: id = 0x1d; break; -+ case SV_COMBINED_TID : id = 0x20; break; -+ case SV_TID : id = 0x21 + val->reg.data.sv.index; break; -+ case SV_CTAID : id = 0x25 + val->reg.data.sv.index; break; -+ case SV_LANEMASK_EQ : id = 0x38; break; -+ case SV_LANEMASK_LT : id = 0x39; break; -+ case SV_LANEMASK_LE : id = 0x3a; break; -+ case SV_LANEMASK_GT : id = 0x3b; break; -+ case SV_LANEMASK_GE : id = 0x3c; break; -+ case SV_CLOCK : id = 0x50 + val->reg.data.sv.index; break; -+ default: -+ assert(!"invalid system value"); -+ id = 0; -+ break; -+ } -+ -+ emitField(pos, 8, id); -+ } -+ -+ inline void emitSYS(int pos, const ValueRef &ref) { -+ emitSYS(pos, ref.get() ? ref.rep() : (const Value *)NULL); -+ } -+ -+ inline void emitGPR(int pos, const Value *val, int off) { -+ emitField(pos, 8, val && !val->inFile(FILE_FLAGS) ? -+ val->reg.data.id + off: 255); -+ } -+ -+ inline void emitGPR(int pos, const Value *v) { -+ emitGPR(pos, v, 0); -+ } -+ -+ inline void emitGPR(int pos) { -+ emitGPR(pos, (const Value *)NULL); -+ } -+ -+ inline void emitGPR(int pos, const ValueRef &ref) { -+ emitGPR(pos, ref.get() ? ref.rep() : (const Value *)NULL); -+ } -+ -+ inline void emitGPR(int pos, const ValueRef *ref) { -+ emitGPR(pos, ref ? ref->rep() : (const Value *)NULL); -+ } -+ -+ inline void emitGPR(int pos, const ValueDef &def) { -+ emitGPR(pos, def.get() ? def.rep() : (const Value *)NULL); -+ } -+ -+ inline void emitGPR(int pos, const ValueDef &def, int off) { -+ emitGPR(pos, def.get() ? def.rep() : (const Value *)NULL, off); -+ } -+ -+ inline void emitPRED(int pos, const Value *val) { -+ emitField(pos, 3, val ? val->reg.data.id : 7); -+ }; -+ -+ inline void emitPRED(int pos) { -+ emitPRED(pos, (const Value *)NULL); -+ } -+ -+ inline void emitPRED(int pos, const ValueRef &ref) { -+ emitPRED(pos, ref.get() ? ref.rep() : (const Value *)NULL); -+ } -+ -+ inline void emitPRED(int pos, const ValueDef &def) { -+ emitPRED(pos, def.get() ? def.rep() : (const Value *)NULL); -+ } -+ -+ inline void emitCBUF(int buf, int gpr, int off, int len, int align, -+ const ValueRef &ref) { -+ const Value *v = ref.get(); -+ const Symbol *s = v->asSym(); -+ -+ assert(!(s->reg.data.offset & ((1 << align) - 1))); -+ -+ emitField(buf, 5, v->reg.fileIndex); -+ if (gpr >= 0) -+ emitGPR(gpr, ref.getIndirect(0)); -+ emitField(off, 16, s->reg.data.offset); -+ } -+ -+ inline void emitIMMD(int pos, int len, const ValueRef &ref) { -+ const ImmediateValue *imm = ref.get()->asImm(); -+ uint32_t val = imm->reg.data.u32; -+ -+ if (insn->sType == TYPE_F64) { -+ assert(!(imm->reg.data.u64 & 0x00000000ffffffffULL)); -+ val = imm->reg.data.u64 >> 32; -+ } -+ -+ emitField(pos, len, val); -+ } -+ -+ inline void emitADDR(int gpr, int off, int len, int shr, -+ const ValueRef &ref) { -+ const Value *v = ref.get(); -+ assert(!(v->reg.data.offset & ((1 << shr) - 1))); -+ if (gpr >= 0) -+ emitGPR(gpr, ref.getIndirect(0)); -+ emitField(off, len, v->reg.data.offset >> shr); -+ } -+ -+ inline void emitFormA(uint16_t op, uint8_t forms, int src0, int src1, int src2); -+ inline void emitFormA_RRR(uint16_t op, int src1, int src2); -+ inline void emitFormA_RRI(uint16_t op, int src1, int src2); -+ inline void emitFormA_RRC(uint16_t op, int src1, int src2); -+ inline void emitFormA_I32(int src); -+ -+ void emitBRA(); -+ void emitEXIT(); -+ void emitKILL(); -+ void emitNOP(); -+ void emitWARPSYNC(); -+ -+ void emitCS2R(); -+ void emitF2F(); -+ void emitF2I(); -+ void emitFRND(); -+ void emitI2F(); -+ void emitMOV(); -+ void emitPRMT(); -+ void emitS2R(); -+ void emitSEL(); -+ void emitSHFL(); -+ -+ void emitFADD(); -+ void emitFFMA(); -+ void emitFMNMX(); -+ void emitFMUL(); -+ void emitFSET_BF(); -+ void emitFSETP(); -+ void emitFSWZADD(); -+ void emitMUFU(); -+ -+ void emitDADD(); -+ void emitDFMA(); -+ void emitDMUL(); -+ void emitDSETP(); -+ -+ void emitBMSK(); -+ void emitBREV(); -+ void emitFLO(); -+ void emitIABS(); -+ void emitIADD3(); -+ void emitIMAD(); -+ void emitIMAD_WIDE(); -+ void emitISETP(); -+ void emitLEA(); -+ void emitLOP3_LUT(); -+ void emitPOPC(); -+ void emitSGXT(); -+ void emitSHF(); -+ -+ void emitALD(); -+ void emitAST(); -+ void emitATOM(); -+ void emitATOMS(); -+ void emitIPA(); -+ void emitISBERD(); -+ void emitLDSTc(int, int); -+ void emitLDSTs(int, DataType); -+ void emitLD(); -+ void emitLDC(); -+ void emitLDL(); -+ void emitLDS(); -+ void emitOUT(); -+ void emitRED(); -+ void emitST(); -+ void emitSTL(); -+ void emitSTS(); -+ -+ void emitTEXs(int); -+ void emitTEX(); -+ void emitTLD(); -+ void emitTLD4(); -+ void emitTMML(); -+ void emitTXD(); -+ void emitTXQ(); -+ -+ void emitSUHandle(const int); -+ void emitSUTarget(); -+ void emitSUATOM(); -+ void emitSULD(); -+ void emitSUST(); -+ -+ void emitAL2P(); -+ void emitBAR(); -+ void emitCCTL(); -+ void emitMEMBAR(); -+ void emitPIXLD(); -+ void emitPLOP3_LUT(); -+ void emitVOTE(); -+}; -+ -+}; -+#endif -diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp -index bd78b76f384..eee9aa67256 100644 ---- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp -+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp -@@ -170,6 +170,7 @@ private: - NirArrayLMemOffsets regToLmemOffset; - NirBlockMap blocks; - unsigned int curLoopDepth; -+ unsigned int curIfDepth; - - BasicBlock *exit; - Value *zero; -@@ -188,6 +189,7 @@ Converter::Converter(Program *prog, nir_shader *nir, nv50_ir_prog_info *info) - : ConverterCommon(prog, info), - nir(nir), - curLoopDepth(0), -+ curIfDepth(0), - clipVertexOutput(-1) - { - zero = mkImm((uint32_t)0); -@@ -571,6 +573,10 @@ Converter::getSubOp(nir_op op) - case nir_op_imul_high: - case nir_op_umul_high: - return NV50_IR_SUBOP_MUL_HIGH; -+ case nir_op_ishl: -+ case nir_op_ishr: -+ case nir_op_ushr: -+ return NV50_IR_SUBOP_SHIFT_WRAP; - default: - return 0; - } -@@ -909,7 +915,7 @@ calcSlots(const glsl_type *type, Program::Type stage, const shader_info &info, - uint16_t slots; - switch (stage) { - case Program::TYPE_GEOMETRY: -- slots = type->uniform_locations(); -+ slots = type->count_attribute_slots(false); - if (input) - slots /= info.gs.vertices_in; - break; -@@ -917,9 +923,9 @@ calcSlots(const glsl_type *type, Program::Type stage, const shader_info &info, - case Program::TYPE_TESSELLATION_EVAL: - // remove first dimension - if (var->data.patch || (!input && stage == Program::TYPE_TESSELLATION_EVAL)) -- slots = type->uniform_locations(); -+ slots = type->count_attribute_slots(false); - else -- slots = type->fields.array->uniform_locations(); -+ slots = type->fields.array->count_attribute_slots(false); - break; - default: - slots = type->count_attribute_slots(false); -@@ -929,6 +935,24 @@ calcSlots(const glsl_type *type, Program::Type stage, const shader_info &info, - return slots; - } - -+static uint8_t -+getMaskForType(const glsl_type *type, uint8_t slot) { -+ uint16_t comp = type->without_array()->components(); -+ comp = comp ? comp : 4; -+ -+ if (glsl_base_type_is_64bit(type->without_array()->base_type)) { -+ comp *= 2; -+ if (comp > 4) { -+ if (slot % 2) -+ comp -= 4; -+ else -+ comp = 4; -+ } -+ } -+ -+ return (1 << comp) - 1; -+} -+ - bool Converter::assignSlots() { - unsigned name; - unsigned index; -@@ -981,16 +1005,8 @@ bool Converter::assignSlots() { - const glsl_type *type = var->type; - int slot = var->data.location; - uint16_t slots = calcSlots(type, prog->getType(), nir->info, true, var); -- uint32_t comp = type->is_array() ? type->without_array()->component_slots() -- : type->component_slots(); -- uint32_t frac = var->data.location_frac; - uint32_t vary = var->data.driver_location; - -- if (glsl_base_type_is_64bit(type->without_array()->base_type)) { -- if (comp > 2) -- slots *= 2; -- } -- - assert(vary + slots <= PIPE_MAX_SHADER_INPUTS); - - switch(prog->getType()) { -@@ -1014,6 +1030,8 @@ bool Converter::assignSlots() { - info->numPatchConstants = MAX2(info->numPatchConstants, index + slots); - break; - case Program::TYPE_VERTEX: -+ if (slot >= VERT_ATTRIB_GENERIC0) -+ slot = VERT_ATTRIB_GENERIC0 + vary; - vert_attrib_to_tgsi_semantic((gl_vert_attrib)slot, &name, &index); - switch (name) { - case TGSI_SEMANTIC_EDGEFLAG: -@@ -1029,17 +1047,12 @@ bool Converter::assignSlots() { - } - - for (uint16_t i = 0u; i < slots; ++i, ++vary) { -- info->in[vary].id = vary; -- info->in[vary].patch = var->data.patch; -- info->in[vary].sn = name; -- info->in[vary].si = index + i; -- if (glsl_base_type_is_64bit(type->without_array()->base_type)) -- if (i & 0x1) -- info->in[vary].mask |= (((1 << (comp * 2)) - 1) << (frac * 2) >> 0x4); -- else -- info->in[vary].mask |= (((1 << (comp * 2)) - 1) << (frac * 2) & 0xf); -- else -- info->in[vary].mask |= ((1 << comp) - 1) << frac; -+ nv50_ir_varying *v = &info->in[vary]; -+ -+ v->patch = var->data.patch; -+ v->sn = name; -+ v->si = index + i; -+ v->mask |= getMaskForType(type, i) << var->data.location_frac; - } - info->numInputs = std::max(info->numInputs, vary); - } -@@ -1048,16 +1061,8 @@ bool Converter::assignSlots() { - const glsl_type *type = var->type; - int slot = var->data.location; - uint16_t slots = calcSlots(type, prog->getType(), nir->info, false, var); -- uint32_t comp = type->is_array() ? type->without_array()->component_slots() -- : type->component_slots(); -- uint32_t frac = var->data.location_frac; - uint32_t vary = var->data.driver_location; - -- if (glsl_base_type_is_64bit(type->without_array()->base_type)) { -- if (comp > 2) -- slots *= 2; -- } -- - assert(vary < PIPE_MAX_SHADER_OUTPUTS); - - switch(prog->getType()) { -@@ -1067,7 +1072,11 @@ bool Converter::assignSlots() { - case TGSI_SEMANTIC_COLOR: - if (!var->data.fb_fetch_output) - info->prop.fp.numColourResults++; -- info->prop.fp.separateFragData = true; -+ -+ if (var->data.location == FRAG_RESULT_COLOR && -+ nir->info.outputs_written & BITFIELD64_BIT(var->data.location)) -+ info->prop.fp.separateFragData = true; -+ - // sometimes we get FRAG_RESULT_DATAX with data.index 0 - // sometimes we get FRAG_RESULT_DATA0 with data.index X - index = index == 0 ? var->data.index : index; -@@ -1118,20 +1127,14 @@ bool Converter::assignSlots() { - } - - for (uint16_t i = 0u; i < slots; ++i, ++vary) { -- info->out[vary].id = vary; -- info->out[vary].patch = var->data.patch; -- info->out[vary].sn = name; -- info->out[vary].si = index + i; -- if (glsl_base_type_is_64bit(type->without_array()->base_type)) -- if (i & 0x1) -- info->out[vary].mask |= (((1 << (comp * 2)) - 1) << (frac * 2) >> 0x4); -- else -- info->out[vary].mask |= (((1 << (comp * 2)) - 1) << (frac * 2) & 0xf); -- else -- info->out[vary].mask |= ((1 << comp) - 1) << frac; -+ nv50_ir_varying *v = &info->out[vary]; -+ v->patch = var->data.patch; -+ v->sn = name; -+ v->si = index + i; -+ v->mask |= getMaskForType(type, i) << var->data.location_frac; - - if (nir->info.outputs_read & 1ull << slot) -- info->out[vary].oread = 1; -+ v->oread = 1; - } - info->numOutputs = std::max(info->numOutputs, vary); - } -@@ -1275,6 +1278,7 @@ Converter::parseNIR() - info->bin.tlsSpace = 0; - info->io.clipDistances = nir->info.clip_distance_array_size; - info->io.cullDistances = nir->info.cull_distance_array_size; -+ info->io.layer_viewport_relative = nir->info.layer_viewport_relative; - - switch(prog->getType()) { - case Program::TYPE_COMPUTE: -@@ -1291,7 +1295,7 @@ Converter::parseNIR() - info->prop.fp.postDepthCoverage = nir->info.fs.post_depth_coverage; - info->prop.fp.readsSampleLocations = - (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_POS); -- info->prop.fp.usesDiscard = nir->info.fs.uses_discard; -+ info->prop.fp.usesDiscard = nir->info.fs.uses_discard || nir->info.fs.uses_demote; - info->prop.fp.usesSampleMaskIn = - !!(nir->info.system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN); - break; -@@ -1426,64 +1430,69 @@ Converter::visit(nir_block *block) - bool - Converter::visit(nir_if *nif) - { -+ curIfDepth++; -+ - DataType sType = getSType(nif->condition, false, false); - Value *src = getSrc(&nif->condition, 0); - - nir_block *lastThen = nir_if_last_then_block(nif); - nir_block *lastElse = nir_if_last_else_block(nif); - -- assert(!lastThen->successors[1]); -- assert(!lastElse->successors[1]); -- -+ BasicBlock *headBB = bb; - BasicBlock *ifBB = convert(nir_if_first_then_block(nif)); - BasicBlock *elseBB = convert(nir_if_first_else_block(nif)); - - bb->cfg.attach(&ifBB->cfg, Graph::Edge::TREE); - bb->cfg.attach(&elseBB->cfg, Graph::Edge::TREE); - -- // we only insert joinats, if both nodes end up at the end of the if again. -- // the reason for this to not happens are breaks/continues/ret/... which -- // have their own handling -- if (lastThen->successors[0] == lastElse->successors[0]) -- bb->joinAt = mkFlow(OP_JOINAT, convert(lastThen->successors[0]), -- CC_ALWAYS, NULL); -- -+ bool insertJoins = lastThen->successors[0] == lastElse->successors[0]; - mkFlow(OP_BRA, elseBB, CC_EQ, src)->setType(sType); - - foreach_list_typed(nir_cf_node, node, node, &nif->then_list) { - if (!visit(node)) - return false; - } -+ - setPosition(convert(lastThen), true); -- if (!bb->getExit() || -- !bb->getExit()->asFlow() || -- bb->getExit()->asFlow()->op == OP_JOIN) { -+ if (!bb->isTerminated()) { - BasicBlock *tailBB = convert(lastThen->successors[0]); - mkFlow(OP_BRA, tailBB, CC_ALWAYS, NULL); - bb->cfg.attach(&tailBB->cfg, Graph::Edge::FORWARD); -+ } else { -+ insertJoins = insertJoins && bb->getExit()->op == OP_BRA; - } - - foreach_list_typed(nir_cf_node, node, node, &nif->else_list) { - if (!visit(node)) - return false; - } -+ - setPosition(convert(lastElse), true); -- if (!bb->getExit() || -- !bb->getExit()->asFlow() || -- bb->getExit()->asFlow()->op == OP_JOIN) { -+ if (!bb->isTerminated()) { - BasicBlock *tailBB = convert(lastElse->successors[0]); - mkFlow(OP_BRA, tailBB, CC_ALWAYS, NULL); - bb->cfg.attach(&tailBB->cfg, Graph::Edge::FORWARD); -+ } else { -+ insertJoins = insertJoins && bb->getExit()->op == OP_BRA; - } - -- if (lastThen->successors[0] == lastElse->successors[0]) { -- setPosition(convert(lastThen->successors[0]), true); -+ /* only insert joins for the most outer if */ -+ if (--curIfDepth) -+ insertJoins = false; -+ -+ /* we made sure that all threads would converge at the same block */ -+ if (insertJoins) { -+ BasicBlock *conv = convert(lastThen->successors[0]); -+ setPosition(headBB->getExit(), false); -+ headBB->joinAt = mkFlow(OP_JOINAT, conv, CC_ALWAYS, NULL); -+ setPosition(conv, false); - mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1; - } - - return true; - } - -+// TODO: add convergency - bool - Converter::visit(nir_loop *loop) - { -@@ -1491,8 +1500,8 @@ Converter::visit(nir_loop *loop) - func->loopNestingBound = std::max(func->loopNestingBound, curLoopDepth); - - BasicBlock *loopBB = convert(nir_loop_first_block(loop)); -- BasicBlock *tailBB = -- convert(nir_cf_node_as_block(nir_cf_node_next(&loop->cf_node))); -+ BasicBlock *tailBB = convert(nir_cf_node_as_block(nir_cf_node_next(&loop->cf_node))); -+ - bb->cfg.attach(&loopBB->cfg, Graph::Edge::TREE); - - mkFlow(OP_PREBREAK, tailBB, CC_ALWAYS, NULL); -@@ -1503,19 +1512,15 @@ Converter::visit(nir_loop *loop) - if (!visit(node)) - return false; - } -- Instruction *insn = bb->getExit(); -- if (bb->cfg.incidentCount() != 0) { -- if (!insn || !insn->asFlow()) { -- mkFlow(OP_CONT, loopBB, CC_ALWAYS, NULL); -- bb->cfg.attach(&loopBB->cfg, Graph::Edge::BACK); -- } else if (insn && insn->op == OP_BRA && !insn->getPredicate() && -- tailBB->cfg.incidentCount() == 0) { -- // RA doesn't like having blocks around with no incident edge, -- // so we create a fake one to make it happy -- bb->cfg.attach(&tailBB->cfg, Graph::Edge::TREE); -- } -+ -+ if (!bb->isTerminated()) { -+ mkFlow(OP_CONT, loopBB, CC_ALWAYS, NULL); -+ bb->cfg.attach(&loopBB->cfg, Graph::Edge::BACK); - } - -+ if (tailBB->cfg.incidentCount() == 0) -+ loopBB->cfg.attach(&tailBB->cfg, Graph::Edge::TREE); -+ - curLoopDepth -= 1; - - return true; -@@ -1560,6 +1565,7 @@ Converter::convert(nir_intrinsic_op intr) - return SV_DRAWID; - case nir_intrinsic_load_front_face: - return SV_FACE; -+ case nir_intrinsic_is_helper_invocation: - case nir_intrinsic_load_helper_invocation: - return SV_THREAD_KILL; - case nir_intrinsic_load_instance_id: -@@ -1617,6 +1623,7 @@ Converter::visit(nir_intrinsic_instr *insn) - { - nir_intrinsic_op op = insn->intrinsic; - const nir_intrinsic_info &opInfo = nir_intrinsic_infos[op]; -+ unsigned dest_components = nir_intrinsic_dest_components(insn); - - switch (op) { - case nir_intrinsic_load_uniform: { -@@ -1624,7 +1631,7 @@ Converter::visit(nir_intrinsic_instr *insn) - const DataType dType = getDType(insn); - Value *indirect; - uint32_t coffset = getIndirect(insn, 0, 0, indirect); -- for (uint8_t i = 0; i < insn->num_components; ++i) { -+ for (uint8_t i = 0; i < dest_components; ++i) { - loadFrom(FILE_MEMORY_CONST, 0, dType, newDefs[i], 16 * coffset, i, indirect); - } - break; -@@ -1635,7 +1642,7 @@ Converter::visit(nir_intrinsic_instr *insn) - DataType dType = getSType(insn->src[0], false, false); - uint32_t idx = getIndirect(insn, op == nir_intrinsic_store_output ? 1 : 2, 0, indirect); - -- for (uint8_t i = 0u; i < insn->num_components; ++i) { -+ for (uint8_t i = 0u; i < nir_intrinsic_src_components(insn, 0); ++i) { - if (!((1u << i) & nir_intrinsic_write_mask(insn))) - continue; - -@@ -1652,6 +1659,7 @@ Converter::visit(nir_intrinsic_instr *insn) - break; - } - case Program::TYPE_GEOMETRY: -+ case Program::TYPE_TESSELLATION_EVAL: - case Program::TYPE_VERTEX: { - if (info->io.genUserClip > 0 && idx == (uint32_t)clipVertexOutput) { - mkMov(clipVtx[i], src); -@@ -1688,7 +1696,7 @@ Converter::visit(nir_intrinsic_instr *insn) - srcs.push_back(mkOp1v(OP_RDSV, TYPE_U32, getSSA(), mkSysVal(SV_LAYER, 0))); - srcs.push_back(mkOp1v(OP_RDSV, TYPE_U32, getSSA(), mkSysVal(SV_SAMPLE_INDEX, 0))); - -- for (uint8_t i = 0u; i < insn->num_components; ++i) { -+ for (uint8_t i = 0u; i < dest_components; ++i) { - defs.push_back(newDefs[i]); - mask |= 1 << i; - } -@@ -1715,15 +1723,25 @@ Converter::visit(nir_intrinsic_instr *insn) - - // see load_barycentric_* handling - if (prog->getType() == Program::TYPE_FRAGMENT) { -- mode = translateInterpMode(&vary, nvirOp); - if (op == nir_intrinsic_load_interpolated_input) { - ImmediateValue immMode; - if (getSrc(&insn->src[0], 1)->getUniqueInsn()->src(0).getImmediate(immMode)) -- mode |= immMode.reg.data.u32; -+ mode = immMode.reg.data.u32; -+ } -+ if (mode == NV50_IR_INTERP_DEFAULT) -+ mode |= translateInterpMode(&vary, nvirOp); -+ else { -+ if (vary.linear) { -+ nvirOp = OP_LINTERP; -+ mode |= NV50_IR_INTERP_LINEAR; -+ } else { -+ nvirOp = OP_PINTERP; -+ mode |= NV50_IR_INTERP_PERSPECTIVE; -+ } - } - } - -- for (uint8_t i = 0u; i < insn->num_components; ++i) { -+ for (uint8_t i = 0u; i < dest_components; ++i) { - uint32_t address = getSlotAddress(insn, idx, i); - Symbol *sym = mkSymbol(input ? FILE_SHADER_INPUT : FILE_SHADER_OUTPUT, 0, dType, address); - if (prog->getType() == Program::TYPE_FRAGMENT) { -@@ -1814,9 +1832,11 @@ Converter::visit(nir_intrinsic_instr *insn) - loadImm(newDefs[1], mode); - break; - } -+ case nir_intrinsic_demote: - case nir_intrinsic_discard: - mkOp(OP_DISCARD, TYPE_NONE, NULL); - break; -+ case nir_intrinsic_demote_if: - case nir_intrinsic_discard_if: { - Value *pred = getSSA(1, FILE_PREDICATE); - if (insn->num_components > 1) { -@@ -1832,6 +1852,7 @@ Converter::visit(nir_intrinsic_instr *insn) - case nir_intrinsic_load_base_instance: - case nir_intrinsic_load_draw_id: - case nir_intrinsic_load_front_face: -+ case nir_intrinsic_is_helper_invocation: - case nir_intrinsic_load_helper_invocation: - case nir_intrinsic_load_instance_id: - case nir_intrinsic_load_invocation_id: -@@ -1858,7 +1879,7 @@ Converter::visit(nir_intrinsic_instr *insn) - SVSemantic sv = convert(op); - LValues &newDefs = convert(&insn->dest); - -- for (uint8_t i = 0u; i < insn->num_components; ++i) { -+ for (uint8_t i = 0u; i < nir_intrinsic_dest_components(insn); ++i) { - Value *def; - if (typeSizeof(dType) == 8) - def = getSSA(); -@@ -1910,12 +1931,12 @@ Converter::visit(nir_intrinsic_instr *insn) - - if (op == nir_intrinsic_read_first_invocation) { - mkOp1(OP_VOTE, TYPE_U32, tmp, mkImm(1))->subOp = NV50_IR_SUBOP_VOTE_ANY; -- mkOp2(OP_EXTBF, TYPE_U32, tmp, tmp, mkImm(0x2000))->subOp = NV50_IR_SUBOP_EXTBF_REV; -+ mkOp1(OP_BREV, TYPE_U32, tmp, tmp); - mkOp1(OP_BFIND, TYPE_U32, tmp, tmp)->subOp = NV50_IR_SUBOP_BFIND_SAMT; - } else - tmp = getSrc(&insn->src[1], 0); - -- for (uint8_t i = 0; i < insn->num_components; ++i) { -+ for (uint8_t i = 0; i < dest_components; ++i) { - mkOp3(OP_SHFL, dType, newDefs[i], getSrc(&insn->src[0], i), tmp, mkImm(0x1f)) - ->subOp = NV50_IR_SUBOP_SHFL_IDX; - } -@@ -1931,7 +1952,7 @@ Converter::visit(nir_intrinsic_instr *insn) - - Value *vtxBase = mkOp2v(OP_PFETCH, TYPE_U32, getSSA(4, FILE_ADDRESS), - mkImm(baseVertex), indirectVertex); -- for (uint8_t i = 0u; i < insn->num_components; ++i) { -+ for (uint8_t i = 0u; i < dest_components; ++i) { - uint32_t address = getSlotAddress(insn, idx, i); - loadFrom(FILE_SHADER_INPUT, 0, dType, newDefs[i], address, 0, - indirectOffset, vtxBase, info->in[idx].patch); -@@ -1954,19 +1975,24 @@ Converter::visit(nir_intrinsic_instr *insn) - - vtxBase = mkOp2v(OP_ADD, TYPE_U32, getSSA(4, FILE_ADDRESS), outBase, vtxBase); - -- for (uint8_t i = 0u; i < insn->num_components; ++i) { -+ for (uint8_t i = 0u; i < dest_components; ++i) { - uint32_t address = getSlotAddress(insn, idx, i); - loadFrom(FILE_SHADER_OUTPUT, 0, dType, newDefs[i], address, 0, - indirectOffset, vtxBase, info->in[idx].patch); - } - break; - } -- case nir_intrinsic_emit_vertex: -+ case nir_intrinsic_emit_vertex: { - if (info->io.genUserClip > 0) - handleUserClipPlanes(); -- // fallthrough -+ uint32_t idx = nir_intrinsic_stream_id(insn); -+ mkOp1(getOperation(op), TYPE_U32, NULL, mkImm(idx))->fixed = 1; -+ break; -+ } - case nir_intrinsic_end_primitive: { - uint32_t idx = nir_intrinsic_stream_id(insn); -+ if (idx) -+ break; - mkOp1(getOperation(op), TYPE_U32, NULL, mkImm(idx))->fixed = 1; - break; - } -@@ -1978,7 +2004,7 @@ Converter::visit(nir_intrinsic_instr *insn) - uint32_t index = getIndirect(&insn->src[0], 0, indirectIndex) + 1; - uint32_t offset = getIndirect(&insn->src[1], 0, indirectOffset); - -- for (uint8_t i = 0u; i < insn->num_components; ++i) { -+ for (uint8_t i = 0u; i < dest_components; ++i) { - loadFrom(FILE_MEMORY_CONST, index, dType, newDefs[i], offset, i, - indirectOffset, indirectIndex); - } -@@ -2001,7 +2027,7 @@ Converter::visit(nir_intrinsic_instr *insn) - uint32_t buffer = getIndirect(&insn->src[1], 0, indirectBuffer); - uint32_t offset = getIndirect(&insn->src[2], 0, indirectOffset); - -- for (uint8_t i = 0u; i < insn->num_components; ++i) { -+ for (uint8_t i = 0u; i < nir_intrinsic_src_components(insn, 0); ++i) { - if (!((1u << i) & nir_intrinsic_write_mask(insn))) - continue; - Symbol *sym = mkSymbol(FILE_MEMORY_BUFFER, buffer, sType, -@@ -2020,7 +2046,7 @@ Converter::visit(nir_intrinsic_instr *insn) - uint32_t buffer = getIndirect(&insn->src[0], 0, indirectBuffer); - uint32_t offset = getIndirect(&insn->src[1], 0, indirectOffset); - -- for (uint8_t i = 0u; i < insn->num_components; ++i) -+ for (uint8_t i = 0u; i < dest_components; ++i) - loadFrom(FILE_MEMORY_BUFFER, buffer, dType, newDefs[i], offset, i, - indirectOffset, indirectBuffer); - -@@ -2314,7 +2340,7 @@ Converter::visit(nir_intrinsic_instr *insn) - Value *indirectOffset; - uint32_t offset = getIndirect(&insn->src[1], 0, indirectOffset); - -- for (uint8_t i = 0u; i < insn->num_components; ++i) { -+ for (uint8_t i = 0u; i < nir_intrinsic_src_components(insn, 0); ++i) { - if (!((1u << i) & nir_intrinsic_write_mask(insn))) - continue; - Symbol *sym = mkSymbol(FILE_MEMORY_SHARED, 0, sType, offset + i * typeSizeof(sType)); -@@ -2328,7 +2354,7 @@ Converter::visit(nir_intrinsic_instr *insn) - Value *indirectOffset; - uint32_t offset = getIndirect(&insn->src[0], 0, indirectOffset); - -- for (uint8_t i = 0u; i < insn->num_components; ++i) -+ for (uint8_t i = 0u; i < dest_components; ++i) - loadFrom(FILE_MEMORY_SHARED, 0, dType, newDefs[i], offset, i, indirectOffset); - - break; -@@ -2367,7 +2393,7 @@ Converter::visit(nir_intrinsic_instr *insn) - Value *indirectOffset; - uint32_t offset = getIndirect(&insn->src[0], 0, indirectOffset); - -- for (auto i = 0u; i < insn->num_components; ++i) -+ for (auto i = 0u; i < dest_components; ++i) - loadFrom(FILE_MEMORY_GLOBAL, 0, dType, newDefs[i], offset, i, indirectOffset); - - info->io.globalAccess |= 0x1; -@@ -2376,7 +2402,7 @@ Converter::visit(nir_intrinsic_instr *insn) - case nir_intrinsic_store_global: { - DataType sType = getSType(insn->src[0], false, false); - -- for (auto i = 0u; i < insn->num_components; ++i) { -+ for (auto i = 0u; i < nir_intrinsic_src_components(insn, 0); ++i) { - if (!((1u << i) & nir_intrinsic_write_mask(insn))) - continue; - if (typeSizeof(sType) == 8) { -@@ -2418,7 +2444,6 @@ Converter::visit(nir_jump_instr *insn) - case nir_jump_continue: { - bool isBreak = insn->type == nir_jump_break; - nir_block *block = insn->instr.block; -- assert(!block->successors[1]); - BasicBlock *target = convert(block->successors[0]); - mkFlow(isBreak ? OP_BREAK : OP_CONT, target, CC_ALWAYS, NULL); - bb->cfg.attach(&target->cfg, isBreak ? Graph::Edge::CROSS : Graph::Edge::BACK); -@@ -2774,7 +2799,7 @@ Converter::visit(nir_alu_instr *insn) - case nir_op_bfm: { - DEFAULT_CHECKS; - LValues &newDefs = convert(&insn->dest); -- mkOp3(OP_INSBF, dType, newDefs[0], getSrc(&insn->src[0]), loadImm(NULL, 0x808), getSrc(&insn->src[1])); -+ mkOp2(OP_BMSK, dType, newDefs[0], getSrc(&insn->src[1]), getSrc(&insn->src[0]))->subOp = NV50_IR_SUBOP_BMSK_W; - break; - } - case nir_op_bitfield_insert: { -@@ -2794,17 +2819,69 @@ Converter::visit(nir_alu_instr *insn) - case nir_op_bitfield_reverse: { - DEFAULT_CHECKS; - LValues &newDefs = convert(&insn->dest); -- mkOp2(OP_EXTBF, TYPE_U32, newDefs[0], getSrc(&insn->src[0]), mkImm(0x2000))->subOp = NV50_IR_SUBOP_EXTBF_REV; -+ mkOp1(OP_BREV, TYPE_U32, newDefs[0], getSrc(&insn->src[0])); - break; - } - case nir_op_find_lsb: { - DEFAULT_CHECKS; - LValues &newDefs = convert(&insn->dest); - Value *tmp = getSSA(); -- mkOp2(OP_EXTBF, TYPE_U32, tmp, getSrc(&insn->src[0]), mkImm(0x2000))->subOp = NV50_IR_SUBOP_EXTBF_REV; -+ mkOp1(OP_BREV, TYPE_U32, tmp, getSrc(&insn->src[0])); - mkOp1(OP_BFIND, TYPE_U32, newDefs[0], tmp)->subOp = NV50_IR_SUBOP_BFIND_SAMT; - break; - } -+ case nir_op_extract_u8: { -+ DEFAULT_CHECKS; -+ LValues &newDefs = convert(&insn->dest); -+ Value *prmt = getSSA(); -+ mkOp2(OP_OR, TYPE_U32, prmt, getSrc(&insn->src[1]), loadImm(NULL, 0x4440)); -+ mkOp3(OP_PERMT, TYPE_U32, newDefs[0], getSrc(&insn->src[0]), prmt, loadImm(NULL, 0)); -+ break; -+ } -+ case nir_op_extract_i8: { -+ DEFAULT_CHECKS; -+ LValues &newDefs = convert(&insn->dest); -+ Value *prmt = getSSA(); -+ mkOp3(OP_MAD, TYPE_U32, prmt, getSrc(&insn->src[1]), loadImm(NULL, 0x1111), loadImm(NULL, 0x8880)); -+ mkOp3(OP_PERMT, TYPE_U32, newDefs[0], getSrc(&insn->src[0]), prmt, loadImm(NULL, 0)); -+ break; -+ } -+ case nir_op_extract_u16: { -+ DEFAULT_CHECKS; -+ LValues &newDefs = convert(&insn->dest); -+ Value *prmt = getSSA(); -+ mkOp3(OP_MAD, TYPE_U32, prmt, getSrc(&insn->src[1]), loadImm(NULL, 0x22), loadImm(NULL, 0x4410)); -+ mkOp3(OP_PERMT, TYPE_U32, newDefs[0], getSrc(&insn->src[0]), prmt, loadImm(NULL, 0)); -+ break; -+ } -+ case nir_op_extract_i16: { -+ DEFAULT_CHECKS; -+ LValues &newDefs = convert(&insn->dest); -+ Value *prmt = getSSA(); -+ mkOp3(OP_MAD, TYPE_U32, prmt, getSrc(&insn->src[1]), loadImm(NULL, 0x2222), loadImm(NULL, 0x9910)); -+ mkOp3(OP_PERMT, TYPE_U32, newDefs[0], getSrc(&insn->src[0]), prmt, loadImm(NULL, 0)); -+ break; -+ } -+ case nir_op_urol: { -+ DEFAULT_CHECKS; -+ LValues &newDefs = convert(&insn->dest); -+ mkOp3(OP_SHF, TYPE_U32, newDefs[0], getSrc(&insn->src[0]), -+ getSrc(&insn->src[1]), getSrc(&insn->src[0])) -+ ->subOp = NV50_IR_SUBOP_SHF_L | -+ NV50_IR_SUBOP_SHF_W | -+ NV50_IR_SUBOP_SHF_HI; -+ break; -+ } -+ case nir_op_uror: { -+ DEFAULT_CHECKS; -+ LValues &newDefs = convert(&insn->dest); -+ mkOp3(OP_SHF, TYPE_U32, newDefs[0], getSrc(&insn->src[0]), -+ getSrc(&insn->src[1]), getSrc(&insn->src[0])) -+ ->subOp = NV50_IR_SUBOP_SHF_R | -+ NV50_IR_SUBOP_SHF_W | -+ NV50_IR_SUBOP_SHF_LO; -+ break; -+ } - // boolean conversions - case nir_op_b2f32: { - DEFAULT_CHECKS; -@@ -2990,14 +3067,11 @@ Converter::handleDeref(nir_deref_instr *deref, Value * &indirect, const nir_vari - CacheMode - Converter::convert(enum gl_access_qualifier access) - { -- switch (access) { -- case ACCESS_VOLATILE: -+ if (access & ACCESS_VOLATILE) - return CACHE_CV; -- case ACCESS_COHERENT: -+ if (access & ACCESS_COHERENT) - return CACHE_CG; -- default: -- return CACHE_CA; -- } -+ return CACHE_CA; - } - - CacheMode -@@ -3224,6 +3298,11 @@ Converter::run() - NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL); - NIR_PASS_V(nir, nir_lower_phis_to_scalar); - -+ /*TODO: improve this lowering/optimisation loop so that we can use -+ * nir_opt_idiv_const effectively before this. -+ */ -+ NIR_PASS(progress, nir, nir_lower_idiv, nir_lower_idiv_precise); -+ - do { - progress = false; - NIR_PASS(progress, nir, nir_copy_prop); -@@ -3285,3 +3364,125 @@ Program::makeFromNIR(struct nv50_ir_prog_info *info) - } - - } // namespace nv50_ir -+ -+static nir_shader_compiler_options -+nvir_nir_shader_compiler_options(int chipset) -+{ -+ nir_shader_compiler_options op = {}; -+ op.lower_fdiv = (chipset >= NVISA_GV100_CHIPSET); -+ op.lower_ffma = false; -+ op.fuse_ffma = false; /* nir doesn't track mad vs fma */ -+ op.lower_flrp16 = (chipset >= NVISA_GV100_CHIPSET); -+ op.lower_flrp32 = true; -+ op.lower_flrp64 = true; -+ op.lower_fpow = false; // TODO: nir's lowering is broken, or we could use it -+ op.lower_fsat = false; -+ op.lower_fsqrt = false; // TODO: only before gm200 -+ op.lower_sincos = false; -+ op.lower_fmod = true; -+ op.lower_bitfield_extract = false; -+ op.lower_bitfield_extract_to_shifts = (chipset >= NVISA_GV100_CHIPSET); -+ op.lower_bitfield_insert = false; -+ op.lower_bitfield_insert_to_shifts = (chipset >= NVISA_GV100_CHIPSET); -+ op.lower_bitfield_insert_to_bitfield_select = false; -+ op.lower_bitfield_reverse = false; -+ op.lower_bit_count = false; -+ op.lower_ifind_msb = false; -+ op.lower_find_lsb = false; -+ op.lower_uadd_carry = true; // TODO -+ op.lower_usub_borrow = true; // TODO -+ op.lower_mul_high = false; -+ op.lower_negate = false; -+ op.lower_sub = true; -+ op.lower_scmp = true; // TODO: not implemented yet -+ op.lower_vector_cmp = false; -+ op.lower_idiv = true; -+ op.lower_bitops = false; -+ op.lower_isign = (chipset >= NVISA_GV100_CHIPSET); -+ op.lower_fsign = (chipset >= NVISA_GV100_CHIPSET); -+ op.lower_fdph = false; -+ op.lower_fdot = false; -+ op.fdot_replicates = false; // TODO -+ op.lower_ffloor = false; // TODO -+ op.lower_ffract = true; -+ op.lower_fceil = false; // TODO -+ op.lower_ftrunc = false; -+ op.lower_ldexp = true; -+ op.lower_pack_half_2x16 = true; -+ op.lower_pack_unorm_2x16 = true; -+ op.lower_pack_snorm_2x16 = true; -+ op.lower_pack_unorm_4x8 = true; -+ op.lower_pack_snorm_4x8 = true; -+ op.lower_unpack_half_2x16 = true; -+ op.lower_unpack_unorm_2x16 = true; -+ op.lower_unpack_snorm_2x16 = true; -+ op.lower_unpack_unorm_4x8 = true; -+ op.lower_unpack_snorm_4x8 = true; -+ op.lower_pack_split = false; -+ op.lower_extract_byte = (chipset < NVISA_GM107_CHIPSET); -+ op.lower_extract_word = (chipset < NVISA_GM107_CHIPSET); -+ op.lower_all_io_to_temps = false; -+ op.lower_all_io_to_elements = false; -+ op.vertex_id_zero_based = false; -+ op.lower_base_vertex = false; -+ op.lower_helper_invocation = false; -+ op.optimize_sample_mask_in = false; -+ op.lower_cs_local_index_from_id = true; -+ op.lower_cs_local_id_from_index = false; -+ op.lower_device_index_to_zero = false; // TODO -+ op.lower_wpos_pntc = false; // TODO -+ op.lower_hadd = true; // TODO -+ op.lower_add_sat = true; // TODO -+ op.vectorize_io = false; -+ op.lower_to_scalar = false; -+ op.unify_interfaces = false; -+ op.use_interpolated_input_intrinsics = true; -+ op.lower_mul_2x32_64 = true; // TODO -+ op.lower_rotate = (chipset < NVISA_GV100_CHIPSET); -+ op.has_imul24 = false; -+ op.intel_vec4 = false; -+ op.max_unroll_iterations = 32; -+ op.lower_int64_options = (nir_lower_int64_options) ( -+ ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_imul64 : 0) | -+ ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_isign64 : 0) | -+ nir_lower_divmod64 | -+ ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_imul_high64 : 0) | -+ ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_mov64 : 0) | -+ ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_icmp64 : 0) | -+ ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_iabs64 : 0) | -+ ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_ineg64 : 0) | -+ ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_logic64 : 0) | -+ ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_minmax64 : 0) | -+ ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_shift64 : 0) | -+ ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_imul_2x32_64 : 0) | -+ ((chipset >= NVISA_GM107_CHIPSET) ? nir_lower_extract64 : 0) | -+ nir_lower_ufind_msb64 -+ ); -+ op.lower_doubles_options = (nir_lower_doubles_options) ( -+ ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_drcp : 0) | -+ ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_dsqrt : 0) | -+ ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_drsq : 0) | -+ ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_dfract : 0) | -+ nir_lower_dmod | -+ ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_dsub : 0) | -+ ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_ddiv : 0) -+ ); -+ return op; -+} -+ -+static const nir_shader_compiler_options gf100_nir_shader_compiler_options = -+nvir_nir_shader_compiler_options(NVISA_GF100_CHIPSET); -+static const nir_shader_compiler_options gm107_nir_shader_compiler_options = -+nvir_nir_shader_compiler_options(NVISA_GM107_CHIPSET); -+static const nir_shader_compiler_options gv100_nir_shader_compiler_options = -+nvir_nir_shader_compiler_options(NVISA_GV100_CHIPSET); -+ -+const nir_shader_compiler_options * -+nv50_ir_nir_shader_compiler_options(int chipset) -+{ -+ if (chipset >= NVISA_GV100_CHIPSET) -+ return &gv100_nir_shader_compiler_options; -+ if (chipset >= NVISA_GM107_CHIPSET) -+ return &gm107_nir_shader_compiler_options; -+ return &gf100_nir_shader_compiler_options; -+} -diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp -index 60f3d582a0b..3fd76f64de0 100644 ---- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp -+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp -@@ -3401,8 +3401,7 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn) - // ReadInvocationARB(src, findLSB(ballot(true))) - val0 = getScratch(); - mkOp1(OP_VOTE, TYPE_U32, val0, mkImm(1))->subOp = NV50_IR_SUBOP_VOTE_ANY; -- mkOp2(OP_EXTBF, TYPE_U32, val0, val0, mkImm(0x2000)) -- ->subOp = NV50_IR_SUBOP_EXTBF_REV; -+ mkOp1(OP_BREV, TYPE_U32, val0, val0); - mkOp1(OP_BFIND, TYPE_U32, val0, val0)->subOp = NV50_IR_SUBOP_BFIND_SAMT; - src1 = val0; - /* fallthrough */ -@@ -3820,8 +3819,7 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn) - FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { - src0 = fetchSrc(0, c); - val0 = getScratch(); -- geni = mkOp2(OP_EXTBF, TYPE_U32, val0, src0, mkImm(0x2000)); -- geni->subOp = NV50_IR_SUBOP_EXTBF_REV; -+ mkOp1(OP_BREV, TYPE_U32, val0, src0); - geni = mkOp1(OP_BFIND, TYPE_U32, dst0[c], val0); - geni->subOp = NV50_IR_SUBOP_BFIND_SAMT; - } -@@ -3836,8 +3834,7 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn) - case TGSI_OPCODE_BREV: - FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { - src0 = fetchSrc(0, c); -- geni = mkOp2(OP_EXTBF, TYPE_U32, dst0[c], src0, mkImm(0x2000)); -- geni->subOp = NV50_IR_SUBOP_EXTBF_REV; -+ mkOp1(OP_BREV, TYPE_U32, dst0[c], src0); - } - break; - case TGSI_OPCODE_POPC: -diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp -index 49a5f3b01f2..9fad1dcfe89 100644 ---- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp -+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp -@@ -239,9 +239,8 @@ GM107LoweringPass::handlePFETCH(Instruction *i) - Value *tmp1 = bld.getScratch(); - Value *tmp2 = bld.getScratch(); - bld.mkOp1(OP_RDSV, TYPE_U32, tmp0, bld.mkSysVal(SV_INVOCATION_INFO, 0)); -- bld.mkOp2(OP_SHR , TYPE_U32, tmp1, tmp0, bld.mkImm(16)); -- bld.mkOp2(OP_AND , TYPE_U32, tmp0, tmp0, bld.mkImm(0xff)); -- bld.mkOp2(OP_AND , TYPE_U32, tmp1, tmp1, bld.mkImm(0xff)); -+ bld.mkOp3(OP_PERMT, TYPE_U32, tmp1, tmp0, bld.mkImm(0x4442), bld.mkImm(0)); -+ bld.mkOp3(OP_PERMT, TYPE_U32, tmp0, tmp0, bld.mkImm(0x4440), bld.mkImm(0)); - if (i->getSrc(1)) - bld.mkOp2(OP_ADD , TYPE_U32, tmp2, i->getSrc(0), i->getSrc(1)); - else -diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.h -index 71e5ea6417a..dfa1d035dac 100644 ---- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.h -+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.h -@@ -21,6 +21,7 @@ class GM107LegalizeSSA : public NVC0LegalizeSSA - private: - virtual bool visit(Instruction *); - -+protected: - void handlePFETCH(Instruction *); - void handleLOAD(Instruction *); - }; -diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gv100.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gv100.cpp -new file mode 100644 -index 00000000000..644d4928327 ---- /dev/null -+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gv100.cpp -@@ -0,0 +1,481 @@ -+/* -+ * Copyright 2020 Red Hat Inc. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice shall be included in -+ * all copies or substantial portions of the Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -+ * OTHER DEALINGS IN THE SOFTWARE. -+ */ -+#include "codegen/nv50_ir.h" -+#include "codegen/nv50_ir_build_util.h" -+ -+#include "codegen/nv50_ir_target_nvc0.h" -+#include "codegen/nv50_ir_lowering_gv100.h" -+ -+#include -+ -+namespace nv50_ir { -+ -+bool -+GV100LegalizeSSA::handleCMP(Instruction *i) -+{ -+ Value *pred = bld.getSSA(1, FILE_PREDICATE); -+ -+ bld.mkCmp(OP_SET, reverseCondCode(i->asCmp()->setCond), TYPE_U8, pred, -+ i->sType, bld.mkImm(0), i->getSrc(2))->ftz = i->ftz; -+ bld.mkOp3(OP_SELP, TYPE_U32, i->getDef(0), i->getSrc(0), i->getSrc(1), pred); -+ return true; -+} -+ -+// NIR deals with most of these for us, but codegen generates more in pointer -+// calculations from other lowering passes. -+bool -+GV100LegalizeSSA::handleIADD64(Instruction *i) -+{ -+ Value *carry = bld.getSSA(1, FILE_PREDICATE); -+ Value *def[2] = { bld.getSSA(), bld.getSSA() }; -+ Value *src[2][2]; -+ -+ for (int s = 0; s < 2; s++) { -+ if (i->getSrc(s)->reg.size == 8) { -+ bld.mkSplit(src[s], 4, i->getSrc(s)); -+ } else { -+ src[s][0] = i->getSrc(s); -+ src[s][1] = bld.mkImm(0); -+ } -+ } -+ -+ bld.mkOp2(OP_ADD, TYPE_U32, def[0], src[0][0], src[1][0])-> -+ setFlagsDef(1, carry); -+ bld.mkOp2(OP_ADD, TYPE_U32, def[1], src[0][1], src[1][1])-> -+ setFlagsSrc(2, carry); -+ bld.mkOp2(OP_MERGE, i->dType, i->getDef(0), def[0], def[1]); -+ return true; -+} -+ -+bool -+GV100LegalizeSSA::handleIMAD_HIGH(Instruction *i) -+{ -+ Value *def = bld.getSSA(8), *defs[2]; -+ Value *src2; -+ -+ if (i->srcExists(2) && -+ (!i->getSrc(2)->asImm() || i->getSrc(2)->asImm()->reg.data.u32)) { -+ Value *src2s[2] = { bld.getSSA(), bld.getSSA() }; -+ bld.mkMov(src2s[0], bld.mkImm(0)); -+ bld.mkMov(src2s[1], i->getSrc(2)); -+ src2 = bld.mkOp2(OP_MERGE, TYPE_U64, bld.getSSA(8), src2s[0], src2s[1])->getDef(0); -+ } else { -+ src2 = bld.mkImm(0); -+ } -+ -+ bld.mkOp3(OP_MAD, isSignedType(i->sType) ? TYPE_S64 : TYPE_U64, def, -+ i->getSrc(0), i->getSrc(1), src2); -+ -+ bld.mkSplit(defs, 4, def); -+ i->def(0).replace(defs[1], false); -+ return true; -+} -+ -+// XXX: We should be able to do this in GV100LoweringPass, but codegen messes -+// up somehow and swaps the condcode without swapping the sources. -+// - tests/spec/glsl-1.50/execution/geometry/primitive-id-in.shader_test -+bool -+GV100LegalizeSSA::handleIMNMX(Instruction *i) -+{ -+ Value *pred = bld.getSSA(1, FILE_PREDICATE); -+ -+ bld.mkCmp(OP_SET, (i->op == OP_MIN) ? CC_LT : CC_GT, i->dType, pred, -+ i->sType, i->getSrc(0), i->getSrc(1)); -+ bld.mkOp3(OP_SELP, i->dType, i->getDef(0), i->getSrc(0), i->getSrc(1), pred); -+ return true; -+} -+ -+bool -+GV100LegalizeSSA::handleIMUL(Instruction *i) -+{ -+ if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) -+ return handleIMAD_HIGH(i); -+ -+ bld.mkOp3(OP_MAD, i->dType, i->getDef(0), i->getSrc(0), i->getSrc(1), -+ bld.mkImm(0)); -+ return true; -+} -+ -+bool -+GV100LegalizeSSA::handleLOP2(Instruction *i) -+{ -+ uint8_t src0 = NV50_IR_SUBOP_LOP3_LUT_SRC0; -+ uint8_t src1 = NV50_IR_SUBOP_LOP3_LUT_SRC1; -+ uint8_t subOp; -+ -+ if (i->src(0).mod & Modifier(NV50_IR_MOD_NOT)) -+ src0 = ~src0; -+ if (i->src(1).mod & Modifier(NV50_IR_MOD_NOT)) -+ src1 = ~src1; -+ -+ switch (i->op) { -+ case OP_AND: subOp = src0 & src1; break; -+ case OP_OR : subOp = src0 | src1; break; -+ case OP_XOR: subOp = src0 ^ src1; break; -+ default: -+ assert(!"invalid LOP2 opcode"); -+ break; -+ } -+ -+ bld.mkOp3(OP_LOP3_LUT, TYPE_U32, i->getDef(0), i->getSrc(0), i->getSrc(1), -+ bld.mkImm(0))->subOp = subOp; -+ return true; -+} -+ -+bool -+GV100LegalizeSSA::handleNOT(Instruction *i) -+{ -+ bld.mkOp3(OP_LOP3_LUT, TYPE_U32, i->getDef(0), bld.mkImm(0), i->getSrc(0), -+ bld.mkImm(0))->subOp = (uint8_t)~NV50_IR_SUBOP_LOP3_LUT_SRC1; -+ return true; -+} -+ -+bool -+GV100LegalizeSSA::handlePREEX2(Instruction *i) -+{ -+ i->def(0).replace(i->src(0), false); -+ return true; -+} -+ -+bool -+GV100LegalizeSSA::handleQUADON(Instruction *i) -+{ -+ handleSHFL(i); // Inserts OP_WARPSYNC -+ return true; -+} -+ -+bool -+GV100LegalizeSSA::handleQUADPOP(Instruction *i) -+{ -+ return true; -+} -+ -+bool -+GV100LegalizeSSA::handleSET(Instruction *i) -+{ -+ Value *src2 = i->srcExists(2) ? i->getSrc(2) : NULL; -+ Value *pred = bld.getSSA(1, FILE_PREDICATE), *met; -+ Instruction *xsetp; -+ -+ if (isFloatType(i->dType)) { -+ if (i->sType == TYPE_F32) -+ return false; // HW has FSET.BF -+ met = bld.mkImm(0x3f800000); -+ } else { -+ met = bld.mkImm(0xffffffff); -+ } -+ -+ xsetp = bld.mkCmp(i->op, i->asCmp()->setCond, TYPE_U8, pred, i->sType, -+ i->getSrc(0), i->getSrc(1)); -+ xsetp->src(0).mod = i->src(0).mod; -+ xsetp->src(1).mod = i->src(1).mod; -+ xsetp->setSrc(2, src2); -+ xsetp->ftz = i->ftz; -+ -+ i = bld.mkOp3(OP_SELP, TYPE_U32, i->getDef(0), bld.mkImm(0), met, pred); -+ i->src(2).mod = Modifier(NV50_IR_MOD_NOT); -+ return true; -+} -+ -+bool -+GV100LegalizeSSA::handleSHFL(Instruction *i) -+{ -+ Instruction *sync = new_Instruction(func, OP_WARPSYNC, TYPE_NONE); -+ sync->fixed = 1; -+ sync->setSrc(0, bld.mkImm(0xffffffff)); -+ i->bb->insertBefore(i, sync); -+ return false; -+} -+ -+bool -+GV100LegalizeSSA::handleShift(Instruction *i) -+{ -+ Value *zero = bld.mkImm(0); -+ Value *src1 = i->getSrc(1); -+ Value *src0, *src2; -+ uint8_t subOp = i->op == OP_SHL ? NV50_IR_SUBOP_SHF_L : NV50_IR_SUBOP_SHF_R; -+ -+ if (i->op == OP_SHL && i->src(0).getFile() == FILE_GPR) { -+ src0 = i->getSrc(0); -+ src2 = zero; -+ } else { -+ src0 = zero; -+ src2 = i->getSrc(0); -+ subOp |= NV50_IR_SUBOP_SHF_HI; -+ } -+ if (i->subOp & NV50_IR_SUBOP_SHIFT_WRAP) -+ subOp |= NV50_IR_SUBOP_SHF_W; -+ -+ bld.mkOp3(OP_SHF, i->dType, i->getDef(0), src0, src1, src2)->subOp = subOp; -+ return true; -+} -+ -+bool -+GV100LegalizeSSA::handleSUB(Instruction *i) -+{ -+ Instruction *xadd = -+ bld.mkOp2(OP_ADD, i->dType, i->getDef(0), i->getSrc(0), i->getSrc(1)); -+ xadd->src(0).mod = i->src(0).mod; -+ xadd->src(1).mod = i->src(1).mod ^ Modifier(NV50_IR_MOD_NEG); -+ xadd->ftz = i->ftz; -+ return true; -+} -+ -+bool -+GV100LegalizeSSA::visit(Instruction *i) -+{ -+ bool lowered = false; -+ -+ bld.setPosition(i, false); -+ if (i->sType == TYPE_F32 && i->dType != TYPE_F16 && -+ prog->getType() != Program::TYPE_COMPUTE) -+ handleFTZ(i); -+ -+ switch (i->op) { -+ case OP_AND: -+ case OP_OR: -+ case OP_XOR: -+ if (i->def(0).getFile() != FILE_PREDICATE) -+ lowered = handleLOP2(i); -+ break; -+ case OP_NOT: -+ lowered = handleNOT(i); -+ break; -+ case OP_SHL: -+ case OP_SHR: -+ lowered = handleShift(i); -+ break; -+ case OP_SET: -+ case OP_SET_AND: -+ case OP_SET_OR: -+ case OP_SET_XOR: -+ if (i->def(0).getFile() != FILE_PREDICATE) -+ lowered = handleSET(i); -+ break; -+ case OP_SLCT: -+ lowered = handleCMP(i); -+ break; -+ case OP_PREEX2: -+ lowered = handlePREEX2(i); -+ break; -+ case OP_MUL: -+ if (!isFloatType(i->dType)) -+ lowered = handleIMUL(i); -+ break; -+ case OP_MAD: -+ if (!isFloatType(i->dType) && i->subOp == NV50_IR_SUBOP_MUL_HIGH) -+ lowered = handleIMAD_HIGH(i); -+ break; -+ case OP_SHFL: -+ lowered = handleSHFL(i); -+ break; -+ case OP_QUADON: -+ lowered = handleQUADON(i); -+ break; -+ case OP_QUADPOP: -+ lowered = handleQUADPOP(i); -+ break; -+ case OP_SUB: -+ lowered = handleSUB(i); -+ break; -+ case OP_MAX: -+ case OP_MIN: -+ if (!isFloatType(i->dType)) -+ lowered = handleIMNMX(i); -+ break; -+ case OP_ADD: -+ if (!isFloatType(i->dType) && typeSizeof(i->dType) == 8) -+ lowered = handleIADD64(i); -+ break; -+ case OP_PFETCH: -+ handlePFETCH(i); -+ break; -+ case OP_LOAD: -+ handleLOAD(i); -+ break; -+ default: -+ break; -+ } -+ -+ if (lowered) -+ delete_Instruction(prog, i); -+ -+ return true; -+} -+ -+bool -+GV100LoweringPass::handleDMNMX(Instruction *i) -+{ -+ Value *pred = bld.getSSA(1, FILE_PREDICATE); -+ Value *src0[2], *src1[2], *dest[2]; -+ -+ bld.mkCmp(OP_SET, (i->op == OP_MIN) ? CC_LT : CC_GT, TYPE_U32, pred, -+ i->sType, i->getSrc(0), i->getSrc(1)); -+ bld.mkSplit(src0, 4, i->getSrc(0)); -+ bld.mkSplit(src1, 4, i->getSrc(1)); -+ bld.mkSplit(dest, 4, i->getDef(0)); -+ bld.mkOp3(OP_SELP, TYPE_U32, dest[0], src0[0], src1[0], pred); -+ bld.mkOp3(OP_SELP, TYPE_U32, dest[1], src0[1], src1[1], pred); -+ bld.mkOp2(OP_MERGE, TYPE_U64, i->getDef(0), dest[0], dest[1]); -+ return true; -+} -+ -+bool -+GV100LoweringPass::handleEXTBF(Instruction *i) -+{ -+ Value *bit = bld.getScratch(); -+ Value *cnt = bld.getScratch(); -+ Value *mask = bld.getScratch(); -+ Value *zero = bld.mkImm(0); -+ -+ bld.mkOp3(OP_PERMT, TYPE_U32, bit, i->getSrc(1), bld.mkImm(0x4440), zero); -+ bld.mkOp3(OP_PERMT, TYPE_U32, cnt, i->getSrc(1), bld.mkImm(0x4441), zero); -+ bld.mkOp2(OP_BMSK, TYPE_U32, mask, bit, cnt); -+ bld.mkOp2(OP_AND, TYPE_U32, mask, i->getSrc(0), mask); -+ bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(0), mask, bit); -+ if (isSignedType(i->dType)) -+ bld.mkOp2(OP_SGXT, TYPE_S32, i->getDef(0), i->getDef(0), cnt); -+ -+ return true; -+} -+ -+bool -+GV100LoweringPass::handleFLOW(Instruction *i) -+{ -+ i->op = OP_BRA; -+ return false; -+} -+ -+bool -+GV100LoweringPass::handleI2I(Instruction *i) -+{ -+ bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), i->sType, i->getSrc(0))-> -+ subOp = i->subOp; -+ bld.mkCvt(OP_CVT, i->dType, i->getDef(0), TYPE_F32, i->getDef(0)); -+ return true; -+} -+ -+bool -+GV100LoweringPass::handleINSBF(Instruction *i) -+{ -+ Value *bit = bld.getScratch(); -+ Value *cnt = bld.getScratch(); -+ Value *mask = bld.getScratch(); -+ Value *src0 = bld.getScratch(); -+ Value *zero = bld.mkImm(0); -+ -+ bld.mkOp3(OP_PERMT, TYPE_U32, bit, i->getSrc(1), bld.mkImm(0x4440), zero); -+ bld.mkOp3(OP_PERMT, TYPE_U32, cnt, i->getSrc(1), bld.mkImm(0x4441), zero); -+ bld.mkOp2(OP_BMSK, TYPE_U32, mask, zero, cnt); -+ -+ bld.mkOp2(OP_AND, TYPE_U32, src0, i->getSrc(0), mask); -+ bld.mkOp2(OP_SHL, TYPE_U32, src0, src0, bit); -+ -+ bld.mkOp2(OP_SHL, TYPE_U32, mask, mask, bit); -+ bld.mkOp3(OP_LOP3_LUT, TYPE_U32, i->getDef(0), src0, i->getSrc(2), mask)-> -+ subOp = NV50_IR_SUBOP_LOP3_LUT(a | (b & ~c)); -+ -+ return true; -+} -+ -+bool -+GV100LoweringPass::handlePINTERP(Instruction *i) -+{ -+ Value *src2 = i->srcExists(2) ? i->getSrc(2) : NULL; -+ Instruction *ipa, *mul; -+ -+ ipa = bld.mkOp2(OP_LINTERP, TYPE_F32, i->getDef(0), i->getSrc(0), src2); -+ ipa->ipa = i->ipa; -+ mul = bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(0), i->getDef(0), i->getSrc(1)); -+ -+ if (i->getInterpMode() == NV50_IR_INTERP_SC) { -+ ipa->setDef(1, bld.getSSA(1, FILE_PREDICATE)); -+ mul->setPredicate(CC_NOT_P, ipa->getDef(1)); -+ } -+ -+ return true; -+} -+ -+bool -+GV100LoweringPass::handlePREFLOW(Instruction *i) -+{ -+ return true; -+} -+ -+bool -+GV100LoweringPass::handlePRESIN(Instruction *i) -+{ -+ const float f = 1.0 / (2.0 * 3.14159265); -+ bld.mkOp2(OP_MUL, i->dType, i->getDef(0), i->getSrc(0), bld.mkImm(f)); -+ return true; -+} -+ -+bool -+GV100LoweringPass::visit(Instruction *i) -+{ -+ bool lowered = false; -+ -+ bld.setPosition(i, false); -+ -+ switch (i->op) { -+ case OP_BREAK: -+ case OP_CONT: -+ lowered = handleFLOW(i); -+ break; -+ case OP_PREBREAK: -+ case OP_PRECONT: -+ lowered = handlePREFLOW(i); -+ break; -+ case OP_CVT: -+ if (i->src(0).getFile() != FILE_PREDICATE && -+ i->def(0).getFile() != FILE_PREDICATE && -+ !isFloatType(i->dType) && !isFloatType(i->sType)) -+ lowered = handleI2I(i); -+ break; -+ case OP_EXTBF: -+ lowered = handleEXTBF(i); -+ break; -+ case OP_INSBF: -+ lowered = handleINSBF(i); -+ break; -+ case OP_MAX: -+ case OP_MIN: -+ if (i->dType == TYPE_F64) -+ lowered = handleDMNMX(i); -+ break; -+ case OP_PINTERP: -+ lowered = handlePINTERP(i); -+ break; -+ case OP_PRESIN: -+ lowered = handlePRESIN(i); -+ break; -+ default: -+ break; -+ } -+ -+ if (lowered) -+ delete_Instruction(prog, i); -+ -+ return true; -+} -+ -+} // namespace nv50_ir -diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gv100.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gv100.h -new file mode 100644 -index 00000000000..d918c6e83eb ---- /dev/null -+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gv100.h -@@ -0,0 +1,78 @@ -+/* -+ * Copyright 2020 Red Hat Inc. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice shall be included in -+ * all copies or substantial portions of the Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -+ * OTHER DEALINGS IN THE SOFTWARE. -+ */ -+#ifndef __NV50_IR_LOWERING_GV100_H__ -+#define __NV50_IR_LOWERING_GV100_H__ -+#include "codegen/nv50_ir_lowering_gm107.h" -+ -+namespace nv50_ir { -+ -+class GV100LoweringPass : public Pass -+{ -+public: -+ GV100LoweringPass(Program *p) { -+ bld.setProgram(p); -+ } -+ -+private: -+ BuildUtil bld; -+ -+ virtual bool visit(Instruction *); -+ -+ bool handleDMNMX(Instruction *); -+ bool handleEXTBF(Instruction *); -+ bool handleFLOW(Instruction *); -+ bool handleI2I(Instruction *); -+ bool handleINSBF(Instruction *); -+ bool handlePINTERP(Instruction *); -+ bool handlePREFLOW(Instruction *); -+ bool handlePRESIN(Instruction *); -+}; -+ -+class GV100LegalizeSSA : public GM107LegalizeSSA -+{ -+public: -+ GV100LegalizeSSA(Program *p) { -+ bld.setProgram(p); -+ } -+ -+private: -+ virtual bool visit(Function *) { return true; } -+ virtual bool visit(BasicBlock *) { return true; } -+ virtual bool visit(Instruction *); -+ -+ bool handleCMP(Instruction *); -+ bool handleIADD64(Instruction *); -+ bool handleIMAD_HIGH(Instruction *); -+ bool handleIMNMX(Instruction *); -+ bool handleIMUL(Instruction *); -+ bool handleLOP2(Instruction *); -+ bool handleNOT(Instruction *); -+ bool handlePREEX2(Instruction *); -+ bool handleQUADON(Instruction *); -+ bool handleQUADPOP(Instruction *); -+ bool handleSET(Instruction *); -+ bool handleSHFL(Instruction *); -+ bool handleShift(Instruction *); -+ bool handleSUB(Instruction *); -+}; -+} -+#endif -diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp -index a60881000fe..067f9abaca8 100644 ---- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp -+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp -@@ -310,6 +310,14 @@ NVC0LegalizeSSA::handleSET(CmpInstruction *cmp) - cmp->sType = hTy; - } - -+void -+NVC0LegalizeSSA::handleBREV(Instruction *i) -+{ -+ i->op = OP_EXTBF; -+ i->subOp = NV50_IR_SUBOP_EXTBF_REV; -+ i->setSrc(1, bld.mkImm(0x2000)); -+} -+ - bool - NVC0LegalizeSSA::visit(Function *fn) - { -@@ -354,6 +362,9 @@ NVC0LegalizeSSA::visit(BasicBlock *bb) - if (typeSizeof(i->sType) == 8 && i->sType != TYPE_F64) - handleSET(i->asCmp()); - break; -+ case OP_BREV: -+ handleBREV(i); -+ break; - default: - break; - } -@@ -856,11 +867,11 @@ NVC0LegalizePostRA::visit(BasicBlock *bb) - next = hi; - } - -- if (i->op == OP_SAT || i->op == OP_NEG || i->op == OP_ABS) -- replaceCvt(i); -- - if (i->op != OP_MOV && i->op != OP_PFETCH) - replaceZero(i); -+ -+ if (i->op == OP_SAT || i->op == OP_NEG || i->op == OP_ABS) -+ replaceCvt(i); - } - } - if (!bb->getEntry()) -@@ -887,6 +898,8 @@ NVC0LoweringPass::visit(Function *fn) - gpEmitAddress = bld.loadImm(NULL, 0)->asLValue(); - if (fn->cfgExit) { - bld.setPosition(BasicBlock::get(fn->cfgExit)->getExit(), false); -+ if (prog->getTarget()->getChipset() >= NVISA_GV100_CHIPSET) -+ bld.mkOp1(OP_FINAL, TYPE_NONE, NULL, gpEmitAddress)->fixed = 1; - bld.mkMovToReg(0, gpEmitAddress); - } - } -@@ -1714,7 +1727,8 @@ NVC0LoweringPass::handleCasExch(Instruction *cas, bool needCctl) - cctl->setPredicate(cas->cc, cas->getPredicate()); - } - -- if (cas->subOp == NV50_IR_SUBOP_ATOM_CAS) { -+ if (cas->subOp == NV50_IR_SUBOP_ATOM_CAS && -+ targ->getChipset() < NVISA_GV100_CHIPSET) { - // CAS is crazy. It's 2nd source is a double reg, and the 3rd source - // should be set to the high part of the double reg or bad things will - // happen elsewhere in the universe. -diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h -index b4c405a9ea5..8c99427d3c0 100644 ---- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h -+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h -@@ -64,12 +64,14 @@ private: - void handleDIV(Instruction *); // integer division, modulus - void handleRCPRSQLib(Instruction *, Value *[]); - void handleRCPRSQ(Instruction *); // double precision float recip/rsqrt -- void handleFTZ(Instruction *); - void handleSET(CmpInstruction *); - void handleTEXLOD(TexInstruction *); - void handleShift(Instruction *); -+ void handleBREV(Instruction *); - - protected: -+ void handleFTZ(Instruction *); -+ - BuildUtil bld; - }; - -diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp -index 2f46b0e886a..3a4ec3ca561 100644 ---- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp -+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp -@@ -558,6 +558,19 @@ ConstantFolding::expr(Instruction *i, - memset(&res.data, 0, sizeof(res.data)); - - switch (i->op) { -+ case OP_SGXT: { -+ int bits = b->data.u32; -+ if (bits) { -+ uint32_t data = a->data.u32 & (0xffffffff >> (32 - bits)); -+ if (bits < 32 && (data & (1 << (bits - 1)))) -+ data = data - (1 << bits); -+ res.data.u32 = data; -+ } -+ break; -+ } -+ case OP_BMSK: -+ res.data.u32 = ((1 << b->data.u32) - 1) << a->data.u32; -+ break; - case OP_MAD: - case OP_FMA: - case OP_MUL: -@@ -780,6 +793,23 @@ ConstantFolding::expr(Instruction *i, - memset(&res.data, 0, sizeof(res.data)); - - switch (i->op) { -+ case OP_LOP3_LUT: -+ for (int n = 0; n < 32; n++) { -+ uint8_t lut = ((a->data.u32 >> n) & 1) << 2 | -+ ((b->data.u32 >> n) & 1) << 1 | -+ ((c->data.u32 >> n) & 1); -+ res.data.u32 |= !!(i->subOp & (1 << lut)) << n; -+ } -+ break; -+ case OP_PERMT: -+ if (!i->subOp) { -+ uint64_t input = (uint64_t)c->data.u32 << 32 | a->data.u32; -+ uint16_t permt = b->data.u32; -+ for (int n = 0 ; n < 4; n++, permt >>= 4) -+ res.data.u32 |= ((input >> ((permt & 0xf) * 8)) & 0xff) << n * 8; -+ } else -+ return; -+ break; - case OP_INSBF: { - int offset = b->data.u32 & 0xff; - int width = (b->data.u32 >> 8) & 0xff; -@@ -1526,6 +1556,12 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) - i->subOp = 0; - break; - } -+ case OP_BREV: { -+ uint32_t res = util_bitreverse(imm0.reg.data.u32); -+ i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res)); -+ i->op = OP_MOV; -+ break; -+ } - case OP_POPCNT: { - // Only deal with 1-arg POPCNT here - if (i->srcExists(1)) -diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp -index 5dcbf3c3e0c..ce0d2507dc1 100644 ---- a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp -+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp -@@ -93,8 +93,10 @@ const char *operationStr[OP_LAST + 1] = - "and", - "or", - "xor", -+ "lop3 lut", - "shl", - "shr", -+ "shf", - "max", - "min", - "sat", -@@ -142,6 +144,7 @@ const char *operationStr[OP_LAST + 1] = - "pinterp", - "emit", - "restart", -+ "final", - "tex", - "texbias", - "texlod", -@@ -177,7 +180,10 @@ const char *operationStr[OP_LAST + 1] = - "insbf", - "extbf", - "bfind", -+ "brev", -+ "bmsk", - "permt", -+ "sgxt", - "atom", - "bar", - "vadd", -@@ -193,6 +199,7 @@ const char *operationStr[OP_LAST + 1] = - "shfl", - "vote", - "bufq", -+ "warpsync", - "(invalid)" - }; - -diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp -index 6df2664da22..4e5b21d9176 100644 ---- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp -+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp -@@ -988,6 +988,8 @@ GCRA::coalesce(ArrayList& insns) - case 0x110: - case 0x120: - case 0x130: -+ case 0x140: -+ case 0x160: - ret = doCoalesce(insns, JOIN_MASK_UNION); - break; - default: -@@ -2297,13 +2299,25 @@ RegAlloc::InsertConstraintsPass::texConstraintGM107(TexInstruction *tex) - if (isTextureOp(tex->op)) - textureMask(tex); - -- if (isScalarTexGM107(tex)) { -- handleScalarTexGM107(tex); -- return; -- } -+ if (targ->getChipset() < NVISA_GV100_CHIPSET) { -+ if (isScalarTexGM107(tex)) { -+ handleScalarTexGM107(tex); -+ return; -+ } - -- assert(!tex->tex.scalar); -- condenseDefs(tex); -+ assert(!tex->tex.scalar); -+ condenseDefs(tex); -+ } else { -+ if (isTextureOp(tex->op)) { -+ int defCount = tex->defCount(0xff); -+ if (defCount > 3) -+ condenseDefs(tex, 2, 3); -+ if (defCount > 1) -+ condenseDefs(tex, 0, 1); -+ } else { -+ condenseDefs(tex); -+ } -+ } - - if (isSurfaceOp(tex->op)) { - int s = tex->tex.target.getDim() + -@@ -2485,6 +2499,8 @@ RegAlloc::InsertConstraintsPass::visit(BasicBlock *bb) - case 0x110: - case 0x120: - case 0x130: -+ case 0x140: -+ case 0x160: - texConstraintGM107(tex); - break; - default: -diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_sched_gm107.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_sched_gm107.h -new file mode 100644 -index 00000000000..54443ae2770 ---- /dev/null -+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_sched_gm107.h -@@ -0,0 +1,156 @@ -+#ifndef __NV50_IR_SCHED_GM107_H__ -+#define __NV50_IR_SCHED_GM107_H__ -+namespace nv50_ir { -+ -+class SchedDataCalculatorGM107 : public Pass -+{ -+public: -+ SchedDataCalculatorGM107(const TargetGM107 *targ) : targ(targ) {} -+ -+private: -+ struct RegScores -+ { -+ struct ScoreData { -+ int r[256]; -+ int p[8]; -+ int c; -+ } rd, wr; -+ int base; -+ -+ void rebase(const int base) -+ { -+ const int delta = this->base - base; -+ if (!delta) -+ return; -+ this->base = 0; -+ -+ for (int i = 0; i < 256; ++i) { -+ rd.r[i] += delta; -+ wr.r[i] += delta; -+ } -+ for (int i = 0; i < 8; ++i) { -+ rd.p[i] += delta; -+ wr.p[i] += delta; -+ } -+ rd.c += delta; -+ wr.c += delta; -+ } -+ void wipe() -+ { -+ memset(&rd, 0, sizeof(rd)); -+ memset(&wr, 0, sizeof(wr)); -+ } -+ int getLatest(const ScoreData& d) const -+ { -+ int max = 0; -+ for (int i = 0; i < 256; ++i) -+ if (d.r[i] > max) -+ max = d.r[i]; -+ for (int i = 0; i < 8; ++i) -+ if (d.p[i] > max) -+ max = d.p[i]; -+ if (d.c > max) -+ max = d.c; -+ return max; -+ } -+ inline int getLatestRd() const -+ { -+ return getLatest(rd); -+ } -+ inline int getLatestWr() const -+ { -+ return getLatest(wr); -+ } -+ inline int getLatest() const -+ { -+ return MAX2(getLatestRd(), getLatestWr()); -+ } -+ void setMax(const RegScores *that) -+ { -+ for (int i = 0; i < 256; ++i) { -+ rd.r[i] = MAX2(rd.r[i], that->rd.r[i]); -+ wr.r[i] = MAX2(wr.r[i], that->wr.r[i]); -+ } -+ for (int i = 0; i < 8; ++i) { -+ rd.p[i] = MAX2(rd.p[i], that->rd.p[i]); -+ wr.p[i] = MAX2(wr.p[i], that->wr.p[i]); -+ } -+ rd.c = MAX2(rd.c, that->rd.c); -+ wr.c = MAX2(wr.c, that->wr.c); -+ } -+ void print(int cycle) -+ { -+ for (int i = 0; i < 256; ++i) { -+ if (rd.r[i] > cycle) -+ INFO("rd $r%i @ %i\n", i, rd.r[i]); -+ if (wr.r[i] > cycle) -+ INFO("wr $r%i @ %i\n", i, wr.r[i]); -+ } -+ for (int i = 0; i < 8; ++i) { -+ if (rd.p[i] > cycle) -+ INFO("rd $p%i @ %i\n", i, rd.p[i]); -+ if (wr.p[i] > cycle) -+ INFO("wr $p%i @ %i\n", i, wr.p[i]); -+ } -+ if (rd.c > cycle) -+ INFO("rd $c @ %i\n", rd.c); -+ if (wr.c > cycle) -+ INFO("wr $c @ %i\n", wr.c); -+ } -+ }; -+ -+ RegScores *score; // for current BB -+ std::vector scoreBoards; -+ -+ const TargetGM107 *targ; -+ bool visit(Function *); -+ bool visit(BasicBlock *); -+ -+ void commitInsn(const Instruction *, int); -+ int calcDelay(const Instruction *, int) const; -+ void setDelay(Instruction *, int, const Instruction *); -+ void recordWr(const Value *, int, int); -+ void checkRd(const Value *, int, int&) const; -+ -+ inline void emitYield(Instruction *); -+ inline void emitStall(Instruction *, uint8_t); -+ inline void emitReuse(Instruction *, uint8_t); -+ inline void emitWrDepBar(Instruction *, uint8_t); -+ inline void emitRdDepBar(Instruction *, uint8_t); -+ inline void emitWtDepBar(Instruction *, uint8_t); -+ -+ inline int getStall(const Instruction *) const; -+ inline int getWrDepBar(const Instruction *) const; -+ inline int getRdDepBar(const Instruction *) const; -+ inline int getWtDepBar(const Instruction *) const; -+ -+ void setReuseFlag(Instruction *); -+ -+ inline void printSchedInfo(int, const Instruction *) const; -+ -+ struct LiveBarUse { -+ LiveBarUse(Instruction *insn, Instruction *usei) -+ : insn(insn), usei(usei) { } -+ Instruction *insn; -+ Instruction *usei; -+ }; -+ -+ struct LiveBarDef { -+ LiveBarDef(Instruction *insn, Instruction *defi) -+ : insn(insn), defi(defi) { } -+ Instruction *insn; -+ Instruction *defi; -+ }; -+ -+ bool insertBarriers(BasicBlock *); -+ -+ bool doesInsnWriteTo(const Instruction *insn, const Value *val) const; -+ Instruction *findFirstUse(const Instruction *) const; -+ Instruction *findFirstDef(const Instruction *) const; -+ -+ bool needRdDepBar(const Instruction *) const; -+ bool needWrDepBar(const Instruction *) const; -+}; -+ -+}; // namespace nv50_ir -+#endif -diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp -index 5c6d0570ae2..765375a47df 100644 ---- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp -+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp -@@ -33,7 +33,7 @@ const uint8_t Target::operationSrcNr[] = - 2, 2, 2, 2, 2, 3, 3, 3, // ADD, SUB, MUL, DIV, MOD, MAD, FMA, SAD - 3, 3, // SHLADD, XMAD - 1, 1, 1, // ABS, NEG, NOT -- 2, 2, 2, 2, 2, // AND, OR, XOR, SHL, SHR -+ 2, 2, 2, 3, 2, 2, 3, // AND, OR, XOR, LOP3_LUT, SHL, SHR, SHF - 2, 2, 1, // MAX, MIN, SAT - 1, 1, 1, 1, // CEIL, FLOOR, TRUNC, CVT - 3, 3, 3, 2, 3, 3, // SET_AND,OR,XOR, SET, SELP, SLCT -@@ -43,7 +43,7 @@ const uint8_t Target::operationSrcNr[] = - 0, 0, 0, // PRERET,CONT,BREAK - 0, 0, 0, 0, 0, 0, // BRKPT, JOINAT, JOIN, DISCARD, EXIT, MEMBAR - 1, 1, 1, 2, 1, 2, // VFETCH, PFETCH, AFETCH, EXPORT, LINTERP, PINTERP -- 1, 1, // EMIT, RESTART -+ 1, 1, 1, // EMIT, RESTART, FINAL - 1, 1, 1, // TEX, TXB, TXL, - 1, 1, 1, 1, 1, 1, 2, // TXF, TXQ, TXD, TXG, TXLQ, TEXCSAA, TEXPREP - 1, 1, 2, 2, 2, 2, 2, // SULDB, SULDP, SUSTB, SUSTP, SUREDB, SUREDP, SULEA -@@ -51,13 +51,15 @@ const uint8_t Target::operationSrcNr[] = - 0, // TEXBAR - 1, 1, // DFDX, DFDY - 1, 2, 1, 2, 0, 0, // RDSV, WRSV, PIXLD, QUADOP, QUADON, QUADPOP -- 2, 3, 2, 1, 3, // POPCNT, INSBF, EXTBF, BFIND, PERMT -+ 2, 3, 2, 1, 1, 2, 3, // POPCNT, INSBF, EXTBF, BFIND, BREV, BMSK, PERMT -+ 2, // SGXT - 2, 2, // ATOM, BAR - 2, 2, 2, 2, 3, 2, // VADD, VAVG, VMIN, VMAX, VSAD, VSET, - 2, 2, 2, 1, // VSHR, VSHL, VSEL, CCTL - 3, // SHFL - 1, // VOTE - 1, // BUFQ -+ 1, // WARPSYNC - 0 - }; - -@@ -75,10 +77,10 @@ const OpClass Target::operationClass[] = - OPCLASS_ARITH, OPCLASS_ARITH, OPCLASS_ARITH, - OPCLASS_ARITH, OPCLASS_ARITH, - OPCLASS_ARITH, OPCLASS_ARITH, OPCLASS_ARITH, OPCLASS_ARITH, OPCLASS_ARITH, -- // ABS, NEG; NOT, AND, OR, XOR; SHL, SHR -+ // ABS, NEG; NOT, AND, OR, XOR, LOP3_LUT; SHL, SHR, SHF - OPCLASS_CONVERT, OPCLASS_CONVERT, -- OPCLASS_LOGIC, OPCLASS_LOGIC, OPCLASS_LOGIC, OPCLASS_LOGIC, -- OPCLASS_SHIFT, OPCLASS_SHIFT, -+ OPCLASS_LOGIC, OPCLASS_LOGIC, OPCLASS_LOGIC, OPCLASS_LOGIC, OPCLASS_LOGIC, -+ OPCLASS_SHIFT, OPCLASS_SHIFT, OPCLASS_SHIFT, - // MAX, MIN - OPCLASS_COMPARE, OPCLASS_COMPARE, - // SAT, CEIL, FLOOR, TRUNC; CVT -@@ -103,8 +105,8 @@ const OpClass Target::operationClass[] = - OPCLASS_LOAD, OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_STORE, - // LINTERP, PINTERP - OPCLASS_SFU, OPCLASS_SFU, -- // EMIT, RESTART -- OPCLASS_CONTROL, OPCLASS_CONTROL, -+ // EMIT, RESTART, FINAL -+ OPCLASS_CONTROL, OPCLASS_CONTROL, OPCLASS_CONTROL, - // TEX, TXB, TXL, TXF; TXQ, TXD, TXG, TXLQ; TEXCSAA, TEXPREP - OPCLASS_TEXTURE, OPCLASS_TEXTURE, OPCLASS_TEXTURE, OPCLASS_TEXTURE, - OPCLASS_TEXTURE, OPCLASS_TEXTURE, OPCLASS_TEXTURE, OPCLASS_TEXTURE, -@@ -119,9 +121,9 @@ const OpClass Target::operationClass[] = - // DFDX, DFDY, RDSV, WRSV; PIXLD, QUADOP, QUADON, QUADPOP - OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_OTHER, - OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_CONTROL, OPCLASS_CONTROL, -- // POPCNT, INSBF, EXTBF, BFIND; PERMT -+ // POPCNT, INSBF, EXTBF, BFIND, BREV, BMSK; PERMT, SGXT -+ OPCLASS_BITFIELD, OPCLASS_BITFIELD, OPCLASS_BITFIELD, OPCLASS_BITFIELD, - OPCLASS_BITFIELD, OPCLASS_BITFIELD, OPCLASS_BITFIELD, OPCLASS_BITFIELD, -- OPCLASS_BITFIELD, - // ATOM, BAR - OPCLASS_ATOMIC, OPCLASS_CONTROL, - // VADD, VAVG, VMIN, VMAX -@@ -136,10 +138,13 @@ const OpClass Target::operationClass[] = - OPCLASS_OTHER, - // BUFQ - OPCLASS_OTHER, -+ // WARPSYNC -+ OPCLASS_OTHER, - OPCLASS_PSEUDO // LAST - }; - - -+extern Target *getTargetGV100(unsigned int chipset); - extern Target *getTargetGM107(unsigned int chipset); - extern Target *getTargetNVC0(unsigned int chipset); - extern Target *getTargetNV50(unsigned int chipset); -@@ -149,6 +154,9 @@ Target *Target::create(unsigned int chipset) - STATIC_ASSERT(ARRAY_SIZE(operationSrcNr) == OP_LAST + 1); - STATIC_ASSERT(ARRAY_SIZE(operationClass) == OP_LAST + 1); - switch (chipset & ~0xf) { -+ case 0x160: -+ case 0x140: -+ return getTargetGV100(chipset); - case 0x110: - case 0x120: - case 0x130: -diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h -index afeca14d7d1..0f7db116577 100644 ---- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h -+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h -@@ -200,7 +200,7 @@ public: - uint8_t dstMods; - uint16_t srcFiles[3]; - uint16_t dstFiles; -- unsigned int minEncSize : 4; -+ unsigned int minEncSize : 5; - unsigned int vector : 1; - unsigned int predicate : 1; - unsigned int commutative : 1; -diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gv100.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gv100.cpp -new file mode 100644 -index 00000000000..fd969e1ece5 ---- /dev/null -+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gv100.cpp -@@ -0,0 +1,594 @@ -+/* -+ * Copyright 2020 Red Hat Inc. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice shall be included in -+ * all copies or substantial portions of the Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -+ * OTHER DEALINGS IN THE SOFTWARE. -+ */ -+#include "codegen/nv50_ir_target_gv100.h" -+#include "codegen/nv50_ir_lowering_gv100.h" -+#include "codegen/nv50_ir_emit_gv100.h" -+ -+namespace nv50_ir { -+ -+void -+TargetGV100::initOpInfo() -+{ -+ unsigned int i, j; -+ -+ static const operation commutative[] = -+ { -+ OP_ADD, OP_MUL, OP_MAD, OP_FMA, OP_MAX, OP_MIN, -+ OP_SET_AND, OP_SET_OR, OP_SET_XOR, OP_SET, OP_SELP, OP_SLCT -+ }; -+ -+ static const operation noDest[] = -+ { -+ OP_EXIT -+ }; -+ -+ static const operation noPred[] = -+ { -+ }; -+ -+ for (i = 0; i < DATA_FILE_COUNT; ++i) -+ nativeFileMap[i] = (DataFile)i; -+ nativeFileMap[FILE_ADDRESS] = FILE_GPR; -+ nativeFileMap[FILE_FLAGS] = FILE_PREDICATE; -+ -+ for (i = 0; i < OP_LAST; ++i) { -+ opInfo[i].variants = NULL; -+ opInfo[i].op = (operation)i; -+ opInfo[i].srcTypes = 1 << (int)TYPE_F32; -+ opInfo[i].dstTypes = 1 << (int)TYPE_F32; -+ opInfo[i].immdBits = 0; -+ opInfo[i].srcNr = operationSrcNr[i]; -+ -+ for (j = 0; j < opInfo[i].srcNr; ++j) { -+ opInfo[i].srcMods[j] = 0; -+ opInfo[i].srcFiles[j] = 1 << (int)FILE_GPR; -+ } -+ opInfo[i].dstMods = 0; -+ opInfo[i].dstFiles = 1 << (int)FILE_GPR; -+ -+ opInfo[i].hasDest = 1; -+ opInfo[i].vector = (i >= OP_TEX && i <= OP_TEXCSAA); -+ opInfo[i].commutative = false; /* set below */ -+ opInfo[i].pseudo = (i < OP_MOV); -+ opInfo[i].predicate = !opInfo[i].pseudo; -+ opInfo[i].flow = (i >= OP_BRA && i <= OP_JOIN); -+ opInfo[i].minEncSize = 16; -+ } -+ for (i = 0; i < ARRAY_SIZE(commutative); ++i) -+ opInfo[commutative[i]].commutative = true; -+ for (i = 0; i < ARRAY_SIZE(noDest); ++i) -+ opInfo[noDest[i]].hasDest = 0; -+ for (i = 0; i < ARRAY_SIZE(noPred); ++i) -+ opInfo[noPred[i]].predicate = 0; -+} -+ -+struct opInfo { -+ struct { -+ uint8_t files; -+ uint8_t mods; -+ } src[3]; -+}; -+ -+#define SRC_NONE 0 -+#define SRC_R (1 << FILE_GPR) -+#define SRC_I (1 << FILE_MEMORY_CONST) -+#define SRC_C (1 << FILE_IMMEDIATE) -+#define SRC_RC (SRC_R | SRC_C) -+#define SRC_RI (SRC_R | SRC_I ) -+#define SRC_RIC (SRC_R | SRC_I | SRC_C) -+ -+#define MOD_NONE 0 -+#define MOD_NEG NV50_IR_MOD_NEG -+#define MOD_ABS NV50_IR_MOD_ABS -+#define MOD_NOT NV50_IR_MOD_NOT -+#define MOD_NA (MOD_NEG | MOD_ABS) -+ -+#define OPINFO(O,SA,MA,SB,MB,SC,MC) \ -+static struct opInfo \ -+opInfo_##O = { \ -+ .src = { { SRC_##SA, MOD_##MA }, \ -+ { SRC_##SB, MOD_##MB }, \ -+ { SRC_##SC, MOD_##MC }}, \ -+}; -+ -+ -+/* Handled by GV100LegalizeSSA. */ -+OPINFO(FABS , RIC , NA , NONE, NONE, NONE, NONE); -+OPINFO(FCMP , R , NONE, RIC , NONE, RIC , NONE); //XXX: use FSEL for mods -+OPINFO(FNEG , RIC , NA , NONE, NONE, NONE, NONE); -+OPINFO(FSET , R , NA , RIC , NA , NONE, NONE); -+OPINFO(ICMP , R , NONE, RIC , NONE, RIC , NONE); -+OPINFO(IMUL , R , NONE, RIC , NONE, NONE, NONE); -+OPINFO(INEG , RIC , NEG , NONE, NONE, NONE, NONE); -+OPINFO(ISET , R , NONE, RIC , NONE, NONE, NONE); -+OPINFO(LOP2 , R , NOT , RIC , NOT , NONE, NONE); -+OPINFO(NOT , RIC , NONE, NONE, NONE, NONE, NONE); -+OPINFO(SAT , RIC , NA , NONE, NONE, NONE, NONE); -+OPINFO(SHL , RIC , NONE, RIC , NONE, NONE, NONE); -+OPINFO(SHR , RIC , NONE, RIC , NONE, NONE, NONE); -+OPINFO(SUB , R , NONE, RIC , NEG , NONE, NONE); -+OPINFO(IMNMX , R , NONE, RIC , NONE, NONE, NONE); -+ -+/* Handled by CodeEmitterGV100. */ -+OPINFO(AL2P , NONE, NONE, NONE, NONE, NONE, NONE); -+OPINFO(ALD , NONE, NONE, NONE, NONE, NONE, NONE); -+OPINFO(AST , NONE, NONE, NONE, NONE, NONE, NONE); -+OPINFO(ATOM , NONE, NONE, NONE, NONE, NONE, NONE); -+OPINFO(ATOMS , NONE, NONE, NONE, NONE, NONE, NONE); -+OPINFO(BAR , NONE, NONE, NONE, NONE, NONE, NONE); -+OPINFO(BRA , NONE, NONE, NONE, NONE, NONE, NONE); -+OPINFO(BMSK , R , NONE, RIC , NONE, NONE, NONE); -+OPINFO(BREV , RIC , NONE, NONE, NONE, NONE, NONE); -+OPINFO(CCTL , NONE, NONE, NONE, NONE, NONE, NONE); -+//OPINFO(CS2R , NONE, NONE, NONE, NONE, NONE, NONE); -+OPINFO(DADD , R , NA , RIC , NA , NONE, NONE); -+OPINFO(DFMA , R , NA , RIC , NA , RIC , NA ); -+OPINFO(DMUL , R , NA , RIC , NA , NONE, NONE); -+OPINFO(DSETP , R , NA , RIC , NA , NONE, NONE); -+OPINFO(EXIT , NONE, NONE, NONE, NONE, NONE, NONE); -+OPINFO(F2F , RIC , NA , NONE, NONE, NONE, NONE); -+OPINFO(F2I , RIC , NA , NONE, NONE, NONE, NONE); -+OPINFO(FADD , R , NA , RIC , NA , NONE, NONE); -+OPINFO(FFMA , R , NA , RIC , NA , RIC , NA ); -+OPINFO(FLO , RIC , NOT , NONE, NONE, NONE, NONE); -+OPINFO(FMNMX , R , NA , RIC , NA , NONE, NONE); -+OPINFO(FMUL , R , NA , RIC , NA , NONE, NONE); -+OPINFO(FRND , RIC , NA , NONE, NONE, NONE, NONE); -+OPINFO(FSET_BF , R , NA , RIC , NA , NONE, NONE); -+OPINFO(FSETP , R , NA , RIC , NA , NONE, NONE); -+OPINFO(FSWZADD , R , NONE, R , NONE, NONE, NONE); -+OPINFO(I2F , RIC , NONE, NONE, NONE, NONE, NONE); -+OPINFO(IABS , RIC , NONE, NONE, NONE, NONE, NONE); -+OPINFO(IADD3 , R , NEG , RIC , NEG , R , NEG ); -+OPINFO(IMAD , R , NONE, RIC , NONE, RIC , NEG ); -+OPINFO(IMAD_WIDE, R , NONE, RIC , NONE, RC , NEG ); -+OPINFO(IPA , NONE, NONE, NONE, NONE, NONE, NONE); -+OPINFO(ISBERD , NONE, NONE, NONE, NONE, NONE, NONE); -+OPINFO(ISETP , R , NONE, RIC , NONE, NONE, NONE); -+OPINFO(KILL , NONE, NONE, NONE, NONE, NONE, NONE); -+OPINFO(LD , NONE, NONE, NONE, NONE, NONE, NONE); -+OPINFO(LDC , NONE, NONE, NONE, NONE, NONE, NONE); -+OPINFO(LDL , NONE, NONE, NONE, NONE, NONE, NONE); -+OPINFO(LDS , NONE, NONE, NONE, NONE, NONE, NONE); -+OPINFO(LEA , R , NEG , I , NONE, RIC , NEG ); -+OPINFO(LOP3_LUT , R , NONE, RIC , NONE, R , NONE); -+OPINFO(MEMBAR , NONE, NONE, NONE, NONE, NONE, NONE); -+OPINFO(MOV , RIC , NONE, NONE, NONE, NONE, NONE); -+OPINFO(MUFU , RIC , NA , NONE, NONE, NONE, NONE); -+OPINFO(NOP , NONE, NONE, NONE, NONE, NONE, NONE); -+OPINFO(OUT , R , NONE, RI , NONE, NONE, NONE); -+OPINFO(PIXLD , NONE, NONE, NONE, NONE, NONE, NONE); -+OPINFO(PLOP3_LUT, NONE, NONE, NONE, NONE, NONE, NONE); -+OPINFO(POPC , RIC , NOT , NONE, NONE, NONE, NONE); -+OPINFO(PRMT , R , NONE, RIC , NONE, RIC , NONE); -+OPINFO(RED , NONE, NONE, NONE, NONE, NONE, NONE); -+OPINFO(SGXT , R , NONE, RIC , NONE, NONE, NONE); -+OPINFO(S2R , NONE, NONE, NONE, NONE, NONE, NONE); -+OPINFO(SEL , R , NONE, RIC , NONE, NONE, NONE); -+OPINFO(SHF , R , NONE, RIC , NONE, RIC , NONE); -+OPINFO(SHFL , R , NONE, R , NONE, R , NONE); -+OPINFO(ST , NONE, NONE, NONE, NONE, NONE, NONE); -+OPINFO(STL , NONE, NONE, NONE, NONE, NONE, NONE); -+OPINFO(STS , NONE, NONE, NONE, NONE, NONE, NONE); -+OPINFO(SUATOM , NONE, NONE, NONE, NONE, NONE, NONE); -+OPINFO(SULD , NONE, NONE, NONE, NONE, NONE, NONE); -+OPINFO(SUST , NONE, NONE, NONE, NONE, NONE, NONE); -+OPINFO(TEX , NONE, NONE, NONE, NONE, NONE, NONE); -+OPINFO(TLD , NONE, NONE, NONE, NONE, NONE, NONE); -+OPINFO(TLD4 , NONE, NONE, NONE, NONE, NONE, NONE); -+OPINFO(TMML , NONE, NONE, NONE, NONE, NONE, NONE); -+OPINFO(TXD , NONE, NONE, NONE, NONE, NONE, NONE); -+OPINFO(TXQ , NONE, NONE, NONE, NONE, NONE, NONE); -+OPINFO(VOTE , NONE, NONE, NONE, NONE, NONE, NONE); -+OPINFO(WARPSYNC , R , NONE, NONE, NONE, NONE, NONE); -+ -+static const struct opInfo * -+getOpInfo(const Instruction *i) -+{ -+ switch (i->op) { -+ case OP_ABS: -+ if (isFloatType(i->dType)) -+ return &opInfo_FABS; -+ return &opInfo_IABS; -+ case OP_ADD: -+ if (isFloatType(i->dType)) { -+ if (i->dType == TYPE_F32) -+ return &opInfo_FADD; -+ else -+ return &opInfo_DADD; -+ } else { -+ return &opInfo_IADD3; -+ } -+ break; -+ case OP_AFETCH: return &opInfo_AL2P; -+ case OP_AND: -+ case OP_OR: -+ case OP_XOR: -+ if (i->def(0).getFile() == FILE_PREDICATE) -+ return &opInfo_PLOP3_LUT; -+ return &opInfo_LOP2; -+ case OP_ATOM: -+ if (i->src(0).getFile() == FILE_MEMORY_SHARED) -+ return &opInfo_ATOMS; -+ else -+ if (!i->defExists(0) && i->subOp < NV50_IR_SUBOP_ATOM_CAS) -+ return &opInfo_RED; -+ else -+ return &opInfo_ATOM; -+ break; -+ case OP_BAR: return &opInfo_BAR; -+ case OP_BFIND: return &opInfo_FLO; -+ case OP_BMSK: return &opInfo_BMSK; -+ case OP_BREV: return &opInfo_BREV; -+ case OP_BRA: -+ case OP_JOIN: return &opInfo_BRA; //XXX -+ case OP_CCTL: return &opInfo_CCTL; -+ case OP_CEIL: -+ case OP_CVT: -+ case OP_FLOOR: -+ case OP_TRUNC: -+ if (i->op == OP_CVT && (i->def(0).getFile() == FILE_PREDICATE || -+ i->src(0).getFile() == FILE_PREDICATE)) { -+ return &opInfo_MOV; -+ } else if (isFloatType(i->dType)) { -+ if (isFloatType(i->sType)) { -+ if (i->sType == i->dType) -+ return &opInfo_FRND; -+ else -+ return &opInfo_F2F; -+ } else { -+ return &opInfo_I2F; -+ } -+ } else { -+ if (isFloatType(i->sType)) -+ return &opInfo_F2I; -+ } -+ break; -+ case OP_COS: -+ case OP_EX2: -+ case OP_LG2: -+ case OP_RCP: -+ case OP_RSQ: -+ case OP_SIN: -+ case OP_SQRT: return &opInfo_MUFU; -+ case OP_DISCARD: return &opInfo_KILL; -+ case OP_EMIT: -+ case OP_FINAL: -+ case OP_RESTART: return &opInfo_OUT; -+ case OP_EXIT: return &opInfo_EXIT; -+ case OP_EXPORT: return &opInfo_AST; -+ case OP_FMA: -+ case OP_MAD: -+ if (isFloatType(i->dType)) { -+ if (i->dType == TYPE_F32) -+ return &opInfo_FFMA; -+ else -+ return &opInfo_DFMA; -+ } else { -+ if (typeSizeof(i->dType) != 8) -+ return &opInfo_IMAD; -+ else -+ return &opInfo_IMAD_WIDE; -+ } -+ break; -+ case OP_JOINAT: return &opInfo_NOP; //XXX -+ case OP_LINTERP: return &opInfo_IPA; -+ case OP_LOAD: -+ switch (i->src(0).getFile()) { -+ case FILE_MEMORY_CONST : return &opInfo_LDC; -+ case FILE_MEMORY_LOCAL : return &opInfo_LDL; -+ case FILE_MEMORY_SHARED: return &opInfo_LDS; -+ case FILE_MEMORY_GLOBAL: return &opInfo_LD; -+ default: -+ break; -+ } -+ break; -+ case OP_LOP3_LUT: return &opInfo_LOP3_LUT; -+ case OP_MAX: -+ case OP_MIN: -+ if (isFloatType(i->dType)) { -+ if (i->dType == TYPE_F32) -+ return &opInfo_FMNMX; -+ } else { -+ return &opInfo_IMNMX; -+ } -+ break; -+ case OP_MEMBAR: return &opInfo_MEMBAR; -+ case OP_MOV: return &opInfo_MOV; -+ case OP_MUL: -+ if (isFloatType(i->dType)) { -+ if (i->dType == TYPE_F32) -+ return &opInfo_FMUL; -+ else -+ return &opInfo_DMUL; -+ } -+ return &opInfo_IMUL; -+ case OP_NEG: -+ if (isFloatType(i->dType)) -+ return &opInfo_FNEG; -+ return &opInfo_INEG; -+ case OP_NOT: return &opInfo_NOT; -+ case OP_PERMT: return &opInfo_PRMT; -+ case OP_PFETCH: return &opInfo_ISBERD; -+ case OP_PIXLD: return &opInfo_PIXLD; -+ case OP_POPCNT: return &opInfo_POPC; -+ case OP_QUADOP: return &opInfo_FSWZADD; -+ case OP_RDSV: -+#if 0 -+ if (targ->isCS2RSV(i->getSrc(0)->reg.data.sv.sv)) -+ return &opInfo_CS2R; -+#endif -+ return &opInfo_S2R; -+ case OP_SAT: return &opInfo_SAT; -+ case OP_SELP: return &opInfo_SEL; -+ case OP_SET: -+ case OP_SET_AND: -+ case OP_SET_OR: -+ case OP_SET_XOR: -+ if (i->def(0).getFile() != FILE_PREDICATE) { -+ if (isFloatType(i->dType)) { -+ if (i->dType == TYPE_F32) -+ return &opInfo_FSET_BF; -+ } else { -+ if (isFloatType(i->sType)) -+ return &opInfo_FSET; -+ return &opInfo_ISET; -+ } -+ } else { -+ if (isFloatType(i->sType)) -+ if (i->sType == TYPE_F64) -+ return &opInfo_DSETP; -+ else -+ return &opInfo_FSETP; -+ else -+ return &opInfo_ISETP; -+ } -+ break; -+ case OP_SGXT: return &opInfo_SGXT; -+ case OP_SHF: return &opInfo_SHF; -+ case OP_SHFL: return &opInfo_SHFL; -+ case OP_SHL: return &opInfo_SHL; -+ case OP_SHLADD: return &opInfo_LEA; -+ case OP_SHR: return &opInfo_SHR; -+ case OP_SLCT: -+ if (isFloatType(i->sType)) -+ return &opInfo_FCMP; -+ return &opInfo_ICMP; -+ case OP_STORE: -+ switch (i->src(0).getFile()) { -+ case FILE_MEMORY_LOCAL : return &opInfo_STL; -+ case FILE_MEMORY_SHARED: return &opInfo_STS; -+ case FILE_MEMORY_GLOBAL: return &opInfo_ST; -+ default: -+ break; -+ } -+ break; -+ case OP_SUB: return &opInfo_SUB; -+ case OP_SULDB: -+ case OP_SULDP: return &opInfo_SULD; -+ case OP_SUREDB: -+ case OP_SUREDP: return &opInfo_SUATOM; -+ case OP_SUSTB: -+ case OP_SUSTP: return &opInfo_SUST; -+ case OP_TEX: -+ case OP_TXB: -+ case OP_TXL: return &opInfo_TEX; -+ case OP_TXD: return &opInfo_TXD; -+ case OP_TXF: return &opInfo_TLD; -+ case OP_TXG: return &opInfo_TLD4; -+ case OP_TXLQ: return &opInfo_TMML; -+ case OP_TXQ: return &opInfo_TXQ; -+ case OP_VFETCH: return &opInfo_ALD; -+ case OP_VOTE: return &opInfo_VOTE; -+ case OP_WARPSYNC: return &opInfo_WARPSYNC; -+ default: -+ break; -+ } -+ return NULL; -+} -+ -+bool -+TargetGV100::isSatSupported(const Instruction *i) const -+{ -+ switch (i->dType) { -+ case TYPE_F32: -+ switch (i->op) { -+ case OP_ADD: -+ case OP_FMA: -+ case OP_MAD: -+ case OP_MUL: return true; -+ default: -+ break; -+ } -+ break; -+ default: -+ break; -+ } -+ return false; -+} -+ -+bool -+TargetGV100::isModSupported(const Instruction *i, int s, Modifier mod) const -+{ -+ const struct opInfo *info = nv50_ir::getOpInfo(i); -+ uint8_t mods = 0; -+ if (info && s < (int)ARRAY_SIZE(info->src)) -+ mods = info->src[s].mods; -+ return (mod & Modifier(mods)) == mod; -+} -+ -+bool -+TargetGV100::isOpSupported(operation op, DataType ty) const -+{ -+ if (op == OP_MAD || op == OP_FMA) -+ return true; -+ if (ty == TYPE_F32) { -+ if (op == OP_MAX) -+ return true; -+ } -+ if (op == OP_RSQ) -+ return true; -+ if (op == OP_SET || -+ op == OP_SET_AND || -+ op == OP_SET_OR || -+ op == OP_SET_XOR) -+ return true; -+ if (op == OP_SHLADD) -+ return true; -+ return false; -+} -+ -+bool -+TargetGV100::isBarrierRequired(const Instruction *i) const -+{ -+ switch (i->op) { -+ case OP_BREV: -+ return true; -+ default: -+ break; -+ } -+ -+ return TargetGM107::isBarrierRequired(i); -+} -+ -+bool -+TargetGV100::insnCanLoad(const Instruction *i, int s, -+ const Instruction *ld) const -+{ -+ const struct opInfo *info = nv50_ir::getOpInfo(i); -+ uint16_t files = 0; -+ -+ if (ld->src(0).getFile() == FILE_IMMEDIATE && ld->getSrc(0)->reg.data.u64 == 0) -+ return (!i->isPseudo() && -+ !i->asTex() && -+ i->op != OP_EXPORT && i->op != OP_STORE); -+ -+ if (ld->src(0).isIndirect(0)) -+ return false; -+ -+ if (info && s < (int)ARRAY_SIZE(info->src)) { -+ files = info->src[s].files; -+ if ((s == 1 && i->srcExists(2) && i->src(2).getFile() != FILE_GPR) || -+ (s == 2 && i->srcExists(1) && i->src(1).getFile() != FILE_GPR)) { -+ files &= ~(1 << FILE_MEMORY_CONST); -+ files &= ~(1 << FILE_IMMEDIATE); -+ } else -+ if ((i->op == OP_SHL || i->op == OP_SHR) && -+ ((s == 0 && i->srcExists(1) && i->src(1).getFile() != FILE_GPR) || -+ (s == 1 && i->srcExists(0) && i->src(0).getFile() != FILE_GPR))) { -+ files &= ~(1 << FILE_MEMORY_CONST); -+ files &= ~(1 << FILE_IMMEDIATE); -+ } -+ } -+ -+ if (ld->src(0).getFile() == FILE_IMMEDIATE) { -+ if (i->sType == TYPE_F64) { -+ if (ld->getSrc(0)->asImm()->reg.data.u64 & 0x00000000ffffffff) -+ return false; -+ } -+ } -+ -+ return (files & (1 << ld->src(0).getFile())); -+} -+ -+void -+TargetGV100::getBuiltinCode(const uint32_t **code, uint32_t *size) const -+{ -+ //XXX: find out why gv100 (tu1xx is fine) hangs without this -+ static uint32_t builtin[] = { -+ 0x0000794d, 0x00000000, 0x03800000, 0x03ffde00, -+ 0x0000794d, 0x00000000, 0x03800000, 0x03ffde00, -+ 0x0000794d, 0x00000000, 0x03800000, 0x03ffde00, -+ 0x0000794d, 0x00000000, 0x03800000, 0x03ffde00, -+ 0x0000794d, 0x00000000, 0x03800000, 0x03ffde00, -+ 0x0000794d, 0x00000000, 0x03800000, 0x03ffde00, -+ 0x0000794d, 0x00000000, 0x03800000, 0x03ffde00, -+ 0x0000794d, 0x00000000, 0x03800000, 0x03ffde00, -+ 0x0000794d, 0x00000000, 0x03800000, 0x03ffde00, -+ 0x0000794d, 0x00000000, 0x03800000, 0x03ffde00, -+ 0x0000794d, 0x00000000, 0x03800000, 0x03ffde00, -+ 0x0000794d, 0x00000000, 0x03800000, 0x03ffde00, -+ 0x0000794d, 0x00000000, 0x03800000, 0x03ffde00, -+ 0x0000794d, 0x00000000, 0x03800000, 0x03ffde00, -+ 0x0000794d, 0x00000000, 0x03800000, 0x03ffde00, -+ 0x0000794d, 0x00000000, 0x03800000, 0x03ffde00, -+ 0x0000794d, 0x00000000, 0x03800000, 0x03ffde00, -+ 0x0000794d, 0x00000000, 0x03800000, 0x03ffde00, -+ 0x0000794d, 0x00000000, 0x03800000, 0x03ffde00, -+ 0x0000794d, 0x00000000, 0x03800000, 0x03ffde00, -+ 0x0000794d, 0x00000000, 0x03800000, 0x03ffde00, -+ 0x0000794d, 0x00000000, 0x03800000, 0x03ffde00, -+ 0x0000794d, 0x00000000, 0x03800000, 0x03ffde00, -+ 0x0000794d, 0x00000000, 0x03800000, 0x03ffde00, -+ 0x0000794d, 0x00000000, 0x03800000, 0x03ffde00, -+ 0x0000794d, 0x00000000, 0x03800000, 0x03ffde00, -+ 0x0000794d, 0x00000000, 0x03800000, 0x03ffde00, -+ }; -+ *code = builtin; -+ *size = sizeof(builtin); -+} -+ -+uint32_t -+TargetGV100::getBuiltinOffset(int builtin) const -+{ -+ return 0; -+} -+ -+bool -+TargetGV100::runLegalizePass(Program *prog, CGStage stage) const -+{ -+ if (stage == CG_STAGE_PRE_SSA) { -+ GM107LoweringPass pass1(prog); -+ GV100LoweringPass pass2(prog); -+ pass1.run(prog, false, true); -+ pass2.run(prog, false, true); -+ return true; -+ } else -+ if (stage == CG_STAGE_SSA) { -+ GV100LegalizeSSA pass(prog); -+ return pass.run(prog, false, true); -+ } else -+ if (stage == CG_STAGE_POST_RA) { -+ NVC0LegalizePostRA pass(prog); -+ return pass.run(prog, false, true); -+ } -+ return false; -+} -+ -+CodeEmitter * -+TargetGV100::getCodeEmitter(Program::Type type) -+{ -+ return new CodeEmitterGV100(this); -+} -+ -+TargetGV100::TargetGV100(unsigned int chipset) -+ : TargetGM107(chipset) -+{ -+ initOpInfo(); -+}; -+ -+Target *getTargetGV100(unsigned int chipset) -+{ -+ return new TargetGV100(chipset); -+} -+ -+}; -diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gv100.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gv100.h -new file mode 100644 -index 00000000000..897e6a22d30 ---- /dev/null -+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gv100.h -@@ -0,0 +1,52 @@ -+/* -+ * Copyright 2020 Red Hat Inc. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice shall be included in -+ * all copies or substantial portions of the Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -+ * OTHER DEALINGS IN THE SOFTWARE. -+ */ -+#ifndef __NV50_IR_TARGET_GV100_H__ -+#define __NV50_IR_TARGET_GV100_H__ -+#include "codegen/nv50_ir_target_gm107.h" -+ -+namespace nv50_ir { -+ -+class TargetGV100 : public TargetGM107 { -+public: -+ TargetGV100(unsigned int chipset); -+ -+ virtual CodeEmitter *getCodeEmitter(Program::Type); -+ -+ virtual bool runLegalizePass(Program *, CGStage stage) const; -+ -+ virtual void getBuiltinCode(const uint32_t **code, uint32_t *size) const; -+ virtual uint32_t getBuiltinOffset(int builtin) const; -+ -+ virtual bool insnCanLoad(const Instruction *, int, const Instruction *) const; -+ virtual bool isOpSupported(operation, DataType) const; -+ virtual bool isModSupported(const Instruction *, int s, Modifier) const; -+ virtual bool isSatSupported(const Instruction *) const; -+ -+ virtual bool isBarrierRequired(const Instruction *) const; -+ -+private: -+ void initOpInfo(); -+ void initProps(const struct opProperties *, int); -+}; -+ -+}; -+#endif -diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp -index 60134b445db..ed5b343ccba 100644 ---- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp -+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp -@@ -30,7 +30,7 @@ Target *getTargetNVC0(unsigned int chipset) - } - - TargetNVC0::TargetNVC0(unsigned int card) : -- Target(card < 0x110, false, card >= 0xe4) -+ Target(card < 0x110, false, card >= 0xe4 && card < 0x140) - { - chipset = card; - initOpInfo(); -diff --git a/src/gallium/drivers/nouveau/meson.build b/src/gallium/drivers/nouveau/meson.build -index 7a1d18a6394..68cfebdf20c 100644 ---- a/src/gallium/drivers/nouveau/meson.build -+++ b/src/gallium/drivers/nouveau/meson.build -@@ -150,17 +150,31 @@ files_libnouveau = files( - 'codegen/nv50_ir_util.cpp', - 'codegen/nv50_ir_util.h', - 'codegen/unordered_set.h', -+ 'codegen/nv50_ir_emit_gv100.cpp', -+ 'codegen/nv50_ir_emit_gv100.h', - 'codegen/nv50_ir_emit_gk110.cpp', - 'codegen/nv50_ir_emit_gm107.cpp', - 'codegen/nv50_ir_emit_nvc0.cpp', -+ 'codegen/nv50_ir_lowering_gv100.cpp', -+ 'codegen/nv50_ir_lowering_gv100.h', - 'codegen/nv50_ir_lowering_gm107.cpp', - 'codegen/nv50_ir_lowering_gm107.h', - 'codegen/nv50_ir_lowering_nvc0.cpp', - 'codegen/nv50_ir_lowering_nvc0.h', -+ 'codegen/nv50_ir_target_gv100.cpp', -+ 'codegen/nv50_ir_target_gv100.h', - 'codegen/nv50_ir_target_gm107.cpp', - 'codegen/nv50_ir_target_gm107.h', - 'codegen/nv50_ir_target_nvc0.cpp', - 'codegen/nv50_ir_target_nvc0.h', -+ 'nvc0/cla0c0qmd.h', -+ 'nvc0/clc0c0qmd.h', -+ 'nvc0/clc3c0qmd.h', -+ 'nvc0/drf.h', -+ 'nvc0/qmd.h', -+ 'nvc0/qmda0c0.c', -+ 'nvc0/qmdc0c0.c', -+ 'nvc0/qmdc3c0.c', - 'nvc0/gm107_texture.xml.h', - 'nvc0/nvc0_3d.xml.h', - 'nvc0/nvc0_compute.c', -diff --git a/src/gallium/drivers/nouveau/nouveau_screen.c b/src/gallium/drivers/nouveau/nouveau_screen.c -index de9cce3812a..8606ba43c1a 100644 ---- a/src/gallium/drivers/nouveau/nouveau_screen.c -+++ b/src/gallium/drivers/nouveau/nouveau_screen.c -@@ -188,7 +188,11 @@ nouveau_screen_init(struct nouveau_screen *screen, struct nouveau_device *dev) - if (nv_dbg) - nouveau_mesa_debug = atoi(nv_dbg); - -- screen->prefer_nir = debug_get_bool_option("NV50_PROG_USE_NIR", false); -+ if (dev->chipset < 0x140) -+ screen->prefer_nir = debug_get_bool_option("NV50_PROG_USE_NIR", false); -+ else -+ screen->prefer_nir = true; -+ - screen->force_enable_cl = debug_get_bool_option("NOUVEAU_ENABLE_CL", false); - if (screen->force_enable_cl) - glsl_type_singleton_init_or_ref(); -diff --git a/src/gallium/drivers/nouveau/nv50/nv50_2d.xml.h b/src/gallium/drivers/nouveau/nv50/nv50_2d.xml.h -index 899d73d7398..31e7cf82233 100644 ---- a/src/gallium/drivers/nouveau/nv50/nv50_2d.xml.h -+++ b/src/gallium/drivers/nouveau/nv50/nv50_2d.xml.h -@@ -218,9 +218,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #define NV50_2D_PATTERN_SELECT_BITMAP_1X64 0x00000002 - #define NV50_2D_PATTERN_SELECT_COLOR 0x00000003 - --#define NVC0_2D_UNK02B8(i0) (0x000002b8 + 0x4*(i0)) --#define NVC0_2D_UNK02B8__ESIZE 0x00000004 --#define NVC0_2D_UNK02B8__LEN 0x00000009 -+#define NVC0_2D_SET_DST_COLOR_RENDER_TO_ZETA_SURFACE 0x000002b8 - - #define NVC0_2D_UNK2DC 0x000002dc - -diff --git a/src/gallium/drivers/nouveau/nv_object.xml.h b/src/gallium/drivers/nouveau/nv_object.xml.h -index 664bfae9f64..fac195d4846 100644 ---- a/src/gallium/drivers/nouveau/nv_object.xml.h -+++ b/src/gallium/drivers/nouveau/nv_object.xml.h -@@ -195,6 +195,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #define GM200_3D_CLASS 0x0000b197 - #define GP100_3D_CLASS 0x0000c097 - #define GP102_3D_CLASS 0x0000c197 -+#define GV100_3D_CLASS 0x0000c397 -+#define TU102_3D_CLASS 0x0000c597 - #define NV50_2D_CLASS 0x0000502d - #define NVC0_2D_CLASS 0x0000902d - #define NV50_COMPUTE_CLASS 0x000050c0 -@@ -207,6 +209,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #define GM200_COMPUTE_CLASS 0x0000b1c0 - #define GP100_COMPUTE_CLASS 0x0000c0c0 - #define GP104_COMPUTE_CLASS 0x0000c1c0 -+#define GV100_COMPUTE_CLASS 0x0000c3c0 -+#define TU102_COMPUTE_CLASS 0x0000c5c0 - #define NV84_CRYPT_CLASS 0x000074c1 - #define BLOB_NVC0_PCOPY1_CLASS 0x000090b8 - #define BLOB_NVC0_PCOPY0_CLASS 0x000090b5 -diff --git a/src/gallium/drivers/nouveau/nvc0/cla0c0qmd.h b/src/gallium/drivers/nouveau/nvc0/cla0c0qmd.h -new file mode 100644 -index 00000000000..c0829f1cdc2 ---- /dev/null -+++ b/src/gallium/drivers/nouveau/nvc0/cla0c0qmd.h -@@ -0,0 +1,660 @@ -+/******************************************************************************* -+ Copyright (c) 2016 NVIDIA Corporation -+ -+ Permission is hereby granted, free of charge, to any person obtaining a copy -+ of this software and associated documentation files (the "Software"), to -+ deal in the Software without restriction, including without limitation the -+ rights to use, copy, modify, merge, publish, distribute, sublicense, and/or -+ sell copies of the Software, and to permit persons to whom the Software is -+ furnished to do so, subject to the following conditions: -+ -+ The above copyright notice and this permission notice shall be -+ included in all copies or substantial portions of the Software. -+ -+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -+ DEALINGS IN THE SOFTWARE. -+ -+*******************************************************************************/ -+ -+/* AUTO GENERATED FILE -- DO NOT EDIT */ -+ -+#ifndef __CLA0C0QMD_H__ -+#define __CLA0C0QMD_H__ -+ -+/* -+** Queue Meta Data, Version 00_06 -+ */ -+ -+// The below C preprocessor definitions describe "multi-word" structures, where -+// fields may have bit numbers beyond 32. For example, MW(127:96) means -+// the field is in bits 0-31 of word number 3 of the structure. The "MW(X:Y)" -+// syntax is to distinguish from similar "X:Y" single-word definitions: the -+// macros historically used for single-word definitions would fail with -+// multi-word definitions. -+// -+// See nvmisc.h:DRF_VAL_MW() in the source code of the kernel -+// interface layer of nvidia.ko for an example of how to manipulate -+// these MW(X:Y) definitions. -+ -+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_A MW(30:0) -+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_B MW(31:31) -+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_C MW(62:32) -+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_D MW(63:63) -+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_E MW(94:64) -+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_F MW(95:95) -+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_G MW(126:96) -+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_H MW(127:127) -+#define NVA0C0_QMDV00_06_QMD_RESERVED_A_A MW(159:128) -+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_I MW(191:160) -+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_J MW(196:192) -+#define NVA0C0_QMDV00_06_QMD_RESERVED_A MW(199:197) -+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_K MW(200:200) -+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_K_FALSE 0x00000000 -+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_K_TRUE 0x00000001 -+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_L MW(201:201) -+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_L_FALSE 0x00000000 -+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_L_TRUE 0x00000001 -+#define NVA0C0_QMDV00_06_SEMAPHORE_RELEASE_ENABLE0 MW(202:202) -+#define NVA0C0_QMDV00_06_SEMAPHORE_RELEASE_ENABLE0_FALSE 0x00000000 -+#define NVA0C0_QMDV00_06_SEMAPHORE_RELEASE_ENABLE0_TRUE 0x00000001 -+#define NVA0C0_QMDV00_06_SEMAPHORE_RELEASE_ENABLE1 MW(203:203) -+#define NVA0C0_QMDV00_06_SEMAPHORE_RELEASE_ENABLE1_FALSE 0x00000000 -+#define NVA0C0_QMDV00_06_SEMAPHORE_RELEASE_ENABLE1_TRUE 0x00000001 -+#define NVA0C0_QMDV00_06_QMD_RESERVED_B MW(207:204) -+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_M MW(222:208) -+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_N MW(223:223) -+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_N_FALSE 0x00000000 -+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_N_TRUE 0x00000001 -+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_O MW(248:224) -+#define NVA0C0_QMDV00_06_QMD_RESERVED_C MW(249:249) -+#define NVA0C0_QMDV00_06_INVALIDATE_TEXTURE_HEADER_CACHE MW(250:250) -+#define NVA0C0_QMDV00_06_INVALIDATE_TEXTURE_HEADER_CACHE_FALSE 0x00000000 -+#define NVA0C0_QMDV00_06_INVALIDATE_TEXTURE_HEADER_CACHE_TRUE 0x00000001 -+#define NVA0C0_QMDV00_06_INVALIDATE_TEXTURE_SAMPLER_CACHE MW(251:251) -+#define NVA0C0_QMDV00_06_INVALIDATE_TEXTURE_SAMPLER_CACHE_FALSE 0x00000000 -+#define NVA0C0_QMDV00_06_INVALIDATE_TEXTURE_SAMPLER_CACHE_TRUE 0x00000001 -+#define NVA0C0_QMDV00_06_INVALIDATE_TEXTURE_DATA_CACHE MW(252:252) -+#define NVA0C0_QMDV00_06_INVALIDATE_TEXTURE_DATA_CACHE_FALSE 0x00000000 -+#define NVA0C0_QMDV00_06_INVALIDATE_TEXTURE_DATA_CACHE_TRUE 0x00000001 -+#define NVA0C0_QMDV00_06_INVALIDATE_SHADER_DATA_CACHE MW(253:253) -+#define NVA0C0_QMDV00_06_INVALIDATE_SHADER_DATA_CACHE_FALSE 0x00000000 -+#define NVA0C0_QMDV00_06_INVALIDATE_SHADER_DATA_CACHE_TRUE 0x00000001 -+#define NVA0C0_QMDV00_06_INVALIDATE_INSTRUCTION_CACHE MW(254:254) -+#define NVA0C0_QMDV00_06_INVALIDATE_INSTRUCTION_CACHE_FALSE 0x00000000 -+#define NVA0C0_QMDV00_06_INVALIDATE_INSTRUCTION_CACHE_TRUE 0x00000001 -+#define NVA0C0_QMDV00_06_INVALIDATE_SHADER_CONSTANT_CACHE MW(255:255) -+#define NVA0C0_QMDV00_06_INVALIDATE_SHADER_CONSTANT_CACHE_FALSE 0x00000000 -+#define NVA0C0_QMDV00_06_INVALIDATE_SHADER_CONSTANT_CACHE_TRUE 0x00000001 -+#define NVA0C0_QMDV00_06_PROGRAM_OFFSET MW(287:256) -+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_P MW(319:288) -+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_Q MW(327:320) -+#define NVA0C0_QMDV00_06_QMD_RESERVED_D MW(335:328) -+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_R MW(351:336) -+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_S MW(357:352) -+#define NVA0C0_QMDV00_06_QMD_RESERVED_E MW(365:358) -+#define NVA0C0_QMDV00_06_RELEASE_MEMBAR_TYPE MW(366:366) -+#define NVA0C0_QMDV00_06_RELEASE_MEMBAR_TYPE_FE_NONE 0x00000000 -+#define NVA0C0_QMDV00_06_RELEASE_MEMBAR_TYPE_FE_SYSMEMBAR 0x00000001 -+#define NVA0C0_QMDV00_06_CWD_MEMBAR_TYPE MW(369:368) -+#define NVA0C0_QMDV00_06_CWD_MEMBAR_TYPE_L1_NONE 0x00000000 -+#define NVA0C0_QMDV00_06_CWD_MEMBAR_TYPE_L1_SYSMEMBAR 0x00000001 -+#define NVA0C0_QMDV00_06_CWD_MEMBAR_TYPE_L1_MEMBAR 0x00000003 -+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_T MW(370:370) -+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_T_FALSE 0x00000000 -+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_T_TRUE 0x00000001 -+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_U MW(371:371) -+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_U_FALSE 0x00000000 -+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_U_TRUE 0x00000001 -+#define NVA0C0_QMDV00_06_THROTTLED MW(372:372) -+#define NVA0C0_QMDV00_06_THROTTLED_FALSE 0x00000000 -+#define NVA0C0_QMDV00_06_THROTTLED_TRUE 0x00000001 -+#define NVA0C0_QMDV00_06_QMD_RESERVED_E2_A MW(376:376) -+#define NVA0C0_QMDV00_06_QMD_RESERVED_E2_B MW(377:377) -+#define NVA0C0_QMDV00_06_API_VISIBLE_CALL_LIMIT MW(378:378) -+#define NVA0C0_QMDV00_06_API_VISIBLE_CALL_LIMIT__32 0x00000000 -+#define NVA0C0_QMDV00_06_API_VISIBLE_CALL_LIMIT_NO_CHECK 0x00000001 -+#define NVA0C0_QMDV00_06_SHARED_MEMORY_BANK_MAPPING MW(379:379) -+#define NVA0C0_QMDV00_06_SHARED_MEMORY_BANK_MAPPING_FOUR_BYTES_PER_BANK 0x00000000 -+#define NVA0C0_QMDV00_06_SHARED_MEMORY_BANK_MAPPING_EIGHT_BYTES_PER_BANK 0x00000001 -+#define NVA0C0_QMDV00_06_SAMPLER_INDEX MW(382:382) -+#define NVA0C0_QMDV00_06_SAMPLER_INDEX_INDEPENDENTLY 0x00000000 -+#define NVA0C0_QMDV00_06_SAMPLER_INDEX_VIA_HEADER_INDEX 0x00000001 -+#define NVA0C0_QMDV00_06_QMD_RESERVED_E3_A MW(383:383) -+#define NVA0C0_QMDV00_06_CTA_RASTER_WIDTH MW(415:384) -+#define NVA0C0_QMDV00_06_CTA_RASTER_HEIGHT MW(431:416) -+#define NVA0C0_QMDV00_06_CTA_RASTER_DEPTH MW(447:432) -+#define NVA0C0_QMDV00_06_CTA_RASTER_WIDTH_RESUME MW(479:448) -+#define NVA0C0_QMDV00_06_CTA_RASTER_HEIGHT_RESUME MW(495:480) -+#define NVA0C0_QMDV00_06_CTA_RASTER_DEPTH_RESUME MW(511:496) -+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_V MW(535:512) -+#define NVA0C0_QMDV00_06_QMD_RESERVED_F MW(542:536) -+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_W MW(543:543) -+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_W_FALSE 0x00000000 -+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_W_TRUE 0x00000001 -+#define NVA0C0_QMDV00_06_SHARED_MEMORY_SIZE MW(561:544) -+#define NVA0C0_QMDV00_06_QMD_RESERVED_G MW(575:562) -+#define NVA0C0_QMDV00_06_QMD_VERSION MW(579:576) -+#define NVA0C0_QMDV00_06_QMD_MAJOR_VERSION MW(583:580) -+#define NVA0C0_QMDV00_06_QMD_RESERVED_H MW(591:584) -+#define NVA0C0_QMDV00_06_CTA_THREAD_DIMENSION0 MW(607:592) -+#define NVA0C0_QMDV00_06_CTA_THREAD_DIMENSION1 MW(623:608) -+#define NVA0C0_QMDV00_06_CTA_THREAD_DIMENSION2 MW(639:624) -+#define NVA0C0_QMDV00_06_CONSTANT_BUFFER_VALID(i) MW((640+(i)*1):(640+(i)*1)) -+#define NVA0C0_QMDV00_06_CONSTANT_BUFFER_VALID_FALSE 0x00000000 -+#define NVA0C0_QMDV00_06_CONSTANT_BUFFER_VALID_TRUE 0x00000001 -+#define NVA0C0_QMDV00_06_QMD_RESERVED_I MW(668:648) -+#define NVA0C0_QMDV00_06_L1_CONFIGURATION MW(671:669) -+#define NVA0C0_QMDV00_06_L1_CONFIGURATION_DIRECTLY_ADDRESSABLE_MEMORY_SIZE_16KB 0x00000001 -+#define NVA0C0_QMDV00_06_L1_CONFIGURATION_DIRECTLY_ADDRESSABLE_MEMORY_SIZE_32KB 0x00000002 -+#define NVA0C0_QMDV00_06_L1_CONFIGURATION_DIRECTLY_ADDRESSABLE_MEMORY_SIZE_48KB 0x00000003 -+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_X MW(703:672) -+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_Y MW(735:704) -+#define NVA0C0_QMDV00_06_RELEASE0_ADDRESS_LOWER MW(767:736) -+#define NVA0C0_QMDV00_06_RELEASE0_ADDRESS_UPPER MW(775:768) -+#define NVA0C0_QMDV00_06_QMD_RESERVED_J MW(783:776) -+#define NVA0C0_QMDV00_06_RELEASE0_REDUCTION_OP MW(790:788) -+#define NVA0C0_QMDV00_06_RELEASE0_REDUCTION_OP_RED_ADD 0x00000000 -+#define NVA0C0_QMDV00_06_RELEASE0_REDUCTION_OP_RED_MIN 0x00000001 -+#define NVA0C0_QMDV00_06_RELEASE0_REDUCTION_OP_RED_MAX 0x00000002 -+#define NVA0C0_QMDV00_06_RELEASE0_REDUCTION_OP_RED_INC 0x00000003 -+#define NVA0C0_QMDV00_06_RELEASE0_REDUCTION_OP_RED_DEC 0x00000004 -+#define NVA0C0_QMDV00_06_RELEASE0_REDUCTION_OP_RED_AND 0x00000005 -+#define NVA0C0_QMDV00_06_RELEASE0_REDUCTION_OP_RED_OR 0x00000006 -+#define NVA0C0_QMDV00_06_RELEASE0_REDUCTION_OP_RED_XOR 0x00000007 -+#define NVA0C0_QMDV00_06_QMD_RESERVED_K MW(791:791) -+#define NVA0C0_QMDV00_06_RELEASE0_REDUCTION_FORMAT MW(793:792) -+#define NVA0C0_QMDV00_06_RELEASE0_REDUCTION_FORMAT_UNSIGNED_32 0x00000000 -+#define NVA0C0_QMDV00_06_RELEASE0_REDUCTION_FORMAT_SIGNED_32 0x00000001 -+#define NVA0C0_QMDV00_06_RELEASE0_REDUCTION_ENABLE MW(794:794) -+#define NVA0C0_QMDV00_06_RELEASE0_REDUCTION_ENABLE_FALSE 0x00000000 -+#define NVA0C0_QMDV00_06_RELEASE0_REDUCTION_ENABLE_TRUE 0x00000001 -+#define NVA0C0_QMDV00_06_RELEASE0_STRUCTURE_SIZE MW(799:799) -+#define NVA0C0_QMDV00_06_RELEASE0_STRUCTURE_SIZE_FOUR_WORDS 0x00000000 -+#define NVA0C0_QMDV00_06_RELEASE0_STRUCTURE_SIZE_ONE_WORD 0x00000001 -+#define NVA0C0_QMDV00_06_RELEASE0_PAYLOAD MW(831:800) -+#define NVA0C0_QMDV00_06_RELEASE1_ADDRESS_LOWER MW(863:832) -+#define NVA0C0_QMDV00_06_RELEASE1_ADDRESS_UPPER MW(871:864) -+#define NVA0C0_QMDV00_06_QMD_RESERVED_L MW(879:872) -+#define NVA0C0_QMDV00_06_RELEASE1_REDUCTION_OP MW(886:884) -+#define NVA0C0_QMDV00_06_RELEASE1_REDUCTION_OP_RED_ADD 0x00000000 -+#define NVA0C0_QMDV00_06_RELEASE1_REDUCTION_OP_RED_MIN 0x00000001 -+#define NVA0C0_QMDV00_06_RELEASE1_REDUCTION_OP_RED_MAX 0x00000002 -+#define NVA0C0_QMDV00_06_RELEASE1_REDUCTION_OP_RED_INC 0x00000003 -+#define NVA0C0_QMDV00_06_RELEASE1_REDUCTION_OP_RED_DEC 0x00000004 -+#define NVA0C0_QMDV00_06_RELEASE1_REDUCTION_OP_RED_AND 0x00000005 -+#define NVA0C0_QMDV00_06_RELEASE1_REDUCTION_OP_RED_OR 0x00000006 -+#define NVA0C0_QMDV00_06_RELEASE1_REDUCTION_OP_RED_XOR 0x00000007 -+#define NVA0C0_QMDV00_06_QMD_RESERVED_M MW(887:887) -+#define NVA0C0_QMDV00_06_RELEASE1_REDUCTION_FORMAT MW(889:888) -+#define NVA0C0_QMDV00_06_RELEASE1_REDUCTION_FORMAT_UNSIGNED_32 0x00000000 -+#define NVA0C0_QMDV00_06_RELEASE1_REDUCTION_FORMAT_SIGNED_32 0x00000001 -+#define NVA0C0_QMDV00_06_RELEASE1_REDUCTION_ENABLE MW(890:890) -+#define NVA0C0_QMDV00_06_RELEASE1_REDUCTION_ENABLE_FALSE 0x00000000 -+#define NVA0C0_QMDV00_06_RELEASE1_REDUCTION_ENABLE_TRUE 0x00000001 -+#define NVA0C0_QMDV00_06_RELEASE1_STRUCTURE_SIZE MW(895:895) -+#define NVA0C0_QMDV00_06_RELEASE1_STRUCTURE_SIZE_FOUR_WORDS 0x00000000 -+#define NVA0C0_QMDV00_06_RELEASE1_STRUCTURE_SIZE_ONE_WORD 0x00000001 -+#define NVA0C0_QMDV00_06_RELEASE1_PAYLOAD MW(927:896) -+#define NVA0C0_QMDV00_06_CONSTANT_BUFFER_ADDR_LOWER(i) MW((959+(i)*64):(928+(i)*64)) -+#define NVA0C0_QMDV00_06_CONSTANT_BUFFER_ADDR_UPPER(i) MW((967+(i)*64):(960+(i)*64)) -+#define NVA0C0_QMDV00_06_CONSTANT_BUFFER_RESERVED_ADDR(i) MW((973+(i)*64):(968+(i)*64)) -+#define NVA0C0_QMDV00_06_CONSTANT_BUFFER_INVALIDATE(i) MW((974+(i)*64):(974+(i)*64)) -+#define NVA0C0_QMDV00_06_CONSTANT_BUFFER_INVALIDATE_FALSE 0x00000000 -+#define NVA0C0_QMDV00_06_CONSTANT_BUFFER_INVALIDATE_TRUE 0x00000001 -+#define NVA0C0_QMDV00_06_CONSTANT_BUFFER_SIZE(i) MW((991+(i)*64):(975+(i)*64)) -+#define NVA0C0_QMDV00_06_SHADER_LOCAL_MEMORY_LOW_SIZE MW(1463:1440) -+#define NVA0C0_QMDV00_06_QMD_RESERVED_N MW(1466:1464) -+#define NVA0C0_QMDV00_06_BARRIER_COUNT MW(1471:1467) -+#define NVA0C0_QMDV00_06_SHADER_LOCAL_MEMORY_HIGH_SIZE MW(1495:1472) -+#define NVA0C0_QMDV00_06_REGISTER_COUNT MW(1503:1496) -+#define NVA0C0_QMDV00_06_SHADER_LOCAL_MEMORY_CRS_SIZE MW(1527:1504) -+#define NVA0C0_QMDV00_06_SASS_VERSION MW(1535:1528) -+#define NVA0C0_QMDV00_06_QMD_SPARE_A MW(1567:1536) -+#define NVA0C0_QMDV00_06_QMD_SPARE_B MW(1599:1568) -+#define NVA0C0_QMDV00_06_QMD_SPARE_C MW(1631:1600) -+#define NVA0C0_QMDV00_06_QMD_SPARE_D MW(1663:1632) -+#define NVA0C0_QMDV00_06_QMD_SPARE_E MW(1695:1664) -+#define NVA0C0_QMDV00_06_QMD_SPARE_F MW(1727:1696) -+#define NVA0C0_QMDV00_06_QMD_SPARE_G MW(1759:1728) -+#define NVA0C0_QMDV00_06_QMD_SPARE_H MW(1791:1760) -+#define NVA0C0_QMDV00_06_QMD_SPARE_I MW(1823:1792) -+#define NVA0C0_QMDV00_06_QMD_SPARE_J MW(1855:1824) -+#define NVA0C0_QMDV00_06_QMD_SPARE_K MW(1887:1856) -+#define NVA0C0_QMDV00_06_QMD_SPARE_L MW(1919:1888) -+#define NVA0C0_QMDV00_06_QMD_SPARE_M MW(1951:1920) -+#define NVA0C0_QMDV00_06_QMD_SPARE_N MW(1983:1952) -+#define NVA0C0_QMDV00_06_DEBUG_ID_UPPER MW(2015:1984) -+#define NVA0C0_QMDV00_06_DEBUG_ID_LOWER MW(2047:2016) -+ -+ -+/* -+** Queue Meta Data, Version 01_06 -+ */ -+ -+#define NVA0C0_QMDV01_06_OUTER_PUT MW(30:0) -+#define NVA0C0_QMDV01_06_OUTER_OVERFLOW MW(31:31) -+#define NVA0C0_QMDV01_06_OUTER_GET MW(62:32) -+#define NVA0C0_QMDV01_06_OUTER_STICKY_OVERFLOW MW(63:63) -+#define NVA0C0_QMDV01_06_INNER_GET MW(94:64) -+#define NVA0C0_QMDV01_06_INNER_OVERFLOW MW(95:95) -+#define NVA0C0_QMDV01_06_INNER_PUT MW(126:96) -+#define NVA0C0_QMDV01_06_INNER_STICKY_OVERFLOW MW(127:127) -+#define NVA0C0_QMDV01_06_QMD_RESERVED_A_A MW(159:128) -+#define NVA0C0_QMDV01_06_SCHEDULER_NEXT_QMD_POINTER MW(191:160) -+#define NVA0C0_QMDV01_06_QMD_GROUP_ID MW(197:192) -+#define NVA0C0_QMDV01_06_QMD_RESERVED_A MW(199:198) -+#define NVA0C0_QMDV01_06_SCHEDULE_ON_PUT_UPDATE_ENABLE MW(200:200) -+#define NVA0C0_QMDV01_06_SCHEDULE_ON_PUT_UPDATE_ENABLE_FALSE 0x00000000 -+#define NVA0C0_QMDV01_06_SCHEDULE_ON_PUT_UPDATE_ENABLE_TRUE 0x00000001 -+#define NVA0C0_QMDV01_06_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST MW(201:201) -+#define NVA0C0_QMDV01_06_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST_FALSE 0x00000000 -+#define NVA0C0_QMDV01_06_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST_TRUE 0x00000001 -+#define NVA0C0_QMDV01_06_SEMAPHORE_RELEASE_ENABLE0 MW(202:202) -+#define NVA0C0_QMDV01_06_SEMAPHORE_RELEASE_ENABLE0_FALSE 0x00000000 -+#define NVA0C0_QMDV01_06_SEMAPHORE_RELEASE_ENABLE0_TRUE 0x00000001 -+#define NVA0C0_QMDV01_06_SEMAPHORE_RELEASE_ENABLE1 MW(203:203) -+#define NVA0C0_QMDV01_06_SEMAPHORE_RELEASE_ENABLE1_FALSE 0x00000000 -+#define NVA0C0_QMDV01_06_SEMAPHORE_RELEASE_ENABLE1_TRUE 0x00000001 -+#define NVA0C0_QMDV01_06_REQUIRE_SCHEDULING_PCAS MW(204:204) -+#define NVA0C0_QMDV01_06_REQUIRE_SCHEDULING_PCAS_FALSE 0x00000000 -+#define NVA0C0_QMDV01_06_REQUIRE_SCHEDULING_PCAS_TRUE 0x00000001 -+#define NVA0C0_QMDV01_06_QMD_RESERVED_B MW(207:205) -+#define NVA0C0_QMDV01_06_SKED_PRIVATE_LIST_ADDR MW(222:208) -+#define NVA0C0_QMDV01_06_SKED_PRIVATE_LIST_VALID MW(223:223) -+#define NVA0C0_QMDV01_06_SKED_PRIVATE_LIST_VALID_FALSE 0x00000000 -+#define NVA0C0_QMDV01_06_SKED_PRIVATE_LIST_VALID_TRUE 0x00000001 -+#define NVA0C0_QMDV01_06_CIRCULAR_QUEUE_SIZE MW(248:224) -+#define NVA0C0_QMDV01_06_QMD_RESERVED_C MW(249:249) -+#define NVA0C0_QMDV01_06_INVALIDATE_TEXTURE_HEADER_CACHE MW(250:250) -+#define NVA0C0_QMDV01_06_INVALIDATE_TEXTURE_HEADER_CACHE_FALSE 0x00000000 -+#define NVA0C0_QMDV01_06_INVALIDATE_TEXTURE_HEADER_CACHE_TRUE 0x00000001 -+#define NVA0C0_QMDV01_06_INVALIDATE_TEXTURE_SAMPLER_CACHE MW(251:251) -+#define NVA0C0_QMDV01_06_INVALIDATE_TEXTURE_SAMPLER_CACHE_FALSE 0x00000000 -+#define NVA0C0_QMDV01_06_INVALIDATE_TEXTURE_SAMPLER_CACHE_TRUE 0x00000001 -+#define NVA0C0_QMDV01_06_INVALIDATE_TEXTURE_DATA_CACHE MW(252:252) -+#define NVA0C0_QMDV01_06_INVALIDATE_TEXTURE_DATA_CACHE_FALSE 0x00000000 -+#define NVA0C0_QMDV01_06_INVALIDATE_TEXTURE_DATA_CACHE_TRUE 0x00000001 -+#define NVA0C0_QMDV01_06_INVALIDATE_SHADER_DATA_CACHE MW(253:253) -+#define NVA0C0_QMDV01_06_INVALIDATE_SHADER_DATA_CACHE_FALSE 0x00000000 -+#define NVA0C0_QMDV01_06_INVALIDATE_SHADER_DATA_CACHE_TRUE 0x00000001 -+#define NVA0C0_QMDV01_06_INVALIDATE_INSTRUCTION_CACHE MW(254:254) -+#define NVA0C0_QMDV01_06_INVALIDATE_INSTRUCTION_CACHE_FALSE 0x00000000 -+#define NVA0C0_QMDV01_06_INVALIDATE_INSTRUCTION_CACHE_TRUE 0x00000001 -+#define NVA0C0_QMDV01_06_INVALIDATE_SHADER_CONSTANT_CACHE MW(255:255) -+#define NVA0C0_QMDV01_06_INVALIDATE_SHADER_CONSTANT_CACHE_FALSE 0x00000000 -+#define NVA0C0_QMDV01_06_INVALIDATE_SHADER_CONSTANT_CACHE_TRUE 0x00000001 -+#define NVA0C0_QMDV01_06_PROGRAM_OFFSET MW(287:256) -+#define NVA0C0_QMDV01_06_CIRCULAR_QUEUE_ADDR_LOWER MW(319:288) -+#define NVA0C0_QMDV01_06_CIRCULAR_QUEUE_ADDR_UPPER MW(327:320) -+#define NVA0C0_QMDV01_06_QMD_RESERVED_D MW(335:328) -+#define NVA0C0_QMDV01_06_CIRCULAR_QUEUE_ENTRY_SIZE MW(351:336) -+#define NVA0C0_QMDV01_06_CWD_REFERENCE_COUNT_ID MW(357:352) -+#define NVA0C0_QMDV01_06_CWD_REFERENCE_COUNT_DELTA_MINUS_ONE MW(365:358) -+#define NVA0C0_QMDV01_06_RELEASE_MEMBAR_TYPE MW(366:366) -+#define NVA0C0_QMDV01_06_RELEASE_MEMBAR_TYPE_FE_NONE 0x00000000 -+#define NVA0C0_QMDV01_06_RELEASE_MEMBAR_TYPE_FE_SYSMEMBAR 0x00000001 -+#define NVA0C0_QMDV01_06_CWD_REFERENCE_COUNT_INCR_ENABLE MW(367:367) -+#define NVA0C0_QMDV01_06_CWD_REFERENCE_COUNT_INCR_ENABLE_FALSE 0x00000000 -+#define NVA0C0_QMDV01_06_CWD_REFERENCE_COUNT_INCR_ENABLE_TRUE 0x00000001 -+#define NVA0C0_QMDV01_06_CWD_MEMBAR_TYPE MW(369:368) -+#define NVA0C0_QMDV01_06_CWD_MEMBAR_TYPE_L1_NONE 0x00000000 -+#define NVA0C0_QMDV01_06_CWD_MEMBAR_TYPE_L1_SYSMEMBAR 0x00000001 -+#define NVA0C0_QMDV01_06_CWD_MEMBAR_TYPE_L1_MEMBAR 0x00000003 -+#define NVA0C0_QMDV01_06_SEQUENTIALLY_RUN_CTAS MW(370:370) -+#define NVA0C0_QMDV01_06_SEQUENTIALLY_RUN_CTAS_FALSE 0x00000000 -+#define NVA0C0_QMDV01_06_SEQUENTIALLY_RUN_CTAS_TRUE 0x00000001 -+#define NVA0C0_QMDV01_06_CWD_REFERENCE_COUNT_DECR_ENABLE MW(371:371) -+#define NVA0C0_QMDV01_06_CWD_REFERENCE_COUNT_DECR_ENABLE_FALSE 0x00000000 -+#define NVA0C0_QMDV01_06_CWD_REFERENCE_COUNT_DECR_ENABLE_TRUE 0x00000001 -+#define NVA0C0_QMDV01_06_THROTTLED MW(372:372) -+#define NVA0C0_QMDV01_06_THROTTLED_FALSE 0x00000000 -+#define NVA0C0_QMDV01_06_THROTTLED_TRUE 0x00000001 -+#define NVA0C0_QMDV01_06_FP32_NAN_BEHAVIOR MW(376:376) -+#define NVA0C0_QMDV01_06_FP32_NAN_BEHAVIOR_LEGACY 0x00000000 -+#define NVA0C0_QMDV01_06_FP32_NAN_BEHAVIOR_FP64_COMPATIBLE 0x00000001 -+#define NVA0C0_QMDV01_06_FP32_F2I_NAN_BEHAVIOR MW(377:377) -+#define NVA0C0_QMDV01_06_FP32_F2I_NAN_BEHAVIOR_PASS_ZERO 0x00000000 -+#define NVA0C0_QMDV01_06_FP32_F2I_NAN_BEHAVIOR_PASS_INDEFINITE 0x00000001 -+#define NVA0C0_QMDV01_06_API_VISIBLE_CALL_LIMIT MW(378:378) -+#define NVA0C0_QMDV01_06_API_VISIBLE_CALL_LIMIT__32 0x00000000 -+#define NVA0C0_QMDV01_06_API_VISIBLE_CALL_LIMIT_NO_CHECK 0x00000001 -+#define NVA0C0_QMDV01_06_SHARED_MEMORY_BANK_MAPPING MW(379:379) -+#define NVA0C0_QMDV01_06_SHARED_MEMORY_BANK_MAPPING_FOUR_BYTES_PER_BANK 0x00000000 -+#define NVA0C0_QMDV01_06_SHARED_MEMORY_BANK_MAPPING_EIGHT_BYTES_PER_BANK 0x00000001 -+#define NVA0C0_QMDV01_06_SAMPLER_INDEX MW(382:382) -+#define NVA0C0_QMDV01_06_SAMPLER_INDEX_INDEPENDENTLY 0x00000000 -+#define NVA0C0_QMDV01_06_SAMPLER_INDEX_VIA_HEADER_INDEX 0x00000001 -+#define NVA0C0_QMDV01_06_FP32_NARROW_INSTRUCTION MW(383:383) -+#define NVA0C0_QMDV01_06_FP32_NARROW_INSTRUCTION_KEEP_DENORMS 0x00000000 -+#define NVA0C0_QMDV01_06_FP32_NARROW_INSTRUCTION_FLUSH_DENORMS 0x00000001 -+#define NVA0C0_QMDV01_06_CTA_RASTER_WIDTH MW(415:384) -+#define NVA0C0_QMDV01_06_CTA_RASTER_HEIGHT MW(431:416) -+#define NVA0C0_QMDV01_06_CTA_RASTER_DEPTH MW(447:432) -+#define NVA0C0_QMDV01_06_CTA_RASTER_WIDTH_RESUME MW(479:448) -+#define NVA0C0_QMDV01_06_CTA_RASTER_HEIGHT_RESUME MW(495:480) -+#define NVA0C0_QMDV01_06_CTA_RASTER_DEPTH_RESUME MW(511:496) -+#define NVA0C0_QMDV01_06_LAUNCH_QUOTA MW(535:512) -+#define NVA0C0_QMDV01_06_QMD_RESERVED_F MW(542:536) -+#define NVA0C0_QMDV01_06_LAUNCH_QUOTA_ENABLE MW(543:543) -+#define NVA0C0_QMDV01_06_LAUNCH_QUOTA_ENABLE_FALSE 0x00000000 -+#define NVA0C0_QMDV01_06_LAUNCH_QUOTA_ENABLE_TRUE 0x00000001 -+#define NVA0C0_QMDV01_06_SHARED_MEMORY_SIZE MW(561:544) -+#define NVA0C0_QMDV01_06_QMD_RESERVED_G MW(575:562) -+#define NVA0C0_QMDV01_06_QMD_VERSION MW(579:576) -+#define NVA0C0_QMDV01_06_QMD_MAJOR_VERSION MW(583:580) -+#define NVA0C0_QMDV01_06_QMD_RESERVED_H MW(591:584) -+#define NVA0C0_QMDV01_06_CTA_THREAD_DIMENSION0 MW(607:592) -+#define NVA0C0_QMDV01_06_CTA_THREAD_DIMENSION1 MW(623:608) -+#define NVA0C0_QMDV01_06_CTA_THREAD_DIMENSION2 MW(639:624) -+#define NVA0C0_QMDV01_06_CONSTANT_BUFFER_VALID(i) MW((640+(i)*1):(640+(i)*1)) -+#define NVA0C0_QMDV01_06_CONSTANT_BUFFER_VALID_FALSE 0x00000000 -+#define NVA0C0_QMDV01_06_CONSTANT_BUFFER_VALID_TRUE 0x00000001 -+#define NVA0C0_QMDV01_06_QMD_RESERVED_I MW(668:648) -+#define NVA0C0_QMDV01_06_L1_CONFIGURATION MW(671:669) -+#define NVA0C0_QMDV01_06_L1_CONFIGURATION_DIRECTLY_ADDRESSABLE_MEMORY_SIZE_16KB 0x00000001 -+#define NVA0C0_QMDV01_06_L1_CONFIGURATION_DIRECTLY_ADDRESSABLE_MEMORY_SIZE_32KB 0x00000002 -+#define NVA0C0_QMDV01_06_L1_CONFIGURATION_DIRECTLY_ADDRESSABLE_MEMORY_SIZE_48KB 0x00000003 -+#define NVA0C0_QMDV01_06_SM_DISABLE_MASK_LOWER MW(703:672) -+#define NVA0C0_QMDV01_06_SM_DISABLE_MASK_UPPER MW(735:704) -+#define NVA0C0_QMDV01_06_RELEASE0_ADDRESS_LOWER MW(767:736) -+#define NVA0C0_QMDV01_06_RELEASE0_ADDRESS_UPPER MW(775:768) -+#define NVA0C0_QMDV01_06_QMD_RESERVED_J MW(783:776) -+#define NVA0C0_QMDV01_06_RELEASE0_REDUCTION_OP MW(790:788) -+#define NVA0C0_QMDV01_06_RELEASE0_REDUCTION_OP_RED_ADD 0x00000000 -+#define NVA0C0_QMDV01_06_RELEASE0_REDUCTION_OP_RED_MIN 0x00000001 -+#define NVA0C0_QMDV01_06_RELEASE0_REDUCTION_OP_RED_MAX 0x00000002 -+#define NVA0C0_QMDV01_06_RELEASE0_REDUCTION_OP_RED_INC 0x00000003 -+#define NVA0C0_QMDV01_06_RELEASE0_REDUCTION_OP_RED_DEC 0x00000004 -+#define NVA0C0_QMDV01_06_RELEASE0_REDUCTION_OP_RED_AND 0x00000005 -+#define NVA0C0_QMDV01_06_RELEASE0_REDUCTION_OP_RED_OR 0x00000006 -+#define NVA0C0_QMDV01_06_RELEASE0_REDUCTION_OP_RED_XOR 0x00000007 -+#define NVA0C0_QMDV01_06_QMD_RESERVED_K MW(791:791) -+#define NVA0C0_QMDV01_06_RELEASE0_REDUCTION_FORMAT MW(793:792) -+#define NVA0C0_QMDV01_06_RELEASE0_REDUCTION_FORMAT_UNSIGNED_32 0x00000000 -+#define NVA0C0_QMDV01_06_RELEASE0_REDUCTION_FORMAT_SIGNED_32 0x00000001 -+#define NVA0C0_QMDV01_06_RELEASE0_REDUCTION_ENABLE MW(794:794) -+#define NVA0C0_QMDV01_06_RELEASE0_REDUCTION_ENABLE_FALSE 0x00000000 -+#define NVA0C0_QMDV01_06_RELEASE0_REDUCTION_ENABLE_TRUE 0x00000001 -+#define NVA0C0_QMDV01_06_RELEASE0_STRUCTURE_SIZE MW(799:799) -+#define NVA0C0_QMDV01_06_RELEASE0_STRUCTURE_SIZE_FOUR_WORDS 0x00000000 -+#define NVA0C0_QMDV01_06_RELEASE0_STRUCTURE_SIZE_ONE_WORD 0x00000001 -+#define NVA0C0_QMDV01_06_RELEASE0_PAYLOAD MW(831:800) -+#define NVA0C0_QMDV01_06_RELEASE1_ADDRESS_LOWER MW(863:832) -+#define NVA0C0_QMDV01_06_RELEASE1_ADDRESS_UPPER MW(871:864) -+#define NVA0C0_QMDV01_06_QMD_RESERVED_L MW(879:872) -+#define NVA0C0_QMDV01_06_RELEASE1_REDUCTION_OP MW(886:884) -+#define NVA0C0_QMDV01_06_RELEASE1_REDUCTION_OP_RED_ADD 0x00000000 -+#define NVA0C0_QMDV01_06_RELEASE1_REDUCTION_OP_RED_MIN 0x00000001 -+#define NVA0C0_QMDV01_06_RELEASE1_REDUCTION_OP_RED_MAX 0x00000002 -+#define NVA0C0_QMDV01_06_RELEASE1_REDUCTION_OP_RED_INC 0x00000003 -+#define NVA0C0_QMDV01_06_RELEASE1_REDUCTION_OP_RED_DEC 0x00000004 -+#define NVA0C0_QMDV01_06_RELEASE1_REDUCTION_OP_RED_AND 0x00000005 -+#define NVA0C0_QMDV01_06_RELEASE1_REDUCTION_OP_RED_OR 0x00000006 -+#define NVA0C0_QMDV01_06_RELEASE1_REDUCTION_OP_RED_XOR 0x00000007 -+#define NVA0C0_QMDV01_06_QMD_RESERVED_M MW(887:887) -+#define NVA0C0_QMDV01_06_RELEASE1_REDUCTION_FORMAT MW(889:888) -+#define NVA0C0_QMDV01_06_RELEASE1_REDUCTION_FORMAT_UNSIGNED_32 0x00000000 -+#define NVA0C0_QMDV01_06_RELEASE1_REDUCTION_FORMAT_SIGNED_32 0x00000001 -+#define NVA0C0_QMDV01_06_RELEASE1_REDUCTION_ENABLE MW(890:890) -+#define NVA0C0_QMDV01_06_RELEASE1_REDUCTION_ENABLE_FALSE 0x00000000 -+#define NVA0C0_QMDV01_06_RELEASE1_REDUCTION_ENABLE_TRUE 0x00000001 -+#define NVA0C0_QMDV01_06_RELEASE1_STRUCTURE_SIZE MW(895:895) -+#define NVA0C0_QMDV01_06_RELEASE1_STRUCTURE_SIZE_FOUR_WORDS 0x00000000 -+#define NVA0C0_QMDV01_06_RELEASE1_STRUCTURE_SIZE_ONE_WORD 0x00000001 -+#define NVA0C0_QMDV01_06_RELEASE1_PAYLOAD MW(927:896) -+#define NVA0C0_QMDV01_06_CONSTANT_BUFFER_ADDR_LOWER(i) MW((959+(i)*64):(928+(i)*64)) -+#define NVA0C0_QMDV01_06_CONSTANT_BUFFER_ADDR_UPPER(i) MW((967+(i)*64):(960+(i)*64)) -+#define NVA0C0_QMDV01_06_CONSTANT_BUFFER_RESERVED_ADDR(i) MW((973+(i)*64):(968+(i)*64)) -+#define NVA0C0_QMDV01_06_CONSTANT_BUFFER_INVALIDATE(i) MW((974+(i)*64):(974+(i)*64)) -+#define NVA0C0_QMDV01_06_CONSTANT_BUFFER_INVALIDATE_FALSE 0x00000000 -+#define NVA0C0_QMDV01_06_CONSTANT_BUFFER_INVALIDATE_TRUE 0x00000001 -+#define NVA0C0_QMDV01_06_CONSTANT_BUFFER_SIZE(i) MW((991+(i)*64):(975+(i)*64)) -+#define NVA0C0_QMDV01_06_SHADER_LOCAL_MEMORY_LOW_SIZE MW(1463:1440) -+#define NVA0C0_QMDV01_06_QMD_RESERVED_N MW(1466:1464) -+#define NVA0C0_QMDV01_06_BARRIER_COUNT MW(1471:1467) -+#define NVA0C0_QMDV01_06_SHADER_LOCAL_MEMORY_HIGH_SIZE MW(1495:1472) -+#define NVA0C0_QMDV01_06_REGISTER_COUNT MW(1503:1496) -+#define NVA0C0_QMDV01_06_SHADER_LOCAL_MEMORY_CRS_SIZE MW(1527:1504) -+#define NVA0C0_QMDV01_06_SASS_VERSION MW(1535:1528) -+#define NVA0C0_QMDV01_06_HW_ONLY_INNER_GET MW(1566:1536) -+#define NVA0C0_QMDV01_06_HW_ONLY_REQUIRE_SCHEDULING_PCAS MW(1567:1567) -+#define NVA0C0_QMDV01_06_HW_ONLY_INNER_PUT MW(1598:1568) -+#define NVA0C0_QMDV01_06_HW_ONLY_SCHEDULE_ON_PUT_UPDATE_ENABLE MW(1599:1599) -+#define NVA0C0_QMDV01_06_QUEUE_ENTRIES_PER_CTA_MINUS_ONE MW(1606:1600) -+#define NVA0C0_QMDV01_06_QMD_RESERVED_Q MW(1609:1607) -+#define NVA0C0_QMDV01_06_COALESCE_WAITING_PERIOD MW(1617:1610) -+#define NVA0C0_QMDV01_06_QMD_RESERVED_R MW(1631:1618) -+#define NVA0C0_QMDV01_06_QMD_SPARE_D MW(1663:1632) -+#define NVA0C0_QMDV01_06_QMD_SPARE_E MW(1695:1664) -+#define NVA0C0_QMDV01_06_QMD_SPARE_F MW(1727:1696) -+#define NVA0C0_QMDV01_06_QMD_SPARE_G MW(1759:1728) -+#define NVA0C0_QMDV01_06_QMD_SPARE_H MW(1791:1760) -+#define NVA0C0_QMDV01_06_QMD_SPARE_I MW(1823:1792) -+#define NVA0C0_QMDV01_06_QMD_SPARE_J MW(1855:1824) -+#define NVA0C0_QMDV01_06_QMD_SPARE_K MW(1887:1856) -+#define NVA0C0_QMDV01_06_QMD_SPARE_L MW(1919:1888) -+#define NVA0C0_QMDV01_06_QMD_SPARE_M MW(1951:1920) -+#define NVA0C0_QMDV01_06_QMD_SPARE_N MW(1983:1952) -+#define NVA0C0_QMDV01_06_DEBUG_ID_UPPER MW(2015:1984) -+#define NVA0C0_QMDV01_06_DEBUG_ID_LOWER MW(2047:2016) -+ -+ -+/* -+** Queue Meta Data, Version 01_07 -+ */ -+ -+#define NVA0C0_QMDV01_07_OUTER_PUT MW(30:0) -+#define NVA0C0_QMDV01_07_OUTER_OVERFLOW MW(31:31) -+#define NVA0C0_QMDV01_07_OUTER_GET MW(62:32) -+#define NVA0C0_QMDV01_07_OUTER_STICKY_OVERFLOW MW(63:63) -+#define NVA0C0_QMDV01_07_INNER_GET MW(94:64) -+#define NVA0C0_QMDV01_07_INNER_OVERFLOW MW(95:95) -+#define NVA0C0_QMDV01_07_INNER_PUT MW(126:96) -+#define NVA0C0_QMDV01_07_INNER_STICKY_OVERFLOW MW(127:127) -+#define NVA0C0_QMDV01_07_QMD_RESERVED_A_A MW(159:128) -+#define NVA0C0_QMDV01_07_DEPENDENT_QMD_POINTER MW(191:160) -+#define NVA0C0_QMDV01_07_QMD_GROUP_ID MW(197:192) -+#define NVA0C0_QMDV01_07_QMD_RESERVED_A MW(200:198) -+#define NVA0C0_QMDV01_07_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST MW(201:201) -+#define NVA0C0_QMDV01_07_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST_FALSE 0x00000000 -+#define NVA0C0_QMDV01_07_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST_TRUE 0x00000001 -+#define NVA0C0_QMDV01_07_SEMAPHORE_RELEASE_ENABLE0 MW(202:202) -+#define NVA0C0_QMDV01_07_SEMAPHORE_RELEASE_ENABLE0_FALSE 0x00000000 -+#define NVA0C0_QMDV01_07_SEMAPHORE_RELEASE_ENABLE0_TRUE 0x00000001 -+#define NVA0C0_QMDV01_07_SEMAPHORE_RELEASE_ENABLE1 MW(203:203) -+#define NVA0C0_QMDV01_07_SEMAPHORE_RELEASE_ENABLE1_FALSE 0x00000000 -+#define NVA0C0_QMDV01_07_SEMAPHORE_RELEASE_ENABLE1_TRUE 0x00000001 -+#define NVA0C0_QMDV01_07_REQUIRE_SCHEDULING_PCAS MW(204:204) -+#define NVA0C0_QMDV01_07_REQUIRE_SCHEDULING_PCAS_FALSE 0x00000000 -+#define NVA0C0_QMDV01_07_REQUIRE_SCHEDULING_PCAS_TRUE 0x00000001 -+#define NVA0C0_QMDV01_07_DEPENDENT_QMD_SCHEDULE_ENABLE MW(205:205) -+#define NVA0C0_QMDV01_07_DEPENDENT_QMD_SCHEDULE_ENABLE_FALSE 0x00000000 -+#define NVA0C0_QMDV01_07_DEPENDENT_QMD_SCHEDULE_ENABLE_TRUE 0x00000001 -+#define NVA0C0_QMDV01_07_DEPENDENT_QMD_TYPE MW(206:206) -+#define NVA0C0_QMDV01_07_DEPENDENT_QMD_TYPE_QUEUE 0x00000000 -+#define NVA0C0_QMDV01_07_DEPENDENT_QMD_TYPE_GRID 0x00000001 -+#define NVA0C0_QMDV01_07_DEPENDENT_QMD_FIELD_COPY MW(207:207) -+#define NVA0C0_QMDV01_07_DEPENDENT_QMD_FIELD_COPY_FALSE 0x00000000 -+#define NVA0C0_QMDV01_07_DEPENDENT_QMD_FIELD_COPY_TRUE 0x00000001 -+#define NVA0C0_QMDV01_07_QMD_RESERVED_B MW(223:208) -+#define NVA0C0_QMDV01_07_CIRCULAR_QUEUE_SIZE MW(248:224) -+#define NVA0C0_QMDV01_07_QMD_RESERVED_C MW(249:249) -+#define NVA0C0_QMDV01_07_INVALIDATE_TEXTURE_HEADER_CACHE MW(250:250) -+#define NVA0C0_QMDV01_07_INVALIDATE_TEXTURE_HEADER_CACHE_FALSE 0x00000000 -+#define NVA0C0_QMDV01_07_INVALIDATE_TEXTURE_HEADER_CACHE_TRUE 0x00000001 -+#define NVA0C0_QMDV01_07_INVALIDATE_TEXTURE_SAMPLER_CACHE MW(251:251) -+#define NVA0C0_QMDV01_07_INVALIDATE_TEXTURE_SAMPLER_CACHE_FALSE 0x00000000 -+#define NVA0C0_QMDV01_07_INVALIDATE_TEXTURE_SAMPLER_CACHE_TRUE 0x00000001 -+#define NVA0C0_QMDV01_07_INVALIDATE_TEXTURE_DATA_CACHE MW(252:252) -+#define NVA0C0_QMDV01_07_INVALIDATE_TEXTURE_DATA_CACHE_FALSE 0x00000000 -+#define NVA0C0_QMDV01_07_INVALIDATE_TEXTURE_DATA_CACHE_TRUE 0x00000001 -+#define NVA0C0_QMDV01_07_INVALIDATE_SHADER_DATA_CACHE MW(253:253) -+#define NVA0C0_QMDV01_07_INVALIDATE_SHADER_DATA_CACHE_FALSE 0x00000000 -+#define NVA0C0_QMDV01_07_INVALIDATE_SHADER_DATA_CACHE_TRUE 0x00000001 -+#define NVA0C0_QMDV01_07_INVALIDATE_INSTRUCTION_CACHE MW(254:254) -+#define NVA0C0_QMDV01_07_INVALIDATE_INSTRUCTION_CACHE_FALSE 0x00000000 -+#define NVA0C0_QMDV01_07_INVALIDATE_INSTRUCTION_CACHE_TRUE 0x00000001 -+#define NVA0C0_QMDV01_07_INVALIDATE_SHADER_CONSTANT_CACHE MW(255:255) -+#define NVA0C0_QMDV01_07_INVALIDATE_SHADER_CONSTANT_CACHE_FALSE 0x00000000 -+#define NVA0C0_QMDV01_07_INVALIDATE_SHADER_CONSTANT_CACHE_TRUE 0x00000001 -+#define NVA0C0_QMDV01_07_PROGRAM_OFFSET MW(287:256) -+#define NVA0C0_QMDV01_07_CIRCULAR_QUEUE_ADDR_LOWER MW(319:288) -+#define NVA0C0_QMDV01_07_CIRCULAR_QUEUE_ADDR_UPPER MW(327:320) -+#define NVA0C0_QMDV01_07_QMD_RESERVED_D MW(335:328) -+#define NVA0C0_QMDV01_07_CIRCULAR_QUEUE_ENTRY_SIZE MW(351:336) -+#define NVA0C0_QMDV01_07_CWD_REFERENCE_COUNT_ID MW(357:352) -+#define NVA0C0_QMDV01_07_CWD_REFERENCE_COUNT_DELTA_MINUS_ONE MW(365:358) -+#define NVA0C0_QMDV01_07_RELEASE_MEMBAR_TYPE MW(366:366) -+#define NVA0C0_QMDV01_07_RELEASE_MEMBAR_TYPE_FE_NONE 0x00000000 -+#define NVA0C0_QMDV01_07_RELEASE_MEMBAR_TYPE_FE_SYSMEMBAR 0x00000001 -+#define NVA0C0_QMDV01_07_CWD_REFERENCE_COUNT_INCR_ENABLE MW(367:367) -+#define NVA0C0_QMDV01_07_CWD_REFERENCE_COUNT_INCR_ENABLE_FALSE 0x00000000 -+#define NVA0C0_QMDV01_07_CWD_REFERENCE_COUNT_INCR_ENABLE_TRUE 0x00000001 -+#define NVA0C0_QMDV01_07_CWD_MEMBAR_TYPE MW(369:368) -+#define NVA0C0_QMDV01_07_CWD_MEMBAR_TYPE_L1_NONE 0x00000000 -+#define NVA0C0_QMDV01_07_CWD_MEMBAR_TYPE_L1_SYSMEMBAR 0x00000001 -+#define NVA0C0_QMDV01_07_CWD_MEMBAR_TYPE_L1_MEMBAR 0x00000003 -+#define NVA0C0_QMDV01_07_SEQUENTIALLY_RUN_CTAS MW(370:370) -+#define NVA0C0_QMDV01_07_SEQUENTIALLY_RUN_CTAS_FALSE 0x00000000 -+#define NVA0C0_QMDV01_07_SEQUENTIALLY_RUN_CTAS_TRUE 0x00000001 -+#define NVA0C0_QMDV01_07_CWD_REFERENCE_COUNT_DECR_ENABLE MW(371:371) -+#define NVA0C0_QMDV01_07_CWD_REFERENCE_COUNT_DECR_ENABLE_FALSE 0x00000000 -+#define NVA0C0_QMDV01_07_CWD_REFERENCE_COUNT_DECR_ENABLE_TRUE 0x00000001 -+#define NVA0C0_QMDV01_07_THROTTLED MW(372:372) -+#define NVA0C0_QMDV01_07_THROTTLED_FALSE 0x00000000 -+#define NVA0C0_QMDV01_07_THROTTLED_TRUE 0x00000001 -+#define NVA0C0_QMDV01_07_FP32_NAN_BEHAVIOR MW(376:376) -+#define NVA0C0_QMDV01_07_FP32_NAN_BEHAVIOR_LEGACY 0x00000000 -+#define NVA0C0_QMDV01_07_FP32_NAN_BEHAVIOR_FP64_COMPATIBLE 0x00000001 -+#define NVA0C0_QMDV01_07_FP32_F2I_NAN_BEHAVIOR MW(377:377) -+#define NVA0C0_QMDV01_07_FP32_F2I_NAN_BEHAVIOR_PASS_ZERO 0x00000000 -+#define NVA0C0_QMDV01_07_FP32_F2I_NAN_BEHAVIOR_PASS_INDEFINITE 0x00000001 -+#define NVA0C0_QMDV01_07_API_VISIBLE_CALL_LIMIT MW(378:378) -+#define NVA0C0_QMDV01_07_API_VISIBLE_CALL_LIMIT__32 0x00000000 -+#define NVA0C0_QMDV01_07_API_VISIBLE_CALL_LIMIT_NO_CHECK 0x00000001 -+#define NVA0C0_QMDV01_07_SHARED_MEMORY_BANK_MAPPING MW(379:379) -+#define NVA0C0_QMDV01_07_SHARED_MEMORY_BANK_MAPPING_FOUR_BYTES_PER_BANK 0x00000000 -+#define NVA0C0_QMDV01_07_SHARED_MEMORY_BANK_MAPPING_EIGHT_BYTES_PER_BANK 0x00000001 -+#define NVA0C0_QMDV01_07_SAMPLER_INDEX MW(382:382) -+#define NVA0C0_QMDV01_07_SAMPLER_INDEX_INDEPENDENTLY 0x00000000 -+#define NVA0C0_QMDV01_07_SAMPLER_INDEX_VIA_HEADER_INDEX 0x00000001 -+#define NVA0C0_QMDV01_07_FP32_NARROW_INSTRUCTION MW(383:383) -+#define NVA0C0_QMDV01_07_FP32_NARROW_INSTRUCTION_KEEP_DENORMS 0x00000000 -+#define NVA0C0_QMDV01_07_FP32_NARROW_INSTRUCTION_FLUSH_DENORMS 0x00000001 -+#define NVA0C0_QMDV01_07_CTA_RASTER_WIDTH MW(415:384) -+#define NVA0C0_QMDV01_07_CTA_RASTER_HEIGHT MW(431:416) -+#define NVA0C0_QMDV01_07_CTA_RASTER_DEPTH MW(447:432) -+#define NVA0C0_QMDV01_07_CTA_RASTER_WIDTH_RESUME MW(479:448) -+#define NVA0C0_QMDV01_07_CTA_RASTER_HEIGHT_RESUME MW(495:480) -+#define NVA0C0_QMDV01_07_CTA_RASTER_DEPTH_RESUME MW(511:496) -+#define NVA0C0_QMDV01_07_QUEUE_ENTRIES_PER_CTA_MINUS_ONE MW(518:512) -+#define NVA0C0_QMDV01_07_COALESCE_WAITING_PERIOD MW(529:522) -+#define NVA0C0_QMDV01_07_SHARED_MEMORY_SIZE MW(561:544) -+#define NVA0C0_QMDV01_07_QMD_RESERVED_G MW(575:562) -+#define NVA0C0_QMDV01_07_QMD_VERSION MW(579:576) -+#define NVA0C0_QMDV01_07_QMD_MAJOR_VERSION MW(583:580) -+#define NVA0C0_QMDV01_07_QMD_RESERVED_H MW(591:584) -+#define NVA0C0_QMDV01_07_CTA_THREAD_DIMENSION0 MW(607:592) -+#define NVA0C0_QMDV01_07_CTA_THREAD_DIMENSION1 MW(623:608) -+#define NVA0C0_QMDV01_07_CTA_THREAD_DIMENSION2 MW(639:624) -+#define NVA0C0_QMDV01_07_CONSTANT_BUFFER_VALID(i) MW((640+(i)*1):(640+(i)*1)) -+#define NVA0C0_QMDV01_07_CONSTANT_BUFFER_VALID_FALSE 0x00000000 -+#define NVA0C0_QMDV01_07_CONSTANT_BUFFER_VALID_TRUE 0x00000001 -+#define NVA0C0_QMDV01_07_QMD_RESERVED_I MW(668:648) -+#define NVA0C0_QMDV01_07_L1_CONFIGURATION MW(671:669) -+#define NVA0C0_QMDV01_07_L1_CONFIGURATION_DIRECTLY_ADDRESSABLE_MEMORY_SIZE_16KB 0x00000001 -+#define NVA0C0_QMDV01_07_L1_CONFIGURATION_DIRECTLY_ADDRESSABLE_MEMORY_SIZE_32KB 0x00000002 -+#define NVA0C0_QMDV01_07_L1_CONFIGURATION_DIRECTLY_ADDRESSABLE_MEMORY_SIZE_48KB 0x00000003 -+#define NVA0C0_QMDV01_07_SM_DISABLE_MASK_LOWER MW(703:672) -+#define NVA0C0_QMDV01_07_SM_DISABLE_MASK_UPPER MW(735:704) -+#define NVA0C0_QMDV01_07_RELEASE0_ADDRESS_LOWER MW(767:736) -+#define NVA0C0_QMDV01_07_RELEASE0_ADDRESS_UPPER MW(775:768) -+#define NVA0C0_QMDV01_07_QMD_RESERVED_J MW(783:776) -+#define NVA0C0_QMDV01_07_RELEASE0_REDUCTION_OP MW(790:788) -+#define NVA0C0_QMDV01_07_RELEASE0_REDUCTION_OP_RED_ADD 0x00000000 -+#define NVA0C0_QMDV01_07_RELEASE0_REDUCTION_OP_RED_MIN 0x00000001 -+#define NVA0C0_QMDV01_07_RELEASE0_REDUCTION_OP_RED_MAX 0x00000002 -+#define NVA0C0_QMDV01_07_RELEASE0_REDUCTION_OP_RED_INC 0x00000003 -+#define NVA0C0_QMDV01_07_RELEASE0_REDUCTION_OP_RED_DEC 0x00000004 -+#define NVA0C0_QMDV01_07_RELEASE0_REDUCTION_OP_RED_AND 0x00000005 -+#define NVA0C0_QMDV01_07_RELEASE0_REDUCTION_OP_RED_OR 0x00000006 -+#define NVA0C0_QMDV01_07_RELEASE0_REDUCTION_OP_RED_XOR 0x00000007 -+#define NVA0C0_QMDV01_07_QMD_RESERVED_K MW(791:791) -+#define NVA0C0_QMDV01_07_RELEASE0_REDUCTION_FORMAT MW(793:792) -+#define NVA0C0_QMDV01_07_RELEASE0_REDUCTION_FORMAT_UNSIGNED_32 0x00000000 -+#define NVA0C0_QMDV01_07_RELEASE0_REDUCTION_FORMAT_SIGNED_32 0x00000001 -+#define NVA0C0_QMDV01_07_RELEASE0_REDUCTION_ENABLE MW(794:794) -+#define NVA0C0_QMDV01_07_RELEASE0_REDUCTION_ENABLE_FALSE 0x00000000 -+#define NVA0C0_QMDV01_07_RELEASE0_REDUCTION_ENABLE_TRUE 0x00000001 -+#define NVA0C0_QMDV01_07_RELEASE0_STRUCTURE_SIZE MW(799:799) -+#define NVA0C0_QMDV01_07_RELEASE0_STRUCTURE_SIZE_FOUR_WORDS 0x00000000 -+#define NVA0C0_QMDV01_07_RELEASE0_STRUCTURE_SIZE_ONE_WORD 0x00000001 -+#define NVA0C0_QMDV01_07_RELEASE0_PAYLOAD MW(831:800) -+#define NVA0C0_QMDV01_07_RELEASE1_ADDRESS_LOWER MW(863:832) -+#define NVA0C0_QMDV01_07_RELEASE1_ADDRESS_UPPER MW(871:864) -+#define NVA0C0_QMDV01_07_QMD_RESERVED_L MW(879:872) -+#define NVA0C0_QMDV01_07_RELEASE1_REDUCTION_OP MW(886:884) -+#define NVA0C0_QMDV01_07_RELEASE1_REDUCTION_OP_RED_ADD 0x00000000 -+#define NVA0C0_QMDV01_07_RELEASE1_REDUCTION_OP_RED_MIN 0x00000001 -+#define NVA0C0_QMDV01_07_RELEASE1_REDUCTION_OP_RED_MAX 0x00000002 -+#define NVA0C0_QMDV01_07_RELEASE1_REDUCTION_OP_RED_INC 0x00000003 -+#define NVA0C0_QMDV01_07_RELEASE1_REDUCTION_OP_RED_DEC 0x00000004 -+#define NVA0C0_QMDV01_07_RELEASE1_REDUCTION_OP_RED_AND 0x00000005 -+#define NVA0C0_QMDV01_07_RELEASE1_REDUCTION_OP_RED_OR 0x00000006 -+#define NVA0C0_QMDV01_07_RELEASE1_REDUCTION_OP_RED_XOR 0x00000007 -+#define NVA0C0_QMDV01_07_QMD_RESERVED_M MW(887:887) -+#define NVA0C0_QMDV01_07_RELEASE1_REDUCTION_FORMAT MW(889:888) -+#define NVA0C0_QMDV01_07_RELEASE1_REDUCTION_FORMAT_UNSIGNED_32 0x00000000 -+#define NVA0C0_QMDV01_07_RELEASE1_REDUCTION_FORMAT_SIGNED_32 0x00000001 -+#define NVA0C0_QMDV01_07_RELEASE1_REDUCTION_ENABLE MW(890:890) -+#define NVA0C0_QMDV01_07_RELEASE1_REDUCTION_ENABLE_FALSE 0x00000000 -+#define NVA0C0_QMDV01_07_RELEASE1_REDUCTION_ENABLE_TRUE 0x00000001 -+#define NVA0C0_QMDV01_07_RELEASE1_STRUCTURE_SIZE MW(895:895) -+#define NVA0C0_QMDV01_07_RELEASE1_STRUCTURE_SIZE_FOUR_WORDS 0x00000000 -+#define NVA0C0_QMDV01_07_RELEASE1_STRUCTURE_SIZE_ONE_WORD 0x00000001 -+#define NVA0C0_QMDV01_07_RELEASE1_PAYLOAD MW(927:896) -+#define NVA0C0_QMDV01_07_CONSTANT_BUFFER_ADDR_LOWER(i) MW((959+(i)*64):(928+(i)*64)) -+#define NVA0C0_QMDV01_07_CONSTANT_BUFFER_ADDR_UPPER(i) MW((967+(i)*64):(960+(i)*64)) -+#define NVA0C0_QMDV01_07_CONSTANT_BUFFER_RESERVED_ADDR(i) MW((973+(i)*64):(968+(i)*64)) -+#define NVA0C0_QMDV01_07_CONSTANT_BUFFER_INVALIDATE(i) MW((974+(i)*64):(974+(i)*64)) -+#define NVA0C0_QMDV01_07_CONSTANT_BUFFER_INVALIDATE_FALSE 0x00000000 -+#define NVA0C0_QMDV01_07_CONSTANT_BUFFER_INVALIDATE_TRUE 0x00000001 -+#define NVA0C0_QMDV01_07_CONSTANT_BUFFER_SIZE(i) MW((991+(i)*64):(975+(i)*64)) -+#define NVA0C0_QMDV01_07_SHADER_LOCAL_MEMORY_LOW_SIZE MW(1463:1440) -+#define NVA0C0_QMDV01_07_QMD_RESERVED_N MW(1466:1464) -+#define NVA0C0_QMDV01_07_BARRIER_COUNT MW(1471:1467) -+#define NVA0C0_QMDV01_07_SHADER_LOCAL_MEMORY_HIGH_SIZE MW(1495:1472) -+#define NVA0C0_QMDV01_07_REGISTER_COUNT MW(1503:1496) -+#define NVA0C0_QMDV01_07_SHADER_LOCAL_MEMORY_CRS_SIZE MW(1527:1504) -+#define NVA0C0_QMDV01_07_SASS_VERSION MW(1535:1528) -+#define NVA0C0_QMDV01_07_HW_ONLY_INNER_GET MW(1566:1536) -+#define NVA0C0_QMDV01_07_HW_ONLY_REQUIRE_SCHEDULING_PCAS MW(1567:1567) -+#define NVA0C0_QMDV01_07_HW_ONLY_INNER_PUT MW(1598:1568) -+#define NVA0C0_QMDV01_07_QMD_RESERVED_P MW(1599:1599) -+#define NVA0C0_QMDV01_07_HW_ONLY_SPAN_LIST_HEAD_INDEX MW(1629:1600) -+#define NVA0C0_QMDV01_07_QMD_RESERVED_Q MW(1630:1630) -+#define NVA0C0_QMDV01_07_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID MW(1631:1631) -+#define NVA0C0_QMDV01_07_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID_FALSE 0x00000000 -+#define NVA0C0_QMDV01_07_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID_TRUE 0x00000001 -+#define NVA0C0_QMDV01_07_HW_ONLY_SKED_NEXT_QMD_POINTER MW(1663:1632) -+#define NVA0C0_QMDV01_07_QMD_SPARE_E MW(1695:1664) -+#define NVA0C0_QMDV01_07_QMD_SPARE_F MW(1727:1696) -+#define NVA0C0_QMDV01_07_QMD_SPARE_G MW(1759:1728) -+#define NVA0C0_QMDV01_07_QMD_SPARE_H MW(1791:1760) -+#define NVA0C0_QMDV01_07_QMD_SPARE_I MW(1823:1792) -+#define NVA0C0_QMDV01_07_QMD_SPARE_J MW(1855:1824) -+#define NVA0C0_QMDV01_07_QMD_SPARE_K MW(1887:1856) -+#define NVA0C0_QMDV01_07_QMD_SPARE_L MW(1919:1888) -+#define NVA0C0_QMDV01_07_QMD_SPARE_M MW(1951:1920) -+#define NVA0C0_QMDV01_07_QMD_SPARE_N MW(1983:1952) -+#define NVA0C0_QMDV01_07_DEBUG_ID_UPPER MW(2015:1984) -+#define NVA0C0_QMDV01_07_DEBUG_ID_LOWER MW(2047:2016) -+ -+ -+ -+#endif // #ifndef __CLA0C0QMD_H__ -diff --git a/src/gallium/drivers/nouveau/nvc0/clc0c0qmd.h b/src/gallium/drivers/nouveau/nvc0/clc0c0qmd.h -new file mode 100644 -index 00000000000..040bdcd9dcb ---- /dev/null -+++ b/src/gallium/drivers/nouveau/nvc0/clc0c0qmd.h -@@ -0,0 +1,665 @@ -+/******************************************************************************* -+ Copyright (c) 2016 NVIDIA Corporation -+ -+ Permission is hereby granted, free of charge, to any person obtaining a copy -+ of this software and associated documentation files (the "Software"), to -+ deal in the Software without restriction, including without limitation the -+ rights to use, copy, modify, merge, publish, distribute, sublicense, and/or -+ sell copies of the Software, and to permit persons to whom the Software is -+ furnished to do so, subject to the following conditions: -+ -+ The above copyright notice and this permission notice shall be -+ included in all copies or substantial portions of the Software. -+ -+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -+ DEALINGS IN THE SOFTWARE. -+ -+*******************************************************************************/ -+ -+/* AUTO GENERATED FILE -- DO NOT EDIT */ -+ -+#ifndef __CLC0C0QMD_H__ -+#define __CLC0C0QMD_H__ -+ -+/* -+** Queue Meta Data, Version 01_07 -+ */ -+ -+// The below C preprocessor definitions describe "multi-word" structures, where -+// fields may have bit numbers beyond 32. For example, MW(127:96) means -+// the field is in bits 0-31 of word number 3 of the structure. The "MW(X:Y)" -+// syntax is to distinguish from similar "X:Y" single-word definitions: the -+// macros historically used for single-word definitions would fail with -+// multi-word definitions. -+// -+// See nvmisc.h:DRF_VAL_MW() in the source code of the kernel -+// interface layer of nvidia.ko for an example of how to manipulate -+// these MW(X:Y) definitions. -+ -+#define NVC0C0_QMDV01_07_OUTER_PUT MW(30:0) -+#define NVC0C0_QMDV01_07_OUTER_OVERFLOW MW(31:31) -+#define NVC0C0_QMDV01_07_OUTER_GET MW(62:32) -+#define NVC0C0_QMDV01_07_OUTER_STICKY_OVERFLOW MW(63:63) -+#define NVC0C0_QMDV01_07_INNER_GET MW(94:64) -+#define NVC0C0_QMDV01_07_INNER_OVERFLOW MW(95:95) -+#define NVC0C0_QMDV01_07_INNER_PUT MW(126:96) -+#define NVC0C0_QMDV01_07_INNER_STICKY_OVERFLOW MW(127:127) -+#define NVC0C0_QMDV01_07_QMD_RESERVED_A_A MW(159:128) -+#define NVC0C0_QMDV01_07_DEPENDENT_QMD_POINTER MW(191:160) -+#define NVC0C0_QMDV01_07_QMD_GROUP_ID MW(197:192) -+#define NVC0C0_QMDV01_07_SM_GLOBAL_CACHING_ENABLE MW(198:198) -+#define NVC0C0_QMDV01_07_RUN_CTA_IN_ONE_SM_PARTITION MW(199:199) -+#define NVC0C0_QMDV01_07_RUN_CTA_IN_ONE_SM_PARTITION_FALSE 0x00000000 -+#define NVC0C0_QMDV01_07_RUN_CTA_IN_ONE_SM_PARTITION_TRUE 0x00000001 -+#define NVC0C0_QMDV01_07_IS_QUEUE MW(200:200) -+#define NVC0C0_QMDV01_07_IS_QUEUE_FALSE 0x00000000 -+#define NVC0C0_QMDV01_07_IS_QUEUE_TRUE 0x00000001 -+#define NVC0C0_QMDV01_07_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST MW(201:201) -+#define NVC0C0_QMDV01_07_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST_FALSE 0x00000000 -+#define NVC0C0_QMDV01_07_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST_TRUE 0x00000001 -+#define NVC0C0_QMDV01_07_SEMAPHORE_RELEASE_ENABLE0 MW(202:202) -+#define NVC0C0_QMDV01_07_SEMAPHORE_RELEASE_ENABLE0_FALSE 0x00000000 -+#define NVC0C0_QMDV01_07_SEMAPHORE_RELEASE_ENABLE0_TRUE 0x00000001 -+#define NVC0C0_QMDV01_07_SEMAPHORE_RELEASE_ENABLE1 MW(203:203) -+#define NVC0C0_QMDV01_07_SEMAPHORE_RELEASE_ENABLE1_FALSE 0x00000000 -+#define NVC0C0_QMDV01_07_SEMAPHORE_RELEASE_ENABLE1_TRUE 0x00000001 -+#define NVC0C0_QMDV01_07_REQUIRE_SCHEDULING_PCAS MW(204:204) -+#define NVC0C0_QMDV01_07_REQUIRE_SCHEDULING_PCAS_FALSE 0x00000000 -+#define NVC0C0_QMDV01_07_REQUIRE_SCHEDULING_PCAS_TRUE 0x00000001 -+#define NVC0C0_QMDV01_07_DEPENDENT_QMD_SCHEDULE_ENABLE MW(205:205) -+#define NVC0C0_QMDV01_07_DEPENDENT_QMD_SCHEDULE_ENABLE_FALSE 0x00000000 -+#define NVC0C0_QMDV01_07_DEPENDENT_QMD_SCHEDULE_ENABLE_TRUE 0x00000001 -+#define NVC0C0_QMDV01_07_DEPENDENT_QMD_TYPE MW(206:206) -+#define NVC0C0_QMDV01_07_DEPENDENT_QMD_TYPE_QUEUE 0x00000000 -+#define NVC0C0_QMDV01_07_DEPENDENT_QMD_TYPE_GRID 0x00000001 -+#define NVC0C0_QMDV01_07_DEPENDENT_QMD_FIELD_COPY MW(207:207) -+#define NVC0C0_QMDV01_07_DEPENDENT_QMD_FIELD_COPY_FALSE 0x00000000 -+#define NVC0C0_QMDV01_07_DEPENDENT_QMD_FIELD_COPY_TRUE 0x00000001 -+#define NVC0C0_QMDV01_07_QMD_RESERVED_B MW(223:208) -+#define NVC0C0_QMDV01_07_CIRCULAR_QUEUE_SIZE MW(248:224) -+#define NVC0C0_QMDV01_07_QMD_RESERVED_C MW(249:249) -+#define NVC0C0_QMDV01_07_INVALIDATE_TEXTURE_HEADER_CACHE MW(250:250) -+#define NVC0C0_QMDV01_07_INVALIDATE_TEXTURE_HEADER_CACHE_FALSE 0x00000000 -+#define NVC0C0_QMDV01_07_INVALIDATE_TEXTURE_HEADER_CACHE_TRUE 0x00000001 -+#define NVC0C0_QMDV01_07_INVALIDATE_TEXTURE_SAMPLER_CACHE MW(251:251) -+#define NVC0C0_QMDV01_07_INVALIDATE_TEXTURE_SAMPLER_CACHE_FALSE 0x00000000 -+#define NVC0C0_QMDV01_07_INVALIDATE_TEXTURE_SAMPLER_CACHE_TRUE 0x00000001 -+#define NVC0C0_QMDV01_07_INVALIDATE_TEXTURE_DATA_CACHE MW(252:252) -+#define NVC0C0_QMDV01_07_INVALIDATE_TEXTURE_DATA_CACHE_FALSE 0x00000000 -+#define NVC0C0_QMDV01_07_INVALIDATE_TEXTURE_DATA_CACHE_TRUE 0x00000001 -+#define NVC0C0_QMDV01_07_INVALIDATE_SHADER_DATA_CACHE MW(253:253) -+#define NVC0C0_QMDV01_07_INVALIDATE_SHADER_DATA_CACHE_FALSE 0x00000000 -+#define NVC0C0_QMDV01_07_INVALIDATE_SHADER_DATA_CACHE_TRUE 0x00000001 -+#define NVC0C0_QMDV01_07_INVALIDATE_INSTRUCTION_CACHE MW(254:254) -+#define NVC0C0_QMDV01_07_INVALIDATE_INSTRUCTION_CACHE_FALSE 0x00000000 -+#define NVC0C0_QMDV01_07_INVALIDATE_INSTRUCTION_CACHE_TRUE 0x00000001 -+#define NVC0C0_QMDV01_07_INVALIDATE_SHADER_CONSTANT_CACHE MW(255:255) -+#define NVC0C0_QMDV01_07_INVALIDATE_SHADER_CONSTANT_CACHE_FALSE 0x00000000 -+#define NVC0C0_QMDV01_07_INVALIDATE_SHADER_CONSTANT_CACHE_TRUE 0x00000001 -+#define NVC0C0_QMDV01_07_PROGRAM_OFFSET MW(287:256) -+#define NVC0C0_QMDV01_07_CIRCULAR_QUEUE_ADDR_LOWER MW(319:288) -+#define NVC0C0_QMDV01_07_CIRCULAR_QUEUE_ADDR_UPPER MW(327:320) -+#define NVC0C0_QMDV01_07_QMD_RESERVED_D MW(335:328) -+#define NVC0C0_QMDV01_07_CIRCULAR_QUEUE_ENTRY_SIZE MW(351:336) -+#define NVC0C0_QMDV01_07_CWD_REFERENCE_COUNT_ID MW(357:352) -+#define NVC0C0_QMDV01_07_CWD_REFERENCE_COUNT_DELTA_MINUS_ONE MW(365:358) -+#define NVC0C0_QMDV01_07_RELEASE_MEMBAR_TYPE MW(366:366) -+#define NVC0C0_QMDV01_07_RELEASE_MEMBAR_TYPE_FE_NONE 0x00000000 -+#define NVC0C0_QMDV01_07_RELEASE_MEMBAR_TYPE_FE_SYSMEMBAR 0x00000001 -+#define NVC0C0_QMDV01_07_CWD_REFERENCE_COUNT_INCR_ENABLE MW(367:367) -+#define NVC0C0_QMDV01_07_CWD_REFERENCE_COUNT_INCR_ENABLE_FALSE 0x00000000 -+#define NVC0C0_QMDV01_07_CWD_REFERENCE_COUNT_INCR_ENABLE_TRUE 0x00000001 -+#define NVC0C0_QMDV01_07_CWD_MEMBAR_TYPE MW(369:368) -+#define NVC0C0_QMDV01_07_CWD_MEMBAR_TYPE_L1_NONE 0x00000000 -+#define NVC0C0_QMDV01_07_CWD_MEMBAR_TYPE_L1_SYSMEMBAR 0x00000001 -+#define NVC0C0_QMDV01_07_CWD_MEMBAR_TYPE_L1_MEMBAR 0x00000003 -+#define NVC0C0_QMDV01_07_SEQUENTIALLY_RUN_CTAS MW(370:370) -+#define NVC0C0_QMDV01_07_SEQUENTIALLY_RUN_CTAS_FALSE 0x00000000 -+#define NVC0C0_QMDV01_07_SEQUENTIALLY_RUN_CTAS_TRUE 0x00000001 -+#define NVC0C0_QMDV01_07_CWD_REFERENCE_COUNT_DECR_ENABLE MW(371:371) -+#define NVC0C0_QMDV01_07_CWD_REFERENCE_COUNT_DECR_ENABLE_FALSE 0x00000000 -+#define NVC0C0_QMDV01_07_CWD_REFERENCE_COUNT_DECR_ENABLE_TRUE 0x00000001 -+#define NVC0C0_QMDV01_07_THROTTLED MW(372:372) -+#define NVC0C0_QMDV01_07_THROTTLED_FALSE 0x00000000 -+#define NVC0C0_QMDV01_07_THROTTLED_TRUE 0x00000001 -+#define NVC0C0_QMDV01_07_FP32_NAN_BEHAVIOR MW(376:376) -+#define NVC0C0_QMDV01_07_FP32_NAN_BEHAVIOR_LEGACY 0x00000000 -+#define NVC0C0_QMDV01_07_FP32_NAN_BEHAVIOR_FP64_COMPATIBLE 0x00000001 -+#define NVC0C0_QMDV01_07_FP32_F2I_NAN_BEHAVIOR MW(377:377) -+#define NVC0C0_QMDV01_07_FP32_F2I_NAN_BEHAVIOR_PASS_ZERO 0x00000000 -+#define NVC0C0_QMDV01_07_FP32_F2I_NAN_BEHAVIOR_PASS_INDEFINITE 0x00000001 -+#define NVC0C0_QMDV01_07_API_VISIBLE_CALL_LIMIT MW(378:378) -+#define NVC0C0_QMDV01_07_API_VISIBLE_CALL_LIMIT__32 0x00000000 -+#define NVC0C0_QMDV01_07_API_VISIBLE_CALL_LIMIT_NO_CHECK 0x00000001 -+#define NVC0C0_QMDV01_07_SHARED_MEMORY_BANK_MAPPING MW(379:379) -+#define NVC0C0_QMDV01_07_SHARED_MEMORY_BANK_MAPPING_FOUR_BYTES_PER_BANK 0x00000000 -+#define NVC0C0_QMDV01_07_SHARED_MEMORY_BANK_MAPPING_EIGHT_BYTES_PER_BANK 0x00000001 -+#define NVC0C0_QMDV01_07_SAMPLER_INDEX MW(382:382) -+#define NVC0C0_QMDV01_07_SAMPLER_INDEX_INDEPENDENTLY 0x00000000 -+#define NVC0C0_QMDV01_07_SAMPLER_INDEX_VIA_HEADER_INDEX 0x00000001 -+#define NVC0C0_QMDV01_07_FP32_NARROW_INSTRUCTION MW(383:383) -+#define NVC0C0_QMDV01_07_FP32_NARROW_INSTRUCTION_KEEP_DENORMS 0x00000000 -+#define NVC0C0_QMDV01_07_FP32_NARROW_INSTRUCTION_FLUSH_DENORMS 0x00000001 -+#define NVC0C0_QMDV01_07_CTA_RASTER_WIDTH MW(415:384) -+#define NVC0C0_QMDV01_07_CTA_RASTER_HEIGHT MW(431:416) -+#define NVC0C0_QMDV01_07_CTA_RASTER_DEPTH MW(447:432) -+#define NVC0C0_QMDV01_07_CTA_RASTER_WIDTH_RESUME MW(479:448) -+#define NVC0C0_QMDV01_07_CTA_RASTER_HEIGHT_RESUME MW(495:480) -+#define NVC0C0_QMDV01_07_CTA_RASTER_DEPTH_RESUME MW(511:496) -+#define NVC0C0_QMDV01_07_QUEUE_ENTRIES_PER_CTA_MINUS_ONE MW(518:512) -+#define NVC0C0_QMDV01_07_COALESCE_WAITING_PERIOD MW(529:522) -+#define NVC0C0_QMDV01_07_SHARED_MEMORY_SIZE MW(561:544) -+#define NVC0C0_QMDV01_07_QMD_RESERVED_G MW(575:562) -+#define NVC0C0_QMDV01_07_QMD_VERSION MW(579:576) -+#define NVC0C0_QMDV01_07_QMD_MAJOR_VERSION MW(583:580) -+#define NVC0C0_QMDV01_07_QMD_RESERVED_H MW(591:584) -+#define NVC0C0_QMDV01_07_CTA_THREAD_DIMENSION0 MW(607:592) -+#define NVC0C0_QMDV01_07_CTA_THREAD_DIMENSION1 MW(623:608) -+#define NVC0C0_QMDV01_07_CTA_THREAD_DIMENSION2 MW(639:624) -+#define NVC0C0_QMDV01_07_CONSTANT_BUFFER_VALID(i) MW((640+(i)*1):(640+(i)*1)) -+#define NVC0C0_QMDV01_07_CONSTANT_BUFFER_VALID_FALSE 0x00000000 -+#define NVC0C0_QMDV01_07_CONSTANT_BUFFER_VALID_TRUE 0x00000001 -+#define NVC0C0_QMDV01_07_QMD_RESERVED_I MW(668:648) -+#define NVC0C0_QMDV01_07_L1_CONFIGURATION MW(671:669) -+#define NVC0C0_QMDV01_07_L1_CONFIGURATION_DIRECTLY_ADDRESSABLE_MEMORY_SIZE_16KB 0x00000001 -+#define NVC0C0_QMDV01_07_L1_CONFIGURATION_DIRECTLY_ADDRESSABLE_MEMORY_SIZE_32KB 0x00000002 -+#define NVC0C0_QMDV01_07_L1_CONFIGURATION_DIRECTLY_ADDRESSABLE_MEMORY_SIZE_48KB 0x00000003 -+#define NVC0C0_QMDV01_07_SM_DISABLE_MASK_LOWER MW(703:672) -+#define NVC0C0_QMDV01_07_SM_DISABLE_MASK_UPPER MW(735:704) -+#define NVC0C0_QMDV01_07_RELEASE0_ADDRESS_LOWER MW(767:736) -+#define NVC0C0_QMDV01_07_RELEASE0_ADDRESS_UPPER MW(775:768) -+#define NVC0C0_QMDV01_07_QMD_RESERVED_J MW(783:776) -+#define NVC0C0_QMDV01_07_RELEASE0_REDUCTION_OP MW(790:788) -+#define NVC0C0_QMDV01_07_RELEASE0_REDUCTION_OP_RED_ADD 0x00000000 -+#define NVC0C0_QMDV01_07_RELEASE0_REDUCTION_OP_RED_MIN 0x00000001 -+#define NVC0C0_QMDV01_07_RELEASE0_REDUCTION_OP_RED_MAX 0x00000002 -+#define NVC0C0_QMDV01_07_RELEASE0_REDUCTION_OP_RED_INC 0x00000003 -+#define NVC0C0_QMDV01_07_RELEASE0_REDUCTION_OP_RED_DEC 0x00000004 -+#define NVC0C0_QMDV01_07_RELEASE0_REDUCTION_OP_RED_AND 0x00000005 -+#define NVC0C0_QMDV01_07_RELEASE0_REDUCTION_OP_RED_OR 0x00000006 -+#define NVC0C0_QMDV01_07_RELEASE0_REDUCTION_OP_RED_XOR 0x00000007 -+#define NVC0C0_QMDV01_07_QMD_RESERVED_K MW(791:791) -+#define NVC0C0_QMDV01_07_RELEASE0_REDUCTION_FORMAT MW(793:792) -+#define NVC0C0_QMDV01_07_RELEASE0_REDUCTION_FORMAT_UNSIGNED_32 0x00000000 -+#define NVC0C0_QMDV01_07_RELEASE0_REDUCTION_FORMAT_SIGNED_32 0x00000001 -+#define NVC0C0_QMDV01_07_RELEASE0_REDUCTION_ENABLE MW(794:794) -+#define NVC0C0_QMDV01_07_RELEASE0_REDUCTION_ENABLE_FALSE 0x00000000 -+#define NVC0C0_QMDV01_07_RELEASE0_REDUCTION_ENABLE_TRUE 0x00000001 -+#define NVC0C0_QMDV01_07_RELEASE0_STRUCTURE_SIZE MW(799:799) -+#define NVC0C0_QMDV01_07_RELEASE0_STRUCTURE_SIZE_FOUR_WORDS 0x00000000 -+#define NVC0C0_QMDV01_07_RELEASE0_STRUCTURE_SIZE_ONE_WORD 0x00000001 -+#define NVC0C0_QMDV01_07_RELEASE0_PAYLOAD MW(831:800) -+#define NVC0C0_QMDV01_07_RELEASE1_ADDRESS_LOWER MW(863:832) -+#define NVC0C0_QMDV01_07_RELEASE1_ADDRESS_UPPER MW(871:864) -+#define NVC0C0_QMDV01_07_QMD_RESERVED_L MW(879:872) -+#define NVC0C0_QMDV01_07_RELEASE1_REDUCTION_OP MW(886:884) -+#define NVC0C0_QMDV01_07_RELEASE1_REDUCTION_OP_RED_ADD 0x00000000 -+#define NVC0C0_QMDV01_07_RELEASE1_REDUCTION_OP_RED_MIN 0x00000001 -+#define NVC0C0_QMDV01_07_RELEASE1_REDUCTION_OP_RED_MAX 0x00000002 -+#define NVC0C0_QMDV01_07_RELEASE1_REDUCTION_OP_RED_INC 0x00000003 -+#define NVC0C0_QMDV01_07_RELEASE1_REDUCTION_OP_RED_DEC 0x00000004 -+#define NVC0C0_QMDV01_07_RELEASE1_REDUCTION_OP_RED_AND 0x00000005 -+#define NVC0C0_QMDV01_07_RELEASE1_REDUCTION_OP_RED_OR 0x00000006 -+#define NVC0C0_QMDV01_07_RELEASE1_REDUCTION_OP_RED_XOR 0x00000007 -+#define NVC0C0_QMDV01_07_QMD_RESERVED_M MW(887:887) -+#define NVC0C0_QMDV01_07_RELEASE1_REDUCTION_FORMAT MW(889:888) -+#define NVC0C0_QMDV01_07_RELEASE1_REDUCTION_FORMAT_UNSIGNED_32 0x00000000 -+#define NVC0C0_QMDV01_07_RELEASE1_REDUCTION_FORMAT_SIGNED_32 0x00000001 -+#define NVC0C0_QMDV01_07_RELEASE1_REDUCTION_ENABLE MW(890:890) -+#define NVC0C0_QMDV01_07_RELEASE1_REDUCTION_ENABLE_FALSE 0x00000000 -+#define NVC0C0_QMDV01_07_RELEASE1_REDUCTION_ENABLE_TRUE 0x00000001 -+#define NVC0C0_QMDV01_07_RELEASE1_STRUCTURE_SIZE MW(895:895) -+#define NVC0C0_QMDV01_07_RELEASE1_STRUCTURE_SIZE_FOUR_WORDS 0x00000000 -+#define NVC0C0_QMDV01_07_RELEASE1_STRUCTURE_SIZE_ONE_WORD 0x00000001 -+#define NVC0C0_QMDV01_07_RELEASE1_PAYLOAD MW(927:896) -+#define NVC0C0_QMDV01_07_CONSTANT_BUFFER_ADDR_LOWER(i) MW((959+(i)*64):(928+(i)*64)) -+#define NVC0C0_QMDV01_07_CONSTANT_BUFFER_ADDR_UPPER(i) MW((967+(i)*64):(960+(i)*64)) -+#define NVC0C0_QMDV01_07_CONSTANT_BUFFER_RESERVED_ADDR(i) MW((973+(i)*64):(968+(i)*64)) -+#define NVC0C0_QMDV01_07_CONSTANT_BUFFER_INVALIDATE(i) MW((974+(i)*64):(974+(i)*64)) -+#define NVC0C0_QMDV01_07_CONSTANT_BUFFER_INVALIDATE_FALSE 0x00000000 -+#define NVC0C0_QMDV01_07_CONSTANT_BUFFER_INVALIDATE_TRUE 0x00000001 -+#define NVC0C0_QMDV01_07_CONSTANT_BUFFER_SIZE(i) MW((991+(i)*64):(975+(i)*64)) -+#define NVC0C0_QMDV01_07_SHADER_LOCAL_MEMORY_LOW_SIZE MW(1463:1440) -+#define NVC0C0_QMDV01_07_QMD_RESERVED_N MW(1466:1464) -+#define NVC0C0_QMDV01_07_BARRIER_COUNT MW(1471:1467) -+#define NVC0C0_QMDV01_07_SHADER_LOCAL_MEMORY_HIGH_SIZE MW(1495:1472) -+#define NVC0C0_QMDV01_07_REGISTER_COUNT MW(1503:1496) -+#define NVC0C0_QMDV01_07_SHADER_LOCAL_MEMORY_CRS_SIZE MW(1527:1504) -+#define NVC0C0_QMDV01_07_SASS_VERSION MW(1535:1528) -+#define NVC0C0_QMDV01_07_HW_ONLY_INNER_GET MW(1566:1536) -+#define NVC0C0_QMDV01_07_HW_ONLY_REQUIRE_SCHEDULING_PCAS MW(1567:1567) -+#define NVC0C0_QMDV01_07_HW_ONLY_INNER_PUT MW(1598:1568) -+#define NVC0C0_QMDV01_07_HW_ONLY_SCG_TYPE MW(1599:1599) -+#define NVC0C0_QMDV01_07_HW_ONLY_SPAN_LIST_HEAD_INDEX MW(1629:1600) -+#define NVC0C0_QMDV01_07_QMD_RESERVED_Q MW(1630:1630) -+#define NVC0C0_QMDV01_07_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID MW(1631:1631) -+#define NVC0C0_QMDV01_07_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID_FALSE 0x00000000 -+#define NVC0C0_QMDV01_07_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID_TRUE 0x00000001 -+#define NVC0C0_QMDV01_07_HW_ONLY_SKED_NEXT_QMD_POINTER MW(1663:1632) -+#define NVC0C0_QMDV01_07_QMD_SPARE_E MW(1695:1664) -+#define NVC0C0_QMDV01_07_QMD_SPARE_F MW(1727:1696) -+#define NVC0C0_QMDV01_07_QMD_SPARE_G MW(1759:1728) -+#define NVC0C0_QMDV01_07_QMD_SPARE_H MW(1791:1760) -+#define NVC0C0_QMDV01_07_QMD_SPARE_I MW(1823:1792) -+#define NVC0C0_QMDV01_07_QMD_SPARE_J MW(1855:1824) -+#define NVC0C0_QMDV01_07_QMD_SPARE_K MW(1887:1856) -+#define NVC0C0_QMDV01_07_QMD_SPARE_L MW(1919:1888) -+#define NVC0C0_QMDV01_07_QMD_SPARE_M MW(1951:1920) -+#define NVC0C0_QMDV01_07_QMD_SPARE_N MW(1983:1952) -+#define NVC0C0_QMDV01_07_DEBUG_ID_UPPER MW(2015:1984) -+#define NVC0C0_QMDV01_07_DEBUG_ID_LOWER MW(2047:2016) -+ -+ -+/* -+** Queue Meta Data, Version 02_00 -+ */ -+ -+#define NVC0C0_QMDV02_00_OUTER_PUT MW(30:0) -+#define NVC0C0_QMDV02_00_OUTER_OVERFLOW MW(31:31) -+#define NVC0C0_QMDV02_00_OUTER_GET MW(62:32) -+#define NVC0C0_QMDV02_00_OUTER_STICKY_OVERFLOW MW(63:63) -+#define NVC0C0_QMDV02_00_INNER_GET MW(94:64) -+#define NVC0C0_QMDV02_00_INNER_OVERFLOW MW(95:95) -+#define NVC0C0_QMDV02_00_INNER_PUT MW(126:96) -+#define NVC0C0_QMDV02_00_INNER_STICKY_OVERFLOW MW(127:127) -+#define NVC0C0_QMDV02_00_QMD_RESERVED_A_A MW(159:128) -+#define NVC0C0_QMDV02_00_DEPENDENT_QMD_POINTER MW(191:160) -+#define NVC0C0_QMDV02_00_QMD_GROUP_ID MW(197:192) -+#define NVC0C0_QMDV02_00_SM_GLOBAL_CACHING_ENABLE MW(198:198) -+#define NVC0C0_QMDV02_00_RUN_CTA_IN_ONE_SM_PARTITION MW(199:199) -+#define NVC0C0_QMDV02_00_RUN_CTA_IN_ONE_SM_PARTITION_FALSE 0x00000000 -+#define NVC0C0_QMDV02_00_RUN_CTA_IN_ONE_SM_PARTITION_TRUE 0x00000001 -+#define NVC0C0_QMDV02_00_IS_QUEUE MW(200:200) -+#define NVC0C0_QMDV02_00_IS_QUEUE_FALSE 0x00000000 -+#define NVC0C0_QMDV02_00_IS_QUEUE_TRUE 0x00000001 -+#define NVC0C0_QMDV02_00_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST MW(201:201) -+#define NVC0C0_QMDV02_00_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST_FALSE 0x00000000 -+#define NVC0C0_QMDV02_00_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST_TRUE 0x00000001 -+#define NVC0C0_QMDV02_00_SEMAPHORE_RELEASE_ENABLE0 MW(202:202) -+#define NVC0C0_QMDV02_00_SEMAPHORE_RELEASE_ENABLE0_FALSE 0x00000000 -+#define NVC0C0_QMDV02_00_SEMAPHORE_RELEASE_ENABLE0_TRUE 0x00000001 -+#define NVC0C0_QMDV02_00_SEMAPHORE_RELEASE_ENABLE1 MW(203:203) -+#define NVC0C0_QMDV02_00_SEMAPHORE_RELEASE_ENABLE1_FALSE 0x00000000 -+#define NVC0C0_QMDV02_00_SEMAPHORE_RELEASE_ENABLE1_TRUE 0x00000001 -+#define NVC0C0_QMDV02_00_REQUIRE_SCHEDULING_PCAS MW(204:204) -+#define NVC0C0_QMDV02_00_REQUIRE_SCHEDULING_PCAS_FALSE 0x00000000 -+#define NVC0C0_QMDV02_00_REQUIRE_SCHEDULING_PCAS_TRUE 0x00000001 -+#define NVC0C0_QMDV02_00_DEPENDENT_QMD_SCHEDULE_ENABLE MW(205:205) -+#define NVC0C0_QMDV02_00_DEPENDENT_QMD_SCHEDULE_ENABLE_FALSE 0x00000000 -+#define NVC0C0_QMDV02_00_DEPENDENT_QMD_SCHEDULE_ENABLE_TRUE 0x00000001 -+#define NVC0C0_QMDV02_00_DEPENDENT_QMD_TYPE MW(206:206) -+#define NVC0C0_QMDV02_00_DEPENDENT_QMD_TYPE_QUEUE 0x00000000 -+#define NVC0C0_QMDV02_00_DEPENDENT_QMD_TYPE_GRID 0x00000001 -+#define NVC0C0_QMDV02_00_DEPENDENT_QMD_FIELD_COPY MW(207:207) -+#define NVC0C0_QMDV02_00_DEPENDENT_QMD_FIELD_COPY_FALSE 0x00000000 -+#define NVC0C0_QMDV02_00_DEPENDENT_QMD_FIELD_COPY_TRUE 0x00000001 -+#define NVC0C0_QMDV02_00_QMD_RESERVED_B MW(223:208) -+#define NVC0C0_QMDV02_00_CIRCULAR_QUEUE_SIZE MW(248:224) -+#define NVC0C0_QMDV02_00_QMD_RESERVED_C MW(249:249) -+#define NVC0C0_QMDV02_00_INVALIDATE_TEXTURE_HEADER_CACHE MW(250:250) -+#define NVC0C0_QMDV02_00_INVALIDATE_TEXTURE_HEADER_CACHE_FALSE 0x00000000 -+#define NVC0C0_QMDV02_00_INVALIDATE_TEXTURE_HEADER_CACHE_TRUE 0x00000001 -+#define NVC0C0_QMDV02_00_INVALIDATE_TEXTURE_SAMPLER_CACHE MW(251:251) -+#define NVC0C0_QMDV02_00_INVALIDATE_TEXTURE_SAMPLER_CACHE_FALSE 0x00000000 -+#define NVC0C0_QMDV02_00_INVALIDATE_TEXTURE_SAMPLER_CACHE_TRUE 0x00000001 -+#define NVC0C0_QMDV02_00_INVALIDATE_TEXTURE_DATA_CACHE MW(252:252) -+#define NVC0C0_QMDV02_00_INVALIDATE_TEXTURE_DATA_CACHE_FALSE 0x00000000 -+#define NVC0C0_QMDV02_00_INVALIDATE_TEXTURE_DATA_CACHE_TRUE 0x00000001 -+#define NVC0C0_QMDV02_00_INVALIDATE_SHADER_DATA_CACHE MW(253:253) -+#define NVC0C0_QMDV02_00_INVALIDATE_SHADER_DATA_CACHE_FALSE 0x00000000 -+#define NVC0C0_QMDV02_00_INVALIDATE_SHADER_DATA_CACHE_TRUE 0x00000001 -+#define NVC0C0_QMDV02_00_INVALIDATE_INSTRUCTION_CACHE MW(254:254) -+#define NVC0C0_QMDV02_00_INVALIDATE_INSTRUCTION_CACHE_FALSE 0x00000000 -+#define NVC0C0_QMDV02_00_INVALIDATE_INSTRUCTION_CACHE_TRUE 0x00000001 -+#define NVC0C0_QMDV02_00_INVALIDATE_SHADER_CONSTANT_CACHE MW(255:255) -+#define NVC0C0_QMDV02_00_INVALIDATE_SHADER_CONSTANT_CACHE_FALSE 0x00000000 -+#define NVC0C0_QMDV02_00_INVALIDATE_SHADER_CONSTANT_CACHE_TRUE 0x00000001 -+#define NVC0C0_QMDV02_00_PROGRAM_OFFSET MW(287:256) -+#define NVC0C0_QMDV02_00_CIRCULAR_QUEUE_ADDR_LOWER MW(319:288) -+#define NVC0C0_QMDV02_00_CIRCULAR_QUEUE_ADDR_UPPER MW(327:320) -+#define NVC0C0_QMDV02_00_QMD_RESERVED_D MW(335:328) -+#define NVC0C0_QMDV02_00_CIRCULAR_QUEUE_ENTRY_SIZE MW(351:336) -+#define NVC0C0_QMDV02_00_CWD_REFERENCE_COUNT_ID MW(357:352) -+#define NVC0C0_QMDV02_00_CWD_REFERENCE_COUNT_DELTA_MINUS_ONE MW(365:358) -+#define NVC0C0_QMDV02_00_RELEASE_MEMBAR_TYPE MW(366:366) -+#define NVC0C0_QMDV02_00_RELEASE_MEMBAR_TYPE_FE_NONE 0x00000000 -+#define NVC0C0_QMDV02_00_RELEASE_MEMBAR_TYPE_FE_SYSMEMBAR 0x00000001 -+#define NVC0C0_QMDV02_00_CWD_REFERENCE_COUNT_INCR_ENABLE MW(367:367) -+#define NVC0C0_QMDV02_00_CWD_REFERENCE_COUNT_INCR_ENABLE_FALSE 0x00000000 -+#define NVC0C0_QMDV02_00_CWD_REFERENCE_COUNT_INCR_ENABLE_TRUE 0x00000001 -+#define NVC0C0_QMDV02_00_CWD_MEMBAR_TYPE MW(369:368) -+#define NVC0C0_QMDV02_00_CWD_MEMBAR_TYPE_L1_NONE 0x00000000 -+#define NVC0C0_QMDV02_00_CWD_MEMBAR_TYPE_L1_SYSMEMBAR 0x00000001 -+#define NVC0C0_QMDV02_00_CWD_MEMBAR_TYPE_L1_MEMBAR 0x00000003 -+#define NVC0C0_QMDV02_00_SEQUENTIALLY_RUN_CTAS MW(370:370) -+#define NVC0C0_QMDV02_00_SEQUENTIALLY_RUN_CTAS_FALSE 0x00000000 -+#define NVC0C0_QMDV02_00_SEQUENTIALLY_RUN_CTAS_TRUE 0x00000001 -+#define NVC0C0_QMDV02_00_CWD_REFERENCE_COUNT_DECR_ENABLE MW(371:371) -+#define NVC0C0_QMDV02_00_CWD_REFERENCE_COUNT_DECR_ENABLE_FALSE 0x00000000 -+#define NVC0C0_QMDV02_00_CWD_REFERENCE_COUNT_DECR_ENABLE_TRUE 0x00000001 -+#define NVC0C0_QMDV02_00_THROTTLED MW(372:372) -+#define NVC0C0_QMDV02_00_THROTTLED_FALSE 0x00000000 -+#define NVC0C0_QMDV02_00_THROTTLED_TRUE 0x00000001 -+#define NVC0C0_QMDV02_00_API_VISIBLE_CALL_LIMIT MW(378:378) -+#define NVC0C0_QMDV02_00_API_VISIBLE_CALL_LIMIT__32 0x00000000 -+#define NVC0C0_QMDV02_00_API_VISIBLE_CALL_LIMIT_NO_CHECK 0x00000001 -+#define NVC0C0_QMDV02_00_SAMPLER_INDEX MW(382:382) -+#define NVC0C0_QMDV02_00_SAMPLER_INDEX_INDEPENDENTLY 0x00000000 -+#define NVC0C0_QMDV02_00_SAMPLER_INDEX_VIA_HEADER_INDEX 0x00000001 -+#define NVC0C0_QMDV02_00_CTA_RASTER_WIDTH MW(415:384) -+#define NVC0C0_QMDV02_00_CTA_RASTER_HEIGHT MW(431:416) -+#define NVC0C0_QMDV02_00_QMD_RESERVED13A MW(447:432) -+#define NVC0C0_QMDV02_00_CTA_RASTER_DEPTH MW(463:448) -+#define NVC0C0_QMDV02_00_QMD_RESERVED14A MW(479:464) -+#define NVC0C0_QMDV02_00_QMD_RESERVED15A MW(511:480) -+#define NVC0C0_QMDV02_00_QUEUE_ENTRIES_PER_CTA_MINUS_ONE MW(518:512) -+#define NVC0C0_QMDV02_00_COALESCE_WAITING_PERIOD MW(529:522) -+#define NVC0C0_QMDV02_00_SHARED_MEMORY_SIZE MW(561:544) -+#define NVC0C0_QMDV02_00_QMD_RESERVED_G MW(575:562) -+#define NVC0C0_QMDV02_00_QMD_VERSION MW(579:576) -+#define NVC0C0_QMDV02_00_QMD_MAJOR_VERSION MW(583:580) -+#define NVC0C0_QMDV02_00_QMD_RESERVED_H MW(591:584) -+#define NVC0C0_QMDV02_00_CTA_THREAD_DIMENSION0 MW(607:592) -+#define NVC0C0_QMDV02_00_CTA_THREAD_DIMENSION1 MW(623:608) -+#define NVC0C0_QMDV02_00_CTA_THREAD_DIMENSION2 MW(639:624) -+#define NVC0C0_QMDV02_00_CONSTANT_BUFFER_VALID(i) MW((640+(i)*1):(640+(i)*1)) -+#define NVC0C0_QMDV02_00_CONSTANT_BUFFER_VALID_FALSE 0x00000000 -+#define NVC0C0_QMDV02_00_CONSTANT_BUFFER_VALID_TRUE 0x00000001 -+#define NVC0C0_QMDV02_00_QMD_RESERVED_I MW(671:648) -+#define NVC0C0_QMDV02_00_SM_DISABLE_MASK_LOWER MW(703:672) -+#define NVC0C0_QMDV02_00_SM_DISABLE_MASK_UPPER MW(735:704) -+#define NVC0C0_QMDV02_00_RELEASE0_ADDRESS_LOWER MW(767:736) -+#define NVC0C0_QMDV02_00_RELEASE0_ADDRESS_UPPER MW(775:768) -+#define NVC0C0_QMDV02_00_QMD_RESERVED_J MW(783:776) -+#define NVC0C0_QMDV02_00_RELEASE0_REDUCTION_OP MW(790:788) -+#define NVC0C0_QMDV02_00_RELEASE0_REDUCTION_OP_RED_ADD 0x00000000 -+#define NVC0C0_QMDV02_00_RELEASE0_REDUCTION_OP_RED_MIN 0x00000001 -+#define NVC0C0_QMDV02_00_RELEASE0_REDUCTION_OP_RED_MAX 0x00000002 -+#define NVC0C0_QMDV02_00_RELEASE0_REDUCTION_OP_RED_INC 0x00000003 -+#define NVC0C0_QMDV02_00_RELEASE0_REDUCTION_OP_RED_DEC 0x00000004 -+#define NVC0C0_QMDV02_00_RELEASE0_REDUCTION_OP_RED_AND 0x00000005 -+#define NVC0C0_QMDV02_00_RELEASE0_REDUCTION_OP_RED_OR 0x00000006 -+#define NVC0C0_QMDV02_00_RELEASE0_REDUCTION_OP_RED_XOR 0x00000007 -+#define NVC0C0_QMDV02_00_QMD_RESERVED_K MW(791:791) -+#define NVC0C0_QMDV02_00_RELEASE0_REDUCTION_FORMAT MW(793:792) -+#define NVC0C0_QMDV02_00_RELEASE0_REDUCTION_FORMAT_UNSIGNED_32 0x00000000 -+#define NVC0C0_QMDV02_00_RELEASE0_REDUCTION_FORMAT_SIGNED_32 0x00000001 -+#define NVC0C0_QMDV02_00_RELEASE0_REDUCTION_ENABLE MW(794:794) -+#define NVC0C0_QMDV02_00_RELEASE0_REDUCTION_ENABLE_FALSE 0x00000000 -+#define NVC0C0_QMDV02_00_RELEASE0_REDUCTION_ENABLE_TRUE 0x00000001 -+#define NVC0C0_QMDV02_00_RELEASE0_STRUCTURE_SIZE MW(799:799) -+#define NVC0C0_QMDV02_00_RELEASE0_STRUCTURE_SIZE_FOUR_WORDS 0x00000000 -+#define NVC0C0_QMDV02_00_RELEASE0_STRUCTURE_SIZE_ONE_WORD 0x00000001 -+#define NVC0C0_QMDV02_00_RELEASE0_PAYLOAD MW(831:800) -+#define NVC0C0_QMDV02_00_RELEASE1_ADDRESS_LOWER MW(863:832) -+#define NVC0C0_QMDV02_00_RELEASE1_ADDRESS_UPPER MW(871:864) -+#define NVC0C0_QMDV02_00_QMD_RESERVED_L MW(879:872) -+#define NVC0C0_QMDV02_00_RELEASE1_REDUCTION_OP MW(886:884) -+#define NVC0C0_QMDV02_00_RELEASE1_REDUCTION_OP_RED_ADD 0x00000000 -+#define NVC0C0_QMDV02_00_RELEASE1_REDUCTION_OP_RED_MIN 0x00000001 -+#define NVC0C0_QMDV02_00_RELEASE1_REDUCTION_OP_RED_MAX 0x00000002 -+#define NVC0C0_QMDV02_00_RELEASE1_REDUCTION_OP_RED_INC 0x00000003 -+#define NVC0C0_QMDV02_00_RELEASE1_REDUCTION_OP_RED_DEC 0x00000004 -+#define NVC0C0_QMDV02_00_RELEASE1_REDUCTION_OP_RED_AND 0x00000005 -+#define NVC0C0_QMDV02_00_RELEASE1_REDUCTION_OP_RED_OR 0x00000006 -+#define NVC0C0_QMDV02_00_RELEASE1_REDUCTION_OP_RED_XOR 0x00000007 -+#define NVC0C0_QMDV02_00_QMD_RESERVED_M MW(887:887) -+#define NVC0C0_QMDV02_00_RELEASE1_REDUCTION_FORMAT MW(889:888) -+#define NVC0C0_QMDV02_00_RELEASE1_REDUCTION_FORMAT_UNSIGNED_32 0x00000000 -+#define NVC0C0_QMDV02_00_RELEASE1_REDUCTION_FORMAT_SIGNED_32 0x00000001 -+#define NVC0C0_QMDV02_00_RELEASE1_REDUCTION_ENABLE MW(890:890) -+#define NVC0C0_QMDV02_00_RELEASE1_REDUCTION_ENABLE_FALSE 0x00000000 -+#define NVC0C0_QMDV02_00_RELEASE1_REDUCTION_ENABLE_TRUE 0x00000001 -+#define NVC0C0_QMDV02_00_RELEASE1_STRUCTURE_SIZE MW(895:895) -+#define NVC0C0_QMDV02_00_RELEASE1_STRUCTURE_SIZE_FOUR_WORDS 0x00000000 -+#define NVC0C0_QMDV02_00_RELEASE1_STRUCTURE_SIZE_ONE_WORD 0x00000001 -+#define NVC0C0_QMDV02_00_RELEASE1_PAYLOAD MW(927:896) -+#define NVC0C0_QMDV02_00_SHADER_LOCAL_MEMORY_LOW_SIZE MW(951:928) -+#define NVC0C0_QMDV02_00_QMD_RESERVED_N MW(954:952) -+#define NVC0C0_QMDV02_00_BARRIER_COUNT MW(959:955) -+#define NVC0C0_QMDV02_00_SHADER_LOCAL_MEMORY_HIGH_SIZE MW(983:960) -+#define NVC0C0_QMDV02_00_REGISTER_COUNT MW(991:984) -+#define NVC0C0_QMDV02_00_SHADER_LOCAL_MEMORY_CRS_SIZE MW(1015:992) -+#define NVC0C0_QMDV02_00_SASS_VERSION MW(1023:1016) -+#define NVC0C0_QMDV02_00_CONSTANT_BUFFER_ADDR_LOWER(i) MW((1055+(i)*64):(1024+(i)*64)) -+#define NVC0C0_QMDV02_00_CONSTANT_BUFFER_ADDR_UPPER(i) MW((1072+(i)*64):(1056+(i)*64)) -+#define NVC0C0_QMDV02_00_CONSTANT_BUFFER_RESERVED_ADDR(i) MW((1073+(i)*64):(1073+(i)*64)) -+#define NVC0C0_QMDV02_00_CONSTANT_BUFFER_INVALIDATE(i) MW((1074+(i)*64):(1074+(i)*64)) -+#define NVC0C0_QMDV02_00_CONSTANT_BUFFER_INVALIDATE_FALSE 0x00000000 -+#define NVC0C0_QMDV02_00_CONSTANT_BUFFER_INVALIDATE_TRUE 0x00000001 -+#define NVC0C0_QMDV02_00_CONSTANT_BUFFER_SIZE_SHIFTED4(i) MW((1087+(i)*64):(1075+(i)*64)) -+#define NVC0C0_QMDV02_00_HW_ONLY_INNER_GET MW(1566:1536) -+#define NVC0C0_QMDV02_00_HW_ONLY_REQUIRE_SCHEDULING_PCAS MW(1567:1567) -+#define NVC0C0_QMDV02_00_HW_ONLY_INNER_PUT MW(1598:1568) -+#define NVC0C0_QMDV02_00_HW_ONLY_SCG_TYPE MW(1599:1599) -+#define NVC0C0_QMDV02_00_HW_ONLY_SPAN_LIST_HEAD_INDEX MW(1629:1600) -+#define NVC0C0_QMDV02_00_QMD_RESERVED_Q MW(1630:1630) -+#define NVC0C0_QMDV02_00_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID MW(1631:1631) -+#define NVC0C0_QMDV02_00_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID_FALSE 0x00000000 -+#define NVC0C0_QMDV02_00_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID_TRUE 0x00000001 -+#define NVC0C0_QMDV02_00_HW_ONLY_SKED_NEXT_QMD_POINTER MW(1663:1632) -+#define NVC0C0_QMDV02_00_CTA_RASTER_WIDTH_RESUME MW(1695:1664) -+#define NVC0C0_QMDV02_00_CTA_RASTER_HEIGHT_RESUME MW(1711:1696) -+#define NVC0C0_QMDV02_00_CTA_RASTER_DEPTH_RESUME MW(1727:1712) -+#define NVC0C0_QMDV02_00_QMD_SPARE_G MW(1759:1728) -+#define NVC0C0_QMDV02_00_QMD_SPARE_H MW(1791:1760) -+#define NVC0C0_QMDV02_00_QMD_SPARE_I MW(1823:1792) -+#define NVC0C0_QMDV02_00_QMD_SPARE_J MW(1855:1824) -+#define NVC0C0_QMDV02_00_QMD_SPARE_K MW(1887:1856) -+#define NVC0C0_QMDV02_00_QMD_SPARE_L MW(1919:1888) -+#define NVC0C0_QMDV02_00_QMD_SPARE_M MW(1951:1920) -+#define NVC0C0_QMDV02_00_QMD_SPARE_N MW(1983:1952) -+#define NVC0C0_QMDV02_00_DEBUG_ID_UPPER MW(2015:1984) -+#define NVC0C0_QMDV02_00_DEBUG_ID_LOWER MW(2047:2016) -+ -+ -+/* -+** Queue Meta Data, Version 02_01 -+ */ -+ -+#define NVC0C0_QMDV02_01_OUTER_PUT MW(30:0) -+#define NVC0C0_QMDV02_01_OUTER_OVERFLOW MW(31:31) -+#define NVC0C0_QMDV02_01_OUTER_GET MW(62:32) -+#define NVC0C0_QMDV02_01_OUTER_STICKY_OVERFLOW MW(63:63) -+#define NVC0C0_QMDV02_01_INNER_GET MW(94:64) -+#define NVC0C0_QMDV02_01_INNER_OVERFLOW MW(95:95) -+#define NVC0C0_QMDV02_01_INNER_PUT MW(126:96) -+#define NVC0C0_QMDV02_01_INNER_STICKY_OVERFLOW MW(127:127) -+#define NVC0C0_QMDV02_01_QMD_GROUP_ID MW(133:128) -+#define NVC0C0_QMDV02_01_SM_GLOBAL_CACHING_ENABLE MW(134:134) -+#define NVC0C0_QMDV02_01_RUN_CTA_IN_ONE_SM_PARTITION MW(135:135) -+#define NVC0C0_QMDV02_01_RUN_CTA_IN_ONE_SM_PARTITION_FALSE 0x00000000 -+#define NVC0C0_QMDV02_01_RUN_CTA_IN_ONE_SM_PARTITION_TRUE 0x00000001 -+#define NVC0C0_QMDV02_01_IS_QUEUE MW(136:136) -+#define NVC0C0_QMDV02_01_IS_QUEUE_FALSE 0x00000000 -+#define NVC0C0_QMDV02_01_IS_QUEUE_TRUE 0x00000001 -+#define NVC0C0_QMDV02_01_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST MW(137:137) -+#define NVC0C0_QMDV02_01_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST_FALSE 0x00000000 -+#define NVC0C0_QMDV02_01_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST_TRUE 0x00000001 -+#define NVC0C0_QMDV02_01_SEMAPHORE_RELEASE_ENABLE0 MW(138:138) -+#define NVC0C0_QMDV02_01_SEMAPHORE_RELEASE_ENABLE0_FALSE 0x00000000 -+#define NVC0C0_QMDV02_01_SEMAPHORE_RELEASE_ENABLE0_TRUE 0x00000001 -+#define NVC0C0_QMDV02_01_SEMAPHORE_RELEASE_ENABLE1 MW(139:139) -+#define NVC0C0_QMDV02_01_SEMAPHORE_RELEASE_ENABLE1_FALSE 0x00000000 -+#define NVC0C0_QMDV02_01_SEMAPHORE_RELEASE_ENABLE1_TRUE 0x00000001 -+#define NVC0C0_QMDV02_01_REQUIRE_SCHEDULING_PCAS MW(140:140) -+#define NVC0C0_QMDV02_01_REQUIRE_SCHEDULING_PCAS_FALSE 0x00000000 -+#define NVC0C0_QMDV02_01_REQUIRE_SCHEDULING_PCAS_TRUE 0x00000001 -+#define NVC0C0_QMDV02_01_DEPENDENT_QMD_SCHEDULE_ENABLE MW(141:141) -+#define NVC0C0_QMDV02_01_DEPENDENT_QMD_SCHEDULE_ENABLE_FALSE 0x00000000 -+#define NVC0C0_QMDV02_01_DEPENDENT_QMD_SCHEDULE_ENABLE_TRUE 0x00000001 -+#define NVC0C0_QMDV02_01_DEPENDENT_QMD_TYPE MW(142:142) -+#define NVC0C0_QMDV02_01_DEPENDENT_QMD_TYPE_QUEUE 0x00000000 -+#define NVC0C0_QMDV02_01_DEPENDENT_QMD_TYPE_GRID 0x00000001 -+#define NVC0C0_QMDV02_01_DEPENDENT_QMD_FIELD_COPY MW(143:143) -+#define NVC0C0_QMDV02_01_DEPENDENT_QMD_FIELD_COPY_FALSE 0x00000000 -+#define NVC0C0_QMDV02_01_DEPENDENT_QMD_FIELD_COPY_TRUE 0x00000001 -+#define NVC0C0_QMDV02_01_QMD_RESERVED_B MW(159:144) -+#define NVC0C0_QMDV02_01_CIRCULAR_QUEUE_SIZE MW(184:160) -+#define NVC0C0_QMDV02_01_QMD_RESERVED_C MW(185:185) -+#define NVC0C0_QMDV02_01_INVALIDATE_TEXTURE_HEADER_CACHE MW(186:186) -+#define NVC0C0_QMDV02_01_INVALIDATE_TEXTURE_HEADER_CACHE_FALSE 0x00000000 -+#define NVC0C0_QMDV02_01_INVALIDATE_TEXTURE_HEADER_CACHE_TRUE 0x00000001 -+#define NVC0C0_QMDV02_01_INVALIDATE_TEXTURE_SAMPLER_CACHE MW(187:187) -+#define NVC0C0_QMDV02_01_INVALIDATE_TEXTURE_SAMPLER_CACHE_FALSE 0x00000000 -+#define NVC0C0_QMDV02_01_INVALIDATE_TEXTURE_SAMPLER_CACHE_TRUE 0x00000001 -+#define NVC0C0_QMDV02_01_INVALIDATE_TEXTURE_DATA_CACHE MW(188:188) -+#define NVC0C0_QMDV02_01_INVALIDATE_TEXTURE_DATA_CACHE_FALSE 0x00000000 -+#define NVC0C0_QMDV02_01_INVALIDATE_TEXTURE_DATA_CACHE_TRUE 0x00000001 -+#define NVC0C0_QMDV02_01_INVALIDATE_SHADER_DATA_CACHE MW(189:189) -+#define NVC0C0_QMDV02_01_INVALIDATE_SHADER_DATA_CACHE_FALSE 0x00000000 -+#define NVC0C0_QMDV02_01_INVALIDATE_SHADER_DATA_CACHE_TRUE 0x00000001 -+#define NVC0C0_QMDV02_01_INVALIDATE_INSTRUCTION_CACHE MW(190:190) -+#define NVC0C0_QMDV02_01_INVALIDATE_INSTRUCTION_CACHE_FALSE 0x00000000 -+#define NVC0C0_QMDV02_01_INVALIDATE_INSTRUCTION_CACHE_TRUE 0x00000001 -+#define NVC0C0_QMDV02_01_INVALIDATE_SHADER_CONSTANT_CACHE MW(191:191) -+#define NVC0C0_QMDV02_01_INVALIDATE_SHADER_CONSTANT_CACHE_FALSE 0x00000000 -+#define NVC0C0_QMDV02_01_INVALIDATE_SHADER_CONSTANT_CACHE_TRUE 0x00000001 -+#define NVC0C0_QMDV02_01_CTA_RASTER_WIDTH_RESUME MW(223:192) -+#define NVC0C0_QMDV02_01_CTA_RASTER_HEIGHT_RESUME MW(239:224) -+#define NVC0C0_QMDV02_01_CTA_RASTER_DEPTH_RESUME MW(255:240) -+#define NVC0C0_QMDV02_01_PROGRAM_OFFSET MW(287:256) -+#define NVC0C0_QMDV02_01_CIRCULAR_QUEUE_ADDR_LOWER MW(319:288) -+#define NVC0C0_QMDV02_01_CIRCULAR_QUEUE_ADDR_UPPER MW(327:320) -+#define NVC0C0_QMDV02_01_QMD_RESERVED_D MW(335:328) -+#define NVC0C0_QMDV02_01_CIRCULAR_QUEUE_ENTRY_SIZE MW(351:336) -+#define NVC0C0_QMDV02_01_CWD_REFERENCE_COUNT_ID MW(357:352) -+#define NVC0C0_QMDV02_01_CWD_REFERENCE_COUNT_DELTA_MINUS_ONE MW(365:358) -+#define NVC0C0_QMDV02_01_RELEASE_MEMBAR_TYPE MW(366:366) -+#define NVC0C0_QMDV02_01_RELEASE_MEMBAR_TYPE_FE_NONE 0x00000000 -+#define NVC0C0_QMDV02_01_RELEASE_MEMBAR_TYPE_FE_SYSMEMBAR 0x00000001 -+#define NVC0C0_QMDV02_01_CWD_REFERENCE_COUNT_INCR_ENABLE MW(367:367) -+#define NVC0C0_QMDV02_01_CWD_REFERENCE_COUNT_INCR_ENABLE_FALSE 0x00000000 -+#define NVC0C0_QMDV02_01_CWD_REFERENCE_COUNT_INCR_ENABLE_TRUE 0x00000001 -+#define NVC0C0_QMDV02_01_CWD_MEMBAR_TYPE MW(369:368) -+#define NVC0C0_QMDV02_01_CWD_MEMBAR_TYPE_L1_NONE 0x00000000 -+#define NVC0C0_QMDV02_01_CWD_MEMBAR_TYPE_L1_SYSMEMBAR 0x00000001 -+#define NVC0C0_QMDV02_01_CWD_MEMBAR_TYPE_L1_MEMBAR 0x00000003 -+#define NVC0C0_QMDV02_01_SEQUENTIALLY_RUN_CTAS MW(370:370) -+#define NVC0C0_QMDV02_01_SEQUENTIALLY_RUN_CTAS_FALSE 0x00000000 -+#define NVC0C0_QMDV02_01_SEQUENTIALLY_RUN_CTAS_TRUE 0x00000001 -+#define NVC0C0_QMDV02_01_CWD_REFERENCE_COUNT_DECR_ENABLE MW(371:371) -+#define NVC0C0_QMDV02_01_CWD_REFERENCE_COUNT_DECR_ENABLE_FALSE 0x00000000 -+#define NVC0C0_QMDV02_01_CWD_REFERENCE_COUNT_DECR_ENABLE_TRUE 0x00000001 -+#define NVC0C0_QMDV02_01_THROTTLED MW(372:372) -+#define NVC0C0_QMDV02_01_THROTTLED_FALSE 0x00000000 -+#define NVC0C0_QMDV02_01_THROTTLED_TRUE 0x00000001 -+#define NVC0C0_QMDV02_01_API_VISIBLE_CALL_LIMIT MW(378:378) -+#define NVC0C0_QMDV02_01_API_VISIBLE_CALL_LIMIT__32 0x00000000 -+#define NVC0C0_QMDV02_01_API_VISIBLE_CALL_LIMIT_NO_CHECK 0x00000001 -+#define NVC0C0_QMDV02_01_SAMPLER_INDEX MW(382:382) -+#define NVC0C0_QMDV02_01_SAMPLER_INDEX_INDEPENDENTLY 0x00000000 -+#define NVC0C0_QMDV02_01_SAMPLER_INDEX_VIA_HEADER_INDEX 0x00000001 -+#define NVC0C0_QMDV02_01_CTA_RASTER_WIDTH MW(415:384) -+#define NVC0C0_QMDV02_01_CTA_RASTER_HEIGHT MW(431:416) -+#define NVC0C0_QMDV02_01_QMD_RESERVED13A MW(447:432) -+#define NVC0C0_QMDV02_01_CTA_RASTER_DEPTH MW(463:448) -+#define NVC0C0_QMDV02_01_QMD_RESERVED14A MW(479:464) -+#define NVC0C0_QMDV02_01_DEPENDENT_QMD_POINTER MW(511:480) -+#define NVC0C0_QMDV02_01_QUEUE_ENTRIES_PER_CTA_MINUS_ONE MW(518:512) -+#define NVC0C0_QMDV02_01_COALESCE_WAITING_PERIOD MW(529:522) -+#define NVC0C0_QMDV02_01_SHARED_MEMORY_SIZE MW(561:544) -+#define NVC0C0_QMDV02_01_QMD_RESERVED_G MW(575:562) -+#define NVC0C0_QMDV02_01_QMD_VERSION MW(579:576) -+#define NVC0C0_QMDV02_01_QMD_MAJOR_VERSION MW(583:580) -+#define NVC0C0_QMDV02_01_QMD_RESERVED_H MW(591:584) -+#define NVC0C0_QMDV02_01_CTA_THREAD_DIMENSION0 MW(607:592) -+#define NVC0C0_QMDV02_01_CTA_THREAD_DIMENSION1 MW(623:608) -+#define NVC0C0_QMDV02_01_CTA_THREAD_DIMENSION2 MW(639:624) -+#define NVC0C0_QMDV02_01_CONSTANT_BUFFER_VALID(i) MW((640+(i)*1):(640+(i)*1)) -+#define NVC0C0_QMDV02_01_CONSTANT_BUFFER_VALID_FALSE 0x00000000 -+#define NVC0C0_QMDV02_01_CONSTANT_BUFFER_VALID_TRUE 0x00000001 -+#define NVC0C0_QMDV02_01_QMD_RESERVED_I MW(671:648) -+#define NVC0C0_QMDV02_01_SM_DISABLE_MASK_LOWER MW(703:672) -+#define NVC0C0_QMDV02_01_SM_DISABLE_MASK_UPPER MW(735:704) -+#define NVC0C0_QMDV02_01_RELEASE0_ADDRESS_LOWER MW(767:736) -+#define NVC0C0_QMDV02_01_RELEASE0_ADDRESS_UPPER MW(775:768) -+#define NVC0C0_QMDV02_01_QMD_RESERVED_J MW(783:776) -+#define NVC0C0_QMDV02_01_RELEASE0_REDUCTION_OP MW(790:788) -+#define NVC0C0_QMDV02_01_RELEASE0_REDUCTION_OP_RED_ADD 0x00000000 -+#define NVC0C0_QMDV02_01_RELEASE0_REDUCTION_OP_RED_MIN 0x00000001 -+#define NVC0C0_QMDV02_01_RELEASE0_REDUCTION_OP_RED_MAX 0x00000002 -+#define NVC0C0_QMDV02_01_RELEASE0_REDUCTION_OP_RED_INC 0x00000003 -+#define NVC0C0_QMDV02_01_RELEASE0_REDUCTION_OP_RED_DEC 0x00000004 -+#define NVC0C0_QMDV02_01_RELEASE0_REDUCTION_OP_RED_AND 0x00000005 -+#define NVC0C0_QMDV02_01_RELEASE0_REDUCTION_OP_RED_OR 0x00000006 -+#define NVC0C0_QMDV02_01_RELEASE0_REDUCTION_OP_RED_XOR 0x00000007 -+#define NVC0C0_QMDV02_01_QMD_RESERVED_K MW(791:791) -+#define NVC0C0_QMDV02_01_RELEASE0_REDUCTION_FORMAT MW(793:792) -+#define NVC0C0_QMDV02_01_RELEASE0_REDUCTION_FORMAT_UNSIGNED_32 0x00000000 -+#define NVC0C0_QMDV02_01_RELEASE0_REDUCTION_FORMAT_SIGNED_32 0x00000001 -+#define NVC0C0_QMDV02_01_RELEASE0_REDUCTION_ENABLE MW(794:794) -+#define NVC0C0_QMDV02_01_RELEASE0_REDUCTION_ENABLE_FALSE 0x00000000 -+#define NVC0C0_QMDV02_01_RELEASE0_REDUCTION_ENABLE_TRUE 0x00000001 -+#define NVC0C0_QMDV02_01_RELEASE0_STRUCTURE_SIZE MW(799:799) -+#define NVC0C0_QMDV02_01_RELEASE0_STRUCTURE_SIZE_FOUR_WORDS 0x00000000 -+#define NVC0C0_QMDV02_01_RELEASE0_STRUCTURE_SIZE_ONE_WORD 0x00000001 -+#define NVC0C0_QMDV02_01_RELEASE0_PAYLOAD MW(831:800) -+#define NVC0C0_QMDV02_01_RELEASE1_ADDRESS_LOWER MW(863:832) -+#define NVC0C0_QMDV02_01_RELEASE1_ADDRESS_UPPER MW(871:864) -+#define NVC0C0_QMDV02_01_QMD_RESERVED_L MW(879:872) -+#define NVC0C0_QMDV02_01_RELEASE1_REDUCTION_OP MW(886:884) -+#define NVC0C0_QMDV02_01_RELEASE1_REDUCTION_OP_RED_ADD 0x00000000 -+#define NVC0C0_QMDV02_01_RELEASE1_REDUCTION_OP_RED_MIN 0x00000001 -+#define NVC0C0_QMDV02_01_RELEASE1_REDUCTION_OP_RED_MAX 0x00000002 -+#define NVC0C0_QMDV02_01_RELEASE1_REDUCTION_OP_RED_INC 0x00000003 -+#define NVC0C0_QMDV02_01_RELEASE1_REDUCTION_OP_RED_DEC 0x00000004 -+#define NVC0C0_QMDV02_01_RELEASE1_REDUCTION_OP_RED_AND 0x00000005 -+#define NVC0C0_QMDV02_01_RELEASE1_REDUCTION_OP_RED_OR 0x00000006 -+#define NVC0C0_QMDV02_01_RELEASE1_REDUCTION_OP_RED_XOR 0x00000007 -+#define NVC0C0_QMDV02_01_QMD_RESERVED_M MW(887:887) -+#define NVC0C0_QMDV02_01_RELEASE1_REDUCTION_FORMAT MW(889:888) -+#define NVC0C0_QMDV02_01_RELEASE1_REDUCTION_FORMAT_UNSIGNED_32 0x00000000 -+#define NVC0C0_QMDV02_01_RELEASE1_REDUCTION_FORMAT_SIGNED_32 0x00000001 -+#define NVC0C0_QMDV02_01_RELEASE1_REDUCTION_ENABLE MW(890:890) -+#define NVC0C0_QMDV02_01_RELEASE1_REDUCTION_ENABLE_FALSE 0x00000000 -+#define NVC0C0_QMDV02_01_RELEASE1_REDUCTION_ENABLE_TRUE 0x00000001 -+#define NVC0C0_QMDV02_01_RELEASE1_STRUCTURE_SIZE MW(895:895) -+#define NVC0C0_QMDV02_01_RELEASE1_STRUCTURE_SIZE_FOUR_WORDS 0x00000000 -+#define NVC0C0_QMDV02_01_RELEASE1_STRUCTURE_SIZE_ONE_WORD 0x00000001 -+#define NVC0C0_QMDV02_01_RELEASE1_PAYLOAD MW(927:896) -+#define NVC0C0_QMDV02_01_SHADER_LOCAL_MEMORY_LOW_SIZE MW(951:928) -+#define NVC0C0_QMDV02_01_QMD_RESERVED_N MW(954:952) -+#define NVC0C0_QMDV02_01_BARRIER_COUNT MW(959:955) -+#define NVC0C0_QMDV02_01_SHADER_LOCAL_MEMORY_HIGH_SIZE MW(983:960) -+#define NVC0C0_QMDV02_01_REGISTER_COUNT MW(991:984) -+#define NVC0C0_QMDV02_01_SHADER_LOCAL_MEMORY_CRS_SIZE MW(1015:992) -+#define NVC0C0_QMDV02_01_SASS_VERSION MW(1023:1016) -+#define NVC0C0_QMDV02_01_CONSTANT_BUFFER_ADDR_LOWER(i) MW((1055+(i)*64):(1024+(i)*64)) -+#define NVC0C0_QMDV02_01_CONSTANT_BUFFER_ADDR_UPPER(i) MW((1072+(i)*64):(1056+(i)*64)) -+#define NVC0C0_QMDV02_01_CONSTANT_BUFFER_RESERVED_ADDR(i) MW((1073+(i)*64):(1073+(i)*64)) -+#define NVC0C0_QMDV02_01_CONSTANT_BUFFER_INVALIDATE(i) MW((1074+(i)*64):(1074+(i)*64)) -+#define NVC0C0_QMDV02_01_CONSTANT_BUFFER_INVALIDATE_FALSE 0x00000000 -+#define NVC0C0_QMDV02_01_CONSTANT_BUFFER_INVALIDATE_TRUE 0x00000001 -+#define NVC0C0_QMDV02_01_CONSTANT_BUFFER_SIZE_SHIFTED4(i) MW((1087+(i)*64):(1075+(i)*64)) -+#define NVC0C0_QMDV02_01_QMD_RESERVED_R MW(1567:1536) -+#define NVC0C0_QMDV02_01_QMD_RESERVED_S MW(1599:1568) -+#define NVC0C0_QMDV02_01_HW_ONLY_INNER_GET MW(1630:1600) -+#define NVC0C0_QMDV02_01_HW_ONLY_REQUIRE_SCHEDULING_PCAS MW(1631:1631) -+#define NVC0C0_QMDV02_01_HW_ONLY_INNER_PUT MW(1662:1632) -+#define NVC0C0_QMDV02_01_HW_ONLY_SCG_TYPE MW(1663:1663) -+#define NVC0C0_QMDV02_01_HW_ONLY_SPAN_LIST_HEAD_INDEX MW(1693:1664) -+#define NVC0C0_QMDV02_01_QMD_RESERVED_Q MW(1694:1694) -+#define NVC0C0_QMDV02_01_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID MW(1695:1695) -+#define NVC0C0_QMDV02_01_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID_FALSE 0x00000000 -+#define NVC0C0_QMDV02_01_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID_TRUE 0x00000001 -+#define NVC0C0_QMDV02_01_HW_ONLY_SKED_NEXT_QMD_POINTER MW(1727:1696) -+#define NVC0C0_QMDV02_01_QMD_SPARE_G MW(1759:1728) -+#define NVC0C0_QMDV02_01_QMD_SPARE_H MW(1791:1760) -+#define NVC0C0_QMDV02_01_QMD_SPARE_I MW(1823:1792) -+#define NVC0C0_QMDV02_01_QMD_SPARE_J MW(1855:1824) -+#define NVC0C0_QMDV02_01_QMD_SPARE_K MW(1887:1856) -+#define NVC0C0_QMDV02_01_QMD_SPARE_L MW(1919:1888) -+#define NVC0C0_QMDV02_01_QMD_SPARE_M MW(1951:1920) -+#define NVC0C0_QMDV02_01_QMD_SPARE_N MW(1983:1952) -+#define NVC0C0_QMDV02_01_DEBUG_ID_UPPER MW(2015:1984) -+#define NVC0C0_QMDV02_01_DEBUG_ID_LOWER MW(2047:2016) -+ -+ -+ -+#endif // #ifndef __CLC0C0QMD_H__ -diff --git a/src/gallium/drivers/nouveau/nvc0/clc3c0qmd.h b/src/gallium/drivers/nouveau/nvc0/clc3c0qmd.h -new file mode 100644 -index 00000000000..588cc639d32 ---- /dev/null -+++ b/src/gallium/drivers/nouveau/nvc0/clc3c0qmd.h -@@ -0,0 +1,245 @@ -+/******************************************************************************* -+ Copyright (c) 2001-2010 NVIDIA Corporation -+ -+ Permission is hereby granted, free of charge, to any person obtaining a copy -+ of this software and associated documentation files (the "Software"), to -+ deal in the Software without restriction, including without limitation the -+ rights to use, copy, modify, merge, publish, distribute, sublicense, and/or -+ sell copies of the Software, and to permit persons to whom the Software is -+ furnished to do so, subject to the following conditions: -+ -+ The above copyright notice and this permission notice shall be -+ included in all copies or substantial portions of the Software. -+ -+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -+ DEALINGS IN THE SOFTWARE. -+ -+*******************************************************************************/ -+ -+/* AUTO GENERATED FILE -- DO NOT EDIT */ -+ -+#ifndef __CLC3C0QMD_H__ -+#define __CLC3C0QMD_H__ -+ -+/* -+** Queue Meta Data, Version 02_02 -+ */ -+ -+// The below C preprocessor definitions describe "multi-word" structures, where -+// fields may have bit numbers beyond 32. For example, MW(127:96) means -+// the field is in bits 0-31 of word number 3 of the structure. The "MW(X:Y)" -+// syntax is to distinguish from similar "X:Y" single-word definitions: the -+// macros historically used for single-word definitions would fail with -+// multi-word definitions. -+// -+// See nvmisc.h:DRF_VAL_MW() in the source code of the kernel -+// interface layer of nvidia.ko for an example of how to manipulate -+// these MW(X:Y) definitions. -+ -+#define NVC3C0_QMDV02_02_OUTER_PUT MW(30:0) -+#define NVC3C0_QMDV02_02_OUTER_OVERFLOW MW(31:31) -+#define NVC3C0_QMDV02_02_OUTER_GET MW(62:32) -+#define NVC3C0_QMDV02_02_OUTER_STICKY_OVERFLOW MW(63:63) -+#define NVC3C0_QMDV02_02_INNER_GET MW(94:64) -+#define NVC3C0_QMDV02_02_INNER_OVERFLOW MW(95:95) -+#define NVC3C0_QMDV02_02_INNER_PUT MW(126:96) -+#define NVC3C0_QMDV02_02_INNER_STICKY_OVERFLOW MW(127:127) -+#define NVC3C0_QMDV02_02_QMD_GROUP_ID MW(133:128) -+#define NVC3C0_QMDV02_02_SM_GLOBAL_CACHING_ENABLE MW(134:134) -+#define NVC3C0_QMDV02_02_RUN_CTA_IN_ONE_SM_PARTITION MW(135:135) -+#define NVC3C0_QMDV02_02_RUN_CTA_IN_ONE_SM_PARTITION_FALSE 0x00000000 -+#define NVC3C0_QMDV02_02_RUN_CTA_IN_ONE_SM_PARTITION_TRUE 0x00000001 -+#define NVC3C0_QMDV02_02_IS_QUEUE MW(136:136) -+#define NVC3C0_QMDV02_02_IS_QUEUE_FALSE 0x00000000 -+#define NVC3C0_QMDV02_02_IS_QUEUE_TRUE 0x00000001 -+#define NVC3C0_QMDV02_02_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST MW(137:137) -+#define NVC3C0_QMDV02_02_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST_FALSE 0x00000000 -+#define NVC3C0_QMDV02_02_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST_TRUE 0x00000001 -+#define NVC3C0_QMDV02_02_SEMAPHORE_RELEASE_ENABLE0 MW(138:138) -+#define NVC3C0_QMDV02_02_SEMAPHORE_RELEASE_ENABLE0_FALSE 0x00000000 -+#define NVC3C0_QMDV02_02_SEMAPHORE_RELEASE_ENABLE0_TRUE 0x00000001 -+#define NVC3C0_QMDV02_02_SEMAPHORE_RELEASE_ENABLE1 MW(139:139) -+#define NVC3C0_QMDV02_02_SEMAPHORE_RELEASE_ENABLE1_FALSE 0x00000000 -+#define NVC3C0_QMDV02_02_SEMAPHORE_RELEASE_ENABLE1_TRUE 0x00000001 -+#define NVC3C0_QMDV02_02_REQUIRE_SCHEDULING_PCAS MW(140:140) -+#define NVC3C0_QMDV02_02_REQUIRE_SCHEDULING_PCAS_FALSE 0x00000000 -+#define NVC3C0_QMDV02_02_REQUIRE_SCHEDULING_PCAS_TRUE 0x00000001 -+#define NVC3C0_QMDV02_02_DEPENDENT_QMD_SCHEDULE_ENABLE MW(141:141) -+#define NVC3C0_QMDV02_02_DEPENDENT_QMD_SCHEDULE_ENABLE_FALSE 0x00000000 -+#define NVC3C0_QMDV02_02_DEPENDENT_QMD_SCHEDULE_ENABLE_TRUE 0x00000001 -+#define NVC3C0_QMDV02_02_DEPENDENT_QMD_TYPE MW(142:142) -+#define NVC3C0_QMDV02_02_DEPENDENT_QMD_TYPE_QUEUE 0x00000000 -+#define NVC3C0_QMDV02_02_DEPENDENT_QMD_TYPE_GRID 0x00000001 -+#define NVC3C0_QMDV02_02_DEPENDENT_QMD_FIELD_COPY MW(143:143) -+#define NVC3C0_QMDV02_02_DEPENDENT_QMD_FIELD_COPY_FALSE 0x00000000 -+#define NVC3C0_QMDV02_02_DEPENDENT_QMD_FIELD_COPY_TRUE 0x00000001 -+#define NVC3C0_QMDV02_02_QMD_RESERVED_B MW(159:144) -+#define NVC3C0_QMDV02_02_CIRCULAR_QUEUE_SIZE MW(184:160) -+#define NVC3C0_QMDV02_02_QMD_RESERVED_C MW(185:185) -+#define NVC3C0_QMDV02_02_INVALIDATE_TEXTURE_HEADER_CACHE MW(186:186) -+#define NVC3C0_QMDV02_02_INVALIDATE_TEXTURE_HEADER_CACHE_FALSE 0x00000000 -+#define NVC3C0_QMDV02_02_INVALIDATE_TEXTURE_HEADER_CACHE_TRUE 0x00000001 -+#define NVC3C0_QMDV02_02_INVALIDATE_TEXTURE_SAMPLER_CACHE MW(187:187) -+#define NVC3C0_QMDV02_02_INVALIDATE_TEXTURE_SAMPLER_CACHE_FALSE 0x00000000 -+#define NVC3C0_QMDV02_02_INVALIDATE_TEXTURE_SAMPLER_CACHE_TRUE 0x00000001 -+#define NVC3C0_QMDV02_02_INVALIDATE_TEXTURE_DATA_CACHE MW(188:188) -+#define NVC3C0_QMDV02_02_INVALIDATE_TEXTURE_DATA_CACHE_FALSE 0x00000000 -+#define NVC3C0_QMDV02_02_INVALIDATE_TEXTURE_DATA_CACHE_TRUE 0x00000001 -+#define NVC3C0_QMDV02_02_INVALIDATE_SHADER_DATA_CACHE MW(189:189) -+#define NVC3C0_QMDV02_02_INVALIDATE_SHADER_DATA_CACHE_FALSE 0x00000000 -+#define NVC3C0_QMDV02_02_INVALIDATE_SHADER_DATA_CACHE_TRUE 0x00000001 -+#define NVC3C0_QMDV02_02_INVALIDATE_INSTRUCTION_CACHE MW(190:190) -+#define NVC3C0_QMDV02_02_INVALIDATE_INSTRUCTION_CACHE_FALSE 0x00000000 -+#define NVC3C0_QMDV02_02_INVALIDATE_INSTRUCTION_CACHE_TRUE 0x00000001 -+#define NVC3C0_QMDV02_02_INVALIDATE_SHADER_CONSTANT_CACHE MW(191:191) -+#define NVC3C0_QMDV02_02_INVALIDATE_SHADER_CONSTANT_CACHE_FALSE 0x00000000 -+#define NVC3C0_QMDV02_02_INVALIDATE_SHADER_CONSTANT_CACHE_TRUE 0x00000001 -+#define NVC3C0_QMDV02_02_CTA_RASTER_WIDTH_RESUME MW(223:192) -+#define NVC3C0_QMDV02_02_CTA_RASTER_HEIGHT_RESUME MW(239:224) -+#define NVC3C0_QMDV02_02_CTA_RASTER_DEPTH_RESUME MW(255:240) -+#define NVC3C0_QMDV02_02_PROGRAM_OFFSET MW(287:256) -+#define NVC3C0_QMDV02_02_CIRCULAR_QUEUE_ADDR_LOWER MW(319:288) -+#define NVC3C0_QMDV02_02_CIRCULAR_QUEUE_ADDR_UPPER MW(327:320) -+#define NVC3C0_QMDV02_02_QMD_RESERVED_D MW(335:328) -+#define NVC3C0_QMDV02_02_CIRCULAR_QUEUE_ENTRY_SIZE MW(351:336) -+#define NVC3C0_QMDV02_02_CWD_REFERENCE_COUNT_ID MW(357:352) -+#define NVC3C0_QMDV02_02_CWD_REFERENCE_COUNT_DELTA_MINUS_ONE MW(365:358) -+#define NVC3C0_QMDV02_02_RELEASE_MEMBAR_TYPE MW(366:366) -+#define NVC3C0_QMDV02_02_RELEASE_MEMBAR_TYPE_FE_NONE 0x00000000 -+#define NVC3C0_QMDV02_02_RELEASE_MEMBAR_TYPE_FE_SYSMEMBAR 0x00000001 -+#define NVC3C0_QMDV02_02_CWD_REFERENCE_COUNT_INCR_ENABLE MW(367:367) -+#define NVC3C0_QMDV02_02_CWD_REFERENCE_COUNT_INCR_ENABLE_FALSE 0x00000000 -+#define NVC3C0_QMDV02_02_CWD_REFERENCE_COUNT_INCR_ENABLE_TRUE 0x00000001 -+#define NVC3C0_QMDV02_02_CWD_MEMBAR_TYPE MW(369:368) -+#define NVC3C0_QMDV02_02_CWD_MEMBAR_TYPE_L1_NONE 0x00000000 -+#define NVC3C0_QMDV02_02_CWD_MEMBAR_TYPE_L1_SYSMEMBAR 0x00000001 -+#define NVC3C0_QMDV02_02_CWD_MEMBAR_TYPE_L1_MEMBAR 0x00000003 -+#define NVC3C0_QMDV02_02_SEQUENTIALLY_RUN_CTAS MW(370:370) -+#define NVC3C0_QMDV02_02_SEQUENTIALLY_RUN_CTAS_FALSE 0x00000000 -+#define NVC3C0_QMDV02_02_SEQUENTIALLY_RUN_CTAS_TRUE 0x00000001 -+#define NVC3C0_QMDV02_02_CWD_REFERENCE_COUNT_DECR_ENABLE MW(371:371) -+#define NVC3C0_QMDV02_02_CWD_REFERENCE_COUNT_DECR_ENABLE_FALSE 0x00000000 -+#define NVC3C0_QMDV02_02_CWD_REFERENCE_COUNT_DECR_ENABLE_TRUE 0x00000001 -+#define NVC3C0_QMDV02_02_API_VISIBLE_CALL_LIMIT MW(378:378) -+#define NVC3C0_QMDV02_02_API_VISIBLE_CALL_LIMIT__32 0x00000000 -+#define NVC3C0_QMDV02_02_API_VISIBLE_CALL_LIMIT_NO_CHECK 0x00000001 -+#define NVC3C0_QMDV02_02_SAMPLER_INDEX MW(382:382) -+#define NVC3C0_QMDV02_02_SAMPLER_INDEX_INDEPENDENTLY 0x00000000 -+#define NVC3C0_QMDV02_02_SAMPLER_INDEX_VIA_HEADER_INDEX 0x00000001 -+#define NVC3C0_QMDV02_02_CTA_RASTER_WIDTH MW(415:384) -+#define NVC3C0_QMDV02_02_CTA_RASTER_HEIGHT MW(431:416) -+#define NVC3C0_QMDV02_02_QMD_RESERVED13A MW(447:432) -+#define NVC3C0_QMDV02_02_CTA_RASTER_DEPTH MW(463:448) -+#define NVC3C0_QMDV02_02_QMD_RESERVED14A MW(479:464) -+#define NVC3C0_QMDV02_02_DEPENDENT_QMD_POINTER MW(511:480) -+#define NVC3C0_QMDV02_02_QUEUE_ENTRIES_PER_CTA_MINUS_ONE MW(518:512) -+#define NVC3C0_QMDV02_02_COALESCE_WAITING_PERIOD MW(529:522) -+#define NVC3C0_QMDV02_02_SHARED_MEMORY_SIZE MW(561:544) -+#define NVC3C0_QMDV02_02_MIN_SM_CONFIG_SHARED_MEM_SIZE MW(568:562) -+#define NVC3C0_QMDV02_02_MAX_SM_CONFIG_SHARED_MEM_SIZE MW(575:569) -+#define NVC3C0_QMDV02_02_QMD_VERSION MW(579:576) -+#define NVC3C0_QMDV02_02_QMD_MAJOR_VERSION MW(583:580) -+#define NVC3C0_QMDV02_02_QMD_RESERVED_H MW(591:584) -+#define NVC3C0_QMDV02_02_CTA_THREAD_DIMENSION0 MW(607:592) -+#define NVC3C0_QMDV02_02_CTA_THREAD_DIMENSION1 MW(623:608) -+#define NVC3C0_QMDV02_02_CTA_THREAD_DIMENSION2 MW(639:624) -+#define NVC3C0_QMDV02_02_CONSTANT_BUFFER_VALID(i) MW((640+(i)*1):(640+(i)*1)) -+#define NVC3C0_QMDV02_02_CONSTANT_BUFFER_VALID_FALSE 0x00000000 -+#define NVC3C0_QMDV02_02_CONSTANT_BUFFER_VALID_TRUE 0x00000001 -+#define NVC3C0_QMDV02_02_REGISTER_COUNT_V MW(656:648) -+#define NVC3C0_QMDV02_02_TARGET_SM_CONFIG_SHARED_MEM_SIZE MW(663:657) -+#define NVC3C0_QMDV02_02_FREE_CTA_SLOTS_EMPTY_SM MW(671:664) -+#define NVC3C0_QMDV02_02_SM_DISABLE_MASK_LOWER MW(703:672) -+#define NVC3C0_QMDV02_02_SM_DISABLE_MASK_UPPER MW(735:704) -+#define NVC3C0_QMDV02_02_RELEASE0_ADDRESS_LOWER MW(767:736) -+#define NVC3C0_QMDV02_02_RELEASE0_ADDRESS_UPPER MW(775:768) -+#define NVC3C0_QMDV02_02_QMD_RESERVED_J MW(783:776) -+#define NVC3C0_QMDV02_02_RELEASE0_REDUCTION_OP MW(790:788) -+#define NVC3C0_QMDV02_02_RELEASE0_REDUCTION_OP_RED_ADD 0x00000000 -+#define NVC3C0_QMDV02_02_RELEASE0_REDUCTION_OP_RED_MIN 0x00000001 -+#define NVC3C0_QMDV02_02_RELEASE0_REDUCTION_OP_RED_MAX 0x00000002 -+#define NVC3C0_QMDV02_02_RELEASE0_REDUCTION_OP_RED_INC 0x00000003 -+#define NVC3C0_QMDV02_02_RELEASE0_REDUCTION_OP_RED_DEC 0x00000004 -+#define NVC3C0_QMDV02_02_RELEASE0_REDUCTION_OP_RED_AND 0x00000005 -+#define NVC3C0_QMDV02_02_RELEASE0_REDUCTION_OP_RED_OR 0x00000006 -+#define NVC3C0_QMDV02_02_RELEASE0_REDUCTION_OP_RED_XOR 0x00000007 -+#define NVC3C0_QMDV02_02_QMD_RESERVED_K MW(791:791) -+#define NVC3C0_QMDV02_02_RELEASE0_REDUCTION_FORMAT MW(793:792) -+#define NVC3C0_QMDV02_02_RELEASE0_REDUCTION_FORMAT_UNSIGNED_32 0x00000000 -+#define NVC3C0_QMDV02_02_RELEASE0_REDUCTION_FORMAT_SIGNED_32 0x00000001 -+#define NVC3C0_QMDV02_02_RELEASE0_REDUCTION_ENABLE MW(794:794) -+#define NVC3C0_QMDV02_02_RELEASE0_REDUCTION_ENABLE_FALSE 0x00000000 -+#define NVC3C0_QMDV02_02_RELEASE0_REDUCTION_ENABLE_TRUE 0x00000001 -+#define NVC3C0_QMDV02_02_RELEASE0_STRUCTURE_SIZE MW(799:799) -+#define NVC3C0_QMDV02_02_RELEASE0_STRUCTURE_SIZE_FOUR_WORDS 0x00000000 -+#define NVC3C0_QMDV02_02_RELEASE0_STRUCTURE_SIZE_ONE_WORD 0x00000001 -+#define NVC3C0_QMDV02_02_RELEASE0_PAYLOAD MW(831:800) -+#define NVC3C0_QMDV02_02_RELEASE1_ADDRESS_LOWER MW(863:832) -+#define NVC3C0_QMDV02_02_RELEASE1_ADDRESS_UPPER MW(871:864) -+#define NVC3C0_QMDV02_02_QMD_RESERVED_L MW(879:872) -+#define NVC3C0_QMDV02_02_RELEASE1_REDUCTION_OP MW(886:884) -+#define NVC3C0_QMDV02_02_RELEASE1_REDUCTION_OP_RED_ADD 0x00000000 -+#define NVC3C0_QMDV02_02_RELEASE1_REDUCTION_OP_RED_MIN 0x00000001 -+#define NVC3C0_QMDV02_02_RELEASE1_REDUCTION_OP_RED_MAX 0x00000002 -+#define NVC3C0_QMDV02_02_RELEASE1_REDUCTION_OP_RED_INC 0x00000003 -+#define NVC3C0_QMDV02_02_RELEASE1_REDUCTION_OP_RED_DEC 0x00000004 -+#define NVC3C0_QMDV02_02_RELEASE1_REDUCTION_OP_RED_AND 0x00000005 -+#define NVC3C0_QMDV02_02_RELEASE1_REDUCTION_OP_RED_OR 0x00000006 -+#define NVC3C0_QMDV02_02_RELEASE1_REDUCTION_OP_RED_XOR 0x00000007 -+#define NVC3C0_QMDV02_02_QMD_RESERVED_M MW(887:887) -+#define NVC3C0_QMDV02_02_RELEASE1_REDUCTION_FORMAT MW(889:888) -+#define NVC3C0_QMDV02_02_RELEASE1_REDUCTION_FORMAT_UNSIGNED_32 0x00000000 -+#define NVC3C0_QMDV02_02_RELEASE1_REDUCTION_FORMAT_SIGNED_32 0x00000001 -+#define NVC3C0_QMDV02_02_RELEASE1_REDUCTION_ENABLE MW(890:890) -+#define NVC3C0_QMDV02_02_RELEASE1_REDUCTION_ENABLE_FALSE 0x00000000 -+#define NVC3C0_QMDV02_02_RELEASE1_REDUCTION_ENABLE_TRUE 0x00000001 -+#define NVC3C0_QMDV02_02_RELEASE1_STRUCTURE_SIZE MW(895:895) -+#define NVC3C0_QMDV02_02_RELEASE1_STRUCTURE_SIZE_FOUR_WORDS 0x00000000 -+#define NVC3C0_QMDV02_02_RELEASE1_STRUCTURE_SIZE_ONE_WORD 0x00000001 -+#define NVC3C0_QMDV02_02_RELEASE1_PAYLOAD MW(927:896) -+#define NVC3C0_QMDV02_02_SHADER_LOCAL_MEMORY_LOW_SIZE MW(951:928) -+#define NVC3C0_QMDV02_02_QMD_RESERVED_N MW(954:952) -+#define NVC3C0_QMDV02_02_BARRIER_COUNT MW(959:955) -+#define NVC3C0_QMDV02_02_SHADER_LOCAL_MEMORY_HIGH_SIZE MW(983:960) -+#define NVC3C0_QMDV02_02_REGISTER_COUNT MW(991:984) -+#define NVC3C0_QMDV02_02_SHADER_LOCAL_MEMORY_CRS_SIZE MW(1015:992) -+#define NVC3C0_QMDV02_02_SASS_VERSION MW(1023:1016) -+#define NVC3C0_QMDV02_02_CONSTANT_BUFFER_ADDR_LOWER(i) MW((1055+(i)*64):(1024+(i)*64)) -+#define NVC3C0_QMDV02_02_CONSTANT_BUFFER_ADDR_UPPER(i) MW((1072+(i)*64):(1056+(i)*64)) -+#define NVC3C0_QMDV02_02_CONSTANT_BUFFER_RESERVED_ADDR(i) MW((1073+(i)*64):(1073+(i)*64)) -+#define NVC3C0_QMDV02_02_CONSTANT_BUFFER_INVALIDATE(i) MW((1074+(i)*64):(1074+(i)*64)) -+#define NVC3C0_QMDV02_02_CONSTANT_BUFFER_INVALIDATE_FALSE 0x00000000 -+#define NVC3C0_QMDV02_02_CONSTANT_BUFFER_INVALIDATE_TRUE 0x00000001 -+#define NVC3C0_QMDV02_02_CONSTANT_BUFFER_SIZE_SHIFTED4(i) MW((1087+(i)*64):(1075+(i)*64)) -+#define NVC3C0_QMDV02_02_PROGRAM_ADDRESS_LOWER MW(1567:1536) -+#define NVC3C0_QMDV02_02_PROGRAM_ADDRESS_UPPER MW(1584:1568) -+#define NVC3C0_QMDV02_02_QMD_RESERVED_S MW(1599:1585) -+#define NVC3C0_QMDV02_02_HW_ONLY_INNER_GET MW(1630:1600) -+#define NVC3C0_QMDV02_02_HW_ONLY_REQUIRE_SCHEDULING_PCAS MW(1631:1631) -+#define NVC3C0_QMDV02_02_HW_ONLY_INNER_PUT MW(1662:1632) -+#define NVC3C0_QMDV02_02_HW_ONLY_SCG_TYPE MW(1663:1663) -+#define NVC3C0_QMDV02_02_HW_ONLY_SPAN_LIST_HEAD_INDEX MW(1693:1664) -+#define NVC3C0_QMDV02_02_QMD_RESERVED_Q MW(1694:1694) -+#define NVC3C0_QMDV02_02_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID MW(1695:1695) -+#define NVC3C0_QMDV02_02_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID_FALSE 0x00000000 -+#define NVC3C0_QMDV02_02_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID_TRUE 0x00000001 -+#define NVC3C0_QMDV02_02_HW_ONLY_SKED_NEXT_QMD_POINTER MW(1727:1696) -+#define NVC3C0_QMDV02_02_QMD_SPARE_G MW(1759:1728) -+#define NVC3C0_QMDV02_02_QMD_SPARE_H MW(1791:1760) -+#define NVC3C0_QMDV02_02_QMD_SPARE_I MW(1823:1792) -+#define NVC3C0_QMDV02_02_QMD_SPARE_J MW(1855:1824) -+#define NVC3C0_QMDV02_02_QMD_SPARE_K MW(1887:1856) -+#define NVC3C0_QMDV02_02_QMD_SPARE_L MW(1919:1888) -+#define NVC3C0_QMDV02_02_QMD_SPARE_M MW(1951:1920) -+#define NVC3C0_QMDV02_02_QMD_SPARE_N MW(1983:1952) -+#define NVC3C0_QMDV02_02_DEBUG_ID_UPPER MW(2015:1984) -+#define NVC3C0_QMDV02_02_DEBUG_ID_LOWER MW(2047:2016) -+ -+ -+ -+#endif // #ifndef __CLC3C0QMD_H__ -diff --git a/src/gallium/drivers/nouveau/nvc0/drf.h b/src/gallium/drivers/nouveau/nvc0/drf.h -new file mode 100644 -index 00000000000..bf95c8c3185 ---- /dev/null -+++ b/src/gallium/drivers/nouveau/nvc0/drf.h -@@ -0,0 +1,119 @@ -+/* -+ * Copyright 2019 Red Hat Inc. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice shall be included in -+ * all copies or substantial portions of the Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -+ * OTHER DEALINGS IN THE SOFTWARE. -+ */ -+#ifndef __NVHW_DRF_H__ -+#define __NVHW_DRF_H__ -+ -+/* Helpers common to all DRF accessors. */ -+#define DRF_LO(drf) (0 ? drf) -+#define DRF_HI(drf) (1 ? drf) -+#define DRF_BITS(drf) (DRF_HI(drf) - DRF_LO(drf) + 1) -+#define DRF_MASK(drf) (~0ULL >> (64 - DRF_BITS(drf))) -+#define DRF_SMASK(drf) (DRF_MASK(drf) << DRF_LO(drf)) -+ -+/* Helpers for DRF-MW accessors. */ -+#define DRF_MX_MW(drf) drf -+#define DRF_MX(drf) DRF_MX_##drf -+#define DRF_MW(drf) DRF_MX(drf) -+#define DRF_MW_SPANS(o,drf) (DRF_LW_IDX((o),drf) != DRF_HW_IDX((o),drf)) -+#define DRF_MW_SIZE(o) (sizeof((o)[0]) * 8) -+ -+#define DRF_LW_IDX(o,drf) (DRF_LO(DRF_MW(drf)) / DRF_MW_SIZE(o)) -+#define DRF_LW_LO(o,drf) (DRF_LO(DRF_MW(drf)) % DRF_MW_SIZE(o)) -+#define DRF_LW_HI(o,drf) (DRF_MW_SPANS((o),drf) ? (DRF_MW_SIZE(o) - 1) : DRF_HW_HI((o),drf)) -+#define DRF_LW_BITS(o,drf) (DRF_LW_HI((o),drf) - DRF_LW_LO((o),drf) + 1) -+#define DRF_LW_MASK(o,drf) (~0ULL >> (64 - DRF_LW_BITS((o),drf))) -+#define DRF_LW_SMASK(o,drf) (DRF_LW_MASK((o),drf) << DRF_LW_LO((o),drf)) -+#define DRF_LW_GET(o,drf) (((o)[DRF_LW_IDX((o),drf)] >> DRF_LW_LO((o),drf)) & DRF_LW_MASK((o),drf)) -+#define DRF_LW_VAL(o,drf,v) (((v) & DRF_LW_MASK((o),drf)) << DRF_LW_LO((o),drf)) -+#define DRF_LW_CLR(o,drf) ((o)[DRF_LW_IDX((o),drf)] & ~DRF_LW_SMASK((o),drf)) -+#define DRF_LW_SET(o,drf,v) (DRF_LW_CLR((o),drf) | DRF_LW_VAL((o),drf,(v))) -+ -+#define DRF_HW_IDX(o,drf) (DRF_HI(DRF_MW(drf)) / DRF_MW_SIZE(o)) -+#define DRF_HW_LO(o,drf) 0 -+#define DRF_HW_HI(o,drf) (DRF_HI(DRF_MW(drf)) % DRF_MW_SIZE(o)) -+#define DRF_HW_BITS(o,drf) (DRF_HW_HI((o),drf) - DRF_HW_LO((o),drf) + 1) -+#define DRF_HW_MASK(o,drf) (~0ULL >> (64 - DRF_HW_BITS((o),drf))) -+#define DRF_HW_SMASK(o,drf) (DRF_HW_MASK((o),drf) << DRF_HW_LO((o),drf)) -+#define DRF_HW_GET(o,drf) ((o)[DRF_HW_IDX(o,drf)] & DRF_HW_SMASK((o),drf)) -+#define DRF_HW_VAL(o,drf,v) (((long long)(v) >> DRF_LW_BITS((o),drf)) & DRF_HW_SMASK((o),drf)) -+#define DRF_HW_CLR(o,drf) ((o)[DRF_HW_IDX((o),drf)] & ~DRF_HW_SMASK((o),drf)) -+#define DRF_HW_SET(o,drf,v) (DRF_HW_CLR((o),drf) | DRF_HW_VAL((o),drf,(v))) -+ -+/* DRF accessors. */ -+#define NVVAL_X(drf,v) (((v) & DRF_MASK(drf)) << DRF_LO(drf)) -+#define NVVAL_N(X,d,r,f, v) NVVAL_X(d##_##r##_##f, (v)) -+#define NVVAL_I(X,d,r,f,i,v) NVVAL_X(d##_##r##_##f(i), (v)) -+#define NVVAL_(X,_1,_2,_3,_4,_5,IMPL,...) IMPL -+#define NVVAL(A...) NVVAL_(X, ##A, NVVAL_I, NVVAL_N)(X, ##A) -+ -+#define NVDEF_N(X,d,r,f, v) NVVAL_X(d##_##r##_##f, d##_##r##_##f##_##v) -+#define NVDEF_I(X,d,r,f,i,v) NVVAL_X(d##_##r##_##f(i), d##_##r##_##f##_##v) -+#define NVDEF_(X,_1,_2,_3,_4,_5,IMPL,...) IMPL -+#define NVDEF(A...) NVDEF_(X, ##A, NVDEF_I, NVDEF_N)(X, ##A) -+ -+#define NVVAL_GET_X(o,drf) (((o) >> DRF_LO(drf)) & DRF_MASK(drf)) -+#define NVVAL_GET_N(X,o,d,r,f ) NVVAL_GET_X(o, d##_##r##_##f) -+#define NVVAL_GET_I(X,o,d,r,f,i) NVVAL_GET_X(o, d##_##r##_##f(i)) -+#define NVVAL_GET_(X,_1,_2,_3,_4,_5,IMPL,...) IMPL -+#define NVVAL_GET(A...) NVVAL_GET_(X, ##A, NVVAL_GET_I, NVVAL_GET_N)(X, ##A) -+ -+#define NVVAL_SET_X(o,drf,v) (((o) & ~DRF_SMASK(drf)) | NVVAL_X(drf, (v))) -+#define NVVAL_SET_N(X,o,d,r,f, v) NVVAL_SET_X(o, d##_##r##_##f, (v)) -+#define NVVAL_SET_I(X,o,d,r,f,i,v) NVVAL_SET_X(o, d##_##r##_##f(i), (v)) -+#define NVVAL_SET_(X,_1,_2,_3,_4,_5,_6,IMPL,...) IMPL -+#define NVVAL_SET(A...) NVVAL_SET_(X, ##A, NVVAL_SET_I, NVVAL_SET_N)(X, ##A) -+ -+#define NVDEF_SET_N(X,o,d,r,f, v) \ -+ NVVAL_SET_X(o, d##_##r##_##f, d##_##r##_##f##_##v) -+#define NVDEF_SET_I(X,o,d,r,f,i,v) \ -+ NVVAL_SET_X(o, d##_##r##_##f(i), d##_##r##_##f##_##v) -+#define NVDEF_SET_(X,_1,_2,_3,_4,_5,_6,IMPL,...) IMPL -+#define NVDEF_SET(A...) NVDEF_SET_(X, ##A, NVDEF_SET_I, NVDEF_SET_N)(X, ##A) -+ -+/* DRF-MW accessors. */ -+#define NVVAL_MW_GET_X(o,drf) \ -+ ((DRF_MW_SPANS((o),drf) ? \ -+ (DRF_HW_GET((o),drf) << DRF_LW_BITS((o),drf)) : 0) | DRF_LW_GET((o),drf)) -+#define NVVAL_MW_GET_N(X,o,d,r,f ) NVVAL_MW_GET_X((o), d##_##r##_##f) -+#define NVVAL_MW_GET_I(X,o,d,r,f,i) NVVAL_MW_GET_X((o), d##_##r##_##f(i)) -+#define NVVAL_MW_GET_(X,_1,_2,_3,_4,_5,IMPL,...) IMPL -+#define NVVAL_MW_GET(A...) NVVAL_MW_GET_(X, ##A, NVVAL_MW_GET_I, NVVAL_MW_GET_N)(X, ##A) -+ -+#define NVVAL_MW_SET_X(o,drf,v) do { \ -+ (o)[DRF_LW_IDX((o),drf)] = DRF_LW_SET((o),drf,(v)); \ -+ if (DRF_MW_SPANS((o),drf)) \ -+ (o)[DRF_HW_IDX((o),drf)] = DRF_HW_SET((o),drf,(v)); \ -+} while(0) -+#define NVVAL_MW_SET_N(X,o,d,r,f, v) NVVAL_MW_SET_X((o), d##_##r##_##f, (v)) -+#define NVVAL_MW_SET_I(X,o,d,r,f,i,v) NVVAL_MW_SET_X((o), d##_##r##_##f(i), (v)) -+#define NVVAL_MW_SET_(X,_1,_2,_3,_4,_5,_6,IMPL,...) IMPL -+#define NVVAL_MW_SET(A...) \ -+ NVVAL_MW_SET_(X, ##A, NVVAL_MW_SET_I, NVVAL_MW_SET_N)(X, ##A) -+ -+#define NVDEF_MW_SET_N(X,o,d,r,f, v) \ -+ NVVAL_MW_SET_X(o, d##_##r##_##f, d##_##r##_##f##_##v) -+#define NVDEF_MW_SET_I(X,o,d,r,f,i,v) \ -+ NVVAL_MW_SET_X(o, d##_##r##_##f(i), d##_##r##_##f##_##v) -+#define NVDEF_MW_SET_(X,_1,_2,_3,_4,_5,_6,IMPL,...) IMPL -+#define NVDEF_MW_SET(A...) \ -+ NVDEF_MW_SET_(X, ##A, NVDEF_MW_SET_I, NVDEF_MW_SET_N)(X, ##A) -+#endif -diff --git a/src/gallium/drivers/nouveau/nvc0/mme/comc597.mme.h b/src/gallium/drivers/nouveau/nvc0/mme/comc597.mme.h -new file mode 100644 -index 00000000000..390741cbd04 ---- /dev/null -+++ b/src/gallium/drivers/nouveau/nvc0/mme/comc597.mme.h -@@ -0,0 +1,904 @@ -+#define NV_MME_PRED_MODE_UUUU 0 -+#define NV_MME_PRED_MODE_TTTT 1 -+#define NV_MME_PRED_MODE_FFFF 2 -+#define NV_MME_PRED_MODE_TTUU 3 -+#define NV_MME_PRED_MODE_FFUU 4 -+#define NV_MME_PRED_MODE_TFUU 5 -+#define NV_MME_PRED_MODE_TUUU 6 -+#define NV_MME_PRED_MODE_FUUU 7 -+#define NV_MME_PRED_MODE_UUTT 8 -+#define NV_MME_PRED_MODE_UUTF 9 -+#define NV_MME_PRED_MODE_UUTU 10 -+#define NV_MME_PRED_MODE_UUFT 11 -+#define NV_MME_PRED_MODE_UUFF 12 -+#define NV_MME_PRED_MODE_UUFU 13 -+#define NV_MME_PRED_MODE_UUUT 14 -+#define NV_MME_PRED_MODE_UUUF 15 -+ -+#define NV_MME_REG_R0 0 -+#define NV_MME_REG_R1 1 -+#define NV_MME_REG_R2 2 -+#define NV_MME_REG_R3 3 -+#define NV_MME_REG_R4 4 -+#define NV_MME_REG_R5 5 -+#define NV_MME_REG_R6 6 -+#define NV_MME_REG_R7 7 -+#define NV_MME_REG_R8 8 -+#define NV_MME_REG_R9 9 -+#define NV_MME_REG_R10 10 -+#define NV_MME_REG_R11 11 -+#define NV_MME_REG_R12 12 -+#define NV_MME_REG_R13 13 -+#define NV_MME_REG_R14 14 -+#define NV_MME_REG_R15 15 -+#define NV_MME_REG_R16 16 -+#define NV_MME_REG_R17 17 -+#define NV_MME_REG_R18 18 -+#define NV_MME_REG_R19 19 -+#define NV_MME_REG_R20 20 -+#define NV_MME_REG_R21 21 -+#define NV_MME_REG_R22 22 -+#define NV_MME_REG_R23 23 -+#define NV_MME_REG_ZERO 24 -+#define NV_MME_REG_IMMED 25 -+#define NV_MME_REG_IMMEDPAIR 26 -+#define NV_MME_REG_IMMED32 27 -+#define NV_MME_REG_LOAD0 28 -+#define NV_MME_REG_LOAD1 29 -+ -+#define NV_MME_ALU_ADD 0 -+#define NV_MME_ALU_ADDC 1 -+#define NV_MME_ALU_SUB 2 -+#define NV_MME_ALU_SUBB 3 -+#define NV_MME_ALU_MUL 4 -+#define NV_MME_ALU_MULH 5 -+#define NV_MME_ALU_MULU 6 -+#define NV_MME_ALU_EXTENDED 7 -+#define NV_MME_ALU_CLZ 8 -+#define NV_MME_ALU_SLL 9 -+#define NV_MME_ALU_SRL 10 -+#define NV_MME_ALU_SRA 11 -+#define NV_MME_ALU_AND 12 -+#define NV_MME_ALU_NAND 13 -+#define NV_MME_ALU_OR 14 -+#define NV_MME_ALU_XOR 15 -+#define NV_MME_ALU_MERGE 16 -+#define NV_MME_ALU_SLT 17 -+#define NV_MME_ALU_SLTU 18 -+#define NV_MME_ALU_SLE 19 -+#define NV_MME_ALU_SLEU 20 -+#define NV_MME_ALU_SEQ 21 -+#define NV_MME_ALU_STATE 22 -+#define NV_MME_ALU_LOOP 23 -+#define NV_MME_ALU_JAL 24 -+#define NV_MME_ALU_BLT 25 -+#define NV_MME_ALU_BLTU 26 -+#define NV_MME_ALU_BLE 27 -+#define NV_MME_ALU_BLEU 28 -+#define NV_MME_ALU_BEQ 29 -+#define NV_MME_ALU_DREAD 30 -+#define NV_MME_ALU_DWRITE 31 -+ -+#define NV_MME_OUT_NONE 0 -+#define NV_MME_OUT_ALU0 1 -+#define NV_MME_OUT_ALU1 2 -+#define NV_MME_OUT_LOAD0 3 -+#define NV_MME_OUT_LOAD1 4 -+#define NV_MME_OUT_IMMED0 5 -+#define NV_MME_OUT_IMMED1 6 -+#define NV_MME_OUT_RESERVED 7 -+#define NV_MME_OUT_IMMEDHIGH0 8 -+#define NV_MME_OUT_IMMEDHIGH1 9 -+#define NV_MME_OUT_IMMED32_0 10 -+ -+#define MME_BITS(en,pm,pr,o0,d0,a0,b0,i0,o1,d1,a1,b1,i1,m0,e0,m1,e1) \ -+ ((e1) << (92 - 64) | (m1) << (89 - 64) | \ -+ (e0) << (85 - 64) | (m0) << (82 - 64) | \ -+ (i1) << (66 - 64) | (b1) >> (64 - 61)), \ -+ (((b1) & 7) << (61 - 32) | (a1) << (56 - 32) | \ -+ (d1) << (51 - 32) | (o1) << (46 - 32) | \ -+ (i0) >> (32 - 30)), \ -+ (((i0) & 3) << 30 | (b0) << 25 | (a0) << 20 | (d0) << 15 | (o0) << 10 | \ -+ (pr) << 5 | (pm) << 1 | (en)) -+ -+#define MME_INSN(en,o0,d0,a0,b0,i0,m0,e0,o1,d1,a1,b1,i1,m1,e1) \ -+ MME_BITS((en), NV_MME_PRED_MODE_UUUU, NV_MME_REG_ZERO, \ -+ NV_MME_ALU_##o0, NV_MME_REG_##d0, \ -+ NV_MME_REG_##a0, NV_MME_REG_##b0, (i0), \ -+ NV_MME_ALU_##o1, NV_MME_REG_##d1, \ -+ NV_MME_REG_##a1, NV_MME_REG_##b1, (i1), \ -+ NV_MME_OUT_##m0, NV_MME_OUT_##e0, \ -+ NV_MME_OUT_##m1, NV_MME_OUT_##e1) -+ -+uint32_t mmec597_per_instance_bf[] = { -+// r1 = load(); // count -+// r3 = load(); // mask -+// mthd(0x1880, 1); // VERTEX_ARRAY_PER_INSTANCE[0] -+ MME_INSN(0, ADD, R1, LOAD0, ZERO, (1<<12)|0x1880/4, IMMED0, NONE, -+ ADD, R3, LOAD1, ZERO, 0, NONE, NONE), -+// while (HW_LOOP_COUNT < r1) { -+// send(r3 & 1); -+// r3 >>= 1; -+// } -+ MME_INSN(0, LOOP, ZERO, R1, ZERO, 0x0003, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(0, AND, ZERO, R3, IMMED, 1, NONE, ALU0, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(0, SRL, R3, R3, IMMED, 1, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(1, ADD, ZERO, ZERO, ZERO, 0, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(0, ADD, ZERO, ZERO, ZERO, 0, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+}; -+ -+uint32_t mmec597_vertex_array_select[] = { -+// r1 = load(); // array -+// r2 = load(); // limit hi -+// r3 = load(); // limit lo -+// r4 = load(); // start hi -+// r5 = load(); // start lo -+// r6 = (r1 & 0x1f) << 2; -+// r7 = (r1 & 0x1f) << 1; -+// mthd(0x1c04 + r6, 1); // VERTEX_ARRAY_START_HIGH[] -+// send(r4); -+// send(r5); -+// mthd(0x0600 + r7, 1); // VERTEX_ARRAY_LIMIT_HIGH[] -+// send(r2); -+// send(r3); -+ MME_INSN(0, ADD, R1, LOAD0, ZERO, 0, NONE, NONE, -+ ADD, R2, LOAD1, ZERO, 0, NONE, NONE), -+ MME_INSN(0, ADD, R3, LOAD0, ZERO, 0, NONE, NONE, -+ ADD, R4, LOAD1, ZERO, 0, NONE, NONE), -+ MME_INSN(0, ADD, R5, LOAD0, ZERO, 0, NONE, NONE, -+ MERGE, R6, ZERO, R1, (2<<10)|(5<<5)|0, NONE, NONE), -+ MME_INSN(0, MERGE, R7, ZERO, R1, (1<<10)|(5<<5)|0, ALU1, NONE, -+ ADD, ZERO, R6, IMMED, (1<<12)|0x1c04/4, NONE, NONE), -+ MME_INSN(0, ADD, ZERO, R4, ZERO, 0, NONE, ALU0, -+ ADD, ZERO, R5, ZERO, 0, NONE, ALU1), -+ MME_INSN(1, ADD, ZERO, R7, IMMED, (1<<12)|0x0600/4, ALU0, ALU1, -+ ADD, ZERO, R2, ZERO, 0, NONE, NONE), -+ MME_INSN(0, ADD, ZERO, R3, ZERO, 0, NONE, ALU0, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+}; -+ -+uint32_t mmec597_blend_enables[] = { -+// r1 = load(); // enable mask -+// mthd(0x1360, 1); // NVC0_3D_BLEND_ENABLE[] -+// send((r1 >> 0) & 1); -+// send((r1 >> 1) & 1); -+// send((r1 >> 2) & 1); -+// send((r1 >> 3) & 1); -+// send((r1 >> 4) & 1); -+// send((r1 >> 5) & 1); -+// send((r1 >> 6) & 1); -+// send((r1 >> 7) & 1); -+ MME_INSN(0, ADD, R1, LOAD0, ZERO, 0, IMMED1, NONE, -+ ADD, ZERO, ZERO, ZERO, (1<<12)|0x1360/4, NONE, NONE), -+ MME_INSN(0, MERGE, ZERO, ZERO, R1, (0<<10)|(1<<5)|0, NONE, ALU0, -+ MERGE, ZERO, ZERO, R1, (0<<10)|(1<<5)|1, NONE, ALU1), -+ MME_INSN(0, MERGE, ZERO, ZERO, R1, (0<<10)|(1<<5)|2, NONE, ALU0, -+ MERGE, ZERO, ZERO, R1, (0<<10)|(1<<5)|3, NONE, ALU1), -+ MME_INSN(1, MERGE, ZERO, ZERO, R1, (0<<10)|(1<<5)|4, NONE, ALU0, -+ MERGE, ZERO, ZERO, R1, (0<<10)|(1<<5)|5, NONE, ALU1), -+ MME_INSN(0, MERGE, ZERO, ZERO, R1, (0<<10)|(1<<5)|6, NONE, ALU0, -+ MERGE, ZERO, ZERO, R1, (0<<10)|(1<<5)|7, NONE, ALU1), -+}; -+ -+uint32_t mmec597_poly_mode_front[] = { -+// r1 = load(); -+// mthd(0x0dac,0); // POLYGON_MODE_FRONT -+// send(r1); -+// r2 = read(0x0db0); // POLYGON_MODE_BACK -+// r3 = read(0x20c0); // SP_SELECT[3] -+// r7 = r1 | r2; -+// r4 = read(0x2100); // SP_SELECT[4] -+// r6 = 0x60; -+// r7 = r7 & 1; -+// if (r7 != 0) -+ MME_INSN(0, ADD, R1, LOAD0, ZERO, (0<<12)|0x0dac/4, IMMED0, ALU0, -+ STATE, R2, IMMED, ZERO, 0x0db0/4, NONE, NONE), -+ MME_INSN(0, STATE, R3, IMMED, ZERO, 0x20c0/4, NONE, NONE, -+ OR, R7, R1, R2, 0, NONE, NONE), -+ MME_INSN(0, STATE, R4, IMMED, ZERO, 0x2100/4, NONE, NONE, -+ ADD, R6, IMMED, ZERO, 0x60, NONE, NONE), -+ MME_INSN(0, AND, R7, R7, IMMED, 1, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(0, BEQ, ZERO, R7, ZERO, (2<<14)|0x0002, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// r6 = 0x200; -+ MME_INSN(0, ADD, R6, IMMED, ZERO, 0x200, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// r7 = r3 | r4; -+// r7 = r7 & 1; -+// if (r7 != 0) -+ MME_INSN(0, OR, R7, R3, R4, 0, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(0, AND, R7, R7, IMMED, 1, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(0, BEQ, ZERO, R7, ZERO, (2<<14)|0x0002, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// r6 = 0; -+ MME_INSN(0, ADD, R6, ZERO, ZERO, 0, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// mthd(0x02ec, 0); -+// send(r6); -+ MME_INSN(1, ADD, ZERO, ZERO, ZERO, (0<<12)|0x02ec/4, IMMED0, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(0, ADD, ZERO, R6, ZERO, 0, NONE, ALU0, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+}; -+ -+uint32_t mmec597_poly_mode_back[] = { -+// r1 = load(); -+// mthd(0x0db0,0); // POLYGON_MODE_BACK -+// send(r1); -+// r2 = read(0x0dac); // POLYGON_MODE_FRONT -+// r3 = read(0x20c0); // SP_SELECT[3] -+// r7 = r1 | r2; -+// r4 = read(0x2100); // SP_SELECT[4] -+// r6 = 0x60; -+// r7 = r7 & 1; -+// if (r7 != 0) -+ MME_INSN(0, ADD, R1, LOAD0, ZERO, (0<<12)|0x0db0/4, IMMED0, ALU0, -+ STATE, R2, IMMED, ZERO, 0x0dac/4, NONE, NONE), -+ MME_INSN(0, STATE, R3, IMMED, ZERO, 0x20c0/4, NONE, NONE, -+ OR, R7, R1, R2, 0, NONE, NONE), -+ MME_INSN(0, STATE, R4, IMMED, ZERO, 0x2100/4, NONE, NONE, -+ ADD, R6, IMMED, ZERO, 0x60, NONE, NONE), -+ MME_INSN(0, AND, R7, R7, IMMED, 1, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(0, BEQ, ZERO, R7, ZERO, (2<<14)|0x0002, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// r6 = 0x200; -+ MME_INSN(0, ADD, R6, IMMED, ZERO, 0x200, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// r7 = r3 | r4; -+// r7 = r7 & 1; -+// if (r7 != 0) -+ MME_INSN(0, OR, R7, R3, R4, 0, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(0, AND, R7, R7, IMMED, 1, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(0, BEQ, ZERO, R7, ZERO, (2<<14)|0x0002, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// r6 = 0; -+ MME_INSN(0, ADD, R6, ZERO, ZERO, 0, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// mthd(0x02ec, 0); -+// send(r6); -+ MME_INSN(1, ADD, ZERO, ZERO, ZERO, (0<<12)|0x02ec/4, IMMED0, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(0, ADD, ZERO, R6, ZERO, 0, NONE, ALU0, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+}; -+ -+uint32_t mmec597_gp_select[] = { -+// r1 = load(); -+// mthd(0x2100,0); // SP_SELECT[4] -+// send(r1); -+// r2 = read(0x0dac); // POLYGON_MODE_FRONT -+// r3 = read(0x0db0); // POLYGON_MODE_BACK -+// r7 = r2 | r3; -+// r4 = read(0x20c0); // SP_SELECT[3] -+// r6 = 0x60; -+// r7 = r7 & 1; -+// if (r7 != 0) -+ MME_INSN(0, ADD, R1, LOAD0, ZERO, (0<<12)|0x2100/4, IMMED0, ALU0, -+ STATE, R2, IMMED, ZERO, 0x0dac/4, NONE, NONE), -+ MME_INSN(0, STATE, R3, IMMED, ZERO, 0x0db0/4, NONE, NONE, -+ OR, R7, R2, R3, 0, NONE, NONE), -+ MME_INSN(0, STATE, R4, IMMED, ZERO, 0x20c0/4, NONE, NONE, -+ ADD, R6, IMMED, ZERO, 0x60, NONE, NONE), -+ MME_INSN(0, AND, R7, R7, IMMED, 1, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(0, BEQ, ZERO, R7, ZERO, (2<<14)|0x0002, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// r6 = 0x200; -+ MME_INSN(0, ADD, R6, IMMED, ZERO, 0x200, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// r7 = r1 | r4; -+// r7 = r7 & 1; -+// if (r7 != 0) -+ MME_INSN(0, OR, R7, R1, R4, 0, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(0, AND, R7, R7, IMMED, 1, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(0, BEQ, ZERO, R7, ZERO, (2<<14)|0x0002, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// r6 = 0; -+ MME_INSN(0, ADD, R6, ZERO, ZERO, 0, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// mthd(0x02ec, 0); -+// send(r6); -+ MME_INSN(1, ADD, ZERO, ZERO, ZERO, (0<<12)|0x02ec/4, IMMED0, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(0, ADD, ZERO, R6, ZERO, 0, NONE, ALU0, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+}; -+ -+uint32_t mmec597_tep_select[] = { -+// r1 = load(); -+// mthd(0x20c0,0); // SP_SELECT[3] -+// send(r1); -+// r2 = read(0x0dac); // POLYGON_MODE_FRONT -+// r3 = read(0x0db0); // POLYGON_MODE_BACK -+// r7 = r2 | r3; -+// r4 = read(0x2100); // SP_SELECT[4] -+// r6 = 0x60; -+// r7 = r7 & 1; -+// if (r7 != 0) -+ MME_INSN(0, ADD, R1, LOAD0, ZERO, (0<<12)|0x20c0/4, IMMED0, ALU0, -+ STATE, R2, IMMED, ZERO, 0x0dac/4, NONE, NONE), -+ MME_INSN(0, STATE, R3, IMMED, ZERO, 0x0db0/4, NONE, NONE, -+ OR, R7, R2, R3, 0, NONE, NONE), -+ MME_INSN(0, STATE, R4, IMMED, ZERO, 0x2100/4, NONE, NONE, -+ ADD, R6, IMMED, ZERO, 0x60, NONE, NONE), -+ MME_INSN(0, AND, R7, R7, IMMED, 1, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(0, BEQ, ZERO, R7, ZERO, (2<<14)|0x0002, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// r6 = 0x200; -+ MME_INSN(0, ADD, R6, IMMED, ZERO, 0x200, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// r7 = r1 | r4; -+// r7 = r7 & 1; -+// if (r7 != 0) -+ MME_INSN(0, OR, R7, R1, R4, 0, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(0, AND, R7, R7, IMMED, 1, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(0, BEQ, ZERO, R7, ZERO, (2<<14)|0x0002, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// r6 = 0; -+ MME_INSN(0, ADD, R6, ZERO, ZERO, 0, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// mthd(0x02ec, 0); -+// send(r6); -+ MME_INSN(1, ADD, ZERO, ZERO, ZERO, (0<<12)|0x02ec/4, IMMED0, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(0, ADD, ZERO, R6, ZERO, 0, NONE, ALU0, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+}; -+ -+uint32_t mmec597_draw_arrays_indirect[] = { -+// r1 = load(); // mode -+// r5 = read(0x1438); // VB_INSTANCE_BASE -+// r6 = load(); // start_drawid -+// r7 = load(); // numparams -+ MME_INSN(0, ADD, R1, LOAD0, ZERO, 0, NONE, NONE, -+ ADD, R6, LOAD1, ZERO, 0, NONE, NONE), -+ MME_INSN(0, ADD, R7, LOAD0, ZERO, 0, NONE, NONE, -+ STATE, R5, IMMED, ZERO, 0x1438/4, NONE, NONE), -+// while (HW_LOOP_COUNT < r7) { -+// r2 = load(); // count -+// r3 = load(); // instance_count -+// mthd(0x0d74, 0); // VERTEX_BUFFER_FIRST -+// send(load()); // start -+// r4 = load(); // start_instance -+// if (r3) { -+ MME_INSN(0, LOOP, ZERO, R7, ZERO, 0x000c, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(0, ADD, R2, LOAD0, ZERO, 0x0d74/4, IMMED0, NONE, -+ ADD, R3, LOAD1, ZERO, 0, NONE, NONE), -+ MME_INSN(0, ADD, ZERO, LOAD0, ZERO, 0, NONE, ALU0, -+ ADD, R4, LOAD1, ZERO, 0, NONE, NONE), -+ MME_INSN(0, BEQ, ZERO, R3, ZERO, (2<<14)|0x0008, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// mthd(0x238c, 1); // CB_POS -+// send(256 + 160); -+// send(0); // base_vertex -+// send(r4); // start_instance -+// send(r6); // draw id -+// mthd(0x1438, 0); // VB_INSTANCE_BASE -+// send(r4); -+// r1 = r1 & ~(1<<26); // clear INSTANCE_NEXT -+ MME_INSN(0, ADD, ZERO, ZERO, ZERO, (1<<12)|0x238c/4, IMMED0, IMMED1, -+ ADD, ZERO, ZERO, ZERO, 256 + 160, NONE, ALU0), -+ MME_INSN(0, ADD, ZERO, R4, ZERO, 0, NONE, ALU0, -+ ADD, ZERO, R6, ZERO, 0, NONE, ALU1), -+ MME_INSN(0, ADD, ZERO, R4, ZERO, 0x1438/4, IMMED0, ALU0, -+ MERGE, R1, R1, ZERO, (26<<10)|(1<<5)|0, NONE, NONE), -+// do { -+// mthd(0x1618, 0); // VERTEX_BEGIN_GL -+// send(r1); // mode -+// mthd(0x0d78, 0); // VERTEX_BUFFER_COUNT -+// send(r2); // count -+// mthd(0x1614, 0); // VERTEX_END_GL -+// send(0); -+// r1 |= (1<<26); // set INSTANCE_NEXT -+// } while(--r3); -+// } -+ MME_INSN(0, ADD, ZERO, R1, ZERO, 0x1618/4, IMMED0, ALU0, -+ ADD, ZERO, R2, ZERO, 0x0d78/4, IMMED1, ALU1), -+ MME_INSN(0, ADD, ZERO, ZERO, ZERO, 0x1614/4, IMMED0, ALU0, -+ ADD, R4, IMMED, ZERO, 1, NONE, NONE), -+ MME_INSN(0, MERGE, R1, R1, R4, (26<<10)|(1<<5)|0, NONE, NONE, -+ SUB, R3, R3, IMMED, 1, NONE, NONE), -+ MME_INSN(0, BEQ, ZERO, R3, ZERO, (1<<14)|0x3ffd, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// r6 = r6 + 1; -+// }; -+ MME_INSN(0, ADD, R6, R6, IMMED, 1, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// mthd(0x1438, 0); // restore VB_INSTANCE_BASE -+// send(r5); -+ MME_INSN(1, ADD, ZERO, ZERO, ZERO, 0x1438/4, IMMED0, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(0, ADD, ZERO, R5, ZERO, 0, NONE, ALU0, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+}; -+ -+uint32_t mmec597_draw_elts_indirect[] = { -+// r1 = load(); // mode -+// r8 = read(0x1434); // VB_ELEMENT_BASE -+// r9 = read(0x1438); // VB_INSTANCE_BASE -+// r6 = load(); // start_drawid -+// r7 = load(); // numparams -+ MME_INSN(0, ADD, R1, LOAD0, ZERO, 0, NONE, NONE, -+ STATE, R8, IMMED, ZERO, 0x1434/4, NONE, NONE), -+ MME_INSN(0, STATE, R9, IMMED, ZERO, 0x1438/4, NONE, NONE, -+ ADD, R6, LOAD0, ZERO, 0, NONE, NONE), -+ MME_INSN(0, ADD, R7, LOAD0, ZERO, 0, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// while (HW_LOOP_COUNT < r7) { -+// r3 = load(); // count -+// r2 = load(); // instance_count -+// mthd(0x17dc, 0); // INDEX_BATCH_FIRST -+// send(load()); // start -+// r4 = load(); // index_bias -+// mthd(0x238c, 1); // CB_POS -+// send(256 + 160); -+// send(r4); // index_bias -+// r5 = load(); // start_instance -+// if (r2) { -+ MME_INSN(0, LOOP, ZERO, R7, ZERO, 0x000d, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(0, ADD, R3, LOAD0, ZERO, 0x17dc/4, IMMED0, NONE, -+ ADD, R2, LOAD1, ZERO, 0, NONE, NONE), -+ MME_INSN(0, ADD, ZERO, LOAD0, ZERO, 0, NONE, ALU0, -+ ADD, R4, LOAD1, ZERO, 0, NONE, NONE), -+ MME_INSN(0, ADD, ZERO, ZERO, ZERO, (1<<12)|0x238c/4, IMMED0, IMMED1, -+ ADD, ZERO, R4, ZERO, 256 + 160, NONE, ALU1), -+ MME_INSN(0, BEQ, ZERO, R2, ZERO, (2<<14)|0x0008, NONE, NONE, -+ ADD, R5, LOAD0, ZERO, 0, NONE, NONE), -+// send(r5); // start_instance -+// send(r6); // draw_id -+// mthd(0x1434, 1); // VB_ELEMENT_BASE -+// send(r4); // index_bias -+// send(r5); // start_instance -+// mthd(0x1118, 0); // VERTEX_ID_BASE -+// send(r4); // index_bias -+// r1 &= ~(1 << 26); // clear INSTANCE_NEXT -+ MME_INSN(0, ADD, ZERO, R5, ZERO, 0, NONE, ALU0, -+ ADD, ZERO, R6, ZERO, 0, NONE, ALU1), -+ MME_INSN(0, ADD, ZERO, R4, ZERO, (1<<12)|0x1434/4, IMMED0, ALU0, -+ ADD, ZERO, R5, ZERO, 0, NONE, ALU1), -+ MME_INSN(0, ADD, ZERO, R4, ZERO, 0x1118/4, IMMED0, ALU0, -+ MERGE, R1, R1, ZERO, (26<<10)|(1<<5)|0, NONE, NONE), -+// do { -+// mthd(0x1618, 0); // VERTEX_BEGIN_GL -+// send(r1); // mode -+// mthd(0x17e0, 0); // INDEX_BATCH_COUNT -+// send(r3); // count -+// mthd(0x1614, 0); // VERTEX_END_GL -+// send(0); -+// r1 |= (1 << 26); // set INSTANCE_NEXT -+// } while (--r2); -+// } -+ MME_INSN(0, ADD, ZERO, R1, ZERO, 0x1618/4, IMMED0, ALU0, -+ ADD, ZERO, R3, ZERO, 0x17e0/4, IMMED1, ALU1), -+ MME_INSN(0, ADD, ZERO, ZERO, ZERO, 0x1614/4, IMMED0, ALU0, -+ ADD, R4, IMMED, ZERO, 1, NONE, NONE), -+ MME_INSN(0, MERGE, R1, R1, R4, (26<<10)|(1<<5)|0, NONE, NONE, -+ SUB, R2, R2, IMMED, 1, NONE, NONE), -+ MME_INSN(0, BEQ, ZERO, R2, ZERO, (1<<14)|0x3ffd, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// r6 = r6 + 1; -+// }; -+ MME_INSN(0, ADD, R6, R6, IMMED, 1, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// mthd(0x1434, 1); -+// send(r8); // restore VB_ELEMENT_BASE -+// send(r9); // restore VB_INSTANCE_BASE -+// mthd(0x1118, 0); -+// send(r8); // restore VERTEX_ID_BASE -+ MME_INSN(1, ADD, ZERO, R8, ZERO, (1<<12)|0x1434/4, IMMED0, ALU0, -+ ADD, ZERO, R9, ZERO, 0, NONE, ALU1), -+ MME_INSN(0, ADD, ZERO, R8, ZERO, 0x1118/4, IMMED0, ALU0, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+}; -+ -+uint32_t mmec597_draw_arrays_indirect_count[] = { -+// r1 = load(); // mode -+// r6 = load(); // start_drawid -+// r7 = load(); // numparams -+// r5 = load(); // totaldraws -+// r8 = read(0x1438); // VB_INSTANCE_BASE -+// r5 = r5 - r6; // remaining draws -+// if (r5 > r7) -+ MME_INSN(0, ADD, R1, LOAD0, ZERO, 0, NONE, NONE, -+ ADD, R6, LOAD1, ZERO, 0, NONE, NONE), -+ MME_INSN(0, ADD, R7, LOAD0, ZERO, 0, NONE, NONE, -+ ADD, R5, LOAD1, ZERO, 0, NONE, NONE), -+ MME_INSN(0, STATE, R8, IMMED, ZERO, 0x1438/4, NONE, NONE, -+ SUB, R5, R5, R6, 0, NONE, NONE), -+ MME_INSN(0, BLE, ZERO, R5, R7, (2<<14)|0x0002, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// r5 = r7; -+ MME_INSN(0, ADD, R5, R7, ZERO, 0, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// if (r5 >= 0) { -+ MME_INSN(0, BLT, ZERO, R5, ZERO, (2<<14)|0x000e, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// while (HW_LOOP_COUNT < r5) { -+// r2 = load(); // count -+// r3 = load(); // instance_count -+// mthd(0x0d74, 0); // VERTEX_BUFFER_FIRST -+// send(load()); // start -+// r4 = load(); // start_instance -+// if (r3) { -+ MME_INSN(0, LOOP, ZERO, R5, ZERO, 0x000c, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(0, ADD, R2, LOAD0, ZERO, 0x0d74/4, IMMED0, NONE, -+ ADD, R3, LOAD1, ZERO, 0, NONE, NONE), -+ MME_INSN(0, ADD, ZERO, LOAD0, ZERO, 0, NONE, ALU0, -+ ADD, R4, LOAD1, ZERO, 0, NONE, NONE), -+ MME_INSN(0, BEQ, ZERO, R3, ZERO, (2<<14)|0x0008, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// mthd(0x238c, 1); // CB_POS -+// send(256 + 160); -+// send(0); // base_vertex -+// send(r4); // start_instance -+// send(r6); // draw_id -+// mthd(0x1438, 0); // VB_INSTANCE_BASE -+// send(r4); -+// r1 &= ~(1 << 26); // clear INSTANCE_NEXT -+ MME_INSN(0, ADD, ZERO, ZERO, ZERO, (1<<12)|0x238c/4, IMMED0, IMMED1, -+ ADD, ZERO, ZERO, ZERO, 256+160, NONE, ALU0), -+ MME_INSN(0, ADD, ZERO, R4, ZERO, 0, NONE, ALU0, -+ ADD, ZERO, R6, ZERO, 0, NONE, ALU1), -+ MME_INSN(0, ADD, ZERO, R4, ZERO, 0x1438/4, IMMED0, ALU0, -+ MERGE, R1, R1, ZERO, (26<<10)|(1<<5)|0, NONE, NONE), -+// do { -+// mthd(0x1618, 0); // VERTEX_BEGIN_GL -+// send(r1); // mode -+// mthd(0x0d78, 0); // VERTEX_BUFFER_COUNT -+// send(r2); -+// mthd(0x1614, 0); // VERTEX_END_GL -+// send(0); -+// r1 |= (1 << 26); // set INSTANCE_NEXT -+// } while (--r3); -+// } -+ MME_INSN(0, ADD, ZERO, R1, ZERO, 0x1618/4, IMMED0, ALU0, -+ ADD, ZERO, R2, ZERO, 0x0d78/4, IMMED1, ALU1), -+ MME_INSN(0, ADD, ZERO, ZERO, ZERO, 0x1614/4, IMMED0, ALU0, -+ ADD, R4, IMMED, ZERO, 1, NONE, NONE), -+ MME_INSN(0, MERGE, R1, R1, R4, (26<<10)|(1<<5)|0, NONE, NONE, -+ SUB, R3, R3, IMMED, 1, NONE, NONE), -+ MME_INSN(0, BEQ, ZERO, R3, ZERO, (1<<14)|0x3ffd, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// r6 = r6 + 1; // draw_id++ -+// } -+ MME_INSN(0, ADD, R6, R6, IMMED, 1, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// r7 = r7 - r5; // unneeded params -+// } -+ MME_INSN(0, SUB, R7, R7, R5, 0, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// while (HW_LOOP_COUNT < r7) { -+// load(); -+// load(); -+// load(); -+// load(); -+// } -+ MME_INSN(0, LOOP, ZERO, R7, ZERO, 0x0003, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(0, ADD, ZERO, LOAD0, ZERO, 0, NONE, NONE, -+ ADD, ZERO, LOAD1, ZERO, 0, NONE, NONE), -+ MME_INSN(0, ADD, ZERO, LOAD0, ZERO, 0, NONE, NONE, -+ ADD, ZERO, LOAD1, ZERO, 0, NONE, NONE), -+// exit mthd(0x1438, 0); // VB_INSTANCE_BASE -+// send(r8); -+ MME_INSN(1, ADD, ZERO, ZERO, ZERO, 0x1438/4, IMMED0, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(0, ADD, ZERO, R8, ZERO, 0, NONE, ALU0, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+}; -+ -+uint32_t mmec597_draw_elts_indirect_count[] = { -+// r8 = read(0x1434); -+// r1 = load(); -+// r9 = read(0x1438); -+// r6 = load(); -+// r7 = load(); -+// r5 = load(); -+// r5 = r5 - r6; -+// if (r5 > r7) -+ MME_INSN(0, STATE, R8, IMMED, ZERO, 0x1434/4, NONE, NONE, -+ ADD, R1, LOAD0, ZERO, 0, NONE, NONE), -+ MME_INSN(0, STATE, R9, IMMED, ZERO, 0x1438/4, NONE, NONE, -+ ADD, R6, LOAD0, ZERO, 0, NONE, NONE), -+ MME_INSN(0, ADD, R7, LOAD0, ZERO, 0, NONE, NONE, -+ ADD, R5, LOAD1, ZERO, 0, NONE, NONE), -+ MME_INSN(0, SUB, R5, R5, R6, 0, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(0, BLE, ZERO, R5, R7, (2<<14)|0x0002, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// r5 = r7; -+ MME_INSN(0, ADD, R5, R7, ZERO, 0, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// if (r5 >= 0) { -+ MME_INSN(0, BLT, ZERO, R5, ZERO, (2<<14)|0x000f, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// while (HW_LOOP_COUNT < r5) { -+// r3 = load(); -+// r2 = load(); -+// mthd(0x17dc, 0); -+// send(load()); -+// r4 = load(); -+// mthd(0x238c, 1); -+// send(256 + 160); -+// send(r4); -+// r10 = load(); -+// if (r2) { -+ MME_INSN(0, LOOP, ZERO, R5, ZERO, 0x000d, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(0, ADD, R3, LOAD0, ZERO, (0<<12)|0x17dc/4, IMMED0, NONE, -+ ADD, R2, LOAD1, ZERO, 0, NONE, NONE), -+ MME_INSN(0, ADD, ZERO, LOAD0, ZERO, (1<<12)|0x238c/4, NONE, ALU0, -+ ADD, R4, LOAD1, ZERO, 256 + 160, IMMED0, IMMED1), -+ MME_INSN(0, ADD, ZERO, R4, ZERO, 0, NONE, ALU0, -+ ADD, R10, LOAD0, ZERO, 0, NONE, NONE), -+ MME_INSN(0, BEQ, ZERO, R2, ZERO, (2<<14)|0x0008, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// send(r10); -+// send(r6); -+// mthd(0x1434, 1); -+// send(r4); -+// send(r10); -+// mthd(0x1118, 0); -+// send(r4); -+// r1 &= ~(1 << 26); -+ MME_INSN(0, ADD, ZERO, R10, ZERO, 0, NONE, ALU0, -+ ADD, ZERO, R6, ZERO, 0, NONE, ALU1), -+ MME_INSN(0, ADD, ZERO, R4, ZERO, (1<<12)|0x1434/4, IMMED0, ALU0, -+ ADD, ZERO, R10, ZERO, 0, NONE, ALU1), -+ MME_INSN(0, ADD, ZERO, R4, ZERO, (0<<12)|0x1118/4, IMMED0, ALU0, -+ MERGE, R1, R1, ZERO, (26<<10)|(1<<5)|0, NONE, NONE), -+// do { -+// mthd(0x1618, 0); -+// send(r1); -+// mthd(0x17e0, 0); -+// send(r3); -+// mthd(0x1614, 0); -+// send(0); -+// r1 |= (1 << 26); -+// } while (--r2); -+// } -+ MME_INSN(0, ADD, ZERO, R1, ZERO, 0x1618/4, IMMED0, ALU0, -+ ADD, ZERO, R3, ZERO, 0x17e0/4, IMMED1, ALU1), -+ MME_INSN(0, ADD, ZERO, ZERO, ZERO, 0x1614/4, IMMED0, ALU0, -+ ADD, R4, IMMED, ZERO, 1, NONE, NONE), -+ MME_INSN(0, MERGE, R1, R1, R4, (26<<10)|(1<<5)|0, NONE, NONE, -+ SUB, R2, R2, IMMED, 1, NONE, NONE), -+ MME_INSN(0, BEQ, ZERO, R2, ZERO, (1<<14)|0x3ffd, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// r6 = r6 + 1; -+// } -+ MME_INSN(0, ADD, R6, R6, IMMED, 1, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// r7 = r7 - r5; // unneeded params -+// } -+ MME_INSN(0, SUB, R7, R7, R5, 0, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// while (HW_LOOP_COUNT < r7) { -+// r2 = load(); -+// r2 = load(); -+// r2 = load(); -+// r2 = load(); -+// r2 = load(); -+// } -+ MME_INSN(0, LOOP, ZERO, R7, ZERO, 0x0004, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(0, ADD, ZERO, LOAD0, ZERO, 0, NONE, NONE, -+ ADD, ZERO, LOAD1, ZERO, 0, NONE, NONE), -+ MME_INSN(0, ADD, ZERO, LOAD0, ZERO, 0, NONE, NONE, -+ ADD, ZERO, LOAD1, ZERO, 0, NONE, NONE), -+ MME_INSN(0, ADD, ZERO, LOAD0, ZERO, 0, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// mthd(0x1434, 1); -+// send(r8); -+// send(r9); -+// exit mthd(0x1118, 0); -+// send(r8); -+ MME_INSN(1, ADD, ZERO, R8, ZERO, (1<<12)|0x1434/4, IMMED0, ALU0, -+ ADD, ZERO, R9, ZERO, 0, NONE, ALU1), -+ MME_INSN(0, ADD, ZERO, R8, ZERO, (0<<12)|0x1118/4, IMMED0, ALU0, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+}; -+ -+uint32_t mmec597_query_buffer_write[] = { -+// r1 = load(); // clamp value -+// r2 = load(); // end value (lo) -+// r3 = load(); // end value (hi) -+// r4 = load(); // start value (lo) -+// r5 = load(); // start value (hi) -+// r8 = load(); // desired sequence -+// r9 = load(); // actual sequence -+// r7 = load(); // query address (hi) -+// r6 = load(); // query address (lo) -+// if (r9 >= r8) { -+ MME_INSN(0, ADD, R1, LOAD0, ZERO, 0, NONE, NONE, -+ ADD, R2, LOAD1, ZERO, 0, NONE, NONE), -+ MME_INSN(0, ADD, R3, LOAD0, ZERO, 0, NONE, NONE, -+ ADD, R4, LOAD1, ZERO, 0, NONE, NONE), -+ MME_INSN(0, ADD, R5, LOAD0, ZERO, 0, NONE, NONE, -+ ADD, R8, LOAD1, ZERO, 0, NONE, NONE), -+ MME_INSN(0, ADD, R9, LOAD0, ZERO, 0, NONE, NONE, -+ ADD, R7, LOAD1, ZERO, 0, NONE, NONE), -+ MME_INSN(0, ADD, R6, LOAD0, ZERO, 0, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(0, BLT, ZERO, R9, R8, (2<<14)|0x000e, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// [r3,r2] = [r3,r2] - [r5,r4]; -+// if (r1) { -+ MME_INSN(0, SUB, R2, R2, R4, 0, NONE, NONE, -+ SUBB, R3, R3, R5, 0, NONE, NONE), -+ MME_INSN(0, BEQ, ZERO, R1, ZERO, (2<<14)|0x0004, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// if (r3 != 0 || r1 < r2) -+// r2 = r1; -+// } -+ MME_INSN(0, BEQ, ZERO, R3, ZERO, (1<<14)|0x0002, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(0, BLTU, ZERO, R1, R2, (1<<14)|0x0002, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(0, ADD, R2, R1, ZERO, 0, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// mthd(0x1b00, 1); -+// send(r7); -+// send(r6); -+// send(r2) -+// send(0x10000000); -+// if (!r1) { -+ MME_INSN(0, ADD, ZERO, R7, ZERO, (1<<12)|0x1b00/4, IMMED0, ALU0, -+ ADD, ZERO, R6, ZERO, 0, NONE, ALU1), -+ MME_INSN(0, ADD, ZERO, R2, ZERO, 0, NONE, ALU0, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(0, ADD, ZERO, ZERO, ZERO, 0x1000, NONE, IMMED32_0, -+ ADD, ZERO, ZERO, ZERO, 0x0000, NONE, NONE), -+ MME_INSN(0, BEQ, ZERO, R1, ZERO, (1<<14)|0x0004, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// [r7,r6] = [r7,r6] + 4; -+// mthd(0x1b00, 1); -+// send(r7); -+// send(r6); -+// send(r3); -+// send(0x10000000); -+// } -+ MME_INSN(0, ADD, ZERO, R6, IMMED, 4, IMMED1, ALU1, -+ ADDC, ZERO, R7, ZERO, (1<<12)|0x1b00/4, NONE, ALU0), -+ MME_INSN(0, ADD, ZERO, R3, ZERO, 0, NONE, ALU0, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(0, ADD, ZERO, ZERO, ZERO, 0x1000, NONE, IMMED32_0, -+ ADD, ZERO, ZERO, ZERO, 0x0000, NONE, NONE), -+// mthd(0x0110, 0); -+// send(0); -+ MME_INSN(0, ADD, ZERO, ZERO, ZERO, (0<<12)|0x0110/4, IMMED0, ALU0, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// } -+ MME_INSN(1, ADD, ZERO, ZERO, ZERO, 0, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(0, ADD, ZERO, ZERO, ZERO, 0, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+}; -+ -+uint32_t mmec597_conservative_raster_state[] = { -+// r1 = load(); -+// mthd(0x3400, 1); -+// send(0); -+// send(((r1 >> 8) & 7) << 23); -+// send(0x03800000); -+// mthd(0x2310, 1); -+// send(0x00418800); -+// r2 = r1 & 0xf; -+// r3 = 16; -+// r2 = r2 | (((r1 >> 4) & 0xf) << 8); -+// mthd(0x0a1c, 8); -+ MME_INSN(0, ADD, R1, LOAD0, ZERO, (1<<12)|0x3400/4, IMMED0, IMMED1, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(0, MERGE, ZERO, ZERO, R1, (23<<10)|(3<<5)|8, NONE, ALU0, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(0, ADD, ZERO, ZERO, ZERO, 0x0380, NONE, IMMED32_0, -+ ADD, ZERO, ZERO, ZERO, 0x0000, NONE, NONE), -+ MME_INSN(0, ADD, ZERO, ZERO, ZERO, (1<<12)|0x2310/4, IMMED0, NONE, -+ ADD, ZERO, ZERO, ZERO, 0x0000, NONE, NONE), -+ MME_INSN(0, ADD, ZERO, ZERO, ZERO, 0x0041, NONE, IMMED32_0, -+ ADD, ZERO, ZERO, ZERO, 0x8800, NONE, NONE), -+ MME_INSN(0, AND, R2, R1, IMMED, 0xf, NONE, NONE, -+ ADD, R3, ZERO, IMMED, 16, NONE, NONE), -+ MME_INSN(0, MERGE, R2, R2, R1, (8<<10)|(4<<5)|4, IMMED1, NONE, -+ ADD, ZERO, ZERO, ZERO, (8<<12)|0x0a1c/4, NONE, NONE), -+// while (HW_LOOP_COUNT < r3) -+// send(r2); -+ MME_INSN(0, LOOP, ZERO, R3, ZERO, 0x0002, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(0, ADD, ZERO, R2, ZERO, 0, NONE, ALU0, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+// mthd(0x1148, 0); -+// send(1); -+ MME_INSN(1, ADD, ZERO, ZERO, ZERO, (0<<12)|0x1148/4, IMMED0, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(0, ADD, ZERO, ZERO, ZERO, 1, NONE, IMMED1, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+}; -+ -+uint32_t mmec597_compute_counter[] = { -+// r0 = load(); -+// r1 = 1; -+// r2 = 0; -+// while (HW_LOOP_COUNT < r2) { -+ MME_INSN(0, ADD, R0, LOAD0, ZERO, 0, NONE, NONE, -+ ADD, R1, IMMED, ZERO, 1, NONE, NONE), -+ MME_INSN(0, LOOP, ZERO, R0, ZERO, 0x0003, NONE, NONE, -+ ADD, R2, ZERO, ZERO, 0, NONE, NONE), -+// r3 = load(); -+// [r1,r0] *= r3; -+// } -+ MME_INSN(0, ADD, R3, LOAD0, ZERO, 0, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(0, MULU, R1, R1, R3, 0, NONE, NONE, -+ MULH, R2, ZERO, ZERO, 0, NONE, NONE), -+// r3 = read(0x3410); -+// r4 = read(0x3414); -+// [r4,r3] += [r2,r1]; -+// mthd(0x3410, 1); -+// send(r3); -+// send(r4); -+ MME_INSN(0, STATE, ZERO, ZERO, ZERO, 0x3410/4, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(1, STATE, ZERO, ZERO, ZERO, 0x3414/4, NONE, NONE, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(0, ADD, R3, R3, R1, (1<<12)|0x3410/4, IMMED0, ALU0, -+ ADDC, R4, R4, R2, 0, NONE, ALU1), -+}; -+ -+uint32_t mmec597_compute_counter_to_query[] = { -+// r1 = load(); -+// r3 = read(0x3410); -+// r2 = load(); -+// r4 = read(0x3414); -+// [r2,r1] = [r2,r1] + [r4,r3]; -+// mthd(0x1b00, 1); -+// r3 = load(); -+// send(r3); -+// r4 = load(); -+// send(r4); -+// send(r1); -+// send(0x10000000); -+ MME_INSN(0, ADD, R1, LOAD0, ZERO, 0, NONE, NONE, -+ STATE, R3, IMMED, ZERO, 0x3410/4, NONE, NONE), -+ MME_INSN(0, ADD, R2, LOAD0, ZERO, 0, NONE, NONE, -+ STATE, R4, IMMED, ZERO, 0x3414/4, NONE, NONE), -+ MME_INSN(0, ADD, R1, R1, R3, (1<<12)|0x1b00/4, IMMED0, NONE, -+ ADDC, R2, R2, R4, 0, NONE, NONE), -+ MME_INSN(0, ADD, R3, LOAD0, ZERO, 0, NONE, ALU0, -+ ADD, R4, LOAD1, ZERO, 0, NONE, ALU1), -+ MME_INSN(0, ADD, ZERO, R1, ZERO, 0, NONE, ALU0, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(0, ADD, ZERO, ZERO, ZERO, 0x1000, NONE, IMMED32_0, -+ ADD, ZERO, ZERO, ZERO, 0x0000, NONE, NONE), -+// [r3,r4] = [r3,r4] + 4; -+// mthd(0x1b00, 1); -+// send(r3); -+// send(r4); -+// send(r2); -+// send(0x10000000); -+ MME_INSN(0, ADD, ZERO, R4, IMMED, 4, IMMED1, ALU1, -+ ADDC, ZERO, R3, ZERO, (1<<12)|0x1b00/4, NONE, ALU0), -+ MME_INSN(1, ADD, ZERO, R2, ZERO, 0, NONE, ALU0, -+ ADD, ZERO, ZERO, ZERO, 0, NONE, NONE), -+ MME_INSN(0, ADD, ZERO, ZERO, ZERO, 0x1000, NONE, IMMED32_0, -+ ADD, ZERO, ZERO, ZERO, 0x0000, NONE, NONE), -+}; -diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_3d.xml.h b/src/gallium/drivers/nouveau/nvc0/nvc0_3d.xml.h -index 221bab3105b..539bdc75022 100644 ---- a/src/gallium/drivers/nouveau/nvc0/nvc0_3d.xml.h -+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_3d.xml.h -@@ -157,6 +157,12 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #define NVC0_3D_UNK0220__ESIZE 0x00000004 - #define NVC0_3D_UNK0220__LEN 0x00000028 - -+#define TU102_3D_INDEX_ARRAY_LIMIT_HIGH 0x00000238 -+ -+#define TU102_3D_INDEX_ARRAY_LIMIT_LOW 0x0000023c -+ -+#define TU102_3D_SET_COLOR_RENDER_TO_ZETA_SURFACE 0x000002b8 -+ - #define NVC0_3D_UNK02C0 0x000002c0 - - #define NVC0_3D_UNK02C4 0x000002c4 -@@ -278,6 +284,9 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #define NVC0_3D_UNK0400__ESIZE 0x00000004 - #define NVC0_3D_UNK0400__LEN 0x000000c0 - -+#define TU102_3D_VERTEX_ARRAY_LIMIT_HIGH(i0) (0x00000600 + 0x8*(i0)) -+#define TU102_3D_VERTEX_ARRAY_LIMIT_LOW(i0) (0x00000604 + 0x8*(i0)) -+ - #define NVC0_3D_TFB_STREAM(i0) (0x00000700 + 0x10*(i0)) - #define NVC0_3D_TFB_STREAM__ESIZE 0x00000010 - #define NVC0_3D_TFB_STREAM__LEN 0x00000004 -@@ -1787,6 +1796,9 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #define NVC0_3D_SP_UNK14__ESIZE 0x00000004 - #define NVC0_3D_SP_UNK14__LEN 0x00000004 - -+#define GV100_3D_SP_ADDRESS_HIGH(i0) (0x00002014 + 0x40*(i0)) -+#define GV100_3D_SP_ADDRESS_LOW(i0) (0x00002018 + 0x40*(i0)) -+ - #define NVC0_3D_TEX_LIMITS(i0) (0x00002200 + 0x10*(i0)) - #define NVC0_3D_TEX_LIMITS__ESIZE 0x00000010 - #define NVC0_3D_TEX_LIMITS__LEN 0x00000005 -diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c b/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c -index c897e4e8b97..69131fa22d3 100644 ---- a/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c -+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c -@@ -37,6 +37,55 @@ nvc0_tex_choose_tile_dims(unsigned nx, unsigned ny, unsigned nz, bool is_3d) - return nv50_tex_choose_tile_dims_helper(nx, ny, nz, is_3d); - } - -+static uint32_t -+tu102_mt_choose_storage_type(struct nv50_miptree *mt, bool compressed) -+{ -+ uint32_t kind; -+ -+ if (unlikely(mt->base.base.bind & PIPE_BIND_CURSOR)) -+ return 0; -+ if (unlikely(mt->base.base.flags & NOUVEAU_RESOURCE_FLAG_LINEAR)) -+ return 0; -+ -+ switch (mt->base.base.format) { -+ case PIPE_FORMAT_Z16_UNORM: -+ if (compressed) -+ kind = 0x0b; // NV_MMU_PTE_KIND_Z16_COMPRESSIBLE_DISABLE_PLC -+ else -+ kind = 0x01; // NV_MMU_PTE_KIND_Z16 -+ break; -+ case PIPE_FORMAT_X8Z24_UNORM: -+ case PIPE_FORMAT_S8X24_UINT: -+ case PIPE_FORMAT_S8_UINT_Z24_UNORM: -+ if (compressed) -+ kind = 0x0e; // NV_MMU_PTE_KIND_Z24S8_COMPRESSIBLE_DISABLE_PLC -+ else -+ kind = 0x05; // NV_MMU_PTE_KIND_Z24S8 -+ break; -+ case PIPE_FORMAT_X24S8_UINT: -+ case PIPE_FORMAT_Z24X8_UNORM: -+ case PIPE_FORMAT_Z24_UNORM_S8_UINT: -+ if (compressed) -+ kind = 0x0c; // NV_MMU_PTE_KIND_S8Z24_COMPRESSIBLE_DISABLE_PLC -+ else -+ kind = 0x03; // NV_MMU_PTE_KIND_S8Z24 -+ break; -+ case PIPE_FORMAT_X32_S8X24_UINT: -+ case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: -+ if (compressed) -+ kind = 0x0d; // NV_MMU_PTE_KIND_ZF32_X24S8_COMPRESSIBLE_DISABLE_PLC -+ else -+ kind = 0x04; // NV_MMU_PTE_KIND_ZF32_X24S8 -+ break; -+ case PIPE_FORMAT_Z32_FLOAT: -+ default: -+ kind = 0x06; -+ break; -+ } -+ -+ return kind; -+} -+ - static uint32_t - nvc0_mt_choose_storage_type(struct nv50_miptree *mt, bool compressed) - { -@@ -357,7 +406,10 @@ nvc0_miptree_create(struct pipe_screen *pscreen, - if (pt->bind & PIPE_BIND_LINEAR) - pt->flags |= NOUVEAU_RESOURCE_FLAG_LINEAR; - -- bo_config.nvc0.memtype = nvc0_mt_choose_storage_type(mt, compressed); -+ if (dev->chipset < 0x160) -+ bo_config.nvc0.memtype = nvc0_mt_choose_storage_type(mt, compressed); -+ else -+ bo_config.nvc0.memtype = tu102_mt_choose_storage_type(mt, compressed); - - if (!nvc0_miptree_init_ms_mode(mt)) { - FREE(mt); -diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c -index 32aa82d168c..d2b2de47c8d 100644 ---- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c -+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c -@@ -645,7 +645,10 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset, - prog->code_size = info->bin.codeSize; - prog->relocs = info->bin.relocData; - prog->fixups = info->bin.fixupData; -- prog->num_gprs = MAX2(4, (info->bin.maxGPR + 1)); -+ if (info->target >= NVISA_GV100_CHIPSET) -+ prog->num_gprs = MIN2(info->bin.maxGPR + 5, 256); //XXX: why? -+ else -+ prog->num_gprs = MAX2(4, (info->bin.maxGPR + 1)); - prog->cp.smem_size = info->bin.smemSize; - prog->num_barriers = info->numBarriers; - -@@ -734,7 +737,14 @@ nvc0_program_alloc_code(struct nvc0_context *nvc0, struct nvc0_program *prog) - struct nvc0_screen *screen = nvc0->screen; - const bool is_cp = prog->type == PIPE_SHADER_COMPUTE; - int ret; -- uint32_t size = prog->code_size + (is_cp ? 0 : NVC0_SHADER_HEADER_SIZE); -+ uint32_t size = prog->code_size; -+ -+ if (!is_cp) { -+ if (screen->eng3d->oclass < TU102_3D_CLASS) -+ size += GF100_SHADER_HEADER_SIZE; -+ else -+ size += TU102_SHADER_HEADER_SIZE; -+ } - - /* On Fermi, SP_START_ID must be aligned to 0x40. - * On Kepler, the first instruction must be aligned to 0x80 because -@@ -750,7 +760,8 @@ nvc0_program_alloc_code(struct nvc0_context *nvc0, struct nvc0_program *prog) - prog->code_base = prog->mem->start; - - if (!is_cp) { -- if (screen->base.class_3d >= NVE4_3D_CLASS) { -+ if (screen->base.class_3d >= NVE4_3D_CLASS && -+ screen->base.class_3d < TU102_3D_CLASS) { - switch (prog->mem->start & 0xff) { - case 0x40: prog->code_base += 0x70; break; - case 0x80: prog->code_base += 0x30; break; -@@ -777,7 +788,16 @@ nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog) - { - struct nvc0_screen *screen = nvc0->screen; - const bool is_cp = prog->type == PIPE_SHADER_COMPUTE; -- uint32_t code_pos = prog->code_base + (is_cp ? 0 : NVC0_SHADER_HEADER_SIZE); -+ uint32_t code_pos = prog->code_base; -+ uint32_t size_sph = 0; -+ -+ if (!is_cp) { -+ if (screen->eng3d->oclass < TU102_3D_CLASS) -+ size_sph = GF100_SHADER_HEADER_SIZE; -+ else -+ size_sph = TU102_SHADER_HEADER_SIZE; -+ } -+ code_pos += size_sph; - - if (prog->relocs) - nv50_ir_relocate_code(prog->relocs, prog->code, code_pos, -@@ -803,8 +823,7 @@ nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog) - - if (!is_cp) - nvc0->base.push_data(&nvc0->base, screen->text, prog->code_base, -- NV_VRAM_DOMAIN(&screen->base), -- NVC0_SHADER_HEADER_SIZE, prog->hdr); -+ NV_VRAM_DOMAIN(&screen->base), size_sph, prog->hdr); - - nvc0->base.push_data(&nvc0->base, screen->text, code_pos, - NV_VRAM_DOMAIN(&screen->base), prog->code_size, -@@ -817,7 +836,14 @@ nvc0_program_upload(struct nvc0_context *nvc0, struct nvc0_program *prog) - struct nvc0_screen *screen = nvc0->screen; - const bool is_cp = prog->type == PIPE_SHADER_COMPUTE; - int ret; -- uint32_t size = prog->code_size + (is_cp ? 0 : NVC0_SHADER_HEADER_SIZE); -+ uint32_t size = prog->code_size; -+ -+ if (!is_cp) { -+ if (screen->eng3d->oclass < TU102_3D_CLASS) -+ size += GF100_SHADER_HEADER_SIZE; -+ else -+ size += TU102_SHADER_HEADER_SIZE; -+ } - - ret = nvc0_program_alloc_code(nvc0, prog); - if (ret) { -@@ -874,8 +900,7 @@ nvc0_program_upload(struct nvc0_context *nvc0, struct nvc0_program *prog) - BEGIN_NVC0(nvc0->base.pushbuf, NVC0_CP(FLUSH), 1); - PUSH_DATA (nvc0->base.pushbuf, NVC0_COMPUTE_FLUSH_CODE); - } else { -- BEGIN_NVC0(nvc0->base.pushbuf, NVC0_3D(SP_START_ID(i)), 1); -- PUSH_DATA (nvc0->base.pushbuf, progs[i]->code_base); -+ nvc0_program_sp_start_id(nvc0, i, progs[i]); - } - } - } -@@ -953,7 +978,7 @@ nvc0_program_symbol_offset(const struct nvc0_program *prog, uint32_t label) - unsigned base = 0; - unsigned i; - if (prog->type != PIPE_SHADER_COMPUTE) -- base = NVC0_SHADER_HEADER_SIZE; -+ base = GF100_SHADER_HEADER_SIZE; - for (i = 0; i < prog->cp.num_syms; ++i) - if (syms[i].label == label) - return prog->code_base + base + syms[i].offset; -diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.h b/src/gallium/drivers/nouveau/nvc0/nvc0_program.h -index 5684207aa54..2c465b342e9 100644 ---- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.h -+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.h -@@ -15,7 +15,9 @@ struct nvc0_transform_feedback_state { - }; - - --#define NVC0_SHADER_HEADER_SIZE (20 * 4) -+#define GF100_SHADER_HEADER_SIZE (20 * 4) -+#define TU102_SHADER_HEADER_SIZE (32 * 4) -+#define NVC0_MAX_SHADER_HEADER_SIZE TU102_SHADER_HEADER_SIZE - - struct nvc0_program { - struct pipe_shader_state pipe; -@@ -30,7 +32,7 @@ struct nvc0_program { - unsigned code_size; - unsigned parm_size; /* size of non-bindable uniforms (c0[]) */ - -- uint32_t hdr[20]; -+ uint32_t hdr[NVC0_MAX_SHADER_HEADER_SIZE/4]; - uint32_t flags[2]; - - struct { -@@ -72,4 +74,6 @@ struct nvc0_program { - struct nouveau_heap *mem; - }; - -+void -+nvc0_program_sp_start_id(struct nvc0_context *, int, struct nvc0_program *); - #endif -diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c -index 7abbf762af2..07d74ddd50c 100644 ---- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c -+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c -@@ -27,15 +27,17 @@ - #include "util/format/u_format_s3tc.h" - #include "util/u_screen.h" - #include "pipe/p_screen.h" --#include "compiler/nir/nir.h" - - #include "nouveau_vp3_video.h" - -+#include "codegen/nv50_ir_driver.h" -+ - #include "nvc0/nvc0_context.h" - #include "nvc0/nvc0_screen.h" - - #include "nvc0/mme/com9097.mme.h" - #include "nvc0/mme/com90c0.mme.h" -+#include "nvc0/mme/comc597.mme.h" - - #include "nv50/g80_texture.xml.h" - -@@ -443,8 +445,8 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, - case PIPE_SHADER_CAP_PREFERRED_IR: - return screen->prefer_nir ? PIPE_SHADER_IR_NIR : PIPE_SHADER_IR_TGSI; - case PIPE_SHADER_CAP_SUPPORTED_IRS: { -- uint32_t irs = 1 << PIPE_SHADER_IR_TGSI | -- 1 << PIPE_SHADER_IR_NIR; -+ uint32_t irs = 1 << PIPE_SHADER_IR_NIR | -+ ((class_3d >= GV100_3D_CLASS) ? 0 : 1 << PIPE_SHADER_IR_TGSI); - if (screen->force_enable_cl) - irs |= 1 << PIPE_SHADER_IR_NIR_SERIALIZED; - return irs; -@@ -467,6 +469,14 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, - case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR: - return shader != PIPE_SHADER_FRAGMENT; - case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR: -+ /* HW doesn't support indirect addressing of fragment program inputs -+ * on Volta. The binary driver generates a function to handle every -+ * possible indirection, and indirectly calls the function to handle -+ * this instead. -+ */ -+ if (class_3d >= GV100_3D_CLASS) -+ return shader != PIPE_SHADER_FRAGMENT; -+ return 1; - case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR: - case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR: - return 1; -@@ -717,6 +727,26 @@ nvc0_graph_set_macro(struct nvc0_screen *screen, uint32_t m, unsigned pos, - return pos + size; - } - -+static int -+tu102_graph_set_macro(struct nvc0_screen *screen, uint32_t m, unsigned pos, -+ unsigned size, const uint32_t *data) -+{ -+ struct nouveau_pushbuf *push = screen->base.pushbuf; -+ -+ size /= 4; -+ -+ assert((pos + size) <= 0x800); -+ -+ BEGIN_NVC0(push, SUBC_3D(NVC0_GRAPH_MACRO_ID), 2); -+ PUSH_DATA (push, (m - 0x3800) / 8); -+ PUSH_DATA (push, pos); -+ BEGIN_1IC0(push, SUBC_3D(NVC0_GRAPH_MACRO_UPLOAD_POS), size + 1); -+ PUSH_DATA (push, pos); -+ PUSH_DATAp(push, data, size); -+ -+ return pos + (size / 3); -+} -+ - static void - nvc0_magic_3d_init(struct nouveau_pushbuf *push, uint16_t obj_class) - { -@@ -728,8 +758,10 @@ nvc0_magic_3d_init(struct nouveau_pushbuf *push, uint16_t obj_class) - BEGIN_NVC0(push, SUBC_3D(0x10ec), 2); - PUSH_DATA (push, 0xff); - PUSH_DATA (push, 0xff); -- BEGIN_NVC0(push, SUBC_3D(0x074c), 1); -- PUSH_DATA (push, 0x3f); -+ if (obj_class < GV100_3D_CLASS) { -+ BEGIN_NVC0(push, SUBC_3D(0x074c), 1); -+ PUSH_DATA (push, 0x3f); -+ } - - BEGIN_NVC0(push, SUBC_3D(0x16a8), 1); - PUSH_DATA (push, (3 << 16) | 3); -@@ -761,8 +793,10 @@ nvc0_magic_3d_init(struct nouveau_pushbuf *push, uint16_t obj_class) - BEGIN_NVC0(push, SUBC_3D(0x0300), 1); - PUSH_DATA (push, 3); - -- BEGIN_NVC0(push, SUBC_3D(0x02d0), 1); -- PUSH_DATA (push, 0x3fffff); -+ if (obj_class < GV100_3D_CLASS) { -+ BEGIN_NVC0(push, SUBC_3D(0x02d0), 1); -+ PUSH_DATA (push, 0x3fffff); -+ } - BEGIN_NVC0(push, SUBC_3D(0x0fdc), 1); - PUSH_DATA (push, 1); - BEGIN_NVC0(push, SUBC_3D(0x19c0), 1); -@@ -822,6 +856,8 @@ nvc0_screen_init_compute(struct nvc0_screen *screen) - case 0x110: - case 0x120: - case 0x130: -+ case 0x140: -+ case 0x160: - return nve4_screen_compute_setup(screen, screen->base.pushbuf); - default: - return -1; -@@ -893,13 +929,15 @@ nvc0_screen_resize_text_area(struct nvc0_screen *screen, uint64_t size) - nouveau_heap_init(&screen->text_heap, 0, size - 0x100); - - /* update the code segment setup */ -- BEGIN_NVC0(push, NVC0_3D(CODE_ADDRESS_HIGH), 2); -- PUSH_DATAh(push, screen->text->offset); -- PUSH_DATA (push, screen->text->offset); -- if (screen->compute) { -- BEGIN_NVC0(push, NVC0_CP(CODE_ADDRESS_HIGH), 2); -+ if (screen->eng3d->oclass < GV100_3D_CLASS) { -+ BEGIN_NVC0(push, NVC0_3D(CODE_ADDRESS_HIGH), 2); - PUSH_DATAh(push, screen->text->offset); - PUSH_DATA (push, screen->text->offset); -+ if (screen->compute) { -+ BEGIN_NVC0(push, NVC0_CP(CODE_ADDRESS_HIGH), 2); -+ PUSH_DATAh(push, screen->text->offset); -+ PUSH_DATA (push, screen->text->offset); -+ } - } - - return 0; -@@ -939,74 +977,14 @@ nvc0_screen_bind_cb_3d(struct nvc0_screen *screen, bool *can_serialize, - IMMED_NVC0(push, NVC0_3D(CB_BIND(stage)), (index << 4) | (size >= 0)); - } - --static const nir_shader_compiler_options nir_options = { -- .lower_fdiv = false, -- .lower_ffma = false, -- .fuse_ffma = false, /* nir doesn't track mad vs fma */ -- .lower_flrp32 = true, -- .lower_flrp64 = true, -- .lower_fpow = false, -- .lower_fsat = false, -- .lower_fsqrt = false, // TODO: only before gm200 -- .lower_fmod = true, -- .lower_bitfield_extract = false, -- .lower_bitfield_extract_to_shifts = false, -- .lower_bitfield_insert = false, -- .lower_bitfield_insert_to_shifts = false, -- .lower_bitfield_reverse = false, -- .lower_bit_count = false, -- .lower_ifind_msb = false, -- .lower_find_lsb = false, -- .lower_uadd_carry = true, // TODO -- .lower_usub_borrow = true, // TODO -- .lower_mul_high = false, -- .lower_negate = false, -- .lower_sub = true, -- .lower_scmp = true, // TODO: not implemented yet -- .lower_idiv = true, -- .lower_isign = false, // TODO -- .fdot_replicates = false, // TODO -- .lower_ffloor = false, // TODO -- .lower_ffract = true, -- .lower_fceil = false, // TODO -- .lower_ldexp = true, -- .lower_pack_half_2x16 = true, -- .lower_pack_unorm_2x16 = true, -- .lower_pack_snorm_2x16 = true, -- .lower_pack_unorm_4x8 = true, -- .lower_pack_snorm_4x8 = true, -- .lower_unpack_half_2x16 = true, -- .lower_unpack_unorm_2x16 = true, -- .lower_unpack_snorm_2x16 = true, -- .lower_unpack_unorm_4x8 = true, -- .lower_unpack_snorm_4x8 = true, -- .lower_extract_byte = true, -- .lower_extract_word = true, -- .lower_all_io_to_temps = false, -- .vertex_id_zero_based = false, -- .lower_base_vertex = false, -- .lower_helper_invocation = false, -- .lower_cs_local_index_from_id = true, -- .lower_cs_local_id_from_index = false, -- .lower_device_index_to_zero = false, // TODO -- .lower_wpos_pntc = false, // TODO -- .lower_hadd = true, // TODO -- .lower_add_sat = true, // TODO -- .use_interpolated_input_intrinsics = true, -- .lower_mul_2x32_64 = true, // TODO -- .max_unroll_iterations = 32, -- .lower_int64_options = nir_lower_ufind_msb64|nir_lower_divmod64, // TODO -- .lower_doubles_options = nir_lower_dmod, // TODO -- .lower_to_scalar = true, --}; -- - static const void * - nvc0_screen_get_compiler_options(struct pipe_screen *pscreen, - enum pipe_shader_ir ir, - enum pipe_shader_type shader) - { -+ struct nvc0_screen *screen = nvc0_screen(pscreen); - if (ir == PIPE_SHADER_IR_NIR) -- return &nir_options; -+ return nv50_ir_nir_shader_compiler_options(screen->base.device->chipset); - return NULL; - } - -@@ -1038,6 +1016,8 @@ nvc0_screen_create(struct nouveau_device *dev) - case 0x110: - case 0x120: - case 0x130: -+ case 0x140: -+ case 0x160: - break; - default: - return NULL; -@@ -1104,16 +1084,19 @@ nvc0_screen_create(struct nouveau_device *dev) - screen->base.fence.emit = nvc0_screen_fence_emit; - screen->base.fence.update = nvc0_screen_fence_update; - -+ if (dev->chipset < 0x140) { -+ ret = nouveau_object_new(chan, (dev->chipset < 0xe0) ? 0x1f906e : 0x906e, -+ NVIF_CLASS_SW_GF100, NULL, 0, &screen->nvsw); -+ if (ret) -+ FAIL_SCREEN_INIT("Error creating SW object: %d\n", ret); - -- ret = nouveau_object_new(chan, (dev->chipset < 0xe0) ? 0x1f906e : 0x906e, -- NVIF_CLASS_SW_GF100, NULL, 0, &screen->nvsw); -- if (ret) -- FAIL_SCREEN_INIT("Error creating SW object: %d\n", ret); -- -- BEGIN_NVC0(push, SUBC_SW(NV01_SUBCHAN_OBJECT), 1); -- PUSH_DATA (push, screen->nvsw->handle); -+ BEGIN_NVC0(push, SUBC_SW(NV01_SUBCHAN_OBJECT), 1); -+ PUSH_DATA (push, screen->nvsw->handle); -+ } - - switch (dev->chipset & ~0xf) { -+ case 0x160: -+ case 0x140: - case 0x130: - case 0x120: - case 0x110: -@@ -1167,6 +1150,12 @@ nvc0_screen_create(struct nouveau_device *dev) - PUSH_DATA (push, screen->fence.bo->offset + 16); - - switch (dev->chipset & ~0xf) { -+ case 0x160: -+ obj_class = TU102_3D_CLASS; -+ break; -+ case 0x140: -+ obj_class = GV100_3D_CLASS; -+ break; - case 0x130: - switch (dev->chipset) { - case 0x130: -@@ -1414,25 +1403,47 @@ nvc0_screen_create(struct nouveau_device *dev) - PUSH_DATA (push, 16384 << 16); - } - -+ if (screen->eng3d->oclass < TU102_3D_CLASS) { - #define MK_MACRO(m, n) i = nvc0_graph_set_macro(screen, m, i, sizeof(n), n); - -- i = 0; -- MK_MACRO(NVC0_3D_MACRO_VERTEX_ARRAY_PER_INSTANCE, mme9097_per_instance_bf); -- MK_MACRO(NVC0_3D_MACRO_BLEND_ENABLES, mme9097_blend_enables); -- MK_MACRO(NVC0_3D_MACRO_VERTEX_ARRAY_SELECT, mme9097_vertex_array_select); -- MK_MACRO(NVC0_3D_MACRO_TEP_SELECT, mme9097_tep_select); -- MK_MACRO(NVC0_3D_MACRO_GP_SELECT, mme9097_gp_select); -- MK_MACRO(NVC0_3D_MACRO_POLYGON_MODE_FRONT, mme9097_poly_mode_front); -- MK_MACRO(NVC0_3D_MACRO_POLYGON_MODE_BACK, mme9097_poly_mode_back); -- MK_MACRO(NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT, mme9097_draw_arrays_indirect); -- MK_MACRO(NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT, mme9097_draw_elts_indirect); -- MK_MACRO(NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT_COUNT, mme9097_draw_arrays_indirect_count); -- MK_MACRO(NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT_COUNT, mme9097_draw_elts_indirect_count); -- MK_MACRO(NVC0_3D_MACRO_QUERY_BUFFER_WRITE, mme9097_query_buffer_write); -- MK_MACRO(NVC0_3D_MACRO_CONSERVATIVE_RASTER_STATE, mme9097_conservative_raster_state); -- MK_MACRO(NVC0_3D_MACRO_COMPUTE_COUNTER, mme9097_compute_counter); -- MK_MACRO(NVC0_3D_MACRO_COMPUTE_COUNTER_TO_QUERY, mme9097_compute_counter_to_query); -- MK_MACRO(NVC0_CP_MACRO_LAUNCH_GRID_INDIRECT, mme90c0_launch_grid_indirect); -+ i = 0; -+ MK_MACRO(NVC0_3D_MACRO_VERTEX_ARRAY_PER_INSTANCE, mme9097_per_instance_bf); -+ MK_MACRO(NVC0_3D_MACRO_BLEND_ENABLES, mme9097_blend_enables); -+ MK_MACRO(NVC0_3D_MACRO_VERTEX_ARRAY_SELECT, mme9097_vertex_array_select); -+ MK_MACRO(NVC0_3D_MACRO_TEP_SELECT, mme9097_tep_select); -+ MK_MACRO(NVC0_3D_MACRO_GP_SELECT, mme9097_gp_select); -+ MK_MACRO(NVC0_3D_MACRO_POLYGON_MODE_FRONT, mme9097_poly_mode_front); -+ MK_MACRO(NVC0_3D_MACRO_POLYGON_MODE_BACK, mme9097_poly_mode_back); -+ MK_MACRO(NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT, mme9097_draw_arrays_indirect); -+ MK_MACRO(NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT, mme9097_draw_elts_indirect); -+ MK_MACRO(NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT_COUNT, mme9097_draw_arrays_indirect_count); -+ MK_MACRO(NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT_COUNT, mme9097_draw_elts_indirect_count); -+ MK_MACRO(NVC0_3D_MACRO_QUERY_BUFFER_WRITE, mme9097_query_buffer_write); -+ MK_MACRO(NVC0_3D_MACRO_CONSERVATIVE_RASTER_STATE, mme9097_conservative_raster_state); -+ MK_MACRO(NVC0_3D_MACRO_COMPUTE_COUNTER, mme9097_compute_counter); -+ MK_MACRO(NVC0_3D_MACRO_COMPUTE_COUNTER_TO_QUERY, mme9097_compute_counter_to_query); -+ MK_MACRO(NVC0_CP_MACRO_LAUNCH_GRID_INDIRECT, mme90c0_launch_grid_indirect); -+ } else { -+#undef MK_MACRO -+#define MK_MACRO(m, n) i = tu102_graph_set_macro(screen, m, i, sizeof(n), n); -+ -+ i = 0; -+ MK_MACRO(NVC0_3D_MACRO_VERTEX_ARRAY_PER_INSTANCE, mmec597_per_instance_bf); -+ MK_MACRO(NVC0_3D_MACRO_BLEND_ENABLES, mmec597_blend_enables); -+ MK_MACRO(NVC0_3D_MACRO_VERTEX_ARRAY_SELECT, mmec597_vertex_array_select); -+ MK_MACRO(NVC0_3D_MACRO_TEP_SELECT, mmec597_tep_select); -+ MK_MACRO(NVC0_3D_MACRO_GP_SELECT, mmec597_gp_select); -+ MK_MACRO(NVC0_3D_MACRO_POLYGON_MODE_FRONT, mmec597_poly_mode_front); -+ MK_MACRO(NVC0_3D_MACRO_POLYGON_MODE_BACK, mmec597_poly_mode_back); -+ MK_MACRO(NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT, mmec597_draw_arrays_indirect); -+ MK_MACRO(NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT, mmec597_draw_elts_indirect); -+ MK_MACRO(NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT_COUNT, mmec597_draw_arrays_indirect_count); -+ MK_MACRO(NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT_COUNT, mmec597_draw_elts_indirect_count); -+ MK_MACRO(NVC0_3D_MACRO_QUERY_BUFFER_WRITE, mmec597_query_buffer_write); -+ MK_MACRO(NVC0_3D_MACRO_CONSERVATIVE_RASTER_STATE, mmec597_conservative_raster_state); -+ MK_MACRO(NVC0_3D_MACRO_COMPUTE_COUNTER, mmec597_compute_counter); -+ MK_MACRO(NVC0_3D_MACRO_COMPUTE_COUNTER_TO_QUERY, mmec597_compute_counter_to_query); -+ } - - BEGIN_NVC0(push, NVC0_3D(RASTERIZE_ENABLE), 1); - PUSH_DATA (push, 1); -diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c -index b7e0c8a930f..490026b2c00 100644 ---- a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c -+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c -@@ -64,6 +64,22 @@ nvc0_program_validate(struct nvc0_context *nvc0, struct nvc0_program *prog) - return true; /* stream output info only */ - } - -+void -+nvc0_program_sp_start_id(struct nvc0_context *nvc0, int stage, -+ struct nvc0_program *prog) -+{ -+ struct nouveau_pushbuf *push = nvc0->base.pushbuf; -+ -+ if (nvc0->screen->eng3d->oclass < GV100_3D_CLASS) { -+ BEGIN_NVC0(push, NVC0_3D(SP_START_ID(stage)), 1); -+ PUSH_DATA (push, prog->code_base); -+ } else { -+ BEGIN_NVC0(push, SUBC_3D(GV100_3D_SP_ADDRESS_HIGH(stage)), 2); -+ PUSH_DATAh(push, nvc0->screen->text->offset + prog->code_base); -+ PUSH_DATA (push, nvc0->screen->text->offset + prog->code_base); -+ } -+} -+ - void - nvc0_vertprog_validate(struct nvc0_context *nvc0) - { -@@ -74,9 +90,9 @@ nvc0_vertprog_validate(struct nvc0_context *nvc0) - return; - nvc0_program_update_context_state(nvc0, vp, 0); - -- BEGIN_NVC0(push, NVC0_3D(SP_SELECT(1)), 2); -+ BEGIN_NVC0(push, NVC0_3D(SP_SELECT(1)), 1); - PUSH_DATA (push, 0x11); -- PUSH_DATA (push, vp->code_base); -+ nvc0_program_sp_start_id(nvc0, 1, vp); - BEGIN_NVC0(push, NVC0_3D(SP_GPR_ALLOC(1)), 1); - PUSH_DATA (push, vp->num_gprs); - -@@ -152,9 +168,9 @@ nvc0_fragprog_validate(struct nvc0_context *nvc0) - fp->fp.post_depth_coverage); - } - -- BEGIN_NVC0(push, NVC0_3D(SP_SELECT(5)), 2); -+ BEGIN_NVC0(push, NVC0_3D(SP_SELECT(5)), 1); - PUSH_DATA (push, 0x51); -- PUSH_DATA (push, fp->code_base); -+ nvc0_program_sp_start_id(nvc0, 5, fp); - BEGIN_NVC0(push, NVC0_3D(SP_GPR_ALLOC(5)), 1); - PUSH_DATA (push, fp->num_gprs); - -@@ -176,9 +192,9 @@ nvc0_tctlprog_validate(struct nvc0_context *nvc0) - BEGIN_NVC0(push, NVC0_3D(TESS_MODE), 1); - PUSH_DATA (push, tp->tp.tess_mode); - } -- BEGIN_NVC0(push, NVC0_3D(SP_SELECT(2)), 2); -+ BEGIN_NVC0(push, NVC0_3D(SP_SELECT(2)), 1); - PUSH_DATA (push, 0x21); -- PUSH_DATA (push, tp->code_base); -+ nvc0_program_sp_start_id(nvc0, 2, tp); - BEGIN_NVC0(push, NVC0_3D(SP_GPR_ALLOC(2)), 1); - PUSH_DATA (push, tp->num_gprs); - } else { -@@ -186,9 +202,9 @@ nvc0_tctlprog_validate(struct nvc0_context *nvc0) - /* not a whole lot we can do to handle this failure */ - if (!nvc0_program_validate(nvc0, tp)) - assert(!"unable to validate empty tcp"); -- BEGIN_NVC0(push, NVC0_3D(SP_SELECT(2)), 2); -+ BEGIN_NVC0(push, NVC0_3D(SP_SELECT(2)), 1); - PUSH_DATA (push, 0x20); -- PUSH_DATA (push, tp->code_base); -+ nvc0_program_sp_start_id(nvc0, 2, tp); - } - nvc0_program_update_context_state(nvc0, tp, 1); - } -@@ -206,8 +222,7 @@ nvc0_tevlprog_validate(struct nvc0_context *nvc0) - } - BEGIN_NVC0(push, NVC0_3D(MACRO_TEP_SELECT), 1); - PUSH_DATA (push, 0x31); -- BEGIN_NVC0(push, NVC0_3D(SP_START_ID(3)), 1); -- PUSH_DATA (push, tp->code_base); -+ nvc0_program_sp_start_id(nvc0, 3, tp); - BEGIN_NVC0(push, NVC0_3D(SP_GPR_ALLOC(3)), 1); - PUSH_DATA (push, tp->num_gprs); - } else { -@@ -227,8 +242,7 @@ nvc0_gmtyprog_validate(struct nvc0_context *nvc0) - if (gp && nvc0_program_validate(nvc0, gp) && gp->code_size) { - BEGIN_NVC0(push, NVC0_3D(MACRO_GP_SELECT), 1); - PUSH_DATA (push, 0x41); -- BEGIN_NVC0(push, NVC0_3D(SP_START_ID(4)), 1); -- PUSH_DATA (push, gp->code_base); -+ nvc0_program_sp_start_id(nvc0, 4, gp); - BEGIN_NVC0(push, NVC0_3D(SP_GPR_ALLOC(4)), 1); - PUSH_DATA (push, gp->num_gprs); - } else { -diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c -index 538effdb531..731b0b5dbf8 100644 ---- a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c -+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c -@@ -29,6 +29,8 @@ - #include "util/format/u_format.h" - #include "util/u_surface.h" - -+#include "tgsi/tgsi_ureg.h" -+ - #include "os/os_thread.h" - - #include "nvc0/nvc0_context.h" -@@ -138,6 +140,11 @@ nvc0_2d_texture_set(struct nouveau_pushbuf *push, bool dst, - PUSH_DATA (push, bo->offset + offset); - } - -+ if (dst) { -+ IMMED_NVC0(push, SUBC_2D(NVC0_2D_SET_DST_COLOR_RENDER_TO_ZETA_SURFACE), -+ util_format_is_depth_or_stencil(pformat)); -+ } -+ - #if 0 - if (dst) { - BEGIN_NVC0(push, SUBC_2D(NVC0_2D_CLIP_X), 4); -@@ -772,7 +779,7 @@ gm200_evaluate_depth_buffer(struct pipe_context *pipe) - struct nvc0_blitter - { - struct nvc0_program *fp[NV50_BLIT_MAX_TEXTURE_TYPES][NV50_BLIT_MODES]; -- struct nvc0_program vp; -+ struct nvc0_program *vp; - - struct nv50_tsc_entry sampler[2]; /* nearest, bilinear */ - -@@ -785,6 +792,7 @@ struct nvc0_blitctx - { - struct nvc0_context *nvc0; - struct nvc0_program *fp; -+ struct nvc0_program *vp; - uint8_t mode; - uint16_t color_mask; - uint8_t filter; -@@ -809,78 +817,27 @@ struct nvc0_blitctx - struct nvc0_rasterizer_stateobj rast; - }; - --static void --nvc0_blitter_make_vp(struct nvc0_blitter *blit) -+static void * -+nvc0_blitter_make_vp(struct pipe_context *pipe) - { -- static const uint32_t code_nvc0[] = -- { -- 0xfff11c26, 0x06000080, /* vfetch b64 $r4:$r5 a[0x80] */ -- 0xfff01c46, 0x06000090, /* vfetch b96 $r0:$r1:$r2 a[0x90] */ -- 0x13f01c26, 0x0a7e0070, /* export b64 o[0x70] $r4:$r5 */ -- 0x03f01c46, 0x0a7e0080, /* export b96 o[0x80] $r0:$r1:$r2 */ -- 0x00001de7, 0x80000000, /* exit */ -- }; -- static const uint32_t code_nve4[] = -- { -- 0x00000007, 0x20000000, /* sched */ -- 0xfff11c26, 0x06000080, /* vfetch b64 $r4:$r5 a[0x80] */ -- 0xfff01c46, 0x06000090, /* vfetch b96 $r0:$r1:$r2 a[0x90] */ -- 0x13f01c26, 0x0a7e0070, /* export b64 o[0x70] $r4:$r5 */ -- 0x03f01c46, 0x0a7e0080, /* export b96 o[0x80] $r0:$r1:$r2 */ -- 0x00001de7, 0x80000000, /* exit */ -- }; -- static const uint32_t code_gk110[] = -- { -- 0x00000000, 0x08000000, /* sched */ -- 0x401ffc12, 0x7ec7fc00, /* ld b64 $r4d a[0x80] 0x0 0x0 */ -- 0x481ffc02, 0x7ecbfc00, /* ld b96 $r0t a[0x90] 0x0 0x0 */ -- 0x381ffc12, 0x7f07fc00, /* st b64 a[0x70] $r4d 0x0 0x0 */ -- 0x401ffc02, 0x7f0bfc00, /* st b96 a[0x80] $r0t 0x0 0x0 */ -- 0x001c003c, 0x18000000, /* exit */ -- }; -- static const uint32_t code_gm107[] = -- { -- 0xe4200701, 0x001d0400, /* sched (st 0x1 wr 0x0) (st 0x1 wr 0x1) (st 0x1 wr 0x2) */ -- 0x0807ff00, 0xefd87f80, /* ld b32 $r0 a[0x80] 0x0 */ -- 0x0847ff01, 0xefd87f80, /* ld b32 $r1 a[0x84] 0x0 */ -- 0x0907ff02, 0xefd87f80, /* ld b32 $r2 a[0x90] 0x0 */ -- 0xf0200761, 0x003f8400, /* sched (st 0x1 wr 0x3) (st 0x1 wr 0x4) (st 0x1 wt 0x1) */ -- 0x0947ff03, 0xefd87f80, /* ld b32 $r3 a[0x94] 0x0 */ -- 0x0987ff04, 0xefd87f80, /* ld b32 $r4 a[0x98] 0x0 */ -- 0x0707ff00, 0xeff07f80, /* st b32 a[0x70] $r0 0x0 */ -- 0xfc2017e1, 0x011f8404, /* sched (st 0x1 wt 0x2) (st 0x1 wt 0x4) (st 0x1 wt 0x8) */ -- 0x0747ff01, 0xeff07f80, /* st b32 a[0x74] $r1 0x0 */ -- 0x0807ff02, 0xeff07f80, /* st b32 a[0x80] $r2 0x0 */ -- 0x0847ff03, 0xeff07f80, /* st b32 a[0x84] $r3 0x0 */ -- 0xfde087e1, 0x001f8000, /* sched (st 0x1 wt 0x10) (st 0xf) (st 0x0) */ -- 0x0887ff04, 0xeff07f80, /* st b32 a[0x88] $r4 0x0 */ -- 0x0007000f, 0xe3000000, /* exit */ -- }; -- -- blit->vp.type = PIPE_SHADER_VERTEX; -- blit->vp.translated = true; -- if (blit->screen->base.class_3d >= GM107_3D_CLASS) { -- blit->vp.code = (uint32_t *)code_gm107; /* const_cast */ -- blit->vp.code_size = sizeof(code_gm107); -- } else -- if (blit->screen->base.class_3d >= NVF0_3D_CLASS) { -- blit->vp.code = (uint32_t *)code_gk110; /* const_cast */ -- blit->vp.code_size = sizeof(code_gk110); -- } else -- if (blit->screen->base.class_3d >= NVE4_3D_CLASS) { -- blit->vp.code = (uint32_t *)code_nve4; /* const_cast */ -- blit->vp.code_size = sizeof(code_nve4); -- } else { -- blit->vp.code = (uint32_t *)code_nvc0; /* const_cast */ -- blit->vp.code_size = sizeof(code_nvc0); -- } -- blit->vp.num_gprs = 6; -- blit->vp.vp.edgeflag = PIPE_MAX_ATTRIBS; -+ struct ureg_program *ureg; -+ struct ureg_src ipos, itex; -+ struct ureg_dst opos, otex; -+ -+ ureg = ureg_create(PIPE_SHADER_VERTEX); -+ if (!ureg) -+ return NULL; -+ -+ opos = ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0); -+ ipos = ureg_DECL_vs_input(ureg, 0); -+ otex = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 0); -+ itex = ureg_DECL_vs_input(ureg, 1); -+ -+ ureg_MOV(ureg, ureg_writemask(opos, TGSI_WRITEMASK_XY ), ipos); -+ ureg_MOV(ureg, ureg_writemask(otex, TGSI_WRITEMASK_XYZ), itex); -+ ureg_END(ureg); - -- blit->vp.hdr[0] = 0x00020461; /* vertprog magic */ -- blit->vp.hdr[4] = 0x000ff000; /* no outputs read */ -- blit->vp.hdr[6] = 0x00000073; /* a[0x80].xy, a[0x90].xyz */ -- blit->vp.hdr[13] = 0x00073000; /* o[0x70].xy, o[0x80].xyz */ -+ return ureg_create_shader_and_destroy(ureg, pipe); - } - - static void -@@ -910,6 +867,20 @@ nvc0_blitter_make_sampler(struct nvc0_blitter *blit) - G80_TSC_1_MIP_FILTER_NONE; - } - -+static void -+nvc0_blit_select_vp(struct nvc0_blitctx *ctx) -+{ -+ struct nvc0_blitter *blitter = ctx->nvc0->screen->blitter; -+ -+ if (!blitter->vp) { -+ mtx_lock(&blitter->mutex); -+ if (!blitter->vp) -+ blitter->vp = nvc0_blitter_make_vp(&ctx->nvc0->base.pipe); -+ mtx_unlock(&blitter->mutex); -+ } -+ ctx->vp = blitter->vp; -+} -+ - static void - nvc0_blit_select_fp(struct nvc0_blitctx *ctx, const struct pipe_blit_info *info) - { -@@ -1082,7 +1053,7 @@ nvc0_blitctx_pre_blit(struct nvc0_blitctx *ctx, - - nvc0->rast = &ctx->rast; - -- nvc0->vertprog = &blitter->vp; -+ nvc0->vertprog = ctx->vp; - nvc0->tctlprog = NULL; - nvc0->tevlprog = NULL; - nvc0->gmtyprog = NULL; -@@ -1221,6 +1192,7 @@ nvc0_blit_3d(struct nvc0_context *nvc0, const struct pipe_blit_info *info) - blit->filter = nv50_blit_get_filter(info); - blit->render_condition_enable = info->render_condition_enable; - -+ nvc0_blit_select_vp(blit); - nvc0_blit_select_fp(blit, info); - nvc0_blitctx_pre_blit(blit, info); - -@@ -1266,6 +1238,11 @@ nvc0_blit_3d(struct nvc0_context *nvc0, const struct pipe_blit_info *info) - } - } - -+ if (screen->eng3d->oclass >= TU102_3D_CLASS) { -+ IMMED_NVC0(push, SUBC_3D(TU102_3D_SET_COLOR_RENDER_TO_ZETA_SURFACE), -+ util_format_is_depth_or_stencil(info->dst.format)); -+ } -+ - IMMED_NVC0(push, NVC0_3D(VIEWPORT_TRANSFORM_EN), 0); - IMMED_NVC0(push, NVC0_3D(VIEW_VOLUME_CLIP_CTRL), 0x2 | - NVC0_3D_VIEW_VOLUME_CLIP_CTRL_DEPTH_RANGE_0_1); -@@ -1326,7 +1303,10 @@ nvc0_blit_3d(struct nvc0_context *nvc0, const struct pipe_blit_info *info) - PUSH_DATAh(push, vtxbuf); - PUSH_DATA (push, vtxbuf); - PUSH_DATA (push, 0); -- BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_LIMIT_HIGH(0)), 2); -+ if (screen->eng3d->oclass < TU102_3D_CLASS) -+ BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_LIMIT_HIGH(0)), 2); -+ else -+ BEGIN_NVC0(push, SUBC_3D(TU102_3D_VERTEX_ARRAY_LIMIT_HIGH(0)), 2); - PUSH_DATAh(push, vtxbuf + length - 1); - PUSH_DATA (push, vtxbuf + length - 1); - -@@ -1403,6 +1383,8 @@ nvc0_blit_3d(struct nvc0_context *nvc0, const struct pipe_blit_info *info) - - /* restore viewport transform */ - IMMED_NVC0(push, NVC0_3D(VIEWPORT_TRANSFORM_EN), 1); -+ if (screen->eng3d->oclass >= TU102_3D_CLASS) -+ IMMED_NVC0(push, SUBC_3D(TU102_3D_SET_COLOR_RENDER_TO_ZETA_SURFACE), 0); - } - - static void -@@ -1697,7 +1679,6 @@ nvc0_blitter_create(struct nvc0_screen *screen) - - (void) mtx_init(&screen->blitter->mutex, mtx_plain); - -- nvc0_blitter_make_vp(screen->blitter); - nvc0_blitter_make_sampler(screen->blitter); - - return true; -diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c -index 92bd7eb5b8e..8287d8431b1 100644 ---- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c -+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c -@@ -360,7 +360,11 @@ nvc0_validate_vertex_buffers(struct nvc0_context *nvc0) - PUSH_DATAh(push, res->address + offset); - PUSH_DATA (push, res->address + offset); - } -- BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_LIMIT_HIGH(i)), 2); -+ -+ if (nvc0->screen->eng3d->oclass < TU102_3D_CLASS) -+ BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_LIMIT_HIGH(i)), 2); -+ else -+ BEGIN_NVC0(push, SUBC_3D(TU102_3D_VERTEX_ARRAY_LIMIT_HIGH(i)), 2); - PUSH_DATAh(push, res->address + limit); - PUSH_DATA (push, res->address + limit); - -@@ -406,7 +410,11 @@ nvc0_validate_vertex_buffers_shared(struct nvc0_context *nvc0) - PUSH_DATA (push, NVC0_3D_VERTEX_ARRAY_FETCH_ENABLE | vb->stride); - PUSH_DATAh(push, buf->address + offset); - PUSH_DATA (push, buf->address + offset); -- BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_LIMIT_HIGH(b)), 2); -+ -+ if (nvc0->screen->eng3d->oclass < TU102_3D_CLASS) -+ BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_LIMIT_HIGH(b)), 2); -+ else -+ BEGIN_NVC0(push, SUBC_3D(TU102_3D_VERTEX_ARRAY_LIMIT_HIGH(b)), 2); - PUSH_DATAh(push, buf->address + limit); - PUSH_DATA (push, buf->address + limit); - -@@ -961,12 +969,23 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) - assert(nouveau_resource_mapped_by_gpu(&buf->base)); - - PUSH_SPACE(push, 6); -- BEGIN_NVC0(push, NVC0_3D(INDEX_ARRAY_START_HIGH), 5); -- PUSH_DATAh(push, buf->address); -- PUSH_DATA (push, buf->address); -- PUSH_DATAh(push, buf->address + buf->base.width0 - 1); -- PUSH_DATA (push, buf->address + buf->base.width0 - 1); -- PUSH_DATA (push, info->index_size >> 1); -+ if (nvc0->screen->eng3d->oclass < TU102_3D_CLASS) { -+ BEGIN_NVC0(push, NVC0_3D(INDEX_ARRAY_START_HIGH), 5); -+ PUSH_DATAh(push, buf->address); -+ PUSH_DATA (push, buf->address); -+ PUSH_DATAh(push, buf->address + buf->base.width0 - 1); -+ PUSH_DATA (push, buf->address + buf->base.width0 - 1); -+ PUSH_DATA (push, info->index_size >> 1); -+ } else { -+ BEGIN_NVC0(push, NVC0_3D(INDEX_ARRAY_START_HIGH), 2); -+ PUSH_DATAh(push, buf->address); -+ PUSH_DATA (push, buf->address); -+ BEGIN_NVC0(push, SUBC_3D(TU102_3D_INDEX_ARRAY_LIMIT_HIGH), 2); -+ PUSH_DATAh(push, buf->address + buf->base.width0 - 1); -+ PUSH_DATA (push, buf->address + buf->base.width0 - 1); -+ BEGIN_NVC0(push, NVC0_3D(INDEX_FORMAT), 1); -+ PUSH_DATA (push, info->index_size >> 1); -+ } - - BCTX_REFN(nvc0->bufctx_3d, 3D_IDX, buf, RD); - } -diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c -index 8aa7088dfec..d49a5dfd2cf 100644 ---- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c -+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c -@@ -228,7 +228,11 @@ nvc0_push_setup_vertex_array(struct nvc0_context *nvc0, const unsigned count) - BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_START_HIGH(0)), 2); - PUSH_DATAh(push, va); - PUSH_DATA (push, va); -- BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_LIMIT_HIGH(0)), 2); -+ -+ if (nvc0->screen->eng3d->oclass < TU102_3D_CLASS) -+ BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_LIMIT_HIGH(0)), 2); -+ else -+ BEGIN_NVC0(push, SUBC_3D(TU102_3D_VERTEX_ARRAY_LIMIT_HIGH(0)), 2); - PUSH_DATAh(push, va + size - 1); - PUSH_DATA (push, va + size - 1); - -@@ -771,7 +775,11 @@ nvc0_push_upload_vertex_ids(struct push_context *ctx, - PUSH_DATA (push, NVC0_3D_VERTEX_ARRAY_FETCH_ENABLE | index_size); - PUSH_DATAh(push, va); - PUSH_DATA (push, va); -- BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_LIMIT_HIGH(1)), 2); -+ -+ if (nvc0->screen->eng3d->oclass < TU102_3D_CLASS) -+ BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_LIMIT_HIGH(1)), 2); -+ else -+ BEGIN_NVC0(push, SUBC_3D(TU102_3D_VERTEX_ARRAY_LIMIT_HIGH(1)), 2); - PUSH_DATAh(push, va + info->count * index_size - 1); - PUSH_DATA (push, va + info->count * index_size - 1); - -diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c -index 146eeb35f85..ebbc410184b 100644 ---- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c -+++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c -@@ -27,11 +27,18 @@ - - #include "codegen/nv50_ir_driver.h" - --#ifndef NDEBUG --static void nve4_compute_dump_launch_desc(const struct nve4_cp_launch_desc *); --static void gp100_compute_dump_launch_desc(const struct gp100_cp_launch_desc *); --#endif -- -+#include "drf.h" -+#include "qmd.h" -+#include "cla0c0qmd.h" -+#include "clc0c0qmd.h" -+#include "clc3c0qmd.h" -+ -+#define NVA0C0_QMDV00_06_VAL_SET(p,a...) NVVAL_MW_SET((p), NVA0C0, QMDV00_06, ##a) -+#define NVA0C0_QMDV00_06_DEF_SET(p,a...) NVDEF_MW_SET((p), NVA0C0, QMDV00_06, ##a) -+#define NVC0C0_QMDV02_01_VAL_SET(p,a...) NVVAL_MW_SET((p), NVC0C0, QMDV02_01, ##a) -+#define NVC0C0_QMDV02_01_DEF_SET(p,a...) NVDEF_MW_SET((p), NVC0C0, QMDV02_01, ##a) -+#define NVC3C0_QMDV02_02_VAL_SET(p,a...) NVVAL_MW_SET((p), NVC3C0, QMDV02_02, ##a) -+#define NVC3C0_QMDV02_02_DEF_SET(p,a...) NVDEF_MW_SET((p), NVC3C0, QMDV02_02, ##a) - - int - nve4_screen_compute_setup(struct nvc0_screen *screen, -@@ -45,6 +52,12 @@ nve4_screen_compute_setup(struct nvc0_screen *screen, - uint64_t address; - - switch (dev->chipset & ~0xf) { -+ case 0x160: -+ obj_class = TU102_COMPUTE_CLASS; -+ break; -+ case 0x140: -+ obj_class = GV100_COMPUTE_CLASS; -+ break; - case 0x100: - case 0xf0: - obj_class = NVF0_COMPUTE_CLASS; /* GK110 */ -@@ -88,24 +101,35 @@ nve4_screen_compute_setup(struct nvc0_screen *screen, - PUSH_DATAh(push, screen->tls->size / screen->mp_count); - PUSH_DATA (push, (screen->tls->size / screen->mp_count) & ~0x7fff); - PUSH_DATA (push, 0xff); -- BEGIN_NVC0(push, NVE4_CP(MP_TEMP_SIZE_HIGH(1)), 3); -- PUSH_DATAh(push, screen->tls->size / screen->mp_count); -- PUSH_DATA (push, (screen->tls->size / screen->mp_count) & ~0x7fff); -- PUSH_DATA (push, 0xff); -+ if (obj_class < GV100_COMPUTE_CLASS) { -+ BEGIN_NVC0(push, NVE4_CP(MP_TEMP_SIZE_HIGH(1)), 3); -+ PUSH_DATAh(push, screen->tls->size / screen->mp_count); -+ PUSH_DATA (push, (screen->tls->size / screen->mp_count) & ~0x7fff); -+ PUSH_DATA (push, 0xff); -+ } - - /* Unified address space ? Who needs that ? Certainly not OpenCL. - * - * FATAL: Buffers with addresses inside [0x1000000, 0x3000000] will NOT be - * accessible. We cannot prevent that at the moment, so expect failure. - */ -- BEGIN_NVC0(push, NVE4_CP(LOCAL_BASE), 1); -- PUSH_DATA (push, 0xff << 24); -- BEGIN_NVC0(push, NVE4_CP(SHARED_BASE), 1); -- PUSH_DATA (push, 0xfe << 24); -- -- BEGIN_NVC0(push, NVE4_CP(CODE_ADDRESS_HIGH), 2); -- PUSH_DATAh(push, screen->text->offset); -- PUSH_DATA (push, screen->text->offset); -+ if (obj_class < GV100_COMPUTE_CLASS) { -+ BEGIN_NVC0(push, NVE4_CP(LOCAL_BASE), 1); -+ PUSH_DATA (push, 0xff << 24); -+ BEGIN_NVC0(push, NVE4_CP(SHARED_BASE), 1); -+ PUSH_DATA (push, 0xfe << 24); -+ -+ BEGIN_NVC0(push, NVE4_CP(CODE_ADDRESS_HIGH), 2); -+ PUSH_DATAh(push, screen->text->offset); -+ PUSH_DATA (push, screen->text->offset); -+ } else { -+ BEGIN_NVC0(push, SUBC_CP(0x2a0), 2); -+ PUSH_DATAh(push, 0xfeULL << 24); -+ PUSH_DATA (push, 0xfeULL << 24); -+ BEGIN_NVC0(push, SUBC_CP(0x7b0), 2); -+ PUSH_DATAh(push, 0xffULL << 24); -+ PUSH_DATA (push, 0xffULL << 24); -+ } - - BEGIN_NVC0(push, SUBC_CP(0x0310), 1); - PUSH_DATA (push, (obj_class >= NVF0_COMPUTE_CLASS) ? 0x400 : 0x300); -@@ -542,14 +566,35 @@ nve4_compute_upload_input(struct nvc0_context *nvc0, - PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB); - } - --static inline uint8_t --nve4_compute_derive_cache_split(struct nvc0_context *nvc0, uint32_t shared_size) -+static inline void -+gp100_cp_launch_desc_set_cb(uint32_t *qmd, unsigned index, -+ struct nouveau_bo *bo, uint32_t base, uint32_t size) -+{ -+ uint64_t address = bo->offset + base; -+ -+ assert(index < 8); -+ assert(!(base & 0xff)); -+ -+ NVC0C0_QMDV02_01_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_LOWER, index, address); -+ NVC0C0_QMDV02_01_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_UPPER, index, address >> 32); -+ NVC0C0_QMDV02_01_VAL_SET(qmd, CONSTANT_BUFFER_SIZE_SHIFTED4, index, -+ DIV_ROUND_UP(size, 16)); -+ NVC0C0_QMDV02_01_DEF_SET(qmd, CONSTANT_BUFFER_VALID, index, TRUE); -+} -+ -+static inline void -+nve4_cp_launch_desc_set_cb(uint32_t *qmd, unsigned index, struct nouveau_bo *bo, -+ uint32_t base, uint32_t size) - { -- if (shared_size > (32 << 10)) -- return NVC0_3D_CACHE_SPLIT_48K_SHARED_16K_L1; -- if (shared_size > (16 << 10)) -- return NVE4_3D_CACHE_SPLIT_32K_SHARED_32K_L1; -- return NVC1_3D_CACHE_SPLIT_16K_SHARED_48K_L1; -+ uint64_t address = bo->offset + base; -+ -+ assert(index < 8); -+ assert(!(base & 0xff)); -+ -+ NVA0C0_QMDV00_06_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_LOWER, index, address); -+ NVA0C0_QMDV00_06_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_UPPER, index, address >> 32); -+ NVA0C0_QMDV00_06_VAL_SET(qmd, CONSTANT_BUFFER_SIZE, index, size); -+ NVA0C0_QMDV00_06_DEF_SET(qmd, CONSTANT_BUFFER_VALID, index, TRUE); - } - - static void -@@ -577,92 +622,186 @@ nve4_compute_setup_buf_cb(struct nvc0_context *nvc0, bool gp100, void *desc) - } - - static void --nve4_compute_setup_launch_desc(struct nvc0_context *nvc0, -- struct nve4_cp_launch_desc *desc, -+nve4_compute_setup_launch_desc(struct nvc0_context *nvc0, uint32_t *qmd, - const struct pipe_grid_info *info) - { - const struct nvc0_screen *screen = nvc0->screen; - const struct nvc0_program *cp = nvc0->compprog; - -- nve4_cp_launch_desc_init_default(desc); -- -- desc->entry = nvc0_program_symbol_offset(cp, info->pc); -- -- desc->griddim_x = info->grid[0]; -- desc->griddim_y = info->grid[1]; -- desc->griddim_z = info->grid[2]; -- desc->blockdim_x = info->block[0]; -- desc->blockdim_y = info->block[1]; -- desc->blockdim_z = info->block[2]; -- -- desc->shared_size = align(cp->cp.smem_size, 0x100); -- desc->local_size_p = (cp->hdr[1] & 0xfffff0) + align(cp->cp.lmem_size, 0x10); -- desc->local_size_n = 0; -- desc->cstack_size = 0x800; -- desc->cache_split = nve4_compute_derive_cache_split(nvc0, cp->cp.smem_size); -+ NVA0C0_QMDV00_06_DEF_SET(qmd, INVALIDATE_TEXTURE_HEADER_CACHE, TRUE); -+ NVA0C0_QMDV00_06_DEF_SET(qmd, INVALIDATE_TEXTURE_SAMPLER_CACHE, TRUE); -+ NVA0C0_QMDV00_06_DEF_SET(qmd, INVALIDATE_TEXTURE_DATA_CACHE, TRUE); -+ NVA0C0_QMDV00_06_DEF_SET(qmd, INVALIDATE_SHADER_DATA_CACHE, TRUE); -+ NVA0C0_QMDV00_06_DEF_SET(qmd, INVALIDATE_SHADER_CONSTANT_CACHE, TRUE); -+ NVA0C0_QMDV00_06_DEF_SET(qmd, RELEASE_MEMBAR_TYPE, FE_SYSMEMBAR); -+ NVA0C0_QMDV00_06_DEF_SET(qmd, CWD_MEMBAR_TYPE, L1_SYSMEMBAR); -+ NVA0C0_QMDV00_06_DEF_SET(qmd, API_VISIBLE_CALL_LIMIT, NO_CHECK); -+ NVA0C0_QMDV00_06_VAL_SET(qmd, SASS_VERSION, 0x30); -+ -+ NVA0C0_QMDV00_06_VAL_SET(qmd, PROGRAM_OFFSET, -+ nvc0_program_symbol_offset(cp, info->pc)); -+ -+ NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_RASTER_WIDTH, info->grid[0]); -+ NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_RASTER_HEIGHT, info->grid[1]); -+ NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_RASTER_DEPTH, info->grid[2]); -+ NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_THREAD_DIMENSION0, info->block[0]); -+ NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_THREAD_DIMENSION1, info->block[1]); -+ NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_THREAD_DIMENSION2, info->block[2]); -+ -+ NVA0C0_QMDV00_06_VAL_SET(qmd, SHARED_MEMORY_SIZE, -+ align(cp->cp.smem_size, 0x100)); -+ NVA0C0_QMDV00_06_VAL_SET(qmd, SHADER_LOCAL_MEMORY_LOW_SIZE, -+ (cp->hdr[1] & 0xfffff0) + -+ align(cp->cp.lmem_size, 0x10)); -+ NVA0C0_QMDV00_06_VAL_SET(qmd, SHADER_LOCAL_MEMORY_HIGH_SIZE, 0); -+ NVA0C0_QMDV00_06_VAL_SET(qmd, SHADER_LOCAL_MEMORY_CRS_SIZE, 0x800); -+ -+ if (cp->cp.smem_size > (32 << 10)) -+ NVA0C0_QMDV00_06_DEF_SET(qmd, L1_CONFIGURATION, -+ DIRECTLY_ADDRESSABLE_MEMORY_SIZE_48KB); -+ else -+ if (cp->cp.smem_size > (16 << 10)) -+ NVA0C0_QMDV00_06_DEF_SET(qmd, L1_CONFIGURATION, -+ DIRECTLY_ADDRESSABLE_MEMORY_SIZE_32KB); -+ else -+ NVA0C0_QMDV00_06_DEF_SET(qmd, L1_CONFIGURATION, -+ DIRECTLY_ADDRESSABLE_MEMORY_SIZE_16KB); - -- desc->gpr_alloc = cp->num_gprs; -- desc->bar_alloc = cp->num_barriers; -+ NVA0C0_QMDV00_06_VAL_SET(qmd, REGISTER_COUNT, cp->num_gprs); -+ NVA0C0_QMDV00_06_VAL_SET(qmd, BARRIER_COUNT, cp->num_barriers); - - // Only bind user uniforms and the driver constant buffer through the - // launch descriptor because UBOs are sticked to the driver cb to avoid the - // limitation of 8 CBs. - if (nvc0->constbuf[5][0].user || cp->parm_size) { -- nve4_cp_launch_desc_set_cb(desc, 0, screen->uniform_bo, -+ nve4_cp_launch_desc_set_cb(qmd, 0, screen->uniform_bo, - NVC0_CB_USR_INFO(5), 1 << 16); - - // Later logic will attempt to bind a real buffer at position 0. That - // should not happen if we've bound a user buffer. - assert(nvc0->constbuf[5][0].user || !nvc0->constbuf[5][0].u.buf); - } -- nve4_cp_launch_desc_set_cb(desc, 7, screen->uniform_bo, -+ nve4_cp_launch_desc_set_cb(qmd, 7, screen->uniform_bo, - NVC0_CB_AUX_INFO(5), 1 << 11); - -- nve4_compute_setup_buf_cb(nvc0, false, desc); -+ nve4_compute_setup_buf_cb(nvc0, false, qmd); - } - - static void --gp100_compute_setup_launch_desc(struct nvc0_context *nvc0, -- struct gp100_cp_launch_desc *desc, -+gp100_compute_setup_launch_desc(struct nvc0_context *nvc0, uint32_t *qmd, - const struct pipe_grid_info *info) - { - const struct nvc0_screen *screen = nvc0->screen; - const struct nvc0_program *cp = nvc0->compprog; - -- gp100_cp_launch_desc_init_default(desc); -+ NVC0C0_QMDV02_01_VAL_SET(qmd, SM_GLOBAL_CACHING_ENABLE, 1); -+ NVC0C0_QMDV02_01_DEF_SET(qmd, RELEASE_MEMBAR_TYPE, FE_SYSMEMBAR); -+ NVC0C0_QMDV02_01_DEF_SET(qmd, CWD_MEMBAR_TYPE, L1_SYSMEMBAR); -+ NVC0C0_QMDV02_01_DEF_SET(qmd, API_VISIBLE_CALL_LIMIT, NO_CHECK); -+ -+ NVC0C0_QMDV02_01_VAL_SET(qmd, PROGRAM_OFFSET, -+ nvc0_program_symbol_offset(cp, info->pc)); -+ -+ NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_RASTER_WIDTH, info->grid[0]); -+ NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_RASTER_HEIGHT, info->grid[1]); -+ NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_RASTER_DEPTH, info->grid[2]); -+ NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_THREAD_DIMENSION0, info->block[0]); -+ NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_THREAD_DIMENSION1, info->block[1]); -+ NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_THREAD_DIMENSION2, info->block[2]); -+ -+ NVC0C0_QMDV02_01_VAL_SET(qmd, SHARED_MEMORY_SIZE, -+ align(cp->cp.smem_size, 0x100)); -+ NVC0C0_QMDV02_01_VAL_SET(qmd, SHADER_LOCAL_MEMORY_LOW_SIZE, -+ (cp->hdr[1] & 0xfffff0) + -+ align(cp->cp.lmem_size, 0x10)); -+ NVC0C0_QMDV02_01_VAL_SET(qmd, SHADER_LOCAL_MEMORY_HIGH_SIZE, 0); -+ NVC0C0_QMDV02_01_VAL_SET(qmd, SHADER_LOCAL_MEMORY_CRS_SIZE, 0x800); - -- desc->entry = nvc0_program_symbol_offset(cp, info->pc); -+ NVC0C0_QMDV02_01_VAL_SET(qmd, REGISTER_COUNT, cp->num_gprs); -+ NVC0C0_QMDV02_01_VAL_SET(qmd, BARRIER_COUNT, cp->num_barriers); - -- desc->griddim_x = info->grid[0]; -- desc->griddim_y = info->grid[1]; -- desc->griddim_z = info->grid[2]; -- desc->blockdim_x = info->block[0]; -- desc->blockdim_y = info->block[1]; -- desc->blockdim_z = info->block[2]; -+ // Only bind user uniforms and the driver constant buffer through the -+ // launch descriptor because UBOs are sticked to the driver cb to avoid the -+ // limitation of 8 CBs. -+ if (nvc0->constbuf[5][0].user || cp->parm_size) { -+ gp100_cp_launch_desc_set_cb(qmd, 0, screen->uniform_bo, -+ NVC0_CB_USR_INFO(5), 1 << 16); - -- desc->shared_size = align(cp->cp.smem_size, 0x100); -- desc->local_size_p = (cp->hdr[1] & 0xfffff0) + align(cp->cp.lmem_size, 0x10); -- desc->local_size_n = 0; -- desc->cstack_size = 0x800; -+ // Later logic will attempt to bind a real buffer at position 0. That -+ // should not happen if we've bound a user buffer. -+ assert(nvc0->constbuf[5][0].user || !nvc0->constbuf[5][0].u.buf); -+ } -+ gp100_cp_launch_desc_set_cb(qmd, 7, screen->uniform_bo, -+ NVC0_CB_AUX_INFO(5), 1 << 11); -+ -+ nve4_compute_setup_buf_cb(nvc0, true, qmd); -+} -+ -+static int -+gv100_sm_config_smem_size(u32 size) -+{ -+ if (size > 64 * 1024) size = 96 * 1024; -+ else if (size > 32 * 1024) size = 64 * 1024; -+ else if (size > 16 * 1024) size = 32 * 1024; -+ else if (size > 8 * 1024) size = 16 * 1024; -+ else size = 8 * 1024; -+ return (size / 4096) + 1; -+} - -- desc->gpr_alloc = cp->num_gprs; -- desc->bar_alloc = cp->num_barriers; -+static void -+gv100_compute_setup_launch_desc(struct nvc0_context *nvc0, u32 *qmd, -+ const struct pipe_grid_info *info) -+{ -+ struct nvc0_program *cp = nvc0->compprog; -+ struct nvc0_screen *screen = nvc0->screen; -+ uint64_t entry = -+ screen->text->offset + nvc0_program_symbol_offset(cp, info->pc); -+ -+ NVC3C0_QMDV02_02_VAL_SET(qmd, SM_GLOBAL_CACHING_ENABLE, 1); -+ NVC3C0_QMDV02_02_DEF_SET(qmd, API_VISIBLE_CALL_LIMIT, NO_CHECK); -+ NVC3C0_QMDV02_02_DEF_SET(qmd, SAMPLER_INDEX, INDEPENDENTLY); -+ NVC3C0_QMDV02_02_VAL_SET(qmd, SHARED_MEMORY_SIZE, -+ align(cp->cp.smem_size, 0x100)); -+ NVC3C0_QMDV02_02_VAL_SET(qmd, SHADER_LOCAL_MEMORY_LOW_SIZE, -+ (cp->hdr[1] & 0xfffff0) + -+ align(cp->cp.lmem_size, 0x10)); -+ NVC3C0_QMDV02_02_VAL_SET(qmd, SHADER_LOCAL_MEMORY_HIGH_SIZE, 0); -+ NVC3C0_QMDV02_02_VAL_SET(qmd, MIN_SM_CONFIG_SHARED_MEM_SIZE, -+ gv100_sm_config_smem_size(8 * 1024)); -+ NVC3C0_QMDV02_02_VAL_SET(qmd, MAX_SM_CONFIG_SHARED_MEM_SIZE, -+ gv100_sm_config_smem_size(96 * 1024)); -+ NVC3C0_QMDV02_02_VAL_SET(qmd, QMD_VERSION, 2); -+ NVC3C0_QMDV02_02_VAL_SET(qmd, QMD_MAJOR_VERSION, 2); -+ NVC3C0_QMDV02_02_VAL_SET(qmd, TARGET_SM_CONFIG_SHARED_MEM_SIZE, -+ gv100_sm_config_smem_size(cp->cp.smem_size)); -+ -+ NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_RASTER_WIDTH, info->grid[0]); -+ NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_RASTER_HEIGHT, info->grid[1]); -+ NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_RASTER_DEPTH, info->grid[2]); -+ NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_THREAD_DIMENSION0, info->block[0]); -+ NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_THREAD_DIMENSION1, info->block[1]); -+ NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_THREAD_DIMENSION2, info->block[2]); -+ NVC3C0_QMDV02_02_VAL_SET(qmd, REGISTER_COUNT_V, cp->num_gprs); -+ NVC3C0_QMDV02_02_VAL_SET(qmd, BARRIER_COUNT, cp->num_barriers); - - // Only bind user uniforms and the driver constant buffer through the - // launch descriptor because UBOs are sticked to the driver cb to avoid the - // limitation of 8 CBs. - if (nvc0->constbuf[5][0].user || cp->parm_size) { -- gp100_cp_launch_desc_set_cb(desc, 0, screen->uniform_bo, -+ gp100_cp_launch_desc_set_cb(qmd, 0, screen->uniform_bo, - NVC0_CB_USR_INFO(5), 1 << 16); - - // Later logic will attempt to bind a real buffer at position 0. That - // should not happen if we've bound a user buffer. - assert(nvc0->constbuf[5][0].user || !nvc0->constbuf[5][0].u.buf); - } -- gp100_cp_launch_desc_set_cb(desc, 7, screen->uniform_bo, -+ gp100_cp_launch_desc_set_cb(qmd, 7, screen->uniform_bo, - NVC0_CB_AUX_INFO(5), 1 << 11); - -- nve4_compute_setup_buf_cb(nvc0, true, desc); -+ nve4_compute_setup_buf_cb(nvc0, true, qmd); -+ -+ NVC3C0_QMDV02_02_VAL_SET(qmd, PROGRAM_ADDRESS_LOWER, entry & 0xffffffff); -+ NVC3C0_QMDV02_02_VAL_SET(qmd, PROGRAM_ADDRESS_UPPER, entry >> 32); - } - - static inline void * -@@ -677,6 +816,7 @@ nve4_compute_alloc_launch_desc(struct nouveau_context *nv, - ptr += adj; - *pgpuaddr += adj; - } -+ memset(ptr, 0x00, 256); - return ptr; - } - -@@ -734,6 +874,9 @@ nve4_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info) - if (ret) - goto out; - -+ if (nvc0->screen->compute->oclass >= GV100_COMPUTE_CLASS) -+ gv100_compute_setup_launch_desc(nvc0, desc, info); -+ else - if (nvc0->screen->compute->oclass >= GP100_COMPUTE_CLASS) - gp100_compute_setup_launch_desc(nvc0, desc, info); - else -@@ -743,10 +886,14 @@ nve4_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info) - - #ifndef NDEBUG - if (debug_get_num_option("NV50_PROG_DEBUG", 0)) { -+ debug_printf("Queue Meta Data:\n"); -+ if (nvc0->screen->compute->oclass >= GV100_COMPUTE_CLASS) -+ NVC3C0QmdDump_V02_02(desc); -+ else - if (nvc0->screen->compute->oclass >= GP100_COMPUTE_CLASS) -- gp100_compute_dump_launch_desc(desc); -+ NVC0C0QmdDump_V02_01(desc); - else -- nve4_compute_dump_launch_desc(desc); -+ NVA0C0QmdDump_V00_06(desc); - } - #endif - -@@ -877,115 +1024,6 @@ nve4_compute_validate_textures(struct nvc0_context *nvc0) - nvc0->dirty_3d |= NVC0_NEW_3D_TEXTURES; - } - -- --#ifndef NDEBUG --static const char *nve4_cache_split_name(unsigned value) --{ -- switch (value) { -- case NVC1_3D_CACHE_SPLIT_16K_SHARED_48K_L1: return "16K_SHARED_48K_L1"; -- case NVE4_3D_CACHE_SPLIT_32K_SHARED_32K_L1: return "32K_SHARED_32K_L1"; -- case NVC0_3D_CACHE_SPLIT_48K_SHARED_16K_L1: return "48K_SHARED_16K_L1"; -- default: -- return "(invalid)"; -- } --} -- --static void --nve4_compute_dump_launch_desc(const struct nve4_cp_launch_desc *desc) --{ -- const uint32_t *data = (const uint32_t *)desc; -- unsigned i; -- bool zero = false; -- -- debug_printf("COMPUTE LAUNCH DESCRIPTOR:\n"); -- -- for (i = 0; i < sizeof(*desc); i += 4) { -- if (data[i / 4]) { -- debug_printf("[%x]: 0x%08x\n", i, data[i / 4]); -- zero = false; -- } else -- if (!zero) { -- debug_printf("...\n"); -- zero = true; -- } -- } -- -- debug_printf("entry = 0x%x\n", desc->entry); -- debug_printf("grid dimensions = %ux%ux%u\n", -- desc->griddim_x, desc->griddim_y, desc->griddim_z); -- debug_printf("block dimensions = %ux%ux%u\n", -- desc->blockdim_x, desc->blockdim_y, desc->blockdim_z); -- debug_printf("s[] size: 0x%x\n", desc->shared_size); -- debug_printf("l[] size: -0x%x / +0x%x\n", -- desc->local_size_n, desc->local_size_p); -- debug_printf("stack size: 0x%x\n", desc->cstack_size); -- debug_printf("barrier count: %u\n", desc->bar_alloc); -- debug_printf("$r count: %u\n", desc->gpr_alloc); -- debug_printf("cache split: %s\n", nve4_cache_split_name(desc->cache_split)); -- debug_printf("linked tsc: %d\n", desc->linked_tsc); -- -- for (i = 0; i < 8; ++i) { -- uint64_t address; -- uint32_t size = desc->cb[i].size; -- bool valid = !!(desc->cb_mask & (1 << i)); -- -- address = ((uint64_t)desc->cb[i].address_h << 32) | desc->cb[i].address_l; -- -- if (!valid && !address && !size) -- continue; -- debug_printf("CB[%u]: address = 0x%"PRIx64", size 0x%x%s\n", -- i, address, size, valid ? "" : " (invalid)"); -- } --} -- --static void --gp100_compute_dump_launch_desc(const struct gp100_cp_launch_desc *desc) --{ -- const uint32_t *data = (const uint32_t *)desc; -- unsigned i; -- bool zero = false; -- -- debug_printf("COMPUTE LAUNCH DESCRIPTOR:\n"); -- -- for (i = 0; i < sizeof(*desc); i += 4) { -- if (data[i / 4]) { -- debug_printf("[%x]: 0x%08x\n", i, data[i / 4]); -- zero = false; -- } else -- if (!zero) { -- debug_printf("...\n"); -- zero = true; -- } -- } -- -- debug_printf("entry = 0x%x\n", desc->entry); -- debug_printf("grid dimensions = %ux%ux%u\n", -- desc->griddim_x, desc->griddim_y, desc->griddim_z); -- debug_printf("block dimensions = %ux%ux%u\n", -- desc->blockdim_x, desc->blockdim_y, desc->blockdim_z); -- debug_printf("s[] size: 0x%x\n", desc->shared_size); -- debug_printf("l[] size: -0x%x / +0x%x\n", -- desc->local_size_n, desc->local_size_p); -- debug_printf("stack size: 0x%x\n", desc->cstack_size); -- debug_printf("barrier count: %u\n", desc->bar_alloc); -- debug_printf("$r count: %u\n", desc->gpr_alloc); -- debug_printf("linked tsc: %d\n", desc->linked_tsc); -- -- for (i = 0; i < 8; ++i) { -- uint64_t address; -- uint32_t size = desc->cb[i].size_sh4 << 4; -- bool valid = !!(desc->cb_mask & (1 << i)); -- -- address = ((uint64_t)desc->cb[i].address_h << 32) | desc->cb[i].address_l; -- -- if (!valid && !address && !size) -- continue; -- debug_printf("CB[%u]: address = 0x%"PRIx64", size 0x%x%s\n", -- i, address, size, valid ? "" : " (invalid)"); -- } --} --#endif -- - #ifdef NOUVEAU_NVE4_MP_TRAP_HANDLER - static void - nve4_compute_trap_info(struct nvc0_context *nvc0) -diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.h b/src/gallium/drivers/nouveau/nvc0/nve4_compute.h -index 7ff6935cc3d..d2599f7a71d 100644 ---- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.h -+++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.h -@@ -4,142 +4,6 @@ - - #include "nvc0/nve4_compute.xml.h" - --struct nve4_cp_launch_desc --{ -- u32 unk0[8]; -- u32 entry; -- u32 unk9[2]; -- u32 unk11_0 : 30; -- u32 linked_tsc : 1; -- u32 unk11_31 : 1; -- u32 griddim_x : 31; -- u32 unk12 : 1; -- u16 griddim_y; -- u16 griddim_z; -- u32 unk14[3]; -- u16 shared_size; /* must be aligned to 0x100 */ -- u16 unk17; -- u16 unk18; -- u16 blockdim_x; -- u16 blockdim_y; -- u16 blockdim_z; -- u32 cb_mask : 8; -- u32 unk20_8 : 21; -- u32 cache_split : 2; -- u32 unk20_31 : 1; -- u32 unk21[8]; -- struct { -- u32 address_l; -- u32 address_h : 8; -- u32 reserved : 7; -- u32 size : 17; -- } cb[8]; -- u32 local_size_p : 20; -- u32 unk45_20 : 7; -- u32 bar_alloc : 5; -- u32 local_size_n : 20; -- u32 unk46_20 : 4; -- u32 gpr_alloc : 8; -- u32 cstack_size : 20; -- u32 unk47_20 : 12; -- u32 unk48[16]; --}; -- --struct gp100_cp_launch_desc --{ -- u32 unk0[8]; -- u32 entry; -- u32 unk9[2]; -- u32 unk11_0 : 30; -- u32 linked_tsc : 1; -- u32 unk11_31 : 1; -- u32 griddim_x : 31; -- u32 unk12 : 1; -- u16 griddim_y; -- u16 unk13; -- u16 griddim_z; -- u16 unk14; -- u32 unk15[2]; -- u32 shared_size : 18; -- u32 unk17 : 14; -- u16 unk18; -- u16 blockdim_x; -- u16 blockdim_y; -- u16 blockdim_z; -- u32 cb_mask : 8; -- u32 unk20 : 24; -- u32 unk21[8]; -- u32 local_size_p : 24; -- u32 unk29 : 3; -- u32 bar_alloc : 5; -- u32 local_size_n : 24; -- u32 gpr_alloc : 8; -- u32 cstack_size : 24; -- u32 unk31 : 8; -- struct { -- u32 address_l; -- u32 address_h : 17; -- u32 reserved : 2; -- u32 size_sh4 : 13; -- } cb[8]; -- u32 unk48[16]; --}; -- --static inline void --nve4_cp_launch_desc_init_default(struct nve4_cp_launch_desc *desc) --{ -- memset(desc, 0, sizeof(*desc)); -- -- desc->unk0[7] = 0xbc000000; -- desc->unk11_0 = 0x04014000; -- desc->unk47_20 = 0x300; --} -- --static inline void --nve4_cp_launch_desc_set_cb(struct nve4_cp_launch_desc *desc, -- unsigned index, -- struct nouveau_bo *bo, -- uint32_t base, uint32_t size) --{ -- uint64_t address = bo->offset + base; -- -- assert(index < 8); -- assert(!(base & 0xff)); -- -- desc->cb[index].address_l = address; -- desc->cb[index].address_h = address >> 32; -- desc->cb[index].size = size; -- -- desc->cb_mask |= 1 << index; --} -- --static inline void --gp100_cp_launch_desc_init_default(struct gp100_cp_launch_desc *desc) --{ -- memset(desc, 0, sizeof(*desc)); -- -- desc->unk0[4] = 0x40; -- desc->unk11_0 = 0x04014000; --} -- --static inline void --gp100_cp_launch_desc_set_cb(struct gp100_cp_launch_desc *desc, -- unsigned index, -- struct nouveau_bo *bo, -- uint32_t base, uint32_t size) --{ -- uint64_t address = bo->offset + base; -- -- assert(index < 8); -- assert(!(base & 0xff)); -- -- desc->cb[index].address_l = address; -- desc->cb[index].address_h = address >> 32; -- desc->cb[index].size_sh4 = DIV_ROUND_UP(size, 16); -- -- desc->cb_mask |= 1 << index; --} -- - struct nve4_mp_trap_info { - u32 lock; - u32 pc; -diff --git a/src/gallium/drivers/nouveau/nvc0/qmd.h b/src/gallium/drivers/nouveau/nvc0/qmd.h -new file mode 100644 -index 00000000000..86c290fe836 ---- /dev/null -+++ b/src/gallium/drivers/nouveau/nvc0/qmd.h -@@ -0,0 +1,68 @@ -+#ifndef __NVHW_QMD_H__ -+#define __NVHW_QMD_H__ -+#include -+#include -+#include "util/u_debug.h" -+#include "drf.h" -+ -+#define NVQMD_ENUM_1(X,drf,v0) \ -+ [drf##_##v0] = #v0 -+#define NVQMD_ENUM_2(X,drf,v0,v1) \ -+ [drf##_##v0] = #v0, \ -+ [drf##_##v1] = #v1 -+#define NVQMD_ENUM_3(X,drf,v0,v1,v2) \ -+ [drf##_##v0] = #v0, \ -+ [drf##_##v1] = #v1, \ -+ [drf##_##v2] = #v2 -+#define NVQMD_ENUM_8(X,drf,v0,v1,v2,v3,v4,v5,v6,v7) \ -+ [drf##_##v0] = #v0, \ -+ [drf##_##v1] = #v1, \ -+ [drf##_##v2] = #v2, \ -+ [drf##_##v3] = #v3, \ -+ [drf##_##v4] = #v4, \ -+ [drf##_##v5] = #v5, \ -+ [drf##_##v6] = #v6, \ -+ [drf##_##v7] = #v7 -+ -+#define NVQMD_ENUM_(X,_1,_2,_3,_4,_5,_6,_7,_8,_9,IMPL,...) IMPL -+#define NVQMD_ENUM(A...) NVQMD_ENUM_(X, ##A, NVQMD_ENUM_8, NVQMD_ENUM_7, \ -+ NVQMD_ENUM_6, NVQMD_ENUM_5, \ -+ NVQMD_ENUM_4, NVQMD_ENUM_3, \ -+ NVQMD_ENUM_2, NVQMD_ENUM_1)(X, ##A) -+ -+#define NVQMD_VAL_N(X,d,r,p,f,o) do { \ -+ uint32_t val = NVVAL_MW_GET_X((p), d##_##r##_##f); \ -+ debug_printf(" %-36s: "o"\n", #f, val); \ -+} while(0) -+#define NVQMD_VAL_I(X,d,r,p,f,i,o) do { \ -+ uint32_t val = NVVAL_MW_GET_X((p), d##_##r##_##f(i)); \ -+ char name[80]; \ -+ snprintf(name, sizeof(name), "%s(%d)", #f, i); \ -+ debug_printf(" %-36s: "o"\n", name, val); \ -+} while(0) -+#define NVQMD_VAL_(X,_1,_2,_3,_4,_5,_6,IMPL,...) IMPL -+#define NVQMD_VAL(A...) NVQMD_VAL_(X, ##A, NVQMD_VAL_I, NVQMD_VAL_N)(X, ##A) -+ -+#define NVQMD_DEF(d,r,p,f,e...) do { \ -+ static const char *ev[] = { NVQMD_ENUM(d##_##r##_##f,##e) }; \ -+ uint32_t val = NVVAL_MW_GET((p), d, r, f); \ -+ if (val < ARRAY_SIZE(ev) && ev[val]) \ -+ debug_printf(" %-36s: %s\n", #f, ev[val]); \ -+ else \ -+ debug_printf(" %-36s: UNKNOWN 0x%x\n", #f, val); \ -+} while(0) -+#define NVQMD_IDX(d,r,p,f,i,e...) do { \ -+ static const char *ev[] = { NVQMD_ENUM(d##_##r##_##f,##e) }; \ -+ char name[80]; \ -+ snprintf(name, sizeof(name), "%s(%d)", #f, i); \ -+ uint32_t val = NVVAL_MW_GET((p), d, r, f, i); \ -+ if (val < ARRAY_SIZE(ev) && ev[val]) \ -+ debug_printf(" %-36s: %s\n", name, ev[val]); \ -+ else \ -+ debug_printf(" %-36s: UNKNOWN 0x%x\n", name, val); \ -+} while(0) -+ -+void NVA0C0QmdDump_V00_06(uint32_t *); -+void NVC0C0QmdDump_V02_01(uint32_t *); -+void NVC3C0QmdDump_V02_02(uint32_t *); -+#endif -diff --git a/src/gallium/drivers/nouveau/nvc0/qmda0c0.c b/src/gallium/drivers/nouveau/nvc0/qmda0c0.c -new file mode 100644 -index 00000000000..7103a893af5 ---- /dev/null -+++ b/src/gallium/drivers/nouveau/nvc0/qmda0c0.c -@@ -0,0 +1,166 @@ -+/* -+ * Copyright 2020 Red Hat Inc. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice shall be included in -+ * all copies or substantial portions of the Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -+ * OTHER DEALINGS IN THE SOFTWARE. -+ */ -+#include "qmd.h" -+#include "cla0c0qmd.h" -+ -+#define NVA0C0_QMDV00_06_VAL(a...) NVQMD_VAL(NVA0C0, QMDV00_06, ##a) -+#define NVA0C0_QMDV00_06_DEF(a...) NVQMD_DEF(NVA0C0, QMDV00_06, ##a) -+#define NVA0C0_QMDV00_06_IDX(a...) NVQMD_IDX(NVA0C0, QMDV00_06, ##a) -+ -+void -+NVA0C0QmdDump_V00_06(uint32_t *qmd) -+{ -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_V1_A, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_V1_B, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_V1_C, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_V1_D, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_V1_E, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_V1_F, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_V1_G, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_V1_H, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_A_A, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_V1_I, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_V1_J, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_A, "0x%x"); -+ NVA0C0_QMDV00_06_DEF(qmd, QMD_RESERVED_V1_K, FALSE, TRUE); -+ NVA0C0_QMDV00_06_DEF(qmd, QMD_RESERVED_V1_L, FALSE, TRUE); -+ NVA0C0_QMDV00_06_DEF(qmd, SEMAPHORE_RELEASE_ENABLE0, FALSE, TRUE); -+ NVA0C0_QMDV00_06_DEF(qmd, SEMAPHORE_RELEASE_ENABLE1, FALSE, TRUE); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_B, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_V1_M, "0x%x"); -+ NVA0C0_QMDV00_06_DEF(qmd, QMD_RESERVED_V1_N, FALSE, TRUE); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_V1_O, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_C, "0x%x"); -+ NVA0C0_QMDV00_06_DEF(qmd, INVALIDATE_TEXTURE_HEADER_CACHE, FALSE, TRUE); -+ NVA0C0_QMDV00_06_DEF(qmd, INVALIDATE_TEXTURE_SAMPLER_CACHE, FALSE, TRUE); -+ NVA0C0_QMDV00_06_DEF(qmd, INVALIDATE_TEXTURE_DATA_CACHE, FALSE, TRUE); -+ NVA0C0_QMDV00_06_DEF(qmd, INVALIDATE_SHADER_DATA_CACHE, FALSE, TRUE); -+ NVA0C0_QMDV00_06_DEF(qmd, INVALIDATE_INSTRUCTION_CACHE, FALSE, TRUE); -+ NVA0C0_QMDV00_06_DEF(qmd, INVALIDATE_SHADER_CONSTANT_CACHE, FALSE, TRUE); -+ NVA0C0_QMDV00_06_VAL(qmd, PROGRAM_OFFSET, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_V1_P, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_V1_Q, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_D, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_V1_R, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_V1_S, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_E, "0x%x"); -+ NVA0C0_QMDV00_06_DEF(qmd, RELEASE_MEMBAR_TYPE, FE_NONE, FE_SYSMEMBAR); -+ NVA0C0_QMDV00_06_DEF(qmd, CWD_MEMBAR_TYPE, L1_NONE, L1_SYSMEMBAR, L1_MEMBAR); -+ NVA0C0_QMDV00_06_DEF(qmd, QMD_RESERVED_V1_T, FALSE, TRUE); -+ NVA0C0_QMDV00_06_DEF(qmd, QMD_RESERVED_V1_U, FALSE, TRUE); -+ NVA0C0_QMDV00_06_DEF(qmd, THROTTLED, FALSE, TRUE); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_E2_A, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_E2_B, "0x%x"); -+ NVA0C0_QMDV00_06_DEF(qmd, API_VISIBLE_CALL_LIMIT, _32, NO_CHECK); -+ NVA0C0_QMDV00_06_DEF(qmd, SHARED_MEMORY_BANK_MAPPING, FOUR_BYTES_PER_BANK, -+ EIGHT_BYTES_PER_BANK); -+ NVA0C0_QMDV00_06_DEF(qmd, SAMPLER_INDEX, INDEPENDENTLY, VIA_HEADER_INDEX); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_E3_A, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, CTA_RASTER_WIDTH, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, CTA_RASTER_HEIGHT, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, CTA_RASTER_DEPTH, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, CTA_RASTER_WIDTH_RESUME, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, CTA_RASTER_HEIGHT_RESUME, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, CTA_RASTER_DEPTH_RESUME, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_V1_V, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_F, "0x%x"); -+ NVA0C0_QMDV00_06_DEF(qmd, QMD_RESERVED_V1_W, FALSE, TRUE); -+ NVA0C0_QMDV00_06_VAL(qmd, SHARED_MEMORY_SIZE, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_G, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_VERSION, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_MAJOR_VERSION, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_H, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, CTA_THREAD_DIMENSION0, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, CTA_THREAD_DIMENSION1, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, CTA_THREAD_DIMENSION2, "0x%x"); -+ for (int i = 0; i < 8; i++) -+ NVA0C0_QMDV00_06_IDX(qmd, CONSTANT_BUFFER_VALID, i, FALSE, TRUE); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_I, "0x%x"); -+ NVA0C0_QMDV00_06_DEF(qmd, L1_CONFIGURATION, -+ DIRECTLY_ADDRESSABLE_MEMORY_SIZE_16KB, -+ DIRECTLY_ADDRESSABLE_MEMORY_SIZE_32KB, -+ DIRECTLY_ADDRESSABLE_MEMORY_SIZE_48KB); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_V1_X, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_V1_Y, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, RELEASE0_ADDRESS_LOWER, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, RELEASE0_ADDRESS_UPPER, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_J, "0x%x"); -+ NVA0C0_QMDV00_06_DEF(qmd, RELEASE0_REDUCTION_OP, RED_ADD, -+ RED_MIN, -+ RED_MAX, -+ RED_INC, -+ RED_DEC, -+ RED_AND, -+ RED_OR, -+ RED_XOR); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_K, "0x%x"); -+ NVA0C0_QMDV00_06_DEF(qmd, RELEASE0_REDUCTION_FORMAT, UNSIGNED_32, SIGNED_32); -+ NVA0C0_QMDV00_06_DEF(qmd, RELEASE0_REDUCTION_ENABLE, FALSE, TRUE); -+ NVA0C0_QMDV00_06_DEF(qmd, RELEASE0_STRUCTURE_SIZE, FOUR_WORDS, ONE_WORD); -+ NVA0C0_QMDV00_06_VAL(qmd, RELEASE0_PAYLOAD, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, RELEASE1_ADDRESS_LOWER, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, RELEASE1_ADDRESS_UPPER, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_L, "0x%x"); -+ NVA0C0_QMDV00_06_DEF(qmd, RELEASE1_REDUCTION_OP, RED_ADD, -+ RED_MIN, -+ RED_MAX, -+ RED_INC, -+ RED_DEC, -+ RED_AND, -+ RED_OR, -+ RED_XOR); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_M, "0x%x"); -+ NVA0C0_QMDV00_06_DEF(qmd, RELEASE1_REDUCTION_FORMAT, UNSIGNED_32, SIGNED_32); -+ NVA0C0_QMDV00_06_DEF(qmd, RELEASE1_REDUCTION_ENABLE, FALSE, TRUE); -+ NVA0C0_QMDV00_06_DEF(qmd, RELEASE1_STRUCTURE_SIZE, FOUR_WORDS, ONE_WORD); -+ NVA0C0_QMDV00_06_VAL(qmd, RELEASE1_PAYLOAD, "0x%x"); -+ for (int i = 0; i < 8; i++) { -+ NVA0C0_QMDV00_06_VAL(qmd, CONSTANT_BUFFER_ADDR_LOWER, i, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, CONSTANT_BUFFER_ADDR_UPPER, i, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, CONSTANT_BUFFER_RESERVED_ADDR, i, "0x%x"); -+ NVA0C0_QMDV00_06_IDX(qmd, CONSTANT_BUFFER_INVALIDATE, i, FALSE, TRUE); -+ NVA0C0_QMDV00_06_VAL(qmd, CONSTANT_BUFFER_SIZE, i, "0x%x"); -+ } -+ NVA0C0_QMDV00_06_VAL(qmd, SHADER_LOCAL_MEMORY_LOW_SIZE, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_N, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, BARRIER_COUNT, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, SHADER_LOCAL_MEMORY_HIGH_SIZE, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, REGISTER_COUNT, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, SHADER_LOCAL_MEMORY_CRS_SIZE, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, SASS_VERSION, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_SPARE_A, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_SPARE_B, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_SPARE_C, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_SPARE_D, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_SPARE_E, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_SPARE_F, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_SPARE_G, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_SPARE_H, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_SPARE_I, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_SPARE_J, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_SPARE_K, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_SPARE_L, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_SPARE_M, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, QMD_SPARE_N, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, DEBUG_ID_UPPER, "0x%x"); -+ NVA0C0_QMDV00_06_VAL(qmd, DEBUG_ID_LOWER, "0x%x"); -+} -diff --git a/src/gallium/drivers/nouveau/nvc0/qmdc0c0.c b/src/gallium/drivers/nouveau/nvc0/qmdc0c0.c -new file mode 100644 -index 00000000000..945439ee0c8 ---- /dev/null -+++ b/src/gallium/drivers/nouveau/nvc0/qmdc0c0.c -@@ -0,0 +1,165 @@ -+/* -+ * Copyright 2020 Red Hat Inc. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice shall be included in -+ * all copies or substantial portions of the Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -+ * OTHER DEALINGS IN THE SOFTWARE. -+ */ -+#include "qmd.h" -+#include "clc0c0qmd.h" -+ -+#define NVC0C0_QMDV02_01_VAL(a...) NVQMD_VAL(NVC0C0, QMDV02_01, ##a) -+#define NVC0C0_QMDV02_01_DEF(a...) NVQMD_DEF(NVC0C0, QMDV02_01, ##a) -+#define NVC0C0_QMDV02_01_IDX(a...) NVQMD_IDX(NVC0C0, QMDV02_01, ##a) -+ -+void -+NVC0C0QmdDump_V02_01(uint32_t *qmd) -+{ -+ NVC0C0_QMDV02_01_VAL(qmd, OUTER_PUT, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, OUTER_OVERFLOW, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, OUTER_GET, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, OUTER_STICKY_OVERFLOW, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, INNER_GET, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, INNER_OVERFLOW, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, INNER_PUT, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, INNER_STICKY_OVERFLOW, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, QMD_GROUP_ID, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, SM_GLOBAL_CACHING_ENABLE, "0x%x"); -+ NVC0C0_QMDV02_01_DEF(qmd, RUN_CTA_IN_ONE_SM_PARTITION, FALSE, TRUE); -+ NVC0C0_QMDV02_01_DEF(qmd, IS_QUEUE, FALSE, TRUE); -+ NVC0C0_QMDV02_01_DEF(qmd, ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST, FALSE, TRUE); -+ NVC0C0_QMDV02_01_DEF(qmd, SEMAPHORE_RELEASE_ENABLE0, FALSE, TRUE); -+ NVC0C0_QMDV02_01_DEF(qmd, SEMAPHORE_RELEASE_ENABLE1, FALSE, TRUE); -+ NVC0C0_QMDV02_01_DEF(qmd, REQUIRE_SCHEDULING_PCAS, FALSE, TRUE); -+ NVC0C0_QMDV02_01_DEF(qmd, DEPENDENT_QMD_SCHEDULE_ENABLE, FALSE, TRUE); -+ NVC0C0_QMDV02_01_DEF(qmd, DEPENDENT_QMD_TYPE, QUEUE, GRID); -+ NVC0C0_QMDV02_01_DEF(qmd, DEPENDENT_QMD_FIELD_COPY, FALSE, TRUE); -+ NVC0C0_QMDV02_01_VAL(qmd, QMD_RESERVED_B, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, CIRCULAR_QUEUE_SIZE, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, QMD_RESERVED_C, "0x%x"); -+ NVC0C0_QMDV02_01_DEF(qmd, INVALIDATE_TEXTURE_HEADER_CACHE, FALSE, TRUE); -+ NVC0C0_QMDV02_01_DEF(qmd, INVALIDATE_TEXTURE_SAMPLER_CACHE, FALSE, TRUE); -+ NVC0C0_QMDV02_01_DEF(qmd, INVALIDATE_TEXTURE_DATA_CACHE, FALSE, TRUE); -+ NVC0C0_QMDV02_01_DEF(qmd, INVALIDATE_SHADER_DATA_CACHE, FALSE, TRUE); -+ NVC0C0_QMDV02_01_DEF(qmd, INVALIDATE_INSTRUCTION_CACHE, FALSE, TRUE); -+ NVC0C0_QMDV02_01_DEF(qmd, INVALIDATE_SHADER_CONSTANT_CACHE, FALSE, TRUE); -+ NVC0C0_QMDV02_01_VAL(qmd, CTA_RASTER_WIDTH_RESUME, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, CTA_RASTER_HEIGHT_RESUME, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, CTA_RASTER_DEPTH_RESUME, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, PROGRAM_OFFSET, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, CIRCULAR_QUEUE_ADDR_LOWER, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, CIRCULAR_QUEUE_ADDR_UPPER, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, QMD_RESERVED_D, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, CIRCULAR_QUEUE_ENTRY_SIZE, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, CWD_REFERENCE_COUNT_ID, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, CWD_REFERENCE_COUNT_DELTA_MINUS_ONE, "0x%x"); -+ NVC0C0_QMDV02_01_DEF(qmd, RELEASE_MEMBAR_TYPE, FE_NONE, FE_SYSMEMBAR); -+ NVC0C0_QMDV02_01_DEF(qmd, CWD_REFERENCE_COUNT_INCR_ENABLE, FALSE, TRUE); -+ NVC0C0_QMDV02_01_DEF(qmd, CWD_MEMBAR_TYPE, L1_NONE, L1_SYSMEMBAR, L1_MEMBAR); -+ NVC0C0_QMDV02_01_DEF(qmd, SEQUENTIALLY_RUN_CTAS, FALSE, TRUE); -+ NVC0C0_QMDV02_01_DEF(qmd, CWD_REFERENCE_COUNT_DECR_ENABLE, FALSE, TRUE); -+ NVC0C0_QMDV02_01_DEF(qmd, THROTTLED, FALSE, TRUE); -+ NVC0C0_QMDV02_01_DEF(qmd, API_VISIBLE_CALL_LIMIT, _32, NO_CHECK); -+ NVC0C0_QMDV02_01_DEF(qmd, SAMPLER_INDEX, INDEPENDENTLY, VIA_HEADER_INDEX); -+ NVC0C0_QMDV02_01_VAL(qmd, CTA_RASTER_WIDTH, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, CTA_RASTER_HEIGHT, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, QMD_RESERVED13A, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, CTA_RASTER_DEPTH, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, QMD_RESERVED14A, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, DEPENDENT_QMD_POINTER, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, QUEUE_ENTRIES_PER_CTA_MINUS_ONE, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, COALESCE_WAITING_PERIOD, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, SHARED_MEMORY_SIZE, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, QMD_RESERVED_G, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, QMD_VERSION, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, QMD_MAJOR_VERSION, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, QMD_RESERVED_H, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, CTA_THREAD_DIMENSION0, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, CTA_THREAD_DIMENSION1, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, CTA_THREAD_DIMENSION2, "0x%x"); -+ for (int i = 0; i < 8; i++) -+ NVC0C0_QMDV02_01_IDX(qmd, CONSTANT_BUFFER_VALID, i, FALSE, TRUE); -+ NVC0C0_QMDV02_01_VAL(qmd, QMD_RESERVED_I, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, SM_DISABLE_MASK_LOWER, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, SM_DISABLE_MASK_UPPER, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, RELEASE0_ADDRESS_LOWER, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, RELEASE0_ADDRESS_UPPER, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, QMD_RESERVED_J, "0x%x"); -+ NVC0C0_QMDV02_01_DEF(qmd, RELEASE0_REDUCTION_OP, RED_ADD, -+ RED_MIN, -+ RED_MAX, -+ RED_INC, -+ RED_DEC, -+ RED_AND, -+ RED_OR, -+ RED_XOR); -+ NVC0C0_QMDV02_01_VAL(qmd, QMD_RESERVED_K, "0x%x"); -+ NVC0C0_QMDV02_01_DEF(qmd, RELEASE0_REDUCTION_FORMAT, UNSIGNED_32, SIGNED_32); -+ NVC0C0_QMDV02_01_DEF(qmd, RELEASE0_REDUCTION_ENABLE, FALSE, TRUE); -+ NVC0C0_QMDV02_01_DEF(qmd, RELEASE0_STRUCTURE_SIZE, FOUR_WORDS, ONE_WORD); -+ NVC0C0_QMDV02_01_VAL(qmd, RELEASE0_PAYLOAD, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, RELEASE1_ADDRESS_LOWER, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, RELEASE1_ADDRESS_UPPER, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, QMD_RESERVED_L, "0x%x"); -+ NVC0C0_QMDV02_01_DEF(qmd, RELEASE1_REDUCTION_OP, RED_ADD, -+ RED_MIN, -+ RED_MAX, -+ RED_INC, -+ RED_DEC, -+ RED_AND, -+ RED_OR, -+ RED_XOR); -+ NVC0C0_QMDV02_01_VAL(qmd, QMD_RESERVED_M, "0x%x"); -+ NVC0C0_QMDV02_01_DEF(qmd, RELEASE1_REDUCTION_FORMAT, UNSIGNED_32, SIGNED_32); -+ NVC0C0_QMDV02_01_DEF(qmd, RELEASE1_REDUCTION_ENABLE, FALSE, TRUE); -+ NVC0C0_QMDV02_01_DEF(qmd, RELEASE1_STRUCTURE_SIZE, FOUR_WORDS, ONE_WORD); -+ NVC0C0_QMDV02_01_VAL(qmd, RELEASE1_PAYLOAD, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, SHADER_LOCAL_MEMORY_LOW_SIZE, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, QMD_RESERVED_N, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, BARRIER_COUNT, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, SHADER_LOCAL_MEMORY_HIGH_SIZE, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, REGISTER_COUNT, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, SHADER_LOCAL_MEMORY_CRS_SIZE, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, SASS_VERSION, "0x%x"); -+ for (int i = 0; i < 8; i++) { -+ NVC0C0_QMDV02_01_VAL(qmd, CONSTANT_BUFFER_ADDR_LOWER, i, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, CONSTANT_BUFFER_ADDR_UPPER, i, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, CONSTANT_BUFFER_RESERVED_ADDR, i, "0x%x"); -+ NVC0C0_QMDV02_01_IDX(qmd, CONSTANT_BUFFER_INVALIDATE, i, FALSE, TRUE); -+ NVC0C0_QMDV02_01_VAL(qmd, CONSTANT_BUFFER_SIZE_SHIFTED4, i, "0x%x"); -+ } -+ NVC0C0_QMDV02_01_VAL(qmd, QMD_RESERVED_R, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, QMD_RESERVED_S, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, HW_ONLY_INNER_GET, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, HW_ONLY_REQUIRE_SCHEDULING_PCAS, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, HW_ONLY_INNER_PUT, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, HW_ONLY_SCG_TYPE, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, HW_ONLY_SPAN_LIST_HEAD_INDEX, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, QMD_RESERVED_Q, "0x%x"); -+ NVC0C0_QMDV02_01_DEF(qmd, HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID, FALSE, TRUE); -+ NVC0C0_QMDV02_01_VAL(qmd, HW_ONLY_SKED_NEXT_QMD_POINTER, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, QMD_SPARE_G, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, QMD_SPARE_H, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, QMD_SPARE_I, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, QMD_SPARE_J, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, QMD_SPARE_K, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, QMD_SPARE_L, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, QMD_SPARE_M, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, QMD_SPARE_N, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, DEBUG_ID_UPPER, "0x%x"); -+ NVC0C0_QMDV02_01_VAL(qmd, DEBUG_ID_LOWER, "0x%x"); -+} -diff --git a/src/gallium/drivers/nouveau/nvc0/qmdc3c0.c b/src/gallium/drivers/nouveau/nvc0/qmdc3c0.c -new file mode 100644 -index 00000000000..c9bd8966114 ---- /dev/null -+++ b/src/gallium/drivers/nouveau/nvc0/qmdc3c0.c -@@ -0,0 +1,168 @@ -+/* -+ * Copyright 2020 Red Hat Inc. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice shall be included in -+ * all copies or substantial portions of the Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -+ * OTHER DEALINGS IN THE SOFTWARE. -+ */ -+#include "qmd.h" -+#include "clc3c0qmd.h" -+ -+#define NVC3C0_QMDV02_02_VAL(a...) NVQMD_VAL(NVC3C0, QMDV02_02, ##a) -+#define NVC3C0_QMDV02_02_DEF(a...) NVQMD_DEF(NVC3C0, QMDV02_02, ##a) -+#define NVC3C0_QMDV02_02_IDX(a...) NVQMD_IDX(NVC3C0, QMDV02_02, ##a) -+ -+void -+NVC3C0QmdDump_V02_02(uint32_t *qmd) -+{ -+ NVC3C0_QMDV02_02_VAL(qmd, OUTER_PUT, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, OUTER_OVERFLOW, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, OUTER_GET, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, OUTER_STICKY_OVERFLOW, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, INNER_GET, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, INNER_OVERFLOW, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, INNER_PUT, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, INNER_STICKY_OVERFLOW, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, QMD_GROUP_ID, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, SM_GLOBAL_CACHING_ENABLE, "0x%x"); -+ NVC3C0_QMDV02_02_DEF(qmd, RUN_CTA_IN_ONE_SM_PARTITION, FALSE, TRUE); -+ NVC3C0_QMDV02_02_DEF(qmd, IS_QUEUE, FALSE, TRUE); -+ NVC3C0_QMDV02_02_DEF(qmd, ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST, FALSE, TRUE); -+ NVC3C0_QMDV02_02_DEF(qmd, SEMAPHORE_RELEASE_ENABLE0, FALSE, TRUE); -+ NVC3C0_QMDV02_02_DEF(qmd, SEMAPHORE_RELEASE_ENABLE1, FALSE, TRUE); -+ NVC3C0_QMDV02_02_DEF(qmd, REQUIRE_SCHEDULING_PCAS, FALSE, TRUE); -+ NVC3C0_QMDV02_02_DEF(qmd, DEPENDENT_QMD_SCHEDULE_ENABLE, FALSE, TRUE); -+ NVC3C0_QMDV02_02_DEF(qmd, DEPENDENT_QMD_TYPE, QUEUE, GRID); -+ NVC3C0_QMDV02_02_DEF(qmd, DEPENDENT_QMD_FIELD_COPY, FALSE, TRUE); -+ NVC3C0_QMDV02_02_VAL(qmd, QMD_RESERVED_B, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, CIRCULAR_QUEUE_SIZE, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, QMD_RESERVED_C, "0x%x"); -+ NVC3C0_QMDV02_02_DEF(qmd, INVALIDATE_TEXTURE_HEADER_CACHE, FALSE, TRUE); -+ NVC3C0_QMDV02_02_DEF(qmd, INVALIDATE_TEXTURE_SAMPLER_CACHE, FALSE, TRUE); -+ NVC3C0_QMDV02_02_DEF(qmd, INVALIDATE_TEXTURE_DATA_CACHE, FALSE, TRUE); -+ NVC3C0_QMDV02_02_DEF(qmd, INVALIDATE_SHADER_DATA_CACHE, FALSE, TRUE); -+ NVC3C0_QMDV02_02_DEF(qmd, INVALIDATE_INSTRUCTION_CACHE, FALSE, TRUE); -+ NVC3C0_QMDV02_02_DEF(qmd, INVALIDATE_SHADER_CONSTANT_CACHE, FALSE, TRUE); -+ NVC3C0_QMDV02_02_VAL(qmd, CTA_RASTER_WIDTH_RESUME, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, CTA_RASTER_HEIGHT_RESUME, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, CTA_RASTER_DEPTH_RESUME, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, PROGRAM_OFFSET, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, CIRCULAR_QUEUE_ADDR_LOWER, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, CIRCULAR_QUEUE_ADDR_UPPER, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, QMD_RESERVED_D, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, CIRCULAR_QUEUE_ENTRY_SIZE, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, CWD_REFERENCE_COUNT_ID, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, CWD_REFERENCE_COUNT_DELTA_MINUS_ONE, "0x%x"); -+ NVC3C0_QMDV02_02_DEF(qmd, RELEASE_MEMBAR_TYPE, FE_NONE, FE_SYSMEMBAR); -+ NVC3C0_QMDV02_02_DEF(qmd, CWD_REFERENCE_COUNT_INCR_ENABLE, FALSE, TRUE); -+ NVC3C0_QMDV02_02_DEF(qmd, CWD_MEMBAR_TYPE, L1_NONE, L1_SYSMEMBAR, L1_MEMBAR); -+ NVC3C0_QMDV02_02_DEF(qmd, SEQUENTIALLY_RUN_CTAS, FALSE, TRUE); -+ NVC3C0_QMDV02_02_DEF(qmd, CWD_REFERENCE_COUNT_DECR_ENABLE, FALSE, TRUE); -+ NVC3C0_QMDV02_02_DEF(qmd, API_VISIBLE_CALL_LIMIT, _32, NO_CHECK); -+ NVC3C0_QMDV02_02_DEF(qmd, SAMPLER_INDEX, INDEPENDENTLY, VIA_HEADER_INDEX); -+ NVC3C0_QMDV02_02_VAL(qmd, CTA_RASTER_WIDTH, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, CTA_RASTER_HEIGHT, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, QMD_RESERVED13A, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, CTA_RASTER_DEPTH, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, QMD_RESERVED14A, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, DEPENDENT_QMD_POINTER, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, QUEUE_ENTRIES_PER_CTA_MINUS_ONE, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, COALESCE_WAITING_PERIOD, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, SHARED_MEMORY_SIZE, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, MIN_SM_CONFIG_SHARED_MEM_SIZE, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, MAX_SM_CONFIG_SHARED_MEM_SIZE, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, QMD_VERSION, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, QMD_MAJOR_VERSION, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, QMD_RESERVED_H, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, CTA_THREAD_DIMENSION0, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, CTA_THREAD_DIMENSION1, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, CTA_THREAD_DIMENSION2, "0x%x"); -+ for (int i = 0; i < 8; i++) -+ NVC3C0_QMDV02_02_IDX(qmd, CONSTANT_BUFFER_VALID, i, FALSE, TRUE); -+ NVC3C0_QMDV02_02_VAL(qmd, REGISTER_COUNT_V, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, TARGET_SM_CONFIG_SHARED_MEM_SIZE, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, FREE_CTA_SLOTS_EMPTY_SM, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, SM_DISABLE_MASK_LOWER, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, SM_DISABLE_MASK_UPPER, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, RELEASE0_ADDRESS_LOWER, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, RELEASE0_ADDRESS_UPPER, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, QMD_RESERVED_J, "0x%x"); -+ NVC3C0_QMDV02_02_DEF(qmd, RELEASE0_REDUCTION_OP, RED_ADD, -+ RED_MIN, -+ RED_MAX, -+ RED_INC, -+ RED_DEC, -+ RED_AND, -+ RED_OR, -+ RED_XOR); -+ NVC3C0_QMDV02_02_VAL(qmd, QMD_RESERVED_K, "0x%x"); -+ NVC3C0_QMDV02_02_DEF(qmd, RELEASE0_REDUCTION_FORMAT, UNSIGNED_32, SIGNED_32); -+ NVC3C0_QMDV02_02_DEF(qmd, RELEASE0_REDUCTION_ENABLE, FALSE, TRUE); -+ NVC3C0_QMDV02_02_DEF(qmd, RELEASE0_STRUCTURE_SIZE, FOUR_WORDS, ONE_WORD); -+ NVC3C0_QMDV02_02_VAL(qmd, RELEASE0_PAYLOAD, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, RELEASE1_ADDRESS_LOWER, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, RELEASE1_ADDRESS_UPPER, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, QMD_RESERVED_L, "0x%x"); -+ NVC3C0_QMDV02_02_DEF(qmd, RELEASE1_REDUCTION_OP, RED_ADD, -+ RED_MIN, -+ RED_MAX, -+ RED_INC, -+ RED_DEC, -+ RED_AND, -+ RED_OR, -+ RED_XOR); -+ NVC3C0_QMDV02_02_VAL(qmd, QMD_RESERVED_M, "0x%x"); -+ NVC3C0_QMDV02_02_DEF(qmd, RELEASE1_REDUCTION_FORMAT, UNSIGNED_32, SIGNED_32); -+ NVC3C0_QMDV02_02_DEF(qmd, RELEASE1_REDUCTION_ENABLE, FALSE, TRUE); -+ NVC3C0_QMDV02_02_DEF(qmd, RELEASE1_STRUCTURE_SIZE, FOUR_WORDS, ONE_WORD); -+ NVC3C0_QMDV02_02_VAL(qmd, RELEASE1_PAYLOAD, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, SHADER_LOCAL_MEMORY_LOW_SIZE, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, QMD_RESERVED_N, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, BARRIER_COUNT, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, SHADER_LOCAL_MEMORY_HIGH_SIZE, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, REGISTER_COUNT, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, SHADER_LOCAL_MEMORY_CRS_SIZE, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, SASS_VERSION, "0x%x"); -+ for (int i = 0; i < 8; i++) { -+ NVC3C0_QMDV02_02_VAL(qmd, CONSTANT_BUFFER_ADDR_LOWER, i, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, CONSTANT_BUFFER_ADDR_UPPER, i, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, CONSTANT_BUFFER_RESERVED_ADDR, i, "0x%x"); -+ NVC3C0_QMDV02_02_IDX(qmd, CONSTANT_BUFFER_INVALIDATE, i, FALSE, TRUE); -+ NVC3C0_QMDV02_02_VAL(qmd, CONSTANT_BUFFER_SIZE_SHIFTED4, i, "0x%x"); -+ } -+ NVC3C0_QMDV02_02_VAL(qmd, PROGRAM_ADDRESS_LOWER, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, PROGRAM_ADDRESS_UPPER, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, QMD_RESERVED_S, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, HW_ONLY_INNER_GET, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, HW_ONLY_REQUIRE_SCHEDULING_PCAS, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, HW_ONLY_INNER_PUT, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, HW_ONLY_SCG_TYPE, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, HW_ONLY_SPAN_LIST_HEAD_INDEX, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, QMD_RESERVED_Q, "0x%x"); -+ NVC3C0_QMDV02_02_DEF(qmd, HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID, FALSE, TRUE); -+ NVC3C0_QMDV02_02_VAL(qmd, HW_ONLY_SKED_NEXT_QMD_POINTER, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, QMD_SPARE_G, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, QMD_SPARE_H, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, QMD_SPARE_I, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, QMD_SPARE_J, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, QMD_SPARE_K, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, QMD_SPARE_L, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, QMD_SPARE_M, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, QMD_SPARE_N, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, DEBUG_ID_UPPER, "0x%x"); -+ NVC3C0_QMDV02_02_VAL(qmd, DEBUG_ID_LOWER, "0x%x"); -+} -diff --git a/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c b/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c -index 5c43518afcb..d123c8a1c17 100644 ---- a/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c -+++ b/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c -@@ -104,6 +104,8 @@ nouveau_drm_screen_create(int fd) - case 0x110: - case 0x120: - case 0x130: -+ case 0x140: -+ case 0x160: - init = nvc0_screen_create; - break; - default: diff --git a/SPECS/mesa.spec b/SPECS/mesa.spec index 177ac45..4c1ccd7 100644 --- a/SPECS/mesa.spec +++ b/SPECS/mesa.spec @@ -9,16 +9,16 @@ %endif %ifarch %{ix86} x86_64 -%define platform_drivers ,i965 +%define platform_drivers i965 %define with_vmware 1 %define with_xa 1 %define with_iris 1 %endif %ifarch %{ix86} x86_64 -%define with_vulkan 1 +%define with_vulkan_hw 1 %else -%define with_vulkan 0 +%define with_vulkan_hw 0 %endif %ifarch %{arm} aarch64 @@ -31,18 +31,20 @@ %global dri_drivers %{?platform_drivers} -%if 0%{?with_vulkan} -%define vulkan_drivers intel,amd +%if 0%{?with_vulkan_hw} +%define vulkan_drivers swrast,intel,amd +%else +%define vulkan_drivers swrast %endif %global sanitize 0 -#global rctag rc4 +#global rctag rc2 Name: mesa Summary: Mesa graphics libraries -Version: 20.1.4 -Release: 1%{?rctag:.%{rctag}}%{?dist} +Version: 20.3.3 +Release: 2%{?rctag:.%{rctag}}%{?dist} License: MIT URL: http://www.mesa3d.org @@ -56,9 +58,11 @@ Source3: Makefile # Fedora opts to ignore the optional part of clause 2 and treat that code as 2 clause BSD. Source4: Mesa-MLAA-License-Clarification-Email.txt -# Add support for TU11x nvidia -Patch10: 0001-nir-use-bitfield_insert-instead-of-bfi-in-nir_lower_.patch -Patch11: nouveau-tu1xx-support.patch +Patch0: lavapipe-disable-env-var.patch +Patch1: mesa-20.3.3-stable-fixes.patch +Patch2: anv-remove-warning.patch + +Patch10: cpu-affinity-fixes-20.3.3.patch BuildRequires: gcc BuildRequires: gcc-c++ @@ -67,7 +71,7 @@ BuildRequires: meson >= 0.45 %if %{with_hardware} BuildRequires: kernel-headers %endif -BuildRequires: libdrm-devel >= 2.4.42 +BuildRequires: libdrm-devel >= 2.4.103 BuildRequires: libXxf86vm-devel BuildRequires: expat-devel BuildRequires: xorg-x11-proto-devel @@ -166,6 +170,7 @@ Provides: libEGL-devel%{?_isa} %package dri-drivers Summary: Mesa-based DRI drivers Requires: %{name}-filesystem%{?_isa} = %{?epoch:%{epoch}:}%{version}-%{release} +Requires: libdrm >= 2.4.103 %description dri-drivers %{summary}. @@ -282,7 +287,6 @@ Requires: %{name}-libd3d%{?_isa} = %{?epoch:%{epoch}:}%{version}-%{release %{summary}. %endif -%if 0%{?with_vulkan} %package vulkan-drivers Summary: Mesa Vulkan drivers Requires: vulkan%{_isa} @@ -290,6 +294,7 @@ Requires: vulkan%{_isa} %description vulkan-drivers The drivers with support for the Vulkan API. +%if 0%{?with_vulkan_hw} %package vulkan-devel Summary: Mesa Vulkan development files Requires: %{name}-vulkan-drivers%{?_isa} = %{?epoch:%{epoch}:}%{version}-%{release} @@ -323,7 +328,7 @@ pathfix.py -i %{__python3} -pn bin/*.py src/egl/generate/*.py \ export ASFLAGS="--generate-missing-build-notes=yes" %meson -Dcpp_std=gnu++14 \ -Db_ndebug=true \ - -Dplatforms=x11,wayland,drm,surfaceless \ + -Dplatforms=x11,wayland \ -Ddri3=true \ -Ddri-drivers=%{?dri_drivers} \ %if 0%{?with_hardware} @@ -527,8 +532,8 @@ done %endif %endif -%if 0%{?with_vulkan} %files vulkan-drivers +%if 0%{?with_vulkan_hw} %{_libdir}/libvulkan_intel.so %{_libdir}/libvulkan_radeon.so %ifarch x86_64 @@ -538,14 +543,40 @@ done %{_datadir}/vulkan/icd.d/intel_icd.i686.json %{_datadir}/vulkan/icd.d/radeon_icd.i686.json %endif +%endif +%{_libdir}/libvulkan_lvp.so +%{_datadir}/vulkan/icd.d/lvp_icd.*.json %{_libdir}/libVkLayer_MESA_device_select.so %{_datadir}/vulkan/implicit_layer.d/VkLayer_MESA_device_select.json +%if 0%{?with_vulkan_hw} %files vulkan-devel %{_includedir}/vulkan/ %endif %changelog +* Fri Mar 26 2021 Dave Airlie - 20.3.3-2 +- Fix CPU affinity memory corruption crash (#1938788) + +* Tue Feb 16 2021 Dave Airlie - 20.3.3-1 +- Update to 20.3.3 + upstream fixes for qemu regression + +* Mon Jan 11 2021 Dave Airlie - 20.3.2-1 +- Update to 20.3.2 for upstream fixes + +* Mon Dec 21 2020 Dave Airlie - 20.3.1-1 +- Update to 20.3.1 for radeon fix + +* Mon Dec 07 2020 Dave Airlie - 20.3.0-2 +- Fix regression with radeon si/cik cards + +* Fri Dec 04 2020 Dave Airlie - 20.3.0-1 +- Update to 20.3.0 release + +* Thu Nov 19 2020 Dave Airlie - 20.3.0-0.1.rc2 +- Update 20.3.0-rc2 +- enable lavapipe behind env var so it can be used for testing + * Wed Aug 05 2020 Dave Airlie - 20.1.4-1 - Update to 20.1.4 - Update nouveau tu1xx support patch (Karol)