Backport fixes for radeonsi and disable GFX12 on radv Resolves: https://issues.redhat.com/browse/RHEL-53419
2566 lines
114 KiB
Diff
2566 lines
114 KiB
Diff
From 9797443fc2d655b8333ab80aec9d17a06706fe53 Mon Sep 17 00:00:00 2001
|
|
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
|
|
Date: Wed, 20 Nov 2024 10:34:27 -0500
|
|
Subject: [PATCH 01/27] ac/surface: adjust HiZ enablement
|
|
|
|
Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
|
|
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32257>
|
|
---
|
|
src/amd/common/ac_surface.c | 2 +-
|
|
1 file changed, 1 insertion(+), 1 deletion(-)
|
|
|
|
diff --git a/src/amd/common/ac_surface.c b/src/amd/common/ac_surface.c
|
|
index 3ecb5d57be4..69f722bc4cc 100644
|
|
--- a/src/amd/common/ac_surface.c
|
|
+++ b/src/amd/common/ac_surface.c
|
|
@@ -3022,7 +3022,7 @@ static bool gfx12_compute_hiz_his_info(struct ac_addrlib *addrlib, const struct
|
|
{
|
|
assert(surf_in->flags.depth != surf_in->flags.stencil);
|
|
|
|
- if (surf->flags & RADEON_SURF_NO_HTILE || (info->gfx_level == GFX12 && info->chip_rev == 0))
|
|
+ if (surf->flags & RADEON_SURF_NO_HTILE || (info->gfx_level == GFX12 && info->chip_rev <= 1))
|
|
return true;
|
|
|
|
ADDR3_COMPUTE_SURFACE_INFO_OUTPUT out = {0};
|
|
--
|
|
2.48.1
|
|
|
|
|
|
From 51f92a3545b42d4250da95e9623576b65e8260b9 Mon Sep 17 00:00:00 2001
|
|
From: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
|
|
Date: Thu, 21 Nov 2024 11:46:02 +0100
|
|
Subject: [PATCH 02/27] radeonsi/gfx12: disable display dcc for front buffer
|
|
rendering
|
|
MIME-Version: 1.0
|
|
Content-Type: text/plain; charset=UTF-8
|
|
Content-Transfer-Encoding: 8bit
|
|
|
|
Same logic as other chips, except we need to reallocate the texture
|
|
as we can't disable dcc.
|
|
|
|
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
|
|
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32281>
|
|
---
|
|
src/gallium/drivers/radeonsi/si_texture.c | 33 ++++++++++++++++-------
|
|
1 file changed, 24 insertions(+), 9 deletions(-)
|
|
|
|
diff --git a/src/gallium/drivers/radeonsi/si_texture.c b/src/gallium/drivers/radeonsi/si_texture.c
|
|
index ad234b2f018..ae65b725600 100644
|
|
--- a/src/gallium/drivers/radeonsi/si_texture.c
|
|
+++ b/src/gallium/drivers/radeonsi/si_texture.c
|
|
@@ -797,16 +797,31 @@ static bool si_texture_get_handle(struct pipe_screen *screen, struct pipe_contex
|
|
assert(tex->surface.tile_swizzle == 0);
|
|
}
|
|
|
|
- /* Since shader image stores don't support DCC on GFX8,
|
|
- * disable it for external clients that want write
|
|
- * access.
|
|
+ const bool debug_disable_dcc = sscreen->debug_flags & DBG(NO_EXPORTED_DCC);
|
|
+ /* Since shader image stores don't support DCC on GFX9 and older,
|
|
+ * disable it for external clients that want write access.
|
|
*/
|
|
- if (sscreen->debug_flags & DBG(NO_EXPORTED_DCC) ||
|
|
- (usage & PIPE_HANDLE_USAGE_SHADER_WRITE && !tex->is_depth && tex->surface.meta_offset) ||
|
|
- /* Displayable DCC requires an explicit flush. */
|
|
- (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) &&
|
|
- si_displayable_dcc_needs_explicit_flush(tex))) {
|
|
- if (si_texture_disable_dcc(sctx, tex)) {
|
|
+ const bool shader_write = sscreen->info.gfx_level <= GFX9 &&
|
|
+ usage & PIPE_HANDLE_USAGE_SHADER_WRITE &&
|
|
+ !tex->is_depth &&
|
|
+ tex->surface.meta_offset;
|
|
+ /* Another reason to disable display dcc is front buffer rendering.
|
|
+ * This can happens with Xorg. If the ddx driver uses GBM_BO_USE_FRONT_RENDERING,
|
|
+ * there's nothing to do because the texture is not using DCC.
|
|
+ * If the flag isn't set, we have to infer it to get correct rendering.
|
|
+ */
|
|
+ const bool front_buffer_rendering = !(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) &&
|
|
+ tex->buffer.b.b.bind & PIPE_BIND_SCANOUT;
|
|
+
|
|
+ /* If display dcc requires a retiling step, drop dcc. */
|
|
+ const bool explicit_flush = !(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) &&
|
|
+ si_displayable_dcc_needs_explicit_flush(tex);
|
|
+
|
|
+ if (debug_disable_dcc || shader_write || front_buffer_rendering || explicit_flush) {
|
|
+ if (sscreen->info.gfx_level >= GFX12) {
|
|
+ si_reallocate_texture_inplace(sctx, tex, PIPE_BIND_CONST_BW, false);
|
|
+ update_metadata = true;
|
|
+ } else if (si_texture_disable_dcc(sctx, tex)) {
|
|
update_metadata = true;
|
|
/* si_texture_disable_dcc flushes the context */
|
|
flush = false;
|
|
--
|
|
2.48.1
|
|
|
|
|
|
From 89d3f1a550ade706459e6c2f0649608c084387ff Mon Sep 17 00:00:00 2001
|
|
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
|
|
Date: Mon, 16 Dec 2024 03:48:37 -0500
|
|
Subject: [PATCH 03/27] radeonsi/gfx12: set DB_RENDER_OVERRIDE based on stencil
|
|
state
|
|
|
|
Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
|
|
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32653>
|
|
---
|
|
src/gallium/drivers/radeonsi/si_gfx_cs.c | 1 +
|
|
src/gallium/drivers/radeonsi/si_state.c | 8 ++++++++
|
|
src/gallium/drivers/radeonsi/si_state.h | 2 ++
|
|
3 files changed, 11 insertions(+)
|
|
|
|
diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c
|
|
index 9b11eaf678e..31814bc94d2 100644
|
|
--- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
|
|
+++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
|
|
@@ -318,6 +318,7 @@ void si_set_tracked_regs_to_clear_state(struct si_context *ctx)
|
|
ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ENA] = 0;
|
|
ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ADDR] = 0;
|
|
|
|
+ ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_OVERRIDE] = 0;
|
|
ctx->tracked_regs.reg_value[SI_TRACKED_DB_EQAA] = 0;
|
|
ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_OVERRIDE2] = 0;
|
|
ctx->tracked_regs.reg_value[SI_TRACKED_DB_SHADER_CONTROL] = 0;
|
|
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
|
|
index 6bb1037a79b..018be3dbe4f 100644
|
|
--- a/src/gallium/drivers/radeonsi/si_state.c
|
|
+++ b/src/gallium/drivers/radeonsi/si_state.c
|
|
@@ -1573,6 +1573,12 @@ static void *si_create_dsa_state(struct pipe_context *ctx,
|
|
S_028090_TESTMASK_BF(state->stencil[1].valuemask);
|
|
dsa->db_stencil_write_mask = S_028094_WRITEMASK(state->stencil[0].writemask) |
|
|
S_028094_WRITEMASK_BF(state->stencil[1].writemask);
|
|
+
|
|
+ bool force_s_valid = state->stencil[0].zpass_op != state->stencil[0].zfail_op ||
|
|
+ (state->stencil[1].enabled &&
|
|
+ state->stencil[1].zpass_op != state->stencil[1].zfail_op);
|
|
+ dsa->db_render_override = S_02800C_FORCE_STENCIL_READ(1) |
|
|
+ S_02800C_FORCE_STENCIL_VALID(force_s_valid);
|
|
}
|
|
|
|
bool zfunc_is_ordered =
|
|
@@ -1608,6 +1614,8 @@ static void si_pm4_emit_dsa(struct si_context *sctx, unsigned index)
|
|
if (sctx->gfx_level >= GFX12) {
|
|
radeon_begin(&sctx->gfx_cs);
|
|
gfx12_begin_context_regs();
|
|
+ gfx12_opt_set_context_reg(R_02800C_DB_RENDER_OVERRIDE, SI_TRACKED_DB_RENDER_OVERRIDE,
|
|
+ state->db_render_override);
|
|
gfx12_opt_set_context_reg(R_028070_DB_DEPTH_CONTROL, SI_TRACKED_DB_DEPTH_CONTROL,
|
|
state->db_depth_control);
|
|
if (state->stencil_enabled) {
|
|
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
|
|
index a984c7d6918..f964fa21606 100644
|
|
--- a/src/gallium/drivers/radeonsi/si_state.h
|
|
+++ b/src/gallium/drivers/radeonsi/si_state.h
|
|
@@ -130,6 +130,7 @@ struct si_state_dsa {
|
|
unsigned spi_shader_user_data_ps_alpha_ref;
|
|
unsigned db_stencil_read_mask;
|
|
unsigned db_stencil_write_mask;
|
|
+ unsigned db_render_override; /* only gfx12 */
|
|
|
|
/* 0 = without stencil buffer, 1 = when both Z and S buffers are present */
|
|
struct si_dsa_order_invariance order_invariance[2];
|
|
@@ -314,6 +315,7 @@ enum si_tracked_reg
|
|
SI_TRACKED_SPI_PS_INPUT_ENA,
|
|
SI_TRACKED_SPI_PS_INPUT_ADDR,
|
|
|
|
+ SI_TRACKED_DB_RENDER_OVERRIDE,
|
|
SI_TRACKED_DB_EQAA,
|
|
SI_TRACKED_DB_RENDER_OVERRIDE2,
|
|
SI_TRACKED_DB_SHADER_CONTROL,
|
|
--
|
|
2.48.1
|
|
|
|
|
|
From 9a0377096ab33b12c10913ecc2b5a1358da29923 Mon Sep 17 00:00:00 2001
|
|
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
|
|
Date: Tue, 6 Aug 2024 16:23:01 -0400
|
|
Subject: [PATCH 04/27] radeonsi: add a new PM4 helper radeon_event_write
|
|
|
|
Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
|
|
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31168>
|
|
---
|
|
src/gallium/drivers/radeonsi/si_build_pm4.h | 11 +++
|
|
src/gallium/drivers/radeonsi/si_compute.c | 7 +-
|
|
src/gallium/drivers/radeonsi/si_gfx_cs.c | 89 +++++++------------
|
|
src/gallium/drivers/radeonsi/si_perfcounter.c | 21 ++---
|
|
src/gallium/drivers/radeonsi/si_query.c | 3 +-
|
|
src/gallium/drivers/radeonsi/si_state.c | 24 +++--
|
|
.../drivers/radeonsi/si_state_draw.cpp | 21 ++---
|
|
.../drivers/radeonsi/si_state_shaders.cpp | 13 +--
|
|
.../drivers/radeonsi/si_state_streamout.c | 3 +-
|
|
9 files changed, 74 insertions(+), 118 deletions(-)
|
|
|
|
diff --git a/src/gallium/drivers/radeonsi/si_build_pm4.h b/src/gallium/drivers/radeonsi/si_build_pm4.h
|
|
index 15259aa329b..630b5885ff3 100644
|
|
--- a/src/gallium/drivers/radeonsi/si_build_pm4.h
|
|
+++ b/src/gallium/drivers/radeonsi/si_build_pm4.h
|
|
@@ -503,6 +503,17 @@
|
|
} \
|
|
} while (0)
|
|
|
|
+/* Other packet helpers. */
|
|
+#define radeon_event_write(event_type) do { \
|
|
+ unsigned __event_type = (event_type); \
|
|
+ radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); \
|
|
+ radeon_emit(EVENT_TYPE(__event_type) | \
|
|
+ EVENT_INDEX(__event_type == V_028A90_VS_PARTIAL_FLUSH || \
|
|
+ __event_type == V_028A90_PS_PARTIAL_FLUSH || \
|
|
+ __event_type == V_028A90_CS_PARTIAL_FLUSH ? 4 : \
|
|
+ __event_type == V_028A90_PIXEL_PIPE_STAT_CONTROL ? 1 : 0)); \
|
|
+} while (0)
|
|
+
|
|
/* This should be evaluated at compile time if all parameters are constants. */
|
|
static ALWAYS_INLINE unsigned
|
|
si_get_user_data_base(enum amd_gfx_level gfx_level, enum si_has_tess has_tess,
|
|
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
|
|
index 727face471e..01aa75d222a 100644
|
|
--- a/src/gallium/drivers/radeonsi/si_compute.c
|
|
+++ b/src/gallium/drivers/radeonsi/si_compute.c
|
|
@@ -1114,10 +1114,9 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_
|
|
radeon_emit(dispatch_initiator);
|
|
}
|
|
|
|
- if (unlikely(sctx->sqtt_enabled && sctx->gfx_level >= GFX9)) {
|
|
- radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
|
|
- radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) | EVENT_INDEX(0));
|
|
- }
|
|
+ if (unlikely(sctx->sqtt_enabled && sctx->gfx_level >= GFX9))
|
|
+ radeon_event_write(V_028A90_THREAD_TRACE_MARKER);
|
|
+
|
|
radeon_end();
|
|
}
|
|
|
|
diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c
|
|
index 31814bc94d2..d8dccd9b008 100644
|
|
--- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
|
|
+++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
|
|
@@ -160,8 +160,7 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_h
|
|
*/
|
|
if ((ctx->gfx_level == GFX11 || ctx->gfx_level == GFX11_5) && ctx->has_tessellation) {
|
|
radeon_begin(cs);
|
|
- radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
|
|
- radeon_emit(EVENT_TYPE(V_028A90_SQ_NON_EVENT) | EVENT_INDEX(0));
|
|
+ radeon_event_write(V_028A90_SQ_NON_EVENT);
|
|
radeon_end();
|
|
}
|
|
|
|
@@ -797,10 +796,8 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs)
|
|
|
|
radeon_begin(cs);
|
|
|
|
- if (flags & SI_CONTEXT_VGT_FLUSH) {
|
|
- radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
|
|
- radeon_emit(EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
|
|
- }
|
|
+ if (flags & SI_CONTEXT_VGT_FLUSH)
|
|
+ radeon_event_write(V_028A90_VGT_FLUSH);
|
|
|
|
if (flags & SI_CONTEXT_INV_ICACHE)
|
|
gcr_cntl |= S_586_GLI_INV(V_586_GLI_ALL);
|
|
@@ -837,19 +834,15 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs)
|
|
}
|
|
|
|
if (flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) {
|
|
- if (ctx->gfx_level < GFX12 && flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
|
|
- /* Flush CMASK/FMASK/DCC. Will wait for idle later. */
|
|
- radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
|
|
- radeon_emit(EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0));
|
|
- }
|
|
+ /* Flush CMASK/FMASK/DCC. Will wait for idle later. */
|
|
+ if (ctx->gfx_level < GFX12 && flags & SI_CONTEXT_FLUSH_AND_INV_CB)
|
|
+ radeon_event_write(V_028A90_FLUSH_AND_INV_CB_META);
|
|
|
|
/* Gfx11 can't flush DB_META and should use a TS event instead. */
|
|
+ /* Flush HTILE. Will wait for idle later. */
|
|
if (ctx->gfx_level < GFX12 && ctx->gfx_level != GFX11 &&
|
|
- flags & SI_CONTEXT_FLUSH_AND_INV_DB) {
|
|
- /* Flush HTILE. Will wait for idle later. */
|
|
- radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
|
|
- radeon_emit(EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0));
|
|
- }
|
|
+ flags & SI_CONTEXT_FLUSH_AND_INV_DB)
|
|
+ radeon_event_write(V_028A90_FLUSH_AND_INV_DB_META);
|
|
|
|
/* First flush CB/DB, then L1/L2. */
|
|
gcr_cntl |= S_586_SEQ(V_586_SEQ_FORWARD);
|
|
@@ -870,21 +863,18 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs)
|
|
} else {
|
|
/* Wait for graphics shaders to go idle if requested. */
|
|
if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH) {
|
|
- radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
|
|
- radeon_emit(EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
|
|
+ radeon_event_write(V_028A90_PS_PARTIAL_FLUSH);
|
|
/* Only count explicit shader flushes, not implicit ones. */
|
|
ctx->num_vs_flushes++;
|
|
ctx->num_ps_flushes++;
|
|
} else if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH) {
|
|
- radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
|
|
- radeon_emit(EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
|
|
+ radeon_event_write(V_028A90_VS_PARTIAL_FLUSH);
|
|
ctx->num_vs_flushes++;
|
|
}
|
|
}
|
|
|
|
if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && ctx->compute_is_busy) {
|
|
- radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
|
|
- radeon_emit(EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH | EVENT_INDEX(4)));
|
|
+ radeon_event_write(V_028A90_CS_PARTIAL_FLUSH);
|
|
ctx->num_cs_flushes++;
|
|
ctx->compute_is_busy = false;
|
|
}
|
|
@@ -1026,12 +1016,10 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs)
|
|
}
|
|
|
|
if (flags & SI_CONTEXT_START_PIPELINE_STATS && ctx->pipeline_stats_enabled != 1) {
|
|
- radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
|
|
- radeon_emit(EVENT_TYPE(V_028A90_PIPELINESTAT_START) | EVENT_INDEX(0));
|
|
+ radeon_event_write(V_028A90_PIPELINESTAT_START);
|
|
ctx->pipeline_stats_enabled = 1;
|
|
} else if (flags & SI_CONTEXT_STOP_PIPELINE_STATS && ctx->pipeline_stats_enabled != 0) {
|
|
- radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
|
|
- radeon_emit(EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | EVENT_INDEX(0));
|
|
+ radeon_event_write(V_028A90_PIPELINESTAT_STOP);
|
|
ctx->pipeline_stats_enabled = 0;
|
|
}
|
|
radeon_end();
|
|
@@ -1092,16 +1080,13 @@ void gfx6_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs)
|
|
|
|
radeon_begin(cs);
|
|
|
|
- if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
|
|
- /* Flush CMASK/FMASK/DCC. SURFACE_SYNC will wait for idle. */
|
|
- radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
|
|
- radeon_emit(EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0));
|
|
- }
|
|
- if (flags & (SI_CONTEXT_FLUSH_AND_INV_DB | SI_CONTEXT_FLUSH_AND_INV_DB_META)) {
|
|
- /* Flush HTILE. SURFACE_SYNC will wait for idle. */
|
|
- radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
|
|
- radeon_emit(EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0));
|
|
- }
|
|
+ /* Flush CMASK/FMASK/DCC. SURFACE_SYNC will wait for idle. */
|
|
+ if (flags & SI_CONTEXT_FLUSH_AND_INV_CB)
|
|
+ radeon_event_write(V_028A90_FLUSH_AND_INV_CB_META);
|
|
+
|
|
+ /* Flush HTILE. SURFACE_SYNC will wait for idle. */
|
|
+ if (flags & (SI_CONTEXT_FLUSH_AND_INV_DB | SI_CONTEXT_FLUSH_AND_INV_DB_META))
|
|
+ radeon_event_write(V_028A90_FLUSH_AND_INV_DB_META);
|
|
|
|
/* Wait for shader engines to go idle.
|
|
* VS and PS waits are unnecessary if SURFACE_SYNC is going to wait
|
|
@@ -1109,36 +1094,28 @@ void gfx6_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs)
|
|
*/
|
|
if (!flush_cb_db) {
|
|
if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH) {
|
|
- radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
|
|
- radeon_emit(EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
|
|
- /* Only count explicit shader flushes, not implicit ones
|
|
- * done by SURFACE_SYNC.
|
|
- */
|
|
+ radeon_event_write(V_028A90_PS_PARTIAL_FLUSH);
|
|
+ /* Only count explicit shader flushes, not implicit ones done by SURFACE_SYNC. */
|
|
sctx->num_vs_flushes++;
|
|
sctx->num_ps_flushes++;
|
|
} else if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH) {
|
|
- radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
|
|
- radeon_emit(EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
|
|
+ radeon_event_write(V_028A90_VS_PARTIAL_FLUSH);
|
|
sctx->num_vs_flushes++;
|
|
}
|
|
}
|
|
|
|
if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && sctx->compute_is_busy) {
|
|
- radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
|
|
- radeon_emit(EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
|
|
+ radeon_event_write(V_028A90_CS_PARTIAL_FLUSH);
|
|
sctx->num_cs_flushes++;
|
|
sctx->compute_is_busy = false;
|
|
}
|
|
|
|
/* VGT state synchronization. */
|
|
- if (flags & SI_CONTEXT_VGT_FLUSH) {
|
|
- radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
|
|
- radeon_emit(EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
|
|
- }
|
|
- if (flags & SI_CONTEXT_VGT_STREAMOUT_SYNC) {
|
|
- radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
|
|
- radeon_emit(EVENT_TYPE(V_028A90_VGT_STREAMOUT_SYNC) | EVENT_INDEX(0));
|
|
- }
|
|
+ if (flags & SI_CONTEXT_VGT_FLUSH)
|
|
+ radeon_event_write(V_028A90_VGT_FLUSH);
|
|
+
|
|
+ if (flags & SI_CONTEXT_VGT_STREAMOUT_SYNC)
|
|
+ radeon_event_write(V_028A90_VGT_STREAMOUT_SYNC);
|
|
|
|
radeon_end();
|
|
|
|
@@ -1267,14 +1244,12 @@ void gfx6_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs)
|
|
|
|
if (flags & SI_CONTEXT_START_PIPELINE_STATS && sctx->pipeline_stats_enabled != 1) {
|
|
radeon_begin(cs);
|
|
- radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
|
|
- radeon_emit(EVENT_TYPE(V_028A90_PIPELINESTAT_START) | EVENT_INDEX(0));
|
|
+ radeon_event_write(V_028A90_PIPELINESTAT_START);
|
|
radeon_end();
|
|
sctx->pipeline_stats_enabled = 1;
|
|
} else if (flags & SI_CONTEXT_STOP_PIPELINE_STATS && sctx->pipeline_stats_enabled != 0) {
|
|
radeon_begin(cs);
|
|
- radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
|
|
- radeon_emit(EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | EVENT_INDEX(0));
|
|
+ radeon_event_write(V_028A90_PIPELINESTAT_STOP);
|
|
radeon_end();
|
|
sctx->pipeline_stats_enabled = 0;
|
|
}
|
|
diff --git a/src/gallium/drivers/radeonsi/si_perfcounter.c b/src/gallium/drivers/radeonsi/si_perfcounter.c
|
|
index 9cbd08648af..785c98a9dc9 100644
|
|
--- a/src/gallium/drivers/radeonsi/si_perfcounter.c
|
|
+++ b/src/gallium/drivers/radeonsi/si_perfcounter.c
|
|
@@ -114,8 +114,7 @@ static void si_pc_emit_start(struct si_context *sctx, struct si_resource *buffer
|
|
radeon_begin(cs);
|
|
radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL,
|
|
S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET));
|
|
- radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
|
|
- radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_START) | EVENT_INDEX(0));
|
|
+ radeon_event_write(V_028A90_PERFCOUNTER_STOP);
|
|
radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL,
|
|
S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_START_COUNTING));
|
|
radeon_end();
|
|
@@ -132,13 +131,10 @@ static void si_pc_emit_stop(struct si_context *sctx, struct si_resource *buffer,
|
|
si_cp_wait_mem(sctx, cs, va, 0, 0xffffffff, WAIT_REG_MEM_EQUAL);
|
|
|
|
radeon_begin(cs);
|
|
- radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
|
|
- radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0));
|
|
+ radeon_event_write(V_028A90_PERFCOUNTER_SAMPLE);
|
|
|
|
- if (!sctx->screen->info.never_send_perfcounter_stop) {
|
|
- radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
|
|
- radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0));
|
|
- }
|
|
+ if (!sctx->screen->info.never_send_perfcounter_stop)
|
|
+ radeon_event_write(V_028A90_PERFCOUNTER_STOP);
|
|
|
|
radeon_set_uconfig_reg(
|
|
R_036020_CP_PERFMON_CNTL,
|
|
@@ -158,8 +154,7 @@ void si_pc_emit_spm_start(struct radeon_cmdbuf *cs)
|
|
S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) |
|
|
S_036020_SPM_PERFMON_STATE(V_036020_STRM_PERFMON_STATE_START_COUNTING));
|
|
/* Start windowed performance counters. */
|
|
- radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
|
|
- radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_START) | EVENT_INDEX(0));
|
|
+ radeon_event_write(V_028A90_PERFCOUNTER_START);
|
|
radeon_set_sh_reg(R_00B82C_COMPUTE_PERFCOUNT_ENABLE, S_00B82C_PERFCOUNT_ENABLE(1));
|
|
|
|
radeon_end();
|
|
@@ -171,10 +166,8 @@ void si_pc_emit_spm_stop(struct radeon_cmdbuf *cs, bool never_stop_sq_perf_count
|
|
radeon_begin(cs);
|
|
|
|
/* Stop windowed performance counters. */
|
|
- if (!never_send_perfcounter_stop) {
|
|
- radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
|
|
- radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0));
|
|
- }
|
|
+ if (!never_send_perfcounter_stop)
|
|
+ radeon_event_write(V_028A90_PERFCOUNTER_STOP);
|
|
|
|
radeon_set_sh_reg(R_00B82C_COMPUTE_PERFCOUNT_ENABLE, S_00B82C_PERFCOUNT_ENABLE(0));
|
|
|
|
diff --git a/src/gallium/drivers/radeonsi/si_query.c b/src/gallium/drivers/radeonsi/si_query.c
|
|
index 18c19b58063..eca647505ae 100644
|
|
--- a/src/gallium/drivers/radeonsi/si_query.c
|
|
+++ b/src/gallium/drivers/radeonsi/si_query.c
|
|
@@ -980,8 +980,7 @@ static void si_query_hw_do_emit_stop(struct si_context *sctx, struct si_query_hw
|
|
|
|
radeon_begin(cs);
|
|
if (sctx->screen->use_ngg && query->flags & SI_QUERY_EMULATE_GS_COUNTERS) {
|
|
- radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
|
|
- radeon_emit(EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
|
|
+ radeon_event_write(V_028A90_VS_PARTIAL_FLUSH);
|
|
|
|
if (--sctx->num_pipeline_stat_emulated_queries == 0) {
|
|
si_set_internal_shader_buffer(sctx, SI_GS_QUERY_BUF, NULL);
|
|
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
|
|
index 018be3dbe4f..8af8bae58ae 100644
|
|
--- a/src/gallium/drivers/radeonsi/si_state.c
|
|
+++ b/src/gallium/drivers/radeonsi/si_state.c
|
|
@@ -62,8 +62,7 @@ static void si_emit_cb_render_state(struct si_context *sctx, unsigned index)
|
|
sctx->last_cb_target_mask = cb_target_mask;
|
|
|
|
radeon_begin(cs);
|
|
- radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
|
|
- radeon_emit(EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
|
|
+ radeon_event_write(V_028A90_BREAK_BATCH);
|
|
radeon_end();
|
|
}
|
|
|
|
@@ -3191,10 +3190,9 @@ static void gfx6_emit_framebuffer_state(struct si_context *sctx, unsigned index)
|
|
S_028208_BR_X(state->width) | S_028208_BR_Y(state->height));
|
|
|
|
if (sctx->screen->dpbb_allowed &&
|
|
- sctx->screen->pbb_context_states_per_bin > 1) {
|
|
- radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
|
|
- radeon_emit(EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
|
|
- }
|
|
+ sctx->screen->pbb_context_states_per_bin > 1)
|
|
+ radeon_event_write(V_028A90_BREAK_BATCH);
|
|
+
|
|
radeon_end();
|
|
|
|
si_update_display_dcc_dirty(sctx);
|
|
@@ -3341,10 +3339,9 @@ static void gfx11_dgpu_emit_framebuffer_state(struct si_context *sctx, unsigned
|
|
gfx11_end_packed_context_regs();
|
|
|
|
if (sctx->screen->dpbb_allowed &&
|
|
- sctx->screen->pbb_context_states_per_bin > 1) {
|
|
- radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
|
|
- radeon_emit(EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
|
|
- }
|
|
+ sctx->screen->pbb_context_states_per_bin > 1)
|
|
+ radeon_event_write(V_028A90_BREAK_BATCH);
|
|
+
|
|
radeon_end();
|
|
|
|
si_update_display_dcc_dirty(sctx);
|
|
@@ -3478,10 +3475,9 @@ static void gfx12_emit_framebuffer_state(struct si_context *sctx, unsigned index
|
|
gfx12_end_context_regs();
|
|
|
|
if (sctx->screen->dpbb_allowed &&
|
|
- sctx->screen->pbb_context_states_per_bin > 1) {
|
|
- radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
|
|
- radeon_emit(EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
|
|
- }
|
|
+ sctx->screen->pbb_context_states_per_bin > 1)
|
|
+ radeon_event_write(V_028A90_BREAK_BATCH);
|
|
+
|
|
radeon_end();
|
|
|
|
sctx->framebuffer.dirty_cbufs = 0;
|
|
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp
|
|
index bd2c936306b..81565b7694e 100644
|
|
--- a/src/gallium/drivers/radeonsi/si_state_draw.cpp
|
|
+++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp
|
|
@@ -1212,16 +1212,6 @@ void si_emit_buffered_compute_sh_regs(struct si_context *sctx)
|
|
|
|
#endif
|
|
|
|
-#define EMIT_SQTT_END_DRAW \
|
|
- do { \
|
|
- if (GFX_VERSION >= GFX9 && unlikely(sctx->sqtt_enabled)) { \
|
|
- radeon_begin(&sctx->gfx_cs); \
|
|
- radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); \
|
|
- radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) | EVENT_INDEX(0)); \
|
|
- radeon_end(); \
|
|
- } \
|
|
- } while (0)
|
|
-
|
|
template <amd_gfx_level GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG,
|
|
si_is_draw_vertex_state IS_DRAW_VERTEX_STATE, si_has_sh_pairs_packed HAS_SH_PAIRS_PACKED> ALWAYS_INLINE
|
|
static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw_info *info,
|
|
@@ -1638,10 +1628,8 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
|
|
radeon_emit(0);
|
|
radeon_emit(V_0287F0_DI_SRC_SEL_AUTO_INDEX | use_opaque);
|
|
|
|
- for (unsigned i = 0; i < 3; i++) {
|
|
- radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
|
|
- radeon_emit(EVENT_TYPE(V_028A90_SQ_NON_EVENT) | EVENT_INDEX(0));
|
|
- }
|
|
+ for (unsigned i = 0; i < 3; i++)
|
|
+ radeon_event_write(V_028A90_SQ_NON_EVENT);
|
|
} else if (increment_draw_id) {
|
|
for (unsigned i = 0; i < num_draws; i++) {
|
|
if (i > 0) {
|
|
@@ -1675,9 +1663,10 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
|
|
}
|
|
}
|
|
}
|
|
- radeon_end();
|
|
|
|
- EMIT_SQTT_END_DRAW;
|
|
+ if (GFX_VERSION >= GFX9 && unlikely(sctx->sqtt_enabled))
|
|
+ radeon_event_write(V_028A90_THREAD_TRACE_MARKER);
|
|
+ radeon_end();
|
|
}
|
|
|
|
/* Return false if not bound. */
|
|
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
|
|
index 3eb56675313..a3a1b613337 100644
|
|
--- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp
|
|
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
|
|
@@ -4081,12 +4081,10 @@ static void si_emit_vgt_flush(struct radeon_cmdbuf *cs)
|
|
radeon_begin(cs);
|
|
|
|
/* This is required before VGT_FLUSH. */
|
|
- radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
|
|
- radeon_emit(EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
|
|
+ radeon_event_write(V_028A90_VS_PARTIAL_FLUSH);
|
|
|
|
/* VGT_FLUSH is required even if VGT is idle. It resets VGT pointers. */
|
|
- radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
|
|
- radeon_emit(EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
|
|
+ radeon_event_write(V_028A90_VGT_FLUSH);
|
|
radeon_end();
|
|
}
|
|
|
|
@@ -4973,11 +4971,8 @@ static void si_emit_spi_ge_ring_state(struct si_context *sctx, unsigned index)
|
|
|
|
radeon_begin(&sctx->gfx_cs);
|
|
/* Required before writing tessellation config registers. */
|
|
- radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
|
|
- radeon_emit(EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
|
|
-
|
|
- radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
|
|
- radeon_emit(EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
|
|
+ radeon_event_write(V_028A90_VS_PARTIAL_FLUSH);
|
|
+ radeon_event_write(V_028A90_VGT_FLUSH);
|
|
|
|
if (sctx->gfx_level >= GFX7) {
|
|
radeon_set_uconfig_reg_seq(R_030938_VGT_TF_RING_SIZE, 3);
|
|
diff --git a/src/gallium/drivers/radeonsi/si_state_streamout.c b/src/gallium/drivers/radeonsi/si_state_streamout.c
|
|
index 8047965ff84..9a92b4a4f3d 100644
|
|
--- a/src/gallium/drivers/radeonsi/si_state_streamout.c
|
|
+++ b/src/gallium/drivers/radeonsi/si_state_streamout.c
|
|
@@ -259,8 +259,7 @@ static void si_flush_vgt_streamout(struct si_context *sctx)
|
|
radeon_set_config_reg(reg_strmout_cntl, 0);
|
|
}
|
|
|
|
- radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
|
|
- radeon_emit(EVENT_TYPE(V_028A90_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0));
|
|
+ radeon_event_write(V_028A90_SO_VGTSTREAMOUT_FLUSH);
|
|
|
|
radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0));
|
|
radeon_emit(WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
|
|
--
|
|
2.48.1
|
|
|
|
|
|
From eefc15a11186a5b806ed5b2fcfbd2c9e1ad3d8e9 Mon Sep 17 00:00:00 2001
|
|
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
|
|
Date: Mon, 16 Dec 2024 03:59:13 -0500
|
|
Subject: [PATCH 05/27] radeonsi/gfx12: adjust HiZ/HiS logic
|
|
|
|
Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
|
|
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32653>
|
|
---
|
|
src/amd/common/ac_gpu_info.c | 1 +
|
|
src/amd/common/ac_surface.c | 8 ++-
|
|
src/gallium/drivers/radeonsi/si_build_pm4.h | 14 +++++
|
|
.../drivers/radeonsi/si_debug_options.h | 1 +
|
|
src/gallium/drivers/radeonsi/si_pipe.h | 1 +
|
|
src/gallium/drivers/radeonsi/si_state.c | 51 ++++++++++++----
|
|
.../drivers/radeonsi/si_state_draw.cpp | 61 +++++++++++++++----
|
|
7 files changed, 110 insertions(+), 27 deletions(-)
|
|
|
|
diff --git a/src/amd/common/ac_gpu_info.c b/src/amd/common/ac_gpu_info.c
|
|
index 0d475cdab63..dd43af8dbb6 100644
|
|
--- a/src/amd/common/ac_gpu_info.c
|
|
+++ b/src/amd/common/ac_gpu_info.c
|
|
@@ -593,6 +593,7 @@ static void handle_env_var_force_family(struct radeon_info *info)
|
|
info->gfx_level = ac_get_gfx_level(i);
|
|
info->family_id = ac_get_family_id(i);
|
|
info->family_overridden = true;
|
|
+ info->chip_rev = 1;
|
|
return;
|
|
}
|
|
}
|
|
diff --git a/src/amd/common/ac_surface.c b/src/amd/common/ac_surface.c
|
|
index 69f722bc4cc..66d773921c4 100644
|
|
--- a/src/amd/common/ac_surface.c
|
|
+++ b/src/amd/common/ac_surface.c
|
|
@@ -3022,7 +3022,7 @@ static bool gfx12_compute_hiz_his_info(struct ac_addrlib *addrlib, const struct
|
|
{
|
|
assert(surf_in->flags.depth != surf_in->flags.stencil);
|
|
|
|
- if (surf->flags & RADEON_SURF_NO_HTILE || (info->gfx_level == GFX12 && info->chip_rev <= 1))
|
|
+ if (surf->flags & RADEON_SURF_NO_HTILE || (info->gfx_level == GFX12 && info->chip_rev == 0))
|
|
return true;
|
|
|
|
ADDR3_COMPUTE_SURFACE_INFO_OUTPUT out = {0};
|
|
@@ -3079,7 +3079,11 @@ static bool gfx12_compute_miptree(struct ac_addrlib *addrlib, const struct radeo
|
|
surf->surf_alignment_log2 = MAX2(surf->surf_alignment_log2, util_logbase2(out.baseAlign));
|
|
surf->surf_size = surf->u.gfx9.zs.stencil_offset + out.surfSize;
|
|
|
|
- return gfx12_compute_hiz_his_info(addrlib, info, surf, &surf->u.gfx9.zs.his, in);
|
|
+ if (info->chip_rev >= 2 &&
|
|
+ !gfx12_compute_hiz_his_info(addrlib, info, surf, &surf->u.gfx9.zs.his, in))
|
|
+ return false;
|
|
+
|
|
+ return true;
|
|
}
|
|
|
|
surf->u.gfx9.surf_slice_size = out.sliceSize;
|
|
diff --git a/src/gallium/drivers/radeonsi/si_build_pm4.h b/src/gallium/drivers/radeonsi/si_build_pm4.h
|
|
index 630b5885ff3..c792d0fd2f8 100644
|
|
--- a/src/gallium/drivers/radeonsi/si_build_pm4.h
|
|
+++ b/src/gallium/drivers/radeonsi/si_build_pm4.h
|
|
@@ -514,6 +514,20 @@
|
|
__event_type == V_028A90_PIXEL_PIPE_STAT_CONTROL ? 1 : 0)); \
|
|
} while (0)
|
|
|
|
+#define radeon_emit_alt_hiz_logic() do { \
|
|
+ static_assert(GFX_VERSION == GFX12 || !ALT_HIZ_LOGIC, ""); \
|
|
+ if (GFX_VERSION == GFX12 && ALT_HIZ_LOGIC) { \
|
|
+ radeon_emit(PKT3(PKT3_RELEASE_MEM, 6, 0)); \
|
|
+ radeon_emit(S_490_EVENT_TYPE(V_028A90_BOTTOM_OF_PIPE_TS) | S_490_EVENT_INDEX(5)); \
|
|
+ radeon_emit(0); /* DST_SEL, INT_SEL = no write confirm, DATA_SEL = no data */ \
|
|
+ radeon_emit(0); /* ADDRESS_LO */ \
|
|
+ radeon_emit(0); /* ADDRESS_HI */ \
|
|
+ radeon_emit(0); /* DATA_LO */ \
|
|
+ radeon_emit(0); /* DATA_HI */ \
|
|
+ radeon_emit(0); /* INT_CTXID */ \
|
|
+ } \
|
|
+} while (0)
|
|
+
|
|
/* This should be evaluated at compile time if all parameters are constants. */
|
|
static ALWAYS_INLINE unsigned
|
|
si_get_user_data_base(enum amd_gfx_level gfx_level, enum si_has_tess has_tess,
|
|
diff --git a/src/gallium/drivers/radeonsi/si_debug_options.h b/src/gallium/drivers/radeonsi/si_debug_options.h
|
|
index ba2c2336ee0..a5057c4700e 100644
|
|
--- a/src/gallium/drivers/radeonsi/si_debug_options.h
|
|
+++ b/src/gallium/drivers/radeonsi/si_debug_options.h
|
|
@@ -23,6 +23,7 @@ OPT_BOOL(zerovram, false, "Zero all VRAM allocations")
|
|
OPT_BOOL(clear_lds, false, "Clear LDS at the end of shaders. Might decrease performance.")
|
|
OPT_BOOL(cache_rb_gl2, false, "Enable GL2 caching for CB and DB.")
|
|
OPT_BOOL(optimize_io, true, "Run nir_opt_varyings in the GLSL linker.")
|
|
+OPT_BOOL(alt_hiz_logic, false, "Enable alternative HiZ logic")
|
|
|
|
#undef OPT_BOOL
|
|
#undef OPT_INT
|
|
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
|
|
index a7f5b544051..0002a3056c9 100644
|
|
--- a/src/gallium/drivers/radeonsi/si_pipe.h
|
|
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
|
|
@@ -398,6 +398,7 @@ struct si_texture {
|
|
bool can_sample_z : 1;
|
|
bool can_sample_s : 1;
|
|
bool need_flush_after_depth_decompression: 1;
|
|
+ bool force_disable_hiz_his : 1;
|
|
|
|
/* We need to track DCC dirtiness, because st/dri usually calls
|
|
* flush_resource twice per frame (not a bug) and we don't wanna
|
|
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
|
|
index 8af8bae58ae..a647defc26c 100644
|
|
--- a/src/gallium/drivers/radeonsi/si_state.c
|
|
+++ b/src/gallium/drivers/radeonsi/si_state.c
|
|
@@ -1713,6 +1713,20 @@ static void si_bind_dsa_state(struct pipe_context *ctx, void *state)
|
|
si_mark_atom_dirty(sctx, &sctx->atoms.s.stencil_ref);
|
|
}
|
|
|
|
+ struct pipe_surface *zssurf = sctx->framebuffer.state.zsbuf;
|
|
+ struct si_texture *zstex = (struct si_texture*)(zssurf ? zssurf->texture : NULL);
|
|
+
|
|
+ if (sctx->gfx_level == GFX12 && !sctx->screen->options.alt_hiz_logic &&
|
|
+ sctx->framebuffer.has_stencil && dsa->stencil_enabled && !zstex->force_disable_hiz_his) {
|
|
+ zstex->force_disable_hiz_his = true;
|
|
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
|
|
+
|
|
+ if (sctx->framebuffer.has_hiz_his) {
|
|
+ sctx->framebuffer.has_hiz_his = false;
|
|
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
|
|
+ }
|
|
+ }
|
|
+
|
|
if (old_dsa->alpha_func != dsa->alpha_func) {
|
|
si_ps_key_update_dsa(sctx);
|
|
si_update_ps_inputs_read_or_disabled(sctx);
|
|
@@ -2814,9 +2828,14 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
|
|
if (util_format_has_stencil(util_format_description(zstex->buffer.b.b.format)))
|
|
sctx->framebuffer.has_stencil = true;
|
|
|
|
+ if (sctx->gfx_level == GFX12 && !sctx->screen->options.alt_hiz_logic &&
|
|
+ sctx->framebuffer.has_stencil && sctx->queued.named.dsa->stencil_enabled)
|
|
+ zstex->force_disable_hiz_his = true;
|
|
+
|
|
if (sctx->gfx_level >= GFX12) {
|
|
- sctx->framebuffer.has_hiz_his = zstex->surface.u.gfx9.zs.hiz.offset ||
|
|
- zstex->surface.u.gfx9.zs.his.offset;
|
|
+ sctx->framebuffer.has_hiz_his = (zstex->surface.u.gfx9.zs.hiz.offset ||
|
|
+ zstex->surface.u.gfx9.zs.his.offset) &&
|
|
+ !zstex->force_disable_hiz_his;
|
|
}
|
|
}
|
|
|
|
@@ -3443,18 +3462,24 @@ static void gfx12_emit_framebuffer_state(struct si_context *sctx, unsigned index
|
|
gfx12_set_context_reg(R_028034_DB_STENCIL_READ_BASE_HI, zb->ds.db_stencil_base >> 32);
|
|
gfx12_set_context_reg(R_028038_DB_STENCIL_WRITE_BASE, zb->ds.db_stencil_base);
|
|
gfx12_set_context_reg(R_02803C_DB_STENCIL_WRITE_BASE_HI, zb->ds.db_stencil_base >> 32);
|
|
- gfx12_set_context_reg(R_028B94_PA_SC_HIZ_INFO, zb->ds.u.gfx12.hiz_info);
|
|
- gfx12_set_context_reg(R_028B98_PA_SC_HIS_INFO, zb->ds.u.gfx12.his_info);
|
|
|
|
- if (zb->ds.u.gfx12.hiz_info) {
|
|
- gfx12_set_context_reg(R_028B9C_PA_SC_HIZ_BASE, zb->ds.u.gfx12.hiz_base);
|
|
- gfx12_set_context_reg(R_028BA0_PA_SC_HIZ_BASE_EXT, zb->ds.u.gfx12.hiz_base >> 32);
|
|
- gfx12_set_context_reg(R_028BA4_PA_SC_HIZ_SIZE_XY, zb->ds.u.gfx12.hiz_size_xy);
|
|
- }
|
|
- if (zb->ds.u.gfx12.his_info) {
|
|
- gfx12_set_context_reg(R_028BA8_PA_SC_HIS_BASE, zb->ds.u.gfx12.his_base);
|
|
- gfx12_set_context_reg(R_028BAC_PA_SC_HIS_BASE_EXT, zb->ds.u.gfx12.his_base >> 32);
|
|
- gfx12_set_context_reg(R_028BB0_PA_SC_HIS_SIZE_XY, zb->ds.u.gfx12.his_size_xy);
|
|
+ if (tex->force_disable_hiz_his) {
|
|
+ gfx12_set_context_reg(R_028B94_PA_SC_HIZ_INFO, S_028B94_SURFACE_ENABLE(0));
|
|
+ gfx12_set_context_reg(R_028B98_PA_SC_HIS_INFO, S_028B98_SURFACE_ENABLE(0));
|
|
+ } else {
|
|
+ gfx12_set_context_reg(R_028B94_PA_SC_HIZ_INFO, zb->ds.u.gfx12.hiz_info);
|
|
+ gfx12_set_context_reg(R_028B98_PA_SC_HIS_INFO, zb->ds.u.gfx12.his_info);
|
|
+
|
|
+ if (zb->ds.u.gfx12.hiz_info) {
|
|
+ gfx12_set_context_reg(R_028B9C_PA_SC_HIZ_BASE, zb->ds.u.gfx12.hiz_base);
|
|
+ gfx12_set_context_reg(R_028BA0_PA_SC_HIZ_BASE_EXT, zb->ds.u.gfx12.hiz_base >> 32);
|
|
+ gfx12_set_context_reg(R_028BA4_PA_SC_HIZ_SIZE_XY, zb->ds.u.gfx12.hiz_size_xy);
|
|
+ }
|
|
+ if (zb->ds.u.gfx12.his_info) {
|
|
+ gfx12_set_context_reg(R_028BA8_PA_SC_HIS_BASE, zb->ds.u.gfx12.his_base);
|
|
+ gfx12_set_context_reg(R_028BAC_PA_SC_HIS_BASE_EXT, zb->ds.u.gfx12.his_base >> 32);
|
|
+ gfx12_set_context_reg(R_028BB0_PA_SC_HIS_SIZE_XY, zb->ds.u.gfx12.his_size_xy);
|
|
+ }
|
|
}
|
|
} else if (sctx->framebuffer.dirty_zsbuf) {
|
|
gfx12_set_context_reg(R_028018_DB_Z_INFO,
|
|
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp
|
|
index 81565b7694e..bc4cf308936 100644
|
|
--- a/src/gallium/drivers/radeonsi/si_state_draw.cpp
|
|
+++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp
|
|
@@ -823,6 +823,11 @@ enum si_has_sh_pairs_packed {
|
|
HAS_SH_PAIRS_PACKED_ON,
|
|
};
|
|
|
|
+enum si_alt_hiz_logic {
|
|
+ ALT_HIZ_LOGIC_OFF,
|
|
+ ALT_HIZ_LOGIC_ON,
|
|
+};
|
|
+
|
|
template <si_is_draw_vertex_state IS_DRAW_VERTEX_STATE> ALWAYS_INLINE
|
|
static bool num_instanced_prims_less_than(const struct pipe_draw_indirect_info *indirect,
|
|
enum mesa_prim prim,
|
|
@@ -1213,7 +1218,8 @@ void si_emit_buffered_compute_sh_regs(struct si_context *sctx)
|
|
#endif
|
|
|
|
template <amd_gfx_level GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG,
|
|
- si_is_draw_vertex_state IS_DRAW_VERTEX_STATE, si_has_sh_pairs_packed HAS_SH_PAIRS_PACKED> ALWAYS_INLINE
|
|
+ si_is_draw_vertex_state IS_DRAW_VERTEX_STATE, si_has_sh_pairs_packed HAS_SH_PAIRS_PACKED,
|
|
+ si_alt_hiz_logic ALT_HIZ_LOGIC> ALWAYS_INLINE
|
|
static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw_info *info,
|
|
unsigned drawid_base,
|
|
const struct pipe_draw_indirect_info *indirect,
|
|
@@ -1405,6 +1411,8 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
|
|
radeon_emit((sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2);
|
|
radeon_emit((sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2);
|
|
radeon_emit(di_src_sel);
|
|
+
|
|
+ radeon_emit_alt_hiz_logic();
|
|
} else {
|
|
uint64_t count_va = 0;
|
|
|
|
@@ -1430,6 +1438,8 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
|
|
radeon_emit(count_va >> 32);
|
|
radeon_emit(indirect->stride);
|
|
radeon_emit(di_src_sel);
|
|
+
|
|
+ radeon_emit_alt_hiz_logic();
|
|
}
|
|
} else {
|
|
if (sctx->last_instance_count == SI_INSTANCE_COUNT_UNKNOWN ||
|
|
@@ -1549,6 +1559,8 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
|
|
radeon_emit(va >> 32);
|
|
radeon_emit(draws[i].count);
|
|
radeon_emit(V_0287F0_DI_SRC_SEL_DMA); /* NOT_EOP disabled */
|
|
+
|
|
+ radeon_emit_alt_hiz_logic();
|
|
}
|
|
if (num_draws > 1) {
|
|
BITSET_CLEAR(sctx->tracked_regs.reg_saved_mask, tracked_base_vertex_reg); /* BaseVertex */
|
|
@@ -1568,6 +1580,8 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
|
|
radeon_emit(va >> 32);
|
|
radeon_emit(draws[i].count);
|
|
radeon_emit(V_0287F0_DI_SRC_SEL_DMA); /* NOT_EOP disabled */
|
|
+
|
|
+ radeon_emit_alt_hiz_logic();
|
|
}
|
|
if (num_draws > 1) {
|
|
BITSET_CLEAR(sctx->tracked_regs.reg_saved_mask, tracked_base_vertex_reg + 1); /* DrawID */
|
|
@@ -1588,6 +1602,8 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
|
|
radeon_emit(va >> 32);
|
|
radeon_emit(draws[i].count);
|
|
radeon_emit(V_0287F0_DI_SRC_SEL_DMA); /* NOT_EOP disabled */
|
|
+
|
|
+ radeon_emit_alt_hiz_logic();
|
|
}
|
|
if (num_draws > 1) {
|
|
BITSET_CLEAR(sctx->tracked_regs.reg_saved_mask, tracked_base_vertex_reg); /* BaseVertex */
|
|
@@ -1615,6 +1631,8 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
|
|
radeon_emit(V_0287F0_DI_SRC_SEL_DMA |
|
|
S_0287F0_NOT_EOP(GFX_VERSION >= GFX10 && GFX_VERSION < GFX12 &&
|
|
i < num_draws - 1));
|
|
+
|
|
+ radeon_emit_alt_hiz_logic();
|
|
}
|
|
}
|
|
}
|
|
@@ -1628,6 +1646,7 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
|
|
radeon_emit(0);
|
|
radeon_emit(V_0287F0_DI_SRC_SEL_AUTO_INDEX | use_opaque);
|
|
|
|
+ radeon_emit_alt_hiz_logic();
|
|
for (unsigned i = 0; i < 3; i++)
|
|
radeon_event_write(V_028A90_SQ_NON_EVENT);
|
|
} else if (increment_draw_id) {
|
|
@@ -1643,6 +1662,8 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
|
|
radeon_emit(PKT3(PKT3_DRAW_INDEX_AUTO, 1, render_cond_bit));
|
|
radeon_emit(draws[i].count);
|
|
radeon_emit(V_0287F0_DI_SRC_SEL_AUTO_INDEX | use_opaque);
|
|
+
|
|
+ radeon_emit_alt_hiz_logic();
|
|
}
|
|
if (num_draws > 1 && (IS_DRAW_VERTEX_STATE || !sctx->num_vs_blit_sgprs)) {
|
|
BITSET_CLEAR(sctx->tracked_regs.reg_saved_mask, tracked_base_vertex_reg); /* BaseVertex */
|
|
@@ -1656,6 +1677,8 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
|
|
radeon_emit(PKT3(PKT3_DRAW_INDEX_AUTO, 1, render_cond_bit));
|
|
radeon_emit(draws[i].count);
|
|
radeon_emit(V_0287F0_DI_SRC_SEL_AUTO_INDEX | use_opaque);
|
|
+
|
|
+ radeon_emit_alt_hiz_logic();
|
|
}
|
|
if (num_draws > 1 && (IS_DRAW_VERTEX_STATE || !sctx->num_vs_blit_sgprs)) {
|
|
BITSET_CLEAR(sctx->tracked_regs.reg_saved_mask, tracked_base_vertex_reg); /* BaseVertex */
|
|
@@ -2012,7 +2035,7 @@ static void si_emit_all_states(struct si_context *sctx, uint64_t skip_atom_mask)
|
|
|
|
template <amd_gfx_level GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG,
|
|
si_is_draw_vertex_state IS_DRAW_VERTEX_STATE, si_has_sh_pairs_packed HAS_SH_PAIRS_PACKED,
|
|
- util_popcnt POPCNT> ALWAYS_INLINE
|
|
+ util_popcnt POPCNT, si_alt_hiz_logic ALT_HIZ_LOGIC> ALWAYS_INLINE
|
|
static void si_draw(struct pipe_context *ctx,
|
|
const struct pipe_draw_info *info,
|
|
unsigned drawid_offset,
|
|
@@ -2312,7 +2335,8 @@ static void si_draw(struct pipe_context *ctx,
|
|
return;
|
|
}
|
|
|
|
- si_emit_draw_packets<GFX_VERSION, HAS_TESS, HAS_GS, NGG, IS_DRAW_VERTEX_STATE, HAS_SH_PAIRS_PACKED>
|
|
+ si_emit_draw_packets<GFX_VERSION, HAS_TESS, HAS_GS, NGG, IS_DRAW_VERTEX_STATE,
|
|
+ HAS_SH_PAIRS_PACKED, ALT_HIZ_LOGIC>
|
|
(sctx, info, drawid_offset, indirect, draws, num_draws, indexbuf,
|
|
index_size, index_offset, instance_count);
|
|
/* <-- CUs start to get busy here if we waited. */
|
|
@@ -2362,7 +2386,7 @@ static void si_draw(struct pipe_context *ctx,
|
|
}
|
|
|
|
template <amd_gfx_level GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG,
|
|
- si_has_sh_pairs_packed HAS_SH_PAIRS_PACKED>
|
|
+ si_has_sh_pairs_packed HAS_SH_PAIRS_PACKED, si_alt_hiz_logic ALT_HIZ_LOGIC>
|
|
static void si_draw_vbo(struct pipe_context *ctx,
|
|
const struct pipe_draw_info *info,
|
|
unsigned drawid_offset,
|
|
@@ -2370,12 +2394,14 @@ static void si_draw_vbo(struct pipe_context *ctx,
|
|
const struct pipe_draw_start_count_bias *draws,
|
|
unsigned num_draws)
|
|
{
|
|
- si_draw<GFX_VERSION, HAS_TESS, HAS_GS, NGG, DRAW_VERTEX_STATE_OFF, HAS_SH_PAIRS_PACKED, POPCNT_NO>
|
|
+ si_draw<GFX_VERSION, HAS_TESS, HAS_GS, NGG, DRAW_VERTEX_STATE_OFF, HAS_SH_PAIRS_PACKED,
|
|
+ POPCNT_NO, ALT_HIZ_LOGIC>
|
|
(ctx, info, drawid_offset, indirect, draws, num_draws, NULL, 0);
|
|
}
|
|
|
|
template <amd_gfx_level GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG,
|
|
- si_has_sh_pairs_packed HAS_SH_PAIRS_PACKED, util_popcnt POPCNT>
|
|
+ si_has_sh_pairs_packed HAS_SH_PAIRS_PACKED, util_popcnt POPCNT,
|
|
+ si_alt_hiz_logic ALT_HIZ_LOGIC>
|
|
static void si_draw_vertex_state(struct pipe_context *ctx,
|
|
struct pipe_vertex_state *vstate,
|
|
uint32_t partial_velem_mask,
|
|
@@ -2391,7 +2417,8 @@ static void si_draw_vertex_state(struct pipe_context *ctx,
|
|
dinfo.instance_count = 1;
|
|
dinfo.index.resource = state->b.input.indexbuf;
|
|
|
|
- si_draw<GFX_VERSION, HAS_TESS, HAS_GS, NGG, DRAW_VERTEX_STATE_ON, HAS_SH_PAIRS_PACKED, POPCNT>
|
|
+ si_draw<GFX_VERSION, HAS_TESS, HAS_GS, NGG, DRAW_VERTEX_STATE_ON, HAS_SH_PAIRS_PACKED, POPCNT,
|
|
+ ALT_HIZ_LOGIC>
|
|
(ctx, &dinfo, 0, NULL, draws, num_draws, vstate, partial_velem_mask);
|
|
|
|
if (info.take_vertex_state_ownership)
|
|
@@ -2453,18 +2480,28 @@ static void si_init_draw_vbo(struct si_context *sctx)
|
|
if (!NGG && GFX_VERSION >= GFX11)
|
|
return;
|
|
|
|
- if (GFX_VERSION >= GFX11 && GFX_VERSION < GFX12 && sctx->screen->info.has_set_sh_pairs_packed) {
|
|
+ if (GFX_VERSION == GFX12 && sctx->screen->options.alt_hiz_logic) {
|
|
+ sctx->draw_vbo[HAS_TESS][HAS_GS][NGG] =
|
|
+ si_draw_vbo<GFX12, HAS_TESS, HAS_GS, NGG, HAS_SH_PAIRS_PACKED_OFF, ALT_HIZ_LOGIC_ON>;
|
|
+
|
|
+ sctx->draw_vertex_state[HAS_TESS][HAS_GS][NGG] =
|
|
+ si_draw_vertex_state<GFX12, HAS_TESS, HAS_GS, NGG, HAS_SH_PAIRS_PACKED_OFF, POPCNT,
|
|
+ ALT_HIZ_LOGIC_ON>;
|
|
+ } else if (GFX_VERSION >= GFX11 && GFX_VERSION < GFX12 &&
|
|
+ sctx->screen->info.has_set_sh_pairs_packed) {
|
|
sctx->draw_vbo[HAS_TESS][HAS_GS][NGG] =
|
|
- si_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG, HAS_SH_PAIRS_PACKED_ON>;
|
|
+ si_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG, HAS_SH_PAIRS_PACKED_ON, ALT_HIZ_LOGIC_OFF>;
|
|
|
|
sctx->draw_vertex_state[HAS_TESS][HAS_GS][NGG] =
|
|
- si_draw_vertex_state<GFX_VERSION, HAS_TESS, HAS_GS, NGG, HAS_SH_PAIRS_PACKED_ON, POPCNT>;
|
|
+ si_draw_vertex_state<GFX_VERSION, HAS_TESS, HAS_GS, NGG, HAS_SH_PAIRS_PACKED_ON, POPCNT,
|
|
+ ALT_HIZ_LOGIC_OFF>;
|
|
} else {
|
|
sctx->draw_vbo[HAS_TESS][HAS_GS][NGG] =
|
|
- si_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG, HAS_SH_PAIRS_PACKED_OFF>;
|
|
+ si_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG, HAS_SH_PAIRS_PACKED_OFF, ALT_HIZ_LOGIC_OFF>;
|
|
|
|
sctx->draw_vertex_state[HAS_TESS][HAS_GS][NGG] =
|
|
- si_draw_vertex_state<GFX_VERSION, HAS_TESS, HAS_GS, NGG, HAS_SH_PAIRS_PACKED_OFF, POPCNT>;
|
|
+ si_draw_vertex_state<GFX_VERSION, HAS_TESS, HAS_GS, NGG, HAS_SH_PAIRS_PACKED_OFF, POPCNT,
|
|
+ ALT_HIZ_LOGIC_OFF>;
|
|
}
|
|
}
|
|
|
|
--
|
|
2.48.1
|
|
|
|
|
|
From a4f5130de6d737aa1264d3a981f95b81a9f13d05 Mon Sep 17 00:00:00 2001
|
|
From: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
|
|
Date: Thu, 21 Nov 2024 13:32:55 +0100
|
|
Subject: [PATCH 06/27] radeonsi: disable DCC for PIPE_BIND_USE_FRONT_RENDERING
|
|
MIME-Version: 1.0
|
|
Content-Type: text/plain; charset=UTF-8
|
|
Content-Transfer-Encoding: 8bit
|
|
|
|
Front rendering and (display) DCC are causing artifacts on screen.
|
|
|
|
si_texture_get_handle deals with this problem by disabling dcc, but
|
|
we can make it simpler by not allocating DCC at all when this flag
|
|
is set.
|
|
|
|
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
|
|
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32281>
|
|
---
|
|
src/gallium/drivers/radeonsi/si_texture.c | 19 +++++++++++++++++++
|
|
1 file changed, 19 insertions(+)
|
|
|
|
diff --git a/src/gallium/drivers/radeonsi/si_texture.c b/src/gallium/drivers/radeonsi/si_texture.c
|
|
index ae65b725600..9cc11a81669 100644
|
|
--- a/src/gallium/drivers/radeonsi/si_texture.c
|
|
+++ b/src/gallium/drivers/radeonsi/si_texture.c
|
|
@@ -240,6 +240,8 @@ static int si_init_surface(struct si_screen *sscreen, struct radeon_surf *surfac
|
|
|
|
if (modifier == DRM_FORMAT_MOD_INVALID &&
|
|
(ptex->bind & PIPE_BIND_CONST_BW ||
|
|
+ ptex->bind & PIPE_BIND_PROTECTED ||
|
|
+ ptex->bind & PIPE_BIND_USE_FRONT_RENDERING ||
|
|
sscreen->debug_flags & DBG(NO_DCC) ||
|
|
(ptex->bind & PIPE_BIND_SCANOUT && sscreen->debug_flags & DBG(NO_DISPLAY_DCC))))
|
|
flags |= RADEON_SURF_DISABLE_DCC;
|
|
@@ -289,6 +291,9 @@ static int si_init_surface(struct si_screen *sscreen, struct radeon_surf *surfac
|
|
if (ptex->bind & PIPE_BIND_CONST_BW)
|
|
flags |= RADEON_SURF_DISABLE_DCC;
|
|
|
|
+ if (ptex->bind & PIPE_BIND_USE_FRONT_RENDERING)
|
|
+ flags |= RADEON_SURF_DISABLE_DCC;
|
|
+
|
|
switch (sscreen->info.gfx_level) {
|
|
case GFX8:
|
|
/* Stoney: 128bpp MSAA textures randomly fail piglit tests with DCC. */
|
|
@@ -1638,6 +1643,20 @@ si_modifier_supports_resource(struct pipe_screen *screen,
|
|
struct si_screen *sscreen = (struct si_screen *)screen;
|
|
uint32_t max_width, max_height;
|
|
|
|
+ if (((templ->bind & PIPE_BIND_LINEAR) || sscreen->debug_flags & DBG(NO_TILING)) &&
|
|
+ modifier != DRM_FORMAT_MOD_LINEAR)
|
|
+ return false;
|
|
+
|
|
+ if ((templ->bind & PIPE_BIND_USE_FRONT_RENDERING) && ac_modifier_has_dcc(modifier))
|
|
+ return false;
|
|
+
|
|
+ /* Protected content doesn't support DCC on GFX12. */
|
|
+ if (sscreen->info.gfx_level >= GFX12 && templ->bind & PIPE_BIND_PROTECTED &&
|
|
+ IS_AMD_FMT_MOD(modifier) &&
|
|
+ AMD_FMT_MOD_GET(TILE_VERSION, modifier) >= AMD_FMT_MOD_TILE_VER_GFX12 &&
|
|
+ AMD_FMT_MOD_GET(DCC, modifier))
|
|
+ return false;
|
|
+
|
|
ac_modifier_max_extent(&sscreen->info, modifier, &max_width, &max_height);
|
|
return templ->width0 <= max_width && templ->height0 <= max_height;
|
|
}
|
|
--
|
|
2.48.1
|
|
|
|
|
|
From 35435069c1063180814aff055365dce939d95e60 Mon Sep 17 00:00:00 2001
|
|
From: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
|
|
Date: Wed, 29 Jan 2025 18:10:12 +0100
|
|
Subject: [PATCH 07/27] radeonsi: update si_need_gfx_cs_space upper bound
|
|
MIME-Version: 1.0
|
|
Content-Type: text/plain; charset=UTF-8
|
|
Content-Transfer-Encoding: 8bit
|
|
|
|
radeon_emit_alt_hiz_logic can add 8 extra dw per draw.
|
|
|
|
Fixes: cdecbee9225 ("radeonsi/gfx12: adjust HiZ/HiS logic")
|
|
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
|
|
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33288>
|
|
---
|
|
src/gallium/drivers/radeonsi/si_compute.c | 2 +-
|
|
src/gallium/drivers/radeonsi/si_cp_dma.c | 2 +-
|
|
src/gallium/drivers/radeonsi/si_perfcounter.c | 2 +-
|
|
src/gallium/drivers/radeonsi/si_pipe.h | 12 ++++++++++--
|
|
src/gallium/drivers/radeonsi/si_query.c | 6 +++---
|
|
src/gallium/drivers/radeonsi/si_state_draw.cpp | 2 +-
|
|
6 files changed, 17 insertions(+), 9 deletions(-)
|
|
|
|
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
|
|
index 01aa75d222a..d2c2bc16500 100644
|
|
--- a/src/gallium/drivers/radeonsi/si_compute.c
|
|
+++ b/src/gallium/drivers/radeonsi/si_compute.c
|
|
@@ -1218,7 +1218,7 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info
|
|
}
|
|
}
|
|
|
|
- si_need_gfx_cs_space(sctx, 0);
|
|
+ si_need_gfx_cs_space(sctx, 0, 0);
|
|
|
|
/* If we're using a secure context, determine if cs must be secure or not */
|
|
if (unlikely(radeon_uses_secure_bos(sctx->ws))) {
|
|
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
|
|
index 89ee80e14f5..f0ba0796377 100644
|
|
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
|
|
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
|
|
@@ -146,7 +146,7 @@ static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst
|
|
bool *is_first, unsigned *packet_flags)
|
|
{
|
|
if (!(user_flags & SI_OP_CPDMA_SKIP_CHECK_CS_SPACE))
|
|
- si_need_gfx_cs_space(sctx, 0);
|
|
+ si_need_gfx_cs_space(sctx, 0, 0);
|
|
|
|
/* This must be done after need_cs_space. */
|
|
if (dst)
|
|
diff --git a/src/gallium/drivers/radeonsi/si_perfcounter.c b/src/gallium/drivers/radeonsi/si_perfcounter.c
|
|
index 785c98a9dc9..806a842a08d 100644
|
|
--- a/src/gallium/drivers/radeonsi/si_perfcounter.c
|
|
+++ b/src/gallium/drivers/radeonsi/si_perfcounter.c
|
|
@@ -277,7 +277,7 @@ static void si_pc_query_resume(struct si_context *sctx, struct si_query *squery)
|
|
|
|
if (!si_query_buffer_alloc(sctx, &query->buffer, NULL, query->result_size))
|
|
return;
|
|
- si_need_gfx_cs_space(sctx, 0);
|
|
+ si_need_gfx_cs_space(sctx, 0, 0);
|
|
|
|
if (query->shaders)
|
|
si_pc_emit_shaders(&sctx->gfx_cs, query->shaders);
|
|
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
|
|
index 0002a3056c9..f132e182b4f 100644
|
|
--- a/src/gallium/drivers/radeonsi/si_pipe.h
|
|
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
|
|
@@ -2026,11 +2026,19 @@ static inline bool util_rast_prim_is_lines_or_triangles(unsigned prim)
|
|
return ((1 << prim) & (UTIL_ALL_PRIM_LINE_MODES | UTIL_ALL_PRIM_TRIANGLE_MODES)) != 0;
|
|
}
|
|
|
|
-static inline void si_need_gfx_cs_space(struct si_context *ctx, unsigned num_draws)
|
|
+static inline void si_need_gfx_cs_space(struct si_context *ctx, unsigned num_draws,
|
|
+ unsigned extra_dw_per_draw)
|
|
{
|
|
struct radeon_cmdbuf *cs = &ctx->gfx_cs;
|
|
+ /* Don't count the needed CS space exactly and just use an upper bound.
|
|
+ *
|
|
+ * Also reserve space for stopping queries at the end of IB, because
|
|
+ * the number of active queries is unlimited in theory.
|
|
+ */
|
|
+ unsigned reserve_dw = 2048 + ctx->num_cs_dw_queries_suspend +
|
|
+ num_draws * (10 + extra_dw_per_draw);
|
|
|
|
- if (!ctx->ws->cs_check_space(cs, si_get_minimum_num_gfx_cs_dwords(ctx, num_draws)))
|
|
+ if (!ctx->ws->cs_check_space(cs, reserve_dw))
|
|
si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
|
|
}
|
|
|
|
diff --git a/src/gallium/drivers/radeonsi/si_query.c b/src/gallium/drivers/radeonsi/si_query.c
|
|
index eca647505ae..67dd732f615 100644
|
|
--- a/src/gallium/drivers/radeonsi/si_query.c
|
|
+++ b/src/gallium/drivers/radeonsi/si_query.c
|
|
@@ -918,7 +918,7 @@ static void si_query_hw_emit_start(struct si_context *sctx, struct si_query_hw *
|
|
si_update_prims_generated_query_state(sctx, query->b.type, 1);
|
|
si_update_hw_pipeline_stats(sctx, query->b.type, 1);
|
|
|
|
- si_need_gfx_cs_space(sctx, 0);
|
|
+ si_need_gfx_cs_space(sctx, 0, 0);
|
|
|
|
va = query->buffer.buf->gpu_address + query->buffer.results_end;
|
|
si_query_hw_do_emit_start(sctx, query, query->buffer.buf, va);
|
|
@@ -1014,7 +1014,7 @@ static void si_query_hw_emit_stop(struct si_context *sctx, struct si_query_hw *q
|
|
|
|
/* The queries which need begin already called this in begin_query. */
|
|
if (query->flags & SI_QUERY_HW_FLAG_NO_START) {
|
|
- si_need_gfx_cs_space(sctx, 0);
|
|
+ si_need_gfx_cs_space(sctx, 0, 0);
|
|
if (!si_query_buffer_alloc(sctx, &query->buffer, si_query_hw_prepare_buffer,
|
|
query->result_size))
|
|
return;
|
|
@@ -1726,7 +1726,7 @@ void si_resume_queries(struct si_context *sctx)
|
|
struct si_query *query;
|
|
|
|
/* Check CS space here. Resuming must not be interrupted by flushes. */
|
|
- si_need_gfx_cs_space(sctx, 0);
|
|
+ si_need_gfx_cs_space(sctx, 0, 0);
|
|
|
|
LIST_FOR_EACH_ENTRY (query, &sctx->active_queries, active_list)
|
|
query->ops->resume(sctx, query);
|
|
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp
|
|
index bc4cf308936..8b57ddb3b70 100644
|
|
--- a/src/gallium/drivers/radeonsi/si_state_draw.cpp
|
|
+++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp
|
|
@@ -2060,7 +2060,7 @@ static void si_draw(struct pipe_context *ctx,
|
|
else if (GFX_VERSION < GFX12)
|
|
gfx11_decompress_textures(sctx, u_bit_consecutive(0, SI_NUM_GRAPHICS_SHADERS));
|
|
|
|
- si_need_gfx_cs_space(sctx, num_draws);
|
|
+ si_need_gfx_cs_space(sctx, num_draws, ALT_HIZ_LOGIC ? 8 : 0);
|
|
|
|
if (u_trace_perfetto_active(&sctx->ds.trace_context))
|
|
trace_si_begin_draw(&sctx->trace);
|
|
--
|
|
2.48.1
|
|
|
|
|
|
From 1e95d1b0fec92e3cba014e32ac731705eb6e8425 Mon Sep 17 00:00:00 2001
|
|
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
|
|
Date: Wed, 15 Jan 2025 12:00:43 +0100
|
|
Subject: [PATCH 08/27] radv: disable GFX12+ support
|
|
|
|
It's mostly broken, but 25.0+ should be good enough.
|
|
|
|
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
|
|
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33113>
|
|
---
|
|
src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c | 13 +++++++++++++
|
|
1 file changed, 13 insertions(+)
|
|
|
|
diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c
|
|
index 6ed5718e695..b6ce7ba6ac5 100644
|
|
--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c
|
|
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c
|
|
@@ -22,12 +22,25 @@
|
|
#include "vk_drm_syncobj.h"
|
|
#include "xf86drm.h"
|
|
|
|
+static bool
|
|
+radv_is_gpu_supported(const struct radeon_info *info)
|
|
+{
|
|
+ /* GFX12 isn't supported. */
|
|
+ if (info->gfx_level >= GFX12)
|
|
+ return false;
|
|
+
|
|
+ return true;
|
|
+}
|
|
+
|
|
static bool
|
|
do_winsys_init(struct radv_amdgpu_winsys *ws, int fd)
|
|
{
|
|
if (!ac_query_gpu_info(fd, ws->dev, &ws->info, true))
|
|
return false;
|
|
|
|
+ if (!radv_is_gpu_supported(&ws->info))
|
|
+ return false;
|
|
+
|
|
/*
|
|
* Override the max submits on video queues.
|
|
* If you submit multiple session contexts in the same IB sequence the
|
|
--
|
|
2.48.1
|
|
|
|
|
|
From 317d71daef2588ff76c62aa54a5f1920e717a465 Mon Sep 17 00:00:00 2001
|
|
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
|
|
Date: Wed, 18 Dec 2024 19:18:30 -0500
|
|
Subject: [PATCH 09/27] radeonsi: fix a gfx10.3 regression due to a gfx12
|
|
change
|
|
|
|
This fixes:
|
|
Assertion `!"BITSET_TEST_RANGE: bit range crosses word boundary"' failed.
|
|
|
|
Fixes: e3cef02c245 - radeonsi/gfx12: set DB_RENDER_OVERRIDE based on stencil state
|
|
|
|
Reviewed-by: Qiang Yu <yuq825@gmail.com>
|
|
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32713>
|
|
---
|
|
src/gallium/drivers/radeonsi/si_gfx_cs.c | 7 +++++--
|
|
src/gallium/drivers/radeonsi/si_state.h | 2 +-
|
|
2 files changed, 6 insertions(+), 3 deletions(-)
|
|
|
|
diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c
|
|
index d8dccd9b008..9782e1afaa1 100644
|
|
--- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
|
|
+++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
|
|
@@ -317,7 +317,6 @@ void si_set_tracked_regs_to_clear_state(struct si_context *ctx)
|
|
ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ENA] = 0;
|
|
ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ADDR] = 0;
|
|
|
|
- ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_OVERRIDE] = 0;
|
|
ctx->tracked_regs.reg_value[SI_TRACKED_DB_EQAA] = 0;
|
|
ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_OVERRIDE2] = 0;
|
|
ctx->tracked_regs.reg_value[SI_TRACKED_DB_SHADER_CONTROL] = 0;
|
|
@@ -368,7 +367,11 @@ void si_set_tracked_regs_to_clear_state(struct si_context *ctx)
|
|
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2] = 0;
|
|
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3] = 0;
|
|
|
|
- ctx->tracked_regs.reg_value[SI_TRACKED_SPI_VS_OUT_CONFIG] = 0;
|
|
+ if (ctx->gfx_level >= GFX12)
|
|
+ ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_OVERRIDE] = 0;
|
|
+ else
|
|
+ ctx->tracked_regs.reg_value[SI_TRACKED_SPI_VS_OUT_CONFIG] = 0;
|
|
+
|
|
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_PRIMITIVEID_EN] = 0;
|
|
ctx->tracked_regs.reg_value[SI_TRACKED_CB_DCC_CONTROL] = 0;
|
|
|
|
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
|
|
index f964fa21606..75159c475e6 100644
|
|
--- a/src/gallium/drivers/radeonsi/si_state.h
|
|
+++ b/src/gallium/drivers/radeonsi/si_state.h
|
|
@@ -315,7 +315,6 @@ enum si_tracked_reg
|
|
SI_TRACKED_SPI_PS_INPUT_ENA,
|
|
SI_TRACKED_SPI_PS_INPUT_ADDR,
|
|
|
|
- SI_TRACKED_DB_RENDER_OVERRIDE,
|
|
SI_TRACKED_DB_EQAA,
|
|
SI_TRACKED_DB_RENDER_OVERRIDE2,
|
|
SI_TRACKED_DB_SHADER_CONTROL,
|
|
@@ -371,6 +370,7 @@ enum si_tracked_reg
|
|
SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3, /* GFX6-10 (GFX11+ can reuse this slot) */
|
|
|
|
SI_TRACKED_SPI_VS_OUT_CONFIG, /* GFX6-11 */
|
|
+ SI_TRACKED_DB_RENDER_OVERRIDE = SI_TRACKED_SPI_VS_OUT_CONFIG, /* GFX12+ (slot reused) */
|
|
SI_TRACKED_VGT_PRIMITIVEID_EN, /* GFX6-11 */
|
|
SI_TRACKED_CB_DCC_CONTROL, /* GFX8-11 */
|
|
SI_TRACKED_DB_STENCIL_READ_MASK, /* GFX12+ */
|
|
--
|
|
2.48.1
|
|
|
|
|
|
From 5e4f6e7f2c644219f998672b90c6de9146ca6952 Mon Sep 17 00:00:00 2001
|
|
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
|
|
Date: Wed, 8 Jan 2025 15:00:50 -0500
|
|
Subject: [PATCH 10/27] radeonsi/gfx12: enable alt_hiz_logic
|
|
|
|
Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
|
|
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32957>
|
|
---
|
|
src/gallium/drivers/radeonsi/si_debug_options.h | 2 +-
|
|
1 file changed, 1 insertion(+), 1 deletion(-)
|
|
|
|
diff --git a/src/gallium/drivers/radeonsi/si_debug_options.h b/src/gallium/drivers/radeonsi/si_debug_options.h
|
|
index a5057c4700e..74b9b59a455 100644
|
|
--- a/src/gallium/drivers/radeonsi/si_debug_options.h
|
|
+++ b/src/gallium/drivers/radeonsi/si_debug_options.h
|
|
@@ -23,7 +23,7 @@ OPT_BOOL(zerovram, false, "Zero all VRAM allocations")
|
|
OPT_BOOL(clear_lds, false, "Clear LDS at the end of shaders. Might decrease performance.")
|
|
OPT_BOOL(cache_rb_gl2, false, "Enable GL2 caching for CB and DB.")
|
|
OPT_BOOL(optimize_io, true, "Run nir_opt_varyings in the GLSL linker.")
|
|
-OPT_BOOL(alt_hiz_logic, false, "Enable alternative HiZ logic")
|
|
+OPT_BOOL(alt_hiz_logic, true, "Enable alternative HiZ logic")
|
|
|
|
#undef OPT_BOOL
|
|
#undef OPT_INT
|
|
--
|
|
2.48.1
|
|
|
|
|
|
From 074851b27e9b59bf9b450907e3231824495e589a Mon Sep 17 00:00:00 2001
|
|
From: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
|
|
Date: Mon, 10 Feb 2025 12:27:48 +0100
|
|
Subject: [PATCH 11/27] radeonsi: disable dcc when external shader stores are
|
|
used
|
|
MIME-Version: 1.0
|
|
Content-Type: text/plain; charset=UTF-8
|
|
Content-Transfer-Encoding: 8bit
|
|
|
|
See comment.
|
|
|
|
Fixes: 666a6eb871d ("radeonsi/gfx12: disable display dcc for front buffer rendering")
|
|
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/12552
|
|
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
|
|
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33469>
|
|
---
|
|
src/gallium/drivers/radeonsi/si_texture.c | 8 +++++---
|
|
1 file changed, 5 insertions(+), 3 deletions(-)
|
|
|
|
diff --git a/src/gallium/drivers/radeonsi/si_texture.c b/src/gallium/drivers/radeonsi/si_texture.c
|
|
index 9cc11a81669..54e2bfe7e53 100644
|
|
--- a/src/gallium/drivers/radeonsi/si_texture.c
|
|
+++ b/src/gallium/drivers/radeonsi/si_texture.c
|
|
@@ -803,10 +803,12 @@ static bool si_texture_get_handle(struct pipe_screen *screen, struct pipe_contex
|
|
}
|
|
|
|
const bool debug_disable_dcc = sscreen->debug_flags & DBG(NO_EXPORTED_DCC);
|
|
- /* Since shader image stores don't support DCC on GFX9 and older,
|
|
- * disable it for external clients that want write access.
|
|
+ /* Disable DCC for external clients that might use shader image stores.
|
|
+ * They don't support DCC on GFX9 and older. GFX10/10.3 is also problematic
|
|
+ * if the view formats between clients are incompatible or if DCC clear is
|
|
+ * used.
|
|
*/
|
|
- const bool shader_write = sscreen->info.gfx_level <= GFX9 &&
|
|
+ const bool shader_write = sscreen->info.gfx_level < GFX11 &&
|
|
usage & PIPE_HANDLE_USAGE_SHADER_WRITE &&
|
|
!tex->is_depth &&
|
|
tex->surface.meta_offset;
|
|
--
|
|
2.48.1
|
|
|
|
|
|
From ec81fa2026aa8760158dcf4520bc307f627db40f Mon Sep 17 00:00:00 2001
|
|
From: Rhys Perry <pendingchaos02@gmail.com>
|
|
Date: Thu, 25 Jul 2024 21:39:25 +0100
|
|
Subject: [PATCH 12/27] aco: split CounterMap off from VGPRCounterMap
|
|
|
|
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
|
|
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
|
|
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30478>
|
|
---
|
|
src/amd/compiler/aco_insert_NOPs.cpp | 38 ++++++++++++++++------------
|
|
1 file changed, 22 insertions(+), 16 deletions(-)
|
|
|
|
diff --git a/src/amd/compiler/aco_insert_NOPs.cpp b/src/amd/compiler/aco_insert_NOPs.cpp
|
|
index a6290a7f457..3f4b5d6c85f 100644
|
|
--- a/src/amd/compiler/aco_insert_NOPs.cpp
|
|
+++ b/src/amd/compiler/aco_insert_NOPs.cpp
|
|
@@ -164,14 +164,14 @@ struct NOP_ctx_gfx10 {
|
|
}
|
|
};
|
|
|
|
-template <int Max> struct VGPRCounterMap {
|
|
+template <int Start, int Size, int Max> struct CounterMap {
|
|
public:
|
|
int base = 0;
|
|
- BITSET_DECLARE(resident, 256);
|
|
- int val[256];
|
|
+ BITSET_DECLARE(resident, Size);
|
|
+ int val[Size];
|
|
|
|
/* Initializes all counters to Max. */
|
|
- VGPRCounterMap() { BITSET_ZERO(resident); }
|
|
+ CounterMap() { BITSET_ZERO(resident); }
|
|
|
|
/* Increase all counters, clamping at Max. */
|
|
void inc() { base++; }
|
|
@@ -185,11 +185,12 @@ public:
|
|
|
|
void set(PhysReg reg, unsigned bytes)
|
|
{
|
|
- if (reg.reg() < 256)
|
|
+ if (reg.reg() < Start)
|
|
return;
|
|
|
|
- for (unsigned i = 0; i < DIV_ROUND_UP(bytes, 4); i++)
|
|
- set(reg.reg() - 256 + i);
|
|
+ unsigned size = MIN2(DIV_ROUND_UP(bytes, 4), Start + Size - reg.reg());
|
|
+ for (unsigned i = 0; i < size; i++)
|
|
+ set(reg.reg() - Start + i);
|
|
}
|
|
|
|
/* Reset all counters to Max. */
|
|
@@ -201,11 +202,12 @@ public:
|
|
|
|
void reset(PhysReg reg, unsigned bytes)
|
|
{
|
|
- if (reg.reg() < 256)
|
|
+ if (reg.reg() < Start)
|
|
return;
|
|
|
|
- for (unsigned i = 0; i < DIV_ROUND_UP(bytes, 4); i++)
|
|
- BITSET_CLEAR(resident, reg.reg() - 256 + i);
|
|
+ unsigned size = MIN2(DIV_ROUND_UP(bytes, 4), Start + Size - reg.reg());
|
|
+ for (unsigned i = 0; i < size; i++)
|
|
+ BITSET_CLEAR(resident, reg.reg() - Start + i);
|
|
}
|
|
|
|
uint8_t get(unsigned idx)
|
|
@@ -215,14 +217,14 @@ public:
|
|
|
|
uint8_t get(PhysReg reg, unsigned offset = 0)
|
|
{
|
|
- assert(reg.reg() >= 256);
|
|
- return get(reg.reg() - 256 + offset);
|
|
+ assert(reg.reg() >= Start);
|
|
+ return get(reg.reg() - Start + offset);
|
|
}
|
|
|
|
- void join_min(const VGPRCounterMap& other)
|
|
+ void join_min(const CounterMap& other)
|
|
{
|
|
unsigned i;
|
|
- BITSET_FOREACH_SET (i, other.resident, 256) {
|
|
+ BITSET_FOREACH_SET (i, other.resident, Size) {
|
|
if (BITSET_TEST(resident, i))
|
|
val[i] = MIN2(val[i] + base, other.val[i] + other.base) - base;
|
|
else
|
|
@@ -231,13 +233,13 @@ public:
|
|
BITSET_OR(resident, resident, other.resident);
|
|
}
|
|
|
|
- bool operator==(const VGPRCounterMap& other) const
|
|
+ bool operator==(const CounterMap& other) const
|
|
{
|
|
if (!BITSET_EQUAL(resident, other.resident))
|
|
return false;
|
|
|
|
unsigned i;
|
|
- BITSET_FOREACH_SET (i, other.resident, 256) {
|
|
+ BITSET_FOREACH_SET (i, other.resident, Size) {
|
|
if (!BITSET_TEST(resident, i))
|
|
return false;
|
|
if (val[i] + base != other.val[i] + other.base)
|
|
@@ -245,8 +247,12 @@ public:
|
|
}
|
|
return true;
|
|
}
|
|
+
|
|
+ unsigned size() const { return Size; }
|
|
};
|
|
|
|
+template <int Max> using VGPRCounterMap = CounterMap<256, 256, Max>;
|
|
+
|
|
struct NOP_ctx_gfx11 {
|
|
/* VcmpxPermlaneHazard */
|
|
bool has_Vcmpx = false;
|
|
--
|
|
2.48.1
|
|
|
|
|
|
From 8cfd8daf2e0ed1783f3aeac6a5274bc70e1d103a Mon Sep 17 00:00:00 2001
|
|
From: Rhys Perry <pendingchaos02@gmail.com>
|
|
Date: Thu, 25 Jul 2024 21:36:29 +0100
|
|
Subject: [PATCH 13/27] aco: minor CounterMap::operator== fix
|
|
|
|
I don't think this matters for how we use CounterMap::operator==.
|
|
|
|
The BITSET_TEST() was unnecessary because of the BITSET_EQUAL above.
|
|
|
|
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
|
|
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
|
|
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30478>
|
|
---
|
|
src/amd/compiler/aco_insert_NOPs.cpp | 4 +---
|
|
1 file changed, 1 insertion(+), 3 deletions(-)
|
|
|
|
diff --git a/src/amd/compiler/aco_insert_NOPs.cpp b/src/amd/compiler/aco_insert_NOPs.cpp
|
|
index 3f4b5d6c85f..d6df9190864 100644
|
|
--- a/src/amd/compiler/aco_insert_NOPs.cpp
|
|
+++ b/src/amd/compiler/aco_insert_NOPs.cpp
|
|
@@ -240,9 +240,7 @@ public:
|
|
|
|
unsigned i;
|
|
BITSET_FOREACH_SET (i, other.resident, Size) {
|
|
- if (!BITSET_TEST(resident, i))
|
|
- return false;
|
|
- if (val[i] + base != other.val[i] + other.base)
|
|
+ if (MIN2(val[i] + base, Max) != MIN2(other.val[i] + other.base, Max))
|
|
return false;
|
|
}
|
|
return true;
|
|
--
|
|
2.48.1
|
|
|
|
|
|
From 62f14df1ca6435f30e13f44e7246fa763706cd9e Mon Sep 17 00:00:00 2001
|
|
From: Rhys Perry <pendingchaos02@gmail.com>
|
|
Date: Thu, 25 Jul 2024 21:40:02 +0100
|
|
Subject: [PATCH 14/27] aco: workaround VALUReadSGPRHazard
|
|
|
|
fossil-db (gfx1200):
|
|
Totals from 65112 (82.01% of 79395) affected shaders:
|
|
Instrs: 41732906 -> 42987198 (+3.01%); split: -0.00%, +3.01%
|
|
CodeSize: 222451964 -> 226942644 (+2.02%); split: -0.01%, +2.03%
|
|
Latency: 290411063 -> 290944688 (+0.18%); split: -0.00%, +0.18%
|
|
InvThroughput: 45854913 -> 45910275 (+0.12%); split: -0.00%, +0.12%
|
|
|
|
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
|
|
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
|
|
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30478>
|
|
---
|
|
src/amd/compiler/aco_insert_NOPs.cpp | 91 +++++++++++++++++++++++++---
|
|
1 file changed, 84 insertions(+), 7 deletions(-)
|
|
|
|
diff --git a/src/amd/compiler/aco_insert_NOPs.cpp b/src/amd/compiler/aco_insert_NOPs.cpp
|
|
index d6df9190864..1bf53050e78 100644
|
|
--- a/src/amd/compiler/aco_insert_NOPs.cpp
|
|
+++ b/src/amd/compiler/aco_insert_NOPs.cpp
|
|
@@ -273,6 +273,10 @@ struct NOP_ctx_gfx11 {
|
|
/* WMMAHazards */
|
|
std::bitset<256> vgpr_written_by_wmma;
|
|
|
|
+ /* VALUReadSGPRHazard */
|
|
+ std::bitset<m0.reg() / 2> sgpr_read_by_valu; /* SGPR pairs, excluding null, exec, m0 and scc */
|
|
+ CounterMap<0, m0.reg(), 11> sgpr_read_by_valu_then_wr_by_salu;
|
|
+
|
|
void join(const NOP_ctx_gfx11& other)
|
|
{
|
|
has_Vcmpx |= other.has_Vcmpx;
|
|
@@ -287,6 +291,8 @@ struct NOP_ctx_gfx11 {
|
|
sgpr_read_by_valu_as_lanemask_then_wr_by_salu |=
|
|
other.sgpr_read_by_valu_as_lanemask_then_wr_by_salu;
|
|
vgpr_written_by_wmma |= other.vgpr_written_by_wmma;
|
|
+ sgpr_read_by_valu |= other.sgpr_read_by_valu;
|
|
+ sgpr_read_by_valu_then_wr_by_salu.join_min(other.sgpr_read_by_valu_then_wr_by_salu);
|
|
}
|
|
|
|
bool operator==(const NOP_ctx_gfx11& other)
|
|
@@ -302,7 +308,9 @@ struct NOP_ctx_gfx11 {
|
|
sgpr_read_by_valu_as_lanemask == other.sgpr_read_by_valu_as_lanemask &&
|
|
sgpr_read_by_valu_as_lanemask_then_wr_by_salu ==
|
|
other.sgpr_read_by_valu_as_lanemask_then_wr_by_salu &&
|
|
- vgpr_written_by_wmma == other.vgpr_written_by_wmma;
|
|
+ vgpr_written_by_wmma == other.vgpr_written_by_wmma &&
|
|
+ sgpr_read_by_valu == other.sgpr_read_by_valu &&
|
|
+ sgpr_read_by_valu_then_wr_by_salu == other.sgpr_read_by_valu_then_wr_by_salu;
|
|
}
|
|
};
|
|
|
|
@@ -1527,6 +1535,48 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&
|
|
}
|
|
}
|
|
}
|
|
+ } else {
|
|
+ /* VALUReadSGPRHazard
|
|
+ * VALU reads SGPR and later written by SALU cannot safely be read by VALU/SALU.
|
|
+ */
|
|
+ if (instr->isVALU() || instr->isSALU()) {
|
|
+ unsigned expiry_count = instr->isSALU() ? 10 : 11;
|
|
+ for (Operand& op : instr->operands) {
|
|
+ if (sa_sdst == 0)
|
|
+ break;
|
|
+
|
|
+ for (unsigned i = 0; i < op.size(); i++) {
|
|
+ unsigned reg = op.physReg() + i;
|
|
+ if (reg < ctx.sgpr_read_by_valu_then_wr_by_salu.size() &&
|
|
+ ctx.sgpr_read_by_valu_then_wr_by_salu.get(reg) < expiry_count) {
|
|
+ bld.sopp(aco_opcode::s_waitcnt_depctr, 0xfffe);
|
|
+ sa_sdst = 0;
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (sa_sdst == 0)
|
|
+ ctx.sgpr_read_by_valu_then_wr_by_salu.reset();
|
|
+ else if (instr->isSALU() && !instr->isSOPP())
|
|
+ ctx.sgpr_read_by_valu_then_wr_by_salu.inc();
|
|
+
|
|
+ if (instr->isVALU()) {
|
|
+ for (const Operand& op : instr->operands) {
|
|
+ for (unsigned i = 0; i < DIV_ROUND_UP(op.size(), 2); i++) {
|
|
+ unsigned reg = (op.physReg() / 2) + i;
|
|
+ if (reg < ctx.sgpr_read_by_valu.size())
|
|
+ ctx.sgpr_read_by_valu.set(reg);
|
|
+ }
|
|
+ }
|
|
+ } else if (instr->isSALU() && !instr->definitions.empty()) {
|
|
+ for (unsigned i = 0; i < instr->definitions[0].size(); i++) {
|
|
+ unsigned def_reg = instr->definitions[0].physReg() + i;
|
|
+ if ((def_reg / 2) < ctx.sgpr_read_by_valu.size() && ctx.sgpr_read_by_valu[def_reg / 2])
|
|
+ ctx.sgpr_read_by_valu_then_wr_by_salu.set(def_reg);
|
|
+ }
|
|
+ }
|
|
}
|
|
|
|
/* LdsDirectVMEMHazard
|
|
@@ -1683,6 +1733,15 @@ resolve_all_gfx11(State& state, NOP_ctx_gfx11& ctx,
|
|
}
|
|
}
|
|
|
|
+ /* VALUReadSGPRHazard */
|
|
+ if (state.program->gfx_level >= GFX12) {
|
|
+ for (unsigned i = 0; i < ctx.sgpr_read_by_valu_then_wr_by_salu.size(); i++) {
|
|
+ if (ctx.sgpr_read_by_valu_then_wr_by_salu.get(i) < 11)
|
|
+ waitcnt_depctr &= 0xfffe;
|
|
+ }
|
|
+ ctx.sgpr_read_by_valu_then_wr_by_salu.reset();
|
|
+ }
|
|
+
|
|
/* LdsDirectVMEMHazard */
|
|
if (ctx.vgpr_used_by_vmem_load.any() || ctx.vgpr_used_by_vmem_store.any() ||
|
|
ctx.vgpr_used_by_ds.any() || ctx.vgpr_used_by_vmem_sample.any() ||
|
|
@@ -1758,7 +1817,7 @@ handle_block(Program* program, Ctx& ctx, Block& block)
|
|
|
|
template <typename Ctx, HandleInstr<Ctx> Handle, ResolveAll<Ctx> Resolve>
|
|
void
|
|
-mitigate_hazards(Program* program)
|
|
+mitigate_hazards(Program* program, Ctx initial_ctx = Ctx())
|
|
{
|
|
std::vector<Ctx> all_ctx(program->blocks.size());
|
|
std::stack<unsigned, std::vector<unsigned>> loop_header_indices;
|
|
@@ -1767,6 +1826,9 @@ mitigate_hazards(Program* program)
|
|
Block& block = program->blocks[i];
|
|
Ctx& ctx = all_ctx[i];
|
|
|
|
+ if (i == 0 || (block.kind & block_kind_resume))
|
|
+ ctx = initial_ctx;
|
|
+
|
|
if (block.kind & block_kind_loop_header) {
|
|
loop_header_indices.push(i);
|
|
} else if (block.kind & block_kind_loop_exit) {
|
|
@@ -1864,14 +1926,29 @@ required_export_priority(Program* program)
|
|
void
|
|
insert_NOPs(Program* program)
|
|
{
|
|
- if (program->gfx_level >= GFX11)
|
|
- mitigate_hazards<NOP_ctx_gfx11, handle_instruction_gfx11, resolve_all_gfx11>(program);
|
|
- else if (program->gfx_level >= GFX10_3)
|
|
+ if (program->gfx_level >= GFX11) {
|
|
+ NOP_ctx_gfx11 initial_ctx;
|
|
+
|
|
+ bool has_previous_part =
|
|
+ program->is_epilog || program->info.vs.has_prolog || program->info.ps.has_prolog ||
|
|
+ (program->info.merged_shader_compiled_separately && program->stage.sw != SWStage::VS &&
|
|
+ program->stage.sw != SWStage::TES) || program->stage == raytracing_cs;
|
|
+ if (program->gfx_level >= GFX12 && has_previous_part) {
|
|
+ /* resolve_all_gfx11 can't resolve VALUReadSGPRHazard entirely. We have to assume that any
|
|
+ * SGPR might have been read by VALU if there was a previous shader part.
|
|
+ */
|
|
+ initial_ctx.sgpr_read_by_valu.flip();
|
|
+ }
|
|
+
|
|
+ mitigate_hazards<NOP_ctx_gfx11, handle_instruction_gfx11, resolve_all_gfx11>(program,
|
|
+ initial_ctx);
|
|
+ } else if (program->gfx_level >= GFX10_3) {
|
|
; /* no hazards/bugs to mitigate */
|
|
- else if (program->gfx_level >= GFX10)
|
|
+ } else if (program->gfx_level >= GFX10) {
|
|
mitigate_hazards<NOP_ctx_gfx10, handle_instruction_gfx10, resolve_all_gfx10>(program);
|
|
- else
|
|
+ } else {
|
|
mitigate_hazards<NOP_ctx_gfx6, handle_instruction_gfx6, resolve_all_gfx6>(program);
|
|
+ }
|
|
|
|
if (program->gfx_level == GFX11_5 && (program->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER ||
|
|
program->stage.hw == AC_HW_PIXEL_SHADER))
|
|
--
|
|
2.48.1
|
|
|
|
|
|
From d6d58210c9658d759622022c7d196849c8f4e28d Mon Sep 17 00:00:00 2001
|
|
From: Rhys Perry <pendingchaos02@gmail.com>
|
|
Date: Tue, 26 Nov 2024 12:00:35 +0000
|
|
Subject: [PATCH 15/27] aco/gfx12: insert wait between VMEM WaW
|
|
|
|
https://github.com/llvm/llvm-project/pull/105549
|
|
|
|
fossil-db (gfx1200):
|
|
Totals from 1783 (2.25% of 79395) affected shaders:
|
|
Instrs: 7398391 -> 7404566 (+0.08%); split: -0.00%, +0.08%
|
|
CodeSize: 38862456 -> 38886364 (+0.06%); split: -0.00%, +0.06%
|
|
Latency: 83191513 -> 84211504 (+1.23%); split: -0.00%, +1.23%
|
|
InvThroughput: 15185936 -> 15345744 (+1.05%); split: -0.01%, +1.06%
|
|
|
|
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
|
|
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32373>
|
|
---
|
|
src/amd/compiler/aco_insert_waitcnt.cpp | 8 ++++++--
|
|
src/amd/compiler/tests/test_insert_waitcnt.cpp | 5 +++++
|
|
2 files changed, 11 insertions(+), 2 deletions(-)
|
|
|
|
diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp
|
|
index ae079dcd755..b19f49e8571 100644
|
|
--- a/src/amd/compiler/aco_insert_waitcnt.cpp
|
|
+++ b/src/amd/compiler/aco_insert_waitcnt.cpp
|
|
@@ -385,9 +385,13 @@ check_instr(wait_ctx& ctx, wait_imm& wait, alu_delay_info& delay, Instruction* i
|
|
|
|
wait_imm reg_imm = it->second.imm;
|
|
|
|
- /* Vector Memory reads and writes return in the order they were issued */
|
|
+ /* Vector Memory reads and writes decrease the counter in the order they were issued.
|
|
+ * Before GFX12, they also write VGPRs in order if they're of the same type.
|
|
+ * TODO: We can do this for GFX12 and different types for GFX11 if we know that the two
|
|
+ * VMEM loads do not write the same lanes. Since GFX11, we track VMEM operations on the
|
|
+ * linear CFG, so this is difficult */
|
|
uint8_t vmem_type = get_vmem_type(ctx.gfx_level, instr);
|
|
- if (vmem_type) {
|
|
+ if (vmem_type && ctx.gfx_level < GFX12) {
|
|
wait_event event = get_vmem_event(ctx, instr, vmem_type);
|
|
wait_type type = (wait_type)(ffs(ctx.info->get_counters_for_event(event)) - 1);
|
|
if ((it->second.events & ctx.info->events[type]) == event &&
|
|
diff --git a/src/amd/compiler/tests/test_insert_waitcnt.cpp b/src/amd/compiler/tests/test_insert_waitcnt.cpp
|
|
index edc34bfc7f8..c69772c5d05 100644
|
|
--- a/src/amd/compiler/tests/test_insert_waitcnt.cpp
|
|
+++ b/src/amd/compiler/tests/test_insert_waitcnt.cpp
|
|
@@ -192,6 +192,7 @@ BEGIN_TEST(insert_waitcnt.waw.vmem_types)
|
|
|
|
//>> p_unit_test 0
|
|
//! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
|
|
+ //~gfx12! s_wait_loadcnt imm:0
|
|
//! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
|
|
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
|
|
bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
|
|
@@ -220,6 +221,7 @@ BEGIN_TEST(insert_waitcnt.waw.vmem_types)
|
|
|
|
//>> p_unit_test 3
|
|
//! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
|
|
+ //~gfx12! s_wait_samplecnt imm:0
|
|
//! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
|
|
bld.reset(program->create_and_insert_block());
|
|
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
|
|
@@ -249,6 +251,7 @@ BEGIN_TEST(insert_waitcnt.waw.vmem_types)
|
|
|
|
//>> p_unit_test 6
|
|
//! v1: %0:v[4] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[16-26] 1d
|
|
+ //~gfx12! s_wait_bvhcnt imm:0
|
|
//! v1: %0:v[4] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[16-26] 1d
|
|
bld.reset(program->create_and_insert_block());
|
|
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
|
|
@@ -300,6 +303,7 @@ BEGIN_TEST(insert_waitcnt.waw.vmem_types)
|
|
//>> BB11
|
|
//! /* logical preds: BB9, BB10, / linear preds: BB9, BB10, / kind: uniform, */
|
|
//! p_unit_test 9
|
|
+ //~gfx12! s_wait_loadcnt imm:0
|
|
//! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
|
|
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
|
|
bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
|
|
@@ -326,6 +330,7 @@ BEGIN_TEST(insert_waitcnt.waw.vmem_types)
|
|
//! /* logical preds: BB12, BB13, / linear preds: BB12, BB13, / kind: uniform, */
|
|
//! p_unit_test 10
|
|
//~gfx11! s_waitcnt vmcnt(0)
|
|
+ //~gfx12! s_wait_loadcnt imm:0
|
|
//~gfx12! s_wait_samplecnt imm:0
|
|
//! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
|
|
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
|
|
--
|
|
2.48.1
|
|
|
|
|
|
From 41edfd3cd9470ab4e54efb0b3af9eab7242f5e5f Mon Sep 17 00:00:00 2001
|
|
From: Rhys Perry <pendingchaos02@gmail.com>
|
|
Date: Wed, 27 Nov 2024 14:51:32 +0000
|
|
Subject: [PATCH 16/27] aco: force linear for event_vmem_sample and
|
|
event_vmem_bvh
|
|
|
|
I don't know if this issue affects GFX12, but workaround it anyway to be
|
|
safe.
|
|
|
|
fossil-db (gfx1200):
|
|
Totals from 3463 (4.36% of 79395) affected shaders:
|
|
Instrs: 9794280 -> 9833253 (+0.40%); split: -0.00%, +0.40%
|
|
CodeSize: 52306040 -> 52457988 (+0.29%); split: -0.01%, +0.30%
|
|
Latency: 90549385 -> 93617517 (+3.39%); split: -0.00%, +3.39%
|
|
InvThroughput: 13189030 -> 13602942 (+3.14%); split: -0.00%, +3.14%
|
|
|
|
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
|
|
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
|
|
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32373>
|
|
---
|
|
src/amd/compiler/aco_insert_waitcnt.cpp | 3 ++-
|
|
1 file changed, 2 insertions(+), 1 deletion(-)
|
|
|
|
diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp
|
|
index b19f49e8571..66ce8603ea4 100644
|
|
--- a/src/amd/compiler/aco_insert_waitcnt.cpp
|
|
+++ b/src/amd/compiler/aco_insert_waitcnt.cpp
|
|
@@ -735,7 +735,8 @@ insert_wait_entry(wait_ctx& ctx, Definition def, wait_event event, uint8_t vmem_
|
|
* Also, follow linear control flow for ALU because it's unlikely that the hardware does per-lane
|
|
* dependency checks.
|
|
*/
|
|
- uint32_t ds_vmem_events = event_lds | event_gds | event_vmem | event_flat;
|
|
+ uint32_t ds_vmem_events =
|
|
+ event_lds | event_gds | event_vmem | event_vmem_sample | event_vmem_bvh | event_flat;
|
|
uint32_t alu_events = event_trans | event_valu | event_salu;
|
|
bool force_linear = ctx.gfx_level >= GFX11 && (event & (ds_vmem_events | alu_events));
|
|
|
|
--
|
|
2.48.1
|
|
|
|
|
|
From c56875aa56504eab06cc07245b78f9acfb3a1bf6 Mon Sep 17 00:00:00 2001
|
|
From: Georg Lehmann <dadschoorse@gmail.com>
|
|
Date: Thu, 5 Dec 2024 11:33:08 +0100
|
|
Subject: [PATCH 17/27] aco/gfx12+: do not use v_pack_b32_f16 to pack untyped
|
|
data
|
|
MIME-Version: 1.0
|
|
Content-Type: text/plain; charset=UTF-8
|
|
Content-Transfer-Encoding: 8bit
|
|
|
|
GFX12 removed IEEE_MODE, and made its signalling NaN quieting the default.
|
|
|
|
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/12251
|
|
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
|
|
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32502>
|
|
---
|
|
src/amd/compiler/aco_lower_to_hw_instr.cpp | 8 +++++++-
|
|
1 file changed, 7 insertions(+), 1 deletion(-)
|
|
|
|
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
|
|
index ed955a32a1e..bceaa0d25e9 100644
|
|
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
|
|
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
|
|
@@ -1562,9 +1562,15 @@ do_pack_2x16(lower_context* ctx, Builder& bld, Definition def, Operand lo, Opera
|
|
return;
|
|
}
|
|
|
|
+ /* v_pack_b32_f16 can be used for bit exact copies if:
|
|
+ * - fp16 input denorms are enabled, otherwise they get flushed to zero
|
|
+ * - signalling input NaNs are kept, which is the case with IEEE_MODE=0
|
|
+ * GFX12+ always quiets signalling NaNs, IEEE_MODE was removed
|
|
+ */
|
|
bool can_use_pack = (ctx->block->fp_mode.denorm16_64 & fp_denorm_keep_in) &&
|
|
(ctx->program->gfx_level >= GFX10 ||
|
|
- (ctx->program->gfx_level >= GFX9 && !lo.isLiteral() && !hi.isLiteral()));
|
|
+ (ctx->program->gfx_level >= GFX9 && !lo.isLiteral() && !hi.isLiteral())) &&
|
|
+ ctx->program->gfx_level < GFX12;
|
|
|
|
if (can_use_pack) {
|
|
Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, def, lo, hi);
|
|
--
|
|
2.48.1
|
|
|
|
|
|
From 04d7f9ed9e5b0cd98c32a9d8b35c92698e427dfa Mon Sep 17 00:00:00 2001
|
|
From: Rhys Perry <pendingchaos02@gmail.com>
|
|
Date: Wed, 4 Dec 2024 13:36:04 +0000
|
|
Subject: [PATCH 18/27] aco: don't CSE p_shader_cycles_hi_lo_hi
|
|
|
|
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
|
|
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
|
|
Fixes: fae2a85d57a4 ("aco/gfx12: implement subgroup shader clock")
|
|
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/12243
|
|
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32500>
|
|
---
|
|
src/amd/compiler/aco_opt_value_numbering.cpp | 1 +
|
|
1 file changed, 1 insertion(+)
|
|
|
|
diff --git a/src/amd/compiler/aco_opt_value_numbering.cpp b/src/amd/compiler/aco_opt_value_numbering.cpp
|
|
index e040221be61..e0c8eeffbbb 100644
|
|
--- a/src/amd/compiler/aco_opt_value_numbering.cpp
|
|
+++ b/src/amd/compiler/aco_opt_value_numbering.cpp
|
|
@@ -306,6 +306,7 @@ can_eliminate(aco_ptr<Instruction>& instr)
|
|
if (instr->definitions.empty() || instr->opcode == aco_opcode::p_phi ||
|
|
instr->opcode == aco_opcode::p_linear_phi ||
|
|
instr->opcode == aco_opcode::p_pops_gfx9_add_exiting_wave_id ||
|
|
+ instr->opcode == aco_opcode::p_shader_cycles_hi_lo_hi ||
|
|
instr->definitions[0].isNoCSE())
|
|
return false;
|
|
|
|
--
|
|
2.48.1
|
|
|
|
|
|
From 8de2e82ca04d973c70eec66c650809a89000e90d Mon Sep 17 00:00:00 2001
|
|
From: Georg Lehmann <dadschoorse@gmail.com>
|
|
Date: Mon, 9 Dec 2024 13:40:49 +0100
|
|
Subject: [PATCH 19/27] aco/gfx12: don't assume memory operations complete in
|
|
order
|
|
|
|
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
|
|
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32569>
|
|
---
|
|
src/amd/compiler/aco_insert_waitcnt.cpp | 4 ++--
|
|
1 file changed, 2 insertions(+), 2 deletions(-)
|
|
|
|
diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp
|
|
index 66ce8603ea4..22822d92f24 100644
|
|
--- a/src/amd/compiler/aco_insert_waitcnt.cpp
|
|
+++ b/src/amd/compiler/aco_insert_waitcnt.cpp
|
|
@@ -449,9 +449,9 @@ perform_barrier(wait_ctx& ctx, wait_imm& imm, memory_sync_info sync, unsigned se
|
|
if (bar_scope_lds <= subgroup_scope)
|
|
events &= ~event_lds;
|
|
|
|
- /* in non-WGP, the L1 (L0 on GFX10+) cache keeps all memory operations
|
|
+ /* Until GFX12, in non-WGP, the L1 (L0 on GFX10+) cache keeps all memory operations
|
|
* in-order for the same workgroup */
|
|
- if (!ctx.program->wgp_mode && sync.scope <= scope_workgroup)
|
|
+ if (ctx.gfx_level < GFX12 && !ctx.program->wgp_mode && sync.scope <= scope_workgroup)
|
|
events &= ~(event_vmem | event_vmem_store | event_smem);
|
|
|
|
if (events)
|
|
--
|
|
2.48.1
|
|
|
|
|
|
From 054163a6be4c130b5a95fb5fa2d39d64ea8ee259 Mon Sep 17 00:00:00 2001
|
|
From: Qiang Yu <yuq825@gmail.com>
|
|
Date: Mon, 9 Dec 2024 10:01:21 +0800
|
|
Subject: [PATCH 20/27] aco: enable gfx12 support for radeonsi
|
|
|
|
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32570>
|
|
---
|
|
src/amd/compiler/aco_interface.cpp | 1 +
|
|
1 file changed, 1 insertion(+)
|
|
|
|
diff --git a/src/amd/compiler/aco_interface.cpp b/src/amd/compiler/aco_interface.cpp
|
|
index ab518aa14ef..8508cc0445e 100644
|
|
--- a/src/amd/compiler/aco_interface.cpp
|
|
+++ b/src/amd/compiler/aco_interface.cpp
|
|
@@ -422,6 +422,7 @@ aco_is_gpu_supported(const struct radeon_info* info)
|
|
case GFX10_3:
|
|
case GFX11:
|
|
case GFX11_5:
|
|
+ case GFX12:
|
|
return true;
|
|
default:
|
|
return false;
|
|
--
|
|
2.48.1
|
|
|
|
|
|
From a3006214823ffacb34383fad792b3184a171c135 Mon Sep 17 00:00:00 2001
|
|
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
|
|
Date: Fri, 3 Jan 2025 08:29:34 -0800
|
|
Subject: [PATCH 21/27] aco: fix VS prologs on GFX12
|
|
|
|
MTBUF/MUBUF instructions must use zero for SOFFSET, use const_offset
|
|
instead.
|
|
|
|
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
|
|
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32904>
|
|
---
|
|
.../compiler/aco_instruction_selection.cpp | 30 +++++++++++++++----
|
|
1 file changed, 25 insertions(+), 5 deletions(-)
|
|
|
|
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
|
|
index 93a88c74d30..5e4806f608f 100644
|
|
--- a/src/amd/compiler/aco_instruction_selection.cpp
|
|
+++ b/src/amd/compiler/aco_instruction_selection.cpp
|
|
@@ -12903,8 +12903,16 @@ load_unaligned_vs_attrib(Builder& bld, PhysReg dst, Operand desc, Operand index,
|
|
} else {
|
|
for (unsigned i = 0; i < size; i++) {
|
|
Definition def(i ? scratch.advance(i * 4 - 4) : dst, v1);
|
|
- bld.mubuf(aco_opcode::buffer_load_ubyte, def, desc, index, Operand::c32(offset + i), 0,
|
|
- false, true);
|
|
+ unsigned soffset = 0, const_offset = 0;
|
|
+
|
|
+ if (bld.program->gfx_level >= GFX12) {
|
|
+ const_offset = offset + i;
|
|
+ } else {
|
|
+ soffset = offset + i;
|
|
+ }
|
|
+
|
|
+ bld.mubuf(aco_opcode::buffer_load_ubyte, def, desc, index, Operand::c32(soffset),
|
|
+ const_offset, false, true);
|
|
}
|
|
}
|
|
|
|
@@ -13082,6 +13090,17 @@ select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo, ac_sh
|
|
for (unsigned j = 0; j < (vtx_info->chan_byte_size ? vtx_info->num_channels : 1); j++) {
|
|
bool post_shuffle = pinfo->post_shuffle & (1u << loc);
|
|
unsigned offset = vtx_info->chan_byte_size * (post_shuffle && j < 3 ? 2 - j : j);
|
|
+ unsigned soffset = 0, const_offset = 0;
|
|
+
|
|
+ /* We need to use soffset on GFX6-7 to avoid being considered
|
|
+ * out-of-bounds when offset>=stride. GFX12 doesn't support a
|
|
+ * non-zero constant soffset.
|
|
+ */
|
|
+ if (program->gfx_level >= GFX12) {
|
|
+ const_offset = offset;
|
|
+ } else {
|
|
+ soffset = offset;
|
|
+ }
|
|
|
|
if ((pinfo->unaligned_mask & (1u << loc)) && vtx_info->chan_byte_size <= 4)
|
|
load_unaligned_vs_attrib(bld, dest.advance(j * 4u), Operand(cur_desc, s4),
|
|
@@ -13089,11 +13108,12 @@ select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo, ac_sh
|
|
else if (vtx_info->chan_byte_size == 8)
|
|
bld.mtbuf(aco_opcode::tbuffer_load_format_xy,
|
|
Definition(dest.advance(j * 8u), v2), Operand(cur_desc, s4),
|
|
- fetch_index, Operand::c32(offset), dfmt, nfmt, 0, false, true);
|
|
+ fetch_index, Operand::c32(soffset), dfmt, nfmt, const_offset, false,
|
|
+ true);
|
|
else
|
|
bld.mtbuf(aco_opcode::tbuffer_load_format_x, Definition(dest.advance(j * 4u), v1),
|
|
- Operand(cur_desc, s4), fetch_index, Operand::c32(offset), dfmt, nfmt,
|
|
- 0, false, true);
|
|
+ Operand(cur_desc, s4), fetch_index, Operand::c32(soffset), dfmt, nfmt,
|
|
+ const_offset, false, true);
|
|
}
|
|
|
|
unsigned slots = vtx_info->chan_byte_size == 8 && vtx_info->num_channels > 2 ? 2 : 1;
|
|
--
|
|
2.48.1
|
|
|
|
|
|
From 9aeed54b7f1050648612f59844c4cfbedfe1b182 Mon Sep 17 00:00:00 2001
|
|
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
|
|
Date: Wed, 8 Jan 2025 02:03:09 -0800
|
|
Subject: [PATCH 22/27] aco: always use ds_bpermute for shuffle/rotate on GFX12
|
|
|
|
ds_bpermute supports both 32 and 64 lanes now.
|
|
|
|
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
|
|
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32974>
|
|
---
|
|
src/amd/compiler/aco_instruction_selection.cpp | 13 +++++++------
|
|
1 file changed, 7 insertions(+), 6 deletions(-)
|
|
|
|
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
|
|
index 5e4806f608f..8ab9c18ee2a 100644
|
|
--- a/src/amd/compiler/aco_instruction_selection.cpp
|
|
+++ b/src/amd/compiler/aco_instruction_selection.cpp
|
|
@@ -189,9 +189,9 @@ emit_bpermute(isel_context* ctx, Builder& bld, Temp index, Temp data)
|
|
|
|
return bld.pseudo(aco_opcode::p_bpermute_readlane, bld.def(v1), bld.def(bld.lm),
|
|
bld.def(bld.lm, vcc), index_op, input_data);
|
|
- } else if (ctx->options->gfx_level >= GFX10 && ctx->program->wave_size == 64) {
|
|
-
|
|
- /* GFX10 wave64 mode: emulate full-wave bpermute */
|
|
+ } else if (ctx->options->gfx_level >= GFX10 && ctx->options->gfx_level <= GFX11_5 &&
|
|
+ ctx->program->wave_size == 64) {
|
|
+ /* GFX10-11.5 wave64 mode: emulate full-wave bpermute */
|
|
Temp index_is_lo =
|
|
bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand::c32(31u), index);
|
|
Builder::Result index_is_lo_split =
|
|
@@ -221,7 +221,7 @@ emit_bpermute(isel_context* ctx, Builder& bld, Temp index, Temp data)
|
|
same_half);
|
|
}
|
|
} else {
|
|
- /* GFX8-9 or GFX10 wave32: bpermute works normally */
|
|
+ /* wave32 or GFX8-9, GFX12+: bpermute works normally */
|
|
Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index);
|
|
return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data);
|
|
}
|
|
@@ -8747,8 +8747,9 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
|
|
Temp tid = emit_mbcnt(ctx, bld.tmp(v1));
|
|
Temp src_lane = bld.vadd32(bld.def(v1), tid, delta);
|
|
|
|
- if (ctx->program->gfx_level >= GFX10 && cluster_size == 32) {
|
|
- /* ds_bpermute is restricted to 32 lanes on GFX10+. */
|
|
+ if (ctx->program->gfx_level >= GFX10 && ctx->program->gfx_level <= GFX11_5 &&
|
|
+ cluster_size == 32) {
|
|
+ /* ds_bpermute is restricted to 32 lanes on GFX10-GFX11.5. */
|
|
Temp index_x4 =
|
|
bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), src_lane);
|
|
tmp = bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, src);
|
|
--
|
|
2.48.1
|
|
|
|
|
|
From 4929c358e0d44cb25cb486cc634ee96536a674db Mon Sep 17 00:00:00 2001
|
|
From: Georg Lehmann <dadschoorse@gmail.com>
|
|
Date: Sun, 19 Jan 2025 10:17:12 +0100
|
|
Subject: [PATCH 23/27] aco: update is_dual_issue_capable for gfx11.5+
|
|
MIME-Version: 1.0
|
|
Content-Type: text/plain; charset=UTF-8
|
|
Content-Transfer-Encoding: 8bit
|
|
|
|
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
|
|
Reviewed-by: Daniel Schürmann <None>
|
|
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33111>
|
|
---
|
|
src/amd/compiler/aco_statistics.cpp | 7 ++++++-
|
|
1 file changed, 6 insertions(+), 1 deletion(-)
|
|
|
|
diff --git a/src/amd/compiler/aco_statistics.cpp b/src/amd/compiler/aco_statistics.cpp
|
|
index 552bfd2cee4..00934b89ceb 100644
|
|
--- a/src/amd/compiler/aco_statistics.cpp
|
|
+++ b/src/amd/compiler/aco_statistics.cpp
|
|
@@ -142,7 +142,12 @@ is_dual_issue_capable(const Program& program, const Instruction& instr)
|
|
}
|
|
return false;
|
|
}
|
|
- default: return false;
|
|
+ default:
|
|
+ if (instr.isVINTERP_INREG())
|
|
+ return program.gfx_level >= GFX11_5;
|
|
+ if (instr.isVOPC() && instr_info.classes[(int)instr.opcode] == instr_class::valu32)
|
|
+ return program.gfx_level == GFX11_5;
|
|
+ return false;
|
|
}
|
|
}
|
|
|
|
--
|
|
2.48.1
|
|
|
|
|
|
From 9f0671af164968432750d7b1be5da6d838765179 Mon Sep 17 00:00:00 2001
|
|
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
|
|
Date: Thu, 12 Sep 2024 15:46:29 +0200
|
|
Subject: [PATCH 24/27] aco,radv,radeonsi: move has_epilog to the fragment
|
|
shader info
|
|
|
|
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
|
|
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31150>
|
|
---
|
|
src/amd/compiler/aco_assembler.cpp | 3 ++-
|
|
src/amd/compiler/aco_instruction_selection.cpp | 8 ++++----
|
|
src/amd/compiler/aco_interface.cpp | 2 +-
|
|
src/amd/compiler/aco_shader_info.h | 2 +-
|
|
src/amd/vulkan/radv_aco_shader_info.h | 2 +-
|
|
src/amd/vulkan/radv_cmd_buffer.c | 6 +++---
|
|
src/amd/vulkan/radv_pipeline.c | 2 +-
|
|
src/amd/vulkan/radv_pipeline_graphics.c | 2 +-
|
|
src/amd/vulkan/radv_shader_args.c | 2 +-
|
|
src/amd/vulkan/radv_shader_info.c | 6 +++---
|
|
src/amd/vulkan/radv_shader_info.h | 3 ++-
|
|
src/gallium/drivers/radeonsi/si_shader_aco.c | 2 +-
|
|
12 files changed, 21 insertions(+), 19 deletions(-)
|
|
|
|
diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp
|
|
index 714648b252c..cf57a7e95c6 100644
|
|
--- a/src/amd/compiler/aco_assembler.cpp
|
|
+++ b/src/amd/compiler/aco_assembler.cpp
|
|
@@ -1727,7 +1727,8 @@ emit_program(Program* program, std::vector<uint32_t>& code, std::vector<struct a
|
|
program->info.merged_shader_compiled_separately;
|
|
|
|
/* Prolog has no exports. */
|
|
- if (!program->is_prolog && !program->info.has_epilog && !is_separately_compiled_ngg_vs_or_es &&
|
|
+ if (!program->is_prolog && !program->info.ps.has_epilog &&
|
|
+ !is_separately_compiled_ngg_vs_or_es &&
|
|
(program->stage.hw == AC_HW_VERTEX_SHADER || program->stage.hw == AC_HW_PIXEL_SHADER ||
|
|
program->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER))
|
|
fix_exports(ctx, code, program);
|
|
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
|
|
index 8ab9c18ee2a..987ba7d22ae 100644
|
|
--- a/src/amd/compiler/aco_instruction_selection.cpp
|
|
+++ b/src/amd/compiler/aco_instruction_selection.cpp
|
|
@@ -177,7 +177,7 @@ emit_bpermute(isel_context* ctx, Builder& bld, Temp index, Temp data)
|
|
const bool avoid_shared_vgprs =
|
|
ctx->options->gfx_level >= GFX10 && ctx->options->gfx_level < GFX11 &&
|
|
ctx->program->wave_size == 64 &&
|
|
- (ctx->program->info.has_epilog || ctx->program->info.merged_shader_compiled_separately ||
|
|
+ (ctx->program->info.ps.has_epilog || ctx->program->info.merged_shader_compiled_separately ||
|
|
ctx->program->info.vs.has_prolog || ctx->stage == raytracing_cs);
|
|
|
|
if (ctx->options->gfx_level <= GFX7 || avoid_shared_vgprs) {
|
|
@@ -5611,7 +5611,7 @@ store_output_to_temps(isel_context* ctx, nir_intrinsic_instr* instr)
|
|
idx++;
|
|
}
|
|
|
|
- if (ctx->stage == fragment_fs && ctx->program->info.has_epilog && base >= FRAG_RESULT_DATA0) {
|
|
+ if (ctx->stage == fragment_fs && ctx->program->info.ps.has_epilog && base >= FRAG_RESULT_DATA0) {
|
|
unsigned index = base - FRAG_RESULT_DATA0;
|
|
|
|
if (nir_intrinsic_src_type(instr) == nir_type_float16) {
|
|
@@ -11927,7 +11927,7 @@ select_shader(isel_context& ctx, nir_shader* nir, const bool need_startpgm, cons
|
|
nir_function_impl* func = nir_shader_get_entrypoint(nir);
|
|
visit_cf_list(&ctx, &func->body);
|
|
|
|
- if (ctx.program->info.has_epilog) {
|
|
+ if (ctx.program->info.ps.has_epilog) {
|
|
if (ctx.stage == fragment_fs) {
|
|
if (ctx.options->is_opengl)
|
|
create_fs_end_for_epilog(&ctx);
|
|
@@ -11965,7 +11965,7 @@ select_shader(isel_context& ctx, nir_shader* nir, const bool need_startpgm, cons
|
|
append_logical_end(ctx.block);
|
|
ctx.block->kind |= block_kind_uniform;
|
|
|
|
- if ((!program->info.has_epilog && !is_first_stage_of_merged_shader) ||
|
|
+ if ((!program->info.ps.has_epilog && !is_first_stage_of_merged_shader) ||
|
|
(nir->info.stage == MESA_SHADER_TESS_CTRL && program->gfx_level >= GFX9)) {
|
|
Builder(program, ctx.block).sopp(aco_opcode::s_endpgm);
|
|
}
|
|
diff --git a/src/amd/compiler/aco_interface.cpp b/src/amd/compiler/aco_interface.cpp
|
|
index 8508cc0445e..64396cfbc75 100644
|
|
--- a/src/amd/compiler/aco_interface.cpp
|
|
+++ b/src/amd/compiler/aco_interface.cpp
|
|
@@ -277,7 +277,7 @@ aco_compile_shader(const struct aco_compiler_options* options, const struct aco_
|
|
/* OpenGL combine multi shader parts into one continous code block,
|
|
* so only last part need the s_endpgm instruction.
|
|
*/
|
|
- bool append_endpgm = !(options->is_opengl && info->has_epilog);
|
|
+ bool append_endpgm = !(options->is_opengl && info->ps.has_epilog);
|
|
unsigned exec_size = emit_program(program.get(), code, &symbols, append_endpgm);
|
|
|
|
if (program->collect_statistics)
|
|
diff --git a/src/amd/compiler/aco_shader_info.h b/src/amd/compiler/aco_shader_info.h
|
|
index 1cc2bc11db4..7f6fe7a5e9e 100644
|
|
--- a/src/amd/compiler/aco_shader_info.h
|
|
+++ b/src/amd/compiler/aco_shader_info.h
|
|
@@ -102,7 +102,6 @@ struct aco_shader_info {
|
|
bool has_ngg_early_prim_export;
|
|
bool image_2d_view_of_3d;
|
|
unsigned workgroup_size;
|
|
- bool has_epilog; /* Only for TCS or PS. */
|
|
bool merged_shader_compiled_separately; /* GFX9+ */
|
|
struct ac_arg next_stage_pc;
|
|
struct ac_arg epilog_pc; /* Vulkan only */
|
|
@@ -127,6 +126,7 @@ struct aco_shader_info {
|
|
uint32_t num_interp;
|
|
unsigned spi_ps_input_ena;
|
|
unsigned spi_ps_input_addr;
|
|
+ bool has_epilog;
|
|
|
|
/* OpenGL only */
|
|
struct ac_arg alpha_reference;
|
|
diff --git a/src/amd/vulkan/radv_aco_shader_info.h b/src/amd/vulkan/radv_aco_shader_info.h
|
|
index b9665b2b20f..912d6b6ca4e 100644
|
|
--- a/src/amd/vulkan/radv_aco_shader_info.h
|
|
+++ b/src/amd/vulkan/radv_aco_shader_info.h
|
|
@@ -31,7 +31,7 @@ radv_aco_convert_shader_info(struct aco_shader_info *aco_info, const struct radv
|
|
ASSIGN_FIELD(has_ngg_culling);
|
|
ASSIGN_FIELD(has_ngg_early_prim_export);
|
|
ASSIGN_FIELD(workgroup_size);
|
|
- ASSIGN_FIELD(has_epilog);
|
|
+ ASSIGN_FIELD(ps.has_epilog);
|
|
ASSIGN_FIELD(merged_shader_compiled_separately);
|
|
ASSIGN_FIELD(vs.tcs_in_out_eq);
|
|
ASSIGN_FIELD(vs.tcs_temp_only_input_mask);
|
|
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
|
|
index d0cf1f1123b..671b35cb972 100644
|
|
--- a/src/amd/vulkan/radv_cmd_buffer.c
|
|
+++ b/src/amd/vulkan/radv_cmd_buffer.c
|
|
@@ -8096,7 +8096,7 @@ radv_bind_fragment_shader(struct radv_cmd_buffer *cmd_buffer, const struct radv_
|
|
cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DB_SHADER_CONTROL;
|
|
|
|
/* Re-emit the PS epilog when a new fragment shader is bound. */
|
|
- if (ps->info.has_epilog)
|
|
+ if (ps->info.ps.has_epilog)
|
|
cmd_buffer->state.emitted_ps_epilog = NULL;
|
|
}
|
|
|
|
@@ -10703,7 +10703,7 @@ radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer, const struct r
|
|
struct radv_shader_part *ps_epilog = NULL;
|
|
|
|
if (cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT] &&
|
|
- cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT]->info.has_epilog) {
|
|
+ cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT]->info.ps.has_epilog) {
|
|
if ((cmd_buffer->state.emitted_graphics_pipeline != cmd_buffer->state.graphics_pipeline ||
|
|
((cmd_buffer->state.dirty & (RADV_CMD_DIRTY_GRAPHICS_SHADERS | RADV_CMD_DIRTY_FRAMEBUFFER)) ||
|
|
(cmd_buffer->state.dirty_dynamic &
|
|
@@ -10921,7 +10921,7 @@ radv_bind_graphics_shaders(struct radv_cmd_buffer *cmd_buffer)
|
|
}
|
|
|
|
const struct radv_shader *ps = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
|
|
- if (ps && !ps->info.has_epilog) {
|
|
+ if (ps && !ps->info.ps.has_epilog) {
|
|
uint32_t col_format = 0, cb_shader_mask = 0;
|
|
if (radv_needs_null_export_workaround(device, ps, 0))
|
|
col_format = V_028714_SPI_SHADER_32_R;
|
|
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
|
|
index e07737cd742..738fc736b2c 100644
|
|
--- a/src/amd/vulkan/radv_pipeline.c
|
|
+++ b/src/amd/vulkan/radv_pipeline.c
|
|
@@ -551,7 +551,7 @@ radv_postprocess_nir(struct radv_device *device, const struct radv_graphics_stat
|
|
.use_aco = !radv_use_llvm_for_stage(pdev, stage->stage),
|
|
.uses_discard = true,
|
|
.alpha_func = COMPARE_FUNC_ALWAYS,
|
|
- .no_color_export = stage->info.has_epilog,
|
|
+ .no_color_export = stage->info.ps.has_epilog,
|
|
.no_depth_export = stage->info.ps.exports_mrtz_via_epilog,
|
|
|
|
.bc_optimize_for_persp = G_0286CC_PERSP_CENTER_ENA(stage->info.ps.spi_ps_input_ena) &&
|
|
diff --git a/src/amd/vulkan/radv_pipeline_graphics.c b/src/amd/vulkan/radv_pipeline_graphics.c
|
|
index ed380341ba5..daab8ce2ed6 100644
|
|
--- a/src/amd/vulkan/radv_pipeline_graphics.c
|
|
+++ b/src/amd/vulkan/radv_pipeline_graphics.c
|
|
@@ -3150,7 +3150,7 @@ radv_graphics_pipeline_init(struct radv_graphics_pipeline *pipeline, struct radv
|
|
radv_pipeline_init_dynamic_state(device, pipeline, &gfx_state.vk, pCreateInfo);
|
|
|
|
const struct radv_shader *ps = pipeline->base.shaders[MESA_SHADER_FRAGMENT];
|
|
- if (ps && !ps->info.has_epilog) {
|
|
+ if (ps && !ps->info.ps.has_epilog) {
|
|
pipeline->spi_shader_col_format = ps->info.ps.spi_shader_col_format;
|
|
pipeline->cb_shader_mask = ps->info.ps.cb_shader_mask;
|
|
}
|
|
diff --git a/src/amd/vulkan/radv_shader_args.c b/src/amd/vulkan/radv_shader_args.c
|
|
index 10f1ee3ad07..479bc49c803 100644
|
|
--- a/src/amd/vulkan/radv_shader_args.c
|
|
+++ b/src/amd/vulkan/radv_shader_args.c
|
|
@@ -846,7 +846,7 @@ declare_shader_args(const struct radv_device *device, const struct radv_graphics
|
|
case MESA_SHADER_FRAGMENT:
|
|
declare_global_input_sgprs(gfx_level, info, user_sgpr_info, args);
|
|
|
|
- if (info->has_epilog) {
|
|
+ if (info->ps.has_epilog) {
|
|
add_ud_arg(args, 1, AC_ARG_INT, &args->epilog_pc, AC_UD_EPILOG_PC);
|
|
}
|
|
|
|
diff --git a/src/amd/vulkan/radv_shader_info.c b/src/amd/vulkan/radv_shader_info.c
|
|
index c8c0b77201d..a1050a9ec07 100644
|
|
--- a/src/amd/vulkan/radv_shader_info.c
|
|
+++ b/src/amd/vulkan/radv_shader_info.c
|
|
@@ -972,9 +972,9 @@ gather_shader_info_fs(const struct radv_device *device, const nir_shader *nir,
|
|
info->ps.spi_ps_input_addr &= C_02865C_COVERAGE_TO_SHADER_SELECT;
|
|
}
|
|
|
|
- info->has_epilog = gfx_state->ps.has_epilog && info->ps.colors_written;
|
|
+ info->ps.has_epilog = gfx_state->ps.has_epilog && info->ps.colors_written;
|
|
|
|
- if (!info->has_epilog) {
|
|
+ if (!info->ps.has_epilog) {
|
|
info->ps.mrt0_is_dual_src = gfx_state->ps.epilog.mrt0_is_dual_src;
|
|
info->ps.spi_shader_col_format = gfx_state->ps.epilog.spi_shader_col_format;
|
|
|
|
@@ -988,7 +988,7 @@ gather_shader_info_fs(const struct radv_device *device, const nir_shader *nir,
|
|
(info->ps.color0_written & 0x8) && (info->ps.writes_z || info->ps.writes_stencil || info->ps.writes_sample_mask);
|
|
|
|
info->ps.exports_mrtz_via_epilog =
|
|
- info->has_epilog && gfx_state->ps.exports_mrtz_via_epilog && export_alpha_and_mrtz;
|
|
+ info->ps.has_epilog && gfx_state->ps.exports_mrtz_via_epilog && export_alpha_and_mrtz;
|
|
|
|
if (!info->ps.exports_mrtz_via_epilog) {
|
|
info->ps.writes_mrt0_alpha = gfx_state->ms.alpha_to_coverage_via_mrtz && export_alpha_and_mrtz;
|
|
diff --git a/src/amd/vulkan/radv_shader_info.h b/src/amd/vulkan/radv_shader_info.h
|
|
index e072c9ffd5b..0ffa972b078 100644
|
|
--- a/src/amd/vulkan/radv_shader_info.h
|
|
+++ b/src/amd/vulkan/radv_shader_info.h
|
|
@@ -106,7 +106,6 @@ struct radv_shader_info {
|
|
uint32_t user_data_0;
|
|
bool inputs_linked;
|
|
bool outputs_linked;
|
|
- bool has_epilog; /* Only for TCS or PS */
|
|
bool merged_shader_compiled_separately; /* GFX9+ */
|
|
bool force_indirect_desc_sets;
|
|
|
|
@@ -213,6 +212,8 @@ struct radv_shader_info {
|
|
bool load_provoking_vtx;
|
|
bool load_rasterization_prim;
|
|
bool force_sample_iter_shading_rate;
|
|
+ bool uses_fbfetch_output;
|
|
+ bool has_epilog;
|
|
} ps;
|
|
struct {
|
|
bool uses_grid_size;
|
|
diff --git a/src/gallium/drivers/radeonsi/si_shader_aco.c b/src/gallium/drivers/radeonsi/si_shader_aco.c
|
|
index 67ba140ed67..232104954fb 100644
|
|
--- a/src/gallium/drivers/radeonsi/si_shader_aco.c
|
|
+++ b/src/gallium/drivers/radeonsi/si_shader_aco.c
|
|
@@ -100,7 +100,7 @@ si_fill_aco_shader_info(struct si_shader *shader, struct aco_shader_info *info,
|
|
info->ps.spi_ps_input_ena = shader->config.spi_ps_input_ena;
|
|
info->ps.spi_ps_input_addr = shader->config.spi_ps_input_addr;
|
|
info->ps.alpha_reference = args->alpha_reference;
|
|
- info->has_epilog = !shader->is_monolithic;
|
|
+ info->ps.has_epilog = !shader->is_monolithic;
|
|
break;
|
|
default:
|
|
break;
|
|
--
|
|
2.48.1
|
|
|
|
|
|
From 82939c7825d3a45d5e992e10491e43183e511a9d Mon Sep 17 00:00:00 2001
|
|
From: Rhys Perry <pendingchaos02@gmail.com>
|
|
Date: Thu, 24 Oct 2024 11:01:46 +0100
|
|
Subject: [PATCH 25/27] aco,radv,radeonsi: add aco_shader_info::ps::has_prolog
|
|
|
|
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
|
|
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
|
|
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30478>
|
|
---
|
|
src/amd/compiler/aco_shader_info.h | 1 +
|
|
src/amd/vulkan/radv_aco_shader_info.h | 1 +
|
|
src/gallium/drivers/radeonsi/si_shader_aco.c | 1 +
|
|
3 files changed, 3 insertions(+)
|
|
|
|
diff --git a/src/amd/compiler/aco_shader_info.h b/src/amd/compiler/aco_shader_info.h
|
|
index 7f6fe7a5e9e..ad615e19992 100644
|
|
--- a/src/amd/compiler/aco_shader_info.h
|
|
+++ b/src/amd/compiler/aco_shader_info.h
|
|
@@ -126,6 +126,7 @@ struct aco_shader_info {
|
|
uint32_t num_interp;
|
|
unsigned spi_ps_input_ena;
|
|
unsigned spi_ps_input_addr;
|
|
+ bool has_prolog;
|
|
bool has_epilog;
|
|
|
|
/* OpenGL only */
|
|
diff --git a/src/amd/vulkan/radv_aco_shader_info.h b/src/amd/vulkan/radv_aco_shader_info.h
|
|
index 912d6b6ca4e..9a0b4d562e0 100644
|
|
--- a/src/amd/vulkan/radv_aco_shader_info.h
|
|
+++ b/src/amd/vulkan/radv_aco_shader_info.h
|
|
@@ -41,6 +41,7 @@ radv_aco_convert_shader_info(struct aco_shader_info *aco_info, const struct radv
|
|
ASSIGN_FIELD(cs.uses_full_subgroups);
|
|
aco_info->ps.spi_ps_input_ena = radv->ps.spi_ps_input_ena;
|
|
aco_info->ps.spi_ps_input_addr = radv->ps.spi_ps_input_addr;
|
|
+ aco_info->ps.has_prolog = false;
|
|
aco_info->gfx9_gs_ring_lds_size = radv->gs_ring_info.lds_size;
|
|
aco_info->is_trap_handler_shader = radv->type == RADV_SHADER_TYPE_TRAP_HANDLER;
|
|
aco_info->image_2d_view_of_3d = radv_key->image_2d_view_of_3d;
|
|
diff --git a/src/gallium/drivers/radeonsi/si_shader_aco.c b/src/gallium/drivers/radeonsi/si_shader_aco.c
|
|
index 232104954fb..dd0b3759a2b 100644
|
|
--- a/src/gallium/drivers/radeonsi/si_shader_aco.c
|
|
+++ b/src/gallium/drivers/radeonsi/si_shader_aco.c
|
|
@@ -100,6 +100,7 @@ si_fill_aco_shader_info(struct si_shader *shader, struct aco_shader_info *info,
|
|
info->ps.spi_ps_input_ena = shader->config.spi_ps_input_ena;
|
|
info->ps.spi_ps_input_addr = shader->config.spi_ps_input_addr;
|
|
info->ps.alpha_reference = args->alpha_reference;
|
|
+ info->ps.has_prolog = !shader->is_monolithic;
|
|
info->ps.has_epilog = !shader->is_monolithic;
|
|
break;
|
|
default:
|
|
--
|
|
2.48.1
|
|
|
|
|
|
From af418b1dcedd56326872945c18dff6dae79ebfdf Mon Sep 17 00:00:00 2001
|
|
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
|
|
Date: Thu, 12 Sep 2024 15:44:43 +0200
|
|
Subject: [PATCH 26/27] radv,radeonsi: remove remaining occurrences of TCS
|
|
epilog
|
|
|
|
TCS epilog has been removed few months ago.
|
|
|
|
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
|
|
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31150>
|
|
---
|
|
src/amd/vulkan/radv_shader_args.c | 8 --------
|
|
src/gallium/drivers/radeonsi/si_shader_aco.c | 1 -
|
|
2 files changed, 9 deletions(-)
|
|
|
|
diff --git a/src/amd/vulkan/radv_shader_args.c b/src/amd/vulkan/radv_shader_args.c
|
|
index 479bc49c803..100797c7389 100644
|
|
--- a/src/amd/vulkan/radv_shader_args.c
|
|
+++ b/src/amd/vulkan/radv_shader_args.c
|
|
@@ -683,10 +683,6 @@ declare_shader_args(const struct radv_device *device, const struct radv_graphics
|
|
add_ud_arg(args, 1, AC_ARG_INT, &args->tcs_offchip_layout, AC_UD_TCS_OFFCHIP_LAYOUT);
|
|
}
|
|
|
|
- if (info->has_epilog) {
|
|
- add_ud_arg(args, 1, AC_ARG_INT, &args->epilog_pc, AC_UD_EPILOG_PC);
|
|
- }
|
|
-
|
|
ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_patch_id);
|
|
ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_rel_ids);
|
|
|
|
@@ -703,10 +699,6 @@ declare_shader_args(const struct radv_device *device, const struct radv_graphics
|
|
add_ud_arg(args, 1, AC_ARG_INT, &args->tcs_offchip_layout, AC_UD_TCS_OFFCHIP_LAYOUT);
|
|
}
|
|
|
|
- if (info->has_epilog) {
|
|
- add_ud_arg(args, 1, AC_ARG_INT, &args->epilog_pc, AC_UD_EPILOG_PC);
|
|
- }
|
|
-
|
|
ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
|
|
ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_factor_offset);
|
|
if (args->explicit_scratch_args) {
|
|
diff --git a/src/gallium/drivers/radeonsi/si_shader_aco.c b/src/gallium/drivers/radeonsi/si_shader_aco.c
|
|
index dd0b3759a2b..dacfe15bb5a 100644
|
|
--- a/src/gallium/drivers/radeonsi/si_shader_aco.c
|
|
+++ b/src/gallium/drivers/radeonsi/si_shader_aco.c
|
|
@@ -88,7 +88,6 @@ si_fill_aco_shader_info(struct si_shader *shader, struct aco_shader_info *info,
|
|
case MESA_SHADER_TESS_CTRL:
|
|
info->vs.tcs_in_out_eq = key->ge.opt.same_patch_vertices;
|
|
info->vs.tcs_temp_only_input_mask = sel->info.tcs_vgpr_only_inputs;
|
|
- info->has_epilog = !shader->is_monolithic;
|
|
info->tcs.pass_tessfactors_by_reg = sel->info.tessfactors_are_def_in_all_invocs;
|
|
info->tcs.patch_stride = si_get_tcs_out_patch_stride(&sel->info);
|
|
info->tcs.tcs_offchip_layout = args->tcs_offchip_layout;
|
|
--
|
|
2.48.1
|
|
|
|
|
|
From fdd1b46c2683b34bb7171038999e0dc4620a9296 Mon Sep 17 00:00:00 2001
|
|
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
|
|
Date: Wed, 8 Jan 2025 15:03:32 -0500
|
|
Subject: [PATCH 27/27] radeonsi/gfx12: use ACO if LLVM is 19 or older
|
|
|
|
LLVM 19 is missing a SALU hazard fix.
|
|
|
|
(cherry-picked + adapted from 239840556f99aaa1b71c450a3b389bd9be1a24f3)
|
|
|
|
Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
|
|
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32957>
|
|
---
|
|
src/gallium/drivers/radeonsi/si_pipe.c | 9 ++++++++-
|
|
1 file changed, 8 insertions(+), 1 deletion(-)
|
|
|
|
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
|
|
index 4e063c1ce3e..f3f23ab0dd0 100644
|
|
--- a/src/gallium/drivers/radeonsi/si_pipe.c
|
|
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
|
|
@@ -1198,7 +1198,14 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
|
|
sscreen->info.register_shadowing_required = true;
|
|
|
|
#if AMD_LLVM_AVAILABLE
|
|
- sscreen->use_aco = (sscreen->debug_flags & DBG(USE_ACO));
|
|
+ /* For GFX11.5, LLVM < 19 is missing a workaround that can cause GPU hangs. ACO is the only
|
|
+ * alternative that has the workaround and is always available. Same for GFX12.
|
|
+ */
|
|
+ if ((sscreen->info.gfx_level == GFX12 && LLVM_VERSION_MAJOR < 20) ||
|
|
+ (sscreen->info.gfx_level == GFX11_5 && LLVM_VERSION_MAJOR < 19))
|
|
+ sscreen->use_aco = true;
|
|
+ else
|
|
+ sscreen->use_aco = sscreen->debug_flags & DBG(USE_ACO);
|
|
#else
|
|
sscreen->use_aco = true;
|
|
#endif
|
|
--
|
|
2.48.1
|
|
|