From 3915d1efcdb1e9d10c8f6966acbe5c359d824ba1 Mon Sep 17 00:00:00 2001
From: Josh Stone <jistone@redhat.com>
Date: Mon, 6 Apr 2026 14:08:10 -0700
Subject: [PATCH] [CodeGen] Preserve big-endian trunc in concat_vectors

A transform from `concat_vectors(trunc(scalar), undef)` to
`scalar_to_vector(scalar)` is only equivalent for little-endian targets.
On big-endian, that would put the extra upper bytes ahead of the desired
truncated bytes. This problem was seen on Rust s390x in [RHEL-147748].

[RHEL-147748]: https://redhat.atlassian.net/browse/RHEL-147748

Assisted-by: Claude Code
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  4 +-
 llvm/test/CodeGen/SystemZ/vec-trunc-to-i16.ll | 45 +++++++++++++++++++
 2 files changed, 48 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/SystemZ/vec-trunc-to-i16.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 383e45c5ea3a8..5485ee86251a5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -26513,9 +26513,11 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
       // If the bitcast type isn't legal, it might be a trunc of a legal type;
       // look through the trunc so we can still do the transform:
       //   concat_vectors(trunc(scalar), undef) -> scalar_to_vector(scalar)
+      // However, this is only equivalent on little-endian targets.
       if (Scalar->getOpcode() == ISD::TRUNCATE &&
           !TLI.isTypeLegal(Scalar.getValueType()) &&
-          TLI.isTypeLegal(Scalar->getOperand(0).getValueType()))
+          TLI.isTypeLegal(Scalar->getOperand(0).getValueType()) &&
+          DAG.getDataLayout().isLittleEndian())
         Scalar = Scalar->getOperand(0);
 
       EVT SclTy = Scalar.getValueType();
diff --git a/llvm/test/CodeGen/SystemZ/vec-trunc-to-i16.ll b/llvm/test/CodeGen/SystemZ/vec-trunc-to-i16.ll
new file mode 100644
index 0000000000000..42d787d945145
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/vec-trunc-to-i16.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test that truncated scalars use the correct vector insert instruction.
+; On big-endian targets, the concat_vectors combine must not look through a
+; truncate when creating scalar_to_vector, or the bytes end up misplaced.
+
+; This truncated i16 should use vlvgh (insert halfword), not vlvgf (insert fullword).
+define <16 x i8> @test_concat_trunc_i16(i32 %x) {
+; CHECK-LABEL: test_concat_trunc_i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vlvgh %v24, %r2, 0
+; CHECK-NEXT:    br %r14
+  %t = trunc i32 %x to i16
+  %vec = bitcast i16 %t to <2 x i8>
+  %result = shufflevector <2 x i8> %vec, <2 x i8> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  ret <16 x i8> %result
+}
+
+; A more complex shuffle, reduced from a Rust bug report: the i16 load must be inserted with vlvgh while the i32 load uses vlvgf.
+define fastcc void @test_shuffle_with_trunc() {
+; CHECK-LABEL: test_shuffle_with_trunc:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lh %r1, 0
+; CHECK-NEXT:    l %r0, 0
+; CHECK-NEXT:    vlvgh %v1, %r1, 0
+; CHECK-NEXT:    larl %r1, .LCPI1_0
+; CHECK-NEXT:    vl %v2, 0(%r1), 3
+; CHECK-NEXT:    vlvgf %v0, %r0, 0
+; CHECK-NEXT:    vperm %v0, %v0, %v1, %v2
+; CHECK-NEXT:    vst %v0, 0, 3
+; CHECK-NEXT:    br %r14
+  %1 = load i32, ptr null, align 8
+  %2 = load i16, ptr null, align 1
+  br label %3
+
+3:
+  %4 = bitcast i32 %1 to <4 x i8>
+  %5 = shufflevector <4 x i8> %4, <4 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %6 = bitcast i16 %2 to <2 x i8>
+  %7 = shufflevector <2 x i8> %6, <2 x i8> zeroinitializer, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %8 = shufflevector <16 x i8> %5, <16 x i8> %7, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 25, i32 26, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  store <16 x i8> %8, ptr null, align 8
+  ret void
+}