1721 lines
67 KiB
Diff
1721 lines
67 KiB
Diff
commit 3cc0232c46a5905b4a6c2fbd302b58bf5f90b3d5
|
|
Author: Carl Love <cel@us.ibm.com>
|
|
Date: Mon Jan 11 16:00:57 2021 -0600
|
|
|
|
PPC64: ISA 3.1 VSX PCV Generate Operations
|
|
|
|
xxgenpcvbm VSX Vector Generate PCV from Byte Mask
|
|
xxgenpcvdm VSX Vector Generate PCV from Doubleword Mask
|
|
xxgenpcvhm VSX Vector Generate PCV from Halfword Mask
|
|
xxgenpcvwm VSX Vector Generate PCV from Word Mask
|
|
|
|
diff --git a/VEX/priv/guest_ppc_defs.h b/VEX/priv/guest_ppc_defs.h
|
|
index deda4dfce..54ce923a9 100644
|
|
--- a/VEX/priv/guest_ppc_defs.h
|
|
+++ b/VEX/priv/guest_ppc_defs.h
|
|
@@ -169,6 +169,23 @@ void write_ACC_entry (VexGuestPPC64State* gst, UInt offset, UInt acc,
|
|
void get_ACC_entry (VexGuestPPC64State* gst, UInt offset, UInt acc,
|
|
UInt reg, UInt *result);
|
|
|
|
+extern void vector_gen_pvc_byte_mask_dirty_helper( VexGuestPPC64State* gst,
|
|
+ ULong src_hi,
|
|
+ ULong src_lo,
|
|
+ UInt rtn_val, UInt IMM );
|
|
+extern void vector_gen_pvc_hword_mask_dirty_helper( VexGuestPPC64State* gst,
|
|
+ ULong src_hi,
|
|
+ ULong src_lo,
|
|
+ UInt rtn_val, UInt IMM );
|
|
+extern void vector_gen_pvc_word_mask_dirty_helper( VexGuestPPC64State* gst,
|
|
+ ULong src_hi,
|
|
+ ULong src_lo,
|
|
+ UInt rtn_val, UInt IMM );
|
|
+extern void vector_gen_pvc_dword_mask_dirty_helper( VexGuestPPC64State* gst,
|
|
+ ULong src_hi,
|
|
+ ULong src_lo,
|
|
+ UInt rtn_val, UInt IMM );
|
|
+
|
|
/* 8-bit XO value from instruction description */
|
|
#define XVI4GER8 0b00100011
|
|
#define XVI4GER8PP 0b00100010
|
|
diff --git a/VEX/priv/guest_ppc_helpers.c b/VEX/priv/guest_ppc_helpers.c
|
|
index c24191ef3..75497abb9 100644
|
|
--- a/VEX/priv/guest_ppc_helpers.c
|
|
+++ b/VEX/priv/guest_ppc_helpers.c
|
|
@@ -701,6 +701,738 @@ ULong vector_evaluate64_helper( ULong srcA, ULong srcB, ULong srcC,
|
|
#undef MAX_IMM_BITS
|
|
}
|
|
|
|
+/*--------------------------------------------------*/
|
|
+/*---- VSX Vector Generate PCV from Mask helpers ---*/
|
|
+/*--------------------------------------------------*/
|
|
+static void write_VSX_entry (VexGuestPPC64State* gst, UInt reg_offset,
|
|
+ ULong *vsx_entry)
|
|
+{
|
|
+ U128* pU128_dst;
|
|
+ pU128_dst = (U128*) (((UChar*) gst) + reg_offset);
|
|
+
|
|
+ /* The U128 type is defined as an array of unsigned integers. */
|
|
+ /* Writing in LE order */
|
|
+ (*pU128_dst)[0] = (UInt)(vsx_entry[1] & 0xFFFFFFFF);
|
|
+ (*pU128_dst)[1] = (UInt)(vsx_entry[1] >> 32);
|
|
+ (*pU128_dst)[2] = (UInt)(vsx_entry[0] & 0xFFFFFFFF);
|
|
+ (*pU128_dst)[3] = (UInt)(vsx_entry[0] >> 32);
|
|
+ return;
|
|
+}
|
|
+
|
|
+/* CALLED FROM GENERATED CODE */
|
|
+void vector_gen_pvc_byte_mask_dirty_helper( VexGuestPPC64State* gst,
|
|
+ ULong src_hi, ULong src_lo,
|
|
+ UInt reg_offset, UInt imm ) {
|
|
+ /* The function computes the 128-bit result then writes it directly
|
|
+ into the guest state VSX register. */
|
|
+
|
|
+ UInt i, shift_by, sel_shift_by, half_sel;
|
|
+ ULong index, src, result[2];
|
|
+ ULong j;
|
|
+
|
|
+ result[0] = 0;
|
|
+ result[1] = 0;
|
|
+ j = 0;
|
|
+
|
|
+ /* The algorithm in the ISA is written with IBM numbering zero on left and
|
|
+ N-1 on right. The loop index is converted to "i" to match the algorithm
|
|
+ for clarity when matching the C code to the algorithm in the ISA. */
|
|
+
|
|
+ if (imm == 0b00) { // big endian expansion
|
|
+ for( index = 0; index < 16; index++) {
|
|
+ i = 15 - index;
|
|
+
|
|
+ shift_by = i*8;
|
|
+
|
|
+ if ( i >= 8) {
|
|
+ src = src_hi;
|
|
+ shift_by = shift_by - 64;
|
|
+ half_sel = 0;
|
|
+ } else {
|
|
+ src = src_lo;
|
|
+ half_sel = 1;
|
|
+ }
|
|
+
|
|
+ sel_shift_by = shift_by + 7;
|
|
+
|
|
+ if ( ((src >> sel_shift_by) & 0x1) == 1) {
|
|
+ result[half_sel] |= j << shift_by;
|
|
+ j++;
|
|
+ } else {
|
|
+ result[half_sel] |= (index + (unsigned long long)0x10) << shift_by;
|
|
+ }
|
|
+ }
|
|
+
|
|
+
|
|
+ } else if (imm == 0b01) { // big endian compression
|
|
+ /* If IMM=0b00001, let pcv be the permute control vector required to
|
|
+ enable a left-indexed permute (vperm or xxperm) to implement a
|
|
+ compression of the sparse byte elements in a source vector specified
|
|
+ by the byte-element mask in VSR[VRB+32] into the leftmost byte
|
|
+ elements of a result vector.
|
|
+ */
|
|
+ for( index = 0; index < 16; index++) {
|
|
+ i = 15 - index;
|
|
+ shift_by = i*8;
|
|
+
|
|
+ if ( i >= 8) {
|
|
+ src = src_hi;
|
|
+ shift_by = shift_by - 64;
|
|
+ half_sel = 0;
|
|
+ } else {
|
|
+ src = src_lo;
|
|
+ half_sel = 1;
|
|
+ }
|
|
+
|
|
+ sel_shift_by = shift_by + 7;
|
|
+
|
|
+ if ( ((src >> sel_shift_by) & 0x1) == 1) {
|
|
+ if (j >= 8)
|
|
+ result[1] |= (index) << (15 - j)*8;
|
|
+ else
|
|
+ result[0] |= (index) << (7 - j)*8;
|
|
+ j++;
|
|
+ }
|
|
+ }
|
|
+ /* The algorithm says set to undefined, leave as 0
|
|
+ for( index = 3 - j; index < 4; index++) {
|
|
+ result |= (0 << (index*8));
|
|
+ }
|
|
+ */
|
|
+
|
|
+ } else if (imm == 0b10) { //little-endian expansion
|
|
+ /* If IMM=0b00010, let pcv be the permute control vector required to
|
|
+ enable a right-indexed permute (vpermr or xxpermr) to implement an
|
|
+ expansion of the rightmost byte elements of a source vector into the
|
|
+ byte elements of a result vector specified by the byte-element mask
|
|
+ in VSR[VRB+32]. */
|
|
+ for( index = 0; index < 16; index++) {
|
|
+ i = index;
|
|
+
|
|
+ shift_by = i*8;
|
|
+
|
|
+ if ( i >= 8) {
|
|
+ src = src_hi;
|
|
+ shift_by = shift_by - 64;
|
|
+ half_sel = 0;
|
|
+ } else {
|
|
+ src = src_lo;
|
|
+ half_sel = 1;
|
|
+ }
|
|
+
|
|
+ sel_shift_by = shift_by + 7;
|
|
+
|
|
+ /* mod shift amount by 8 since src is either the upper or lower
|
|
+ 64-bits. */
|
|
+ if ( ((src >> sel_shift_by) & 0x1) == 1) {
|
|
+ result[half_sel] |= j << shift_by;
|
|
+ j++;
|
|
+ } else {
|
|
+ result[half_sel] |= (index + (unsigned long long)0x10) << shift_by;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ } else if (imm == 0b11) { //little-endian compression
|
|
+ /* If IMM=0b00011, let pcv be the permute control vector required to
|
|
+ enable a right-indexed permute (vpermr or xxpermr) to implement a
|
|
+ compression of the sparse byte elements in a source vector specified
|
|
+ by the byte-element mask in VSR[VRB+32] into the rightmost byte
|
|
+ elements of a result vector. */
|
|
+
|
|
+ for( index = 0; index < 16; index++) {
|
|
+ i = index;
|
|
+
|
|
+ shift_by = i*8;
|
|
+
|
|
+ if ( i >= 8) {
|
|
+ src = src_hi;
|
|
+ shift_by = shift_by - 64;
|
|
+ half_sel = 0;
|
|
+ } else {
|
|
+ src = src_lo;
|
|
+ half_sel = 1;
|
|
+ }
|
|
+
|
|
+ sel_shift_by = shift_by + 7;
|
|
+
|
|
+ if ( ((src >> sel_shift_by) & 0x1) == 1) {
|
|
+ if (j >= 8)
|
|
+ result[0] |= (index) << (j-8)*8;
|
|
+ else
|
|
+ result[1] |= (index) << j*8;
|
|
+ j++;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* The algorithm says set to undefined, leave as 0
|
|
+ for( index = 3 - j; index < 4; index++) {
|
|
+ result |= (0 << (index*8));
|
|
+ }
|
|
+ */
|
|
+
|
|
+ } else {
|
|
+ vex_printf("ERROR, vector_gen_pvc_byte_mask_dirty_helper, imm value %u not supported.\n",
|
|
+ imm);
|
|
+ vassert(0);
|
|
+ }
|
|
+ write_VSX_entry( gst, reg_offset, result);
|
|
+}
|
|
+
|
|
+/* CALLED FROM GENERATED CODE */
|
|
+void vector_gen_pvc_hword_mask_dirty_helper( VexGuestPPC64State* gst,
|
|
+ ULong src_hi, ULong src_lo,
|
|
+ UInt reg_offset,
|
|
+ UInt imm ) {
|
|
+ /* The function computes the 128-bit result then writes it directly
|
|
+ into the guest state VSX register. */
|
|
+ UInt i, shift_by, sel_shift_by, half_sel;
|
|
+ ULong index, src, result[2];
|
|
+ ULong j;
|
|
+
|
|
+ result[0] = 0;
|
|
+ result[1] = 0;
|
|
+ j = 0;
|
|
+
|
|
+ /* The algorithm in the ISA is written with IBM numbering zero on left and
|
|
+ N-1 on right. The loop index is converted to "i" to match the algorithm
|
|
+ for clarity when matching the C code to the algorithm in the ISA. */
|
|
+
|
|
+ if (imm == 0b00) { // big endian expansion
|
|
+ /* If IMM=0b00000, let pcv be the permute control vector required to
|
|
+ enable a left-indexed permute (vperm or xxperm) to implement an
|
|
+ expansion of the leftmost halfword elements of a source vector into
|
|
+ the halfword elements of a result vector specified by the halfword-
|
|
+ element mask in VSR[VRB+32].
|
|
+ */
|
|
+ for( index = 0; index < 8; index++) {
|
|
+ i = 7 - index;
|
|
+
|
|
+ shift_by = i*16;
|
|
+
|
|
+ if ( i >= 4) {
|
|
+ src = src_hi;
|
|
+ shift_by = shift_by - 64;
|
|
+ half_sel = 0;
|
|
+ } else {
|
|
+ src = src_lo;
|
|
+ half_sel = 1;
|
|
+ }
|
|
+
|
|
+ sel_shift_by = shift_by + 15;
|
|
+
|
|
+ if ( ((src >> sel_shift_by) & 0x1) == 1) {
|
|
+ // half-word i, byte 0
|
|
+ result[half_sel] |= (2*j + 0x0) << (shift_by+8);
|
|
+ // half-word i, byte 1
|
|
+ result[half_sel] |= (2*j + 0x1) << shift_by;
|
|
+ j++;
|
|
+ } else {
|
|
+ result[half_sel] |= (2*index + 0x10) << (shift_by+8);
|
|
+ result[half_sel] |= (2*index + 0x11) << shift_by;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ } else if (imm == 0b01) { // big endian compression
|
|
+ /* If IMM=0b00001,let pcv be the permute control vector required to
|
|
+ enable a left-indexed permute (vperm or xxperm) to implement a
|
|
+ compression of the sparse halfword elements in a source vector
|
|
+ specified by the halfword-element mask in VSR[VRB+32] into the
|
|
+ leftmost halfword elements of a result vector.
|
|
+ */
|
|
+ for( index = 0; index < 8; index++) {
|
|
+ i = 7 - index;
|
|
+
|
|
+ shift_by = i*16;
|
|
+
|
|
+ if ( i >= 4) {
|
|
+ src = src_hi;
|
|
+ shift_by = shift_by - 64;
|
|
+ half_sel = 0;
|
|
+ } else {
|
|
+ src = src_lo;
|
|
+ half_sel = 1;
|
|
+ }
|
|
+
|
|
+ sel_shift_by = shift_by + 15;
|
|
+
|
|
+ if ( ((src >> sel_shift_by) & 0x1) == 1) {
|
|
+ if (j >= 4) {
|
|
+ // half-word i, byte 0
|
|
+ result[1] |= (2*index + 0x0) << ((7 - j)*16 + 8);
|
|
+ // half-word i, byte 1
|
|
+ result[1] |= (2*index + 0x1) << ((7 - j)*16);
|
|
+ } else {
|
|
+ // half-word i, byte 0
|
|
+ result[0] |= (2*index + 0x0) << ((3 - j)*16 + 8);
|
|
+ // half-word i, byte 1
|
|
+ result[0] |= (2*index + 0x1) << ((3 - j)*16);
|
|
+ }
|
|
+ j++;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ } else if (imm == 0b10) { //little-endian expansion
|
|
+ /* If IMM=0b00010, let pcv be the permute control vector required to
|
|
+ enable a right-indexed permute (vpermr or xxpermr) to implement an
|
|
+ expansion of the rightmost halfword elements of a source vector into
|
|
+ the halfword elements of a result vector specified by the halfword-
|
|
+ element mask in VSR[VRB+32].
|
|
+ */
|
|
+ for( index = 0; index < 8; index++) {
|
|
+ i = index;
|
|
+ shift_by = i*16;
|
|
+
|
|
+ if ( i >= 4) {
|
|
+ src = src_hi;
|
|
+ shift_by = shift_by - 64;
|
|
+ half_sel = 0;
|
|
+ } else {
|
|
+ src = src_lo;
|
|
+ half_sel = 1;
|
|
+ }
|
|
+
|
|
+ sel_shift_by = shift_by + 15;
|
|
+
|
|
+ if ( ((src >> sel_shift_by) & 0x1) == 1) {
|
|
+ // half-word i, byte 0
|
|
+ result[half_sel] |= (2*j + 0x00) << shift_by;
|
|
+ // half-word i, byte 1
|
|
+ result[half_sel] |= (2*j + 0x01) << (shift_by+8);
|
|
+ j++;
|
|
+
|
|
+ } else {
|
|
+ // half-word i, byte 0
|
|
+ result[half_sel] |= (2*index + 0x10) << shift_by;
|
|
+ // half-word i, byte 1
|
|
+ result[half_sel] |= (2*index + 0x11) << (shift_by+8);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ } else if (imm == 0b11) { //little-endian compression
|
|
+ /* If IMM=0b00011, let pcv be the permute control vector required to
|
|
+ enable a right-indexed permute (vpermr or xxpermr) to implement a
|
|
+ compression of the sparse halfword elements in a source vector
|
|
+ specified by the halfword-element mask in VSR[VRB+32] into the
|
|
+ rightmost halfword elements of a result vector. */
|
|
+ for( index = 0; index < 8; index++) {
|
|
+ i = index;
|
|
+ shift_by = i*16;
|
|
+
|
|
+ if ( i >= 4) {
|
|
+ src = src_hi;
|
|
+ shift_by = shift_by - 64;
|
|
+ half_sel = 0;
|
|
+ } else {
|
|
+ src = src_lo;
|
|
+ half_sel = 1;
|
|
+ }
|
|
+
|
|
+ sel_shift_by = shift_by + 15;
|
|
+
|
|
+ if ( ((src >> sel_shift_by) & 0x1) == 1) {
|
|
+ if (j >= 4) {
|
|
+ // half-word j, byte 0
|
|
+ result[0] |= (2*index + 0x0) << ((j-4)*16);
|
|
+ // half-word j, byte 1
|
|
+ result[0] |= (2*index + 0x1) << ((j-4)*16+8);
|
|
+ } else {
|
|
+ // half-word j, byte 0
|
|
+ result[1] |= (2*index + 0x0) << (j*16);
|
|
+ // half-word j, byte 1
|
|
+ result[1] |= (2*index + 0x1) << ((j*16)+8);
|
|
+ }
|
|
+ j++;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ } else {
|
|
+ vex_printf("ERROR, vector_gen_pvc_hword_dirty_mask_helper, imm value %u not supported.\n",
|
|
+ imm);
|
|
+ vassert(0);
|
|
+ }
|
|
+ write_VSX_entry( gst, reg_offset, result);
|
|
+}
|
|
+
|
|
+/* CALLED FROM GENERATED CODE */
|
|
+void vector_gen_pvc_word_mask_dirty_helper( VexGuestPPC64State* gst,
|
|
+ ULong src_hi, ULong src_lo,
|
|
+ UInt reg_offset, UInt imm ) {
|
|
+ /* The function computes the 128-bit result then writes it directly
|
|
+ into the guest state VSX register. */
|
|
+ UInt i, shift_by, sel_shift_by, half_sel;
|
|
+ ULong index, src, result[2];
|
|
+ ULong j;
|
|
+
|
|
+ result[0] = 0;
|
|
+ result[1] = 0;
|
|
+ j = 0;
|
|
+
|
|
+ /* The algorithm in the ISA is written with IBM numbering zero on left and
|
|
+ N-1 on right. The loop index is converted to "i" to match the algorithm
|
|
+ for clarity when matching the C code to the algorithm in the ISA. */
|
|
+
|
|
+ if (imm == 0b00) { // big endian expansion
|
|
+ /* If IMM=0b00000, let pcv be the permute control vector required to
|
|
+ enable a left-indexed permute (vperm or xxperm) to implement an
|
|
+ expansion of the leftmost word elements of a source vector into the
|
|
+ word elements of a result vector specified by the word-element mask
|
|
+ in VSR[VRB+32].
|
|
+ */
|
|
+ for( index = 0; index < 4; index++) {
|
|
+ i = 3 - index;
|
|
+
|
|
+ shift_by = i*32;
|
|
+
|
|
+ if ( i >= 2) {
|
|
+ src = src_hi;
|
|
+ shift_by = shift_by - 64;
|
|
+ half_sel = 0;
|
|
+ } else {
|
|
+ src = src_lo;
|
|
+ half_sel = 1;
|
|
+ }
|
|
+
|
|
+ sel_shift_by = shift_by + 31;
|
|
+
|
|
+ if ( ((src >> sel_shift_by) & 0x1) == 1) {
|
|
+ result[half_sel] |= (4*j+0) << (shift_by+24); // word i, byte 0
|
|
+ result[half_sel] |= (4*j+1) << (shift_by+16); // word i, byte 1
|
|
+ result[half_sel] |= (4*j+2) << (shift_by+8); // word i, byte 2
|
|
+ result[half_sel] |= (4*j+3) << shift_by; // word i, byte 3
|
|
+ j++;
|
|
+ } else {
|
|
+ result[half_sel] |= (4*index + 0x10) << (shift_by+24);
|
|
+ result[half_sel] |= (4*index + 0x11) << (shift_by+16);
|
|
+ result[half_sel] |= (4*index + 0x12) << (shift_by+8);
|
|
+ result[half_sel] |= (4*index + 0x13) << shift_by;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ } else if (imm == 0b01) { // big endian compression
|
|
+ /* If IMM=0b00001, let pcv be the permute control vector required to
|
|
+ enable a left-indexed permute (vperm or xxperm) to implement a
|
|
+ compression of the sparse word elements in a source vector specified
|
|
+ by the word-element mask in VSR[VRB+32] into the leftmost word
|
|
+ elements of a result vector.
|
|
+ */
|
|
+ for( index = 0; index < 4; index++) {
|
|
+ i = 3 - index;
|
|
+
|
|
+ shift_by = i*32;
|
|
+
|
|
+ if ( i >= 2) {
|
|
+ src = src_hi;
|
|
+ shift_by = shift_by - 64;
|
|
+ half_sel = 0;
|
|
+ } else {
|
|
+ src = src_lo;
|
|
+ half_sel = 1;
|
|
+ }
|
|
+
|
|
+ sel_shift_by = shift_by + 31;
|
|
+
|
|
+ if (((src >> sel_shift_by) & 0x1) == 1) {
|
|
+ if (j >= 2) {
|
|
+ // word j, byte 0
|
|
+ result[1] |= (4*index+0) << ((3 - j)*32 + 24);
|
|
+ // word j, byte 1
|
|
+ result[1] |= (4*index+1) << ((3 - j)*32 + 16);
|
|
+ // word j, byte 2
|
|
+ result[1] |= (4*index+2) << ((3 - j)*32 + 8);
|
|
+ // word j, byte 3
|
|
+ result[1] |= (4*index+3) << ((3 - j)*32 + 0);
|
|
+ } else {
|
|
+ result[0] |= (4*index+0) << ((1 - j)*32 + 24);
|
|
+ result[0] |= (4*index+1) << ((1 - j)*32 + 16);
|
|
+ result[0] |= (4*index+2) << ((1 - j)*32 + 8);
|
|
+ result[0] |= (4*index+3) << ((1 - j)*32 + 0);
|
|
+ }
|
|
+ j++;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ } else if (imm == 0b10) { //little-endian expansion
|
|
+ /* If IMM=0b00010, let pcv be the permute control vector required to
|
|
+ enable a right-indexed permute (vpermr or xxpermr) to implement an
|
|
+ expansion of the rightmost word elements of a source vector into the
|
|
+ word elements of a result vector specified by the word-element mask
|
|
+ in VSR[VRB+32].
|
|
+ */
|
|
+ for( index = 0; index < 4; index++) {
|
|
+ i = index;
|
|
+
|
|
+ shift_by = i*32;
|
|
+
|
|
+ if ( i >= 2) {
|
|
+ src = src_hi;
|
|
+ shift_by = shift_by - 64;
|
|
+ half_sel = 0;
|
|
+ } else {
|
|
+ src = src_lo;
|
|
+ half_sel = 1;
|
|
+ }
|
|
+
|
|
+ sel_shift_by = shift_by + 31;
|
|
+
|
|
+ if (((src >> sel_shift_by) & 0x1) == 1) {
|
|
+ result[half_sel] |= (4*j+0) << (shift_by + 0); // word j, byte 0
|
|
+ result[half_sel] |= (4*j+1) << (shift_by + 8); // word j, byte 1
|
|
+ result[half_sel] |= (4*j+2) << (shift_by + 16); // word j, byte 2
|
|
+ result[half_sel] |= (4*j+3) << (shift_by + 24); // word j, byte 3
|
|
+ j++;
|
|
+ } else {
|
|
+ result[half_sel] |= (4*index + 0x10) << (shift_by + 0);
|
|
+ result[half_sel] |= (4*index + 0x11) << (shift_by + 8);
|
|
+ result[half_sel] |= (4*index + 0x12) << (shift_by + 16);
|
|
+ result[half_sel] |= (4*index + 0x13) << (shift_by + 24);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ } else if (imm == 0b11) { //little-endian compression
|
|
+ /* If IMM=0b00011, let pcv be the permute control vector required to
|
|
+ enable a right-indexed permute (vpermr or xxpermr) to implement a
|
|
+ compression of the sparse word elements in a source vector specified
|
|
+ by the word-element mask in VSR[VRB+32] into the rightmost word
|
|
+ elements of a result vector. */
|
|
+ for( index = 0; index < 4; index++) {
|
|
+ i =index;
|
|
+
|
|
+ shift_by = i*32;
|
|
+
|
|
+ if ( i >= 2) {
|
|
+ src = src_hi;
|
|
+ shift_by = shift_by - 64;
|
|
+ half_sel = 0;
|
|
+ } else {
|
|
+ src = src_lo;
|
|
+ half_sel = 1;
|
|
+ }
|
|
+
|
|
+ sel_shift_by = shift_by + 31;
|
|
+
|
|
+ if (((src >> sel_shift_by) & 0x1) == 1) {
|
|
+ if (j >= 2){
|
|
+ // word j, byte 0
|
|
+ result[0] |= (4*index + 0x0) << ((j-2)*32+0);
|
|
+ // word j, byte 1
|
|
+ result[0] |= (4*index + 0x1) << ((j-2)*32+8);
|
|
+ // word j, byte 2
|
|
+ result[0] |= (4*index + 0x2) << ((j-2)*32+16);
|
|
+ // word j, byte 3
|
|
+ result[0] |= (4*index + 0x3) << ((j-2)*32+24);
|
|
+ } else {
|
|
+ result[1] |= (4*index + 0x0) << (j*32+0);
|
|
+ result[1] |= (4*index + 0x1) << (j*32+8);
|
|
+ result[1] |= (4*index + 0x2) << (j*32+16);
|
|
+ result[1] |= (4*index + 0x3) << (j*32+24);
|
|
+ }
|
|
+ j++;
|
|
+ }
|
|
+ }
|
|
+ } else {
|
|
+ vex_printf("ERROR, vector_gen_pvc_word_mask_dirty_helper, imm value %u not supported.\n",
|
|
+ imm);
|
|
+ vassert(0);
|
|
+ }
|
|
+
|
|
+ write_VSX_entry( gst, reg_offset, result);
|
|
+}
|
|
+
|
|
+/* CALLED FROM GENERATED CODE */
|
|
+void vector_gen_pvc_dword_mask_dirty_helper( VexGuestPPC64State* gst,
|
|
+ ULong src_hi, ULong src_lo,
|
|
+ UInt reg_offset, UInt imm ) {
|
|
+ /* The function computes the 128-bit result then writes it directly
|
|
+ into the guest state VSX register. */
|
|
+ UInt sel_shift_by, half_sel;
|
|
+ ULong index, src, result[2];
|
|
+ ULong j, i;
|
|
+
|
|
+ result[0] = 0;
|
|
+ result[1] = 0;
|
|
+ j = 0;
|
|
+
|
|
+ /* The algorithm in the ISA is written with IBM numbering zero on left and
|
|
+ N-1 on right. The loop index is converted to "i" to match the algorithm
|
|
+ for clarity when matching the C code to the algorithm in the ISA. */
|
|
+
|
|
+ if (imm == 0b00) { // big endian expansion
|
|
+ /* If IMM=0b00000, let pcv be the permute control vector required to
|
|
+ enable a left-indexed permute (vperm or xxperm) to implement an
|
|
+ expansion of the leftmost doubleword elements of a source vector into
|
|
+ the doubleword elements of a result vector specified by the
|
|
+ doubleword-element mask in VSR[VRB+32].
|
|
+ */
|
|
+ for( index = 0; index < 2; index++) {
|
|
+ i = 1 - index;
|
|
+
|
|
+ if ( i == 1) {
|
|
+ src = src_hi;
|
|
+ half_sel = 0;
|
|
+ } else {
|
|
+ src = src_lo;
|
|
+ half_sel = 1;
|
|
+ }
|
|
+
|
|
+ sel_shift_by = 63;
|
|
+
|
|
+ if ( ((src >> sel_shift_by) & 0x1) == 1) {
|
|
+ result[half_sel] |= (8*j + 0x0) << 56; // dword i, byte 0
|
|
+ result[half_sel] |= (8*j + 0x1) << 48; // dword i, byte 1
|
|
+ result[half_sel] |= (8*j + 0x2) << 40; // dword i, byte 2
|
|
+ result[half_sel] |= (8*j + 0x3) << 32; // dword i, byte 3
|
|
+ result[half_sel] |= (8*j + 0x4) << 24; // dword i, byte 4
|
|
+ result[half_sel] |= (8*j + 0x5) << 16; // dword i, byte 5
|
|
+ result[half_sel] |= (8*j + 0x6) << 8; // dword i, byte 6
|
|
+ result[half_sel] |= (8*j + 0x7) << 0; // dword i, byte 7
|
|
+ j++;
|
|
+ } else {
|
|
+ result[half_sel] |= (8*index + 0x10) << 56;
|
|
+ result[half_sel] |= (8*index + 0x11) << 48;
|
|
+ result[half_sel] |= (8*index + 0x12) << 40;
|
|
+ result[half_sel] |= (8*index + 0x13) << 32;
|
|
+ result[half_sel] |= (8*index + 0x14) << 24;
|
|
+ result[half_sel] |= (8*index + 0x15) << 16;
|
|
+ result[half_sel] |= (8*index + 0x16) << 8;
|
|
+ result[half_sel] |= (8*index + 0x17) << 0;
|
|
+ }
|
|
+ }
|
|
+ } else if (imm == 0b01) { // big endian compression
|
|
+ /* If IMM=0b00001, let pcv be the permute control vector required to
|
|
+ enable a left-indexed permute (vperm or xxperm) to implement a
|
|
+ compression of the sparse doubleword elements in a source vector
|
|
+ specified by the doubleword-element mask in VSR[VRB+32] into the
|
|
+ leftmost doubleword elements of a result vector.
|
|
+ */
|
|
+ for( index = 0; index < 2; index++) {
|
|
+ i = 1 - index;
|
|
+
|
|
+ if ( i == 1) {
|
|
+ src = src_hi;
|
|
+ half_sel = 0;
|
|
+ } else {
|
|
+ src = src_lo;
|
|
+ half_sel = 1;
|
|
+ }
|
|
+
|
|
+ sel_shift_by = 63;
|
|
+
|
|
+ if ( ((src >> sel_shift_by) & 0x1) == 1) {
|
|
+ if (j == 1) {
|
|
+ result[1] |= (8*index + 0x0) << 56; // double-word j, byte 0
|
|
+ result[1] |= (8*index + 0x1) << 48; // double-word j, byte 1
|
|
+ result[1] |= (8*index + 0x2) << 40; // double-word j, byte 2
|
|
+ result[1] |= (8*index + 0x3) << 32; // double-word j, byte 3
|
|
+ result[1] |= (8*index + 0x4) << 24; // double-word j, byte 4
|
|
+ result[1] |= (8*index + 0x5) << 16; // double-word j, byte 5
|
|
+ result[1] |= (8*index + 0x6) << 8; // double-word j, byte 6
|
|
+ result[1] |= (8*index + 0x7) << 0; // double-word j, byte 7
|
|
+ } else {
|
|
+ result[0] |= (8*index + 0x0) << 56; // double-word j, byte 0
|
|
+ result[0] |= (8*index + 0x1) << 48; // double-word j, byte 1
|
|
+ result[0] |= (8*index + 0x2) << 40; // double-word j, byte 2
|
|
+ result[0] |= (8*index + 0x3) << 32; // double-word j, byte 3
|
|
+ result[0] |= (8*index + 0x4) << 24; // double-word j, byte 4
|
|
+ result[0] |= (8*index + 0x5) << 16; // double-word j, byte 5
|
|
+ result[0] |= (8*index + 0x6) << 8; // double-word j, byte 6
|
|
+ result[0] |= (8*index + 0x7) << 0; // double-word j, byte 7
|
|
+ }
|
|
+ j++;
|
|
+ }
|
|
+ }
|
|
+ } else if (imm == 0b10) { //little-endian expansion
|
|
+ /* If IMM=0b00010, let pcv be the permute control vector required to
|
|
+ enable a right-indexed permute (vpermr or xxpermr) to implement an
|
|
+ expansion of the rightmost doubleword elements of a source vector
|
|
+ into the doubleword elements of a result vector specified by the
|
|
+ doubleword-element mask in VSR[VRB+32].
|
|
+ */
|
|
+
|
|
+ for( index = 0; index < 2; index++) {
|
|
+ i = index;
|
|
+
|
|
+ if ( i == 1) {
|
|
+ src = src_hi;
|
|
+ half_sel = 0;
|
|
+ } else {
|
|
+ src = src_lo;
|
|
+ half_sel = 1;
|
|
+ }
|
|
+
|
|
+ sel_shift_by = 63;
|
|
+
|
|
+ if ( ((src >> sel_shift_by) & 0x1) == 1) {
|
|
+ result[half_sel] |= (8*j+0) << 0; // double-word i, byte 0
|
|
+ result[half_sel] |= (8*j+1) << 8; // double-word i, byte 1
|
|
+ result[half_sel] |= (8*j+2) << 16; // double-word i, byte 2
|
|
+ result[half_sel] |= (8*j+3) << 24; // double-word i, byte 3
|
|
+ result[half_sel] |= (8*j+4) << 32; // double-word i, byte 4
|
|
+ result[half_sel] |= (8*j+5) << 40; // double-word i, byte 5
|
|
+ result[half_sel] |= (8*j+6) << 48; // double-word i, byte 6
|
|
+ result[half_sel] |= (8*j+7) << 56; // double-word i, byte 7
|
|
+ j++;
|
|
+ } else {
|
|
+ result[half_sel] |= (8*index + 0x10) << 0;
|
|
+ result[half_sel] |= (8*index + 0x11) << 8;
|
|
+ result[half_sel] |= (8*index + 0x12) << 16;
|
|
+ result[half_sel] |= (8*index + 0x13) << 24;
|
|
+ result[half_sel] |= (8*index + 0x14) << 32;
|
|
+ result[half_sel] |= (8*index + 0x15) << 40;
|
|
+ result[half_sel] |= (8*index + 0x16) << 48;
|
|
+ result[half_sel] |= (8*index + 0x17) << 56;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ } else if (imm == 0b11) { //little-endian compression
|
|
+ /* If IMM=0b00011, let pcv be the permute control vector required to
|
|
+ enable a right-indexed permute (vpermr or xxpermr) to implement a
|
|
+ compression of the sparse doubleword elements in a source vector
|
|
+ specified by the doubleword-element mask in VSR[VRB+32] into the
|
|
+ rightmost doubleword elements of a result vector. */
|
|
+ for( index = 0; index < 2; index++) {
|
|
+ i = index;
|
|
+
|
|
+ if ( i == 1) {
|
|
+ src = src_hi;
|
|
+ half_sel = 0;
|
|
+ } else {
|
|
+ src = src_lo;
|
|
+ half_sel = 1;
|
|
+ }
|
|
+
|
|
+ sel_shift_by = 63;
|
|
+
|
|
+ if (((src >> sel_shift_by) & 0x1) == 1) {
|
|
+ if (j == 1) {
|
|
+ result[0] |= (8*index + 0x0) << 0; // double-word j, byte 0
|
|
+ result[0] |= (8*index + 0x1) << 8; // double-word j, byte 1
|
|
+ result[0] |= (8*index + 0x2) << 16; // double-word j, byte 2
|
|
+ result[0] |= (8*index + 0x3) << 24; // double-word j, byte 3
|
|
+ result[0] |= (8*index + 0x4) << 32; // double-word j, byte 4
|
|
+ result[0] |= (8*index + 0x5) << 40; // double-word j, byte 5
|
|
+ result[0] |= (8*index + 0x6) << 48; // double-word j, byte 6
|
|
+ result[0] |= (8*index + 0x7) << 56; // double-word j, byte 7
|
|
+ } else {
|
|
+ result[1] |= (8*index + 0x0) << 0;
|
|
+ result[1] |= (8*index + 0x1) << 8;
|
|
+ result[1] |= (8*index + 0x2) << 16;
|
|
+ result[1] |= (8*index + 0x3) << 24;
|
|
+ result[1] |= (8*index + 0x4) << 32;
|
|
+ result[1] |= (8*index + 0x5) << 40;
|
|
+ result[1] |= (8*index + 0x6) << 48;
|
|
+ result[1] |= (8*index + 0x7) << 56;
|
|
+ }
|
|
+ j++;
|
|
+ }
|
|
+ }
|
|
+ } else {
|
|
+ vex_printf("ERROR, vector_gen_pvc_dword_mask_helper, imm value %u not supported.\n",
|
|
+ imm);
|
|
+ vassert(0);
|
|
+ }
|
|
+
|
|
+ write_VSX_entry( gst, reg_offset, result);
|
|
+}
|
|
|
|
/*------------------------------------------------*/
|
|
/*---- VSX Matrix signed integer GER functions ---*/
|
|
diff --git a/VEX/priv/guest_ppc_toIR.c b/VEX/priv/guest_ppc_toIR.c
|
|
index bcabf69dd..354be6b53 100644
|
|
--- a/VEX/priv/guest_ppc_toIR.c
|
|
+++ b/VEX/priv/guest_ppc_toIR.c
|
|
@@ -3322,6 +3322,7 @@ static IRExpr * locate_vector_ele_eq ( IRTemp src, IRExpr *value,
|
|
#define DFORM_IMMASK 0xffffffff
|
|
#define DSFORM_IMMASK 0xfffffffc
|
|
#define DQFORM_IMMASK 0xfffffff0
|
|
+#define DA8LSFORM_IMMASK 0x3fffffff // Algebraic 8LS Dform
|
|
|
|
#define ISA_3_1_PREFIX_CHECK if (prefix) {if (!allow_isa_3_1) goto decode_noIsa3_1;}
|
|
|
|
@@ -6109,6 +6110,87 @@ static void vsx_matrix_64bit_float_ger ( const VexAbiInfo* vbi,
|
|
stmt( IRStmt_Dirty(d) );
|
|
}
|
|
|
|
+static void vector_gen_pvc_mask ( const VexAbiInfo* vbi,
|
|
+ IRExpr *src, UInt IMM,
|
|
+ UInt opc2, UInt VSX_addr ) {
|
|
+ /* The function takes a 128-bit source and an immediate value. The function
|
|
+ calls a helper to execute the xxgenpcvbm, xxgenpcvhm, xxgenpcvwm,
|
|
+ xxgenpcvdm instruction. The instructions are not practical to do with
|
|
+ Iops. The instruction is implemented with a dirty helper that
|
|
+ calculates the 128-bit result and writes it directly into the guest
|
|
+ state VSX register.
|
|
+ */
|
|
+ IRTemp src_hi = newTemp( Ity_I64);
|
|
+ IRTemp src_lo = newTemp( Ity_I64);
|
|
+
|
|
+ IRDirty* d;
|
|
+
|
|
+ vassert( (VSX_addr >= 0) && (VSX_addr < 64) );
|
|
+ UInt reg_offset = offsetofPPCGuestState( guest_VSR0 )
|
|
+ + sizeof(U128) * VSX_addr;
|
|
+
|
|
+ assign( src_hi, unop( Iop_V128HIto64, src ) );
|
|
+ assign( src_lo, unop( Iop_V128to64, src ) );
|
|
+
|
|
+ IRExpr** args = mkIRExprVec_5(
|
|
+ IRExpr_GSPTR(),
|
|
+ mkexpr( src_hi ),
|
|
+ mkexpr( src_lo ),
|
|
+ mkU32( reg_offset ),
|
|
+ mkU64( IMM ) );
|
|
+
|
|
+ switch( opc2 ) {
|
|
+ case 0x394: // xxgenpcvbm
|
|
+ d = unsafeIRDirty_0_N (
|
|
+ 0 /*regparms*/,
|
|
+ "vector_gen_pvc_byte_mask_dirty_helper",
|
|
+ fnptr_to_fnentry( vbi,
|
|
+ &vector_gen_pvc_byte_mask_dirty_helper ),
|
|
+ args);
|
|
+ break;
|
|
+
|
|
+ case 0x395: // xxgenpcvhm
|
|
+ d = unsafeIRDirty_0_N (
|
|
+ 0 /*regparms*/,
|
|
+ "vector_gen_pvc_hword_mask_dirty_helper",
|
|
+ fnptr_to_fnentry( vbi,
|
|
+ &vector_gen_pvc_hword_mask_dirty_helper ),
|
|
+ args);
|
|
+ break;
|
|
+
|
|
+ case 0x3B4: // xxgenpcvwm
|
|
+ d = unsafeIRDirty_0_N (
|
|
+ 0 /*regparms*/,
|
|
+ "vector_gen_pvc_word_mask_dirty_helper",
|
|
+ fnptr_to_fnentry( vbi,
|
|
+ &vector_gen_pvc_word_mask_dirty_helper ),
|
|
+ args);
|
|
+ break;
|
|
+
|
|
+ case 0x3B5: // xxgenpcvdm
|
|
+ d = unsafeIRDirty_0_N (
|
|
+ 0 /*regparms*/,
|
|
+ "vector_gen_pvc_dword_mask_dirty_helper",
|
|
+ fnptr_to_fnentry( vbi,
|
|
+ &vector_gen_pvc_dword_mask_dirty_helper ),
|
|
+ args);
|
|
+ break;
|
|
+ default:
|
|
+ vex_printf("ERROR: Unkown instruction = %u in vector_gen_pvc_mask()\n",
|
|
+ opc2);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ d->nFxState = 1;
|
|
+ vex_bzero(&d->fxState, sizeof(d->fxState));
|
|
+ d->fxState[0].fx = Ifx_Modify;
|
|
+ d->fxState[0].size = sizeof(U128);
|
|
+ d->fxState[0].offset = reg_offset;
|
|
+
|
|
+ /* execute the dirty call, side-effecting guest state */
|
|
+ stmt( IRStmt_Dirty(d) );
|
|
+}
|
|
+
|
|
static IRExpr * UNSIGNED_CMP_GT_V128 ( IRExpr *vA, IRExpr *vB ) {
|
|
/* This function does an unsigned compare of two V128 values. The
|
|
* function is for use in 32-bit mode only as it is expensive. The
|
|
@@ -35227,6 +35309,54 @@ static Bool dis_vsx_accumulator_prefix ( UInt prefix, UInt theInstr,
|
|
return True;
|
|
}
|
|
|
|
+static Bool dis_vector_generate_pvc_from_mask ( UInt prefix,
|
|
+ UInt theInstr,
|
|
+ const VexAbiInfo* vbi )
|
|
+{
|
|
+ UChar XT_addr = ifieldRegXT(theInstr);
|
|
+ UChar vB_addr = ifieldRegB(theInstr);
|
|
+ IRTemp vB = newTemp( Ity_V128 );
|
|
+ UInt opc2 = ifieldOPClo10(theInstr);
|
|
+ UInt IMM = IFIELD(theInstr, (31-15), 5); // bits[11:15]
|
|
+
|
|
+ assign( vB, getVReg( vB_addr ) );
|
|
+
|
|
+ switch( opc2 ) {
|
|
+ case 0x394:
|
|
+ DIP("xxgenpcvbm v%u,v%u,%u\n", XT_addr, vB_addr, IMM);
|
|
+ /* vector_gen_pvc_mask uses a dirty helper to calculate the result and
|
|
+ write it to the VSX result register. */
|
|
+ vector_gen_pvc_mask( vbi, mkexpr( vB ), IMM, opc2, XT_addr );
|
|
+ break;
|
|
+
|
|
+ case 0x395:
|
|
+ DIP("xxgenpcvhm v%u,v%u,%u\n", XT_addr, vB_addr, IMM);
|
|
+ /* vector_gen_pvc_mask uses a dirty helper to calculate the result and
|
|
+ write it to the VSX result register. */
|
|
+ vector_gen_pvc_mask( vbi, mkexpr( vB ), IMM, opc2, XT_addr );
|
|
+ break;
|
|
+
|
|
+ case 0x3B4:
|
|
+ DIP("xxgenpcvwm v%u,v%u,%u\n", XT_addr, vB_addr, IMM);
|
|
+ /* vector_gen_pvc_mask uses a dirty helper to calculate the result and
|
|
+ write it to the VSX result register. */
|
|
+ vector_gen_pvc_mask( vbi, mkexpr( vB ), IMM, opc2, XT_addr );
|
|
+ break;
|
|
+
|
|
+ case 0x3B5:
|
|
+ DIP("xxgenpcvdm v%u,v%u,%u\n", XT_addr, vB_addr, IMM);
|
|
+ /* vector_gen_pvc_mask uses a dirty helper to calculate the result and
|
|
+ write it to the VSX result register. */
|
|
+ vector_gen_pvc_mask( vbi, mkexpr( vB ), IMM, opc2, XT_addr );
|
|
+ break;
|
|
+
|
|
+ default:
|
|
+ return False;
|
|
+ }
|
|
+
|
|
+ return True;
|
|
+}
|
|
+
|
|
static Int dis_nop_prefix ( UInt prefix, UInt theInstr )
|
|
{
|
|
Bool is_prefix = prefix_instruction( prefix );
|
|
@@ -35748,14 +35878,9 @@ DisResult disInstr_PPC_WRK (
|
|
}
|
|
goto decode_failure;
|
|
|
|
- case 0x31: // lfsu, stxv
|
|
+ case 0x31: // lfsu
|
|
if (!allow_F) goto decode_noF;
|
|
- if (prefix_instruction( prefix )) { // stxv
|
|
- if ( !(allow_isa_3_1) ) goto decode_noIsa3_1;
|
|
- if (dis_fp_pair_prefix( prefix, theInstr )) goto decode_success;
|
|
- } else { // lfsu
|
|
- if (dis_fp_load( prefix, theInstr )) goto decode_success;
|
|
- }
|
|
+ if (dis_fp_load( prefix, theInstr )) goto decode_success;
|
|
goto decode_failure;
|
|
|
|
case 0x32:
|
|
@@ -35842,7 +35967,6 @@ DisResult disInstr_PPC_WRK (
|
|
case 0x39: // pld, lxsd, lxssp, lfdp
|
|
{
|
|
UInt opc2tmp = ifieldOPC0o2(theInstr);
|
|
-
|
|
if (!allow_F) goto decode_noF;
|
|
if (prefix_instruction( prefix )) { // pld
|
|
if ( !(allow_isa_3_1) ) goto decode_noIsa3_1;
|
|
@@ -36125,12 +36249,6 @@ DisResult disInstr_PPC_WRK (
|
|
goto decode_failure;
|
|
}
|
|
|
|
- /* The vsxOpc2 returned is the "normalized" value, representing the
|
|
- * instructions secondary opcode as taken from the standard secondary
|
|
- * opcode field [21:30] (IBM notatition), even if the actual field
|
|
- * is non-standard. These normalized values are given in the opcode
|
|
- * appendices of the ISA 2.06 document.
|
|
- */
|
|
if ( ( opc2 == 0x168 ) && ( IFIELD( theInstr, 19, 2 ) == 0 ) )// xxspltib
|
|
{
|
|
/* This is a special case of the XX1 form where the RA, RB
|
|
@@ -36153,6 +36271,23 @@ DisResult disInstr_PPC_WRK (
|
|
goto decode_failure;
|
|
}
|
|
|
|
+ if ( ( opc2 == 0x394 ) || // xxgenpcvbm
|
|
+ ( opc2 == 0x395 ) || // xxgenpcvhm
|
|
+ ( opc2 == 0x3B4 ) || // xxgenpcvwm
|
|
+ ( opc2 == 0x3B5 ) ) { // xxgenpcvdm
|
|
+ if ( !(allow_isa_3_1) ) goto decode_noIsa3_1;
|
|
+ if (dis_vector_generate_pvc_from_mask( prefix, theInstr,
|
|
+ abiinfo ))
|
|
+ goto decode_success;
|
|
+ goto decode_failure;
|
|
+ }
|
|
+
|
|
+ /* The vsxOpc2 returned is the "normalized" value, representing the
|
|
+ * instruction's secondary opcode as taken from the standard secondary
|
|
+ * opcode field [21:30] (IBM notation), even if the actual field
|
|
+ * is non-standard. These normalized values are given in the opcode
|
|
+ * appendices of the ISA 2.06 document.
|
|
+ */
|
|
vsxOpc2 = get_VSX60_opc2(opc2, theInstr);
|
|
|
|
switch (vsxOpc2) {
|
|
commit 078f89e99b6f62e043f6138c6a7ae238befc1f2a
|
|
Author: Carl Love <cel@us.ibm.com>
|
|
Date: Fri Feb 26 15:46:55 2021 -0600
|
|
|
|
PPC64: Reduced-Precision - bfloat16 Outer Product & Format Conversion Operations
|
|
|
|
Add support for:
|
|
|
|
pmxvbf16ger2 Prefixed Masked VSX Vector bfloat16 GER (Rank-2 Update)
|
|
pmxvbf16ger2pp Prefixed Masked VSX Vector bfloat16 GER (Rank-2 Update) Positive
|
|
multiply, Positive accumulate
|
|
pmxvbf16ger2pn Prefixed Masked VSX Vector bfloat16 GER (Rank-2 Update) Positive
|
|
multiply, Negative accumulate
|
|
pmxvbf16ger2np Prefixed Masked VSX Vector bfloat16 GER (Rank-2 Update) Negative
|
|
multiply, Positive accumulate
|
|
pmxvbf16ger2nn Prefixed Masked VSX Vector bfloat16 GER (Rank-2 Update) Negative
|
|
multiply, Negative accumulate
|
|
xvbf16ger2 VSX Vector bfloat16 GER (Rank-2 Update)
|
|
xvbf16ger2pp VSX Vector bfloat16 GER (Rank-2 Update) Positive multiply, Positive
|
|
accumulate
|
|
xvbf16ger2pn VSX Vector bfloat16 GER (Rank-2 Update) Positive multiply, Negative
|
|
accumulate
|
|
xvbf16ger2np VSX Vector bfloat16 GER (Rank-2 Update) Negative multiply, Positive
|
|
accumulate
|
|
xvbf16ger2nn VSX Vector bfloat16 GER (Rank-2 Update) Negative multiply, Negative
|
|
accumulate
|
|
xvcvbf16sp VSX Vector Convert bfloat16 to Single-Precision format
|
|
xvcvspbf16 VSX Vector Convert with round Single-Precision to bfloat16 format
|
|
|
|
diff --git a/VEX/priv/guest_ppc_defs.h b/VEX/priv/guest_ppc_defs.h
|
|
index 54ce923a9..d36d6c07d 100644
|
|
--- a/VEX/priv/guest_ppc_defs.h
|
|
+++ b/VEX/priv/guest_ppc_defs.h
|
|
@@ -150,6 +150,8 @@ extern ULong convert_to_zoned_helper( ULong src_hi, ULong src_low,
|
|
ULong return_upper );
|
|
extern ULong convert_to_national_helper( ULong src, ULong return_upper );
|
|
extern ULong convert_from_zoned_helper( ULong src_hi, ULong src_low );
|
|
+extern ULong convert_from_floattobf16_helper( ULong src );
|
|
+extern ULong convert_from_bf16tofloat_helper( ULong src );
|
|
extern ULong convert_from_national_helper( ULong src_hi, ULong src_low );
|
|
extern ULong generate_C_FPCC_helper( ULong size, ULong src_hi, ULong src );
|
|
extern ULong extract_bits_under_mask_helper( ULong src, ULong mask,
|
|
@@ -201,6 +203,11 @@ extern void vector_gen_pvc_dword_mask_dirty_helper( VexGuestPPC64State* gst,
|
|
#define XVF16GER2PN 0b10010010
|
|
#define XVF16GER2NP 0b01010010
|
|
#define XVF16GER2NN 0b11010010
|
|
+#define XVBF16GER2 0b00110011
|
|
+#define XVBF16GER2PP 0b00110010
|
|
+#define XVBF16GER2PN 0b10110010
|
|
+#define XVBF16GER2NP 0b01110010
|
|
+#define XVBF16GER2NN 0b11110010
|
|
#define XVF32GER 0b00011011
|
|
#define XVF32GERPP 0b00011010
|
|
#define XVF32GERPN 0b10011010
|
|
diff --git a/VEX/priv/guest_ppc_helpers.c b/VEX/priv/guest_ppc_helpers.c
|
|
index 75497abb9..6bcee966d 100644
|
|
--- a/VEX/priv/guest_ppc_helpers.c
|
|
+++ b/VEX/priv/guest_ppc_helpers.c
|
|
@@ -1905,6 +1905,125 @@ static Double conv_f16_to_double( ULong input )
|
|
# endif
|
|
}
|
|
|
|
+#define BF16_SIGN_MASK 0x8000
|
|
+#define BF16_EXP_MASK 0x7F80
|
|
+#define BF16_FRAC_MASK 0x007F
|
|
+#define BF16_BIAS 127
|
|
+#define BF16_MAX_UNBIASED_EXP 127
|
|
+#define BF16_MIN_UNBIASED_EXP -126
|
|
+#define FLOAT_SIGN_MASK 0x80000000
|
|
+#define FLOAT_EXP_MASK 0x7F800000
|
|
+#define FLOAT_FRAC_MASK 0x007FFFFF
|
|
+#define FLOAT_FRAC_BIT8 0x00008000
|
|
+#define FLOAT_BIAS 127
|
|
+
|
|
+static Float conv_bf16_to_float( UInt input )
|
|
+{
|
|
+ /* input is 16-bit bfloat.
|
|
+ bias +127, exponent 8-bits, fraction 7-bits
|
|
+
|
|
+ output is 32-bit float.
|
|
+ bias +127, exponent 8-bits, fraction 23-bits
|
|
+ */
|
|
+
|
|
+ UInt input_exp, input_fraction, unbiased_exp;
|
|
+ UInt output_exp, output_fraction;
|
|
+ UInt sign;
|
|
+ union convert_t conv;
|
|
+
|
|
+ sign = (UInt)(input & BF16_SIGN_MASK);
|
|
+ input_exp = input & BF16_EXP_MASK;
|
|
+ unbiased_exp = (input_exp >> 7) - (UInt)BF16_BIAS;
|
|
+ input_fraction = input & BF16_FRAC_MASK;
|
|
+
|
|
+ if (((input_exp & BF16_EXP_MASK) == BF16_EXP_MASK) &&
|
|
+ (input_fraction != 0)) {
|
|
+ /* input is NaN or SNaN, exp all 1's, fraction != 0 */
|
|
+ output_exp = FLOAT_EXP_MASK;
|
|
+ output_fraction = input_fraction;
|
|
+
|
|
+ } else if(((input_exp & BF16_EXP_MASK) == BF16_EXP_MASK) &&
|
|
+ ( input_fraction == 0)) {
|
|
+ /* input is infinity, exp all 1's, fraction = 0 */
|
|
+ output_exp = FLOAT_EXP_MASK;
|
|
+ output_fraction = 0;
|
|
+
|
|
+ } else if((input_exp == 0) && (input_fraction == 0)) {
|
|
+ /* input is zero */
|
|
+ output_exp = 0;
|
|
+ output_fraction = 0;
|
|
+
|
|
+ } else if((input_exp == 0) && (input_fraction != 0)) {
|
|
+ /* input is denormal */
|
|
+ output_fraction = input_fraction;
|
|
+ output_exp = (-(Int)BF16_BIAS + (Int)FLOAT_BIAS ) << 23;
|
|
+
|
|
+ } else {
|
|
+ /* result is normal */
|
|
+ output_exp = (unbiased_exp + FLOAT_BIAS) << 23;
|
|
+ output_fraction = input_fraction;
|
|
+ }
|
|
+
|
|
+ conv.u32 = sign << (31 - 15) | output_exp | (output_fraction << (23-7));
|
|
+ return conv.f;
|
|
+}
|
|
+
|
|
+static UInt conv_float_to_bf16( UInt input )
|
|
+{
|
|
+ /* input is 32-bit float stored as unsigned 32-bit.
|
|
+ bias +127, exponent 8-bits, fraction 23-bits
|
|
+
|
|
+ output is 16-bit bfloat.
|
|
+ bias +127, exponent 8-bits, fraction 7-bits
|
|
+
|
|
+ If the unbiased exponent of the input is greater than the max floating
|
|
+ point unbiased exponent value, the result of the floating point 16-bit
|
|
+ value is infinity.
|
|
+ */
|
|
+
|
|
+ UInt input_exp, input_fraction;
|
|
+ UInt output_exp, output_fraction;
|
|
+ UInt result, sign;
|
|
+
|
|
+ sign = input & FLOAT_SIGN_MASK;
|
|
+ input_exp = input & FLOAT_EXP_MASK;
|
|
+ input_fraction = input & FLOAT_FRAC_MASK;
|
|
+
|
|
+ if (((input_exp & FLOAT_EXP_MASK) == FLOAT_EXP_MASK) &&
|
|
+ (input_fraction != 0)) {
|
|
+ /* input is NaN or SNaN, exp all 1's, fraction != 0 */
|
|
+ output_exp = BF16_EXP_MASK;
|
|
+ output_fraction = (ULong)input_fraction >> (23 - 7);
|
|
+ } else if (((input_exp & FLOAT_EXP_MASK) == FLOAT_EXP_MASK) &&
|
|
+ ( input_fraction == 0)) {
|
|
+ /* input is infinity, exp all 1's, fraction = 0 */
|
|
+ output_exp = BF16_EXP_MASK;
|
|
+ output_fraction = 0;
|
|
+ } else if ((input_exp == 0) && (input_fraction == 0)) {
|
|
+ /* input is zero */
|
|
+ output_exp = 0;
|
|
+ output_fraction = 0;
|
|
+ } else if ((input_exp == 0) && (input_fraction != 0)) {
|
|
+ /* input is denormal */
|
|
+ output_exp = 0;
|
|
+ output_fraction = (ULong)input_fraction >> (23 - 7);
|
|
+ } else {
|
|
+ /* result is normal */
|
|
+ output_exp = (input_exp - BF16_BIAS + FLOAT_BIAS) >> (23 - 7);
|
|
+ output_fraction = (ULong)input_fraction >> (23 - 7);
|
|
+
|
|
+ /* Round result. Look at the 8th bit position of the 32-bit floating
|
|
+ point fraction. The bfloat16 fraction is only 7 bits wide so if the 8th
|
|
+ bit of the F32 is a 1 we need to round up by adding 1 to the output
|
|
+ fraction. */
|
|
+ if ((input_fraction & FLOAT_FRAC_BIT8) == FLOAT_FRAC_BIT8)
|
|
+ /* Round the F16 fraction up by 1 */
|
|
+ output_fraction = output_fraction + 1;
|
|
+ }
|
|
+
|
|
+ result = sign >> (31 - 15) | output_exp | output_fraction;
|
|
+ return result;
|
|
+}
|
|
|
|
static Float conv_double_to_float( Double src )
|
|
{
|
|
@@ -1942,6 +2061,36 @@ static Float negate_float( Float input )
|
|
return -input;
|
|
}
|
|
|
|
+/* This C-helper takes a vector of two 32-bit floating point values
|
|
+ * and returns a vector containing two 16-bit bfloats.
|
|
+ input: word0 word1
|
|
+ output 0x0 hword1 0x0 hword3
|
|
+ Called from generated code.
|
|
+ */
|
|
+ULong convert_from_floattobf16_helper( ULong src ) {
|
|
+ ULong resultHi, resultLo;
|
|
+
|
|
+ resultHi = (ULong)conv_float_to_bf16( (UInt)(src >> 32));
|
|
+ resultLo = (ULong)conv_float_to_bf16( (UInt)(src & 0xFFFFFFFF));
|
|
+ return (resultHi << 32) | resultLo;
|
|
+
|
|
+}
|
|
+
|
|
+/* This C-helper takes a vector of two 16-bit bfloat values
|
|
+ * and returns a vector containing two 32-bit floats.
|
|
+ input: 0x0 hword1 0x0 hword3
|
|
+ output: word0 word1
|
|
+ */
|
|
+ULong convert_from_bf16tofloat_helper( ULong src ) {
|
|
+ ULong result;
|
|
+ union convert_t conv;
|
|
+ conv.f = conv_bf16_to_float( (UInt)(src >> 32) );
|
|
+ result = (ULong) conv.u32;
|
|
+ conv.f = conv_bf16_to_float( (UInt)(src & 0xFFFFFFFF));
|
|
+ result = (result << 32) | (ULong) conv.u32;
|
|
+ return result;
|
|
+ }
|
|
+
|
|
void vsx_matrix_16bit_float_ger_dirty_helper( VexGuestPPC64State* gst,
|
|
UInt offset_ACC,
|
|
ULong srcA_hi, ULong srcA_lo,
|
|
@@ -2002,24 +2151,44 @@ void vsx_matrix_16bit_float_ger_dirty_helper( VexGuestPPC64State* gst,
|
|
srcB_word[0][j] = (UInt)((srcB_lo >> (16-16*j)) & mask);
|
|
}
|
|
|
|
+ /* Note the isa is not consistent in the src naming. Will use the
|
|
+ naming src10, src11, src20, src21 used with xvf16ger2 instructions.
|
|
+ */
|
|
for( j = 0; j < 4; j++) {
|
|
if (((pmsk >> 1) & 0x1) == 0) {
|
|
src10 = 0;
|
|
src20 = 0;
|
|
} else {
|
|
- src10 = conv_f16_to_double((ULong)srcA_word[i][0]);
|
|
- src20 = conv_f16_to_double((ULong)srcB_word[j][0]);
|
|
+ if (( inst == XVF16GER2 ) || ( inst == XVF16GER2PP )
|
|
+ || ( inst == XVF16GER2PN ) || ( inst == XVF16GER2NP )
|
|
+ || ( inst == XVF16GER2NN )) {
|
|
+ src10 = conv_f16_to_double((ULong)srcA_word[i][0]);
|
|
+ src20 = conv_f16_to_double((ULong)srcB_word[j][0]);
|
|
+ } else {
|
|
+ /* Input is in bfloat format, result is stored in the
|
|
+ "traditional" 64-bit float format. */
|
|
+ src10 = (double)conv_bf16_to_float((ULong)srcA_word[i][0]);
|
|
+ src20 = (double)conv_bf16_to_float((ULong)srcB_word[j][0]);
|
|
+ }
|
|
}
|
|
|
|
if ((pmsk & 0x1) == 0) {
|
|
src11 = 0;
|
|
src21 = 0;
|
|
} else {
|
|
- src11 = conv_f16_to_double((ULong)srcA_word[i][1]);
|
|
- src21 = conv_f16_to_double((ULong)srcB_word[j][1]);
|
|
+ if (( inst == XVF16GER2 ) || ( inst == XVF16GER2PP )
|
|
+ || ( inst == XVF16GER2PN ) || ( inst == XVF16GER2NP )
|
|
+ || ( inst == XVF16GER2NN )) {
|
|
+ src11 = conv_f16_to_double((ULong)srcA_word[i][1]);
|
|
+ src21 = conv_f16_to_double((ULong)srcB_word[j][1]);
|
|
+ } else {
|
|
+ /* Input is in bfloat format, result is stored in the
|
|
+ "traditional" 64-bit float format. */
|
|
+ src11 = (double)conv_bf16_to_float((ULong)srcA_word[i][1]);
|
|
+ src21 = (double)conv_bf16_to_float((ULong)srcB_word[j][1]);
|
|
+ }
|
|
}
|
|
|
|
-
|
|
prod = src10 * src20;
|
|
msum = prod + src11 * src21;
|
|
|
|
@@ -2027,26 +2196,26 @@ void vsx_matrix_16bit_float_ger_dirty_helper( VexGuestPPC64State* gst,
|
|
/* Note, we do not track the exception handling bits
|
|
ox, ux, xx, si, mz, vxsnan and vximz in the FPSCR. */
|
|
|
|
- if ( inst == XVF16GER2 )
|
|
+ if (( inst == XVF16GER2 ) || ( inst == XVBF16GER2 ) )
|
|
result[j] = reinterpret_float_as_int(
|
|
conv_double_to_float(msum) );
|
|
|
|
- else if ( inst == XVF16GER2PP )
|
|
+ else if (( inst == XVF16GER2PP ) || (inst == XVBF16GER2PP ))
|
|
result[j] = reinterpret_float_as_int(
|
|
conv_double_to_float(msum)
|
|
+ acc_word[j] );
|
|
|
|
- else if ( inst == XVF16GER2PN )
|
|
+ else if (( inst == XVF16GER2PN ) || ( inst == XVBF16GER2PN ))
|
|
result[j] = reinterpret_float_as_int(
|
|
conv_double_to_float(msum)
|
|
+ negate_float( acc_word[j] ) );
|
|
|
|
- else if ( inst == XVF16GER2NP )
|
|
+ else if (( inst == XVF16GER2NP ) || ( inst == XVBF16GER2NP ))
|
|
result[j] = reinterpret_float_as_int(
|
|
conv_double_to_float( negate_double( msum ) )
|
|
+ acc_word[j] );
|
|
|
|
- else if ( inst == XVF16GER2NN )
|
|
+ else if (( inst == XVF16GER2NN ) || ( inst == XVBF16GER2NN ))
|
|
result[j] = reinterpret_float_as_int(
|
|
conv_double_to_float( negate_double( msum ) )
|
|
+ negate_float( acc_word[j] ) );
|
|
diff --git a/VEX/priv/guest_ppc_toIR.c b/VEX/priv/guest_ppc_toIR.c
|
|
index 354be6b53..20553a539 100644
|
|
--- a/VEX/priv/guest_ppc_toIR.c
|
|
+++ b/VEX/priv/guest_ppc_toIR.c
|
|
@@ -5688,6 +5688,57 @@ static IRExpr * convert_from_national ( const VexAbiInfo* vbi, IRExpr *src ) {
|
|
return mkexpr( result );
|
|
}
|
|
|
|
+static IRExpr * vector_convert_floattobf16 ( const VexAbiInfo* vbi,
|
|
+ IRExpr *src ) {
|
|
+ /* The function takes 128-bit value containing four 32-bit floats and
|
|
+ returns a 128-bit value containing four 16-bit bfloats in the lower
|
|
+ halfwords. */
|
|
+
|
|
+ IRTemp resultHi = newTemp( Ity_I64);
|
|
+ IRTemp resultLo = newTemp( Ity_I64);
|
|
+
|
|
+ assign( resultHi,
|
|
+ mkIRExprCCall( Ity_I64, 0 /*regparms*/,
|
|
+ "vector_convert_floattobf16_helper",
|
|
+ fnptr_to_fnentry( vbi,
|
|
+ &convert_from_floattobf16_helper ),
|
|
+ mkIRExprVec_1( unop( Iop_V128HIto64, src ) ) ) );
|
|
+
|
|
+ assign( resultLo,
|
|
+ mkIRExprCCall( Ity_I64, 0 /*regparms*/,
|
|
+ "vector_convert_floattobf16_helper",
|
|
+ fnptr_to_fnentry( vbi,
|
|
+ &convert_from_floattobf16_helper ),
|
|
+ mkIRExprVec_1( unop( Iop_V128to64, src ) ) ) );
|
|
+
|
|
+ return binop( Iop_64HLtoV128, mkexpr( resultHi ), mkexpr( resultLo ) );
|
|
+}
|
|
+
|
|
+static IRExpr * vector_convert_bf16tofloat ( const VexAbiInfo* vbi,
|
|
+ IRExpr *src ) {
|
|
+ /* The function takes 128-bit value containing four 16-bit bfloats in
|
|
+ the lower halfwords and returns a 128-bit value containing four
|
|
+ 32-bit floats. */
|
|
+ IRTemp resultHi = newTemp( Ity_I64);
|
|
+ IRTemp resultLo = newTemp( Ity_I64);
|
|
+
|
|
+ assign( resultHi,
|
|
+ mkIRExprCCall( Ity_I64, 0 /*regparms*/,
|
|
+ "vector_convert_bf16tofloat_helper",
|
|
+ fnptr_to_fnentry( vbi,
|
|
+ &convert_from_bf16tofloat_helper ),
|
|
+ mkIRExprVec_1( unop( Iop_V128HIto64, src ) ) ) );
|
|
+
|
|
+ assign( resultLo,
|
|
+ mkIRExprCCall( Ity_I64, 0 /*regparms*/,
|
|
+ "vector_convert_bf16tofloat_helper",
|
|
+ fnptr_to_fnentry( vbi,
|
|
+ &convert_from_bf16tofloat_helper ),
|
|
+ mkIRExprVec_1( unop( Iop_V128to64, src ) ) ) );
|
|
+
|
|
+ return binop( Iop_64HLtoV128, mkexpr( resultHi ), mkexpr( resultLo ) );
|
|
+}
|
|
+
|
|
static IRExpr * popcnt64 ( const VexAbiInfo* vbi,
|
|
IRExpr *src ){
|
|
/* The function takes a 64-bit source and counts the number of bits in the
|
|
@@ -5936,6 +5987,7 @@ static void vsx_matrix_ger ( const VexAbiInfo* vbi,
|
|
case XVI16GER2:
|
|
case XVI16GER2S:
|
|
case XVF16GER2:
|
|
+ case XVBF16GER2:
|
|
case XVF32GER:
|
|
AT_fx = Ifx_Write;
|
|
break;
|
|
@@ -5943,6 +5995,10 @@ static void vsx_matrix_ger ( const VexAbiInfo* vbi,
|
|
case XVI8GER4PP:
|
|
case XVI16GER2PP:
|
|
case XVI16GER2SPP:
|
|
+ case XVBF16GER2PP:
|
|
+ case XVBF16GER2PN:
|
|
+ case XVBF16GER2NP:
|
|
+ case XVBF16GER2NN:
|
|
case XVF16GER2PP:
|
|
case XVF16GER2PN:
|
|
case XVF16GER2NP:
|
|
@@ -23899,6 +23955,24 @@ dis_vxs_misc( UInt prefix, UInt theInstr, const VexAbiInfo* vbi, UInt opc2,
|
|
mkexpr( sub_element1 ),
|
|
mkexpr( sub_element0 ) ) ) );
|
|
|
|
+ } else if ((inst_select == 16) && !prefix) {
|
|
+ IRTemp result = newTemp(Ity_V128);
|
|
+ UChar xT_addr = ifieldRegXT ( theInstr );
|
|
+ UChar xB_addr = ifieldRegXB ( theInstr );
|
|
+ /* Convert 16-bit bfloat to 32-bit float, not a prefix inst */
|
|
+ DIP("xvcvbf16sp v%u,v%u\n", xT_addr, xB_addr);
|
|
+ assign( result, vector_convert_bf16tofloat( vbi, mkexpr( vB ) ) );
|
|
+ putVSReg( XT, mkexpr( result) );
|
|
+
|
|
+ } else if ((inst_select == 17) && !prefix) {
|
|
+ IRTemp result = newTemp(Ity_V128);
|
|
+ UChar xT_addr = ifieldRegXT ( theInstr );
|
|
+ UChar xB_addr = ifieldRegXB ( theInstr );
|
|
+ /* Convert 32-bit float to 16-bit bfloat, not a prefix inst */
|
|
+ DIP("xvcvspbf16 v%u,v%u\n", xT_addr, xB_addr);
|
|
+ assign( result, vector_convert_floattobf16( vbi, mkexpr( vB ) ) );
|
|
+ putVSReg( XT, mkexpr( result) );
|
|
+
|
|
} else if (inst_select == 23) {
|
|
DIP("xxbrd v%u, v%u\n", (UInt)XT, (UInt)XB);
|
|
|
|
@@ -34956,6 +35030,41 @@ static Bool dis_vsx_accumulator_prefix ( UInt prefix, UInt theInstr,
|
|
getVSReg( rB_addr ), AT,
|
|
( ( inst_prefix << 8 ) | XO ) );
|
|
break;
|
|
+ case XVBF16GER2:
|
|
+ DIP("xvbf16ger2 %u,r%u, r%u\n", AT, rA_addr, rB_addr);
|
|
+ vsx_matrix_ger( vbi, MATRIX_16BIT_FLOAT_GER,
|
|
+ getVSReg( rA_addr ),
|
|
+ getVSReg( rB_addr ), AT,
|
|
+ ( ( inst_prefix << 8 ) | XO ) );
|
|
+ break;
|
|
+ case XVBF16GER2PP:
|
|
+ DIP("xvbf16ger2pp %u,r%u, r%u\n", AT, rA_addr, rB_addr);
|
|
+ vsx_matrix_ger( vbi, MATRIX_16BIT_FLOAT_GER,
|
|
+ getVSReg( rA_addr ),
|
|
+ getVSReg( rB_addr ), AT,
|
|
+ ( ( inst_prefix << 8 ) | XO ) );
|
|
+ break;
|
|
+ case XVBF16GER2PN:
|
|
+ DIP("xvbf16ger2pn %u,r%u, r%u\n", AT, rA_addr, rB_addr);
|
|
+ vsx_matrix_ger( vbi, MATRIX_16BIT_FLOAT_GER,
|
|
+ getVSReg( rA_addr ),
|
|
+ getVSReg( rB_addr ), AT,
|
|
+ ( ( inst_prefix << 8 ) | XO ) );
|
|
+ break;
|
|
+ case XVBF16GER2NP:
|
|
+ DIP("xvbf16ger2np %u,r%u, r%u\n", AT, rA_addr, rB_addr);
|
|
+ vsx_matrix_ger( vbi, MATRIX_16BIT_FLOAT_GER,
|
|
+ getVSReg( rA_addr ),
|
|
+ getVSReg( rB_addr ), AT,
|
|
+ ( ( inst_prefix << 8 ) | XO ) );
|
|
+ break;
|
|
+ case XVBF16GER2NN:
|
|
+ DIP("xvbf16ger2nn %u,r%u, r%u\n", AT, rA_addr, rB_addr);
|
|
+ vsx_matrix_ger( vbi, MATRIX_16BIT_FLOAT_GER,
|
|
+ getVSReg( rA_addr ),
|
|
+ getVSReg( rB_addr ), AT,
|
|
+ ( ( inst_prefix << 8 ) | XO ) );
|
|
+ break;
|
|
case XVF32GER:
|
|
DIP("xvf32ger %u,r%u, r%u\n", AT, rA_addr, rB_addr);
|
|
vsx_matrix_ger( vbi, MATRIX_32BIT_FLOAT_GER,
|
|
@@ -35106,6 +35215,61 @@ static Bool dis_vsx_accumulator_prefix ( UInt prefix, UInt theInstr,
|
|
AT,
|
|
( (MASKS << 9 ) | ( inst_prefix << 8 ) | XO ) );
|
|
break;
|
|
+ case XVBF16GER2:
|
|
+ PMSK = IFIELD( prefix, 14, 2);
|
|
+ XMSK = IFIELD( prefix, 4, 4);
|
|
+ YMSK = IFIELD( prefix, 0, 4);
|
|
+ DIP("pmxvbf16ger2 %u,r%u, r%u\n", AT, rA_addr, rB_addr);
|
|
+ vsx_matrix_ger( vbi, MATRIX_16BIT_FLOAT_GER,
|
|
+ getVSReg( rA_addr ),
|
|
+ getVSReg( rB_addr ),
|
|
+ AT, ( (MASKS << 9 )
|
|
+ | ( inst_prefix << 8 ) | XO ) );
|
|
+ break;
|
|
+ case XVBF16GER2PP:
|
|
+ PMSK = IFIELD( prefix, 14, 2);
|
|
+ XMSK = IFIELD( prefix, 4, 4);
|
|
+ YMSK = IFIELD( prefix, 0, 4);
|
|
+ DIP("pmxvbf16ger2pp %u,r%u, r%u\n", AT, rA_addr, rB_addr);
|
|
+ vsx_matrix_ger( vbi, MATRIX_16BIT_FLOAT_GER,
|
|
+ getVSReg( rA_addr ),
|
|
+ getVSReg( rB_addr ),
|
|
+ AT, ( (MASKS << 9 )
|
|
+ | ( inst_prefix << 8 ) | XO ) );
|
|
+ break;
|
|
+ case XVBF16GER2PN:
|
|
+ PMSK = IFIELD( prefix, 14, 2);
|
|
+ XMSK = IFIELD( prefix, 4, 4);
|
|
+ YMSK = IFIELD( prefix, 0, 4);
|
|
+ DIP("pmxvbf16ger2pn %u,r%u, r%u\n", AT, rA_addr, rB_addr);
|
|
+ vsx_matrix_ger( vbi, MATRIX_16BIT_FLOAT_GER,
|
|
+ getVSReg( rA_addr ),
|
|
+ getVSReg( rB_addr ),
|
|
+ AT, ( (MASKS << 9 )
|
|
+ | ( inst_prefix << 8 ) | XO ) );
|
|
+ break;
|
|
+ case XVBF16GER2NP:
|
|
+ PMSK = IFIELD( prefix, 14, 2);
|
|
+ XMSK = IFIELD( prefix, 4, 4);
|
|
+ YMSK = IFIELD( prefix, 0, 4);
|
|
+ DIP("pmxvbf16ger2np %u,r%u, r%u\n", AT, rA_addr, rB_addr);
|
|
+ vsx_matrix_ger( vbi, MATRIX_16BIT_FLOAT_GER,
|
|
+ getVSReg( rA_addr ),
|
|
+ getVSReg( rB_addr ),
|
|
+ AT, ( (MASKS << 9 )
|
|
+ | ( inst_prefix << 8 ) | XO ) );
|
|
+ break;
|
|
+ case XVBF16GER2NN:
|
|
+ PMSK = IFIELD( prefix, 14, 2);
|
|
+ XMSK = IFIELD( prefix, 4, 4);
|
|
+ YMSK = IFIELD( prefix, 0, 4);
|
|
+ DIP("pmxvbf16ger2nn %u,r%u, r%u\n", AT, rA_addr, rB_addr);
|
|
+ vsx_matrix_ger( vbi, MATRIX_16BIT_FLOAT_GER,
|
|
+ getVSReg( rA_addr ),
|
|
+ getVSReg( rB_addr ),
|
|
+ AT, ( (MASKS << 9 )
|
|
+ | ( inst_prefix << 8 ) | XO ) );
|
|
+ break;
|
|
case XVF16GER2:
|
|
PMSK = IFIELD( prefix, 14, 2);
|
|
XMSK = IFIELD( prefix, 4, 4);
|
|
@@ -36181,6 +36345,11 @@ DisResult disInstr_PPC_WRK (
|
|
(opc2 == XVI4GER8PP) || // xvi4ger8pp
|
|
(opc2 == XVI8GER4) || // xvi8ger4
|
|
(opc2 == XVI8GER4PP) || // xvi8ger4pp
|
|
+ (opc2 == XVBF16GER2) || // xvbf16ger2
|
|
+ (opc2 == XVBF16GER2PP) || // xvbf16ger2pp
|
|
+ (opc2 == XVBF16GER2PN) || // xvbf16ger2pn
|
|
+ (opc2 == XVBF16GER2NP) || // xvbf16ger2np
|
|
+ (opc2 == XVBF16GER2NN) || // xvbf16ger2nn
|
|
(opc2 == XVF16GER2) || // xvf16ger2
|
|
(opc2 == XVF16GER2PP) || // xvf16ger2pp
|
|
(opc2 == XVF16GER2PN) || // xvf16ger2pn
|
|
commit e09fdaf569b975717465ed8043820d0198d4d47d
|
|
Author: Carl Love <cel@us.ibm.com>
|
|
Date: Fri Feb 26 16:05:12 2021 -0600
|
|
|
|
PPC64: Reduced-Precision: Missing Integer-based Outer Product Operations
|
|
|
|
Add support for:
|
|
|
|
pmxvi16ger2 VSX Vector 16-bit Signed Integer GER (rank-2 update), Prefixed
|
|
Masked
|
|
pmxvi16ger2pp VSX Vector 16-bit Signed Integer GER (rank-2 update) (Positive
|
|
multiply, Positive accumulate), Prefixed Masked
|
|
pmxvi8ger4spp VSX Vector 8-bit Signed/Unsigned Integer GER (rank-4 update) with
|
|
Saturation (Positive multiply, Positive accumulate), Prefixed Masked
|
|
xvi16ger2 VSX Vector 16-bit Signed Integer GER (rank-2 update)
|
|
xvi16ger2pp VSX Vector 16-bit Signed Integer GER (rank-2 update) (Positive
|
|
multiply, Positive accumulate)
|
|
xvi8ger4spp VSX Vector 8-bit Signed/Unsigned Integer GER (rank-4 update) with
|
|
Saturation (Positive multiply, Positive accumulate)
|
|
|
|
diff --git a/VEX/priv/guest_ppc_helpers.c b/VEX/priv/guest_ppc_helpers.c
|
|
index 6bcee966d..d8131eb60 100644
|
|
--- a/VEX/priv/guest_ppc_helpers.c
|
|
+++ b/VEX/priv/guest_ppc_helpers.c
|
|
@@ -1446,16 +1446,16 @@ static UInt exts4( UInt src)
|
|
return src & 0xF; /* make sure high order bits are zero */
|
|
}
|
|
|
|
-static UInt exts8( UInt src)
|
|
+static ULong exts8( UInt src)
|
|
{
|
|
- /* Input is an 8-bit value. Extend bit 7 to bits [31:8] */
|
|
+ /* Input is an 8-bit value. Extend bit 7 to bits [63:8] */
|
|
if (( src >> 7 ) & 0x1)
|
|
- return src | 0xFFFFFF00; /* sign bit is a 1, extend */
|
|
+ return src | 0xFFFFFFFFFFFFFF00ULL; /* sign bit is a 1, extend */
|
|
else
|
|
return src & 0xFF; /* make sure high order bits are zero */
|
|
}
|
|
|
|
-static UInt extz8( UInt src)
|
|
+static ULong extz8( UInt src)
|
|
{
|
|
/* Input is an 8-bit value. Extend src on the left with zeros. */
|
|
return src & 0xFF; /* make sure high order bits are zero */
|
|
@@ -1662,12 +1662,12 @@ void vsx_matrix_8bit_ger_dirty_helper( VexGuestPPC64State* gst,
|
|
ULong srcB_hi, ULong srcB_lo,
|
|
UInt masks_inst )
|
|
{
|
|
- UInt i, j, mask, sum, inst, acc_entry, prefix_inst;
|
|
+ UInt i, j, mask, inst, acc_entry, prefix_inst;
|
|
|
|
UInt srcA_bytes[4][4]; /* word, byte */
|
|
UInt srcB_bytes[4][4]; /* word, byte */
|
|
UInt acc_word[4];
|
|
- UInt prod0, prod1, prod2, prod3;
|
|
+ ULong prod0, prod1, prod2, prod3, sum;
|
|
UInt result[4];
|
|
UInt pmsk = 0;
|
|
UInt xmsk = 0;
|
|
@@ -1742,10 +1742,13 @@ void vsx_matrix_8bit_ger_dirty_helper( VexGuestPPC64State* gst,
|
|
sum = prod0 + prod1 + prod2 + prod3;
|
|
|
|
if ( inst == XVI8GER4 )
|
|
- result[j] = sum;
|
|
+ result[j] = chop64to32( sum );
|
|
|
|
else if ( inst == XVI8GER4PP )
|
|
- result[j] = sum + acc_word[j];
|
|
+ result[j] = chop64to32( sum + acc_word[j] );
|
|
+
|
|
+ else if ( inst == XVI8GER4SPP )
|
|
+ result[j] = clampS64toS32(sum + acc_word[j]);
|
|
|
|
} else {
|
|
result[j] = 0;
|
|
@@ -1821,7 +1824,7 @@ void vsx_matrix_16bit_ger_dirty_helper( VexGuestPPC64State* gst,
|
|
else
|
|
prod1 = exts16to64( srcA_word[i][1] )
|
|
* exts16to64( srcB_word[j][1] );
|
|
- /* sum is UInt so the result is choped to 32-bits */
|
|
+
|
|
sum = prod0 + prod1;
|
|
|
|
if ( inst == XVI16GER2 )
|
|
@@ -1830,13 +1833,11 @@ void vsx_matrix_16bit_ger_dirty_helper( VexGuestPPC64State* gst,
|
|
else if ( inst == XVI16GER2S )
|
|
result[j] = clampS64toS32( sum );
|
|
|
|
- else if ( inst == XVI16GER2PP ) {
|
|
+ else if ( inst == XVI16GER2PP )
|
|
result[j] = chop64to32( sum + acc_word[j] );
|
|
- }
|
|
|
|
- else if ( inst == XVI16GER2SPP ) {
|
|
+ else if ( inst == XVI16GER2SPP )
|
|
result[j] = clampS64toS32( sum + acc_word[j] );
|
|
- }
|
|
|
|
} else {
|
|
result[j] = 0;
|
|
diff --git a/VEX/priv/guest_ppc_toIR.c b/VEX/priv/guest_ppc_toIR.c
|
|
index 20553a539..e54f0f389 100644
|
|
--- a/VEX/priv/guest_ppc_toIR.c
|
|
+++ b/VEX/priv/guest_ppc_toIR.c
|
|
@@ -5993,6 +5993,7 @@ static void vsx_matrix_ger ( const VexAbiInfo* vbi,
|
|
break;
|
|
case XVI4GER8PP:
|
|
case XVI8GER4PP:
|
|
+ case XVI8GER4SPP:
|
|
case XVI16GER2PP:
|
|
case XVI16GER2SPP:
|
|
case XVBF16GER2PP:
|
|
@@ -34983,6 +34984,12 @@ static Bool dis_vsx_accumulator_prefix ( UInt prefix, UInt theInstr,
|
|
getVSReg( rA_addr ), getVSReg( rB_addr ),
|
|
AT, ( ( inst_prefix << 8 ) | XO ) );
|
|
break;
|
|
+ case XVI8GER4SPP:
|
|
+ DIP("xvi8ger4spp %u,r%u, r%u\n", AT, rA_addr, rB_addr);
|
|
+ vsx_matrix_ger( vbi, MATRIX_8BIT_INT_GER,
|
|
+ getVSReg( rA_addr ), getVSReg( rB_addr ),
|
|
+ AT, ( ( inst_prefix << 8 ) | XO ) );
|
|
+ break;
|
|
case XVI16GER2S:
|
|
DIP("xvi16ger2s %u,r%u, r%u\n", AT, rA_addr, rB_addr);
|
|
vsx_matrix_ger( vbi, MATRIX_16BIT_INT_GER,
|
|
@@ -34995,6 +35002,19 @@ static Bool dis_vsx_accumulator_prefix ( UInt prefix, UInt theInstr,
|
|
getVSReg( rA_addr ), getVSReg( rB_addr ),
|
|
AT, ( ( inst_prefix << 8 ) | XO ) );
|
|
break;
|
|
+ case XVI16GER2:
|
|
+ DIP("xvi16ger2 %u,r%u, r%u\n", AT, rA_addr, rB_addr);
|
|
+ vsx_matrix_ger( vbi, MATRIX_16BIT_INT_GER,
|
|
+ getVSReg( rA_addr ), getVSReg( rB_addr ),
|
|
+ AT, ( ( inst_prefix << 8 ) | XO ) );
|
|
+ break;
|
|
+ case XVI16GER2PP:
|
|
+ DIP("xvi16ger2pp %u,r%u, r%u\n", AT, rA_addr, rB_addr);
|
|
+ vsx_matrix_ger( vbi, MATRIX_16BIT_INT_GER,
|
|
+ getVSReg( rA_addr ), getVSReg( rB_addr ),
|
|
+ AT, ( ( inst_prefix << 8 ) | XO ) );
|
|
+ break;
|
|
+
|
|
case XVF16GER2:
|
|
DIP("xvf16ger2 %u,r%u, r%u\n", AT, rA_addr, rB_addr);
|
|
vsx_matrix_ger( vbi, MATRIX_16BIT_FLOAT_GER,
|
|
@@ -35193,6 +35213,39 @@ static Bool dis_vsx_accumulator_prefix ( UInt prefix, UInt theInstr,
|
|
AT,
|
|
( (MASKS << 9 ) | ( inst_prefix << 8 ) | XO ) );
|
|
break;
|
|
+ case XVI8GER4SPP:
|
|
+ PMSK = IFIELD( prefix, 12, 4);
|
|
+ XMSK = IFIELD( prefix, 4, 4);
|
|
+ YMSK = IFIELD( prefix, 0, 4);
|
|
+ DIP("pmxvi8ger4spp %u,r%u, r%u,%u,%u,%u\n",
|
|
+ AT, rA_addr, rB_addr, XMSK, YMSK, PMSK);
|
|
+ vsx_matrix_ger( vbi, MATRIX_8BIT_INT_GER,
|
|
+ getVSReg( rA_addr ), getVSReg( rB_addr ),
|
|
+ AT,
|
|
+ ( (MASKS << 9 ) | ( inst_prefix << 8 ) | XO ) );
|
|
+ break;
|
|
+ case XVI16GER2:
|
|
+ PMSK = IFIELD( prefix, 12, 4);
|
|
+ XMSK = IFIELD( prefix, 4, 4);
|
|
+ YMSK = IFIELD( prefix, 0, 4);
|
|
+ DIP("pmxvi16ger2 %u,r%u, r%u,%u,%u,%u\n",
|
|
+ AT, rA_addr, rB_addr, XMSK, YMSK, PMSK);
|
|
+ vsx_matrix_ger( vbi, MATRIX_16BIT_INT_GER,
|
|
+ getVSReg( rA_addr ), getVSReg( rB_addr ),
|
|
+ AT,
|
|
+ ( (MASKS << 9 ) | ( inst_prefix << 8 ) | XO ) );
|
|
+ break;
|
|
+ case XVI16GER2PP:
|
|
+ PMSK = IFIELD( prefix, 12, 4);
|
|
+ XMSK = IFIELD( prefix, 4, 4);
|
|
+ YMSK = IFIELD( prefix, 0, 4);
|
|
+ DIP("pmxvi16ger2pp %u,r%u, r%u,%u,%u,%u\n",
|
|
+ AT, rA_addr, rB_addr, XMSK, YMSK, PMSK);
|
|
+ vsx_matrix_ger( vbi, MATRIX_16BIT_INT_GER,
|
|
+ getVSReg( rA_addr ), getVSReg( rB_addr ),
|
|
+ AT,
|
|
+ ( (MASKS << 9 ) | ( inst_prefix << 8 ) | XO ) );
|
|
+ break;
|
|
case XVI16GER2S:
|
|
PMSK = IFIELD( prefix, 14, 2);
|
|
XMSK = IFIELD( prefix, 4, 4);
|
|
@@ -36345,6 +36398,9 @@ DisResult disInstr_PPC_WRK (
|
|
(opc2 == XVI4GER8PP) || // xvi4ger8pp
|
|
(opc2 == XVI8GER4) || // xvi8ger4
|
|
(opc2 == XVI8GER4PP) || // xvi8ger4pp
|
|
+ (opc2 == XVI8GER4SPP) || // xvi8ger4spp
|
|
+ (opc2 == XVI16GER2) || // xvi16ger2
|
|
+ (opc2 == XVI16GER2PP) || // xvi16ger2pp
|
|
(opc2 == XVBF16GER2) || // xvbf16ger2
|
|
(opc2 == XVBF16GER2PP) || // xvbf16ger2pp
|
|
(opc2 == XVBF16GER2PN) || // xvbf16ger2pn
|