From 04cdc29b007594a0e58ffef0c9dd87df3ea595ea Mon Sep 17 00:00:00 2001 From: Mark Wielaard Date: Wed, 14 Oct 2020 06:11:34 -0400 Subject: [PATCH] arm64 VEX frontend and backend support for Iop_M{Add,Sub}F{32,64} The arm64 frontend used to translate the scalar fmadd, fmsub, fnmadd and fnmsub instructions into separate addition/subtraction and multiplication instructions, which caused rounding issues. This patch turns them into Iop_M{Add,Sub}F{32,64} instructions (with some arguments negated), and the backend now emits fmadd or fmsub instructions. Alexandra Hajkova added tests and fixed up the implementation to make sure rounding (and sign) are correct now. https://bugs.kde.org/show_bug.cgi?id=426014 --- VEX/priv/guest_arm64_toIR.c | 58 ++++++++--- VEX/priv/host_arm64_defs.c | 136 +++++++++++++++++++++++++- VEX/priv/host_arm64_defs.h | 30 ++++++ VEX/priv/host_arm64_isel.c | 39 ++++++++ none/tests/arm64/Makefile.am | 6 +- none/tests/arm64/fmadd_sub.c | 98 +++++++++++++++++++ none/tests/arm64/fmadd_sub.stderr.exp | 0 none/tests/arm64/fmadd_sub.stdout.exp | 125 +++++++++++++++++++++++ none/tests/arm64/fmadd_sub.vgtest | 3 + 9 files changed, 479 insertions(+), 16 deletions(-) create mode 100644 none/tests/arm64/fmadd_sub.c create mode 100644 none/tests/arm64/fmadd_sub.stderr.exp create mode 100644 none/tests/arm64/fmadd_sub.stdout.exp create mode 100644 none/tests/arm64/fmadd_sub.vgtest diff --git a/VEX/priv/guest_arm64_toIR.c b/VEX/priv/guest_arm64_toIR.c index 556b85a6a..d242d43c0 100644 --- a/VEX/priv/guest_arm64_toIR.c +++ b/VEX/priv/guest_arm64_toIR.c @@ -286,6 +286,12 @@ static IRExpr* triop ( IROp op, IRExpr* a1, IRExpr* a2, IRExpr* a3 ) return IRExpr_Triop(op, a1, a2, a3); } +static IRExpr* qop ( IROp op, IRExpr* a1, IRExpr* a2, + IRExpr* a3, IRExpr* a4 ) +{ + return IRExpr_Qop(op, a1, a2, a3, a4); +} + static IRExpr* loadLE ( IRType ty, IRExpr* addr ) { return IRExpr_Load(Iend_LE, ty, addr); @@ -532,6 +538,22 @@ static IROp mkADDF ( IRType ty ) { } } +static 
IROp mkFMADDF ( IRType ty ) { + switch (ty) { + case Ity_F32: return Iop_MAddF32; + case Ity_F64: return Iop_MAddF64; + default: vpanic("mkFMADDF"); + } +} + +static IROp mkFMSUBF ( IRType ty ) { + switch (ty) { + case Ity_F32: return Iop_MSubF32; + case Ity_F64: return Iop_MSubF64; + default: vpanic("mkFMSUBF"); + } +} + static IROp mkSUBF ( IRType ty ) { switch (ty) { case Ity_F32: return Iop_SubF32; @@ -14368,30 +14390,40 @@ Bool dis_AdvSIMD_fp_data_proc_3_source(/*MB_OUT*/DisResult* dres, UInt insn) where Fx=Dx when sz=1, Fx=Sx when sz=0 -----SPEC------ ----IMPL---- - fmadd a + n * m a + n * m - fmsub a + (-n) * m a - n * m - fnmadd (-a) + (-n) * m -(a + n * m) - fnmsub (-a) + n * m -(a - n * m) + fmadd a + n * m fmadd (a, n, m) + fmsub a + (-n) * m fmsub (a, n, m) + fnmadd (-a) + (-n) * m fmadd (-a, -n, m) + fnmsub (-a) + n * m fmadd (-a, n, m) + + Note Iop_MAdd/SubF32/64 take arguments in the order: rm, N, M, A */ Bool isD = (ty & 1) == 1; UInt ix = (bitO1 << 1) | bitO0; IRType ity = isD ? 
Ity_F64 : Ity_F32; - IROp opADD = mkADDF(ity); - IROp opSUB = mkSUBF(ity); - IROp opMUL = mkMULF(ity); + IROp opFMADD = mkFMADDF(ity); + IROp opFMSUB = mkFMSUBF(ity); IROp opNEG = mkNEGF(ity); IRTemp res = newTemp(ity); IRExpr* eA = getQRegLO(aa, ity); IRExpr* eN = getQRegLO(nn, ity); IRExpr* eM = getQRegLO(mm, ity); IRExpr* rm = mkexpr(mk_get_IR_rounding_mode()); - IRExpr* eNxM = triop(opMUL, rm, eN, eM); switch (ix) { - case 0: assign(res, triop(opADD, rm, eA, eNxM)); break; - case 1: assign(res, triop(opSUB, rm, eA, eNxM)); break; - case 2: assign(res, unop(opNEG, triop(opADD, rm, eA, eNxM))); break; - case 3: assign(res, unop(opNEG, triop(opSUB, rm, eA, eNxM))); break; - default: vassert(0); + case 0: /* FMADD */ + assign(res, qop(opFMADD, rm, eN, eM, eA)); + break; + case 1: /* FMSUB */ + assign(res, qop(opFMSUB, rm, eN, eM, eA)); + break; + case 2: /* FNMADD */ + assign(res, qop(opFMADD, rm, unop(opNEG, eN), eM, + unop(opNEG,eA))); + break; + case 3: /* FNMSUB */ + assign(res, qop(opFMADD, rm, eN, eM, unop(opNEG, eA))); + break; + default: + vassert(0); } putQReg128(dd, mkV128(0x0000)); putQRegLO(dd, mkexpr(res)); diff --git a/VEX/priv/host_arm64_defs.c b/VEX/priv/host_arm64_defs.c index e4ef56986..13b497f60 100644 --- a/VEX/priv/host_arm64_defs.c +++ b/VEX/priv/host_arm64_defs.c @@ -546,6 +546,14 @@ static const HChar* showARM64FpBinOp ( ARM64FpBinOp op ) { } } +static const HChar* showARM64FpTriOp ( ARM64FpTriOp op ) { + switch (op) { + case ARM64fpt_FMADD: return "fmadd"; + case ARM64fpt_FMSUB: return "fmsub"; + default: vpanic("showARM64FpTriOp"); + } +} + static const HChar* showARM64FpUnaryOp ( ARM64FpUnaryOp op ) { switch (op) { case ARM64fpu_NEG: return "neg "; @@ -1154,6 +1162,28 @@ ARM64Instr* ARM64Instr_VBinS ( ARM64FpBinOp op, i->ARM64in.VBinS.argR = argR; return i; } +ARM64Instr* ARM64Instr_VTriD ( ARM64FpTriOp op, + HReg dst, HReg arg1, HReg arg2, HReg arg3 ) { + ARM64Instr* i = LibVEX_Alloc_inline(sizeof(ARM64Instr)); + i->tag = ARM64in_VTriD; 
+ i->ARM64in.VTriD.op = op; + i->ARM64in.VTriD.dst = dst; + i->ARM64in.VTriD.arg1 = arg1; + i->ARM64in.VTriD.arg2 = arg2; + i->ARM64in.VTriD.arg3 = arg3; + return i; +} +ARM64Instr* ARM64Instr_VTriS ( ARM64FpTriOp op, + HReg dst, HReg arg1, HReg arg2, HReg arg3 ) { + ARM64Instr* i = LibVEX_Alloc_inline(sizeof(ARM64Instr)); + i->tag = ARM64in_VTriS; + i->ARM64in.VTriS.op = op; + i->ARM64in.VTriS.dst = dst; + i->ARM64in.VTriS.arg1 = arg1; + i->ARM64in.VTriS.arg2 = arg2; + i->ARM64in.VTriS.arg3 = arg3; + return i; +} ARM64Instr* ARM64Instr_VCmpD ( HReg argL, HReg argR ) { ARM64Instr* i = LibVEX_Alloc_inline(sizeof(ARM64Instr)); i->tag = ARM64in_VCmpD; @@ -1756,6 +1786,26 @@ void ppARM64Instr ( const ARM64Instr* i ) { vex_printf(", "); ppHRegARM64asSreg(i->ARM64in.VBinS.argR); return; + case ARM64in_VTriD: + vex_printf("%s ", showARM64FpTriOp(i->ARM64in.VTriD.op)); + ppHRegARM64(i->ARM64in.VTriD.dst); + vex_printf(", "); + ppHRegARM64(i->ARM64in.VTriD.arg1); + vex_printf(", "); + ppHRegARM64(i->ARM64in.VTriD.arg2); + vex_printf(", "); + ppHRegARM64(i->ARM64in.VTriD.arg3); + return; + case ARM64in_VTriS: + vex_printf("%s ", showARM64FpTriOp(i->ARM64in.VTriS.op)); + ppHRegARM64asSreg(i->ARM64in.VTriS.dst); + vex_printf(", "); + ppHRegARM64asSreg(i->ARM64in.VTriS.arg1); + vex_printf(", "); + ppHRegARM64asSreg(i->ARM64in.VTriS.arg2); + vex_printf(", "); + ppHRegARM64asSreg(i->ARM64in.VTriS.arg3); + return; case ARM64in_VCmpD: vex_printf("fcmp "); ppHRegARM64(i->ARM64in.VCmpD.argL); @@ -2197,6 +2247,18 @@ void getRegUsage_ARM64Instr ( HRegUsage* u, const ARM64Instr* i, Bool mode64 ) addHRegUse(u, HRmRead, i->ARM64in.VBinS.argL); addHRegUse(u, HRmRead, i->ARM64in.VBinS.argR); return; + case ARM64in_VTriD: + addHRegUse(u, HRmWrite, i->ARM64in.VTriD.dst); + addHRegUse(u, HRmRead, i->ARM64in.VTriD.arg1); + addHRegUse(u, HRmRead, i->ARM64in.VTriD.arg2); + addHRegUse(u, HRmRead, i->ARM64in.VTriD.arg3); + return; + case ARM64in_VTriS: + addHRegUse(u, HRmWrite, 
i->ARM64in.VTriS.dst); + addHRegUse(u, HRmRead, i->ARM64in.VTriS.arg1); + addHRegUse(u, HRmRead, i->ARM64in.VTriS.arg2); + addHRegUse(u, HRmRead, i->ARM64in.VTriS.arg3); + return; case ARM64in_VCmpD: addHRegUse(u, HRmRead, i->ARM64in.VCmpD.argL); addHRegUse(u, HRmRead, i->ARM64in.VCmpD.argR); @@ -2454,6 +2516,18 @@ void mapRegs_ARM64Instr ( HRegRemap* m, ARM64Instr* i, Bool mode64 ) i->ARM64in.VBinS.argL = lookupHRegRemap(m, i->ARM64in.VBinS.argL); i->ARM64in.VBinS.argR = lookupHRegRemap(m, i->ARM64in.VBinS.argR); return; + case ARM64in_VTriD: + i->ARM64in.VTriD.dst = lookupHRegRemap(m, i->ARM64in.VTriD.dst); + i->ARM64in.VTriD.arg1 = lookupHRegRemap(m, i->ARM64in.VTriD.arg1); + i->ARM64in.VTriD.arg2 = lookupHRegRemap(m, i->ARM64in.VTriD.arg2); + i->ARM64in.VTriD.arg3 = lookupHRegRemap(m, i->ARM64in.VTriD.arg3); + return; + case ARM64in_VTriS: + i->ARM64in.VTriS.dst = lookupHRegRemap(m, i->ARM64in.VTriS.dst); + i->ARM64in.VTriS.arg1 = lookupHRegRemap(m, i->ARM64in.VTriS.arg1); + i->ARM64in.VTriS.arg2 = lookupHRegRemap(m, i->ARM64in.VTriS.arg2); + i->ARM64in.VTriS.arg3 = lookupHRegRemap(m, i->ARM64in.VTriS.arg3); + return; case ARM64in_VCmpD: i->ARM64in.VCmpD.argL = lookupHRegRemap(m, i->ARM64in.VCmpD.argL); i->ARM64in.VCmpD.argR = lookupHRegRemap(m, i->ARM64in.VCmpD.argR); @@ -2812,7 +2886,8 @@ static inline UInt qregEnc ( HReg r ) #define X11110011 BITS8(1,1,1,1,0,0,1,1) #define X11110101 BITS8(1,1,1,1,0,1,0,1) #define X11110111 BITS8(1,1,1,1,0,1,1,1) - +#define X11111000 BITS8(1,1,1,1,1,0,0,0) +#define X11111010 BITS8(1,1,1,1,1,0,1,0) /* --- 4 fields --- */ @@ -2972,6 +3047,27 @@ static inline UInt X_3_6_1_6_6_5_5 ( UInt f1, UInt f2, UInt f3, } +static inline UInt X_3_8_5_1_5_5_5 ( UInt f1, UInt f2, UInt f3, UInt f4, + UInt f5, UInt f6, UInt f7 ) { + vassert(3+8+5+1+5+5+5 == 32); + vassert(f1 < (1<<3)); + vassert(f2 < (1<<8)); + vassert(f3 < (1<<5)); + vassert(f4 < (1<<1)); + vassert(f5 < (1<<5)); + vassert(f6 < (1<<5)); + vassert(f7 < (1<<5)); + UInt w = 0; + w 
= (w << 3) | f1; + w = (w << 8) | f2; + w = (w << 5) | f3; + w = (w << 1) | f4; + w = (w << 5) | f5; + w = (w << 5) | f6; + w = (w << 5) | f7; + return w; +} + //ZZ #define X0000 BITS4(0,0,0,0) //ZZ #define X0001 BITS4(0,0,0,1) //ZZ #define X0010 BITS4(0,0,1,0) @@ -4339,6 +4435,44 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc, = X_3_8_5_6_5_5(X000, X11110001, sM, (b1512 << 2) | X10, sN, sD); goto done; } + case ARM64in_VTriD: { + /* 31 20 15 14 9 4 + 000 11111 010 m 0 a n d FMADD Dd,Dn,Dm,Da + ---------------- 1 ------ FMSUB ----------- + */ + UInt dD = dregEnc(i->ARM64in.VTriD.dst); + UInt dN = dregEnc(i->ARM64in.VTriD.arg1); + UInt dM = dregEnc(i->ARM64in.VTriD.arg2); + UInt dA = dregEnc(i->ARM64in.VTriD.arg3); + UInt b15 = 2; /* impossible */ + switch (i->ARM64in.VTriD.op) { + case ARM64fpt_FMADD: b15 = 0; break; + case ARM64fpt_FMSUB: b15 = 1; break; + default: goto bad; + } + vassert(b15 < 2); + *p++ = X_3_8_5_1_5_5_5(X000, X11111010, dM, b15, dA, dN, dD); + goto done; + } + case ARM64in_VTriS: { + /* 31 20 15 14 9 4 + 000 11111 000 m 0 a n d FMADD Sd,Sn,Sm,Sa + ---------------- 1 ------ FMSUB ----------- + */ + UInt sD = dregEnc(i->ARM64in.VTriS.dst); + UInt sN = dregEnc(i->ARM64in.VTriS.arg1); + UInt sM = dregEnc(i->ARM64in.VTriS.arg2); + UInt sA = dregEnc(i->ARM64in.VTriS.arg3); + UInt b15 = 2; /* impossible */ + switch (i->ARM64in.VTriS.op) { + case ARM64fpt_FMADD: b15 = 0; break; + case ARM64fpt_FMSUB: b15 = 1; break; + default: goto bad; + } + vassert(b15 < 2); + *p++ = X_3_8_5_1_5_5_5(X000, X11111000, sM, b15, sA, sN, sD); + goto done; + } case ARM64in_VCmpD: { /* 000 11110 01 1 m 00 1000 n 00 000 FCMP Dn, Dm */ UInt dN = dregEnc(i->ARM64in.VCmpD.argL); diff --git a/VEX/priv/host_arm64_defs.h b/VEX/priv/host_arm64_defs.h index 05dba7ab8..5a82564ce 100644 --- a/VEX/priv/host_arm64_defs.h +++ b/VEX/priv/host_arm64_defs.h @@ -289,6 +289,14 @@ typedef } ARM64FpBinOp; +typedef + enum { + ARM64fpt_FMADD=105, + ARM64fpt_FMSUB, + ARM64fpt_INVALID + } + 
ARM64FpTriOp; + typedef enum { ARM64fpu_NEG=110, @@ -498,6 +506,8 @@ typedef ARM64in_VUnaryS, ARM64in_VBinD, ARM64in_VBinS, + ARM64in_VTriD, + ARM64in_VTriS, ARM64in_VCmpD, ARM64in_VCmpS, ARM64in_VFCSel, @@ -799,6 +809,22 @@ typedef HReg argL; HReg argR; } VBinS; + /* 64-bit FP ternary arithmetic */ + struct { + ARM64FpTriOp op; + HReg dst; + HReg arg1; + HReg arg2; + HReg arg3; + } VTriD; + /* 32-bit FP ternary arithmetic */ + struct { + ARM64FpTriOp op; + HReg dst; + HReg arg1; + HReg arg2; + HReg arg3; + } VTriS; /* 64-bit FP compare */ struct { HReg argL; @@ -970,6 +996,10 @@ extern ARM64Instr* ARM64Instr_VUnaryD ( ARM64FpUnaryOp op, HReg dst, HReg src ); extern ARM64Instr* ARM64Instr_VUnaryS ( ARM64FpUnaryOp op, HReg dst, HReg src ); extern ARM64Instr* ARM64Instr_VBinD ( ARM64FpBinOp op, HReg, HReg, HReg ); extern ARM64Instr* ARM64Instr_VBinS ( ARM64FpBinOp op, HReg, HReg, HReg ); +extern ARM64Instr* ARM64Instr_VTriD ( ARM64FpTriOp op, HReg dst, + HReg, HReg, HReg ); +extern ARM64Instr* ARM64Instr_VTriS ( ARM64FpTriOp op, HReg dst, + HReg, HReg, HReg ); extern ARM64Instr* ARM64Instr_VCmpD ( HReg argL, HReg argR ); extern ARM64Instr* ARM64Instr_VCmpS ( HReg argL, HReg argR ); extern ARM64Instr* ARM64Instr_VFCSel ( HReg dst, HReg argL, HReg argR, diff --git a/VEX/priv/host_arm64_isel.c b/VEX/priv/host_arm64_isel.c index 2f19eab81..da1218715 100644 --- a/VEX/priv/host_arm64_isel.c +++ b/VEX/priv/host_arm64_isel.c @@ -3255,6 +3255,25 @@ static HReg iselDblExpr_wrk ( ISelEnv* env, IRExpr* e ) } } + if (e->tag == Iex_Qop) { + IRQop* qop = e->Iex.Qop.details; + ARM64FpTriOp triop = ARM64fpt_INVALID; + switch (qop->op) { + case Iop_MAddF64: triop = ARM64fpt_FMADD; break; + case Iop_MSubF64: triop = ARM64fpt_FMSUB; break; + default: break; + } + if (triop != ARM64fpt_INVALID) { + HReg N = iselDblExpr(env, qop->arg2); + HReg M = iselDblExpr(env, qop->arg3); + HReg A = iselDblExpr(env, qop->arg4); + HReg dst = newVRegD(env); + set_FPCR_rounding_mode(env, qop->arg1); + 
addInstr(env, ARM64Instr_VTriD(triop, dst, N, M, A)); + return dst; + } + } + if (e->tag == Iex_ITE) { /* ITE(ccexpr, iftrue, iffalse) */ ARM64CondCode cc; @@ -3450,6 +3469,26 @@ static HReg iselFltExpr_wrk ( ISelEnv* env, IRExpr* e ) return dst; } + if (e->tag == Iex_Qop) { + IRQop* qop = e->Iex.Qop.details; + ARM64FpTriOp triop = ARM64fpt_INVALID; + switch (qop->op) { + case Iop_MAddF32: triop = ARM64fpt_FMADD; break; + case Iop_MSubF32: triop = ARM64fpt_FMSUB; break; + default: break; + } + + if (triop != ARM64fpt_INVALID) { + HReg N = iselFltExpr(env, qop->arg2); + HReg M = iselFltExpr(env, qop->arg3); + HReg A = iselFltExpr(env, qop->arg4); + HReg dst = newVRegD(env); + set_FPCR_rounding_mode(env, qop->arg1); + addInstr(env, ARM64Instr_VTriS(triop, dst, N, M, A)); + return dst; + } + } + ppIRExpr(e); vpanic("iselFltExpr_wrk"); } diff --git a/none/tests/arm64/Makefile.am b/none/tests/arm64/Makefile.am index 7b3ebbdca..4ecab36ad 100644 --- a/none/tests/arm64/Makefile.am +++ b/none/tests/arm64/Makefile.am @@ -10,14 +10,16 @@ EXTRA_DIST = \ integer.stdout.exp integer.stderr.exp integer.vgtest \ memory.stdout.exp memory.stderr.exp memory.vgtest \ atomics_v81.stdout.exp atomics_v81.stderr.exp atomics_v81.vgtest \ - simd_v81.stdout.exp simd_v81.stderr.exp simd_v81.vgtest + simd_v81.stdout.exp simd_v81.stderr.exp simd_v81.vgtest \ + fmadd_sub.stdout.exp fmadd_sub.stderr.exp fmadd_sub.vgtest check_PROGRAMS = \ allexec \ cvtf_imm \ fp_and_simd \ integer \ - memory + memory \ + fmadd_sub if BUILD_ARMV8_CRC_TESTS check_PROGRAMS += crc32 diff --git a/none/tests/arm64/fmadd_sub.c b/none/tests/arm64/fmadd_sub.c new file mode 100644 index 000000000..dcab22d1b --- /dev/null +++ b/none/tests/arm64/fmadd_sub.c @@ -0,0 +1,98 @@ +#include +#include +#include +#include + +#define COUNT 5 + +static void +print_float(const char *ident, float x) +{ + union + { + float f; + uint32_t i; + } u; + + u.f = x; + printf("%s = %08x = %.17g\n", ident, u.i, x); +} + +static void 
+print_double(const char *ident, double x) +{ + union + { + double f; + uint64_t i; + } u; + + u.f = x; + printf("%s = %016lx = %.17g\n", ident, u.i, x); +} + +int +main(int argc, char **argv) +{ + float x[] = { 55, 0.98076171874999996, 0, 1, 0xFFFFFFFF } ; + float y[] = { 0.69314718055994529, 1.015625, 0, 1, 0xFFFFFFFF }; + float z[] = { 38.123094930796988, 1, 0, 1, 0xFFFFFFFF }; + float dst = -5; + + double dx[] = { 55, 0.98076171874999996, 0, 1, 0xFFFFFFFF } ; + double dy[] = { 0.69314718055994529, 1.015625, 0, 1, 0xFFFFFFFF }; + double dz[] = { 38.123094930796988, 1, 0, 1, 0xFFFFFFFF }; + double ddst= -5; + + int i; + + for (i = 0; i < COUNT; i++) { + //32bit variant + asm("fmadd %s0, %s1, %s2, %s3\n;" : "=w"(dst) : "w"(x[i]), "w"(y[i]), "w"(z[i])); + printf("FMADD 32bit: dst = z + x * y\n"); + printf("%f = %f + %f * %f\n", dst, z[i], x[i], y[i]); + print_float("dst", dst); + + // Floating-point negated fused multiply-add + asm("fnmadd %s0, %s1, %s2, %s3\n;" : "=w"(dst) : "w"(x[i]), "w"(y[i]), "w"(z[i])); + printf("FNMADD 32bit: dst = -z + (-x) * y\n"); + printf("%f = -%f + (-%f) * %f\n", dst, z[i], x[i], y[i]); + print_float("dst", dst); + + asm("fmsub %s0, %s1, %s2, %s3\n;" : "=w"(dst) : "w"(x[i]), "w"(y[i]), "w"(z[i])); + printf("FMSUB 32bit: dst = z + (-x) * y\n"); + printf("%f = %f + (-%f) * %f\n", dst, z[i], x[i], y[i]); + print_float("dst", dst); + + asm("fnmsub %s0, %s1, %s2, %s3\n;" : "=w"(dst) : "w"(x[i]), "w"(y[i]), "w"(z[i])); + printf("FNMSUB 32bit: dst = -z + x * y\n"); + printf("%f = -%f + %f * %f\n", dst, z[i], x[i], y[i]); + print_float("dst", dst); + + //64bit variant + asm("fmadd %d0, %d1, %d2, %d3\n;" : "=w"(ddst) : "w"(dx[i]), "w"(dy[i]), "w"(dz[i])); + printf("FMADD 64bit: dst = z + x * y\n"); + printf("%f = %f + %f * %f\n", ddst, dz[i], dx[i], dy[i]); + print_double("dst", ddst); + + asm("fnmadd %d0, %d1, %d2, %d3\n;" : "=w"(ddst) : "w"(dx[i]), "w"(dy[i]), "w"(dz[i])); + printf("FNMADD 64bit: dst = -z + (-x) * y\n"); + printf("%f = -%f - 
%f * %f\n", ddst, dz[i], dx[i], dy[i]); + print_double("dst", ddst); + + asm("fmsub %d0, %d1, %d2, %d3\n;" : "=w"(ddst) : "w"(dx[i]), "w"(dy[i]), "w"(dz[i])); + printf("FMSUB 64bit: dst = z + (-x) * y\n"); + printf("%f = %f + (-%f) * %f\n", ddst, dz[i], dx[i], dy[i]); + print_double("dst", ddst); + + asm("fnmsub %d0, %d1, %d2, %d3\n;" : "=w"(ddst) : "w"(dx[i]), "w"(dy[i]), "w"(dz[i])); + printf("FNMSUB 64bit: dst = -z + x * y\n"); + printf("%f = -%f + %f * %f\n", ddst, dz[i], dx[i], dy[i]); + print_double("dst", ddst); + + printf("\n"); + } + + return 0; +} + diff --git a/none/tests/arm64/fmadd_sub.stderr.exp b/none/tests/arm64/fmadd_sub.stderr.exp new file mode 100644 index 000000000..e69de29bb diff --git a/none/tests/arm64/fmadd_sub.stdout.exp b/none/tests/arm64/fmadd_sub.stdout.exp new file mode 100644 index 000000000..f1824b12b --- /dev/null +++ b/none/tests/arm64/fmadd_sub.stdout.exp @@ -0,0 +1,125 @@ +FMADD 32bit: dst = z + x * y +76.246193 = 38.123096 + 55.000000 * 0.693147 +dst = 42987e0d = 76.246192932128906 +FNMADD 32bit: dst = -z + (-x) * y +-76.246193 = -38.123096 + (-55.000000) * 0.693147 +dst = c2987e0d = -76.246192932128906 +FMSUB 32bit: dst = z + (-x) * y +0.000001 = 38.123096 + (-55.000000) * 0.693147 +dst = 35c00000 = 1.430511474609375e-06 +FNMSUB 32bit: dst = -z + x * y +-0.000001 = -38.123096 + 55.000000 * 0.693147 +dst = b5c00000 = -1.430511474609375e-06 +FMADD 64bit: dst = z + x * y +76.246190 = 38.123095 + 55.000000 * 0.693147 +dst = 40530fc1931f09c9 = 76.246189861593976 +FNMADD 64bit: dst = -z + (-x) * y +-76.246190 = -38.123095 - 55.000000 * 0.693147 +dst = c0530fc1931f09c9 = -76.246189861593976 +FMSUB 64bit: dst = z + (-x) * y +-0.000000 = 38.123095 + (-55.000000) * 0.693147 +dst = bce9000000000000 = -2.7755575615628914e-15 +FNMSUB 64bit: dst = -z + x * y +0.000000 = -38.123095 + 55.000000 * 0.693147 +dst = 3ce9000000000000 = 2.7755575615628914e-15 + +FMADD 32bit: dst = z + x * y +1.996086 = 1.000000 + 0.980762 * 1.015625 +dst = 3fff7fc0 = 
1.9960861206054688 +FNMADD 32bit: dst = -z + (-x) * y +-1.996086 = -1.000000 + (-0.980762) * 1.015625 +dst = bfff7fc0 = -1.9960861206054688 +FMSUB 32bit: dst = z + (-x) * y +0.003914 = 1.000000 + (-0.980762) * 1.015625 +dst = 3b80401a = 0.00391389150172472 +FNMSUB 32bit: dst = -z + x * y +-0.003914 = -1.000000 + 0.980762 * 1.015625 +dst = bb80401a = -0.00391389150172472 +FMADD 64bit: dst = z + x * y +1.996086 = 1.000000 + 0.980762 * 1.015625 +dst = 3fffeff800000000 = 1.9960861206054688 +FNMADD 64bit: dst = -z + (-x) * y +-1.996086 = -1.000000 - 0.980762 * 1.015625 +dst = bfffeff800000000 = -1.9960861206054688 +FMSUB 64bit: dst = z + (-x) * y +0.003914 = 1.000000 + (-0.980762) * 1.015625 +dst = 3f70080000000034 = 0.0039138793945312951 +FNMSUB 64bit: dst = -z + x * y +-0.003914 = -1.000000 + 0.980762 * 1.015625 +dst = bf70080000000034 = -0.0039138793945312951 + +FMADD 32bit: dst = z + x * y +0.000000 = 0.000000 + 0.000000 * 0.000000 +dst = 00000000 = 0 +FNMADD 32bit: dst = -z + (-x) * y +-0.000000 = -0.000000 + (-0.000000) * 0.000000 +dst = 80000000 = -0 +FMSUB 32bit: dst = z + (-x) * y +0.000000 = 0.000000 + (-0.000000) * 0.000000 +dst = 00000000 = 0 +FNMSUB 32bit: dst = -z + x * y +0.000000 = -0.000000 + 0.000000 * 0.000000 +dst = 00000000 = 0 +FMADD 64bit: dst = z + x * y +0.000000 = 0.000000 + 0.000000 * 0.000000 +dst = 0000000000000000 = 0 +FNMADD 64bit: dst = -z + (-x) * y +-0.000000 = -0.000000 - 0.000000 * 0.000000 +dst = 8000000000000000 = -0 +FMSUB 64bit: dst = z + (-x) * y +0.000000 = 0.000000 + (-0.000000) * 0.000000 +dst = 0000000000000000 = 0 +FNMSUB 64bit: dst = -z + x * y +0.000000 = -0.000000 + 0.000000 * 0.000000 +dst = 0000000000000000 = 0 + +FMADD 32bit: dst = z + x * y +2.000000 = 1.000000 + 1.000000 * 1.000000 +dst = 40000000 = 2 +FNMADD 32bit: dst = -z + (-x) * y +-2.000000 = -1.000000 + (-1.000000) * 1.000000 +dst = c0000000 = -2 +FMSUB 32bit: dst = z + (-x) * y +0.000000 = 1.000000 + (-1.000000) * 1.000000 +dst = 00000000 = 0 +FNMSUB 32bit: 
dst = -z + x * y +0.000000 = -1.000000 + 1.000000 * 1.000000 +dst = 00000000 = 0 +FMADD 64bit: dst = z + x * y +2.000000 = 1.000000 + 1.000000 * 1.000000 +dst = 4000000000000000 = 2 +FNMADD 64bit: dst = -z + (-x) * y +-2.000000 = -1.000000 - 1.000000 * 1.000000 +dst = c000000000000000 = -2 +FMSUB 64bit: dst = z + (-x) * y +0.000000 = 1.000000 + (-1.000000) * 1.000000 +dst = 0000000000000000 = 0 +FNMSUB 64bit: dst = -z + x * y +0.000000 = -1.000000 + 1.000000 * 1.000000 +dst = 0000000000000000 = 0 + +FMADD 32bit: dst = z + x * y +18446744073709551616.000000 = 4294967296.000000 + 4294967296.000000 * 4294967296.000000 +dst = 5f800000 = 1.8446744073709552e+19 +FNMADD 32bit: dst = -z + (-x) * y +-18446744073709551616.000000 = -4294967296.000000 + (-4294967296.000000) * 4294967296.000000 +dst = df800000 = -1.8446744073709552e+19 +FMSUB 32bit: dst = z + (-x) * y +-18446744073709551616.000000 = 4294967296.000000 + (-4294967296.000000) * 4294967296.000000 +dst = df800000 = -1.8446744073709552e+19 +FNMSUB 32bit: dst = -z + x * y +18446744073709551616.000000 = -4294967296.000000 + 4294967296.000000 * 4294967296.000000 +dst = 5f800000 = 1.8446744073709552e+19 +FMADD 64bit: dst = z + x * y +18446744069414584320.000000 = 4294967295.000000 + 4294967295.000000 * 4294967295.000000 +dst = 43efffffffe00000 = 1.8446744069414584e+19 +FNMADD 64bit: dst = -z + (-x) * y +-18446744069414584320.000000 = -4294967295.000000 - 4294967295.000000 * 4294967295.000000 +dst = c3efffffffe00000 = -1.8446744069414584e+19 +FMSUB 64bit: dst = z + (-x) * y +-18446744060824649728.000000 = 4294967295.000000 + (-4294967295.000000) * 4294967295.000000 +dst = c3efffffffa00000 = -1.844674406082465e+19 +FNMSUB 64bit: dst = -z + x * y +18446744060824649728.000000 = -4294967295.000000 + 4294967295.000000 * 4294967295.000000 +dst = 43efffffffa00000 = 1.844674406082465e+19 + diff --git a/none/tests/arm64/fmadd_sub.vgtest b/none/tests/arm64/fmadd_sub.vgtest new file mode 100644 index 000000000..b4c53eea4 --- 
/dev/null +++ b/none/tests/arm64/fmadd_sub.vgtest @@ -0,0 +1,3 @@ +prog: fmadd_sub +prereq: test -x fmadd_sub +vgopts: -q -- 2.18.4