3620 lines
131 KiB
Diff
3620 lines
131 KiB
Diff
2007-02-10 H.J. Lu <hongjiu.lu@intel.com>
|
||
|
||
* gcc.target/i386/sse4a-extract.c: Add "LL" to 64bit constants.
|
||
* gcc.target/i386/sse4a-insert.c: Likewise.
|
||
|
||
2007-02-08 Harsha Jagasia <harsha.jagasia@amd.com>
|
||
|
||
* config/i386/xmmintrin.h: Make inclusion of emmintrin.h
|
||
conditional to __SSE2__.
|
||
* config/i386/emmintrin.h: Generate #error if __SSE2__ is not
|
||
defined.
|
||
* config/i386/pmmintrin.h: Generate #error if __SSE3__ is not
|
||
defined.
|
||
* config/i386/tmmintrin.h: Generate #error if __SSSE3__ is not
|
||
defined.
|
||
|
||
2007-02-05 Harsha Jagasia <harsha.jagasia@amd.com>
|
||
|
||
* config/i386/athlon.md (athlon_fldxf_k8, athlon_fld_k8,
|
||
athlon_fstxf_k8, athlon_fst_k8, athlon_fist, athlon_fmov,
|
||
athlon_fadd_load, athlon_fadd_load_k8, athlon_fadd, athlon_fmul,
|
||
athlon_fmul_load, athlon_fmul_load_k8, athlon_fsgn,
|
||
athlon_fdiv_load, athlon_fdiv_load_k8, athlon_fdiv_k8,
|
||
athlon_fpspc_load, athlon_fpspc, athlon_fcmov_load,
|
||
athlon_fcmov_load_k8, athlon_fcmov_k8, athlon_fcomi_load_k8,
|
||
athlon_fcomi, athlon_fcom_load_k8, athlon_fcom): Added amdfam10.
|
||
|
||
* config/i386/i386.md (x86_sahf_1, cmpfp_i_mixed, cmpfp_i_sse,
|
||
cmpfp_i_i387, cmpfp_iu_mixed, cmpfp_iu_sse, cmpfp_iu_387,
|
||
swapsi, swaphi_1, swapqi_1, swapdi_rex64, fix_truncsfdi_sse,
|
||
fix_truncdfdi_sse, fix_truncsfsi_sse, fix_truncdfsi_sse,
|
||
x86_fldcw_1, floatsisf2_mixed, floatsisf2_sse, floatdisf2_mixed,
|
||
floatdisf2_sse, floatsidf2_mixed, floatsidf2_sse,
|
||
floatdidf2_mixed, floatdidf2_sse, muldi3_1_rex64, mulsi3_1,
|
||
mulsi3_1_zext, mulhi3_1, mulqi3_1, umulqihi3_1, mulqihi3_insn,
|
||
umulditi3_insn, umulsidi3_insn, mulditi3_insn, mulsidi3_insn,
|
||
umuldi3_highpart_rex64, umulsi3_highpart_insn,
|
||
umulsi3_highpart_zext, smuldi3_highpart_rex64,
|
||
smulsi3_highpart_insn, smulsi3_highpart_zext, x86_64_shld,
|
||
x86_shld_1, x86_64_shrd, sqrtsf2_mixed, sqrtsf2_sse,
|
||
sqrtsf2_i387, sqrtdf2_mixed, sqrtdf2_sse, sqrtdf2_i387,
|
||
sqrtextendsfdf2_i387, sqrtxf2, sqrtextendsfxf2_i387,
|
||
sqrtextenddfxf2_i387): Added amdfam10_decode.
|
||
|
||
* config/i386/athlon.md (athlon_idirect_amdfam10,
|
||
athlon_ivector_amdfam10, athlon_idirect_load_amdfam10,
|
||
athlon_ivector_load_amdfam10, athlon_idirect_both_amdfam10,
|
||
athlon_ivector_both_amdfam10, athlon_idirect_store_amdfam10,
|
||
athlon_ivector_store_amdfam10): New define_insn_reservation.
|
||
(athlon_idirect_loadmov, athlon_idirect_movstore): Added
|
||
amdfam10.
|
||
|
||
* config/i386/athlon.md (athlon_call_amdfam10,
|
||
athlon_pop_amdfam10, athlon_lea_amdfam10): New
|
||
define_insn_reservation.
|
||
(athlon_branch, athlon_push, athlon_leave_k8, athlon_imul_k8,
|
||
athlon_imul_k8_DI, athlon_imul_mem_k8, athlon_imul_mem_k8_DI,
|
||
athlon_idiv, athlon_idiv_mem, athlon_str): Added amdfam10.
|
||
|
||
* config/i386/athlon.md (athlon_sseld_amdfam10,
|
||
athlon_mmxld_amdfam10, athlon_ssest_amdfam10,
|
||
athlon_mmxssest_short_amdfam10): New define_insn_reservation.
|
||
|
||
* config/i386/athlon.md (athlon_sseins_amdfam10): New
|
||
define_insn_reservation.
|
||
* config/i386/i386.md (sseins): Added sseins to define_attr type
|
||
and define_attr unit.
|
||
* config/i386/sse.md: Set type attribute to sseins for insertq
|
||
and insertqi.
|
||
|
||
* config/i386/athlon.md (sselog_load_amdfam10, sselog_amdfam10,
|
||
ssecmpvector_load_amdfam10, ssecmpvector_amdfam10,
|
||
ssecomi_load_amdfam10, ssecomi_amdfam10,
|
||
sseaddvector_load_amdfam10, sseaddvector_amdfam10): New
|
||
define_insn_reservation.
|
||
(ssecmp_load_k8, ssecmp, sseadd_load_k8, sseadd): Added amdfam10.
|
||
|
||
* config/i386/athlon.md (cvtss2sd_load_amdfam10,
|
||
cvtss2sd_amdfam10, cvtps2pd_load_amdfam10, cvtps2pd_amdfam10,
|
||
cvtsi2sd_load_amdfam10, cvtsi2ss_load_amdfam10,
|
||
cvtsi2sd_amdfam10, cvtsi2ss_amdfam10, cvtsd2ss_load_amdfam10,
|
||
cvtsd2ss_amdfam10, cvtpd2ps_load_amdfam10, cvtpd2ps_amdfam10,
|
||
cvtsX2si_load_amdfam10, cvtsX2si_amdfam10): New
|
||
define_insn_reservation.
|
||
|
||
* config/i386/sse.md (cvtsi2ss, cvtsi2ssq, cvtss2si,
|
||
cvtss2siq, cvttss2si, cvttss2siq, cvtsi2sd, cvtsi2sdq,
|
||
cvtsd2si, cvtsd2siq, cvttsd2si, cvttsd2siq,
|
||
cvtpd2dq, cvttpd2dq, cvtsd2ss, cvtss2sd,
|
||
cvtpd2ps, cvtps2pd): Added amdfam10_decode attribute.
|
||
|
||
* config/i386/athlon.md (athlon_ssedivvector_amdfam10,
|
||
athlon_ssedivvector_load_amdfam10, athlon_ssemulvector_amdfam10,
|
||
athlon_ssemulvector_load_amdfam10): New define_insn_reservation.
|
||
(athlon_ssediv, athlon_ssediv_load_k8, athlon_ssemul,
|
||
athlon_ssemul_load_k8): Added amdfam10.
|
||
|
||
* config/i386/i386.h (TARGET_SSE_UNALIGNED_MOVE_OPTIMAL): New macro.
|
||
(x86_sse_unaligned_move_optimal): New variable.
|
||
|
||
* config/i386/i386.c (x86_sse_unaligned_move_optimal): Enable for
|
||
m_AMDFAM10.
|
||
(ix86_expand_vector_move_misalign): Add code to generate movupd/movups
|
||
for unaligned vector SSE double/single precision loads for AMDFAM10.
|
||
|
||
* config/i386/i386.h (TARGET_AMDFAM10): New macro.
|
||
(TARGET_CPU_CPP_BUILTINS): Add code for amdfam10.
|
||
Define TARGET_CPU_DEFAULT_amdfam10.
|
||
(TARGET_CPU_DEFAULT_NAMES): Add amdfam10.
|
||
(processor_type): Add PROCESSOR_AMDFAM10.
|
||
|
||
* config/i386/i386.md: Add amdfam10 as a new cpu attribute to match
|
||
processor_type in config/i386/i386.h.
|
||
Enable imul peepholes for TARGET_AMDFAM10.
|
||
|
||
* config.gcc: Add support for --with-cpu option for amdfam10.
|
||
|
||
* config/i386/i386.c (amdfam10_cost): New variable.
|
||
(m_AMDFAM10): New macro.
|
||
(m_ATHLON_K8_AMDFAM10): New macro.
|
||
(x86_use_leave, x86_push_memory, x86_movx, x86_unroll_strlen,
|
||
x86_cmove, x86_3dnow_a, x86_deep_branch, x86_use_simode_fiop,
|
||
x86_promote_QImode, x86_integer_DFmode_moves,
|
||
x86_partial_reg_dependency, x86_memory_mismatch_stall,
|
||
x86_accumulate_outgoing_args, x86_arch_always_fancy_math_387,
|
||
x86_sse_partial_reg_dependency, x86_sse_typeless_stores,
|
||
x86_use_ffreep, x86_use_incdec, x86_four_jump_limit,
|
||
x86_schedule, x86_use_bt, x86_cmpxchg16b, x86_pad_returns):
|
||
Enable/disable for amdfam10.
|
||
(override_options): Add amdfam10_cost to processor_target_table.
|
||
Set up PROCESSOR_AMDFAM10 for amdfam10 entry in
|
||
processor_alias_table.
|
||
(ix86_issue_rate): Add PROCESSOR_AMDFAM10.
|
||
(ix86_adjust_cost): Add code for amdfam10.
|
||
|
||
* config/i386/i386.opt: Add new Advanced Bit Manipulation (-mabm)
|
||
instruction set feature flag. Add new (-mpopcnt) flag for popcnt
|
||
instruction. Add new SSE4A (-msse4a) instruction set feature flag.
|
||
* config/i386/i386.h: Add builtin definition for SSE4A.
|
||
* config/i386/i386.md: Add support for ABM instructions
|
||
(popcnt and lzcnt).
|
||
* config/i386/sse.md: Add support for SSE4A instructions
|
||
(movntss, movntsd, extrq, insertq).
|
||
* config/i386/i386.c: Add support for ABM and SSE4A builtins.
|
||
Add -march=amdfam10 flag.
|
||
* config/i386/ammintrin.h: Add support for SSE4A intrinsics.
|
||
* doc/invoke.texi: Add documentation on flags for sse4a, abm, popcnt
|
||
and amdfam10.
|
||
* doc/extend.texi: Add documentation for SSE4A builtins.
|
||
|
||
2007-02-05 Dwarakanath Rajagopal <dwarak.rajagopal@amd.com>
|
||
|
||
* gcc.dg/i386-cpuid.h: Test whether SSE4A is supported
|
||
for running tests.
|
||
* gcc.target/i386/sse4a-extract.c: New test.
|
||
* gcc.target/i386/sse4a-insert.c: New test.
|
||
* gcc.target/i386/sse4a-montsd.c: New test.
|
||
* gcc.target/i386/sse4a-montss.c: New test.
|
||
|
||
2006-12-15 H.J. Lu <hongjiu.lu@intel.com>
|
||
|
||
* gcc.dg/i386-cpuid.h (bit_SSSE3): New.
|
||
|
||
2006-11-30 H.J. Lu <hongjiu.lu@intel.com>
|
||
|
||
* gcc.dg/i386-cpuid.h (bit_SSE3): New.
|
||
(i386_get_cpuid): New function.
|
||
(i386_cpuid_ecx): Likewise.
|
||
(i386_cpuid_edx): Likewise.
|
||
(i386_cpuid): Updated to call i386_cpuid_edx.
|
||
|
||
--- gcc/doc/extend.texi.jj 2007-02-09 16:18:25.000000000 +0100
|
||
+++ gcc/doc/extend.texi 2007-02-09 21:26:06.000000000 +0100
|
||
@@ -6931,6 +6931,23 @@ v4si __builtin_ia32_pabsd128 (v4si)
|
||
v8hi __builtin_ia32_pabsw128 (v8hi)
|
||
@end smallexample
|
||
|
||
+The following built-in functions are available when @option{-msse4a} is used.
|
||
+
|
||
+@smallexample
|
||
+void _mm_stream_sd (double*,__m128d);
|
||
+Generates the @code{movntsd} machine instruction.
|
||
+void _mm_stream_ss (float*,__m128);
|
||
+Generates the @code{movntss} machine instruction.
|
||
+__m128i _mm_extract_si64 (__m128i, __m128i);
|
||
+Generates the @code{extrq} machine instruction with only SSE register operands.
|
||
+__m128i _mm_extracti_si64 (__m128i, int, int);
|
||
+Generates the @code{extrq} machine instruction with SSE register and immediate operands.
|
||
+__m128i _mm_insert_si64 (__m128i, __m128i);
|
||
+Generates the @code{insertq} machine instruction with only SSE register operands.
|
||
+__m128i _mm_inserti_si64 (__m128i, __m128i, int, int);
|
||
+Generates the @code{insertq} machine instruction with SSE register and immediate operands.
|
||
+@end smallexample
|
||
+
|
||
The following built-in functions are available when @option{-m3dnow} is used.
|
||
All of them generate the machine instruction that is part of the name.
|
||
|
||
--- gcc/doc/invoke.texi.jj 2007-02-09 16:18:25.000000000 +0100
|
||
+++ gcc/doc/invoke.texi 2007-02-09 21:56:44.000000000 +0100
|
||
@@ -522,7 +522,7 @@ Objective-C and Objective-C++ Dialects}.
|
||
-mno-fp-ret-in-387 -msoft-float -msvr3-shlib @gol
|
||
-mno-wide-multiply -mrtd -malign-double @gol
|
||
-mpreferred-stack-boundary=@var{num} @gol
|
||
--mmmx -msse -msse2 -msse3 -mssse3 -m3dnow @gol
|
||
+-mmmx -msse -msse2 -msse3 -mssse3 -msse4a -m3dnow -mpopcnt -mabm @gol
|
||
-mthreads -mno-align-stringops -minline-all-stringops @gol
|
||
-mpush-args -maccumulate-outgoing-args -m128bit-long-double @gol
|
||
-m96bit-long-double -mregparm=@var{num} -msseregparm @gol
|
||
@@ -9062,6 +9062,10 @@ instruction set support.
|
||
@item k8, opteron, athlon64, athlon-fx
|
||
AMD K8 core based CPUs with x86-64 instruction set support. (This supersets
|
||
MMX, SSE, SSE2, 3dNOW!, enhanced 3dNOW! and 64-bit instruction set extensions.)
|
||
+@item amdfam10
|
||
+AMD Family 10 core based CPUs with x86-64 instruction set support. (This
|
||
+supersets MMX, SSE, SSE2, SSE3, SSE4A, 3dNOW!, enhanced 3dNOW!, ABM and 64-bit
|
||
+instruction set extensions.)
|
||
@item winchip-c6
|
||
IDT Winchip C6 CPU, dealt in same way as i486 with additional MMX instruction
|
||
set support.
|
||
@@ -9339,8 +9343,14 @@ preferred alignment to @option{-mpreferr
|
||
@itemx -mno-sse3
|
||
@item -mssse3
|
||
@itemx -mno-ssse3
|
||
+@item -msse4a
|
||
+@itemx -mno-sse4a
|
||
@item -m3dnow
|
||
@itemx -mno-3dnow
|
||
+@item -mpopcnt
|
||
+@itemx -mno-popcnt
|
||
+@item -mabm
|
||
+@itemx -mno-abm
|
||
@opindex mmmx
|
||
@opindex mno-mmx
|
||
@opindex msse
|
||
--- gcc/testsuite/gcc.target/i386/sse4a-insert.c.jj 2007-02-09 21:26:06.000000000 +0100
|
||
+++ gcc/testsuite/gcc.target/i386/sse4a-insert.c 2007-02-09 21:26:06.000000000 +0100
|
||
@@ -0,0 +1,110 @@
|
||
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
|
||
+/* { dg-options "-O2 -msse4a" } */
|
||
+#include <ammintrin.h>
|
||
+#include <stdlib.h>
|
||
+#include "../../gcc.dg/i386-cpuid.h"
|
||
+
|
||
+static void sse4a_test (void);
|
||
+
|
||
+typedef union
|
||
+{
|
||
+ long long i[2];
|
||
+ __m128i vec;
|
||
+} LI;
|
||
+
|
||
+int
|
||
+main ()
|
||
+{
|
||
+ unsigned long cpu_facilities;
|
||
+
|
||
+ cpu_facilities = i386_extended_cpuid_ecx ();
|
||
+
|
||
+ /* Run SSE4a test only if host has SSE4a support. */
|
||
+ if ((cpu_facilities & bit_SSE4a))
|
||
+ sse4a_test ();
|
||
+
|
||
+ exit (0);
|
||
+}
|
||
+
|
||
+static long long
|
||
+sse4a_test_insert (long long in1, long long in2)
|
||
+{
|
||
+ __m128i v1,v2;
|
||
+ long long index_length, pad;
|
||
+ LI v_out;
|
||
+ index_length = 0x0000000000000810LL;
|
||
+ pad = 0x0;
|
||
+ v1 = _mm_set_epi64x (pad, in1);
|
||
+ v2 = _mm_set_epi64x (index_length, in2);
|
||
+ v_out.vec = _mm_insert_si64 (v1, v2);
|
||
+ return (v_out.i[0]);
|
||
+}
|
||
+
|
||
+static long long
|
||
+sse4a_test_inserti (long long in1, long long in2)
|
||
+{
|
||
+ __m128i v1,v2;
|
||
+ long long pad = 0x0;
|
||
+ LI v_out;
|
||
+ v1 = _mm_set_epi64x (pad, in1);
|
||
+ v2 = _mm_set_epi64x (pad, in2);
|
||
+ v_out.vec = _mm_inserti_si64 (v1, v2, (unsigned int) 0x10, (unsigned int) 0x08);
|
||
+ return (v_out.i[0]);
|
||
+}
|
||
+
|
||
+static chk (long long i1, long long i2)
|
||
+{
|
||
+ int n_fails =0;
|
||
+ if (i1 != i2)
|
||
+ n_fails +=1;
|
||
+ return n_fails;
|
||
+}
|
||
+
|
||
+long long vals_in1[5] =
|
||
+ {
|
||
+ 0x1234567887654321LL,
|
||
+ 0x1456782093002490LL,
|
||
+ 0x2340909123990390LL,
|
||
+ 0x9595959599595999LL,
|
||
+ 0x9099038798000029LL
|
||
+ };
|
||
+
|
||
+long long vals_in2[5] =
|
||
+ {
|
||
+ 0x9ABCDEF00FEDCBA9LL,
|
||
+ 0x234567097289672ALL,
|
||
+ 0x45476453097BD342LL,
|
||
+ 0x23569012AE586FF0LL,
|
||
+ 0x432567ABCDEF765DLL
|
||
+ };
|
||
+
|
||
+long long vals_out[5] =
|
||
+ {
|
||
+ 0x1234567887CBA921LL,
|
||
+ 0x1456782093672A90LL,
|
||
+ 0x2340909123D34290LL,
|
||
+ 0x95959595996FF099LL,
|
||
+ 0x9099038798765D29LL
|
||
+ };
|
||
+
|
||
+static void
|
||
+sse4a_test (void)
|
||
+{
|
||
+ int i;
|
||
+ int fail = 0;
|
||
+ long long out;
|
||
+
|
||
+ for (i = 0; i < 5; i += 1)
|
||
+ {
|
||
+ out = sse4a_test_insert (vals_in1[i], vals_in2[i]);
|
||
+ fail += chk(out, vals_out[i]);
|
||
+
|
||
+ out = sse4a_test_inserti (vals_in1[i], vals_in2[i]);
|
||
+ fail += chk(out, vals_out[i]);
|
||
+ }
|
||
+
|
||
+ if (fail != 0)
|
||
+ abort ();
|
||
+
|
||
+ exit (0);
|
||
+}
|
||
--- gcc/testsuite/gcc.target/i386/sse4a-extract.c.jj 2007-02-09 21:26:06.000000000 +0100
|
||
+++ gcc/testsuite/gcc.target/i386/sse4a-extract.c 2007-02-09 21:26:06.000000000 +0100
|
||
@@ -0,0 +1,100 @@
|
||
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
|
||
+/* { dg-options "-O2 -msse4a" } */
|
||
+#include <ammintrin.h>
|
||
+#include <stdlib.h>
|
||
+#include "../../gcc.dg/i386-cpuid.h"
|
||
+
|
||
+static void sse4a_test (void);
|
||
+
|
||
+typedef union
|
||
+{
|
||
+ long long i[2];
|
||
+ __m128i vec;
|
||
+} LI;
|
||
+
|
||
+int
|
||
+main ()
|
||
+{
|
||
+ unsigned long cpu_facilities;
|
||
+
|
||
+ cpu_facilities = i386_extended_cpuid_ecx ();
|
||
+
|
||
+ /* Run SSE4a test only if host has SSE4a support. */
|
||
+ if ((cpu_facilities & bit_SSE4a))
|
||
+ sse4a_test ();
|
||
+
|
||
+ exit (0);
|
||
+}
|
||
+
|
||
+static long long
|
||
+sse4a_test_extrq (long long in)
|
||
+{
|
||
+ __m128i v1, v2;
|
||
+ long long index_length, pad;
|
||
+ LI v_out;
|
||
+ index_length = 0x0000000000000810LL;
|
||
+ pad = 0x0;
|
||
+ v1 = _mm_set_epi64x (pad, in);
|
||
+ v2 = _mm_set_epi64x (pad, index_length);
|
||
+ v_out.vec = _mm_extract_si64 (v1, v2);
|
||
+ return (v_out.i[0]);
|
||
+}
|
||
+
|
||
+static long long
|
||
+sse4a_test_extrqi (long long in)
|
||
+{
|
||
+ __m128i v1;
|
||
+ long long pad =0x0;
|
||
+ LI v_out;
|
||
+ v1 = _mm_set_epi64x (pad, in);
|
||
+ v_out.vec = _mm_extracti_si64 (v1, (unsigned int) 0x10,(unsigned int) 0x08);
|
||
+ return (v_out.i[0]);
|
||
+}
|
||
+
|
||
+static chk (long long i1, long long i2)
|
||
+{
|
||
+ int n_fails =0;
|
||
+ if (i1 != i2)
|
||
+ n_fails +=1;
|
||
+ return n_fails;
|
||
+}
|
||
+
|
||
+long long vals_in[5] =
|
||
+ {
|
||
+ 0x1234567887654321LL,
|
||
+ 0x1456782093002490LL,
|
||
+ 0x2340909123990390LL,
|
||
+ 0x9595959599595999LL,
|
||
+ 0x9099038798000029LL
|
||
+ };
|
||
+
|
||
+long long vals_out[5] =
|
||
+ {
|
||
+ 0x0000000000006543LL,
|
||
+ 0x0000000000000024LL,
|
||
+ 0x0000000000009903LL,
|
||
+ 0x0000000000005959LL,
|
||
+ 0x0000000000000000LL
|
||
+ };
|
||
+
|
||
+static void
|
||
+sse4a_test (void)
|
||
+{
|
||
+ int i;
|
||
+ int fail = 0;
|
||
+ long long out;
|
||
+
|
||
+ for (i = 0; i < 5; i += 1)
|
||
+ {
|
||
+ out = sse4a_test_extrq (vals_in[i]);
|
||
+ fail += chk(out, vals_out[i]);
|
||
+
|
||
+ out = sse4a_test_extrqi (vals_in[i]);
|
||
+ fail += chk(out, vals_out[i]);
|
||
+ }
|
||
+
|
||
+ if (fail != 0)
|
||
+ abort ();
|
||
+
|
||
+ exit (0);
|
||
+}
|
||
--- gcc/testsuite/gcc.target/i386/sse4a-montss.c.jj 2007-02-09 21:26:06.000000000 +0100
|
||
+++ gcc/testsuite/gcc.target/i386/sse4a-montss.c 2007-02-09 21:26:06.000000000 +0100
|
||
@@ -0,0 +1,64 @@
|
||
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
|
||
+/* { dg-options "-O2 -msse4a" } */
|
||
+#include <ammintrin.h>
|
||
+#include <stdlib.h>
|
||
+#include "../../gcc.dg/i386-cpuid.h"
|
||
+
|
||
+static void sse4a_test (void);
|
||
+
|
||
+int
|
||
+main ()
|
||
+{
|
||
+ unsigned long cpu_facilities;
|
||
+
|
||
+ cpu_facilities = i386_extended_cpuid_ecx ();
|
||
+
|
||
+ /* Run SSE4a test only if host has SSE4a support. */
|
||
+ if ((cpu_facilities & bit_SSE4a))
|
||
+ sse4a_test ();
|
||
+
|
||
+ exit (0);
|
||
+}
|
||
+
|
||
+static void
|
||
+sse4a_test_movntss (float *out, float *in)
|
||
+{
|
||
+ __m128 in_v4sf = _mm_load_ss (in);
|
||
+ _mm_stream_ss (out, in_v4sf);
|
||
+}
|
||
+
|
||
+static int
|
||
+chk_ss (float *v1, float *v2)
|
||
+{
|
||
+ int n_fails = 0;
|
||
+ if (v1[0] != v2[0])
|
||
+ n_fails += 1;
|
||
+ return n_fails;
|
||
+}
|
||
+
|
||
+float vals[10] =
|
||
+ {
|
||
+ 100.0, 200.0, 300.0, 400.0, 5.0,
|
||
+ -1.0, .345, -21.5, 9.32, 8.41
|
||
+ };
|
||
+
|
||
+static void
|
||
+sse4a_test (void)
|
||
+{
|
||
+ int i;
|
||
+ int fail = 0;
|
||
+ float *out;
|
||
+
|
||
+ out = (float *) malloc (sizeof (float));
|
||
+ for (i = 0; i < 10; i += 1)
|
||
+ {
|
||
+ sse4a_test_movntss (out, &vals[i]);
|
||
+
|
||
+ fail += chk_ss (out, &vals[i]);
|
||
+ }
|
||
+
|
||
+ if (fail != 0)
|
||
+ abort ();
|
||
+
|
||
+ exit (0);
|
||
+}
|
||
--- gcc/testsuite/gcc.target/i386/sse4a-montsd.c.jj 2007-02-09 21:26:06.000000000 +0100
|
||
+++ gcc/testsuite/gcc.target/i386/sse4a-montsd.c 2007-02-09 21:26:06.000000000 +0100
|
||
@@ -0,0 +1,64 @@
|
||
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
|
||
+/* { dg-options "-O2 -msse4a" } */
|
||
+#include <ammintrin.h>
|
||
+#include <stdlib.h>
|
||
+#include "../../gcc.dg/i386-cpuid.h"
|
||
+
|
||
+static void sse4a_test (void);
|
||
+
|
||
+int
|
||
+main ()
|
||
+{
|
||
+ unsigned long cpu_facilities;
|
||
+
|
||
+ cpu_facilities = i386_extended_cpuid_ecx ();
|
||
+
|
||
+ /* Run SSE4a test only if host has SSE4a support. */
|
||
+ if ((cpu_facilities & bit_SSE4a))
|
||
+ sse4a_test ();
|
||
+
|
||
+ exit (0);
|
||
+}
|
||
+
|
||
+static void
|
||
+sse4a_test_movntsd (double *out, double *in)
|
||
+{
|
||
+ __m128d in_v2df = _mm_load_sd (in);
|
||
+ _mm_stream_sd (out, in_v2df);
|
||
+}
|
||
+
|
||
+static int
|
||
+chk_sd (double *v1, double *v2)
|
||
+{
|
||
+ int n_fails = 0;
|
||
+ if (v1[0] != v2[0])
|
||
+ n_fails += 1;
|
||
+ return n_fails;
|
||
+}
|
||
+
|
||
+double vals[10] =
|
||
+ {
|
||
+ 100.0, 200.0, 300.0, 400.0, 5.0,
|
||
+ -1.0, .345, -21.5, 9.32, 8.41
|
||
+ };
|
||
+
|
||
+static void
|
||
+sse4a_test (void)
|
||
+{
|
||
+ int i;
|
||
+ int fail = 0;
|
||
+ double *out;
|
||
+
|
||
+ out = (double *) malloc (sizeof (double));
|
||
+ for (i = 0; i < 10; i += 1)
|
||
+ {
|
||
+ sse4a_test_movntsd (out, &vals[i]);
|
||
+
|
||
+ fail += chk_sd (out, &vals[i]);
|
||
+ }
|
||
+
|
||
+ if (fail != 0)
|
||
+ abort ();
|
||
+
|
||
+ exit (0);
|
||
+}
|
||
--- gcc/testsuite/gcc.dg/i386-cpuid.h.jj 2006-10-05 00:26:53.000000000 +0200
|
||
+++ gcc/testsuite/gcc.dg/i386-cpuid.h 2007-02-07 13:07:08.000000000 +0100
|
||
@@ -2,23 +2,32 @@
|
||
Used by 20020523-2.c and i386-sse-6.c, and possibly others. */
|
||
/* Plagarized from 20020523-2.c. */
|
||
|
||
+/* %ecx */
|
||
+#define bit_SSE3 (1 << 0)
|
||
+#define bit_SSSE3 (1 << 9)
|
||
+
|
||
+/* %edx */
|
||
#define bit_CMOV (1 << 15)
|
||
#define bit_MMX (1 << 23)
|
||
#define bit_SSE (1 << 25)
|
||
#define bit_SSE2 (1 << 26)
|
||
|
||
+/* Extended Features */
|
||
+/* %ecx */
|
||
+#define bit_SSE4a (1 << 6)
|
||
+
|
||
#ifndef NOINLINE
|
||
#define NOINLINE __attribute__ ((noinline))
|
||
#endif
|
||
|
||
-unsigned int i386_cpuid (void) NOINLINE;
|
||
-
|
||
-unsigned int NOINLINE
|
||
-i386_cpuid (void)
|
||
+static inline unsigned int
|
||
+i386_get_cpuid (unsigned int *ecx, unsigned int *edx)
|
||
{
|
||
- int fl1, fl2;
|
||
+ int fl1;
|
||
|
||
#ifndef __x86_64__
|
||
+ int fl2;
|
||
+
|
||
/* See if we can use cpuid. On AMD64 we always can. */
|
||
__asm__ ("pushfl; pushfl; popl %0; movl %0,%1; xorl %2,%0;"
|
||
"pushl %0; popfl; pushfl; popl %0; popfl"
|
||
@@ -42,15 +51,99 @@ i386_cpuid (void)
|
||
if (fl1 == 0)
|
||
return (0);
|
||
|
||
- /* Invoke CPUID(1), return %edx; caller can examine bits to
|
||
+ /* Invoke CPUID(1), return %ecx and %edx; caller can examine bits to
|
||
determine what's supported. */
|
||
#ifdef __x86_64__
|
||
- __asm__ ("pushq %%rcx; pushq %%rbx; cpuid; popq %%rbx; popq %%rcx"
|
||
- : "=d" (fl2), "=a" (fl1) : "1" (1) : "cc");
|
||
+ __asm__ ("pushq %%rbx; cpuid; popq %%rbx"
|
||
+ : "=c" (*ecx), "=d" (*edx), "=a" (fl1) : "2" (1) : "cc");
|
||
#else
|
||
- __asm__ ("pushl %%ecx; pushl %%ebx; cpuid; popl %%ebx; popl %%ecx"
|
||
- : "=d" (fl2), "=a" (fl1) : "1" (1) : "cc");
|
||
+ __asm__ ("pushl %%ebx; cpuid; popl %%ebx"
|
||
+ : "=c" (*ecx), "=d" (*edx), "=a" (fl1) : "2" (1) : "cc");
|
||
+#endif
|
||
+
|
||
+ return 1;
|
||
+}
|
||
+
|
||
+static inline unsigned int
|
||
+i386_get_extended_cpuid (unsigned int *ecx, unsigned int *edx)
|
||
+{
|
||
+ int fl1;
|
||
+ if (!(i386_get_cpuid (ecx, edx)))
|
||
+ return 0;
|
||
+
|
||
+ /* Invoke CPUID(0x80000000) to get the highest supported extended function
|
||
+ number */
|
||
+#ifdef __x86_64__
|
||
+ __asm__ ("cpuid"
|
||
+ : "=a" (fl1) : "0" (0x80000000) : "edx", "ecx", "ebx");
|
||
+#else
|
||
+ __asm__ ("pushl %%ebx; cpuid; popl %%ebx"
|
||
+ : "=a" (fl1) : "0" (0x80000000) : "edx", "ecx");
|
||
+#endif
|
||
+ /* Check that the extended function used below is supported */
|
||
+ if (fl1 < 0x80000001)
|
||
+ return 0;
|
||
+
|
||
+ /* Invoke CPUID(0x80000001), return %ecx and %edx; caller can examine bits to
|
||
+ determine what's supported. */
|
||
+#ifdef __x86_64__
|
||
+ __asm__ ("cpuid"
|
||
+ : "=c" (*ecx), "=d" (*edx), "=a" (fl1) : "2" (0x80000001) : "ebx");
|
||
+#else
|
||
+ __asm__ ("pushl %%ebx; cpuid; popl %%ebx"
|
||
+ : "=c" (*ecx), "=d" (*edx), "=a" (fl1) : "2" (0x80000001));
|
||
#endif
|
||
+ return 1;
|
||
+}
|
||
+
|
||
+
|
||
+unsigned int i386_cpuid_ecx (void) NOINLINE;
|
||
+unsigned int i386_cpuid_edx (void) NOINLINE;
|
||
+unsigned int i386_extended_cpuid_ecx (void) NOINLINE;
|
||
+unsigned int i386_extended_cpuid_edx (void) NOINLINE;
|
||
+
|
||
+unsigned int NOINLINE
|
||
+i386_cpuid_ecx (void)
|
||
+{
|
||
+ unsigned int ecx, edx;
|
||
+ if (i386_get_cpuid (&ecx, &edx))
|
||
+ return ecx;
|
||
+ else
|
||
+ return 0;
|
||
+}
|
||
+
|
||
+unsigned int NOINLINE
|
||
+i386_cpuid_edx (void)
|
||
+{
|
||
+ unsigned int ecx, edx;
|
||
+ if (i386_get_cpuid (&ecx, &edx))
|
||
+ return edx;
|
||
+ else
|
||
+ return 0;
|
||
+}
|
||
|
||
- return fl2;
|
||
+unsigned int NOINLINE
|
||
+i386_extended_cpuid_ecx (void)
|
||
+{
|
||
+ unsigned int ecx, edx;
|
||
+ if (i386_get_extended_cpuid (&ecx, &edx))
|
||
+ return ecx;
|
||
+ else
|
||
+ return 0;
|
||
+}
|
||
+
|
||
+unsigned int NOINLINE
|
||
+i386_extended_cpuid_edx (void)
|
||
+{
|
||
+ unsigned int ecx, edx;
|
||
+ if (i386_get_extended_cpuid (&ecx, &edx))
|
||
+ return edx;
|
||
+ else
|
||
+ return 0;
|
||
+}
|
||
+
|
||
+static inline unsigned int
|
||
+i386_cpuid (void)
|
||
+{
|
||
+ return i386_cpuid_edx ();
|
||
}
|
||
--- gcc/config.gcc.jj 2007-02-09 16:18:25.000000000 +0100
|
||
+++ gcc/config.gcc 2007-02-09 21:26:06.000000000 +0100
|
||
@@ -264,12 +264,12 @@ xscale-*-*)
|
||
i[34567]86-*-*)
|
||
cpu_type=i386
|
||
extra_headers="mmintrin.h mm3dnow.h xmmintrin.h emmintrin.h
|
||
- pmmintrin.h tmmintrin.h"
|
||
+ pmmintrin.h tmmintrin.h ammintrin.h"
|
||
;;
|
||
x86_64-*-*)
|
||
cpu_type=i386
|
||
extra_headers="mmintrin.h mm3dnow.h xmmintrin.h emmintrin.h
|
||
- pmmintrin.h tmmintrin.h"
|
||
+ pmmintrin.h tmmintrin.h ammintrin.h"
|
||
need_64bit_hwint=yes
|
||
;;
|
||
ia64-*-*)
|
||
@@ -2396,6 +2396,9 @@ if test x$with_cpu = x ; then
|
||
;;
|
||
i686-*-* | i786-*-*)
|
||
case ${target_noncanonical} in
|
||
+ amdfam10-*)
|
||
+ with_cpu=amdfam10
|
||
+ ;;
|
||
k8-*|opteron-*|athlon_64-*)
|
||
with_cpu=k8
|
||
;;
|
||
@@ -2436,6 +2439,9 @@ if test x$with_cpu = x ; then
|
||
;;
|
||
x86_64-*-*)
|
||
case ${target_noncanonical} in
|
||
+ amdfam10-*)
|
||
+ with_cpu=amdfam10
|
||
+ ;;
|
||
k8-*|opteron-*|athlon_64-*)
|
||
with_cpu=k8
|
||
;;
|
||
@@ -2668,7 +2674,7 @@ case "${target}" in
|
||
esac
|
||
# OK
|
||
;;
|
||
- "" | k8 | opteron | athlon64 | athlon-fx | nocona | core2 | generic)
|
||
+ "" | amdfam10 | k8 | opteron | athlon64 | athlon-fx | nocona | core2 | generic)
|
||
# OK
|
||
;;
|
||
*)
|
||
--- gcc/config/i386/i386.h.jj 2007-02-09 16:18:25.000000000 +0100
|
||
+++ gcc/config/i386/i386.h 2007-02-09 21:29:00.000000000 +0100
|
||
@@ -141,6 +141,7 @@ extern const struct processor_costs *ix8
|
||
#define TARGET_GENERIC32 (ix86_tune == PROCESSOR_GENERIC32)
|
||
#define TARGET_GENERIC64 (ix86_tune == PROCESSOR_GENERIC64)
|
||
#define TARGET_GENERIC (TARGET_GENERIC32 || TARGET_GENERIC64)
|
||
+#define TARGET_AMDFAM10 (ix86_tune == PROCESSOR_AMDFAM10)
|
||
|
||
#define TUNEMASK (1 << ix86_tune)
|
||
extern const int x86_use_leave, x86_push_memory, x86_zero_extend_with_and;
|
||
@@ -159,6 +160,7 @@ extern const int x86_accumulate_outgoing
|
||
extern const int x86_epilogue_using_move, x86_decompose_lea;
|
||
extern const int x86_arch_always_fancy_math_387, x86_shift1;
|
||
extern const int x86_sse_partial_reg_dependency, x86_sse_split_regs;
|
||
+extern const int x86_sse_unaligned_move_optimal;
|
||
extern const int x86_sse_typeless_stores, x86_sse_load0_by_pxor;
|
||
extern const int x86_use_ffreep;
|
||
extern const int x86_inter_unit_moves, x86_schedule;
|
||
@@ -208,6 +210,8 @@ extern int x86_prefetch_sse, x86_cmpxchg
|
||
#define TARGET_PARTIAL_REG_DEPENDENCY (x86_partial_reg_dependency & TUNEMASK)
|
||
#define TARGET_SSE_PARTIAL_REG_DEPENDENCY \
|
||
(x86_sse_partial_reg_dependency & TUNEMASK)
|
||
+#define TARGET_SSE_UNALIGNED_MOVE_OPTIMAL \
|
||
+ (x86_sse_unaligned_move_optimal & TUNEMASK)
|
||
#define TARGET_SSE_SPLIT_REGS (x86_sse_split_regs & TUNEMASK)
|
||
#define TARGET_SSE_TYPELESS_STORES (x86_sse_typeless_stores & TUNEMASK)
|
||
#define TARGET_SSE_LOAD0_BY_PXOR (x86_sse_load0_by_pxor & TUNEMASK)
|
||
@@ -376,6 +380,8 @@ extern int x86_prefetch_sse, x86_cmpxchg
|
||
} \
|
||
else if (TARGET_K8) \
|
||
builtin_define ("__tune_k8__"); \
|
||
+ else if (TARGET_AMDFAM10) \
|
||
+ builtin_define ("__tune_amdfam10__"); \
|
||
else if (TARGET_PENTIUM4) \
|
||
builtin_define ("__tune_pentium4__"); \
|
||
else if (TARGET_NOCONA) \
|
||
@@ -400,6 +406,8 @@ extern int x86_prefetch_sse, x86_cmpxchg
|
||
builtin_define ("__SSSE3__"); \
|
||
builtin_define ("__MNI__"); \
|
||
} \
|
||
+ if (TARGET_SSE4A) \
|
||
+ builtin_define ("__SSE4A__"); \
|
||
if (TARGET_SSE_MATH && TARGET_SSE) \
|
||
builtin_define ("__SSE_MATH__"); \
|
||
if (TARGET_SSE_MATH && TARGET_SSE2) \
|
||
@@ -455,6 +463,11 @@ extern int x86_prefetch_sse, x86_cmpxchg
|
||
builtin_define ("__k8"); \
|
||
builtin_define ("__k8__"); \
|
||
} \
|
||
+ else if (ix86_arch == PROCESSOR_AMDFAM10) \
|
||
+ { \
|
||
+ builtin_define ("__amdfam10"); \
|
||
+ builtin_define ("__amdfam10__"); \
|
||
+ } \
|
||
else if (ix86_arch == PROCESSOR_PENTIUM4) \
|
||
{ \
|
||
builtin_define ("__pentium4"); \
|
||
@@ -493,13 +506,14 @@ extern int x86_prefetch_sse, x86_cmpxchg
|
||
#define TARGET_CPU_DEFAULT_nocona 17
|
||
#define TARGET_CPU_DEFAULT_core2 18
|
||
#define TARGET_CPU_DEFAULT_generic 19
|
||
+#define TARGET_CPU_DEFAULT_amdfam10 20
|
||
|
||
#define TARGET_CPU_DEFAULT_NAMES {"i386", "i486", "pentium", "pentium-mmx",\
|
||
"pentiumpro", "pentium2", "pentium3", \
|
||
"pentium4", "geode", "k6", "k6-2", "k6-3", \
|
||
"athlon", "athlon-4", "k8", \
|
||
"pentium-m", "prescott", "nocona", \
|
||
- "core2", "generic"}
|
||
+ "core2", "generic", "amdfam10"}
|
||
|
||
#ifndef CC1_SPEC
|
||
#define CC1_SPEC "%(cc1_cpu) "
|
||
@@ -2162,6 +2176,7 @@ enum processor_type
|
||
PROCESSOR_CORE2,
|
||
PROCESSOR_GENERIC32,
|
||
PROCESSOR_GENERIC64,
|
||
+ PROCESSOR_AMDFAM10,
|
||
PROCESSOR_max
|
||
};
|
||
|
||
--- gcc/config/i386/i386.md.jj 2007-02-09 16:18:25.000000000 +0100
|
||
+++ gcc/config/i386/i386.md 2007-02-10 19:33:43.000000000 +0100
|
||
@@ -151,6 +151,12 @@
|
||
(UNSPEC_PSHUFB 120)
|
||
(UNSPEC_PSIGN 121)
|
||
(UNSPEC_PALIGNR 122)
|
||
+
|
||
+ ; For SSE4A support
|
||
+ (UNSPEC_EXTRQI 130)
|
||
+ (UNSPEC_EXTRQ 131)
|
||
+ (UNSPEC_INSERTQI 132)
|
||
+ (UNSPEC_INSERTQ 133)
|
||
])
|
||
|
||
(define_constants
|
||
@@ -190,7 +196,8 @@
|
||
|
||
;; Processor type. This attribute must exactly match the processor_type
|
||
;; enumeration in i386.h.
|
||
-(define_attr "cpu" "i386,i486,pentium,pentiumpro,geode,k6,athlon,pentium4,k8,nocona,core2,generic32,generic64"
|
||
+(define_attr "cpu" "i386,i486,pentium,pentiumpro,geode,k6,athlon,pentium4,k8,
|
||
+ nocona,core2,generic32,generic64,amdfam10"
|
||
(const (symbol_ref "ix86_tune")))
|
||
|
||
;; A basic instruction type. Refinements due to arguments to be
|
||
@@ -201,10 +208,10 @@
|
||
incdec,ishift,ishift1,rotate,rotate1,imul,idiv,
|
||
icmp,test,ibr,setcc,icmov,
|
||
push,pop,call,callv,leave,
|
||
- str,cld,
|
||
+ str,bitmanip,cld,
|
||
fmov,fop,fsgn,fmul,fdiv,fpspc,fcmov,fcmp,fxch,fistp,fisttp,frndint,
|
||
sselog,sselog1,sseiadd,sseishft,sseimul,
|
||
- sse,ssemov,sseadd,ssemul,ssecmp,ssecomi,ssecvt,sseicvt,ssediv,
|
||
+ sse,ssemov,sseadd,ssemul,ssecmp,ssecomi,ssecvt,sseicvt,ssediv,sseins,
|
||
mmx,mmxmov,mmxadd,mmxmul,mmxcmp,mmxcvt,mmxshft"
|
||
(const_string "other"))
|
||
|
||
@@ -218,7 +225,7 @@
|
||
(cond [(eq_attr "type" "fmov,fop,fsgn,fmul,fdiv,fpspc,fcmov,fcmp,fxch,fistp,fisttp,frndint")
|
||
(const_string "i387")
|
||
(eq_attr "type" "sselog,sselog1,sseiadd,sseishft,sseimul,
|
||
- sse,ssemov,sseadd,ssemul,ssecmp,ssecomi,ssecvt,sseicvt,ssediv")
|
||
+ sse,ssemov,sseadd,ssemul,ssecmp,ssecomi,ssecvt,sseicvt,ssediv,sseins")
|
||
(const_string "sse")
|
||
(eq_attr "type" "mmx,mmxmov,mmxadd,mmxmul,mmxcmp,mmxcvt,mmxshft")
|
||
(const_string "mmx")
|
||
@@ -228,7 +235,8 @@
|
||
|
||
;; The (bounding maximum) length of an instruction immediate.
|
||
(define_attr "length_immediate" ""
|
||
- (cond [(eq_attr "type" "incdec,setcc,icmov,str,cld,lea,other,multi,idiv,leave")
|
||
+ (cond [(eq_attr "type" "incdec,setcc,icmov,str,cld,lea,other,multi,idiv,leave,
|
||
+ bitmanip")
|
||
(const_int 0)
|
||
(eq_attr "unit" "i387,sse,mmx")
|
||
(const_int 0)
|
||
@@ -282,7 +290,7 @@
|
||
;; Set when 0f opcode prefix is used.
|
||
(define_attr "prefix_0f" ""
|
||
(if_then_else
|
||
- (ior (eq_attr "type" "imovx,setcc,icmov")
|
||
+ (ior (eq_attr "type" "imovx,setcc,icmov,bitmanip")
|
||
(eq_attr "unit" "sse,mmx"))
|
||
(const_int 1)
|
||
(const_int 0)))
|
||
@@ -407,7 +415,7 @@
|
||
(const_string "load")
|
||
(and (eq_attr "type"
|
||
"!alu1,negnot,ishift1,
|
||
- imov,imovx,icmp,test,
|
||
+ imov,imovx,icmp,test,bitmanip,
|
||
fmov,fcmp,fsgn,
|
||
sse,ssemov,ssecmp,ssecomi,ssecvt,sseicvt,sselog1,
|
||
mmx,mmxmov,mmxcmp,mmxcvt")
|
||
@@ -961,10 +969,11 @@
|
||
"sahf"
|
||
[(set_attr "length" "1")
|
||
(set_attr "athlon_decode" "vector")
|
||
+ (set_attr "amdfam10_decode" "direct")
|
||
(set_attr "mode" "SI")])
|
||
|
||
;; Pentium Pro can do steps 1 through 3 in one go.
|
||
-
|
||
+;; comi*, ucomi*, fcomi*, ficomi*, fucomi* (i387 instructions set condition codes)
|
||
(define_insn "*cmpfp_i_mixed"
|
||
[(set (reg:CCFP FLAGS_REG)
|
||
(compare:CCFP (match_operand 0 "register_operand" "f#x,x#f")
|
||
@@ -978,7 +987,8 @@
|
||
(if_then_else (match_operand:SF 1 "" "")
|
||
(const_string "SF")
|
||
(const_string "DF")))
|
||
- (set_attr "athlon_decode" "vector")])
|
||
+ (set_attr "athlon_decode" "vector")
|
||
+ (set_attr "amdfam10_decode" "direct")])
|
||
|
||
(define_insn "*cmpfp_i_sse"
|
||
[(set (reg:CCFP FLAGS_REG)
|
||
@@ -993,7 +1003,8 @@
|
||
(if_then_else (match_operand:SF 1 "" "")
|
||
(const_string "SF")
|
||
(const_string "DF")))
|
||
- (set_attr "athlon_decode" "vector")])
|
||
+ (set_attr "athlon_decode" "vector")
|
||
+ (set_attr "amdfam10_decode" "direct")])
|
||
|
||
(define_insn "*cmpfp_i_i387"
|
||
[(set (reg:CCFP FLAGS_REG)
|
||
@@ -1012,7 +1023,8 @@
|
||
(const_string "DF")
|
||
]
|
||
(const_string "XF")))
|
||
- (set_attr "athlon_decode" "vector")])
|
||
+ (set_attr "athlon_decode" "vector")
|
||
+ (set_attr "amdfam10_decode" "direct")])
|
||
|
||
(define_insn "*cmpfp_iu_mixed"
|
||
[(set (reg:CCFPU FLAGS_REG)
|
||
@@ -1027,7 +1039,8 @@
|
||
(if_then_else (match_operand:SF 1 "" "")
|
||
(const_string "SF")
|
||
(const_string "DF")))
|
||
- (set_attr "athlon_decode" "vector")])
|
||
+ (set_attr "athlon_decode" "vector")
|
||
+ (set_attr "amdfam10_decode" "direct")])
|
||
|
||
(define_insn "*cmpfp_iu_sse"
|
||
[(set (reg:CCFPU FLAGS_REG)
|
||
@@ -1042,7 +1055,8 @@
|
||
(if_then_else (match_operand:SF 1 "" "")
|
||
(const_string "SF")
|
||
(const_string "DF")))
|
||
- (set_attr "athlon_decode" "vector")])
|
||
+ (set_attr "athlon_decode" "vector")
|
||
+ (set_attr "amdfam10_decode" "direct")])
|
||
|
||
(define_insn "*cmpfp_iu_387"
|
||
[(set (reg:CCFPU FLAGS_REG)
|
||
@@ -1061,7 +1075,8 @@
|
||
(const_string "DF")
|
||
]
|
||
(const_string "XF")))
|
||
- (set_attr "athlon_decode" "vector")])
|
||
+ (set_attr "athlon_decode" "vector")
|
||
+ (set_attr "amdfam10_decode" "direct")])
|
||
|
||
;; Move instructions.
|
||
|
||
@@ -1267,7 +1282,8 @@
|
||
[(set_attr "type" "imov")
|
||
(set_attr "mode" "SI")
|
||
(set_attr "pent_pair" "np")
|
||
- (set_attr "athlon_decode" "vector")])
|
||
+ (set_attr "athlon_decode" "vector")
|
||
+ (set_attr "amdfam10_decode" "double")])
|
||
|
||
(define_expand "movhi"
|
||
[(set (match_operand:HI 0 "nonimmediate_operand" "")
|
||
@@ -1384,8 +1400,10 @@
|
||
[(set_attr "type" "imov")
|
||
(set_attr "mode" "SI")
|
||
(set_attr "pent_pair" "np")
|
||
- (set_attr "athlon_decode" "vector")])
|
||
+ (set_attr "athlon_decode" "vector")
|
||
+ (set_attr "amdfam10_decode" "double")])
|
||
|
||
+;; amdfam10_decode is not added here, since TARGET_PARTIAL_REG_STALL is disabled for AMDFAM10
|
||
(define_insn "*swaphi_2"
|
||
[(set (match_operand:HI 0 "register_operand" "+r")
|
||
(match_operand:HI 1 "register_operand" "+r"))
|
||
@@ -1558,8 +1576,10 @@
|
||
[(set_attr "type" "imov")
|
||
(set_attr "mode" "SI")
|
||
(set_attr "pent_pair" "np")
|
||
- (set_attr "athlon_decode" "vector")])
|
||
+ (set_attr "athlon_decode" "vector")
|
||
+ (set_attr "amdfam10_decode" "vector")])
|
||
|
||
+;; amdfam10_decode is not added here, since TARGET_PARTIAL_REG_STALL is disabled for AMDFAM10
|
||
(define_insn "*swapqi_2"
|
||
[(set (match_operand:QI 0 "register_operand" "+q")
|
||
(match_operand:QI 1 "register_operand" "+q"))
|
||
@@ -2113,7 +2133,8 @@
|
||
[(set_attr "type" "imov")
|
||
(set_attr "mode" "DI")
|
||
(set_attr "pent_pair" "np")
|
||
- (set_attr "athlon_decode" "vector")])
|
||
+ (set_attr "athlon_decode" "vector")
|
||
+ (set_attr "amdfam10_decode" "double")])
|
||
|
||
(define_expand "movti"
|
||
[(set (match_operand:TI 0 "nonimmediate_operand" "")
|
||
@@ -4122,7 +4143,8 @@
|
||
"cvttss2si{q}\t{%1, %0|%0, %1}"
|
||
[(set_attr "type" "sseicvt")
|
||
(set_attr "mode" "SF")
|
||
- (set_attr "athlon_decode" "double,vector")])
|
||
+ (set_attr "athlon_decode" "double,vector")
|
||
+ (set_attr "amdfam10_decode" "double,double")])
|
||
|
||
(define_insn "fix_truncdfdi_sse"
|
||
[(set (match_operand:DI 0 "register_operand" "=r,r")
|
||
@@ -4131,7 +4153,8 @@
|
||
"cvttsd2si{q}\t{%1, %0|%0, %1}"
|
||
[(set_attr "type" "sseicvt")
|
||
(set_attr "mode" "DF")
|
||
- (set_attr "athlon_decode" "double,vector")])
|
||
+ (set_attr "athlon_decode" "double,vector")
|
||
+ (set_attr "amdfam10_decode" "double,double")])
|
||
|
||
(define_insn "fix_truncsfsi_sse"
|
||
[(set (match_operand:SI 0 "register_operand" "=r,r")
|
||
@@ -4140,7 +4163,8 @@
|
||
"cvttss2si\t{%1, %0|%0, %1}"
|
||
[(set_attr "type" "sseicvt")
|
||
(set_attr "mode" "DF")
|
||
- (set_attr "athlon_decode" "double,vector")])
|
||
+ (set_attr "athlon_decode" "double,vector")
|
||
+ (set_attr "amdfam10_decode" "double,double")])
|
||
|
||
(define_insn "fix_truncdfsi_sse"
|
||
[(set (match_operand:SI 0 "register_operand" "=r,r")
|
||
@@ -4149,7 +4173,8 @@
|
||
"cvttsd2si\t{%1, %0|%0, %1}"
|
||
[(set_attr "type" "sseicvt")
|
||
(set_attr "mode" "DF")
|
||
- (set_attr "athlon_decode" "double,vector")])
|
||
+ (set_attr "athlon_decode" "double,vector")
|
||
+ (set_attr "amdfam10_decode" "double,double")])
|
||
|
||
;; Avoid vector decoded forms of the instruction.
|
||
(define_peephole2
|
||
@@ -4410,7 +4435,8 @@
|
||
[(set_attr "length" "2")
|
||
(set_attr "mode" "HI")
|
||
(set_attr "unit" "i387")
|
||
- (set_attr "athlon_decode" "vector")])
|
||
+ (set_attr "athlon_decode" "vector")
|
||
+ (set_attr "amdfam10_decode" "vector")])
|
||
|
||
;; Conversion between fixed point and floating point.
|
||
|
||
@@ -4461,6 +4487,7 @@
|
||
(set_attr "mode" "SF")
|
||
(set_attr "unit" "*,i387,*,*")
|
||
(set_attr "athlon_decode" "*,*,vector,double")
|
||
+ (set_attr "amdfam10_decode" "*,*,vector,double")
|
||
(set_attr "fp_int_src" "true")])
|
||
|
||
(define_insn "*floatsisf2_sse"
|
||
@@ -4471,6 +4498,7 @@
|
||
[(set_attr "type" "sseicvt")
|
||
(set_attr "mode" "SF")
|
||
(set_attr "athlon_decode" "vector,double")
|
||
+ (set_attr "amdfam10_decode" "vector,double")
|
||
(set_attr "fp_int_src" "true")])
|
||
|
||
(define_insn "*floatsisf2_i387"
|
||
@@ -4504,6 +4532,7 @@
|
||
(set_attr "mode" "SF")
|
||
(set_attr "unit" "*,i387,*,*")
|
||
(set_attr "athlon_decode" "*,*,vector,double")
|
||
+ (set_attr "amdfam10_decode" "*,*,vector,double")
|
||
(set_attr "fp_int_src" "true")])
|
||
|
||
(define_insn "*floatdisf2_sse"
|
||
@@ -4514,6 +4543,7 @@
|
||
[(set_attr "type" "sseicvt")
|
||
(set_attr "mode" "SF")
|
||
(set_attr "athlon_decode" "vector,double")
|
||
+ (set_attr "amdfam10_decode" "vector,double")
|
||
(set_attr "fp_int_src" "true")])
|
||
|
||
(define_insn "*floatdisf2_i387"
|
||
@@ -4572,6 +4602,7 @@
|
||
(set_attr "mode" "DF")
|
||
(set_attr "unit" "*,i387,*,*")
|
||
(set_attr "athlon_decode" "*,*,double,direct")
|
||
+ (set_attr "amdfam10_decode" "*,*,vector,double")
|
||
(set_attr "fp_int_src" "true")])
|
||
|
||
(define_insn "*floatsidf2_sse"
|
||
@@ -4582,6 +4613,7 @@
|
||
[(set_attr "type" "sseicvt")
|
||
(set_attr "mode" "DF")
|
||
(set_attr "athlon_decode" "double,direct")
|
||
+ (set_attr "amdfam10_decode" "vector,double")
|
||
(set_attr "fp_int_src" "true")])
|
||
|
||
(define_insn "*floatsidf2_i387"
|
||
@@ -4615,6 +4647,7 @@
|
||
(set_attr "mode" "DF")
|
||
(set_attr "unit" "*,i387,*,*")
|
||
(set_attr "athlon_decode" "*,*,double,direct")
|
||
+ (set_attr "amdfam10_decode" "*,*,vector,double")
|
||
(set_attr "fp_int_src" "true")])
|
||
|
||
(define_insn "*floatdidf2_sse"
|
||
@@ -4625,6 +4658,7 @@
|
||
[(set_attr "type" "sseicvt")
|
||
(set_attr "mode" "DF")
|
||
(set_attr "athlon_decode" "double,direct")
|
||
+ (set_attr "amdfam10_decode" "vector,double")
|
||
(set_attr "fp_int_src" "true")])
|
||
|
||
(define_insn "*floatdidf2_i387"
|
||
@@ -6832,6 +6866,14 @@
|
||
"TARGET_64BIT"
|
||
"")
|
||
|
||
+;; On AMDFAM10
|
||
+;; IMUL reg64, reg64, imm8 Direct
|
||
+;; IMUL reg64, mem64, imm8 VectorPath
|
||
+;; IMUL reg64, reg64, imm32 Direct
|
||
+;; IMUL reg64, mem64, imm32 VectorPath
|
||
+;; IMUL reg64, reg64 Direct
|
||
+;; IMUL reg64, mem64 Direct
|
||
+
|
||
(define_insn "*muldi3_1_rex64"
|
||
[(set (match_operand:DI 0 "register_operand" "=r,r,r")
|
||
(mult:DI (match_operand:DI 1 "nonimmediate_operand" "%rm,rm,0")
|
||
@@ -6854,6 +6896,11 @@
|
||
(match_operand 1 "memory_operand" ""))
|
||
(const_string "vector")]
|
||
(const_string "direct")))
|
||
+ (set (attr "amdfam10_decode")
|
||
+ (cond [(and (eq_attr "alternative" "0,1")
|
||
+ (match_operand 1 "memory_operand" ""))
|
||
+ (const_string "vector")]
|
||
+ (const_string "direct")))
|
||
(set_attr "mode" "DI")])
|
||
|
||
(define_expand "mulsi3"
|
||
@@ -6864,6 +6911,14 @@
|
||
""
|
||
"")
|
||
|
||
+;; On AMDFAM10
|
||
+;; IMUL reg32, reg32, imm8 Direct
|
||
+;; IMUL reg32, mem32, imm8 VectorPath
|
||
+;; IMUL reg32, reg32, imm32 Direct
|
||
+;; IMUL reg32, mem32, imm32 VectorPath
|
||
+;; IMUL reg32, reg32 Direct
|
||
+;; IMUL reg32, mem32 Direct
|
||
+
|
||
(define_insn "*mulsi3_1"
|
||
[(set (match_operand:SI 0 "register_operand" "=r,r,r")
|
||
(mult:SI (match_operand:SI 1 "nonimmediate_operand" "%rm,rm,0")
|
||
@@ -6885,6 +6940,11 @@
|
||
(match_operand 1 "memory_operand" ""))
|
||
(const_string "vector")]
|
||
(const_string "direct")))
|
||
+ (set (attr "amdfam10_decode")
|
||
+ (cond [(and (eq_attr "alternative" "0,1")
|
||
+ (match_operand 1 "memory_operand" ""))
|
||
+ (const_string "vector")]
|
||
+ (const_string "direct")))
|
||
(set_attr "mode" "SI")])
|
||
|
||
(define_insn "*mulsi3_1_zext"
|
||
@@ -6910,6 +6970,11 @@
|
||
(match_operand 1 "memory_operand" ""))
|
||
(const_string "vector")]
|
||
(const_string "direct")))
|
||
+ (set (attr "amdfam10_decode")
|
||
+ (cond [(and (eq_attr "alternative" "0,1")
|
||
+ (match_operand 1 "memory_operand" ""))
|
||
+ (const_string "vector")]
|
||
+ (const_string "direct")))
|
||
(set_attr "mode" "SI")])
|
||
|
||
(define_expand "mulhi3"
|
||
@@ -6920,6 +6985,13 @@
|
||
"TARGET_HIMODE_MATH"
|
||
"")
|
||
|
||
+;; On AMDFAM10
|
||
+;; IMUL reg16, reg16, imm8 VectorPath
|
||
+;; IMUL reg16, mem16, imm8 VectorPath
|
||
+;; IMUL reg16, reg16, imm16 VectorPath
|
||
+;; IMUL reg16, mem16, imm16 VectorPath
|
||
+;; IMUL reg16, reg16 Direct
|
||
+;; IMUL reg16, mem16 Direct
|
||
(define_insn "*mulhi3_1"
|
||
[(set (match_operand:HI 0 "register_operand" "=r,r,r")
|
||
(mult:HI (match_operand:HI 1 "nonimmediate_operand" "%rm,rm,0")
|
||
@@ -6938,6 +7010,10 @@
|
||
(eq_attr "alternative" "1,2")
|
||
(const_string "vector")]
|
||
(const_string "direct")))
|
||
+ (set (attr "amdfam10_decode")
|
||
+ (cond [(eq_attr "alternative" "0,1")
|
||
+ (const_string "vector")]
|
||
+ (const_string "direct")))
|
||
(set_attr "mode" "HI")])
|
||
|
||
(define_expand "mulqi3"
|
||
@@ -6948,6 +7024,10 @@
|
||
"TARGET_QIMODE_MATH"
|
||
"")
|
||
|
||
+;; On AMDFAM10
|
||
+;; MUL reg8 Direct
|
||
+;; MUL mem8 Direct
|
||
+
|
||
(define_insn "*mulqi3_1"
|
||
[(set (match_operand:QI 0 "register_operand" "=a")
|
||
(mult:QI (match_operand:QI 1 "nonimmediate_operand" "%0")
|
||
@@ -6962,6 +7042,7 @@
|
||
(if_then_else (eq_attr "cpu" "athlon")
|
||
(const_string "vector")
|
||
(const_string "direct")))
|
||
+ (set_attr "amdfam10_decode" "direct")
|
||
(set_attr "mode" "QI")])
|
||
|
||
(define_expand "umulqihi3"
|
||
@@ -6988,6 +7069,7 @@
|
||
(if_then_else (eq_attr "cpu" "athlon")
|
||
(const_string "vector")
|
||
(const_string "direct")))
|
||
+ (set_attr "amdfam10_decode" "direct")
|
||
(set_attr "mode" "QI")])
|
||
|
||
(define_expand "mulqihi3"
|
||
@@ -7012,6 +7094,7 @@
|
||
(if_then_else (eq_attr "cpu" "athlon")
|
||
(const_string "vector")
|
||
(const_string "direct")))
|
||
+ (set_attr "amdfam10_decode" "direct")
|
||
(set_attr "mode" "QI")])
|
||
|
||
(define_expand "umulditi3"
|
||
@@ -7038,6 +7121,7 @@
|
||
(if_then_else (eq_attr "cpu" "athlon")
|
||
(const_string "vector")
|
||
(const_string "double")))
|
||
+ (set_attr "amdfam10_decode" "double")
|
||
(set_attr "mode" "DI")])
|
||
|
||
;; We can't use this pattern in 64bit mode, since it results in two separate 32bit registers
|
||
@@ -7065,6 +7149,7 @@
|
||
(if_then_else (eq_attr "cpu" "athlon")
|
||
(const_string "vector")
|
||
(const_string "double")))
|
||
+ (set_attr "amdfam10_decode" "double")
|
||
(set_attr "mode" "SI")])
|
||
|
||
(define_expand "mulditi3"
|
||
@@ -7091,6 +7176,7 @@
|
||
(if_then_else (eq_attr "cpu" "athlon")
|
||
(const_string "vector")
|
||
(const_string "double")))
|
||
+ (set_attr "amdfam10_decode" "double")
|
||
(set_attr "mode" "DI")])
|
||
|
||
(define_expand "mulsidi3"
|
||
@@ -7117,6 +7203,7 @@
|
||
(if_then_else (eq_attr "cpu" "athlon")
|
||
(const_string "vector")
|
||
(const_string "double")))
|
||
+ (set_attr "amdfam10_decode" "double")
|
||
(set_attr "mode" "SI")])
|
||
|
||
(define_expand "umuldi3_highpart"
|
||
@@ -7153,6 +7240,7 @@
|
||
(if_then_else (eq_attr "cpu" "athlon")
|
||
(const_string "vector")
|
||
(const_string "double")))
|
||
+ (set_attr "amdfam10_decode" "double")
|
||
(set_attr "mode" "DI")])
|
||
|
||
(define_expand "umulsi3_highpart"
|
||
@@ -7188,6 +7276,7 @@
|
||
(if_then_else (eq_attr "cpu" "athlon")
|
||
(const_string "vector")
|
||
(const_string "double")))
|
||
+ (set_attr "amdfam10_decode" "double")
|
||
(set_attr "mode" "SI")])
|
||
|
||
(define_insn "*umulsi3_highpart_zext"
|
||
@@ -7210,6 +7299,7 @@
|
||
(if_then_else (eq_attr "cpu" "athlon")
|
||
(const_string "vector")
|
||
(const_string "double")))
|
||
+ (set_attr "amdfam10_decode" "double")
|
||
(set_attr "mode" "SI")])
|
||
|
||
(define_expand "smuldi3_highpart"
|
||
@@ -7245,6 +7335,7 @@
|
||
(if_then_else (eq_attr "cpu" "athlon")
|
||
(const_string "vector")
|
||
(const_string "double")))
|
||
+ (set_attr "amdfam10_decode" "double")
|
||
(set_attr "mode" "DI")])
|
||
|
||
(define_expand "smulsi3_highpart"
|
||
@@ -7279,6 +7370,7 @@
|
||
(if_then_else (eq_attr "cpu" "athlon")
|
||
(const_string "vector")
|
||
(const_string "double")))
|
||
+ (set_attr "amdfam10_decode" "double")
|
||
(set_attr "mode" "SI")])
|
||
|
||
(define_insn "*smulsi3_highpart_zext"
|
||
@@ -7300,6 +7392,7 @@
|
||
(if_then_else (eq_attr "cpu" "athlon")
|
||
(const_string "vector")
|
||
(const_string "double")))
|
||
+ (set_attr "amdfam10_decode" "double")
|
||
(set_attr "mode" "SI")])
|
||
|
||
;; The patterns that match these are at the end of this file.
|
||
@@ -10281,7 +10374,8 @@
|
||
[(set_attr "type" "ishift")
|
||
(set_attr "prefix_0f" "1")
|
||
(set_attr "mode" "DI")
|
||
- (set_attr "athlon_decode" "vector")])
|
||
+ (set_attr "athlon_decode" "vector")
|
||
+ (set_attr "amdfam10_decode" "vector")])
|
||
|
||
(define_expand "x86_64_shift_adj"
|
||
[(set (reg:CCZ FLAGS_REG)
|
||
@@ -10496,7 +10590,8 @@
|
||
(set_attr "prefix_0f" "1")
|
||
(set_attr "mode" "SI")
|
||
(set_attr "pent_pair" "np")
|
||
- (set_attr "athlon_decode" "vector")])
|
||
+ (set_attr "athlon_decode" "vector")
|
||
+ (set_attr "amdfam10_decode" "vector")])
|
||
|
||
(define_expand "x86_shift_adj_1"
|
||
[(set (reg:CCZ FLAGS_REG)
|
||
@@ -11256,7 +11351,8 @@
|
||
[(set_attr "type" "ishift")
|
||
(set_attr "prefix_0f" "1")
|
||
(set_attr "mode" "DI")
|
||
- (set_attr "athlon_decode" "vector")])
|
||
+ (set_attr "athlon_decode" "vector")
|
||
+ (set_attr "amdfam10_decode" "vector")])
|
||
|
||
(define_expand "ashrdi3"
|
||
[(set (match_operand:DI 0 "shiftdi_operand" "")
|
||
@@ -14520,7 +14616,23 @@
|
||
[(set (match_dup 0) (xor:SI (match_dup 0) (const_int 31)))
|
||
(clobber (reg:CC FLAGS_REG))])]
|
||
""
|
||
- "")
|
||
+{
|
||
+ if (TARGET_ABM)
|
||
+ {
|
||
+ emit_insn (gen_clzsi2_abm (operands[0], operands[1]));
|
||
+ DONE;
|
||
+ }
|
||
+})
|
||
+
|
||
+(define_insn "clzsi2_abm"
|
||
+ [(set (match_operand:SI 0 "register_operand" "=r")
|
||
+ (clz:SI (match_operand:SI 1 "nonimmediate_operand" "")))
|
||
+ (clobber (reg:CC FLAGS_REG))]
|
||
+ "TARGET_ABM"
|
||
+ "lzcnt{l}\t{%1, %0|%0, %1}"
|
||
+ [(set_attr "prefix_rep" "1")
|
||
+ (set_attr "type" "bitmanip")
|
||
+ (set_attr "mode" "SI")])
|
||
|
||
(define_insn "*bsr"
|
||
[(set (match_operand:SI 0 "register_operand" "=r")
|
||
@@ -14529,7 +14641,44 @@
|
||
(clobber (reg:CC FLAGS_REG))]
|
||
""
|
||
"bsr{l}\t{%1, %0|%0, %1}"
|
||
- [(set_attr "prefix_0f" "1")])
|
||
+ [(set_attr "prefix_0f" "1")
|
||
+ (set_attr "mode" "SI")])
|
||
+
|
||
+(define_insn "popcountsi2"
|
||
+ [(set (match_operand:SI 0 "register_operand" "=r")
|
||
+ (popcount:SI (match_operand:SI 1 "nonimmediate_operand" "")))
|
||
+ (clobber (reg:CC FLAGS_REG))]
|
||
+ "TARGET_POPCNT"
|
||
+ "popcnt{l}\t{%1, %0|%0, %1}"
|
||
+ [(set_attr "prefix_rep" "1")
|
||
+ (set_attr "type" "bitmanip")
|
||
+ (set_attr "mode" "SI")])
|
||
+
|
||
+(define_insn "*popcountsi2_cmp"
|
||
+ [(set (reg FLAGS_REG)
|
||
+ (compare
|
||
+ (popcount:SI (match_operand:SI 1 "nonimmediate_operand" "rm"))
|
||
+ (const_int 0)))
|
||
+ (set (match_operand:SI 0 "register_operand" "=r")
|
||
+ (popcount:SI (match_dup 1)))]
|
||
+ "TARGET_POPCNT && ix86_match_ccmode (insn, CCZmode)"
|
||
+ "popcnt{l}\t{%1, %0|%0, %1}"
|
||
+ [(set_attr "prefix_rep" "1")
|
||
+ (set_attr "type" "bitmanip")
|
||
+ (set_attr "mode" "SI")])
|
||
+
|
||
+(define_insn "*popcountsi2_cmp_zext"
|
||
+ [(set (reg FLAGS_REG)
|
||
+ (compare
|
||
+ (popcount:SI (match_operand:SI 1 "nonimmediate_operand" "rm"))
|
||
+ (const_int 0)))
|
||
+ (set (match_operand:DI 0 "register_operand" "=r")
|
||
+ (zero_extend:DI(popcount:SI (match_dup 1))))]
|
||
+ "TARGET_64BIT && TARGET_POPCNT && ix86_match_ccmode (insn, CCZmode)"
|
||
+ "popcnt{l}\t{%1, %0|%0, %1}"
|
||
+ [(set_attr "prefix_rep" "1")
|
||
+ (set_attr "type" "bitmanip")
|
||
+ (set_attr "mode" "SI")])
|
||
|
||
(define_expand "clzdi2"
|
||
[(parallel
|
||
@@ -14541,7 +14690,23 @@
|
||
[(set (match_dup 0) (xor:DI (match_dup 0) (const_int 63)))
|
||
(clobber (reg:CC FLAGS_REG))])]
|
||
"TARGET_64BIT"
|
||
- "")
|
||
+{
|
||
+ if (TARGET_ABM)
|
||
+ {
|
||
+ emit_insn (gen_clzdi2_abm (operands[0], operands[1]));
|
||
+ DONE;
|
||
+ }
|
||
+})
|
||
+
|
||
+(define_insn "clzdi2_abm"
|
||
+ [(set (match_operand:DI 0 "register_operand" "=r")
|
||
+ (clz:DI (match_operand:DI 1 "nonimmediate_operand" "")))
|
||
+ (clobber (reg:CC FLAGS_REG))]
|
||
+ "TARGET_64BIT && TARGET_ABM"
|
||
+ "lzcnt{q}\t{%1, %0|%0, %1}"
|
||
+ [(set_attr "prefix_rep" "1")
|
||
+ (set_attr "type" "bitmanip")
|
||
+ (set_attr "mode" "DI")])
|
||
|
||
(define_insn "*bsr_rex64"
|
||
[(set (match_operand:DI 0 "register_operand" "=r")
|
||
@@ -14550,7 +14715,92 @@
|
||
(clobber (reg:CC FLAGS_REG))]
|
||
"TARGET_64BIT"
|
||
"bsr{q}\t{%1, %0|%0, %1}"
|
||
- [(set_attr "prefix_0f" "1")])
|
||
+ [(set_attr "prefix_0f" "1")
|
||
+ (set_attr "mode" "DI")])
|
||
+
|
||
+(define_insn "popcountdi2"
|
||
+ [(set (match_operand:DI 0 "register_operand" "=r")
|
||
+ (popcount:DI (match_operand:DI 1 "nonimmediate_operand" "")))
|
||
+ (clobber (reg:CC FLAGS_REG))]
|
||
+ "TARGET_64BIT && TARGET_POPCNT"
|
||
+ "popcnt{q}\t{%1, %0|%0, %1}"
|
||
+ [(set_attr "prefix_rep" "1")
|
||
+ (set_attr "type" "bitmanip")
|
||
+ (set_attr "mode" "DI")])
|
||
+
|
||
+(define_insn "*popcountdi2_cmp"
|
||
+ [(set (reg FLAGS_REG)
|
||
+ (compare
|
||
+ (popcount:DI (match_operand:DI 1 "nonimmediate_operand" "rm"))
|
||
+ (const_int 0)))
|
||
+ (set (match_operand:DI 0 "register_operand" "=r")
|
||
+ (popcount:DI (match_dup 1)))]
|
||
+ "TARGET_64BIT && TARGET_POPCNT && ix86_match_ccmode (insn, CCZmode)"
|
||
+ "popcnt{q}\t{%1, %0|%0, %1}"
|
||
+ [(set_attr "prefix_rep" "1")
|
||
+ (set_attr "type" "bitmanip")
|
||
+ (set_attr "mode" "DI")])
|
||
+
|
||
+(define_expand "clzhi2"
|
||
+ [(parallel
|
||
+ [(set (match_operand:HI 0 "register_operand" "")
|
||
+ (minus:HI (const_int 15)
|
||
+ (clz:HI (match_operand:HI 1 "nonimmediate_operand" ""))))
|
||
+ (clobber (reg:CC FLAGS_REG))])
|
||
+ (parallel
|
||
+ [(set (match_dup 0) (xor:HI (match_dup 0) (const_int 15)))
|
||
+ (clobber (reg:CC FLAGS_REG))])]
|
||
+ ""
|
||
+{
|
||
+ if (TARGET_ABM)
|
||
+ {
|
||
+ emit_insn (gen_clzhi2_abm (operands[0], operands[1]));
|
||
+ DONE;
|
||
+ }
|
||
+})
|
||
+
|
||
+(define_insn "clzhi2_abm"
|
||
+ [(set (match_operand:HI 0 "register_operand" "=r")
|
||
+ (clz:HI (match_operand:HI 1 "nonimmediate_operand" "")))
|
||
+ (clobber (reg:CC FLAGS_REG))]
|
||
+ "TARGET_ABM"
|
||
+ "lzcnt{w}\t{%1, %0|%0, %1}"
|
||
+ [(set_attr "prefix_rep" "1")
|
||
+ (set_attr "type" "bitmanip")
|
||
+ (set_attr "mode" "HI")])
|
||
+
|
||
+(define_insn "*bsrhi"
|
||
+ [(set (match_operand:HI 0 "register_operand" "=r")
|
||
+ (minus:HI (const_int 15)
|
||
+ (clz:HI (match_operand:HI 1 "nonimmediate_operand" "rm"))))
|
||
+ (clobber (reg:CC FLAGS_REG))]
|
||
+ ""
|
||
+ "bsr{w}\t{%1, %0|%0, %1}"
|
||
+ [(set_attr "prefix_0f" "1")
|
||
+ (set_attr "mode" "HI")])
|
||
+
|
||
+(define_insn "popcounthi2"
|
||
+ [(set (match_operand:HI 0 "register_operand" "=r")
|
||
+ (popcount:HI (match_operand:HI 1 "nonimmediate_operand" "")))
|
||
+ (clobber (reg:CC FLAGS_REG))]
|
||
+ "TARGET_POPCNT"
|
||
+ "popcnt{w}\t{%1, %0|%0, %1}"
|
||
+ [(set_attr "prefix_rep" "1")
|
||
+ (set_attr "type" "bitmanip")
|
||
+ (set_attr "mode" "HI")])
|
||
+
|
||
+(define_insn "*popcounthi2_cmp"
|
||
+ [(set (reg FLAGS_REG)
|
||
+ (compare
|
||
+ (popcount:HI (match_operand:HI 1 "nonimmediate_operand" "rm"))
|
||
+ (const_int 0)))
|
||
+ (set (match_operand:HI 0 "register_operand" "=r")
|
||
+ (popcount:HI (match_dup 1)))]
|
||
+ "TARGET_POPCNT && ix86_match_ccmode (insn, CCZmode)"
|
||
+ "popcnt{w}\t{%1, %0|%0, %1}"
|
||
+ [(set_attr "prefix_rep" "1")
|
||
+ (set_attr "type" "bitmanip")
|
||
+ (set_attr "mode" "HI")])
|
||
|
||
;; Thread-local storage patterns for ELF.
|
||
;;
|
||
@@ -15302,7 +15552,8 @@
|
||
sqrtss\t{%1, %0|%0, %1}"
|
||
[(set_attr "type" "fpspc,sse")
|
||
(set_attr "mode" "SF,SF")
|
||
- (set_attr "athlon_decode" "direct,*")])
|
||
+ (set_attr "athlon_decode" "direct,*")
|
||
+ (set_attr "amdfam10_decode" "direct,*")])
|
||
|
||
(define_insn "*sqrtsf2_sse"
|
||
[(set (match_operand:SF 0 "register_operand" "=x")
|
||
@@ -15311,7 +15562,8 @@
|
||
"sqrtss\t{%1, %0|%0, %1}"
|
||
[(set_attr "type" "sse")
|
||
(set_attr "mode" "SF")
|
||
- (set_attr "athlon_decode" "*")])
|
||
+ (set_attr "athlon_decode" "*")
|
||
+ (set_attr "amdfam10_decode" "*")])
|
||
|
||
(define_insn "*sqrtsf2_i387"
|
||
[(set (match_operand:SF 0 "register_operand" "=f")
|
||
@@ -15320,7 +15572,8 @@
|
||
"fsqrt"
|
||
[(set_attr "type" "fpspc")
|
||
(set_attr "mode" "SF")
|
||
- (set_attr "athlon_decode" "direct")])
|
||
+ (set_attr "athlon_decode" "direct")
|
||
+ (set_attr "amdfam10_decode" "direct")])
|
||
|
||
(define_expand "sqrtdf2"
|
||
[(set (match_operand:DF 0 "register_operand" "")
|
||
@@ -15399,7 +15652,8 @@
|
||
"fsqrt"
|
||
[(set_attr "type" "fpspc")
|
||
(set_attr "mode" "XF")
|
||
- (set_attr "athlon_decode" "direct")])
|
||
+ (set_attr "athlon_decode" "direct")
|
||
+ (set_attr "amdfam10_decode" "direct")])
|
||
|
||
(define_insn "fpremxf4"
|
||
[(set (match_operand:XF 0 "register_operand" "=f")
|
||
@@ -20186,7 +20440,7 @@
|
||
(mult:DI (match_operand:DI 1 "memory_operand" "")
|
||
(match_operand:DI 2 "immediate_operand" "")))
|
||
(clobber (reg:CC FLAGS_REG))])]
|
||
- "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size
|
||
+ "(TARGET_K8 || TARGET_GENERIC64 || TARGET_AMDFAM10) && !optimize_size
|
||
&& (GET_CODE (operands[2]) != CONST_INT
|
||
|| !CONST_OK_FOR_LETTER_P (INTVAL (operands[2]), 'K'))"
|
||
[(set (match_dup 3) (match_dup 1))
|
||
@@ -20200,7 +20454,7 @@
|
||
(mult:SI (match_operand:SI 1 "memory_operand" "")
|
||
(match_operand:SI 2 "immediate_operand" "")))
|
||
(clobber (reg:CC FLAGS_REG))])]
|
||
- "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size
|
||
+ "(TARGET_K8 || TARGET_GENERIC64 || TARGET_AMDFAM10) && !optimize_size
|
||
&& (GET_CODE (operands[2]) != CONST_INT
|
||
|| !CONST_OK_FOR_LETTER_P (INTVAL (operands[2]), 'K'))"
|
||
[(set (match_dup 3) (match_dup 1))
|
||
@@ -20215,7 +20469,7 @@
|
||
(mult:SI (match_operand:SI 1 "memory_operand" "")
|
||
(match_operand:SI 2 "immediate_operand" ""))))
|
||
(clobber (reg:CC FLAGS_REG))])]
|
||
- "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size
|
||
+ "(TARGET_K8 || TARGET_GENERIC64 || TARGET_AMDFAM10) && !optimize_size
|
||
&& (GET_CODE (operands[2]) != CONST_INT
|
||
|| !CONST_OK_FOR_LETTER_P (INTVAL (operands[2]), 'K'))"
|
||
[(set (match_dup 3) (match_dup 1))
|
||
@@ -20233,7 +20487,7 @@
|
||
(match_operand:DI 2 "const_int_operand" "")))
|
||
(clobber (reg:CC FLAGS_REG))])
|
||
(match_scratch:DI 3 "r")]
|
||
- "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size
|
||
+ "(TARGET_K8 || TARGET_GENERIC64 || TARGET_AMDFAM10) && !optimize_size
|
||
&& CONST_OK_FOR_LETTER_P (INTVAL (operands[2]), 'K')"
|
||
[(set (match_dup 3) (match_dup 2))
|
||
(parallel [(set (match_dup 0) (mult:DI (match_dup 0) (match_dup 3)))
|
||
@@ -20249,7 +20503,7 @@
|
||
(match_operand:SI 2 "const_int_operand" "")))
|
||
(clobber (reg:CC FLAGS_REG))])
|
||
(match_scratch:SI 3 "r")]
|
||
- "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size
|
||
+ "(TARGET_K8 || TARGET_GENERIC64 || TARGET_AMDFAM10) && !optimize_size
|
||
&& CONST_OK_FOR_LETTER_P (INTVAL (operands[2]), 'K')"
|
||
[(set (match_dup 3) (match_dup 2))
|
||
(parallel [(set (match_dup 0) (mult:SI (match_dup 0) (match_dup 3)))
|
||
@@ -20265,7 +20519,7 @@
|
||
(match_operand:HI 2 "immediate_operand" "")))
|
||
(clobber (reg:CC FLAGS_REG))])
|
||
(match_scratch:HI 3 "r")]
|
||
- "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size"
|
||
+ "(TARGET_K8 || TARGET_GENERIC64 || TARGET_AMDFAM10) && !optimize_size"
|
||
[(set (match_dup 3) (match_dup 2))
|
||
(parallel [(set (match_dup 0) (mult:HI (match_dup 0) (match_dup 3)))
|
||
(clobber (reg:CC FLAGS_REG))])]
|
||
--- gcc/config/i386/athlon.md.jj 2006-10-29 20:56:45.000000000 +0100
|
||
+++ gcc/config/i386/athlon.md 2007-02-09 21:26:06.000000000 +0100
|
||
@@ -29,6 +29,8 @@
|
||
(const_string "vector")]
|
||
(const_string "direct")))
|
||
|
||
+(define_attr "amdfam10_decode" "direct,vector,double"
|
||
+ (const_string "direct"))
|
||
;;
|
||
;; decode0 decode1 decode2
|
||
;; \ | /
|
||
@@ -131,18 +133,22 @@
|
||
|
||
;; Jump instructions are executed in the branch unit completely transparent to us
|
||
(define_insn_reservation "athlon_branch" 0
|
||
- (and (eq_attr "cpu" "athlon,k8,generic64")
|
||
+ (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10")
|
||
(eq_attr "type" "ibr"))
|
||
"athlon-direct,athlon-ieu")
|
||
(define_insn_reservation "athlon_call" 0
|
||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||
(eq_attr "type" "call,callv"))
|
||
"athlon-vector,athlon-ieu")
|
||
+(define_insn_reservation "athlon_call_amdfam10" 0
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (eq_attr "type" "call,callv"))
|
||
+ "athlon-double,athlon-ieu")
|
||
|
||
;; Latency of push operation is 3 cycles, but ESP value is available
|
||
;; earlier
|
||
(define_insn_reservation "athlon_push" 2
|
||
- (and (eq_attr "cpu" "athlon,k8,generic64")
|
||
+ (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10")
|
||
(eq_attr "type" "push"))
|
||
"athlon-direct,athlon-agu,athlon-store")
|
||
(define_insn_reservation "athlon_pop" 4
|
||
@@ -153,12 +159,16 @@
|
||
(and (eq_attr "cpu" "k8,generic64")
|
||
(eq_attr "type" "pop"))
|
||
"athlon-double,(athlon-ieu+athlon-load)")
|
||
+(define_insn_reservation "athlon_pop_amdfam10" 3
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (eq_attr "type" "pop"))
|
||
+ "athlon-direct,(athlon-ieu+athlon-load)")
|
||
(define_insn_reservation "athlon_leave" 3
|
||
(and (eq_attr "cpu" "athlon")
|
||
(eq_attr "type" "leave"))
|
||
"athlon-vector,(athlon-ieu+athlon-load)")
|
||
(define_insn_reservation "athlon_leave_k8" 3
|
||
- (and (eq_attr "cpu" "k8,generic64")
|
||
+ (and (eq_attr "cpu" "k8,generic64,amdfam10")
|
||
(eq_attr "type" "leave"))
|
||
"athlon-double,(athlon-ieu+athlon-load)")
|
||
|
||
@@ -167,6 +177,11 @@
|
||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||
(eq_attr "type" "lea"))
|
||
"athlon-direct,athlon-agu,nothing")
|
||
+;; Lea executes in AGU unit with 1 cycle latency on AMDFAM10
|
||
+(define_insn_reservation "athlon_lea_amdfam10" 1
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (eq_attr "type" "lea"))
|
||
+ "athlon-direct,athlon-agu,nothing")
|
||
|
||
;; Mul executes in special multiplier unit attached to IEU0
|
||
(define_insn_reservation "athlon_imul" 5
|
||
@@ -176,29 +191,35 @@
|
||
"athlon-vector,athlon-ieu0,athlon-mult,nothing,nothing,athlon-ieu0")
|
||
;; ??? Widening multiply is vector or double.
|
||
(define_insn_reservation "athlon_imul_k8_DI" 4
|
||
- (and (eq_attr "cpu" "k8,generic64")
|
||
+ (and (eq_attr "cpu" "k8,generic64,amdfam10")
|
||
(and (eq_attr "type" "imul")
|
||
(and (eq_attr "mode" "DI")
|
||
(eq_attr "memory" "none,unknown"))))
|
||
"athlon-direct0,athlon-ieu0,athlon-mult,nothing,athlon-ieu0")
|
||
(define_insn_reservation "athlon_imul_k8" 3
|
||
- (and (eq_attr "cpu" "k8,generic64")
|
||
+ (and (eq_attr "cpu" "k8,generic64,amdfam10")
|
||
(and (eq_attr "type" "imul")
|
||
(eq_attr "memory" "none,unknown")))
|
||
"athlon-direct0,athlon-ieu0,athlon-mult,athlon-ieu0")
|
||
+(define_insn_reservation "athlon_imul_amdfam10_HI" 4
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (and (eq_attr "type" "imul")
|
||
+ (and (eq_attr "mode" "HI")
|
||
+ (eq_attr "memory" "none,unknown"))))
|
||
+ "athlon-vector,athlon-ieu0,athlon-mult,nothing,athlon-ieu0")
|
||
(define_insn_reservation "athlon_imul_mem" 8
|
||
(and (eq_attr "cpu" "athlon")
|
||
(and (eq_attr "type" "imul")
|
||
(eq_attr "memory" "load,both")))
|
||
"athlon-vector,athlon-load,athlon-ieu,athlon-mult,nothing,nothing,athlon-ieu")
|
||
(define_insn_reservation "athlon_imul_mem_k8_DI" 7
|
||
- (and (eq_attr "cpu" "k8,generic64")
|
||
+ (and (eq_attr "cpu" "k8,generic64,amdfam10")
|
||
(and (eq_attr "type" "imul")
|
||
(and (eq_attr "mode" "DI")
|
||
(eq_attr "memory" "load,both"))))
|
||
"athlon-vector,athlon-load,athlon-ieu,athlon-mult,nothing,athlon-ieu")
|
||
(define_insn_reservation "athlon_imul_mem_k8" 6
|
||
- (and (eq_attr "cpu" "k8,generic64")
|
||
+ (and (eq_attr "cpu" "k8,generic64,amdfam10")
|
||
(and (eq_attr "type" "imul")
|
||
(eq_attr "memory" "load,both")))
|
||
"athlon-vector,athlon-load,athlon-ieu,athlon-mult,athlon-ieu")
|
||
@@ -209,21 +230,23 @@
|
||
;; other instructions.
|
||
;; ??? Experiments show that the idiv can overlap with roughly 6 cycles
|
||
;; of the other code
|
||
+;; Using the same heuristics for amdfam10 as K8 with idiv
|
||
|
||
(define_insn_reservation "athlon_idiv" 6
|
||
- (and (eq_attr "cpu" "athlon,k8,generic64")
|
||
+ (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10")
|
||
(and (eq_attr "type" "idiv")
|
||
(eq_attr "memory" "none,unknown")))
|
||
"athlon-vector,(athlon-ieu0*6+(athlon-fpsched,athlon-fvector))")
|
||
(define_insn_reservation "athlon_idiv_mem" 9
|
||
- (and (eq_attr "cpu" "athlon,k8,generic64")
|
||
+ (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10")
|
||
(and (eq_attr "type" "idiv")
|
||
(eq_attr "memory" "load,both")))
|
||
"athlon-vector,((athlon-load,athlon-ieu0*6)+(athlon-fpsched,athlon-fvector))")
|
||
;; The parallelism of string instructions is not documented. Model it same way
|
||
;; as idiv to create smaller automata. This probably does not matter much.
|
||
+;; Using the same heuristics for amdfam10 as K8 with idiv
|
||
(define_insn_reservation "athlon_str" 6
|
||
- (and (eq_attr "cpu" "athlon,k8,generic64")
|
||
+ (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10")
|
||
(and (eq_attr "type" "str")
|
||
(eq_attr "memory" "load,both,store")))
|
||
"athlon-vector,athlon-load,athlon-ieu0*6")
|
||
@@ -234,34 +257,62 @@
|
||
(and (eq_attr "unit" "integer,unknown")
|
||
(eq_attr "memory" "none,unknown"))))
|
||
"athlon-direct,athlon-ieu")
|
||
+(define_insn_reservation "athlon_idirect_amdfam10" 1
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (and (eq_attr "amdfam10_decode" "direct")
|
||
+ (and (eq_attr "unit" "integer,unknown")
|
||
+ (eq_attr "memory" "none,unknown"))))
|
||
+ "athlon-direct,athlon-ieu")
|
||
(define_insn_reservation "athlon_ivector" 2
|
||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||
(and (eq_attr "athlon_decode" "vector")
|
||
(and (eq_attr "unit" "integer,unknown")
|
||
(eq_attr "memory" "none,unknown"))))
|
||
"athlon-vector,athlon-ieu,athlon-ieu")
|
||
+(define_insn_reservation "athlon_ivector_amdfam10" 2
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (and (eq_attr "amdfam10_decode" "vector")
|
||
+ (and (eq_attr "unit" "integer,unknown")
|
||
+ (eq_attr "memory" "none,unknown"))))
|
||
+ "athlon-vector,athlon-ieu,athlon-ieu")
|
||
+
|
||
(define_insn_reservation "athlon_idirect_loadmov" 3
|
||
- (and (eq_attr "cpu" "athlon,k8,generic64")
|
||
+ (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10")
|
||
(and (eq_attr "type" "imov")
|
||
(eq_attr "memory" "load")))
|
||
"athlon-direct,athlon-load")
|
||
+
|
||
(define_insn_reservation "athlon_idirect_load" 4
|
||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||
(and (eq_attr "athlon_decode" "direct")
|
||
(and (eq_attr "unit" "integer,unknown")
|
||
(eq_attr "memory" "load"))))
|
||
"athlon-direct,athlon-load,athlon-ieu")
|
||
+(define_insn_reservation "athlon_idirect_load_amdfam10" 4
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (and (eq_attr "amdfam10_decode" "direct")
|
||
+ (and (eq_attr "unit" "integer,unknown")
|
||
+ (eq_attr "memory" "load"))))
|
||
+ "athlon-direct,athlon-load,athlon-ieu")
|
||
(define_insn_reservation "athlon_ivector_load" 6
|
||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||
(and (eq_attr "athlon_decode" "vector")
|
||
(and (eq_attr "unit" "integer,unknown")
|
||
(eq_attr "memory" "load"))))
|
||
"athlon-vector,athlon-load,athlon-ieu,athlon-ieu")
|
||
+(define_insn_reservation "athlon_ivector_load_amdfam10" 6
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (and (eq_attr "amdfam10_decode" "vector")
|
||
+ (and (eq_attr "unit" "integer,unknown")
|
||
+ (eq_attr "memory" "load"))))
|
||
+ "athlon-vector,athlon-load,athlon-ieu,athlon-ieu")
|
||
+
|
||
(define_insn_reservation "athlon_idirect_movstore" 1
|
||
- (and (eq_attr "cpu" "athlon,k8,generic64")
|
||
+ (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10")
|
||
(and (eq_attr "type" "imov")
|
||
(eq_attr "memory" "store")))
|
||
"athlon-direct,athlon-agu,athlon-store")
|
||
+
|
||
(define_insn_reservation "athlon_idirect_both" 4
|
||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||
(and (eq_attr "athlon_decode" "direct")
|
||
@@ -270,6 +321,15 @@
|
||
"athlon-direct,athlon-load,
|
||
athlon-ieu,athlon-store,
|
||
athlon-store")
|
||
+(define_insn_reservation "athlon_idirect_both_amdfam10" 4
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (and (eq_attr "amdfam10_decode" "direct")
|
||
+ (and (eq_attr "unit" "integer,unknown")
|
||
+ (eq_attr "memory" "both"))))
|
||
+ "athlon-direct,athlon-load,
|
||
+ athlon-ieu,athlon-store,
|
||
+ athlon-store")
|
||
+
|
||
(define_insn_reservation "athlon_ivector_both" 6
|
||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||
(and (eq_attr "athlon_decode" "vector")
|
||
@@ -279,6 +339,16 @@
|
||
athlon-ieu,
|
||
athlon-ieu,
|
||
athlon-store")
|
||
+(define_insn_reservation "athlon_ivector_both_amdfam10" 6
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (and (eq_attr "amdfam10_decode" "vector")
|
||
+ (and (eq_attr "unit" "integer,unknown")
|
||
+ (eq_attr "memory" "both"))))
|
||
+ "athlon-vector,athlon-load,
|
||
+ athlon-ieu,
|
||
+ athlon-ieu,
|
||
+ athlon-store")
|
||
+
|
||
(define_insn_reservation "athlon_idirect_store" 1
|
||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||
(and (eq_attr "athlon_decode" "direct")
|
||
@@ -286,6 +356,14 @@
|
||
(eq_attr "memory" "store"))))
|
||
"athlon-direct,(athlon-ieu+athlon-agu),
|
||
athlon-store")
|
||
+(define_insn_reservation "athlon_idirect_store_amdfam10" 1
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (and (eq_attr "amdfam10_decode" "direct")
|
||
+ (and (eq_attr "unit" "integer,unknown")
|
||
+ (eq_attr "memory" "store"))))
|
||
+ "athlon-direct,(athlon-ieu+athlon-agu),
|
||
+ athlon-store")
|
||
+
|
||
(define_insn_reservation "athlon_ivector_store" 2
|
||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||
(and (eq_attr "athlon_decode" "vector")
|
||
@@ -293,6 +371,13 @@
|
||
(eq_attr "memory" "store"))))
|
||
"athlon-vector,(athlon-ieu+athlon-agu),athlon-ieu,
|
||
athlon-store")
|
||
+(define_insn_reservation "athlon_ivector_store_amdfam10" 2
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (and (eq_attr "amdfam10_decode" "vector")
|
||
+ (and (eq_attr "unit" "integer,unknown")
|
||
+ (eq_attr "memory" "store"))))
|
||
+ "athlon-vector,(athlon-ieu+athlon-agu),athlon-ieu,
|
||
+ athlon-store")
|
||
|
||
;; Athlon floatin point unit
|
||
(define_insn_reservation "athlon_fldxf" 12
|
||
@@ -302,7 +387,7 @@
|
||
(eq_attr "mode" "XF"))))
|
||
"athlon-vector,athlon-fpload2,athlon-fvector*9")
|
||
(define_insn_reservation "athlon_fldxf_k8" 13
|
||
- (and (eq_attr "cpu" "k8,generic64")
|
||
+ (and (eq_attr "cpu" "k8,generic64,amdfam10")
|
||
(and (eq_attr "type" "fmov")
|
||
(and (eq_attr "memory" "load")
|
||
(eq_attr "mode" "XF"))))
|
||
@@ -314,7 +399,7 @@
|
||
(eq_attr "memory" "load")))
|
||
"athlon-direct,athlon-fpload,athlon-fany")
|
||
(define_insn_reservation "athlon_fld_k8" 2
|
||
- (and (eq_attr "cpu" "k8,generic64")
|
||
+ (and (eq_attr "cpu" "k8,generic64,amdfam10")
|
||
(and (eq_attr "type" "fmov")
|
||
(eq_attr "memory" "load")))
|
||
"athlon-direct,athlon-fploadk8,athlon-fstore")
|
||
@@ -326,7 +411,7 @@
|
||
(eq_attr "mode" "XF"))))
|
||
"athlon-vector,(athlon-fpsched+athlon-agu),(athlon-store2+(athlon-fvector*7))")
|
||
(define_insn_reservation "athlon_fstxf_k8" 8
|
||
- (and (eq_attr "cpu" "k8,generic64")
|
||
+ (and (eq_attr "cpu" "k8,generic64,amdfam10")
|
||
(and (eq_attr "type" "fmov")
|
||
(and (eq_attr "memory" "store,both")
|
||
(eq_attr "mode" "XF"))))
|
||
@@ -337,16 +422,16 @@
|
||
(eq_attr "memory" "store,both")))
|
||
"athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)")
|
||
(define_insn_reservation "athlon_fst_k8" 2
|
||
- (and (eq_attr "cpu" "k8,generic64")
|
||
+ (and (eq_attr "cpu" "k8,generic64,amdfam10")
|
||
(and (eq_attr "type" "fmov")
|
||
(eq_attr "memory" "store,both")))
|
||
"athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)")
|
||
(define_insn_reservation "athlon_fist" 4
|
||
- (and (eq_attr "cpu" "athlon,k8,generic64")
|
||
+ (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10")
|
||
(eq_attr "type" "fistp,fisttp"))
|
||
"athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)")
|
||
(define_insn_reservation "athlon_fmov" 2
|
||
- (and (eq_attr "cpu" "athlon,k8,generic64")
|
||
+ (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10")
|
||
(eq_attr "type" "fmov"))
|
||
"athlon-direct,athlon-fpsched,athlon-faddmul")
|
||
(define_insn_reservation "athlon_fadd_load" 4
|
||
@@ -355,12 +440,12 @@
|
||
(eq_attr "memory" "load")))
|
||
"athlon-direct,athlon-fpload,athlon-fadd")
|
||
(define_insn_reservation "athlon_fadd_load_k8" 6
|
||
- (and (eq_attr "cpu" "k8,generic64")
|
||
+ (and (eq_attr "cpu" "k8,generic64,amdfam10")
|
||
(and (eq_attr "type" "fop")
|
||
(eq_attr "memory" "load")))
|
||
"athlon-direct,athlon-fploadk8,athlon-fadd")
|
||
(define_insn_reservation "athlon_fadd" 4
|
||
- (and (eq_attr "cpu" "athlon,k8,generic64")
|
||
+ (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10")
|
||
(eq_attr "type" "fop"))
|
||
"athlon-direct,athlon-fpsched,athlon-fadd")
|
||
(define_insn_reservation "athlon_fmul_load" 4
|
||
@@ -369,16 +454,16 @@
|
||
(eq_attr "memory" "load")))
|
||
"athlon-direct,athlon-fpload,athlon-fmul")
|
||
(define_insn_reservation "athlon_fmul_load_k8" 6
|
||
- (and (eq_attr "cpu" "k8,generic64")
|
||
+ (and (eq_attr "cpu" "k8,generic64,amdfam10")
|
||
(and (eq_attr "type" "fmul")
|
||
(eq_attr "memory" "load")))
|
||
"athlon-direct,athlon-fploadk8,athlon-fmul")
|
||
(define_insn_reservation "athlon_fmul" 4
|
||
- (and (eq_attr "cpu" "athlon,k8,generic64")
|
||
+ (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10")
|
||
(eq_attr "type" "fmul"))
|
||
"athlon-direct,athlon-fpsched,athlon-fmul")
|
||
(define_insn_reservation "athlon_fsgn" 2
|
||
- (and (eq_attr "cpu" "athlon,k8,generic64")
|
||
+ (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10")
|
||
(eq_attr "type" "fsgn"))
|
||
"athlon-direct,athlon-fpsched,athlon-fmul")
|
||
(define_insn_reservation "athlon_fdiv_load" 24
|
||
@@ -387,7 +472,7 @@
|
||
(eq_attr "memory" "load")))
|
||
"athlon-direct,athlon-fpload,athlon-fmul")
|
||
(define_insn_reservation "athlon_fdiv_load_k8" 13
|
||
- (and (eq_attr "cpu" "k8,generic64")
|
||
+ (and (eq_attr "cpu" "k8,generic64,amdfam10")
|
||
(and (eq_attr "type" "fdiv")
|
||
(eq_attr "memory" "load")))
|
||
"athlon-direct,athlon-fploadk8,athlon-fmul")
|
||
@@ -396,16 +481,16 @@
|
||
(eq_attr "type" "fdiv"))
|
||
"athlon-direct,athlon-fpsched,athlon-fmul")
|
||
(define_insn_reservation "athlon_fdiv_k8" 11
|
||
- (and (eq_attr "cpu" "k8,generic64")
|
||
+ (and (eq_attr "cpu" "k8,generic64,amdfam10")
|
||
(eq_attr "type" "fdiv"))
|
||
"athlon-direct,athlon-fpsched,athlon-fmul")
|
||
(define_insn_reservation "athlon_fpspc_load" 103
|
||
- (and (eq_attr "cpu" "athlon,k8,generic64")
|
||
+ (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10")
|
||
(and (eq_attr "type" "fpspc")
|
||
(eq_attr "memory" "load")))
|
||
"athlon-vector,athlon-fpload,athlon-fvector")
|
||
(define_insn_reservation "athlon_fpspc" 100
|
||
- (and (eq_attr "cpu" "athlon,k8,generic64")
|
||
+ (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10")
|
||
(eq_attr "type" "fpspc"))
|
||
"athlon-vector,athlon-fpsched,athlon-fvector")
|
||
(define_insn_reservation "athlon_fcmov_load" 7
|
||
@@ -418,12 +503,12 @@
|
||
(eq_attr "type" "fcmov"))
|
||
"athlon-vector,athlon-fpsched,athlon-fvector")
|
||
(define_insn_reservation "athlon_fcmov_load_k8" 17
|
||
- (and (eq_attr "cpu" "k8,generic64")
|
||
+ (and (eq_attr "cpu" "k8,generic64,amdfam10")
|
||
(and (eq_attr "type" "fcmov")
|
||
(eq_attr "memory" "load")))
|
||
"athlon-vector,athlon-fploadk8,athlon-fvector")
|
||
(define_insn_reservation "athlon_fcmov_k8" 15
|
||
- (and (eq_attr "cpu" "k8,generic64")
|
||
+ (and (eq_attr "cpu" "k8,generic64,amdfam10")
|
||
(eq_attr "type" "fcmov"))
|
||
"athlon-vector,athlon-fpsched,athlon-fvector")
|
||
;; fcomi is vector decoded by uses only one pipe.
|
||
@@ -434,13 +519,13 @@
|
||
(eq_attr "memory" "load"))))
|
||
"athlon-vector,athlon-fpload,athlon-fadd")
|
||
(define_insn_reservation "athlon_fcomi_load_k8" 5
|
||
- (and (eq_attr "cpu" "k8,generic64")
|
||
+ (and (eq_attr "cpu" "k8,generic64,amdfam10")
|
||
(and (eq_attr "type" "fcmp")
|
||
(and (eq_attr "athlon_decode" "vector")
|
||
(eq_attr "memory" "load"))))
|
||
"athlon-vector,athlon-fploadk8,athlon-fadd")
|
||
(define_insn_reservation "athlon_fcomi" 3
|
||
- (and (eq_attr "cpu" "athlon,k8,generic64")
|
||
+ (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10")
|
||
(and (eq_attr "athlon_decode" "vector")
|
||
(eq_attr "type" "fcmp")))
|
||
"athlon-vector,athlon-fpsched,athlon-fadd")
|
||
@@ -450,18 +535,18 @@
|
||
(eq_attr "memory" "load")))
|
||
"athlon-direct,athlon-fpload,athlon-fadd")
|
||
(define_insn_reservation "athlon_fcom_load_k8" 4
|
||
- (and (eq_attr "cpu" "k8,generic64")
|
||
+ (and (eq_attr "cpu" "k8,generic64,amdfam10")
|
||
(and (eq_attr "type" "fcmp")
|
||
(eq_attr "memory" "load")))
|
||
"athlon-direct,athlon-fploadk8,athlon-fadd")
|
||
(define_insn_reservation "athlon_fcom" 2
|
||
- (and (eq_attr "cpu" "athlon,k8,generic64")
|
||
+ (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10")
|
||
(eq_attr "type" "fcmp"))
|
||
"athlon-direct,athlon-fpsched,athlon-fadd")
|
||
;; Never seen by the scheduler because we still don't do post reg-stack
|
||
;; scheduling.
|
||
;(define_insn_reservation "athlon_fxch" 2
|
||
-; (and (eq_attr "cpu" "athlon,k8,generic64")
|
||
+; (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10")
|
||
; (eq_attr "type" "fxch"))
|
||
; "athlon-direct,athlon-fpsched,athlon-fany")
|
||
|
||
@@ -516,6 +601,23 @@
|
||
(and (eq_attr "type" "mmxmov,ssemov")
|
||
(eq_attr "memory" "load")))
|
||
"athlon-direct,athlon-fploadk8,athlon-fstore")
|
||
+;; On AMDFAM10 all double, single and integer packed and scalar SSEx data
|
||
+;; loads generated are direct path, latency of 2 and do not use any FP
|
||
+;; execution units.  No separate entries for movlpx/movhpx loads, which
|
||
+;; are direct path, latency of 4 and use the FADD/FMUL FP execution units,
|
||
+;; as they will not be generated.
|
||
+(define_insn_reservation "athlon_sseld_amdfam10" 2
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (and (eq_attr "type" "ssemov")
|
||
+ (eq_attr "memory" "load")))
|
||
+ "athlon-direct,athlon-fploadk8")
|
||
+;; On AMDFAM10 MMX data loads generated are direct path, latency of 4
|
||
+;; and can use any FP execution unit.
|
||
+(define_insn_reservation "athlon_mmxld_amdfam10" 4
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (and (eq_attr "type" "mmxmov")
|
||
+ (eq_attr "memory" "load")))
|
||
+ "athlon-direct,athlon-fploadk8, athlon-fany")
|
||
(define_insn_reservation "athlon_mmxssest" 3
|
||
(and (eq_attr "cpu" "k8,generic64")
|
||
(and (eq_attr "type" "mmxmov,ssemov")
|
||
@@ -533,6 +635,25 @@
|
||
(and (eq_attr "type" "mmxmov,ssemov")
|
||
(eq_attr "memory" "store,both")))
|
||
"athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)")
|
||
+;; On AMDFAM10 all double, single and integer packed SSEx data stores
|
||
+;; generated are all double path, latency of 2 and use the FSTORE FP
|
||
+;; execution unit.  No separate entries for movupx/movdqu, which are
|
||
+;; vector path, latency of 3 and use the FSTORE*2 FP execution unit,
|
||
+;; as they will not be generated.
|
||
+(define_insn_reservation "athlon_ssest_amdfam10" 2
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (and (eq_attr "type" "ssemov")
|
||
+ (and (eq_attr "mode" "V4SF,V2DF,TI")
|
||
+ (eq_attr "memory" "store,both"))))
|
||
+ "athlon-double,(athlon-fpsched+athlon-agu),((athlon-fstore+athlon-store)*2)")
|
||
+;; On AMDFAM10 all double, single and integer scalar SSEx and MMX
|
||
+;; data stores generated are all direct path, latency of 2 and use
|
||
+;; the FSTORE FP execution unit
|
||
+(define_insn_reservation "athlon_mmxssest_short_amdfam10" 2
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (and (eq_attr "type" "mmxmov,ssemov")
|
||
+ (eq_attr "memory" "store,both")))
|
||
+ "athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)")
|
||
(define_insn_reservation "athlon_movaps_k8" 2
|
||
(and (eq_attr "cpu" "k8,generic64")
|
||
(and (eq_attr "type" "ssemov")
|
||
@@ -578,6 +699,11 @@
|
||
(and (eq_attr "type" "sselog,sselog1")
|
||
(eq_attr "memory" "load")))
|
||
"athlon-double,athlon-fpload2k8,(athlon-fmul*2)")
|
||
+(define_insn_reservation "athlon_sselog_load_amdfam10" 4
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (and (eq_attr "type" "sselog,sselog1")
|
||
+ (eq_attr "memory" "load")))
|
||
+ "athlon-direct,athlon-fploadk8,(athlon-fadd|athlon-fmul)")
|
||
(define_insn_reservation "athlon_sselog" 3
|
||
(and (eq_attr "cpu" "athlon")
|
||
(eq_attr "type" "sselog,sselog1"))
|
||
@@ -586,6 +712,11 @@
|
||
(and (eq_attr "cpu" "k8,generic64")
|
||
(eq_attr "type" "sselog,sselog1"))
|
||
"athlon-double,athlon-fpsched,athlon-fmul")
|
||
+(define_insn_reservation "athlon_sselog_amdfam10" 2
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (eq_attr "type" "sselog,sselog1"))
|
||
+ "athlon-direct,athlon-fpsched,(athlon-fadd|athlon-fmul)")
|
||
+
|
||
;; ??? pcmp executes in addmul, probably not worthwhile to bother about that.
|
||
(define_insn_reservation "athlon_ssecmp_load" 2
|
||
(and (eq_attr "cpu" "athlon")
|
||
@@ -594,13 +725,13 @@
|
||
(eq_attr "memory" "load"))))
|
||
"athlon-direct,athlon-fpload,athlon-fadd")
|
||
(define_insn_reservation "athlon_ssecmp_load_k8" 4
|
||
- (and (eq_attr "cpu" "k8,generic64")
|
||
+ (and (eq_attr "cpu" "k8,generic64,amdfam10")
|
||
(and (eq_attr "type" "ssecmp")
|
||
(and (eq_attr "mode" "SF,DF,DI,TI")
|
||
(eq_attr "memory" "load"))))
|
||
"athlon-direct,athlon-fploadk8,athlon-fadd")
|
||
(define_insn_reservation "athlon_ssecmp" 2
|
||
- (and (eq_attr "cpu" "athlon,k8,generic64")
|
||
+ (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10")
|
||
(and (eq_attr "type" "ssecmp")
|
||
(eq_attr "mode" "SF,DF,DI,TI")))
|
||
"athlon-direct,athlon-fpsched,athlon-fadd")
|
||
@@ -614,6 +745,11 @@
|
||
(and (eq_attr "type" "ssecmp")
|
||
(eq_attr "memory" "load")))
|
||
"athlon-double,athlon-fpload2k8,(athlon-fadd*2)")
|
||
+(define_insn_reservation "athlon_ssecmpvector_load_amdfam10" 4
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (and (eq_attr "type" "ssecmp")
|
||
+ (eq_attr "memory" "load")))
|
||
+ "athlon-direct,athlon-fploadk8,athlon-fadd")
|
||
(define_insn_reservation "athlon_ssecmpvector" 3
|
||
(and (eq_attr "cpu" "athlon")
|
||
(eq_attr "type" "ssecmp"))
|
||
@@ -622,6 +758,10 @@
|
||
(and (eq_attr "cpu" "k8,generic64")
|
||
(eq_attr "type" "ssecmp"))
|
||
"athlon-double,athlon-fpsched,(athlon-fadd*2)")
|
||
+(define_insn_reservation "athlon_ssecmpvector_amdfam10" 2
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (eq_attr "type" "ssecmp"))
|
||
+ "athlon-direct,athlon-fpsched,athlon-fadd")
|
||
(define_insn_reservation "athlon_ssecomi_load" 4
|
||
(and (eq_attr "cpu" "athlon")
|
||
(and (eq_attr "type" "ssecomi")
|
||
@@ -632,10 +772,20 @@
|
||
(and (eq_attr "type" "ssecomi")
|
||
(eq_attr "memory" "load")))
|
||
"athlon-vector,athlon-fploadk8,athlon-fadd")
|
||
+(define_insn_reservation "athlon_ssecomi_load_amdfam10" 5
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (and (eq_attr "type" "ssecomi")
|
||
+ (eq_attr "memory" "load")))
|
||
+ "athlon-direct,athlon-fploadk8,athlon-fadd")
|
||
(define_insn_reservation "athlon_ssecomi" 4
|
||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||
(eq_attr "type" "ssecmp"))
|
||
"athlon-vector,athlon-fpsched,athlon-fadd")
|
||
+(define_insn_reservation "athlon_ssecomi_amdfam10" 3
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+;; athlon_ssecomi tests type ssecmp instead of ssecomi (apparent bug); the correct type is used for amdfam10
|
||
+ (eq_attr "type" "ssecomi"))
|
||
+ "athlon-direct,athlon-fpsched,athlon-fadd")
|
||
(define_insn_reservation "athlon_sseadd_load" 4
|
||
(and (eq_attr "cpu" "athlon")
|
||
(and (eq_attr "type" "sseadd")
|
||
@@ -643,13 +793,13 @@
|
||
(eq_attr "memory" "load"))))
|
||
"athlon-direct,athlon-fpload,athlon-fadd")
|
||
(define_insn_reservation "athlon_sseadd_load_k8" 6
|
||
- (and (eq_attr "cpu" "k8,generic64")
|
||
+ (and (eq_attr "cpu" "k8,generic64,amdfam10")
|
||
(and (eq_attr "type" "sseadd")
|
||
(and (eq_attr "mode" "SF,DF,DI")
|
||
(eq_attr "memory" "load"))))
|
||
"athlon-direct,athlon-fploadk8,athlon-fadd")
|
||
(define_insn_reservation "athlon_sseadd" 4
|
||
- (and (eq_attr "cpu" "athlon,k8,generic64")
|
||
+ (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10")
|
||
(and (eq_attr "type" "sseadd")
|
||
(eq_attr "mode" "SF,DF,DI")))
|
||
"athlon-direct,athlon-fpsched,athlon-fadd")
|
||
@@ -663,6 +813,11 @@
|
||
(and (eq_attr "type" "sseadd")
|
||
(eq_attr "memory" "load")))
|
||
"athlon-double,athlon-fpload2k8,(athlon-fadd*2)")
|
||
+(define_insn_reservation "athlon_sseaddvector_load_amdfam10" 6
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (and (eq_attr "type" "sseadd")
|
||
+ (eq_attr "memory" "load")))
|
||
+ "athlon-direct,athlon-fploadk8,athlon-fadd")
|
||
(define_insn_reservation "athlon_sseaddvector" 5
|
||
(and (eq_attr "cpu" "athlon")
|
||
(eq_attr "type" "sseadd"))
|
||
@@ -671,6 +826,10 @@
|
||
(and (eq_attr "cpu" "k8,generic64")
|
||
(eq_attr "type" "sseadd"))
|
||
"athlon-double,athlon-fpsched,(athlon-fadd*2)")
|
||
+(define_insn_reservation "athlon_sseaddvector_amdfam10" 4
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (eq_attr "type" "sseadd"))
|
||
+ "athlon-direct,athlon-fpsched,athlon-fadd")
|
||
|
||
;; Conversions behaves very irregularly and the scheduling is critical here.
|
||
;; Take each instruction separately. Assume that the mode is always set to the
|
||
@@ -684,12 +843,25 @@
|
||
(and (eq_attr "mode" "DF")
|
||
(eq_attr "memory" "load")))))
|
||
"athlon-direct,athlon-fploadk8,athlon-fstore")
|
||
+(define_insn_reservation "athlon_ssecvt_cvtss2sd_load_amdfam10" 7
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (and (eq_attr "type" "ssecvt")
|
||
+ (and (eq_attr "amdfam10_decode" "double")
|
||
+ (and (eq_attr "mode" "DF")
|
||
+ (eq_attr "memory" "load")))))
|
||
+ "athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)")
|
||
(define_insn_reservation "athlon_ssecvt_cvtss2sd" 2
|
||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||
(and (eq_attr "type" "ssecvt")
|
||
(and (eq_attr "athlon_decode" "direct")
|
||
(eq_attr "mode" "DF"))))
|
||
"athlon-direct,athlon-fpsched,athlon-fstore")
|
||
+(define_insn_reservation "athlon_ssecvt_cvtss2sd_amdfam10" 7
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (and (eq_attr "type" "ssecvt")
|
||
+ (and (eq_attr "amdfam10_decode" "vector")
|
||
+ (eq_attr "mode" "DF"))))
|
||
+ "athlon-vector,athlon-fpsched,athlon-faddmul,(athlon-fstore*2)")
|
||
;; cvtps2pd. Model same way the other double decoded FP conversions.
|
||
(define_insn_reservation "athlon_ssecvt_cvtps2pd_load_k8" 5
|
||
(and (eq_attr "cpu" "k8,athlon,generic64")
|
||
@@ -698,12 +870,25 @@
|
||
(and (eq_attr "mode" "V2DF,V4SF,TI")
|
||
(eq_attr "memory" "load")))))
|
||
"athlon-double,athlon-fpload2k8,(athlon-fstore*2)")
|
||
+(define_insn_reservation "athlon_ssecvt_cvtps2pd_load_amdfam10" 4
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (and (eq_attr "type" "ssecvt")
|
||
+ (and (eq_attr "amdfam10_decode" "direct")
|
||
+ (and (eq_attr "mode" "V2DF,V4SF,TI")
|
||
+ (eq_attr "memory" "load")))))
|
||
+ "athlon-direct,athlon-fploadk8,athlon-fstore")
|
||
(define_insn_reservation "athlon_ssecvt_cvtps2pd_k8" 3
|
||
(and (eq_attr "cpu" "k8,athlon,generic64")
|
||
(and (eq_attr "type" "ssecvt")
|
||
(and (eq_attr "athlon_decode" "double")
|
||
(eq_attr "mode" "V2DF,V4SF,TI"))))
|
||
"athlon-double,athlon-fpsched,athlon-fstore,athlon-fstore")
|
||
+(define_insn_reservation "athlon_ssecvt_cvtps2pd_amdfam10" 2
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (and (eq_attr "type" "ssecvt")
|
||
+ (and (eq_attr "amdfam10_decode" "direct")
|
||
+ (eq_attr "mode" "V2DF,V4SF,TI"))))
|
||
+ "athlon-direct,athlon-fpsched,athlon-fstore")
|
||
;; cvtsi2sd mem,reg is directpath path (cvtsi2sd reg,reg is doublepath)
|
||
;; cvtsi2sd has troughput 1 and is executed in store unit with latency of 6
|
||
(define_insn_reservation "athlon_sseicvt_cvtsi2sd_load" 6
|
||
@@ -713,6 +898,13 @@
|
||
(and (eq_attr "mode" "SF,DF")
|
||
(eq_attr "memory" "load")))))
|
||
"athlon-direct,athlon-fploadk8,athlon-fstore")
|
||
+(define_insn_reservation "athlon_sseicvt_cvtsi2sd_load_amdfam10" 9
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (and (eq_attr "type" "sseicvt")
|
||
+ (and (eq_attr "amdfam10_decode" "double")
|
||
+ (and (eq_attr "mode" "SF,DF")
|
||
+ (eq_attr "memory" "load")))))
|
||
+ "athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)")
|
||
;; cvtsi2ss mem, reg is doublepath
|
||
(define_insn_reservation "athlon_sseicvt_cvtsi2ss_load" 9
|
||
(and (eq_attr "cpu" "athlon")
|
||
@@ -728,6 +920,13 @@
|
||
(and (eq_attr "mode" "SF,DF")
|
||
(eq_attr "memory" "load")))))
|
||
"athlon-double,athlon-fploadk8,(athlon-fstore*2)")
|
||
+(define_insn_reservation "athlon_sseicvt_cvtsi2ss_load_amdfam10" 9
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (and (eq_attr "type" "sseicvt")
|
||
+ (and (eq_attr "amdfam10_decode" "double")
|
||
+ (and (eq_attr "mode" "SF,DF")
|
||
+ (eq_attr "memory" "load")))))
|
||
+ "athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)")
|
||
;; cvtsi2sd reg,reg is double decoded (vector on Athlon)
|
||
(define_insn_reservation "athlon_sseicvt_cvtsi2sd_k8" 11
|
||
(and (eq_attr "cpu" "k8,athlon,generic64")
|
||
@@ -736,6 +935,13 @@
|
||
(and (eq_attr "mode" "SF,DF")
|
||
(eq_attr "memory" "none")))))
|
||
"athlon-double,athlon-fploadk8,athlon-fstore")
|
||
+(define_insn_reservation "athlon_sseicvt_cvtsi2sd_amdfam10" 14
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (and (eq_attr "type" "sseicvt")
|
||
+ (and (eq_attr "amdfam10_decode" "vector")
|
||
+ (and (eq_attr "mode" "SF,DF")
|
||
+ (eq_attr "memory" "none")))))
|
||
+ "athlon-vector,athlon-fploadk8,(athlon-faddmul+athlon-fstore)")
|
||
;; cvtsi2ss reg, reg is doublepath
|
||
(define_insn_reservation "athlon_sseicvt_cvtsi2ss" 14
|
||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||
@@ -744,6 +950,13 @@
|
||
(and (eq_attr "mode" "SF,DF")
|
||
(eq_attr "memory" "none")))))
|
||
"athlon-vector,athlon-fploadk8,(athlon-fvector*2)")
|
||
+(define_insn_reservation "athlon_sseicvt_cvtsi2ss_amdfam10" 14
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (and (eq_attr "type" "sseicvt")
|
||
+ (and (eq_attr "amdfam10_decode" "vector")
|
||
+ (and (eq_attr "mode" "SF,DF")
|
||
+ (eq_attr "memory" "none")))))
|
||
+ "athlon-vector,athlon-fploadk8,(athlon-faddmul+athlon-fstore)")
|
||
;; cvtsd2ss mem,reg is doublepath, troughput unknown, latency 9
|
||
(define_insn_reservation "athlon_ssecvt_cvtsd2ss_load_k8" 9
|
||
(and (eq_attr "cpu" "k8,athlon,generic64")
|
||
@@ -752,6 +965,13 @@
|
||
(and (eq_attr "mode" "SF")
|
||
(eq_attr "memory" "load")))))
|
||
"athlon-double,athlon-fploadk8,(athlon-fstore*3)")
|
||
+(define_insn_reservation "athlon_ssecvt_cvtsd2ss_load_amdfam10" 9
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (and (eq_attr "type" "ssecvt")
|
||
+ (and (eq_attr "amdfam10_decode" "double")
|
||
+ (and (eq_attr "mode" "SF")
|
||
+ (eq_attr "memory" "load")))))
|
||
+ "athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)")
|
||
;; cvtsd2ss reg,reg is vectorpath, troughput unknown, latency 12
|
||
(define_insn_reservation "athlon_ssecvt_cvtsd2ss" 12
|
||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||
@@ -760,6 +980,13 @@
|
||
(and (eq_attr "mode" "SF")
|
||
(eq_attr "memory" "none")))))
|
||
"athlon-vector,athlon-fpsched,(athlon-fvector*3)")
|
||
+(define_insn_reservation "athlon_ssecvt_cvtsd2ss_amdfam10" 8
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (and (eq_attr "type" "ssecvt")
|
||
+ (and (eq_attr "amdfam10_decode" "vector")
|
||
+ (and (eq_attr "mode" "SF")
|
||
+ (eq_attr "memory" "none")))))
|
||
+ "athlon-vector,athlon-fpsched,athlon-faddmul,(athlon-fstore*2)")
|
||
(define_insn_reservation "athlon_ssecvt_cvtpd2ps_load_k8" 8
|
||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||
(and (eq_attr "type" "ssecvt")
|
||
@@ -767,6 +994,13 @@
|
||
(and (eq_attr "mode" "V4SF,V2DF,TI")
|
||
(eq_attr "memory" "load")))))
|
||
"athlon-double,athlon-fpload2k8,(athlon-fstore*3)")
|
||
+(define_insn_reservation "athlon_ssecvt_cvtpd2ps_load_amdfam10" 9
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (and (eq_attr "type" "ssecvt")
|
||
+ (and (eq_attr "amdfam10_decode" "double")
|
||
+ (and (eq_attr "mode" "V4SF,V2DF,TI")
|
||
+ (eq_attr "memory" "load")))))
|
||
+ "athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)")
|
||
;; cvtpd2ps mem,reg is vectorpath, troughput unknown, latency 10
|
||
;; ??? Why it is fater than cvtsd2ss?
|
||
(define_insn_reservation "athlon_ssecvt_cvtpd2ps" 8
|
||
@@ -776,6 +1010,13 @@
|
||
(and (eq_attr "mode" "V4SF,V2DF,TI")
|
||
(eq_attr "memory" "none")))))
|
||
"athlon-vector,athlon-fpsched,athlon-fvector*2")
|
||
+(define_insn_reservation "athlon_ssecvt_cvtpd2ps_amdfam10" 7
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (and (eq_attr "type" "ssecvt")
|
||
+ (and (eq_attr "amdfam10_decode" "double")
|
||
+ (and (eq_attr "mode" "V4SF,V2DF,TI")
|
||
+ (eq_attr "memory" "none")))))
|
||
+ "athlon-double,athlon-fpsched,(athlon-faddmul+athlon-fstore)")
|
||
;; cvtsd2si mem,reg is doublepath, troughput 1, latency 9
|
||
(define_insn_reservation "athlon_secvt_cvtsX2si_load" 9
|
||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||
@@ -784,6 +1025,13 @@
|
||
(and (eq_attr "mode" "SI,DI")
|
||
(eq_attr "memory" "load")))))
|
||
"athlon-vector,athlon-fploadk8,athlon-fvector")
|
||
+(define_insn_reservation "athlon_secvt_cvtsX2si_load_amdfam10" 10
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (and (eq_attr "type" "sseicvt")
|
||
+ (and (eq_attr "amdfam10_decode" "double")
|
||
+ (and (eq_attr "mode" "SI,DI")
|
||
+ (eq_attr "memory" "load")))))
|
||
+ "athlon-double,athlon-fploadk8,(athlon-fadd+athlon-fstore)")
|
||
;; cvtsd2si reg,reg is doublepath, troughput 1, latency 9
|
||
(define_insn_reservation "athlon_ssecvt_cvtsX2si" 9
|
||
(and (eq_attr "cpu" "athlon")
|
||
@@ -799,6 +1047,29 @@
|
||
(and (eq_attr "mode" "SI,DI")
|
||
(eq_attr "memory" "none")))))
|
||
"athlon-double,athlon-fpsched,athlon-fstore")
|
||
+(define_insn_reservation "athlon_ssecvt_cvtsX2si_amdfam10" 8
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (and (eq_attr "type" "sseicvt")
|
||
+ (and (eq_attr "amdfam10_decode" "double")
|
||
+ (and (eq_attr "mode" "SI,DI")
|
||
+ (eq_attr "memory" "none")))))
|
||
+ "athlon-double,athlon-fpsched,(athlon-fadd+athlon-fstore)")
|
||
+;; cvtpd2dq reg,mem is doublepath, throughput 1, latency 9 on amdfam10
|
||
+(define_insn_reservation "athlon_sseicvt_cvtpd2dq_load_amdfam10" 9
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (and (eq_attr "type" "sseicvt")
|
||
+ (and (eq_attr "amdfam10_decode" "double")
|
||
+ (and (eq_attr "mode" "TI")
|
||
+ (eq_attr "memory" "load")))))
|
||
+ "athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)")
|
||
+;; cvtpd2dq reg,reg is doublepath, throughput 1, latency 7 on amdfam10
|
||
+(define_insn_reservation "athlon_sseicvt_cvtpd2dq_amdfam10" 7
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (and (eq_attr "type" "sseicvt")
|
||
+ (and (eq_attr "amdfam10_decode" "double")
|
||
+ (and (eq_attr "mode" "TI")
|
||
+ (eq_attr "memory" "none")))))
|
||
+ "athlon-double,athlon-fpsched,(athlon-faddmul+athlon-fstore)")
|
||
|
||
|
||
(define_insn_reservation "athlon_ssemul_load" 4
|
||
@@ -808,13 +1079,13 @@
|
||
(eq_attr "memory" "load"))))
|
||
"athlon-direct,athlon-fpload,athlon-fmul")
|
||
(define_insn_reservation "athlon_ssemul_load_k8" 6
|
||
- (and (eq_attr "cpu" "k8,generic64")
|
||
+ (and (eq_attr "cpu" "k8,generic64,amdfam10")
|
||
(and (eq_attr "type" "ssemul")
|
||
(and (eq_attr "mode" "SF,DF")
|
||
(eq_attr "memory" "load"))))
|
||
"athlon-direct,athlon-fploadk8,athlon-fmul")
|
||
(define_insn_reservation "athlon_ssemul" 4
|
||
- (and (eq_attr "cpu" "athlon,k8,generic64")
|
||
+ (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10")
|
||
(and (eq_attr "type" "ssemul")
|
||
(eq_attr "mode" "SF,DF")))
|
||
"athlon-direct,athlon-fpsched,athlon-fmul")
|
||
@@ -828,6 +1099,11 @@
|
||
(and (eq_attr "type" "ssemul")
|
||
(eq_attr "memory" "load")))
|
||
"athlon-double,athlon-fpload2k8,(athlon-fmul*2)")
|
||
+(define_insn_reservation "athlon_ssemulvector_load_amdfam10" 6
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (and (eq_attr "type" "ssemul")
|
||
+ (eq_attr "memory" "load")))
|
||
+ "athlon-direct,athlon-fploadk8,athlon-fmul")
|
||
(define_insn_reservation "athlon_ssemulvector" 5
|
||
(and (eq_attr "cpu" "athlon")
|
||
(eq_attr "type" "ssemul"))
|
||
@@ -836,6 +1112,10 @@
|
||
(and (eq_attr "cpu" "k8,generic64")
|
||
(eq_attr "type" "ssemul"))
|
||
"athlon-double,athlon-fpsched,(athlon-fmul*2)")
|
||
+(define_insn_reservation "athlon_ssemulvector_amdfam10" 4
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (eq_attr "type" "ssemul"))
|
||
+ "athlon-direct,athlon-fpsched,athlon-fmul")
|
||
;; divsd timings. divss is faster
|
||
(define_insn_reservation "athlon_ssediv_load" 20
|
||
(and (eq_attr "cpu" "athlon")
|
||
@@ -844,13 +1124,13 @@
|
||
(eq_attr "memory" "load"))))
|
||
"athlon-direct,athlon-fpload,athlon-fmul*17")
|
||
(define_insn_reservation "athlon_ssediv_load_k8" 22
|
||
- (and (eq_attr "cpu" "k8,generic64")
|
||
+ (and (eq_attr "cpu" "k8,generic64,amdfam10")
|
||
(and (eq_attr "type" "ssediv")
|
||
(and (eq_attr "mode" "SF,DF")
|
||
(eq_attr "memory" "load"))))
|
||
"athlon-direct,athlon-fploadk8,athlon-fmul*17")
|
||
(define_insn_reservation "athlon_ssediv" 20
|
||
- (and (eq_attr "cpu" "athlon,k8,generic64")
|
||
+ (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10")
|
||
(and (eq_attr "type" "ssediv")
|
||
(eq_attr "mode" "SF,DF")))
|
||
"athlon-direct,athlon-fpsched,athlon-fmul*17")
|
||
@@ -864,6 +1144,11 @@
|
||
(and (eq_attr "type" "ssediv")
|
||
(eq_attr "memory" "load")))
|
||
"athlon-double,athlon-fpload2k8,athlon-fmul*34")
|
||
+(define_insn_reservation "athlon_ssedivvector_load_amdfam10" 22
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (and (eq_attr "type" "ssediv")
|
||
+ (eq_attr "memory" "load")))
|
||
+ "athlon-direct,athlon-fploadk8,athlon-fmul*17")
|
||
(define_insn_reservation "athlon_ssedivvector" 39
|
||
(and (eq_attr "cpu" "athlon")
|
||
(eq_attr "type" "ssediv"))
|
||
@@ -872,3 +1157,12 @@
|
||
(and (eq_attr "cpu" "k8,generic64")
|
||
(eq_attr "type" "ssediv"))
|
||
"athlon-double,athlon-fmul*34")
|
||
+(define_insn_reservation "athlon_ssedivvector_amdfam10" 20
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (eq_attr "type" "ssediv"))
|
||
+ "athlon-direct,athlon-fmul*17")
|
||
+(define_insn_reservation "athlon_sseins_amdfam10" 5
|
||
+ (and (eq_attr "cpu" "amdfam10")
|
||
+ (and (eq_attr "type" "sseins")
|
||
+ (eq_attr "mode" "TI")))
|
||
+ "athlon-vector,athlon-fpsched,athlon-faddmul")
|
||
--- gcc/config/i386/pmmintrin.h.jj 2006-10-05 00:29:29.000000000 +0200
|
||
+++ gcc/config/i386/pmmintrin.h 2007-02-09 21:26:06.000000000 +0100
|
||
@@ -30,7 +30,11 @@
|
||
#ifndef _PMMINTRIN_H_INCLUDED
|
||
#define _PMMINTRIN_H_INCLUDED
|
||
|
||
-#ifdef __SSE3__
|
||
+#ifndef __SSE3__
|
||
+# error "SSE3 instruction set not enabled"
|
||
+#else
|
||
+
|
||
+/* We need definitions from the SSE2 and SSE header files.  */
|
||
#include <xmmintrin.h>
|
||
#include <emmintrin.h>
|
||
|
||
--- gcc/config/i386/tmmintrin.h.jj 2007-02-09 16:18:25.000000000 +0100
|
||
+++ gcc/config/i386/tmmintrin.h 2007-02-09 21:26:06.000000000 +0100
|
||
@@ -30,7 +30,11 @@
|
||
#ifndef _TMMINTRIN_H_INCLUDED
|
||
#define _TMMINTRIN_H_INCLUDED
|
||
|
||
-#ifdef __SSSE3__
|
||
+#ifndef __SSSE3__
|
||
+# error "SSSE3 instruction set not enabled"
|
||
+#else
|
||
+
|
||
+/* We need definitions from the SSE3, SSE2 and SSE header files.  */
|
||
#include <pmmintrin.h>
|
||
|
||
static __inline __m128i
|
||
--- gcc/config/i386/sse.md.jj 2007-02-09 16:18:25.000000000 +0100
|
||
+++ gcc/config/i386/sse.md 2007-02-09 21:26:06.000000000 +0100
|
||
@@ -963,6 +963,7 @@
|
||
"cvtsi2ss\t{%2, %0|%0, %2}"
|
||
[(set_attr "type" "sseicvt")
|
||
(set_attr "athlon_decode" "vector,double")
|
||
+ (set_attr "amdfam10_decode" "vector,double")
|
||
(set_attr "mode" "SF")])
|
||
|
||
(define_insn "sse_cvtsi2ssq"
|
||
@@ -976,6 +977,7 @@
|
||
"cvtsi2ssq\t{%2, %0|%0, %2}"
|
||
[(set_attr "type" "sseicvt")
|
||
(set_attr "athlon_decode" "vector,double")
|
||
+ (set_attr "amdfam10_decode" "vector,double")
|
||
(set_attr "mode" "SF")])
|
||
|
||
(define_insn "sse_cvtss2si"
|
||
@@ -989,6 +991,7 @@
|
||
"cvtss2si\t{%1, %0|%0, %1}"
|
||
[(set_attr "type" "sseicvt")
|
||
(set_attr "athlon_decode" "double,vector")
|
||
+ (set_attr "amdfam10_decode" "double,double")
|
||
(set_attr "mode" "SI")])
|
||
|
||
(define_insn "sse_cvtss2siq"
|
||
@@ -1002,6 +1005,7 @@
|
||
"cvtss2siq\t{%1, %0|%0, %1}"
|
||
[(set_attr "type" "sseicvt")
|
||
(set_attr "athlon_decode" "double,vector")
|
||
+ (set_attr "amdfam10_decode" "double,double")
|
||
(set_attr "mode" "DI")])
|
||
|
||
(define_insn "sse_cvttss2si"
|
||
@@ -1014,6 +1018,7 @@
|
||
"cvttss2si\t{%1, %0|%0, %1}"
|
||
[(set_attr "type" "sseicvt")
|
||
(set_attr "athlon_decode" "double,vector")
|
||
+ (set_attr "amdfam10_decode" "double,double")
|
||
(set_attr "mode" "SI")])
|
||
|
||
(define_insn "sse_cvttss2siq"
|
||
@@ -1026,6 +1031,7 @@
|
||
"cvttss2siq\t{%1, %0|%0, %1}"
|
||
[(set_attr "type" "sseicvt")
|
||
(set_attr "athlon_decode" "double,vector")
|
||
+ (set_attr "amdfam10_decode" "double,double")
|
||
(set_attr "mode" "DI")])
|
||
|
||
(define_insn "sse2_cvtdq2ps"
|
||
@@ -1921,7 +1927,8 @@
|
||
"cvtsi2sd\t{%2, %0|%0, %2}"
|
||
[(set_attr "type" "sseicvt")
|
||
(set_attr "mode" "DF")
|
||
- (set_attr "athlon_decode" "double,direct")])
|
||
+ (set_attr "athlon_decode" "double,direct")
|
||
+ (set_attr "amdfam10_decode" "vector,double")])
|
||
|
||
(define_insn "sse2_cvtsi2sdq"
|
||
[(set (match_operand:V2DF 0 "register_operand" "=x,x")
|
||
@@ -1934,7 +1941,8 @@
|
||
"cvtsi2sdq\t{%2, %0|%0, %2}"
|
||
[(set_attr "type" "sseicvt")
|
||
(set_attr "mode" "DF")
|
||
- (set_attr "athlon_decode" "double,direct")])
|
||
+ (set_attr "athlon_decode" "double,direct")
|
||
+ (set_attr "amdfam10_decode" "vector,double")])
|
||
|
||
(define_insn "sse2_cvtsd2si"
|
||
[(set (match_operand:SI 0 "register_operand" "=r,r")
|
||
@@ -1947,6 +1955,7 @@
|
||
"cvtsd2si\t{%1, %0|%0, %1}"
|
||
[(set_attr "type" "sseicvt")
|
||
(set_attr "athlon_decode" "double,vector")
|
||
+ (set_attr "amdfam10_decode" "double,double")
|
||
(set_attr "mode" "SI")])
|
||
|
||
(define_insn "sse2_cvtsd2siq"
|
||
@@ -1960,6 +1969,7 @@
|
||
"cvtsd2siq\t{%1, %0|%0, %1}"
|
||
[(set_attr "type" "sseicvt")
|
||
(set_attr "athlon_decode" "double,vector")
|
||
+ (set_attr "amdfam10_decode" "double,double")
|
||
(set_attr "mode" "DI")])
|
||
|
||
(define_insn "sse2_cvttsd2si"
|
||
@@ -1972,7 +1982,8 @@
|
||
"cvttsd2si\t{%1, %0|%0, %1}"
|
||
[(set_attr "type" "sseicvt")
|
||
(set_attr "mode" "SI")
|
||
- (set_attr "athlon_decode" "double,vector")])
|
||
+ (set_attr "athlon_decode" "double,vector")
|
||
+ (set_attr "amdfam10_decode" "double,double")])
|
||
|
||
(define_insn "sse2_cvttsd2siq"
|
||
[(set (match_operand:DI 0 "register_operand" "=r,r")
|
||
@@ -1984,7 +1995,8 @@
|
||
"cvttsd2siq\t{%1, %0|%0, %1}"
|
||
[(set_attr "type" "sseicvt")
|
||
(set_attr "mode" "DI")
|
||
- (set_attr "athlon_decode" "double,vector")])
|
||
+ (set_attr "athlon_decode" "double,vector")
|
||
+ (set_attr "amdfam10_decode" "double,double")])
|
||
|
||
(define_insn "sse2_cvtdq2pd"
|
||
[(set (match_operand:V2DF 0 "register_operand" "=x")
|
||
@@ -2015,7 +2027,8 @@
|
||
"TARGET_SSE2"
|
||
"cvtpd2dq\t{%1, %0|%0, %1}"
|
||
[(set_attr "type" "ssecvt")
|
||
- (set_attr "mode" "TI")])
|
||
+ (set_attr "mode" "TI")
|
||
+ (set_attr "amdfam10_decode" "double")])
|
||
|
||
(define_expand "sse2_cvttpd2dq"
|
||
[(set (match_operand:V4SI 0 "register_operand" "")
|
||
@@ -2033,7 +2046,8 @@
|
||
"TARGET_SSE2"
|
||
"cvttpd2dq\t{%1, %0|%0, %1}"
|
||
[(set_attr "type" "ssecvt")
|
||
- (set_attr "mode" "TI")])
|
||
+ (set_attr "mode" "TI")
|
||
+ (set_attr "amdfam10_decode" "double")])
|
||
|
||
(define_insn "sse2_cvtsd2ss"
|
||
[(set (match_operand:V4SF 0 "register_operand" "=x,x")
|
||
@@ -2047,20 +2061,22 @@
|
||
"cvtsd2ss\t{%2, %0|%0, %2}"
|
||
[(set_attr "type" "ssecvt")
|
||
(set_attr "athlon_decode" "vector,double")
|
||
+ (set_attr "amdfam10_decode" "vector,double")
|
||
(set_attr "mode" "SF")])
|
||
|
||
(define_insn "sse2_cvtss2sd"
|
||
- [(set (match_operand:V2DF 0 "register_operand" "=x")
|
||
+ [(set (match_operand:V2DF 0 "register_operand" "=x,x")
|
||
(vec_merge:V2DF
|
||
(float_extend:V2DF
|
||
(vec_select:V2SF
|
||
- (match_operand:V4SF 2 "nonimmediate_operand" "xm")
|
||
+ (match_operand:V4SF 2 "nonimmediate_operand" "x,m")
|
||
(parallel [(const_int 0) (const_int 1)])))
|
||
- (match_operand:V2DF 1 "register_operand" "0")
|
||
+ (match_operand:V2DF 1 "register_operand" "0,0")
|
||
(const_int 1)))]
|
||
"TARGET_SSE2"
|
||
"cvtss2sd\t{%2, %0|%0, %2}"
|
||
[(set_attr "type" "ssecvt")
|
||
+ (set_attr "amdfam10_decode" "vector,double")
|
||
(set_attr "mode" "DF")])
|
||
|
||
(define_expand "sse2_cvtpd2ps"
|
||
@@ -2081,7 +2097,8 @@
|
||
"TARGET_SSE2"
|
||
"cvtpd2ps\t{%1, %0|%0, %1}"
|
||
[(set_attr "type" "ssecvt")
|
||
- (set_attr "mode" "V4SF")])
|
||
+ (set_attr "mode" "V4SF")
|
||
+ (set_attr "amdfam10_decode" "double")])
|
||
|
||
(define_insn "sse2_cvtps2pd"
|
||
[(set (match_operand:V2DF 0 "register_operand" "=x")
|
||
@@ -2092,7 +2109,8 @@
|
||
"TARGET_SSE2"
|
||
"cvtps2pd\t{%1, %0|%0, %1}"
|
||
[(set_attr "type" "ssecvt")
|
||
- (set_attr "mode" "V2DF")])
|
||
+ (set_attr "mode" "V2DF")
|
||
+ (set_attr "amdfam10_decode" "direct")])
|
||
|
||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||
;;
|
||
@@ -4550,3 +4568,92 @@
|
||
"pabs<mmxvecsize>\t{%1, %0|%0, %1}";
|
||
[(set_attr "type" "sselog1")
|
||
(set_attr "mode" "DI")])
|
||
+
|
||
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||
+;;
|
||
+;; AMD SSE4A instructions
|
||
+;;
|
||
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||
+
|
||
+(define_insn "sse4a_vmmovntv2df"
|
||
+ [(set (match_operand:DF 0 "memory_operand" "=m")
|
||
+ (unspec:DF [(vec_select:DF
|
||
+ (match_operand:V2DF 1 "register_operand" "x")
|
||
+ (parallel [(const_int 0)]))]
|
||
+ UNSPEC_MOVNT))]
|
||
+ "TARGET_SSE4A"
|
||
+ "movntsd\t{%1, %0|%0, %1}"
|
||
+ [(set_attr "type" "ssemov")
|
||
+ (set_attr "mode" "DF")])
|
||
+
|
||
+(define_insn "sse4a_movntdf"
|
||
+ [(set (match_operand:DF 0 "memory_operand" "=m")
|
||
+ (unspec:DF [(match_operand:DF 1 "register_operand" "x")]
|
||
+ UNSPEC_MOVNT))]
|
||
+ "TARGET_SSE4A"
|
||
+ "movntsd\t{%1, %0|%0, %1}"
|
||
+ [(set_attr "type" "ssemov")
|
||
+ (set_attr "mode" "DF")])
|
||
+
|
||
+(define_insn "sse4a_vmmovntv4sf"
|
||
+ [(set (match_operand:SF 0 "memory_operand" "=m")
|
||
+ (unspec:SF [(vec_select:SF
|
||
+ (match_operand:V4SF 1 "register_operand" "x")
|
||
+ (parallel [(const_int 0)]))]
|
||
+ UNSPEC_MOVNT))]
|
||
+ "TARGET_SSE4A"
|
||
+ "movntss\t{%1, %0|%0, %1}"
|
||
+ [(set_attr "type" "ssemov")
|
||
+ (set_attr "mode" "SF")])
|
||
+
|
||
+(define_insn "sse4a_movntsf"
|
||
+ [(set (match_operand:SF 0 "memory_operand" "=m")
|
||
+ (unspec:SF [(match_operand:SF 1 "register_operand" "x")]
|
||
+ UNSPEC_MOVNT))]
|
||
+ "TARGET_SSE4A"
|
||
+ "movntss\t{%1, %0|%0, %1}"
|
||
+ [(set_attr "type" "ssemov")
|
||
+ (set_attr "mode" "SF")])
|
||
+
|
||
+(define_insn "sse4a_extrqi"
|
||
+ [(set (match_operand:V2DI 0 "register_operand" "=x")
|
||
+ (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0")
|
||
+ (match_operand 2 "const_int_operand" "")
|
||
+ (match_operand 3 "const_int_operand" "")]
|
||
+ UNSPEC_EXTRQI))]
|
||
+ "TARGET_SSE4A"
|
||
+ "extrq\t{%3, %2, %0|%0, %2, %3}"
|
||
+ [(set_attr "type" "sse")
|
||
+ (set_attr "mode" "TI")])
|
||
+
|
||
+(define_insn "sse4a_extrq"
|
||
+ [(set (match_operand:V2DI 0 "register_operand" "=x")
|
||
+ (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0")
|
||
+ (match_operand:V16QI 2 "register_operand" "x")]
|
||
+ UNSPEC_EXTRQ))]
|
||
+ "TARGET_SSE4A"
|
||
+ "extrq\t{%2, %0|%0, %2}"
|
||
+ [(set_attr "type" "sse")
|
||
+ (set_attr "mode" "TI")])
|
||
+
|
||
+(define_insn "sse4a_insertqi"
|
||
+ [(set (match_operand:V2DI 0 "register_operand" "=x")
|
||
+ (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0")
|
||
+ (match_operand:V2DI 2 "register_operand" "x")
|
||
+ (match_operand 3 "const_int_operand" "")
|
||
+ (match_operand 4 "const_int_operand" "")]
|
||
+ UNSPEC_INSERTQI))]
|
||
+ "TARGET_SSE4A"
|
||
+ "insertq\t{%4, %3, %2, %0|%0, %2, %3, %4}"
|
||
+ [(set_attr "type" "sseins")
|
||
+ (set_attr "mode" "TI")])
|
||
+
|
||
+(define_insn "sse4a_insertq"
|
||
+ [(set (match_operand:V2DI 0 "register_operand" "=x")
|
||
+ (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0")
|
||
+ (match_operand:V2DI 2 "register_operand" "x")]
|
||
+ UNSPEC_INSERTQ))]
|
||
+ "TARGET_SSE4A"
|
||
+ "insertq\t{%2, %0|%0, %2}"
|
||
+ [(set_attr "type" "sseins")
|
||
+ (set_attr "mode" "TI")])
|
||
--- gcc/config/i386/i386.opt.jj 2007-02-09 16:18:25.000000000 +0100
|
||
+++ gcc/config/i386/i386.opt 2007-02-09 21:26:06.000000000 +0100
|
||
@@ -205,6 +205,22 @@ mmni
|
||
Target Undocumented Mask(SSSE3) MaskExists
|
||
Support MMX, SSE, SSE2, SSE3 and SSSE3 built-in functions and code generation
|
||
|
||
+msse4a
|
||
+Target Report Mask(SSE4A)
|
||
+Support MMX, SSE, SSE2, SSE3 and SSE4A built-in functions and code generation
|
||
+
|
||
+mpopcnt
|
||
+Target Report Mask(POPCNT)
|
||
+Support code generation of popcount instruction for popcount built-ins
|
||
+namely __builtin_popcount, __builtin_popcountl and __builtin_popcountll
|
||
+
|
||
+mabm
|
||
+Target Report Mask(ABM)
|
||
+Support code generation of Advanced Bit Manipulation (ABM) instructions,
|
||
+which include popcnt and lzcnt instructions, for popcount and clz built-ins
|
||
+namely __builtin_popcount, __builtin_popcountl, __builtin_popcountll and
|
||
+__builtin_clz, __builtin_clzl, __builtin_clzll
|
||
+
|
||
msseregparm
|
||
Target RejectNegative Mask(SSEREGPARM)
|
||
Use SSE register passing conventions for SF and DF mode
|
||
--- gcc/config/i386/ammintrin.h.jj 2007-02-09 21:26:06.000000000 +0100
|
||
+++ gcc/config/i386/ammintrin.h 2007-02-09 21:26:06.000000000 +0100
|
||
@@ -0,0 +1,73 @@
|
||
+/* Copyright (C) 2007 Free Software Foundation, Inc.
|
||
+
|
||
+ This file is part of GCC.
|
||
+
|
||
+ GCC is free software; you can redistribute it and/or modify
|
||
+ it under the terms of the GNU General Public License as published by
|
||
+ the Free Software Foundation; either version 2, or (at your option)
|
||
+ any later version.
|
||
+
|
||
+ GCC is distributed in the hope that it will be useful,
|
||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
+ GNU General Public License for more details.
|
||
+
|
||
+ You should have received a copy of the GNU General Public License
|
||
+ along with GCC; see the file COPYING. If not, write to
|
||
+ the Free Software Foundation, 51 Franklin Street, Fifth Floor,
|
||
+ Boston, MA 02110-1301, USA. */
|
||
+
|
||
+/* As a special exception, if you include this header file into source
|
||
+ files compiled by GCC, this header file does not by itself cause
|
||
+ the resulting executable to be covered by the GNU General Public
|
||
+ License. This exception does not however invalidate any other
|
||
+ reasons why the executable file might be covered by the GNU General
|
||
+ Public License. */
|
||
+
|
||
+/* Implemented from the specification included in the AMD Programmer's
|
||
+ Manual Update, version 2.x */
|
||
+
|
||
+#ifndef _AMMINTRIN_H_INCLUDED
|
||
+#define _AMMINTRIN_H_INCLUDED
|
||
+
|
||
+#ifndef __SSE4A__
|
||
+# error "SSE4A instruction set not enabled"
|
||
+#else
|
||
+
|
||
+/* We need definitions from the SSE3, SSE2 and SSE header files.  */
|
||
+#include <pmmintrin.h>
|
||
+
|
||
+static __inline void __attribute__((__always_inline__))
|
||
+_mm_stream_sd (double * __P, __m128d __Y)
|
||
+{
|
||
+ __builtin_ia32_movntsd (__P, (__v2df) __Y);
|
||
+}
|
||
+
|
||
+static __inline void __attribute__((__always_inline__))
|
||
+_mm_stream_ss (float * __P, __m128 __Y)
|
||
+{
|
||
+ __builtin_ia32_movntss (__P, (__v4sf) __Y);
|
||
+}
|
||
+
|
||
+static __inline __m128i __attribute__((__always_inline__))
|
||
+_mm_extract_si64 (__m128i __X, __m128i __Y)
|
||
+{
|
||
+ return (__m128i) __builtin_ia32_extrq ((__v2di) __X, (__v16qi) __Y);
|
||
+}
|
||
+
|
||
+#define _mm_extracti_si64(X, I, L) \
|
||
+((__m128i) __builtin_ia32_extrqi ((__v2di)(X), I, L))
|
||
+
|
||
+static __inline __m128i __attribute__((__always_inline__))
|
||
+_mm_insert_si64 (__m128i __X,__m128i __Y)
|
||
+{
|
||
+ return (__m128i) __builtin_ia32_insertq ((__v2di)__X, (__v2di)__Y);
|
||
+}
|
||
+
|
||
+#define _mm_inserti_si64(X, Y, I, L) \
|
||
+((__m128i) __builtin_ia32_insertqi ((__v2di)(X), (__v2di)(Y), I, L))
|
||
+
|
||
+
|
||
+#endif /* __SSE4A__ */
|
||
+
|
||
+#endif /* _AMMINTRIN_H_INCLUDED */
|
||
--- gcc/config/i386/emmintrin.h.jj 2006-10-05 00:29:29.000000000 +0200
|
||
+++ gcc/config/i386/emmintrin.h 2007-02-09 21:26:06.000000000 +0100
|
||
@@ -30,7 +30,11 @@
|
||
#ifndef _EMMINTRIN_H_INCLUDED
|
||
#define _EMMINTRIN_H_INCLUDED
|
||
|
||
-#ifdef __SSE2__
|
||
+#ifndef __SSE2__
|
||
+# error "SSE2 instruction set not enabled"
|
||
+#else
|
||
+
|
||
+/* We need definitions from the SSE header file.  */
|
||
#include <xmmintrin.h>
|
||
|
||
/* SSE2 */
|
||
--- gcc/config/i386/i386.c.jj 2007-02-09 16:24:00.000000000 +0100
|
||
+++ gcc/config/i386/i386.c 2007-02-10 19:47:05.000000000 +0100
|
||
@@ -534,6 +534,71 @@ struct processor_costs k8_cost = {
|
||
COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
|
||
};
|
||
|
||
+struct processor_costs amdfam10_cost = {
|
||
+ COSTS_N_INSNS (1), /* cost of an add instruction */
|
||
+ COSTS_N_INSNS (2), /* cost of a lea instruction */
|
||
+ COSTS_N_INSNS (1), /* variable shift costs */
|
||
+ COSTS_N_INSNS (1), /* constant shift costs */
|
||
+ {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
|
||
+ COSTS_N_INSNS (4), /* HI */
|
||
+ COSTS_N_INSNS (3), /* SI */
|
||
+ COSTS_N_INSNS (4), /* DI */
|
||
+ COSTS_N_INSNS (5)}, /* other */
|
||
+ 0, /* cost of multiply per each bit set */
|
||
+ {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
|
||
+ COSTS_N_INSNS (35), /* HI */
|
||
+ COSTS_N_INSNS (51), /* SI */
|
||
+ COSTS_N_INSNS (83), /* DI */
|
||
+ COSTS_N_INSNS (83)}, /* other */
|
||
+ COSTS_N_INSNS (1), /* cost of movsx */
|
||
+ COSTS_N_INSNS (1), /* cost of movzx */
|
||
+ 8, /* "large" insn */
|
||
+ 9, /* MOVE_RATIO */
|
||
+ 4, /* cost for loading QImode using movzbl */
|
||
+ {3, 4, 3}, /* cost of loading integer registers
|
||
+ in QImode, HImode and SImode.
|
||
+ Relative to reg-reg move (2). */
|
||
+ {3, 4, 3}, /* cost of storing integer registers */
|
||
+ 4, /* cost of reg,reg fld/fst */
|
||
+ {4, 4, 12}, /* cost of loading fp registers
|
||
+ in SFmode, DFmode and XFmode */
|
||
+ {6, 6, 8}, /* cost of storing fp registers
|
||
+ in SFmode, DFmode and XFmode */
|
||
+ 2, /* cost of moving MMX register */
|
||
+ {3, 3}, /* cost of loading MMX registers
|
||
+ in SImode and DImode */
|
||
+ {4, 4}, /* cost of storing MMX registers
|
||
+ in SImode and DImode */
|
||
+ 2, /* cost of moving SSE register */
|
||
+ {4, 4, 3}, /* cost of loading SSE registers
|
||
+ in SImode, DImode and TImode */
|
||
+ {4, 4, 5}, /* cost of storing SSE registers
|
||
+ in SImode, DImode and TImode */
|
||
+ 3, /* MMX or SSE register to integer */
|
||
+ /* On K8
|
||
+ MOVD reg64, xmmreg Double FSTORE 4
|
||
+ MOVD reg32, xmmreg Double FSTORE 4
|
||
+ On AMDFAM10
|
||
+ MOVD reg64, xmmreg Double FADD 3
|
||
+ 1/1 1/1
|
||
+ MOVD reg32, xmmreg Double FADD 3
|
||
+ 1/1 1/1 */
|
||
+ 64, /* size of prefetch block */
|
||
+ /* New AMD processors never drop prefetches; if they cannot be performed
|
||
+ immediately, they are queued. We set number of simultaneous prefetches
|
||
+ to a large constant to reflect this (it probably is not a good idea not
|
||
+ to limit number of prefetches at all, as their execution also takes some
|
||
+ time). */
|
||
+ 100, /* number of parallel prefetches */
|
||
+ 5, /* Branch cost */
|
||
+ COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
|
||
+ COSTS_N_INSNS (4), /* cost of FMUL instruction. */
|
||
+ COSTS_N_INSNS (19), /* cost of FDIV instruction. */
|
||
+ COSTS_N_INSNS (2), /* cost of FABS instruction. */
|
||
+ COSTS_N_INSNS (2), /* cost of FCHS instruction. */
|
||
+ COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
|
||
+};
|
||
+
|
||
static const
|
||
struct processor_costs pentium4_cost = {
|
||
COSTS_N_INSNS (1), /* cost of an add instruction */
|
||
@@ -816,11 +881,13 @@ const struct processor_costs *ix86_cost
|
||
#define m_PENT4 (1<<PROCESSOR_PENTIUM4)
|
||
#define m_K8 (1<<PROCESSOR_K8)
|
||
#define m_ATHLON_K8 (m_K8 | m_ATHLON)
|
||
+#define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
|
||
#define m_NOCONA (1<<PROCESSOR_NOCONA)
|
||
#define m_CORE2 (1<<PROCESSOR_CORE2)
|
||
#define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
|
||
#define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
|
||
#define m_GENERIC (m_GENERIC32 | m_GENERIC64)
|
||
+#define m_ATHLON_K8_AMDFAM10 (m_K8 | m_ATHLON | m_AMDFAM10)
|
||
|
||
/* Generic instruction choice should be common subset of supported CPUs
|
||
(PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
|
||
@@ -828,23 +895,31 @@ const struct processor_costs *ix86_cost
|
||
/* Leave is not affecting Nocona SPEC2000 results negatively, so enabling for
|
||
Generic64 seems like good code size tradeoff. We can't enable it for 32bit
|
||
generic because it is not working well with PPro base chips. */
|
||
-const int x86_use_leave = m_386 | m_K6_GEODE | m_ATHLON_K8 | m_CORE2 | m_GENERIC64;
|
||
-const int x86_push_memory = m_386 | m_K6_GEODE | m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC;
|
||
+const int x86_use_leave = m_386 | m_K6_GEODE | m_ATHLON_K8_AMDFAM10 | m_CORE2
|
||
+ | m_GENERIC64;
|
||
+const int x86_push_memory = m_386 | m_K6_GEODE | m_ATHLON_K8_AMDFAM10 | m_PENT4
|
||
+ | m_NOCONA | m_CORE2 | m_GENERIC;
|
||
const int x86_zero_extend_with_and = m_486 | m_PENT;
|
||
-const int x86_movx = m_ATHLON_K8 | m_PPRO | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC | m_GEODE /* m_386 | m_K6 */;
|
||
+/* Enable to zero extend integer registers to avoid partial dependencies.  */
|
||
+const int x86_movx = m_ATHLON_K8_AMDFAM10 | m_PPRO | m_PENT4 | m_NOCONA
|
||
+ | m_CORE2 | m_GENERIC | m_GEODE /* m_386 | m_K6 */;
|
||
const int x86_double_with_add = ~m_386;
|
||
const int x86_use_bit_test = m_386;
|
||
-const int x86_unroll_strlen = m_486 | m_PENT | m_PPRO | m_ATHLON_K8 | m_K6 | m_CORE2 | m_GENERIC;
|
||
-const int x86_cmove = m_PPRO | m_GEODE | m_ATHLON_K8 | m_PENT4 | m_NOCONA;
|
||
+const int x86_unroll_strlen = m_486 | m_PENT | m_PPRO | m_ATHLON_K8_AMDFAM10
|
||
+ | m_K6 | m_CORE2 | m_GENERIC;
|
||
+const int x86_cmove = m_PPRO | m_GEODE | m_ATHLON_K8_AMDFAM10 | m_PENT4
|
||
+ | m_NOCONA;
|
||
const int x86_fisttp = m_NOCONA;
|
||
-const int x86_3dnow_a = m_ATHLON_K8;
|
||
-const int x86_deep_branch = m_PPRO | m_K6_GEODE | m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC;
|
||
+const int x86_3dnow_a = m_ATHLON_K8_AMDFAM10;
|
||
+const int x86_deep_branch = m_PPRO | m_K6_GEODE | m_ATHLON_K8_AMDFAM10
|
||
+ | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC;
|
||
/* Branch hints were put in P4 based on simulation result. But
|
||
after P4 was made, no performance benefit was observed with
|
||
branch hints. It also increases the code size. As the result,
|
||
icc never generates branch hints. */
|
||
const int x86_branch_hints = 0;
|
||
-const int x86_use_sahf = m_PPRO | m_K6_GEODE | m_PENT4 | m_NOCONA | m_GENERIC32; /*m_GENERIC | m_ATHLON_K8 ? */
|
||
+const int x86_use_sahf = m_PPRO | m_K6_GEODE | m_PENT4 | m_NOCONA | m_GENERIC32;
|
||
+ /*m_GENERIC | m_ATHLON_K8 ? */
|
||
/* We probably ought to watch for partial register stalls on Generic32
|
||
compilation setting as well. However in current implementation the
|
||
partial register stalls are not eliminated very well - they can
|
||
@@ -856,13 +931,16 @@ const int x86_use_sahf = m_PPRO | m_K6_G
|
||
const int x86_partial_reg_stall = m_PPRO;
|
||
const int x86_partial_flag_reg_stall = m_CORE2 | m_GENERIC;
|
||
const int x86_use_himode_fiop = m_386 | m_486 | m_K6_GEODE;
|
||
-const int x86_use_simode_fiop = ~(m_PPRO | m_ATHLON_K8 | m_PENT | m_CORE2 | m_GENERIC);
|
||
+const int x86_use_simode_fiop = ~(m_PPRO | m_ATHLON_K8_AMDFAM10 | m_PENT
|
||
+ | m_CORE2 | m_GENERIC);
|
||
const int x86_use_mov0 = m_K6;
|
||
const int x86_use_cltd = ~(m_PENT | m_K6 | m_CORE2 | m_GENERIC);
|
||
const int x86_read_modify_write = ~m_PENT;
|
||
const int x86_read_modify = ~(m_PENT | m_PPRO);
|
||
const int x86_split_long_moves = m_PPRO;
|
||
-const int x86_promote_QImode = m_K6_GEODE | m_PENT | m_386 | m_486 | m_ATHLON_K8 | m_CORE2 | m_GENERIC; /* m_PENT4 ? */
|
||
+const int x86_promote_QImode = m_K6_GEODE | m_PENT | m_386 | m_486
|
||
+ | m_ATHLON_K8_AMDFAM10 | m_CORE2 | m_GENERIC;
|
||
+ /* m_PENT4 ? */
|
||
const int x86_fast_prefix = ~(m_PENT | m_486 | m_386);
|
||
const int x86_single_stringop = m_386 | m_PENT4 | m_NOCONA;
|
||
const int x86_qimode_math = ~(0);
|
||
@@ -872,18 +950,37 @@ const int x86_promote_qi_regs = 0;
|
||
if our scheme for avoiding partial stalls was more effective. */
|
||
const int x86_himode_math = ~(m_PPRO);
|
||
const int x86_promote_hi_regs = m_PPRO;
|
||
-const int x86_sub_esp_4 = m_ATHLON_K8 | m_PPRO | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC;
|
||
-const int x86_sub_esp_8 = m_ATHLON_K8 | m_PPRO | m_386 | m_486 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC;
|
||
-const int x86_add_esp_4 = m_ATHLON_K8 | m_K6_GEODE | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC;
|
||
-const int x86_add_esp_8 = m_ATHLON_K8 | m_PPRO | m_K6_GEODE | m_386 | m_486 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC;
|
||
-const int x86_integer_DFmode_moves = ~(m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC | m_GEODE);
|
||
-const int x86_partial_reg_dependency = m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC;
|
||
-const int x86_memory_mismatch_stall = m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC;
|
||
-const int x86_accumulate_outgoing_args = m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC;
|
||
+/* Enable if add/sub rsp is preferred over 1 or 2 push/pop.  */
|
||
+const int x86_sub_esp_4 = m_ATHLON_K8_AMDFAM10 | m_PPRO | m_PENT4 | m_NOCONA
|
||
+ | m_CORE2 | m_GENERIC;
|
||
+const int x86_sub_esp_8 = m_ATHLON_K8_AMDFAM10 | m_PPRO | m_386 | m_486
|
||
+ | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC;
|
||
+const int x86_add_esp_4 = m_ATHLON_K8_AMDFAM10 | m_K6_GEODE | m_PENT4 | m_NOCONA
|
||
+ | m_CORE2 | m_GENERIC;
|
||
+const int x86_add_esp_8 = m_ATHLON_K8_AMDFAM10 | m_PPRO | m_K6_GEODE | m_386
|
||
+ | m_486 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC;
|
||
+/* Enable if integer moves are preferred for DFmode copies.  */
|
||
+const int x86_integer_DFmode_moves = ~(m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA
|
||
+ | m_PPRO | m_CORE2 | m_GENERIC | m_GEODE);
|
||
+const int x86_partial_reg_dependency = m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA
|
||
+ | m_CORE2 | m_GENERIC;
|
||
+const int x86_memory_mismatch_stall = m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA
|
||
+ | m_CORE2 | m_GENERIC;
|
||
+/* If ACCUMULATE_OUTGOING_ARGS is enabled, the maximum amount of space required
|
||
+ for outgoing arguments will be computed and placed into the variable
|
||
+ `current_function_outgoing_args_size'. No space will be pushed onto the stack
|
||
+ for each call; instead, the function prologue should increase the stack frame
|
||
+ size by this amount. Setting both PUSH_ARGS and ACCUMULATE_OUTGOING_ARGS is
|
||
+ not proper. */
|
||
+const int x86_accumulate_outgoing_args = m_ATHLON_K8_AMDFAM10 | m_PENT4
|
||
+ | m_NOCONA | m_PPRO | m_CORE2
|
||
+ | m_GENERIC;
|
||
const int x86_prologue_using_move = m_ATHLON_K8 | m_PPRO | m_CORE2 | m_GENERIC;
|
||
const int x86_epilogue_using_move = m_ATHLON_K8 | m_PPRO | m_CORE2 | m_GENERIC;
|
||
const int x86_shift1 = ~m_486;
|
||
-const int x86_arch_always_fancy_math_387 = m_PENT | m_PPRO | m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC;
|
||
+const int x86_arch_always_fancy_math_387 = m_PENT | m_PPRO
|
||
+ | m_ATHLON_K8_AMDFAM10 | m_PENT4
|
||
+ | m_NOCONA | m_CORE2 | m_GENERIC;
|
||
/* In Generic model we have an confict here in between PPro/Pentium4 based chips
|
||
that thread 128bit SSE registers as single units versus K8 based chips that
|
||
divide SSE registers to two 64bit halves.
|
||
@@ -893,15 +990,66 @@ const int x86_arch_always_fancy_math_387
|
||
this option on P4 brings over 20% SPECfp regression, while enabling it on
|
||
K8 brings roughly 2.4% regression that can be partly masked by careful scheduling
|
||
of moves. */
|
||
-const int x86_sse_partial_reg_dependency = m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC;
|
||
+const int x86_sse_partial_reg_dependency = m_PENT4 | m_NOCONA | m_PPRO | m_CORE2
|
||
+ | m_GENERIC | m_AMDFAM10;
|
||
/* Set for machines where the type and dependencies are resolved on SSE
|
||
register parts instead of whole registers, so we may maintain just
|
||
lower part of scalar values in proper format leaving the upper part
|
||
undefined. */
|
||
const int x86_sse_split_regs = m_ATHLON_K8;
|
||
-const int x86_sse_typeless_stores = m_ATHLON_K8;
|
||
+/* Code generation for scalar reg-reg moves of single and double precision data:
|
||
+ if (x86_sse_partial_reg_dependency == true || x86_sse_split_regs == true)
|
||
+ movaps reg, reg
|
||
+ else
|
||
+ movss reg, reg
|
||
+ if (x86_sse_partial_reg_dependency == true)
|
||
+ movapd reg, reg
|
||
+ else
|
||
+ movsd reg, reg
|
||
+
|
||
+ Code generation for scalar loads of double precision data:
|
||
+ if (x86_sse_split_regs == true)
|
||
+ movlpd mem, reg (gas syntax)
|
||
+ else
|
||
+ movsd mem, reg
|
||
+
|
||
+ Code generation for unaligned packed loads of single precision data
|
||
+ (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
|
||
+ if (x86_sse_unaligned_move_optimal)
|
||
+ movups mem, reg
|
||
+
|
||
+ if (x86_sse_partial_reg_dependency == true)
|
||
+ {
|
||
+ xorps reg, reg
|
||
+ movlps mem, reg
|
||
+ movhps mem+8, reg
|
||
+ }
|
||
+ else
|
||
+ {
|
||
+ movlps mem, reg
|
||
+ movhps mem+8, reg
|
||
+ }
|
||
+
|
||
+ Code generation for unaligned packed loads of double precision data
|
||
+ (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
|
||
+ if (x86_sse_unaligned_move_optimal)
|
||
+ movupd mem, reg
|
||
+
|
||
+ if (x86_sse_split_regs == true)
|
||
+ {
|
||
+ movlpd mem, reg
|
||
+ movhpd mem+8, reg
|
||
+ }
|
||
+ else
|
||
+ {
|
||
+ movsd mem, reg
|
||
+ movhpd mem+8, reg
|
||
+ }
|
||
+ */
|
||
+const int x86_sse_unaligned_move_optimal = m_AMDFAM10;
|
||
+const int x86_sse_typeless_stores = m_ATHLON_K8_AMDFAM10;
|
||
const int x86_sse_load0_by_pxor = m_PPRO | m_PENT4 | m_NOCONA;
|
||
-const int x86_use_ffreep = m_ATHLON_K8;
|
||
+const int x86_use_ffreep = m_ATHLON_K8_AMDFAM10;
|
||
const int x86_rep_movl_optimal = m_386 | m_PENT | m_PPRO | m_K6_GEODE | m_CORE2;
|
||
const int x86_use_incdec = ~(m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC);
|
||
|
||
@@ -909,19 +1057,22 @@ const int x86_use_incdec = ~(m_PENT4 | m
|
||
integer data in xmm registers. Which results in pretty abysmal code. */
|
||
const int x86_inter_unit_moves = 0 /* ~(m_ATHLON_K8) */;
|
||
|
||
-const int x86_ext_80387_constants = m_K6_GEODE | m_ATHLON | m_PENT4 | m_NOCONA | m_PPRO | m_GENERIC32;
|
||
+const int x86_ext_80387_constants = m_K6_GEODE | m_ATHLON | m_PENT4
|
||
+ | m_NOCONA | m_PPRO | m_GENERIC32;
|
||
/* Some CPU cores are not able to predict more than 4 branch instructions in
|
||
the 16 byte window. */
|
||
-const int x86_four_jump_limit = m_PPRO | m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC;
|
||
-const int x86_schedule = m_PPRO | m_ATHLON_K8 | m_K6_GEODE | m_PENT | m_CORE2 | m_GENERIC;
|
||
-const int x86_use_bt = m_ATHLON_K8;
|
||
+const int x86_four_jump_limit = m_PPRO | m_ATHLON_K8_AMDFAM10 | m_PENT4
|
||
+ | m_NOCONA | m_CORE2 | m_GENERIC;
|
||
+const int x86_schedule = m_PPRO | m_ATHLON_K8_AMDFAM10 | m_K6_GEODE | m_PENT
|
||
+ | m_CORE2 | m_GENERIC;
|
||
+const int x86_use_bt = m_ATHLON_K8_AMDFAM10;
|
||
/* Compare and exchange was added for 80486. */
|
||
const int x86_cmpxchg = ~m_386;
|
||
/* Compare and exchange 8 bytes was added for pentium. */
|
||
const int x86_cmpxchg8b = ~(m_386 | m_486);
|
||
/* Exchange and add was added for 80486. */
|
||
const int x86_xadd = ~m_386;
|
||
-const int x86_pad_returns = m_ATHLON_K8 | m_CORE2 | m_GENERIC;
|
||
+const int x86_pad_returns = m_ATHLON_K8_AMDFAM10 | m_CORE2 | m_GENERIC;
|
||
|
||
/* In case the average insn count for single function invocation is
|
||
lower than this constant, emit fast (but longer) prologue and
|
||
@@ -1485,16 +1636,24 @@ ix86_handle_option (size_t code, const c
|
||
case OPT_msse:
|
||
if (!value)
|
||
{
|
||
- target_flags &= ~(MASK_SSE2 | MASK_SSE3);
|
||
- target_flags_explicit |= MASK_SSE2 | MASK_SSE3;
|
||
+ target_flags &= ~(MASK_SSE2 | MASK_SSE3 | MASK_SSE4A);
|
||
+ target_flags_explicit |= MASK_SSE2 | MASK_SSE3 | MASK_SSE4A;
|
||
}
|
||
return true;
|
||
|
||
case OPT_msse2:
|
||
if (!value)
|
||
{
|
||
- target_flags &= ~MASK_SSE3;
|
||
- target_flags_explicit |= MASK_SSE3;
|
||
+ target_flags &= ~(MASK_SSE3 | MASK_SSE4A);
|
||
+ target_flags_explicit |= MASK_SSE3 | MASK_SSE4A;
|
||
+ }
|
||
+ return true;
|
||
+
|
||
+ case OPT_msse3:
|
||
+ if (!value)
|
||
+ {
|
||
+ target_flags &= ~MASK_SSE4A;
|
||
+ target_flags_explicit |= MASK_SSE4A;
|
||
}
|
||
return true;
|
||
|
||
@@ -1546,7 +1705,8 @@ override_options (void)
|
||
{&nocona_cost, 0, 0, 0, 0, 0, 0, 0},
|
||
{&core2_cost, 0, 0, 16, 7, 16, 7, 16},
|
||
{&generic32_cost, 0, 0, 16, 7, 16, 7, 16},
|
||
- {&generic64_cost, 0, 0, 16, 7, 16, 7, 16}
|
||
+ {&generic64_cost, 0, 0, 16, 7, 16, 7, 16},
|
||
+ {&amdfam10_cost, 0, 0, 32, 7, 32, 7, 32}
|
||
};
|
||
|
||
static const char * const cpu_names[] = TARGET_CPU_DEFAULT_NAMES;
|
||
@@ -1565,7 +1725,10 @@ override_options (void)
|
||
PTA_3DNOW_A = 64,
|
||
PTA_64BIT = 128,
|
||
PTA_SSSE3 = 256,
|
||
- PTA_CX16 = 512
|
||
+ PTA_CX16 = 512,
|
||
+ PTA_POPCNT = 1024,
|
||
+ PTA_ABM = 2048,
|
||
+ PTA_SSE4A = 4096
|
||
} flags;
|
||
}
|
||
const processor_alias_table[] =
|
||
@@ -1621,6 +1784,10 @@ override_options (void)
|
||
| PTA_3DNOW_A | PTA_SSE | PTA_SSE2},
|
||
{"athlon-fx", PROCESSOR_K8, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW | PTA_64BIT
|
||
| PTA_3DNOW_A | PTA_SSE | PTA_SSE2},
|
||
+ {"amdfam10", PROCESSOR_AMDFAM10, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW
|
||
+ | PTA_64BIT | PTA_3DNOW_A | PTA_SSE
|
||
+ | PTA_SSE2 | PTA_SSE3 | PTA_POPCNT
|
||
+ | PTA_ABM | PTA_SSE4A | PTA_CX16},
|
||
{"generic32", PROCESSOR_GENERIC32, 0 /* flags are only used for -march switch. */ },
|
||
{"generic64", PROCESSOR_GENERIC64, PTA_64BIT /* flags are only used for -march switch. */ },
|
||
};
|
||
@@ -1772,6 +1939,15 @@ override_options (void)
|
||
x86_prefetch_sse = true;
|
||
if (processor_alias_table[i].flags & PTA_CX16)
|
||
x86_cmpxchg16b = true;
|
||
+ if (processor_alias_table[i].flags & PTA_POPCNT
|
||
+ && !(target_flags_explicit & MASK_POPCNT))
|
||
+ target_flags |= MASK_POPCNT;
|
||
+ if (processor_alias_table[i].flags & PTA_ABM
|
||
+ && !(target_flags_explicit & MASK_ABM))
|
||
+ target_flags |= MASK_ABM;
|
||
+ if (processor_alias_table[i].flags & PTA_SSE4A
|
||
+ && !(target_flags_explicit & MASK_SSE4A))
|
||
+ target_flags |= MASK_SSE4A;
|
||
if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
|
||
error ("CPU you selected does not support x86-64 "
|
||
"instruction set");
|
||
@@ -1963,6 +2139,10 @@ override_options (void)
|
||
if (TARGET_SSSE3)
|
||
target_flags |= MASK_SSE3;
|
||
|
||
+ /* Turn on SSE3 builtins for -msse4a. */
|
||
+ if (TARGET_SSE4A)
|
||
+ target_flags |= MASK_SSE3;
|
||
+
|
||
/* Turn on SSE2 builtins for -msse3. */
|
||
if (TARGET_SSE3)
|
||
target_flags |= MASK_SSE2;
|
||
@@ -1982,6 +2162,10 @@ override_options (void)
|
||
if (TARGET_3DNOW)
|
||
target_flags |= MASK_MMX;
|
||
|
||
+ /* Turn on POPCNT builtins for -mabm. */
|
||
+ if (TARGET_ABM)
|
||
+ target_flags |= MASK_POPCNT;
|
||
+
|
||
if (TARGET_64BIT)
|
||
{
|
||
if (TARGET_ALIGN_DOUBLE)
|
||
@@ -8900,8 +9084,16 @@ ix86_expand_vector_move_misalign (enum m
|
||
}
|
||
|
||
if (TARGET_SSE2 && mode == V2DFmode)
|
||
- {
|
||
- rtx zero;
|
||
+ {
|
||
+ rtx zero;
|
||
+
|
||
+ if (TARGET_SSE_UNALIGNED_MOVE_OPTIMAL)
|
||
+ {
|
||
+ op0 = gen_lowpart (V2DFmode, op0);
|
||
+ op1 = gen_lowpart (V2DFmode, op1);
|
||
+ emit_insn (gen_sse2_movupd (op0, op1));
|
||
+ return;
|
||
+ }
|
||
|
||
/* When SSE registers are split into halves, we can avoid
|
||
writing to the top half twice. */
|
||
@@ -8929,7 +9121,15 @@ ix86_expand_vector_move_misalign (enum m
|
||
emit_insn (gen_sse2_loadhpd (op0, op0, m));
|
||
}
|
||
else
|
||
- {
|
||
+ {
|
||
+ if (TARGET_SSE_UNALIGNED_MOVE_OPTIMAL)
|
||
+ {
|
||
+ op0 = gen_lowpart (V4SFmode, op0);
|
||
+ op1 = gen_lowpart (V4SFmode, op1);
|
||
+ emit_insn (gen_sse_movups (op0, op1));
|
||
+ return;
|
||
+ }
|
||
+
|
||
if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
|
||
emit_move_insn (op0, CONST0_RTX (mode));
|
||
else
|
||
@@ -13461,6 +13661,7 @@ ix86_issue_rate (void)
|
||
case PROCESSOR_PENTIUM4:
|
||
case PROCESSOR_ATHLON:
|
||
case PROCESSOR_K8:
|
||
+ case PROCESSOR_AMDFAM10:
|
||
case PROCESSOR_NOCONA:
|
||
case PROCESSOR_GENERIC32:
|
||
case PROCESSOR_GENERIC64:
|
||
@@ -13659,6 +13860,7 @@ ix86_adjust_cost (rtx insn, rtx link, rt
|
||
|
||
case PROCESSOR_ATHLON:
|
||
case PROCESSOR_K8:
|
||
+ case PROCESSOR_AMDFAM10:
|
||
case PROCESSOR_GENERIC32:
|
||
case PROCESSOR_GENERIC64:
|
||
memory = get_attr_memory (insn);
|
||
@@ -14370,6 +14572,14 @@ enum ix86_builtins
|
||
IX86_BUILTIN_PABSW128,
|
||
IX86_BUILTIN_PABSD128,
|
||
|
||
+ /* AMDFAM10 - SSE4A New Instructions. */
|
||
+ IX86_BUILTIN_MOVNTSD,
|
||
+ IX86_BUILTIN_MOVNTSS,
|
||
+ IX86_BUILTIN_EXTRQI,
|
||
+ IX86_BUILTIN_EXTRQ,
|
||
+ IX86_BUILTIN_INSERTQI,
|
||
+ IX86_BUILTIN_INSERTQ,
|
||
+
|
||
IX86_BUILTIN_VEC_INIT_V2SI,
|
||
IX86_BUILTIN_VEC_INIT_V4HI,
|
||
IX86_BUILTIN_VEC_INIT_V8QI,
|
||
@@ -15102,6 +15312,18 @@ ix86_init_mmx_sse_builtins (void)
|
||
= build_function_type_list (void_type_node,
|
||
pchar_type_node, V16QI_type_node, NULL_TREE);
|
||
|
||
+ tree v2di_ftype_v2di_unsigned_unsigned
|
||
+ = build_function_type_list (V2DI_type_node, V2DI_type_node,
|
||
+ unsigned_type_node, unsigned_type_node,
|
||
+ NULL_TREE);
|
||
+ tree v2di_ftype_v2di_v2di_unsigned_unsigned
|
||
+ = build_function_type_list (V2DI_type_node, V2DI_type_node, V2DI_type_node,
|
||
+ unsigned_type_node, unsigned_type_node,
|
||
+ NULL_TREE);
|
||
+ tree v2di_ftype_v2di_v16qi
|
||
+ = build_function_type_list (V2DI_type_node, V2DI_type_node, V16QI_type_node,
|
||
+ NULL_TREE);
|
||
+
|
||
tree float80_type;
|
||
tree float128_type;
|
||
tree ftype;
|
||
@@ -15435,6 +15657,20 @@ ix86_init_mmx_sse_builtins (void)
|
||
def_builtin (MASK_SSSE3, "__builtin_ia32_palignr", di_ftype_di_di_int,
|
||
IX86_BUILTIN_PALIGNR);
|
||
|
||
+ /* AMDFAM10 SSE4A New built-ins. */
|
||
+ def_builtin (MASK_SSE4A, "__builtin_ia32_movntsd",
|
||
+ void_ftype_pdouble_v2df, IX86_BUILTIN_MOVNTSD);
|
||
+ def_builtin (MASK_SSE4A, "__builtin_ia32_movntss",
|
||
+ void_ftype_pfloat_v4sf, IX86_BUILTIN_MOVNTSS);
|
||
+ def_builtin (MASK_SSE4A, "__builtin_ia32_extrqi",
|
||
+ v2di_ftype_v2di_unsigned_unsigned, IX86_BUILTIN_EXTRQI);
|
||
+ def_builtin (MASK_SSE4A, "__builtin_ia32_extrq",
|
||
+ v2di_ftype_v2di_v16qi, IX86_BUILTIN_EXTRQ);
|
||
+ def_builtin (MASK_SSE4A, "__builtin_ia32_insertqi",
|
||
+ v2di_ftype_v2di_v2di_unsigned_unsigned, IX86_BUILTIN_INSERTQI);
|
||
+ def_builtin (MASK_SSE4A, "__builtin_ia32_insertq",
|
||
+ v2di_ftype_v2di_v2di, IX86_BUILTIN_INSERTQ);
|
||
+
|
||
/* Access to the vec_init patterns. */
|
||
ftype = build_function_type_list (V2SI_type_node, integer_type_node,
|
||
integer_type_node, NULL_TREE);
|
||
@@ -15923,9 +16159,9 @@ ix86_expand_builtin (tree exp, rtx targe
|
||
enum insn_code icode;
|
||
tree fndecl = TREE_OPERAND (TREE_OPERAND (exp, 0), 0);
|
||
tree arglist = TREE_OPERAND (exp, 1);
|
||
- tree arg0, arg1, arg2;
|
||
- rtx op0, op1, op2, pat;
|
||
- enum machine_mode tmode, mode0, mode1, mode2, mode3;
|
||
+ tree arg0, arg1, arg2, arg3;
|
||
+ rtx op0, op1, op2, op3, pat;
|
||
+ enum machine_mode tmode, mode0, mode1, mode2, mode3, mode4;
|
||
unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
|
||
|
||
switch (fcode)
|
||
@@ -16340,6 +16576,114 @@ ix86_expand_builtin (tree exp, rtx targe
|
||
emit_insn (pat);
|
||
return target;
|
||
|
||
+ case IX86_BUILTIN_MOVNTSD:
|
||
+ return ix86_expand_store_builtin (CODE_FOR_sse4a_vmmovntv2df, arglist);
|
||
+
|
||
+ case IX86_BUILTIN_MOVNTSS:
|
||
+ return ix86_expand_store_builtin (CODE_FOR_sse4a_vmmovntv4sf, arglist);
|
||
+
|
||
+ case IX86_BUILTIN_INSERTQ:
|
||
+ case IX86_BUILTIN_EXTRQ:
|
||
+ icode = (fcode == IX86_BUILTIN_EXTRQ
|
||
+ ? CODE_FOR_sse4a_extrq
|
||
+ : CODE_FOR_sse4a_insertq);
|
||
+ arg0 = TREE_VALUE (arglist);
|
||
+ arg1 = TREE_VALUE (TREE_CHAIN (arglist));
|
||
+ op0 = expand_expr (arg0, NULL_RTX, VOIDmode, 0);
|
||
+ op1 = expand_expr (arg1, NULL_RTX, VOIDmode, 0);
|
||
+ tmode = insn_data[icode].operand[0].mode;
|
||
+ mode1 = insn_data[icode].operand[1].mode;
|
||
+ mode2 = insn_data[icode].operand[2].mode;
|
||
+ if (! (*insn_data[icode].operand[1].predicate) (op0, mode1))
|
||
+ op0 = copy_to_mode_reg (mode1, op0);
|
||
+ if (! (*insn_data[icode].operand[2].predicate) (op1, mode2))
|
||
+ op1 = copy_to_mode_reg (mode2, op1);
|
||
+ if (optimize || target == 0
|
||
+ || GET_MODE (target) != tmode
|
||
+ || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
|
||
+ target = gen_reg_rtx (tmode);
|
||
+ pat = GEN_FCN (icode) (target, op0, op1);
|
||
+ if (! pat)
|
||
+ return NULL_RTX;
|
||
+ emit_insn (pat);
|
||
+ return target;
|
||
+
|
||
+ case IX86_BUILTIN_EXTRQI:
|
||
+ icode = CODE_FOR_sse4a_extrqi;
|
||
+ arg0 = TREE_VALUE (arglist);
|
||
+ arg1 = TREE_VALUE (TREE_CHAIN (arglist));
|
||
+ arg2 = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (arglist)));
|
||
+ op0 = expand_expr (arg0, NULL_RTX, VOIDmode, 0);
|
||
+ op1 = expand_expr (arg1, NULL_RTX, VOIDmode, 0);
|
||
+ op2 = expand_expr (arg2, NULL_RTX, VOIDmode, 0);
|
||
+ tmode = insn_data[icode].operand[0].mode;
|
||
+ mode1 = insn_data[icode].operand[1].mode;
|
||
+ mode2 = insn_data[icode].operand[2].mode;
|
||
+ mode3 = insn_data[icode].operand[3].mode;
|
||
+ if (! (*insn_data[icode].operand[1].predicate) (op0, mode1))
|
||
+ op0 = copy_to_mode_reg (mode1, op0);
|
||
+ if (! (*insn_data[icode].operand[2].predicate) (op1, mode2))
|
||
+ {
|
||
+ error ("index mask must be an immediate");
|
||
+ return gen_reg_rtx (tmode);
|
||
+ }
|
||
+ if (! (*insn_data[icode].operand[3].predicate) (op2, mode3))
|
||
+ {
|
||
+ error ("length mask must be an immediate");
|
||
+ return gen_reg_rtx (tmode);
|
||
+ }
|
||
+ if (optimize || target == 0
|
||
+ || GET_MODE (target) != tmode
|
||
+ || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
|
||
+ target = gen_reg_rtx (tmode);
|
||
+ pat = GEN_FCN (icode) (target, op0, op1, op2);
|
||
+ if (! pat)
|
||
+ return NULL_RTX;
|
||
+ emit_insn (pat);
|
||
+ return target;
|
||
+
|
||
+ case IX86_BUILTIN_INSERTQI:
|
||
+ icode = CODE_FOR_sse4a_insertqi;
|
||
+ arg0 = TREE_VALUE (arglist);
|
||
+ arg1 = TREE_VALUE (TREE_CHAIN (arglist));
|
||
+ arg2 = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (arglist)));
|
||
+ arg3 = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (TREE_CHAIN (arglist))));
|
||
+ op0 = expand_expr (arg0, NULL_RTX, VOIDmode, 0);
|
||
+ op1 = expand_expr (arg1, NULL_RTX, VOIDmode, 0);
|
||
+ op2 = expand_expr (arg2, NULL_RTX, VOIDmode, 0);
|
||
+ op3 = expand_expr (arg3, NULL_RTX, VOIDmode, 0);
|
||
+ tmode = insn_data[icode].operand[0].mode;
|
||
+ mode1 = insn_data[icode].operand[1].mode;
|
||
+ mode2 = insn_data[icode].operand[2].mode;
|
||
+ mode3 = insn_data[icode].operand[3].mode;
|
||
+ mode4 = insn_data[icode].operand[4].mode;
|
||
+
|
||
+ if (! (*insn_data[icode].operand[1].predicate) (op0, mode1))
|
||
+ op0 = copy_to_mode_reg (mode1, op0);
|
||
+
|
||
+ if (! (*insn_data[icode].operand[2].predicate) (op1, mode2))
|
||
+ op1 = copy_to_mode_reg (mode2, op1);
|
||
+
|
||
+ if (! (*insn_data[icode].operand[3].predicate) (op2, mode3))
|
||
+ {
|
||
+ error ("index mask must be an immediate");
|
||
+ return gen_reg_rtx (tmode);
|
||
+ }
|
||
+ if (! (*insn_data[icode].operand[4].predicate) (op3, mode4))
|
||
+ {
|
||
+ error ("length mask must be an immediate");
|
||
+ return gen_reg_rtx (tmode);
|
||
+ }
|
||
+ if (optimize || target == 0
|
||
+ || GET_MODE (target) != tmode
|
||
+ || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
|
||
+ target = gen_reg_rtx (tmode);
|
||
+ pat = GEN_FCN (icode) (target, op0, op1, op2, op3);
|
||
+ if (! pat)
|
||
+ return NULL_RTX;
|
||
+ emit_insn (pat);
|
||
+ return target;
|
||
+
|
||
case IX86_BUILTIN_VEC_INIT_V2SI:
|
||
case IX86_BUILTIN_VEC_INIT_V4HI:
|
||
case IX86_BUILTIN_VEC_INIT_V8QI:
|
||
--- gcc/config/i386/xmmintrin.h.jj 2006-10-05 00:29:29.000000000 +0200
|
||
+++ gcc/config/i386/xmmintrin.h 2007-02-09 21:26:06.000000000 +0100
|
||
@@ -1241,7 +1241,9 @@ do { \
|
||
} while (0)
|
||
|
||
/* For backward source compatibility. */
|
||
-#include <emmintrin.h>
|
||
+#ifdef __SSE2__
|
||
+# include <emmintrin.h>
|
||
+#endif
|
||
|
||
#endif /* __SSE__ */
|
||
#endif /* _XMMINTRIN_H_INCLUDED */
|