From d8fbfe3435cf181049488c0bf3677e98a2022e51 Mon Sep 17 00:00:00 2001
From: hubicka <hubicka@138bc75d-0d04-0410-961f-82ee72b054a4>
Date: Mon, 17 Apr 2000 21:39:30 +0000
Subject: [PATCH] 	* i386.c (athlon_cost): Fix lea, divide and XFmode
 move costs. 	(x86_integer_DFmode_moves, x86_partial_reg_dependency, 	
 x86_memory_mismatch_stall): New global variables. 	(ix86_adjust_cost):
 Handle MEMORY_BOTH on places MEMORY_STORE was only 	allowed; fix load
 penalties for Athlon. 	* i386.h (x86_integer_DFmode_moves,
 x86_partial_reg_dependency, 	x86_memory_mismatch_stall): Declare. 
 (TARGET_INTEGER_DFMODE_MOVES, TARGET_PARTIAL_REG_DEPENDENCY, 	
 TARGET_MEMORY_MISMATCH_STALL): New. 	* i386.md (athlon scheduling
 parameters): Fix latencies according to 	Athlon Optimization Manual. 
 (sahf, xchg, fldcw, leave instruction patterns): Set athlon_decode to 
 vector. 	(fsqrt instruction patterns): Set athlon_decode to direct. 
 (movhi_1): Promote for TARGET_PARTIAL_REG_DEPENDENCY and for 
 PARTIAL_REGISTER_STALL with !TARGET_HIMODE_MATH machines. 	(movqi_1):
 Handle promoting correctly for TARGET_PARTIAL_REG_DEPENDENCY 	and
 TARGET_PARTIAL_REGISTER_STALL machines. 	(pushdf_nointeger): New
 pattern. 	(pushdf_integer): Rename from pushdf. 	(movdf_nointeger):
 Enable for !TARGET_INTEGER_DFMODE_MOVES machines. 	(movdf_integer):
 Disable for !TARGET_INTEGER_DFMODE_MOVES machines.

git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@33215 138bc75d-0d04-0410-961f-82ee72b054a4
---
 gcc/ChangeLog           |  25 +++++
 gcc/config/i386/i386.c  |  38 +++----
 gcc/config/i386/i386.h  |   6 +-
 gcc/config/i386/i386.md | 233 ++++++++++++++++++++++++++++++++--------
 4 files changed, 236 insertions(+), 66 deletions(-)

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 9fcbff80587d..8d11d84a3c3c 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,28 @@
+Mon Apr 17 23:35:29 MET DST 2000  Jan Hubicka  <jh@suse.cz>
+
+	* i386.c (athlon_cost): Fix lea, divide and XFmode move costs.
+	(x86_integer_DFmode_moves, x86_partial_reg_dependency,
+	 x86_memory_mismatch_stall): New global variables.
+	(ix86_adjust_cost): Handle MEMORY_BOTH on places MEMORY_STORE was only
+	allowed; fix load penalties for Athlon.
+	* i386.h (x86_integer_DFmode_moves, x86_partial_reg_dependency,
+	x86_memory_mismatch_stall): Declare.
+	(TARGET_INTEGER_DFMODE_MOVES, TARGET_PARTIAL_REG_DEPENDENCY,
+	 TARGET_MEMORY_MISMATCH_STALL): New.
+	* i386.md (athlon scheduling parameters): Fix latencies according to
+	Athlon Optimization Manual.
+	(sahf, xchg, fldcw, leave instruction patterns): Set athlon_decode to
+	vector.
+	(fsqrt instruction patterns): Set athlon_decode to direct.
+	(movhi_1): Promote for TARGET_PARTIAL_REG_DEPENDENCY and for
+	PARTIAL_REGISTER_STALL with !TARGET_HIMODE_MATH machines.
+	(movqi_1): Handle promoting correctly for TARGET_PARTIAL_REG_DEPENDENCY
+	and TARGET_PARTIAL_REGISTER_STALL machines.
+	(pushdf_nointeger): New pattern.
+	(pushdf_integer): Rename from pushdf.
+	(movdf_nointeger): Enable for !TARGET_INTEGER_DFMODE_MOVES machines.
+	(movdf_integer): Disable for !TARGET_INTEGER_DFMODE_MOVES machines.
+
 2000-04-17  Richard Henderson  <rth@cygnus.com>
 
 	* loop.c (canonicalize_condition): Add WANT_REG argument.
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 516c27449347..b2e81a47e2ac 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -163,12 +163,12 @@ struct processor_costs k6_cost = {
 
 struct processor_costs athlon_cost = {
   1,					/* cost of an add instruction */
-  1,					/* cost of a lea instruction */
+  2,					/* cost of a lea instruction */
   1,					/* variable shift costs */
   1,					/* constant shift costs */
   5,					/* cost of starting a multiply */
   0,					/* cost of multiply per each bit set */
-  19,					/* cost of a divide/mod */
+  42,					/* cost of a divide/mod */
   8,					/* "large" insn */
   9,					/* MOVE_RATIO */
   4,					/* cost for loading QImode using movzbl */
@@ -177,9 +177,9 @@ struct processor_costs athlon_cost = {
 					   Relative to reg-reg move (2). */
   {2, 3, 2},				/* cost of storing integer registers */
   4,					/* cost of reg,reg fld/fst */
-  {6, 6, 6},				/* cost of loading fp registers
+  {6, 6, 20},				/* cost of loading fp registers
 					   in SFmode, DFmode and XFmode */
-  {4, 4, 4}				/* cost of loading integer registers */
+  {4, 4, 16}				/* cost of loading integer registers */
 };
 
 struct processor_costs *ix86_cost = &pentium_cost;
@@ -222,6 +222,9 @@ const int x86_sub_esp_4 = m_ATHLON | m_PPRO;
 const int x86_sub_esp_8 = m_ATHLON | m_PPRO | m_386 | m_486;
 const int x86_add_esp_4 = m_ATHLON | m_K6;
 const int x86_add_esp_8 = m_ATHLON | m_PPRO | m_K6 | m_386 | m_486;
+const int x86_integer_DFmode_moves = ~m_ATHLON;
+const int x86_partial_reg_dependency = m_ATHLON;
+const int x86_memory_mismatch_stall = m_ATHLON;
 
 #define AT_BP(mode) (gen_rtx_MEM ((mode), hard_frame_pointer_rtx))
 
@@ -6287,6 +6290,7 @@ ix86_adjust_cost (insn, link, dep_insn, cost)
      int cost;
 {
   enum attr_type insn_type, dep_insn_type;
+  enum attr_memory memory;
   rtx set, set2;
   int dep_insn_code_number;
 
@@ -6334,7 +6338,8 @@ ix86_adjust_cost (insn, link, dep_insn, cost)
 	 increase the cost here for non-imov insns.  */
       if (dep_insn_type != TYPE_IMOV
 	  && dep_insn_type != TYPE_FMOV
-	  && get_attr_memory (dep_insn) == MEMORY_LOAD)
+	  && ((memory = get_attr_memory (dep_insn)) == MEMORY_LOAD
+              || memory == MEMORY_BOTH))
 	cost += 1;
 
       /* INT->FP conversion is expensive.  */
@@ -6359,7 +6364,8 @@ ix86_adjust_cost (insn, link, dep_insn, cost)
 
       /* Since we can't represent delayed latencies of load+operation, 
 	 increase the cost here for non-imov insns.  */
-      if (get_attr_memory (dep_insn) == MEMORY_LOAD)
+      if ((memory = get_attr_memory (dep_insn)) == MEMORY_LOAD
+          || memory == MEMORY_BOTH)
 	cost += (dep_insn_type != TYPE_IMOV) ? 2 : 1;
 
       /* INT->FP conversion is expensive.  */
@@ -6368,19 +6374,15 @@ ix86_adjust_cost (insn, link, dep_insn, cost)
       break;
 
     case PROCESSOR_ATHLON:
-      /* Address Generation Interlock cause problems on the Athlon CPU because
-         the loads and stores are done in order so once one load or store has
-	 to wait, others must too, so penalize the AGIs slightly by one cycle.
-	 We might experiment with this value later.  */
-      if (ix86_agi_dependant (insn, dep_insn, insn_type))
-	cost += 1;
+      if ((memory = get_attr_memory (dep_insn)) == MEMORY_LOAD
+           || memory == MEMORY_BOTH)
+	{
+	  if (dep_insn_type == TYPE_IMOV || dep_insn_type == TYPE_FMOV)
+	    cost += 2;
+	  else
+	    cost += 3;
+        }
 
-      /* Since we can't represent delayed latencies of load+operation, 
-	 increase the cost here for non-imov insns.  */
-      if (dep_insn_type != TYPE_IMOV
-	  && dep_insn_type != TYPE_FMOV
-	  && get_attr_memory (dep_insn) == MEMORY_LOAD)
-	cost += 2;
     default:
       break;
     }
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 50f08251420f..8ae7be8d4446 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -173,8 +173,9 @@ extern const int x86_use_cltd, x86_read_modify_write;
 extern const int x86_read_modify, x86_split_long_moves;
 extern const int x86_promote_QImode, x86_single_stringop;
 extern const int x86_himode_math, x86_qimode_math, x86_promote_qi_regs;
-extern const int x86_promote_hi_regs;
+extern const int x86_promote_hi_regs, x86_integer_DFmode_moves;
 extern const int x86_add_esp_4, x86_add_esp_8, x86_sub_esp_4, x86_sub_esp_8;
+extern const int x86_partial_reg_dependency, x86_memory_mismatch_stall;
 
 #define TARGET_USE_LEAVE (x86_use_leave & CPUMASK)
 #define TARGET_PUSH_MEMORY (x86_push_memory & CPUMASK)
@@ -206,6 +207,9 @@ extern const int x86_add_esp_4, x86_add_esp_8, x86_sub_esp_4, x86_sub_esp_8;
 #define TARGET_ADD_ESP_8 (x86_add_esp_8 & CPUMASK)
 #define TARGET_SUB_ESP_4 (x86_sub_esp_4 & CPUMASK)
 #define TARGET_SUB_ESP_8 (x86_sub_esp_8 & CPUMASK)
+#define TARGET_INTEGER_DFMODE_MOVES (x86_integer_DFmode_moves & CPUMASK)
+#define TARGET_PARTIAL_REG_DEPENDENCY (x86_partial_reg_dependency & CPUMASK)
+#define TARGET_MEMORY_MISMATCH_STALL (x86_memory_mismatch_stall & CPUMASK)
 
 #define TARGET_STACK_PROBE (target_flags & MASK_STACK_PROBE)
 
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 090d0ebd4441..8b9b2530e787 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -738,7 +738,7 @@
 ;; communicates with all the execution units seperately instead.
 
 (define_attr "athlon_decode" "direct,vector"
-  (cond [(eq_attr "type" "call,imul,idiv,other,multi,fcmov,fpspc,str")
+  (cond [(eq_attr "type" "call,imul,idiv,other,multi,fcmov,fpspc,str,pop,cld")
 	   (const_string "vector")
          (and (eq_attr "type" "push")
               (match_operand 1 "memory_operand" ""))
@@ -766,7 +766,7 @@
 
 (define_function_unit "athlon_ieu" 3 0
   (and (eq_attr "cpu" "athlon")
-       (eq_attr "type" "alu1,negnot,alu,icmp,imov,imovx,lea,incdec,ishift,imul,idiv,ibr,setcc,push,pop,call,callv,icmov,str,cld"))
+       (eq_attr "type" "alu1,negnot,alu,icmp,imov,imovx,lea,incdec,ishift,ibr,call,callv,icmov,cld,pop,setcc,push"))
   1 1)
 
 (define_function_unit "athlon_ieu" 3 0
@@ -777,12 +777,12 @@
 (define_function_unit "athlon_ieu" 3 0
   (and (eq_attr "cpu" "athlon")
        (eq_attr "type" "imul"))
-  4 0)
+  5 0)
 
 (define_function_unit "athlon_ieu" 3 0
   (and (eq_attr "cpu" "athlon")
        (eq_attr "type" "idiv"))
-  27 0)
+  42 0)
 
 (define_function_unit "athlon_muldiv" 1 0
   (and (eq_attr "cpu" "athlon")
@@ -792,56 +792,118 @@
 (define_function_unit "athlon_muldiv" 1 0
   (and (eq_attr "cpu" "athlon")
        (eq_attr "type" "idiv"))
-  27 27)
+  42 42)
 
-(define_attr "athlon_fpunits" "none,store,mul,add,muladd,all"
+(define_attr "athlon_fpunits" "none,store,mul,add,muladd,any"
   (cond [(eq_attr "type" "fop,fop1,fcmp")
 	   (const_string "add")
-         (eq_attr "type" "fmul,fdiv,fpspc,fsgn")
+         (eq_attr "type" "fmul,fdiv,fpspc,fsgn,fcmov")
 	   (const_string "mul")
-	 (and (eq_attr "type" "fmov") (eq_attr "memory" "!none"))
+	 (and (eq_attr "type" "fmov") (eq_attr "memory" "store,both"))
 	   (const_string "store")
+	 (and (eq_attr "type" "fmov") (eq_attr "memory" "load"))
+	   (const_string "any")
          (and (eq_attr "type" "fmov")
               (ior (match_operand:SI 1 "register_operand" "")
                    (match_operand 1 "immediate_operand" "")))
 	   (const_string "store")
          (eq_attr "type" "fmov")
-	   (const_string "muladd")
-         (eq_attr "type" "fcmov")
-	   (const_string "all")]
+	   (const_string "muladd")]
 	(const_string "none")))
 
-(define_function_unit "athlon_fp_mul" 1 0
+;; We use a latency of 1 for definitions.  This is OK to model collisions
+;; in execution units.  The real latencies are modeled in the "fp" pipeline.
+
+;; fsin, fcos: 96-192
+;; fsincos: 107-211
+;; fsqrt: 19 for SFmode, 27 for DFmode, 35 for XFmode.
+(define_function_unit "athlon_fp" 3 0
   (and (eq_attr "cpu" "athlon")
-       (eq_attr "athlon_fpunits" "mul,all"))
-  4 1)
+       (eq_attr "type" "fpspc"))
+  100 1)
 
-(define_function_unit "athlon_fp_add" 1 0
+;; 16 cycles for SFmode, 20 for DFmode and 24 for XFmode.
+(define_function_unit "athlon_fp" 3 0
   (and (eq_attr "cpu" "athlon")
-       (eq_attr "athlon_fpunits" "add,all"))
+       (eq_attr "type" "fdiv"))
+  24 1)
+
+(define_function_unit "athlon_fp" 3 0
+  (and (eq_attr "cpu" "athlon")
+       (eq_attr "type" "fop,fop1,fmul"))
   4 1)
 
-(define_function_unit "athlon_fp_muladd" 2 0
+;; XFmode loads are slow.
+;; XFmode store is slow too (8 cycles), but we don't need to model it, because
+;; there are no dependent instructions.
+
+(define_function_unit "athlon_fp" 3 0
   (and (eq_attr "cpu" "athlon")
        (and (eq_attr "type" "fmov")
-            (eq_attr "athlon_fpunits" "muladd,mul,add,all")))
+	    (match_operand:XF 1 "memory_operand" "")))
+  10 1)
+
+(define_function_unit "athlon_fp" 3 0
+  (and (eq_attr "cpu" "athlon")
+       (eq_attr "type" "fmov,fsgn"))
   2 1)
 
+;; fcmp and ftst instructions
+(define_function_unit "athlon_fp" 3 0
+  (and (eq_attr "cpu" "athlon")
+       (and (eq_attr "type" "fcmp")
+	    (eq_attr "athlon_decode" "direct")))
+  3 1)
+
+;; fcmpi instructions.
+(define_function_unit "athlon_fp" 3 0
+  (and (eq_attr "cpu" "athlon")
+       (and (eq_attr "type" "fcmp")
+	    (eq_attr "athlon_decode" "vector")))
+  3 1)
+
+(define_function_unit "athlon_fp" 3 0
+  (and (eq_attr "cpu" "athlon")
+       (eq_attr "type" "fcmov"))
+  7 1)
+
+(define_function_unit "athlon_fp_mul" 1 0
+  (and (eq_attr "cpu" "athlon")
+       (eq_attr "athlon_fpunits" "mul"))
+  1 1)
+
+(define_function_unit "athlon_fp_add" 1 0
+  (and (eq_attr "cpu" "athlon")
+       (eq_attr "athlon_fpunits" "add"))
+  1 1)
+
 (define_function_unit "athlon_fp_muladd" 2 0
   (and (eq_attr "cpu" "athlon")
-       (and (eq_attr "type" "!fmov")
-            (eq_attr "athlon_fpunits" "muladd,mul,add,all")))
-  4 1)
+       (eq_attr "athlon_fpunits" "muladd,mul,add"))
+  1 1)
 
 (define_function_unit "athlon_fp_store" 1 0
   (and (eq_attr "cpu" "athlon")
-       (eq_attr "athlon_fpunits" "store,all"))
+       (eq_attr "athlon_fpunits" "store"))
   1 1)
 
-(define_function_unit "athlon_agu" 3 0
+;; We don't need to model the Address Generation Unit, since we don't model
+;; the re-order buffer yet and thus we never schedule more than three operations
+;; at a time.  Later we may want to experiment with MD_SCHED macros modeling
+;; the decoders independently of the functional units.
+
+;(define_function_unit "athlon_agu" 3 0
+;  (and (eq_attr "cpu" "athlon")
+;       (and (eq_attr "memory" "!none")
+;            (eq_attr "athlon_fpunits" "none")))
+;  1 1)
+
+;; Model the load unit to avoid overly long sequences of loads.  We don't need
+;; to model the store queue, since it is unlikely to be a bottleneck.
+
+(define_function_unit "athlon_load" 2 0
   (and (eq_attr "cpu" "athlon")
-       (and (eq_attr "memory" "!none")
-            (eq_attr "athlon_fpunits" "none")))
+       (eq_attr "memory" "load,both"))
   1 1)
 
 
@@ -1255,6 +1317,7 @@
   ""
   "sahf"
   [(set_attr "length" "1")
+   (set_attr "athlon_decode" "vector")
    (set_attr "ppro_uops" "one")])
 
 ;; Pentium Pro can do steps 1 through 3 in one go.
@@ -1390,6 +1453,7 @@
   "xchg{l}\\t%1, %0"
   [(set_attr "type" "imov")
    (set_attr "pent_pair" "np")
+   (set_attr "athlon_decode" "vector")
    (set_attr "ppro_uops" "few")])
 
 (define_expand "movhi"
@@ -1437,8 +1501,10 @@
 }"
   [(set (attr "type")
      (cond [(and (eq_attr "alternative" "0")
-		 (eq (symbol_ref "TARGET_PARTIAL_REG_STALL")
-		     (const_int 0)))
+		 (ior (eq (symbol_ref "TARGET_PARTIAL_REG_STALL")
+			  (const_int 0))
+		      (eq (symbol_ref "TARGET_HIMODE_MATH")
+			  (const_int 0))))
 	      (const_string "imov")
 	    (and (eq_attr "alternative" "1,2")
 		 (match_operand:HI 1 "aligned_operand" ""))
@@ -1456,8 +1522,10 @@
 		  (match_operand:HI 1 "aligned_operand" ""))
 	       (const_string "0")
 	     (and (eq_attr "alternative" "0")
-		  (eq (symbol_ref "TARGET_PARTIAL_REG_STALL")
-		      (const_int 0)))
+		  (ior (eq (symbol_ref "TARGET_PARTIAL_REG_STALL")
+			   (const_int 0))
+		       (eq (symbol_ref "TARGET_HIMODE_MATH")
+			   (const_int 0))))
 	       (const_string "0")
 	    ]
 	    (const_string "1")))
@@ -1547,9 +1615,19 @@
   [(set_attr "type" "pop")
    (set_attr "length_prefix" "1")])
 
+;; Choosing between a full-sized (SImode) move and a QImode move is tricky.
+;; For a Q_REG -> Q_REG move we use the full-sized move only on partial
+;; register dependency machines (such as AMD Athlon), where a QImode move
+;; introduces an extra dependency, and on partial register stall machines
+;; that don't use QImode patterns (where a QImode move causes a stall on the
+;; next instruction).
+;;
+;; For loads of Q_REG to NONQ_REG we use full-sized moves, except on partial
+;; register stall machines that use QImode arithmetic, where a full-sized
+;; move could cause a partial register stall; there we use movzx instead.
 (define_insn "*movqi_1"
-  [(set (match_operand:QI 0 "nonimmediate_operand" "=q,q,r,?r,m")
-	(match_operand:QI 1 "general_operand" "qn,qm,rn,qm,qn"))]
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=q,q ,q ,r,r ,?r,m")
+	(match_operand:QI 1 "general_operand"      " q,qn,qm,q,rn,qm,qn"))]
   "GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM"
   "*
 {
@@ -1560,26 +1638,50 @@
 	abort ();
       return \"movz{bl|x}\\t{%1, %k0|%k0, %1}\";
     default:
-      if (which_alternative == 2)
+      if (which_alternative == 4 || which_alternative == 3
+	  || (which_alternative == 1 && get_attr_length (insn) == 5)
+	  || (which_alternative == 0
+	      && ((TARGET_PARTIAL_REG_STALL && !TARGET_QIMODE_MATH)
+		  || TARGET_PARTIAL_REG_DEPENDENCY)))
         return \"mov{l}\\t{%k1, %k0|%k0, %k1}\";
       else
         return \"mov{b}\\t{%1, %0|%0, %1}\";
     }
 }"
   [(set (attr "type")
-     (cond [(eq_attr "alternative" "3")
+     (cond [(and (eq_attr "alternative" "3")
+		 (ior (eq (symbol_ref "TARGET_PARTIAL_REG_STALL")
+			  (const_int 0))
+		      (eq (symbol_ref "TARGET_QIMODE_MATH")
+			  (const_int 0))))
+	      (const_string "imov")
+	    (eq_attr "alternative" "3,5")
 	      (const_string "imovx")
 	    (and (ne (symbol_ref "TARGET_MOVX")
 		     (const_int 0))
-		 (eq_attr "alternative" "1"))
+		 (eq_attr "alternative" "2"))
 	      (const_string "imovx")
 	   ]
 	   (const_string "imov")))
     ; There's no place to override just the immediate length
     (set (attr "length")
       (cond [(and (eq_attr "type" "imov")
-		  (and (eq_attr "alternative" "2")
-		       (match_operand:HI 1 "immediate_operand" "")))
+		  (and (match_operand:HI 1 "immediate_operand" "")
+		       (eq_attr "alternative" "4")))
+	       (const_string "5")
+	     ;; Avoid extra dependency on partial register.
+	     (and (eq_attr "type" "imov")
+		  (and (eq_attr "alternative" "1")
+		       (ne (symbol_ref "TARGET_PARTIAL_REG_DEPENDENCY")
+			   (const_int 0))))
+	       (const_string "5")
+	     ;; Avoid partial register stalls when not using QImode arithmetic
+	     (and (eq_attr "type" "imov")
+		  (and (eq_attr "alternative" "1")
+		       (and (ne (symbol_ref "TARGET_PARTIAL_REG_STALL")
+				(const_int 0))
+			    (eq (symbol_ref "TARGET_QIMODE_MATH")
+				(const_int 0)))))
 	       (const_string "5")
 	    ]
 	    (const_string "*")))])
@@ -1904,10 +2006,38 @@
 ;; On the average, pushdf using integers can be still shorter.  Allow this
 ;; pattern for optimize_size too.
 
-(define_insn "*pushdf"
+(define_insn "*pushdf_nointeger"
+  [(set (match_operand:DF 0 "push_operand" "=<,<,<")
+	(match_operand:DF 1 "general_no_elim_operand" "f,Fo#f,*r#f"))]
+  "!TARGET_INTEGER_DFMODE_MOVES"
+  "*
+{
+  switch (which_alternative)
+    {
+    case 0:
+      /* %%% We lose REG_DEAD notes for controlling pops if we split late.  */
+      operands[0] = gen_rtx_MEM (DFmode, stack_pointer_rtx);
+      operands[2] = stack_pointer_rtx;
+      operands[3] = GEN_INT (8);
+      if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
+	return \"sub{l}\\t{%3, %2|%2, %3}\;fstp%z0\\t%y0\";
+      else
+	return \"sub{l}\\t{%3, %2|%2, %3}\;fst%z0\\t%y0\";
+
+    case 1:
+    case 2:
+      return \"#\";
+
+    default:
+      abort ();
+    }
+}"
+  [(set_attr "type" "multi")])
+
+(define_insn "*pushdf_integer"
   [(set (match_operand:DF 0 "push_operand" "=<,<")
 	(match_operand:DF 1 "general_no_elim_operand" "f#r,rFo#f"))]
-  ""
+  "TARGET_INTEGER_DFMODE_MOVES"
   "*
 {
   switch (which_alternative)
@@ -1955,7 +2085,7 @@
   [(set (match_operand:DF 0 "nonimmediate_operand" "=f,m,f,*r,o")
 	(match_operand:DF 1 "general_operand" "fm,f,G,*roF,F*r"))]
   "(GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM)
-   && optimize_size
+   && (optimize_size || !TARGET_INTEGER_DFMODE_MOVES)
    && (reload_in_progress || reload_completed
        || GET_CODE (operands[1]) != CONST_DOUBLE
        || memory_operand (operands[0], DFmode))" 
@@ -2002,7 +2132,7 @@
   [(set (match_operand:DF 0 "nonimmediate_operand" "=f#r,m,f#r,r#f,o")
 	(match_operand:DF 1 "general_operand" "fm#r,f#r,G,roF#f,Fr#f"))]
   "(GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM)
-   && !optimize_size
+   && !optimize_size && TARGET_INTEGER_DFMODE_MOVES
    && (reload_in_progress || reload_completed
        || GET_CODE (operands[1]) != CONST_DOUBLE
        || memory_operand (operands[0], DFmode))" 
@@ -2304,7 +2434,8 @@
   else
     return \"fxch\\t%0\";
 }"
-  [(set_attr "type" "fxch")])
+  [(set_attr "type" "fxch")
+   (set_attr "athlon_decode" "vector")])
 
 ;; Zero extension instructions
 
@@ -3202,6 +3333,7 @@
   "TARGET_80387"
   "fldcw\\t%0"
   [(set_attr "length_opcode" "2")
+   (set_attr "athlon_decode" "vector")
    (set_attr "ppro_uops" "few")])
 
 ;; Conversion between fixed point and floating point.
@@ -7691,6 +7823,7 @@
   ""
   "leave"
   [(set_attr "length" "1")
+   (set_attr "athlon_decode" "vector")
    (set_attr "ppro_uops" "few")])
 
 (define_expand "ffssi2"
@@ -8123,7 +8256,8 @@
 	(sqrt:SF (match_operand:SF 1 "register_operand" "0")))]
   "! TARGET_NO_FANCY_MATH_387 && TARGET_80387"
   "fsqrt"
-  [(set_attr "type" "fpspc")])
+  [(set_attr "type" "fpspc")
+   (set_attr "athlon_decode" "direct")])
 
 (define_insn "sqrtdf2"
   [(set (match_operand:DF 0 "register_operand" "=f")
@@ -8131,7 +8265,8 @@
   "! TARGET_NO_FANCY_MATH_387 && TARGET_80387
    && (TARGET_IEEE_FP || flag_fast_math) "
   "fsqrt"
-  [(set_attr "type" "fpspc")])
+  [(set_attr "type" "fpspc")
+   (set_attr "athlon_decode" "direct")])
 
 (define_insn "*sqrtextendsfdf2"
   [(set (match_operand:DF 0 "register_operand" "=f")
@@ -8139,7 +8274,8 @@
 		  (match_operand:SF 1 "register_operand" "0"))))]
   "! TARGET_NO_FANCY_MATH_387 && TARGET_80387"
   "fsqrt"
-  [(set_attr "type" "fpspc")])
+  [(set_attr "type" "fpspc")
+   (set_attr "athlon_decode" "direct")])
 
 (define_insn "sqrtxf2"
   [(set (match_operand:XF 0 "register_operand" "=f")
@@ -8147,7 +8283,8 @@
   "! TARGET_NO_FANCY_MATH_387 && TARGET_80387 
    && (TARGET_IEEE_FP || flag_fast_math) "
   "fsqrt"
-  [(set_attr "type" "fpspc")])
+  [(set_attr "type" "fpspc")
+   (set_attr "athlon_decode" "direct")])
 
 (define_insn "*sqrtextenddfxf2"
   [(set (match_operand:XF 0 "register_operand" "=f")
@@ -8155,7 +8292,8 @@
 		  (match_operand:DF 1 "register_operand" "0"))))]
   "! TARGET_NO_FANCY_MATH_387 && TARGET_80387"
   "fsqrt"
-  [(set_attr "type" "fpspc")])
+  [(set_attr "type" "fpspc")
+   (set_attr "athlon_decode" "direct")])
 
 (define_insn "*sqrtextendsfxf2"
   [(set (match_operand:XF 0 "register_operand" "=f")
@@ -8163,7 +8301,8 @@
 		  (match_operand:SF 1 "register_operand" "0"))))]
   "! TARGET_NO_FANCY_MATH_387 && TARGET_80387"
   "fsqrt"
-  [(set_attr "type" "fpspc")])
+  [(set_attr "type" "fpspc")
+   (set_attr "athlon_decode" "direct")])
 
 (define_insn "sindf2"
   [(set (match_operand:DF 0 "register_operand" "=f")
-- 
GitLab