From 4e8e57b0ce67551ca61b7883e73586ba805f0a61 Mon Sep 17 00:00:00 2001
From: fxcoudert <fxcoudert@138bc75d-0d04-0410-961f-82ee72b054a4>
Date: Sun, 22 Oct 2006 07:41:48 +0000
Subject: [PATCH] 	PR fortran/26025

	* lang.opt: Add -fexternal-blas and -fblas-matmul-limit options.
	* options.c (gfc_init_options): Initialize new flags.
	(gfc_handle_option): Handle new flags.
	* gfortran.h (gfc_option): Add flag_external_blas and
	blas_matmul_limit flags.
	* trans-expr.c (gfc_conv_function_call): Use new argument
	append_args, appending it at the end of the argument list
	built for a function call.
	* trans-stmt.c (gfc_trans_call): Use NULL_TREE for the new
	append_args argument to gfc_trans_call.
	* trans.h (gfc_conv_function_call): Update prototype.
	* trans-decl.c (gfc_build_intrinsic_function_decls): Add
	prototypes for BLAS ?gemm routines.
	* trans-intrinsic.c (gfc_conv_intrinsic_funcall): Generate the
	extra arguments given to the library matmul function, and give
	them to gfc_conv_function_call.
	* invoke.texi: Add documentation for -fexternal-blas and
	-fblas-matmul-limit.

	* m4/matmul.m4: Add possible call to gemm routine.
	* generated/matmul_r8.c: Regenerate.
	* generated/matmul_r16.c: Regenerate.
	* generated/matmul_c8.c: Regenerate.
	* generated/matmul_i8.c: Regenerate.
	* generated/matmul_c16.c: Regenerate.
	* generated/matmul_r10.c: Regenerate.
	* generated/matmul_r4.c: Regenerate.
	* generated/matmul_c10.c: Regenerate.
	* generated/matmul_c4.c: Regenerate.
	* generated/matmul_i4.c: Regenerate.
	* generated/matmul_i16.c: Regenerate.


git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@117948 138bc75d-0d04-0410-961f-82ee72b054a4
---
 gcc/fortran/ChangeLog              | 22 +++++++++++++
 gcc/fortran/gfortran.h             |  2 ++
 gcc/fortran/invoke.texi            | 24 +++++++++++++-
 gcc/fortran/lang.opt               |  8 +++++
 gcc/fortran/options.c              | 10 ++++++
 gcc/fortran/trans-decl.c           | 49 +++++++++++++++++++++++++++++
 gcc/fortran/trans-expr.c           |  9 ++++--
 gcc/fortran/trans-intrinsic.c      | 50 +++++++++++++++++++++++++++++-
 gcc/fortran/trans-stmt.c           |  6 ++--
 gcc/fortran/trans.h                |  9 +++++-
 libgfortran/ChangeLog              | 16 ++++++++++
 libgfortran/generated/matmul_c10.c | 47 ++++++++++++++++++++++++++--
 libgfortran/generated/matmul_c16.c | 47 ++++++++++++++++++++++++++--
 libgfortran/generated/matmul_c4.c  | 47 ++++++++++++++++++++++++++--
 libgfortran/generated/matmul_c8.c  | 47 ++++++++++++++++++++++++++--
 libgfortran/generated/matmul_i16.c | 47 ++++++++++++++++++++++++++--
 libgfortran/generated/matmul_i4.c  | 47 ++++++++++++++++++++++++++--
 libgfortran/generated/matmul_i8.c  | 47 ++++++++++++++++++++++++++--
 libgfortran/generated/matmul_r10.c | 47 ++++++++++++++++++++++++++--
 libgfortran/generated/matmul_r16.c | 47 ++++++++++++++++++++++++++--
 libgfortran/generated/matmul_r4.c  | 47 ++++++++++++++++++++++++++--
 libgfortran/generated/matmul_r8.c  | 47 ++++++++++++++++++++++++++--
 libgfortran/m4/matmul.m4           | 47 ++++++++++++++++++++++++++--
 23 files changed, 726 insertions(+), 43 deletions(-)

diff --git a/gcc/fortran/ChangeLog b/gcc/fortran/ChangeLog
index 94e4d6cd013b..53b7b300764a 100644
--- a/gcc/fortran/ChangeLog
+++ b/gcc/fortran/ChangeLog
@@ -1,3 +1,25 @@
+2006-10-22  Francois-Xavier Coudert  <coudert@clipper.ens.fr>
+
+	PR fortran/26025
+	* lang.opt: Add -fexternal-blas and -fblas-matmul-limit options.
+	* options.c (gfc_init_options): Initialize new flags.
+	(gfc_handle_option): Handle new flags.
+	* gfortran.h (gfc_option): Add flag_external_blas and
+	blas_matmul_limit flags.
+	* trans-expr.c (gfc_conv_function_call): Use new argument
+	append_args, appending it at the end of the argument list
+	built for a function call.
+	* trans-stmt.c (gfc_trans_call): Use NULL_TREE for the new
+	append_args argument to gfc_trans_call.
+	* trans.h (gfc_conv_function_call): Update prototype.
+	* trans-decl.c (gfc_build_intrinsic_function_decls): Add
+	prototypes for BLAS ?gemm routines.
+	* trans-intrinsic.c (gfc_conv_intrinsic_funcall): Generate the
+	extra arguments given to the library matmul function, and give
+	them to gfc_conv_function_call.
+	* invoke.texi: Add documentation for -fexternal-blas and
+	-fblas-matmul-limit.
+
 2006-10-21  Kaveh R. Ghazi  <ghazi@caip.rutgers.edu>
 
 	* Make-lang.in (F95_LIBS): Delete.
diff --git a/gcc/fortran/gfortran.h b/gcc/fortran/gfortran.h
index c89c136f6c09..b34d1c28dcbf 100644
--- a/gcc/fortran/gfortran.h
+++ b/gcc/fortran/gfortran.h
@@ -1652,6 +1652,8 @@ typedef struct
   int flag_f2c;
   int flag_automatic;
   int flag_backslash;
+  int flag_external_blas;
+  int blas_matmul_limit;
   int flag_cray_pointer;
   int flag_d_lines;
   int flag_openmp;
diff --git a/gcc/fortran/invoke.texi b/gcc/fortran/invoke.texi
index 51554a5ddcff..8c6aadd6153a 100644
--- a/gcc/fortran/invoke.texi
+++ b/gcc/fortran/invoke.texi
@@ -152,7 +152,8 @@ by type.  Explanations are in the following sections.
 @gccoptlist{
 -fno-automatic -ff2c -fno-underscoring  -fsecond-underscore @gol
 -fbounds-check  -fmax-stack-var-size=@var{n} @gol
--fpack-derived  -frepack-arrays  -fshort-enums}
+-fpack-derived  -frepack-arrays  -fshort-enums -fexternal-blas
+-fblas-matmul-limit=@var{n}}
 @end table
 
 @menu
@@ -859,6 +860,27 @@ This option is provided for interoperability with C code that was
 compiled with the @command{-fshort-enums} option.  It will make
 GNU Fortran choose the smallest @code{INTEGER} kind a given
 enumerator set will fit in, and give all its enumerators this kind.
+
+@cindex -fexternal-blas
+@item -fexternal-blas
+This option will make gfortran generate calls to BLAS functions for some
+matrix operations like @code{MATMUL}, instead of using our own
+algorithms, if the size of the matrices involved is larger than a given
+limit (see @command{-fblas-matmul-limit}).  This may be profitable if an
+optimized vendor BLAS library is available.  The BLAS library will have
+to be specified at link time.
+
+@cindex -fblas-matmul-limit
+@item -fblas-matmul-limit=@var{n}
+Only significant when @command{-fexternal-blas} is in effect.
+Matrix multiplication of matrices with size larger than (or equal to) @var{n}
+will be performed by calls to BLAS functions, while others will be
+handled by @command{gfortran} internal algorithms. If the matrices
+involved are not square, the size comparison is performed using the
+geometric mean of the dimensions of the argument and result matrices.
+
+The default value for @var{n} is 30.
+
 @end table
 
 @xref{Code Gen Options,,Options for Code Generation Conventions,
diff --git a/gcc/fortran/lang.opt b/gcc/fortran/lang.opt
index cb8810ae62b4..cbef46a040d7 100644
--- a/gcc/fortran/lang.opt
+++ b/gcc/fortran/lang.opt
@@ -85,6 +85,14 @@ fbackslash
 Fortran
 Specify that backslash in string introduces an escape character
 
+fexternal-blas
+Fortran
+Specify that an external BLAS library should be used for matmul calls on large-size arrays
+
+fblas-matmul-limit=
+Fortran RejectNegative Joined UInteger
+-fblas-matmul-limit=<n>        Size of the smallest matrix for which matmul will use BLAS
+
 fdefault-double-8
 Fortran
 Set the default double precision kind to an 8 byte wide type
diff --git a/gcc/fortran/options.c b/gcc/fortran/options.c
index 96347042bf37..f821d3e2695e 100644
--- a/gcc/fortran/options.c
+++ b/gcc/fortran/options.c
@@ -80,6 +80,8 @@ gfc_init_options (unsigned int argc ATTRIBUTE_UNUSED,
   gfc_option.flag_preprocessed = 0;
   gfc_option.flag_automatic = 1;
   gfc_option.flag_backslash = 1;
+  gfc_option.flag_external_blas = 0;
+  gfc_option.blas_matmul_limit = 30;
   gfc_option.flag_cray_pointer = 0;
   gfc_option.flag_d_lines = -1;
   gfc_option.flag_openmp = 0;
@@ -450,6 +452,14 @@ gfc_handle_option (size_t scode, const char *arg, int value)
       gfc_option.flag_dollar_ok = value;
       break;
 
+    case OPT_fexternal_blas:
+      gfc_option.flag_external_blas = value;
+      break;
+
+    case OPT_fblas_matmul_limit_:
+      gfc_option.blas_matmul_limit = value;
+      break;
+
     case OPT_fd_lines_as_code:
       gfc_option.flag_d_lines = 1;
       break;
diff --git a/gcc/fortran/trans-decl.c b/gcc/fortran/trans-decl.c
index d12b953cf9e5..82315b708fcd 100644
--- a/gcc/fortran/trans-decl.c
+++ b/gcc/fortran/trans-decl.c
@@ -143,6 +143,12 @@ tree gfor_fndecl_iargc;
 tree gfor_fndecl_si_kind;
 tree gfor_fndecl_sr_kind;
 
+/* BLAS gemm functions.  */
+tree gfor_fndecl_sgemm;
+tree gfor_fndecl_dgemm;
+tree gfor_fndecl_cgemm;
+tree gfor_fndecl_zgemm;
+
 
 static void
 gfc_add_decl_to_parent_function (tree decl)
@@ -2186,6 +2192,49 @@ gfc_build_intrinsic_function_decls (void)
 				       gfc_int4_type_node, 1,
 				       gfc_real16_type_node);
 
+  /* BLAS functions.  */
+  {
+    tree pint = build_pointer_type (gfc_c_int_type_node);
+    tree ps = build_pointer_type (gfc_get_real_type (gfc_default_real_kind));
+    tree pd = build_pointer_type (gfc_get_real_type (gfc_default_double_kind));
+    tree pc = build_pointer_type (gfc_get_complex_type (gfc_default_real_kind));
+    tree pz = build_pointer_type
+		(gfc_get_complex_type (gfc_default_double_kind));
+
+    gfor_fndecl_sgemm = gfc_build_library_function_decl
+			  (get_identifier
+			     (gfc_option.flag_underscoring ? "sgemm_"
+							   : "sgemm"),
+			   void_type_node, 15, pchar_type_node,
+			   pchar_type_node, pint, pint, pint, ps, ps, pint,
+			   ps, pint, ps, ps, pint, gfc_c_int_type_node,
+			   gfc_c_int_type_node);
+    gfor_fndecl_dgemm = gfc_build_library_function_decl
+			  (get_identifier
+			     (gfc_option.flag_underscoring ? "dgemm_"
+							   : "dgemm"),
+			   void_type_node, 15, pchar_type_node,
+			   pchar_type_node, pint, pint, pint, pd, pd, pint,
+			   pd, pint, pd, pd, pint, gfc_c_int_type_node,
+			   gfc_c_int_type_node);
+    gfor_fndecl_cgemm = gfc_build_library_function_decl
+			  (get_identifier
+			     (gfc_option.flag_underscoring ? "cgemm_"
+							   : "cgemm"),
+			   void_type_node, 15, pchar_type_node,
+			   pchar_type_node, pint, pint, pint, pc, pc, pint,
+			   pc, pint, pc, pc, pint, gfc_c_int_type_node,
+			   gfc_c_int_type_node);
+    gfor_fndecl_zgemm = gfc_build_library_function_decl
+			  (get_identifier
+			     (gfc_option.flag_underscoring ? "zgemm_"
+							   : "zgemm"),
+			   void_type_node, 15, pchar_type_node,
+			   pchar_type_node, pint, pint, pint, pz, pz, pint,
+			   pz, pint, pz, pz, pint, gfc_c_int_type_node,
+			   gfc_c_int_type_node);
+  }
+
   /* Other functions.  */
   gfor_fndecl_size0 =
     gfc_build_library_function_decl (get_identifier (PREFIX("size0")),
diff --git a/gcc/fortran/trans-expr.c b/gcc/fortran/trans-expr.c
index 3e7844ed4455..e5c9f2486bd6 100644
--- a/gcc/fortran/trans-expr.c
+++ b/gcc/fortran/trans-expr.c
@@ -1853,7 +1853,7 @@ is_aliased_array (gfc_expr * e)
 
 int
 gfc_conv_function_call (gfc_se * se, gfc_symbol * sym,
-			gfc_actual_arglist * arg)
+			gfc_actual_arglist * arg, tree append_args)
 {
   gfc_interface_mapping mapping;
   tree arglist;
@@ -2226,6 +2226,11 @@ gfc_conv_function_call (gfc_se * se, gfc_symbol * sym,
   /* Add the hidden string length parameters to the arguments.  */
   arglist = chainon (arglist, stringargs);
 
+  /* We may want to append extra arguments here.  This is used e.g. for
+     calls to libgfortran_matmul_??, which need extra information.  */
+  if (append_args != NULL_TREE)
+    arglist = chainon (arglist, append_args);
+
   /* Generate the actual call.  */
   gfc_conv_function_val (se, sym);
   /* If there are alternate return labels, function type should be
@@ -2545,7 +2550,7 @@ gfc_conv_function_expr (gfc_se * se, gfc_expr * expr)
   sym = expr->value.function.esym;
   if (!sym)
     sym = expr->symtree->n.sym;
-  gfc_conv_function_call (se, sym, expr->value.function.actual);
+  gfc_conv_function_call (se, sym, expr->value.function.actual, NULL_TREE);
 }
 
 
diff --git a/gcc/fortran/trans-intrinsic.c b/gcc/fortran/trans-intrinsic.c
index 53c61c696d91..7dbd60e80967 100644
--- a/gcc/fortran/trans-intrinsic.c
+++ b/gcc/fortran/trans-intrinsic.c
@@ -1378,6 +1378,7 @@ static void
 gfc_conv_intrinsic_funcall (gfc_se * se, gfc_expr * expr)
 {
   gfc_symbol *sym;
+  tree append_args;
 
   gcc_assert (!se->ss || se->ss->expr == expr);
 
@@ -1387,7 +1388,54 @@ gfc_conv_intrinsic_funcall (gfc_se * se, gfc_expr * expr)
     gcc_assert (expr->rank == 0);
 
   sym = gfc_get_symbol_for_expr (expr);
-  gfc_conv_function_call (se, sym, expr->value.function.actual);
+
+  /* Calls to libgfortran_matmul need to be appended special arguments,
+     to be able to call the BLAS ?gemm functions if required and possible.  */
+  append_args = NULL_TREE;
+  if (expr->value.function.isym->generic_id == GFC_ISYM_MATMUL
+      && sym->ts.type != BT_LOGICAL)
+    {
+      tree cint = gfc_get_int_type (gfc_c_int_kind);
+
+      if (gfc_option.flag_external_blas
+	  && (sym->ts.type == BT_REAL || sym->ts.type == BT_COMPLEX)
+	  && (sym->ts.kind == gfc_default_real_kind
+	      || sym->ts.kind == gfc_default_double_kind))
+	{
+	  tree gemm_fndecl;
+
+	  if (sym->ts.type == BT_REAL)
+	    {
+	      if (sym->ts.kind == gfc_default_real_kind)
+		gemm_fndecl = gfor_fndecl_sgemm;
+	      else
+		gemm_fndecl = gfor_fndecl_dgemm;
+	    }
+	  else
+	    {
+	      if (sym->ts.kind == gfc_default_real_kind)
+		gemm_fndecl = gfor_fndecl_cgemm;
+	      else
+		gemm_fndecl = gfor_fndecl_zgemm;
+	    }
+
+	  append_args = gfc_chainon_list (NULL_TREE, build_int_cst (cint, 1));
+	  append_args = gfc_chainon_list
+			  (append_args, build_int_cst
+					  (cint, gfc_option.blas_matmul_limit));
+	  append_args = gfc_chainon_list (append_args,
+					  gfc_build_addr_expr (NULL_TREE,
+							       gemm_fndecl));
+	}
+      else
+	{
+	  append_args = gfc_chainon_list (NULL_TREE, build_int_cst (cint, 0));
+	  append_args = gfc_chainon_list (append_args, build_int_cst (cint, 0));
+	  append_args = gfc_chainon_list (append_args, null_pointer_node);
+	}
+    }
+
+  gfc_conv_function_call (se, sym, expr->value.function.actual, append_args);
   gfc_free (sym);
 }
 
diff --git a/gcc/fortran/trans-stmt.c b/gcc/fortran/trans-stmt.c
index 08ba113cc075..03ff0fee92bd 100644
--- a/gcc/fortran/trans-stmt.c
+++ b/gcc/fortran/trans-stmt.c
@@ -334,7 +334,8 @@ gfc_trans_call (gfc_code * code, bool dependency_check)
 
       /* Translate the call.  */
       has_alternate_specifier
-	= gfc_conv_function_call (&se, code->resolved_sym, code->ext.actual);
+	= gfc_conv_function_call (&se, code->resolved_sym, code->ext.actual,
+				  NULL_TREE);
 
       /* A subroutine without side-effect, by definition, does nothing!  */
       TREE_SIDE_EFFECTS (se.expr) = 1;
@@ -399,7 +400,8 @@ gfc_trans_call (gfc_code * code, bool dependency_check)
       gfc_init_block (&block);
 
       /* Add the subroutine call to the block.  */
-      gfc_conv_function_call (&loopse, code->resolved_sym, code->ext.actual);
+      gfc_conv_function_call (&loopse, code->resolved_sym, code->ext.actual,
+			      NULL_TREE);
       gfc_add_expr_to_block (&loopse.pre, loopse.expr);
 
       gfc_add_block_to_block (&block, &loopse.pre);
diff --git a/gcc/fortran/trans.h b/gcc/fortran/trans.h
index 13c21aa25818..e8bb1d5d6aad 100644
--- a/gcc/fortran/trans.h
+++ b/gcc/fortran/trans.h
@@ -303,7 +303,8 @@ void gfc_conv_intrinsic_function (gfc_se *, gfc_expr *);
 int gfc_is_intrinsic_libcall (gfc_expr *);
 
 /* Also used to CALL subroutines.  */
-int gfc_conv_function_call (gfc_se *, gfc_symbol *, gfc_actual_arglist *);
+int gfc_conv_function_call (gfc_se *, gfc_symbol *, gfc_actual_arglist *,
+			    tree);
 /* gfc_trans_* shouldn't call push/poplevel, use gfc_push/pop_scope */
 
 /* Generate code for a scalar assignment.  */
@@ -507,6 +508,12 @@ extern GTY(()) tree gfor_fndecl_math_exponent8;
 extern GTY(()) tree gfor_fndecl_math_exponent10;
 extern GTY(()) tree gfor_fndecl_math_exponent16;
 
+/* BLAS functions.  */
+extern GTY(()) tree gfor_fndecl_sgemm;
+extern GTY(()) tree gfor_fndecl_dgemm;
+extern GTY(()) tree gfor_fndecl_cgemm;
+extern GTY(()) tree gfor_fndecl_zgemm;
+
 /* String functions.  */
 extern GTY(()) tree gfor_fndecl_compare_string;
 extern GTY(()) tree gfor_fndecl_concat_string;
diff --git a/libgfortran/ChangeLog b/libgfortran/ChangeLog
index a9e70825f802..063c62519d36 100644
--- a/libgfortran/ChangeLog
+++ b/libgfortran/ChangeLog
@@ -1,3 +1,19 @@
+2006-10-22  Francois-Xavier Coudert  <coudert@clipper.ens.fr>
+
+	PR fortran/26025
+	* m4/matmul.m4: Add possible call to gemm routine.
+	* generated/matmul_r8.c: Regenerate.
+	* generated/matmul_r16.c: Regenerate.
+	* generated/matmul_c8.c: Regenerate.
+	* generated/matmul_i8.c: Regenerate.
+	* generated/matmul_c16.c: Regenerate.
+	* generated/matmul_r10.c: Regenerate.
+	* generated/matmul_r4.c: Regenerate.
+	* generated/matmul_c10.c: Regenerate.
+	* generated/matmul_c4.c: Regenerate.
+	* generated/matmul_i4.c: Regenerate.
+	* generated/matmul_i16.c: Regenerate.
+
 2006-10-21  Steven G. Kargl  <kargl@gcc.gnu.org>
 
 	* runtime/error.c: Add errno.h
diff --git a/libgfortran/generated/matmul_c10.c b/libgfortran/generated/matmul_c10.c
index df2cd93c15ff..5e3b281245c5 100644
--- a/libgfortran/generated/matmul_c10.c
+++ b/libgfortran/generated/matmul_c10.c
@@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA.  */
 
 #if defined (HAVE_GFC_COMPLEX_10)
 
+/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
+   passed to us by the front-end, in which case we'll call it for large
+   matrices.  */
+
+typedef void (*blas_call)(const char *, const char *, const int *, const int *,
+                          const int *, const GFC_COMPLEX_10 *, const GFC_COMPLEX_10 *,
+                          const int *, const GFC_COMPLEX_10 *, const int *,
+                          const GFC_COMPLEX_10 *, GFC_COMPLEX_10 *, const int *,
+                          int, int);
+
 /* The order of loops is different in the case of plain matrix
    multiplication C=MATMUL(A,B), and in the frequent special case where
    the argument A is the temporary result of a TRANSPOSE intrinsic:
@@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA.  */
        DO I=1,M
          S = 0
          DO K=1,COUNT
-           S = S+A(I,K)+B(K,J)
+           S = S+A(I,K)*B(K,J)
          C(I,J) = S
    ENDIF
 */
 
+/* If try_blas is set to a nonzero value, then the matmul function will
+   see if there is a way to perform the matrix multiplication by a call
+   to the BLAS gemm function.  */
+
 extern void matmul_c10 (gfc_array_c10 * const restrict retarray, 
-	gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b);
+	gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm);
 export_proto(matmul_c10);
 
 void
 matmul_c10 (gfc_array_c10 * const restrict retarray, 
-	gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b)
+	gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
 {
   const GFC_COMPLEX_10 * restrict abase;
   const GFC_COMPLEX_10 * restrict bbase;
@@ -177,6 +193,31 @@ matmul_c10 (gfc_array_c10 * const restrict retarray,
   bbase = b->data;
   dest = retarray->data;
 
+
+  /* Now that everything is set up, we're performing the multiplication
+     itself.  */
+
+#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
+
+  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
+      && (bxstride == 1 || bystride == 1)
+      && (((float) xcount) * ((float) ycount) * ((float) count)
+          > POW3(blas_limit)))
+  {
+    const int m = xcount, n = ycount, k = count, ldc = rystride;
+    const GFC_COMPLEX_10 one = 1, zero = 0;
+    const int lda = (axstride == 1) ? aystride : axstride,
+              ldb = (bxstride == 1) ? bystride : bxstride;
+
+    if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
+      {
+        assert (gemm != NULL);
+        gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
+              &one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
+        return;
+      }
+  }
+
   if (rxstride == 1 && axstride == 1 && bxstride == 1)
     {
       const GFC_COMPLEX_10 * restrict bbase_y;
diff --git a/libgfortran/generated/matmul_c16.c b/libgfortran/generated/matmul_c16.c
index 6425eb8d49d9..f7301114b377 100644
--- a/libgfortran/generated/matmul_c16.c
+++ b/libgfortran/generated/matmul_c16.c
@@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA.  */
 
 #if defined (HAVE_GFC_COMPLEX_16)
 
+/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
+   passed to us by the front-end, in which case we'll call it for large
+   matrices.  */
+
+typedef void (*blas_call)(const char *, const char *, const int *, const int *,
+                          const int *, const GFC_COMPLEX_16 *, const GFC_COMPLEX_16 *,
+                          const int *, const GFC_COMPLEX_16 *, const int *,
+                          const GFC_COMPLEX_16 *, GFC_COMPLEX_16 *, const int *,
+                          int, int);
+
 /* The order of loops is different in the case of plain matrix
    multiplication C=MATMUL(A,B), and in the frequent special case where
    the argument A is the temporary result of a TRANSPOSE intrinsic:
@@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA.  */
        DO I=1,M
          S = 0
          DO K=1,COUNT
-           S = S+A(I,K)+B(K,J)
+           S = S+A(I,K)*B(K,J)
          C(I,J) = S
    ENDIF
 */
 
+/* If try_blas is set to a nonzero value, then the matmul function will
+   see if there is a way to perform the matrix multiplication by a call
+   to the BLAS gemm function.  */
+
 extern void matmul_c16 (gfc_array_c16 * const restrict retarray, 
-	gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b);
+	gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm);
 export_proto(matmul_c16);
 
 void
 matmul_c16 (gfc_array_c16 * const restrict retarray, 
-	gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b)
+	gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
 {
   const GFC_COMPLEX_16 * restrict abase;
   const GFC_COMPLEX_16 * restrict bbase;
@@ -177,6 +193,31 @@ matmul_c16 (gfc_array_c16 * const restrict retarray,
   bbase = b->data;
   dest = retarray->data;
 
+
+  /* Now that everything is set up, we're performing the multiplication
+     itself.  */
+
+#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
+
+  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
+      && (bxstride == 1 || bystride == 1)
+      && (((float) xcount) * ((float) ycount) * ((float) count)
+          > POW3(blas_limit)))
+  {
+    const int m = xcount, n = ycount, k = count, ldc = rystride;
+    const GFC_COMPLEX_16 one = 1, zero = 0;
+    const int lda = (axstride == 1) ? aystride : axstride,
+              ldb = (bxstride == 1) ? bystride : bxstride;
+
+    if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
+      {
+        assert (gemm != NULL);
+        gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
+              &one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
+        return;
+      }
+  }
+
   if (rxstride == 1 && axstride == 1 && bxstride == 1)
     {
       const GFC_COMPLEX_16 * restrict bbase_y;
diff --git a/libgfortran/generated/matmul_c4.c b/libgfortran/generated/matmul_c4.c
index 2d47a1349729..f2984ab48abd 100644
--- a/libgfortran/generated/matmul_c4.c
+++ b/libgfortran/generated/matmul_c4.c
@@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA.  */
 
 #if defined (HAVE_GFC_COMPLEX_4)
 
+/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
+   passed to us by the front-end, in which case we'll call it for large
+   matrices.  */
+
+typedef void (*blas_call)(const char *, const char *, const int *, const int *,
+                          const int *, const GFC_COMPLEX_4 *, const GFC_COMPLEX_4 *,
+                          const int *, const GFC_COMPLEX_4 *, const int *,
+                          const GFC_COMPLEX_4 *, GFC_COMPLEX_4 *, const int *,
+                          int, int);
+
 /* The order of loops is different in the case of plain matrix
    multiplication C=MATMUL(A,B), and in the frequent special case where
    the argument A is the temporary result of a TRANSPOSE intrinsic:
@@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA.  */
        DO I=1,M
          S = 0
          DO K=1,COUNT
-           S = S+A(I,K)+B(K,J)
+           S = S+A(I,K)*B(K,J)
          C(I,J) = S
    ENDIF
 */
 
+/* If try_blas is set to a nonzero value, then the matmul function will
+   see if there is a way to perform the matrix multiplication by a call
+   to the BLAS gemm function.  */
+
 extern void matmul_c4 (gfc_array_c4 * const restrict retarray, 
-	gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b);
+	gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm);
 export_proto(matmul_c4);
 
 void
 matmul_c4 (gfc_array_c4 * const restrict retarray, 
-	gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b)
+	gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
 {
   const GFC_COMPLEX_4 * restrict abase;
   const GFC_COMPLEX_4 * restrict bbase;
@@ -177,6 +193,31 @@ matmul_c4 (gfc_array_c4 * const restrict retarray,
   bbase = b->data;
   dest = retarray->data;
 
+
+  /* Now that everything is set up, we're performing the multiplication
+     itself.  */
+
+#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
+
+  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
+      && (bxstride == 1 || bystride == 1)
+      && (((float) xcount) * ((float) ycount) * ((float) count)
+          > POW3(blas_limit)))
+  {
+    const int m = xcount, n = ycount, k = count, ldc = rystride;
+    const GFC_COMPLEX_4 one = 1, zero = 0;
+    const int lda = (axstride == 1) ? aystride : axstride,
+              ldb = (bxstride == 1) ? bystride : bxstride;
+
+    if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
+      {
+        assert (gemm != NULL);
+        gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
+              &one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
+        return;
+      }
+  }
+
   if (rxstride == 1 && axstride == 1 && bxstride == 1)
     {
       const GFC_COMPLEX_4 * restrict bbase_y;
diff --git a/libgfortran/generated/matmul_c8.c b/libgfortran/generated/matmul_c8.c
index f22719df505a..65cc0a52c4b4 100644
--- a/libgfortran/generated/matmul_c8.c
+++ b/libgfortran/generated/matmul_c8.c
@@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA.  */
 
 #if defined (HAVE_GFC_COMPLEX_8)
 
+/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
+   passed to us by the front-end, in which case we'll call it for large
+   matrices.  */
+
+typedef void (*blas_call)(const char *, const char *, const int *, const int *,
+                          const int *, const GFC_COMPLEX_8 *, const GFC_COMPLEX_8 *,
+                          const int *, const GFC_COMPLEX_8 *, const int *,
+                          const GFC_COMPLEX_8 *, GFC_COMPLEX_8 *, const int *,
+                          int, int);
+
 /* The order of loops is different in the case of plain matrix
    multiplication C=MATMUL(A,B), and in the frequent special case where
    the argument A is the temporary result of a TRANSPOSE intrinsic:
@@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA.  */
        DO I=1,M
          S = 0
          DO K=1,COUNT
-           S = S+A(I,K)+B(K,J)
+           S = S+A(I,K)*B(K,J)
          C(I,J) = S
    ENDIF
 */
 
+/* If try_blas is set to a nonzero value, then the matmul function will
+   see if there is a way to perform the matrix multiplication by a call
+   to the BLAS gemm function.  */
+
 extern void matmul_c8 (gfc_array_c8 * const restrict retarray, 
-	gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b);
+	gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm);
 export_proto(matmul_c8);
 
 void
 matmul_c8 (gfc_array_c8 * const restrict retarray, 
-	gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b)
+	gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
 {
   const GFC_COMPLEX_8 * restrict abase;
   const GFC_COMPLEX_8 * restrict bbase;
@@ -177,6 +193,31 @@ matmul_c8 (gfc_array_c8 * const restrict retarray,
   bbase = b->data;
   dest = retarray->data;
 
+
+  /* Now that everything is set up, we're performing the multiplication
+     itself.  */
+
+#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
+
+  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
+      && (bxstride == 1 || bystride == 1)
+      && (((float) xcount) * ((float) ycount) * ((float) count)
+          > POW3(blas_limit)))
+  {
+    const int m = xcount, n = ycount, k = count, ldc = rystride;
+    const GFC_COMPLEX_8 one = 1, zero = 0;
+    const int lda = (axstride == 1) ? aystride : axstride,
+              ldb = (bxstride == 1) ? bystride : bxstride;
+
+    if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
+      {
+        assert (gemm != NULL);
+        gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
+              &one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
+        return;
+      }
+  }
+
   if (rxstride == 1 && axstride == 1 && bxstride == 1)
     {
       const GFC_COMPLEX_8 * restrict bbase_y;
diff --git a/libgfortran/generated/matmul_i16.c b/libgfortran/generated/matmul_i16.c
index 73c3fbc108d6..a193669d1081 100644
--- a/libgfortran/generated/matmul_i16.c
+++ b/libgfortran/generated/matmul_i16.c
@@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA.  */
 
 #if defined (HAVE_GFC_INTEGER_16)
 
+/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
+   passed to us by the front-end, in which case we'll call it for large
+   matrices.  */
+
+typedef void (*blas_call)(const char *, const char *, const int *, const int *,
+                          const int *, const GFC_INTEGER_16 *, const GFC_INTEGER_16 *,
+                          const int *, const GFC_INTEGER_16 *, const int *,
+                          const GFC_INTEGER_16 *, GFC_INTEGER_16 *, const int *,
+                          int, int);
+
 /* The order of loops is different in the case of plain matrix
    multiplication C=MATMUL(A,B), and in the frequent special case where
    the argument A is the temporary result of a TRANSPOSE intrinsic:
@@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA.  */
        DO I=1,M
          S = 0
          DO K=1,COUNT
-           S = S+A(I,K)+B(K,J)
+           S = S+A(I,K)*B(K,J)
          C(I,J) = S
    ENDIF
 */
 
+/* If try_blas is set to a nonzero value, then the matmul function will
+   see if there is a way to perform the matrix multiplication by a call
+   to the BLAS gemm function.  */
+
 extern void matmul_i16 (gfc_array_i16 * const restrict retarray, 
-	gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b);
+	gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm);
 export_proto(matmul_i16);
 
 void
 matmul_i16 (gfc_array_i16 * const restrict retarray, 
-	gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b)
+	gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
 {
   const GFC_INTEGER_16 * restrict abase;
   const GFC_INTEGER_16 * restrict bbase;
@@ -177,6 +193,31 @@ matmul_i16 (gfc_array_i16 * const restrict retarray,
   bbase = b->data;
   dest = retarray->data;
 
+
+  /* Now that everything is set up, we're performing the multiplication
+     itself.  */
+
+#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
+
+  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
+      && (bxstride == 1 || bystride == 1)
+      && (((float) xcount) * ((float) ycount) * ((float) count)
+          > POW3(blas_limit)))
+  {
+    const int m = xcount, n = ycount, k = count, ldc = rystride;
+    const GFC_INTEGER_16 one = 1, zero = 0;
+    const int lda = (axstride == 1) ? aystride : axstride,
+              ldb = (bxstride == 1) ? bystride : bxstride;
+
+    if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
+      {
+        assert (gemm != NULL);
+        gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
+              &one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
+        return;
+      }
+  }
+
   if (rxstride == 1 && axstride == 1 && bxstride == 1)
     {
       const GFC_INTEGER_16 * restrict bbase_y;
diff --git a/libgfortran/generated/matmul_i4.c b/libgfortran/generated/matmul_i4.c
index 63bca0152cdd..69b9b487a819 100644
--- a/libgfortran/generated/matmul_i4.c
+++ b/libgfortran/generated/matmul_i4.c
@@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA.  */
 
 #if defined (HAVE_GFC_INTEGER_4)
 
+/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
+   passed to us by the front-end, in which case we'll call it for large
+   matrices.  */
+
+typedef void (*blas_call)(const char *, const char *, const int *, const int *,
+                          const int *, const GFC_INTEGER_4 *, const GFC_INTEGER_4 *,
+                          const int *, const GFC_INTEGER_4 *, const int *,
+                          const GFC_INTEGER_4 *, GFC_INTEGER_4 *, const int *,
+                          int, int);
+
 /* The order of loops is different in the case of plain matrix
    multiplication C=MATMUL(A,B), and in the frequent special case where
    the argument A is the temporary result of a TRANSPOSE intrinsic:
@@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA.  */
        DO I=1,M
          S = 0
          DO K=1,COUNT
-           S = S+A(I,K)+B(K,J)
+           S = S+A(I,K)*B(K,J)
          C(I,J) = S
    ENDIF
 */
 
+/* If try_blas is set to a nonzero value, then the matmul function will
+   see if there is a way to perform the matrix multiplication by a call
+   to the BLAS gemm function.  */
+
 extern void matmul_i4 (gfc_array_i4 * const restrict retarray, 
-	gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b);
+	gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm);
 export_proto(matmul_i4);
 
 void
 matmul_i4 (gfc_array_i4 * const restrict retarray, 
-	gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b)
+	gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
 {
   const GFC_INTEGER_4 * restrict abase;
   const GFC_INTEGER_4 * restrict bbase;
@@ -177,6 +193,31 @@ matmul_i4 (gfc_array_i4 * const restrict retarray,
   bbase = b->data;
   dest = retarray->data;
 
+
+  /* Now that everything is set up, we're performing the multiplication
+     itself.  */
+
+#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
+
+  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
+      && (bxstride == 1 || bystride == 1)
+      && (((float) xcount) * ((float) ycount) * ((float) count)
+          > POW3(blas_limit)))
+  {
+    const int m = xcount, n = ycount, k = count, ldc = rystride;
+    const GFC_INTEGER_4 one = 1, zero = 0;
+    const int lda = (axstride == 1) ? aystride : axstride,
+              ldb = (bxstride == 1) ? bystride : bxstride;
+
+    if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
+      {
+        assert (gemm != NULL);
+        gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
+              &one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
+        return;
+      }
+  }
+
   if (rxstride == 1 && axstride == 1 && bxstride == 1)
     {
       const GFC_INTEGER_4 * restrict bbase_y;
diff --git a/libgfortran/generated/matmul_i8.c b/libgfortran/generated/matmul_i8.c
index caaf9e8f9763..23a87a904f7b 100644
--- a/libgfortran/generated/matmul_i8.c
+++ b/libgfortran/generated/matmul_i8.c
@@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA.  */
 
 #if defined (HAVE_GFC_INTEGER_8)
 
+/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
+   passed to us by the front-end, in which case we'll call it for large
+   matrices.  */
+
+typedef void (*blas_call)(const char *, const char *, const int *, const int *,
+                          const int *, const GFC_INTEGER_8 *, const GFC_INTEGER_8 *,
+                          const int *, const GFC_INTEGER_8 *, const int *,
+                          const GFC_INTEGER_8 *, GFC_INTEGER_8 *, const int *,
+                          int, int);
+
 /* The order of loops is different in the case of plain matrix
    multiplication C=MATMUL(A,B), and in the frequent special case where
    the argument A is the temporary result of a TRANSPOSE intrinsic:
@@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA.  */
        DO I=1,M
          S = 0
          DO K=1,COUNT
-           S = S+A(I,K)+B(K,J)
+           S = S+A(I,K)*B(K,J)
          C(I,J) = S
    ENDIF
 */
 
+/* If try_blas is set to a nonzero value, then the matmul function will
+   see if there is a way to perform the matrix multiplication by a call
+   to the BLAS gemm function.  */
+
 extern void matmul_i8 (gfc_array_i8 * const restrict retarray, 
-	gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b);
+	gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm);
 export_proto(matmul_i8);
 
 void
 matmul_i8 (gfc_array_i8 * const restrict retarray, 
-	gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b)
+	gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
 {
   const GFC_INTEGER_8 * restrict abase;
   const GFC_INTEGER_8 * restrict bbase;
@@ -177,6 +193,31 @@ matmul_i8 (gfc_array_i8 * const restrict retarray,
   bbase = b->data;
   dest = retarray->data;
 
+
+  /* Now that everything is set up, we're performing the multiplication
+     itself.  */
+
+#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
+
+  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
+      && (bxstride == 1 || bystride == 1)
+      && (((float) xcount) * ((float) ycount) * ((float) count)
+          > POW3(blas_limit)))
+  {
+    const int m = xcount, n = ycount, k = count, ldc = rystride;
+    const GFC_INTEGER_8 one = 1, zero = 0;
+    const int lda = (axstride == 1) ? aystride : axstride,
+              ldb = (bxstride == 1) ? bystride : bxstride;
+
+    if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
+      {
+        assert (gemm != NULL);
+        gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
+              &one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
+        return;
+      }
+  }
+
   if (rxstride == 1 && axstride == 1 && bxstride == 1)
     {
       const GFC_INTEGER_8 * restrict bbase_y;
diff --git a/libgfortran/generated/matmul_r10.c b/libgfortran/generated/matmul_r10.c
index 8fa1d6d9e497..e4dfd74ef03e 100644
--- a/libgfortran/generated/matmul_r10.c
+++ b/libgfortran/generated/matmul_r10.c
@@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA.  */
 
 #if defined (HAVE_GFC_REAL_10)
 
+/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
+   passed to us by the front-end, in which case we'll call it for large
+   matrices.  */
+
+typedef void (*blas_call)(const char *, const char *, const int *, const int *,
+                          const int *, const GFC_REAL_10 *, const GFC_REAL_10 *,
+                          const int *, const GFC_REAL_10 *, const int *,
+                          const GFC_REAL_10 *, GFC_REAL_10 *, const int *,
+                          int, int);
+
 /* The order of loops is different in the case of plain matrix
    multiplication C=MATMUL(A,B), and in the frequent special case where
    the argument A is the temporary result of a TRANSPOSE intrinsic:
@@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA.  */
        DO I=1,M
          S = 0
          DO K=1,COUNT
-           S = S+A(I,K)+B(K,J)
+           S = S+A(I,K)*B(K,J)
          C(I,J) = S
    ENDIF
 */
 
+/* If try_blas is set to a nonzero value, then the matmul function will
+   see if there is a way to perform the matrix multiplication by a call
+   to the BLAS gemm function.  */
+
 extern void matmul_r10 (gfc_array_r10 * const restrict retarray, 
-	gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b);
+	gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm);
 export_proto(matmul_r10);
 
 void
 matmul_r10 (gfc_array_r10 * const restrict retarray, 
-	gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b)
+	gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
 {
   const GFC_REAL_10 * restrict abase;
   const GFC_REAL_10 * restrict bbase;
@@ -177,6 +193,31 @@ matmul_r10 (gfc_array_r10 * const restrict retarray,
   bbase = b->data;
   dest = retarray->data;
 
+
+  /* Now that everything is set up, we're performing the multiplication
+     itself.  */
+
+#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
+
+  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
+      && (bxstride == 1 || bystride == 1)
+      && (((float) xcount) * ((float) ycount) * ((float) count)
+          > POW3(blas_limit)))
+  {
+    const int m = xcount, n = ycount, k = count, ldc = rystride;
+    const GFC_REAL_10 one = 1, zero = 0;
+    const int lda = (axstride == 1) ? aystride : axstride,
+              ldb = (bxstride == 1) ? bystride : bxstride;
+
+    if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
+      {
+        assert (gemm != NULL);
+        gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
+              &one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
+        return;
+      }
+  }
+
   if (rxstride == 1 && axstride == 1 && bxstride == 1)
     {
       const GFC_REAL_10 * restrict bbase_y;
diff --git a/libgfortran/generated/matmul_r16.c b/libgfortran/generated/matmul_r16.c
index 0f61b0381681..ec760f2d3d8c 100644
--- a/libgfortran/generated/matmul_r16.c
+++ b/libgfortran/generated/matmul_r16.c
@@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA.  */
 
 #if defined (HAVE_GFC_REAL_16)
 
+/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
+   passed to us by the front-end, in which case we'll call it for large
+   matrices.  */
+
+typedef void (*blas_call)(const char *, const char *, const int *, const int *,
+                          const int *, const GFC_REAL_16 *, const GFC_REAL_16 *,
+                          const int *, const GFC_REAL_16 *, const int *,
+                          const GFC_REAL_16 *, GFC_REAL_16 *, const int *,
+                          int, int);
+
 /* The order of loops is different in the case of plain matrix
    multiplication C=MATMUL(A,B), and in the frequent special case where
    the argument A is the temporary result of a TRANSPOSE intrinsic:
@@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA.  */
        DO I=1,M
          S = 0
          DO K=1,COUNT
-           S = S+A(I,K)+B(K,J)
+           S = S+A(I,K)*B(K,J)
          C(I,J) = S
    ENDIF
 */
 
+/* If try_blas is set to a nonzero value, then the matmul function will
+   see if there is a way to perform the matrix multiplication by a call
+   to the BLAS gemm function.  */
+
 extern void matmul_r16 (gfc_array_r16 * const restrict retarray, 
-	gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b);
+	gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm);
 export_proto(matmul_r16);
 
 void
 matmul_r16 (gfc_array_r16 * const restrict retarray, 
-	gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b)
+	gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
 {
   const GFC_REAL_16 * restrict abase;
   const GFC_REAL_16 * restrict bbase;
@@ -177,6 +193,31 @@ matmul_r16 (gfc_array_r16 * const restrict retarray,
   bbase = b->data;
   dest = retarray->data;
 
+
+  /* Now that everything is set up, we're performing the multiplication
+     itself.  */
+
+#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
+
+  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
+      && (bxstride == 1 || bystride == 1)
+      && (((float) xcount) * ((float) ycount) * ((float) count)
+          > POW3(blas_limit)))
+  {
+    const int m = xcount, n = ycount, k = count, ldc = rystride;
+    const GFC_REAL_16 one = 1, zero = 0;
+    const int lda = (axstride == 1) ? aystride : axstride,
+              ldb = (bxstride == 1) ? bystride : bxstride;
+
+    if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
+      {
+        assert (gemm != NULL);
+        gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
+              &one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
+        return;
+      }
+  }
+
   if (rxstride == 1 && axstride == 1 && bxstride == 1)
     {
       const GFC_REAL_16 * restrict bbase_y;
diff --git a/libgfortran/generated/matmul_r4.c b/libgfortran/generated/matmul_r4.c
index d684dd2905c3..cf2f45fb1256 100644
--- a/libgfortran/generated/matmul_r4.c
+++ b/libgfortran/generated/matmul_r4.c
@@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA.  */
 
 #if defined (HAVE_GFC_REAL_4)
 
+/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
+   passed to us by the front-end, in which case we'll call it for large
+   matrices.  */
+
+typedef void (*blas_call)(const char *, const char *, const int *, const int *,
+                          const int *, const GFC_REAL_4 *, const GFC_REAL_4 *,
+                          const int *, const GFC_REAL_4 *, const int *,
+                          const GFC_REAL_4 *, GFC_REAL_4 *, const int *,
+                          int, int);
+
 /* The order of loops is different in the case of plain matrix
    multiplication C=MATMUL(A,B), and in the frequent special case where
    the argument A is the temporary result of a TRANSPOSE intrinsic:
@@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA.  */
        DO I=1,M
          S = 0
          DO K=1,COUNT
-           S = S+A(I,K)+B(K,J)
+           S = S+A(I,K)*B(K,J)
          C(I,J) = S
    ENDIF
 */
 
+/* If try_blas is set to a nonzero value, then the matmul function will
+   see if there is a way to perform the matrix multiplication by a call
+   to the BLAS gemm function.  */
+
 extern void matmul_r4 (gfc_array_r4 * const restrict retarray, 
-	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b);
+	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm);
 export_proto(matmul_r4);
 
 void
 matmul_r4 (gfc_array_r4 * const restrict retarray, 
-	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b)
+	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
 {
   const GFC_REAL_4 * restrict abase;
   const GFC_REAL_4 * restrict bbase;
@@ -177,6 +193,31 @@ matmul_r4 (gfc_array_r4 * const restrict retarray,
   bbase = b->data;
   dest = retarray->data;
 
+
+  /* Now that everything is set up, we're performing the multiplication
+     itself.  */
+
+#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
+
+  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
+      && (bxstride == 1 || bystride == 1)
+      && (((float) xcount) * ((float) ycount) * ((float) count)
+          > POW3(blas_limit)))
+  {
+    const int m = xcount, n = ycount, k = count, ldc = rystride;
+    const GFC_REAL_4 one = 1, zero = 0;
+    const int lda = (axstride == 1) ? aystride : axstride,
+              ldb = (bxstride == 1) ? bystride : bxstride;
+
+    if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
+      {
+        assert (gemm != NULL);
+        gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
+              &one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
+        return;
+      }
+  }
+
   if (rxstride == 1 && axstride == 1 && bxstride == 1)
     {
       const GFC_REAL_4 * restrict bbase_y;
diff --git a/libgfortran/generated/matmul_r8.c b/libgfortran/generated/matmul_r8.c
index 41726bce2a58..c746f6c35198 100644
--- a/libgfortran/generated/matmul_r8.c
+++ b/libgfortran/generated/matmul_r8.c
@@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA.  */
 
 #if defined (HAVE_GFC_REAL_8)
 
+/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
+   passed to us by the front-end, in which case we'll call it for large
+   matrices.  */
+
+typedef void (*blas_call)(const char *, const char *, const int *, const int *,
+                          const int *, const GFC_REAL_8 *, const GFC_REAL_8 *,
+                          const int *, const GFC_REAL_8 *, const int *,
+                          const GFC_REAL_8 *, GFC_REAL_8 *, const int *,
+                          int, int);
+
 /* The order of loops is different in the case of plain matrix
    multiplication C=MATMUL(A,B), and in the frequent special case where
    the argument A is the temporary result of a TRANSPOSE intrinsic:
@@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA.  */
        DO I=1,M
          S = 0
          DO K=1,COUNT
-           S = S+A(I,K)+B(K,J)
+           S = S+A(I,K)*B(K,J)
          C(I,J) = S
    ENDIF
 */
 
+/* If try_blas is set to a nonzero value, then the matmul function will
+   see if there is a way to perform the matrix multiplication by a call
+   to the BLAS gemm function.  */
+
 extern void matmul_r8 (gfc_array_r8 * const restrict retarray, 
-	gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b);
+	gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm);
 export_proto(matmul_r8);
 
 void
 matmul_r8 (gfc_array_r8 * const restrict retarray, 
-	gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b)
+	gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
 {
   const GFC_REAL_8 * restrict abase;
   const GFC_REAL_8 * restrict bbase;
@@ -177,6 +193,31 @@ matmul_r8 (gfc_array_r8 * const restrict retarray,
   bbase = b->data;
   dest = retarray->data;
 
+
+  /* Now that everything is set up, we're performing the multiplication
+     itself.  */
+
+#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
+
+  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
+      && (bxstride == 1 || bystride == 1)
+      && (((float) xcount) * ((float) ycount) * ((float) count)
+          > POW3(blas_limit)))
+  {
+    const int m = xcount, n = ycount, k = count, ldc = rystride;
+    const GFC_REAL_8 one = 1, zero = 0;
+    const int lda = (axstride == 1) ? aystride : axstride,
+              ldb = (bxstride == 1) ? bystride : bxstride;
+
+    if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
+      {
+        assert (gemm != NULL);
+        gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
+              &one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
+        return;
+      }
+  }
+
   if (rxstride == 1 && axstride == 1 && bxstride == 1)
     {
       const GFC_REAL_8 * restrict bbase_y;
diff --git a/libgfortran/m4/matmul.m4 b/libgfortran/m4/matmul.m4
index 3678c639f2a3..ef2f0fb88dc3 100644
--- a/libgfortran/m4/matmul.m4
+++ b/libgfortran/m4/matmul.m4
@@ -37,6 +37,16 @@ include(iparm.m4)dnl
 
 `#if defined (HAVE_'rtype_name`)'
 
+/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
+   passed to us by the front-end, in which case we'll call it for large
+   matrices.  */
+
+typedef void (*blas_call)(const char *, const char *, const int *, const int *,
+                          const int *, const rtype_name *, const rtype_name *,
+                          const int *, const rtype_name *, const int *,
+                          const rtype_name *, rtype_name *, const int *,
+                          int, int);
+
 /* The order of loops is different in the case of plain matrix
    multiplication C=MATMUL(A,B), and in the frequent special case where
    the argument A is the temporary result of a TRANSPOSE intrinsic:
@@ -57,18 +67,24 @@ include(iparm.m4)dnl
        DO I=1,M
          S = 0
          DO K=1,COUNT
-           S = S+A(I,K)+B(K,J)
+           S = S+A(I,K)*B(K,J)
          C(I,J) = S
    ENDIF
 */
 
+/* If try_blas is set to a nonzero value, then the matmul function will
+   see if there is a way to perform the matrix multiplication by a call
+   to the BLAS gemm function.  */
+
 extern void matmul_`'rtype_code (rtype * const restrict retarray, 
-	rtype * const restrict a, rtype * const restrict b);
+	rtype * const restrict a, rtype * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm);
 export_proto(matmul_`'rtype_code);
 
 void
 matmul_`'rtype_code (rtype * const restrict retarray, 
-	rtype * const restrict a, rtype * const restrict b)
+	rtype * const restrict a, rtype * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
 {
   const rtype_name * restrict abase;
   const rtype_name * restrict bbase;
@@ -179,6 +195,31 @@ sinclude(`matmul_asm_'rtype_code`.m4')dnl
   bbase = b->data;
   dest = retarray->data;
 
+
+  /* Now that everything is set up, we're performing the multiplication
+     itself.  */
+
+#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
+
+  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
+      && (bxstride == 1 || bystride == 1)
+      && (((float) xcount) * ((float) ycount) * ((float) count)
+          > POW3(blas_limit)))
+  {
+    const int m = xcount, n = ycount, k = count, ldc = rystride;
+    const rtype_name one = 1, zero = 0;
+    const int lda = (axstride == 1) ? aystride : axstride,
+              ldb = (bxstride == 1) ? bystride : bxstride;
+
+    if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
+      {
+        assert (gemm != NULL);
+        gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
+              &one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
+        return;
+      }
+  }
+
   if (rxstride == 1 && axstride == 1 && bxstride == 1)
     {
       const rtype_name * restrict bbase_y;
-- 
GitLab