From 0dd3b8532f35486bd5db2c71342c8dfed4c0893a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=83=91=E6=99=A8=E5=8D=89?= <zhengchenhui1@huawei.com>
Date: Thu, 25 Jul 2024 17:25:23 +0800
Subject: [PATCH] Enable Transposed SLP.

---
 gcc/common.opt                          |    4 +
 gcc/testsuite/gcc.dg/vect/transpose-1.c |   53 +
 gcc/testsuite/gcc.dg/vect/transpose-2.c |   50 +
 gcc/testsuite/gcc.dg/vect/transpose-3.c |   54 +
 gcc/testsuite/gcc.dg/vect/transpose-4.c |   53 +
 gcc/testsuite/gcc.dg/vect/transpose-5.c |   74 ++
 gcc/testsuite/gcc.dg/vect/transpose-6.c |   67 +
 gcc/testsuite/gcc.dg/vect/transpose-7.c |   53 +
 gcc/testsuite/gcc.dg/vect/transpose-8.c |   53 +
 gcc/testsuite/gcc.dg/vect/vect.exp      |    7 +
 gcc/tree-loop-distribution.cc           | 1464 ++++++++++++++++++++-
 gcc/tree-vect-data-refs.cc              |  237 ++++
 gcc/tree-vect-loop.cc                   |   42 +-
 gcc/tree-vect-patterns.cc               |    4 +-
 gcc/tree-vect-slp.cc                    | 1553 ++++++++++++++++++++---
 gcc/tree-vect-stmts.cc                  |  973 +++++++++++++-
 gcc/tree-vectorizer.h                   |   96 +-
 17 files changed, 4648 insertions(+), 189 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-1.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-2.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-3.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-4.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-5.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-6.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-7.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-8.c

diff --git a/gcc/common.opt b/gcc/common.opt
index b18f0b944..5958c4e0b 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -3221,6 +3221,10 @@ ftree-slp-vectorize
 Common Var(flag_tree_slp_vectorize) Optimization EnabledBy(ftree-vectorize)
 Enable basic block vectorization (SLP) on trees.
 
+ftree-slp-transpose-vectorize
+Common Var(flag_tree_slp_transpose_vectorize) Optimization Init(0)
+Enable basic block vectorization (SLP) for transposed stores and loads on trees.
+
 fvect-cost-model=
 Common Joined RejectNegative Enum(vect_cost_model) Var(flag_vect_cost_model) Init(VECT_COST_MODEL_DEFAULT) Optimization
 -fvect-cost-model=[unlimited|dynamic|cheap|very-cheap] Specifies the cost model for vectorization.
diff --git a/gcc/testsuite/gcc.dg/vect/transpose-1.c b/gcc/testsuite/gcc.dg/vect/transpose-1.c
new file mode 100644
index 000000000..8237a8b9e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/transpose-1.c
@@ -0,0 +1,53 @@
+/* { dg-do compile { target { aarch64*-*-linux* } } } */
+/* { dg-require-effective-target vect_int } */
+#include <stdio.h>
+#include <stdlib.h>
+#include "tree-vect.h"
+
+#define N 4
+#define M 256
+
+int foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2)
+{
+  int i = 0;
+  int sum = 0;
+  unsigned c0[N], c1[N], c2[N], c3[N], c4[N], c5[N], c6[N], c7[N];
+  for (i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
+    {
+      c0[i] = pix1[0] - pix2[0];
+      c1[i] = pix1[1] - pix2[1];
+      c2[i] = pix1[2] - pix2[2];
+      c3[i] = pix1[3] - pix2[3];
+      c4[i] = pix1[4] - pix2[4];
+      c5[i] = pix1[5] - pix2[5];
+      c6[i] = pix1[6] - pix2[6];
+      c7[i] = pix1[7] - pix2[7];
+    }
+  for (int i = 0; i < N; i++)
+    {
+      sum += c0[i] + c1[i] + c2[i] + c3[i] + c4[i] + c5[i] + c6[i] + c7[i];
+    }
+  return sum;
+}
+
+int main (int argc, const char* argv[])
+{
+  unsigned char input1[M];
+  unsigned char input2[M];
+  int i1 = 16;
+  int i2 = 8;
+  check_vect ();
+  for (int i = 0; i < M; i++)
+    {
+      input1[i] = i * 2;
+      input2[i] = i;
+    }
+  int sum = foo (input1, i1, input2, i2);
+  if (sum != 1264)
+    {
+      abort ();
+    }
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/transpose-2.c b/gcc/testsuite/gcc.dg/vect/transpose-2.c
new file mode 100644
index 000000000..fdf4dbd96
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/transpose-2.c
@@ -0,0 +1,50 @@
+/* { dg-do compile { target { aarch64*-*-linux* } } } */
+/* { dg-additional-options "-fno-tree-loop-vectorize -fno-tree-dse" } */
+/* { dg-require-effective-target vect_int } */
+#include <stdio.h>
+#include <stdlib.h>
+#include "tree-vect.h"
+
+#define N 8
+#define M 256
+
+int foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2)
+{
+  int i = 0;
+  int sum = 0;
+  unsigned short c0[N], c1[N], c2[N], c3[N], c4[N], c5[N], c6[N], c7[N];
+  for (i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
+    {
+      c0[i] = pix1[0] - pix2[0];
+      c1[i] = pix1[1] - pix2[1];
+      c2[i] = pix1[2] - pix2[2];
+      c3[i] = pix1[3] - pix2[3];
+    }
+  for (int i = 0; i < N; i++)
+    {
+      sum += c0[i] + c1[i] + c2[i] + c3[i];
+    }
+  return sum;
+}
+
+int main (int argc, const char* argv[])
+{
+  unsigned char input1[M];
+  unsigned char input2[M];
+  int i1 = 5;
+  int i2 = 4;
+  check_vect ();
+  for (int i = 0; i < M; i++)
+    {
+      input1[i] = i * 4;
+      input2[i] = i * 2;
+    }
+  int sum = foo (input1, i1, input2, i2);
+  if (sum != 1440)
+    {
+      abort ();
+    }
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/transpose-3.c b/gcc/testsuite/gcc.dg/vect/transpose-3.c
new file mode 100644
index 000000000..e492e3717
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/transpose-3.c
@@ -0,0 +1,54 @@
+/* { dg-do compile { target { aarch64*-*-linux* } } } */
+/* { dg-additional-options "-fno-tree-loop-vectorize -fno-tree-dse -fno-tree-fre" } */
+/* { dg-require-effective-target vect_int } */
+#include <stdio.h>
+#include <stdlib.h>
+#include "tree-vect.h"
+
+#define N 4
+#define M 256
+
+int foo (unsigned short *pix1, int i_pix1, unsigned short *pix2, int i_pix2)
+{
+  int i = 0;
+  int sum = 0;
+  unsigned c0[N], c1[N], c2[N], c3[N], c4[N], c5[N], c6[N], c7[N];
+  for (i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
+    {
+      c0[i] = pix1[0] - pix2[0];
+      c1[i] = pix1[1] - pix2[1];
+      c2[i] = pix1[2] - pix2[2];
+      c3[i] = pix1[3] - pix2[3];
+      c4[i] = pix1[4] - pix2[4];
+      c5[i] = pix1[5] - pix2[5];
+      c6[i] = pix1[6] - pix2[6];
+      c7[i] = pix1[7] - pix2[7];
+    }
+  for (int i = 0; i < N; i++)
+    {
+      sum += c0[i] + c1[i] + c2[i] + c3[i] + c4[i] + c5[i] + c6[i] + c7[i];
+    }
+  return sum;
+}
+
+int main (int argc, const char* argv[])
+{
+  unsigned short input1[M];
+  unsigned short input2[M];
+  int i1 = 8;
+  int i2 = 4;
+  check_vect ();
+  for (int i = 0; i < M; i++)
+    {
+      input1[i] = i * 4;
+      input2[i] = i;
+    }
+  int sum = foo (input1, i1, input2, i2);
+  if (sum != 1680)
+    {
+      abort ();
+    }
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/transpose-4.c b/gcc/testsuite/gcc.dg/vect/transpose-4.c
new file mode 100644
index 000000000..0b4adea9b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/transpose-4.c
@@ -0,0 +1,53 @@
+/* { dg-do compile { target { aarch64*-*-linux* } } } */
+/* { dg-require-effective-target vect_int } */
+#include <stdio.h>
+#include <stdlib.h>
+#include "tree-vect.h"
+
+#define N 4
+#define M 256
+
+int foo (unsigned *pix1, int i_pix1, unsigned *pix2, int i_pix2)
+{
+  int i = 0;
+  int sum = 0;
+  unsigned c0[N], c1[N], c2[N], c3[N], c4[N], c5[N], c6[N], c7[N];
+  for (i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
+    {
+      c0[i] = pix1[0] - pix2[0];
+      c1[i] = pix1[1] - pix2[1];
+      c2[i] = pix1[2] - pix2[2];
+      c3[i] = pix1[3] - pix2[3];
+      c4[i] = pix1[4] - pix2[4];
+      c5[i] = pix1[5] - pix2[5];
+      c6[i] = pix1[6] - pix2[6];
+      c7[i] = pix1[7] - pix2[7];
+    }
+  for (int i = 0; i < N; i++)
+    {
+      sum += c0[i] + c1[i] + c2[i] + c3[i] + c4[i] + c5[i] + c6[i] + c7[i];
+    }
+  return sum;
+}
+
+int main (int argc, const char* argv[])
+{
+  unsigned input1[M];
+  unsigned input2[M];
+  int i1 = 12;
+  int i2 = 6;
+  check_vect ();
+  for (int i = 0; i < M; i++)
+    {
+      input1[i] = i * 7;
+      input2[i] = i * 3;
+    }
+  int sum = foo (input1, i1, input2, i2);
+  if (sum != 3616)
+    {
+      abort ();
+    }
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/transpose-5.c b/gcc/testsuite/gcc.dg/vect/transpose-5.c
new file mode 100644
index 000000000..040dedf1b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/transpose-5.c
@@ -0,0 +1,74 @@
+/* { dg-do compile { target { aarch64*-*-linux* } } } */
+/* { dg-additional-options "-fno-tree-dse -fno-tree-fre" } */
+/* { dg-require-effective-target vect_int } */
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include "tree-vect.h"
+
+#define N 4
+#define M 256
+#define eps 1e-8
+
+double foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2)
+{
+  unsigned a0[N];
+  unsigned a1[N];
+  unsigned a2[N];
+  unsigned a3[N];
+
+  int b0[N];
+  int b1[N];
+  int b2[N];
+  int b3[N];
+
+  for (int i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
+    {
+      a0[i] = (pix1[0] - pix2[0]) + ((pix1[4] + pix2[4]) << 16);
+      a1[i] = (pix1[1] - pix2[1]) + ((pix1[5] + pix2[5]) << 16);
+      a2[i] = (pix1[2] - pix2[2]) + ((pix1[6] + pix2[6]) << 16);
+      a3[i] = (pix1[3] - pix2[3]) + ((pix1[7] + pix2[7]) << 16);
+    }
+
+  for (int i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
+    {
+      b0[i] = (pix1[0] - pix2[0]) + (pix1[4] + pix2[4]);
+      b1[i] = (pix1[1] - pix2[1]) + (pix1[5] + pix2[5]);
+      b2[i] = (pix1[2] - pix2[2]) + (pix1[6] + pix2[6]);
+      b3[i] = (pix1[3] - pix2[3]) + (pix1[7] + pix2[7]);
+    }
+
+  double sum = 0;
+  for (int i = 0; i < N; i++)
+    {
+      sum += a0[i] + a1[i] + a2[i] + a3[i] + b0[i] + b1[i] + b2[i] + b3[i];
+    }
+  return sum;
+}
+
+int main (int argc, const char* argv[])
+{
+  unsigned char input1[M];
+  unsigned char input2[M];
+  int i1 = 8;
+  int i2 = 3;
+  unsigned char m = 2;
+  unsigned short n = 12;
+  float t = 3.0;
+  double k = 4.2;
+  check_vect ();
+  for (int i = 0; i < M; i++)
+    {
+      input1[i] = i * 6;
+      input2[i] = i * 3;
+    }
+  double sum = foo (input1, i1, input2, i2);
+  if (fabs (sum - 78648144) > eps)
+    {
+      abort ();
+    }
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */
+/* { dg-final { scan-tree-dump-times "vectorizable_store for slp transpose" 2 "slp1" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/transpose-6.c b/gcc/testsuite/gcc.dg/vect/transpose-6.c
new file mode 100644
index 000000000..3e134ac02
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/transpose-6.c
@@ -0,0 +1,67 @@
+/* { dg-do compile { target { aarch64*-*-linux* } } } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target vect_float } */
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include "tree-vect.h"
+
+#define N 4
+#define M 256
+#define eps 1e-8
+
+float foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2)
+{
+  unsigned a0[N];
+  unsigned a1[N];
+  unsigned a2[N];
+  unsigned a3[N];
+
+  float c0[N];
+  float c1[N];
+  float c2[N];
+  float c3[N];
+
+  for (int i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
+    {
+      a0[i] = (pix1[0] - pix2[0]) + ((pix1[4] - pix2[4]) << 16);
+      a1[i] = (pix1[1] - pix2[1]) + ((pix1[5] - pix2[5]) << 16);
+      a2[i] = (pix1[2] - pix2[2]) + ((pix1[6] - pix2[6]) << 16);
+      a3[i] = (pix1[3] - pix2[3]) + ((pix1[7] - pix2[7]) << 16);
+
+      c0[i] = (pix1[0] * pix2[0]) + (pix1[4] * pix2[4]);
+      c1[i] = (pix1[1] * pix2[1]) + (pix1[5] * pix2[5]);
+      c2[i] = (pix1[2] * pix2[2]) + (pix1[6] * pix2[6]);
+      c3[i] = (pix1[3] * pix2[3]) + (pix1[7] * pix2[7]);
+    }
+
+  float sum = 0;
+  for (int i = 0; i < N; i++)
+    {
+      sum += a0[i] + a1[i] + a2[i] + a3[i] + c0[i] + c1[i] + c2[i] + c3[i];
+    }
+  return sum;
+}
+
+int main (int argc, const char* argv[])
+{
+  unsigned char input1[M];
+  unsigned char input2[M];
+  int i1 = 18;
+  int i2 = 6;
+  check_vect ();
+  for (int i = 0; i < M; i++)
+    {
+      input1[i] = i * 4;
+      input2[i] = i * 2;
+    }
+  float sum = foo (input1, i1, input2, i2);
+  if (fabs (sum - 106041168) > eps)
+    {
+      abort ();
+    }
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */
+/* { dg-final { scan-tree-dump-times "vectorizable_store for slp transpose" 2 "slp1" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/transpose-7.c b/gcc/testsuite/gcc.dg/vect/transpose-7.c
new file mode 100644
index 000000000..8ba1b1b6d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/transpose-7.c
@@ -0,0 +1,53 @@
+/* { dg-do compile { target { aarch64*-*-linux* } } } */
+/* { dg-additional-options "-fno-tree-loop-vectorize -fno-tree-dse" } */
+/* { dg-require-effective-target vect_int } */
+#include <stdio.h>
+#include <stdlib.h>
+#include "tree-vect.h"
+
+#define N 16
+#define M 256
+
+int foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2)
+{
+  int i = 0;
+  int sum = 0;
+  unsigned char c0[N], c1[N];
+  for (int i = 0; i < N/2; i++, pix1 += i_pix1, pix2 += i_pix2)
+    {
+      c0[i] = pix1[0] - pix2[0];
+      c1[i] = pix1[1] - pix2[1];
+    }
+  for (int i = N/2; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
+    {
+      c0[i] = pix1[0] - pix2[0];
+      c1[i] = pix1[1] - pix2[1];
+    }
+  for (int i = 0; i < N; i++)
+    {
+      sum += c0[i] + c1[i];
+    }
+  return sum;
+}
+
+int main (int argc, const char* argv[])
+{
+  unsigned char input1[M];
+  unsigned char input2[M];
+  int i1 = 6;
+  int i2 = 4;
+  check_vect ();
+  for (int i = 0; i < M; i++)
+    {
+      input1[i] = i * 5;
+      input2[i] = i * 2;
+    }
+  int sum = foo (input1, i1, input2, i2);
+  if (sum != 3280)
+    {
+      abort ();
+    }
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/transpose-8.c b/gcc/testsuite/gcc.dg/vect/transpose-8.c
new file mode 100644
index 000000000..a154f012a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/transpose-8.c
@@ -0,0 +1,53 @@
+/* { dg-do compile { target { aarch64*-*-linux* } } } */
+/* { dg-additional-options "-fno-tree-loop-vectorize" } */
+/* { dg-require-effective-target vect_int } */
+#include <stdio.h>
+#include <stdlib.h>
+#include "tree-vect.h"
+
+#define N 32
+#define M 256
+
+int foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2)
+{
+  int i = 0;
+  int sum = 0;
+  unsigned char c0[N], c1[N];
+  for (int i = 0; i < N/2; i++, pix1 += i_pix1, pix2 += i_pix2)
+    {
+      c0[i] = pix1[0] - pix2[0];
+      c1[i] = pix1[1] - pix2[1];
+    }
+  for (int i = N/2; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
+    {
+      c0[i] = pix1[0] - pix2[0];
+      c1[i] = pix1[1] - pix2[1];
+    }
+  for (int i = 0; i < N; i++)
+    {
+      sum += c0[i] + c1[i];
+    }
+  return sum;
+}
+
+int main (int argc, const char* argv[])
+{
+  unsigned char input1[M];
+  unsigned char input2[M];
+  int i1 = 6;
+  int i2 = 4;
+  check_vect ();
+  for (int i = 0; i < M; i++)
+    {
+      input1[i] = i * 5;
+      input2[i] = i * 2;
+    }
+  int sum = foo (input1, i1, input2, i2);
+  if (sum != 7584)
+    {
+      abort ();
+    }
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect.exp b/gcc/testsuite/gcc.dg/vect/vect.exp
index dcaef1e0a..ae5212411 100644
--- a/gcc/testsuite/gcc.dg/vect/vect.exp
+++ b/gcc/testsuite/gcc.dg/vect/vect.exp
@@ -117,6 +117,13 @@ et-dg-runtest dg-runtest [lsort \
     [glob -nocomplain $srcdir/$subdir/no-vfa-*.\[cS\]]] \
     "" $DEFAULT_VECTCFLAGS
 
+# -ftree-slp-transpose-vectorize SLP tests
+set VECT_SLP_CFLAGS $SAVED_VECT_SLP_CFLAGS
+lappend VECT_SLP_CFLAGS "-ftree-slp-transpose-vectorize"
+et-dg-runtest dg-runtest [lsort \
+    [glob -nocomplain $srcdir/$subdir/transpose-*.\[cS\]]] \
+    "" "-ftree-slp-transpose-vectorize -fdump-tree-slp-details -O3"
+
 # -ffast-math tests
 set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
 lappend DEFAULT_VECTCFLAGS "-ffast-math"
diff --git a/gcc/tree-loop-distribution.cc b/gcc/tree-loop-distribution.cc
|
|
index 606eb05e6..8d118e987 100644
|
|
--- a/gcc/tree-loop-distribution.cc
|
|
+++ b/gcc/tree-loop-distribution.cc
|
|
@@ -36,6 +36,47 @@ along with GCC; see the file COPYING3. If not see
|
|
| D(I) = A(I-1)*E
|
|
|ENDDO
|
|
|
|
+ If an unvectorizable loop has grouped loads, and calculations from grouped
|
|
+ loads are isomorphic, build temp arrays using stmts where isomorphic
|
|
+ calculations end. After distribution, the partition built from temp
|
|
+ arrays can be vectorized in pass SLP after loop unrolling. For example,
|
|
+
|
|
+ |DO I = 1, N
|
|
+ | A = FOO (ARG_1);
|
|
+ | B = FOO (ARG_2);
|
|
+ | C = BAR_0 (A);
|
|
+ | D = BAR_1 (B);
|
|
+ |ENDDO
|
|
+
|
|
+ is transformed to
|
|
+
|
|
+ |DO I = 1, N
|
|
+ | J = FOO (ARG_1);
|
|
+ | K = FOO (ARG_2);
|
|
+ | X[I] = J;
|
|
+ | Y[I] = K;
|
|
+ | A = X[I];
|
|
+ | B = Y[I];
|
|
+ | C = BAR_0 (A);
|
|
+ | D = BAR_1 (B);
|
|
+ |ENDDO
|
|
+
|
|
+ and is then distributed to
|
|
+
|
|
+ |DO I = 1, N
|
|
+ | J = FOO (ARG_1);
|
|
+ | K = FOO (ARG_2);
|
|
+ | X[I] = J;
|
|
+ | Y[I] = K;
|
|
+ |ENDDO
|
|
+
|
|
+ |DO I = 1, N
|
|
+ | A = X[I];
|
|
+ | B = Y[I];
|
|
+ | C = BAR_0 (A);
|
|
+ | D = BAR_1 (B);
|
|
+ |ENDDO
|
|
+
|
|
Loop distribution is the dual of loop fusion. It separates statements
|
|
of a loop (or loop nest) into multiple loops (or loop nests) with the
|
|
same loop header. The major goal is to separate statements which may
|
|
@@ -44,7 +85,9 @@ along with GCC; see the file COPYING3. If not see
|
|
|
|
1) Seed partitions with specific type statements. For now we support
|
|
two types seed statements: statement defining variable used outside
|
|
- of loop; statement storing to memory.
|
|
+ of loop; statement storing to memory. Moreover, for unvectorizable
|
|
+ loops, we try to find isomorphic stmts from grouped loads and build
|
|
+ temp arrays as new seed statements.
|
|
2) Build reduced dependence graph (RDG) for loop to be distributed.
|
|
The vertices (RDG:V) model all statements in the loop and the edges
|
|
(RDG:E) model flow and control dependencies between statements.
|
|
@@ -90,6 +133,8 @@ along with GCC; see the file COPYING3. If not see
|
|
data reuse. */
|
|
|
|
#include "config.h"
|
|
+#define INCLUDE_MAP
|
|
+#define INCLUDE_ALGORITHM
|
|
#include "system.h"
|
|
#include "coretypes.h"
|
|
#include "backend.h"
|
|
@@ -115,6 +160,7 @@ along with GCC; see the file COPYING3. If not see
|
|
#include "tree-vectorizer.h"
|
|
#include "tree-eh.h"
|
|
#include "gimple-fold.h"
|
|
+#include "optabs-tree.h"
|
|
#include "tree-affine.h"
|
|
#include "intl.h"
|
|
#include "rtl.h"
|
|
@@ -188,6 +234,52 @@ struct rdg_vertex
|
|
#define RDG_MEM_WRITE_STMT(RDG, I) RDGV_HAS_MEM_WRITE (&(RDG->vertices[I]))
|
|
#define RDG_MEM_READS_STMT(RDG, I) RDGV_HAS_MEM_READS (&(RDG->vertices[I]))
|
|
|
|
+/* Results of isomorphic group analysis. */
|
|
+#define UNINITIALIZED (0)
|
|
+#define ISOMORPHIC (1)
|
|
+#define HETEROGENEOUS (1 << 1)
|
|
+#define UNCERTAIN (1 << 2)
|
|
+
|
|
+/* Information of a stmt while analyzing isomorphic use in group. */
|
|
+
|
|
+typedef struct _group_info
|
|
+{
|
|
+ gimple *stmt;
|
|
+
|
|
+ /* True if stmt can be a cut point. */
|
|
+ bool cut_point;
|
|
+
|
|
+ /* For use_stmt with two rhses, one of which is the lhs of stmt.
|
|
+ If the other is unknown to be isomorphic, mark it uncertain. */
|
|
+ bool uncertain;
|
|
+
|
|
+ /* True if the search for isomorphic stmts reached heterogeneous groups or
+ MEM stmts. */
|
|
+ bool done;
|
|
+
|
|
+ _group_info ()
|
|
+ {
|
|
+ stmt = NULL;
|
|
+ cut_point = false;
|
|
+ uncertain = false;
|
|
+ done = false;
|
|
+ }
|
|
+} *group_info;
|
|
+
|
|
+/* PAIR of cut points and corresponding profit. */
|
|
+typedef std::pair<vec<gimple *> *, int> stmts_profit;
|
|
+
|
|
+/* MAP of vector factor VF and corresponding stmts_profit PAIR. */
|
|
+typedef std::map<unsigned, stmts_profit> vf_stmts_profit_map;
|
|
+
|
|
+/* PAIR of group_num and iteration_num. We consider rhses from the same
|
|
+ group and iteration to be isomorphic. */
|
|
+typedef std::pair<unsigned, unsigned> group_iteration;
|
|
+
|
|
+/* An isomorphic stmt is determined by lhs of use_stmt, group_num and
|
|
+ the iteration_num when we insert this stmt to this map. */
|
|
+typedef std::map<tree, group_iteration> isomer_stmt_lhs;
|
|
+
|
|
/* Data dependence type. */
|
|
|
|
enum rdg_dep_type
|
|
@@ -600,13 +692,14 @@ class loop_distribution
|
|
/* Returns true when PARTITION1 and PARTITION2 access the same memory
|
|
object in RDG. */
|
|
bool share_memory_accesses (struct graph *rdg,
|
|
- partition *partition1, partition *partition2);
|
|
+ partition *partition1, partition *partition2,
|
|
+ hash_set<tree> *excluded_arrays);
|
|
|
|
/* For each seed statement in STARTING_STMTS, this function builds
|
|
partition for it by adding depended statements according to RDG.
|
|
All partitions are recorded in PARTITIONS. */
|
|
void rdg_build_partitions (struct graph *rdg,
|
|
- vec<gimple *> starting_stmts,
|
|
+ vec<gimple *> *starting_stmts,
|
|
vec<partition *> *partitions);
|
|
|
|
/* Compute partition dependence created by the data references in DRS1
|
|
@@ -643,15 +736,50 @@ class loop_distribution
|
|
|
|
/* Fuse PARTITIONS of LOOP if necessary before finalizing distribution.
|
|
ALIAS_DDRS contains ddrs which need runtime alias check. */
|
|
- void finalize_partitions (class loop *loop, vec<struct partition *>
|
|
- *partitions, vec<ddr_p> *alias_ddrs);
|
|
+ void finalize_partitions (class loop *loop,
|
|
+ vec<struct partition *> *partitions,
|
|
+ vec<ddr_p> *alias_ddrs, bitmap producers);
|
|
+
|
|
+ /* Analyze the loop form and whether it is vectorizable, to decide whether
+ we need to insert temp arrays to distribute it. */
|
|
+ bool may_insert_temp_arrays (loop_p loop, struct graph *&rdg,
|
|
+ control_dependences *cd);
|
|
+
|
|
+ /* Reset gimple_uid of GIMPLE_DEBUG and GIMPLE_LABEL to -1. */
|
|
+ void reset_gimple_uid (loop_p loop);
|
|
+
|
|
+ bool check_loop_vectorizable (loop_p loop);
|
|
+
|
|
+ inline void rebuild_rdg (loop_p loop, struct graph *&rdg,
|
|
+ control_dependences *cd);
|
|
+
|
|
+ /* If loop is not distributed, remove inserted temp arrays. */
|
|
+ void remove_insertion (loop_p loop, struct graph *flow_only_rdg,
|
|
+ bitmap producers, struct partition *partition);
|
|
+
|
|
+ /* Insert temp arrays if isomorphic computation exists. Temp arrays will be
|
|
+ regarded as SEED_STMTS for building partitions in succeeding processes. */
|
|
+ bool insert_temp_arrays (loop_p loop, vec<gimple *> seed_stmts,
|
|
+ hash_set<tree> *tmp_array_vars, bitmap producers);
|
|
+
|
|
+ void build_producers (loop_p loop, bitmap producers,
|
|
+ vec<gimple *> &transformed);
|
|
+
|
|
+ void do_insertion (loop_p loop, struct graph *flow_only_rdg, tree iv,
|
|
+ bitmap cut_points, hash_set <tree> *tmp_array_vars,
|
|
+ bitmap producers);
|
|
+
|
|
+ /* Fuse PARTITIONS built from inserted temp arrays into one partition,
|
|
+ fuse the rest into another. */
|
|
+ void merge_remaining_partitions (vec<struct partition *> *partitions,
|
|
+ bitmap producers);
|
|
|
|
/* Distributes the code from LOOP in such a way that producer statements
|
|
are placed before consumer statements. Tries to separate only the
|
|
statements from STMTS into separate loops. Returns the number of
|
|
distributed loops. Set NB_CALLS to number of generated builtin calls.
|
|
Set *DESTROY_P to whether LOOP needs to be destroyed. */
|
|
- int distribute_loop (class loop *loop, const vec<gimple *> &stmts,
|
|
+ int distribute_loop (class loop *loop, vec<gimple *> &stmts,
|
|
control_dependences *cd, int *nb_calls, bool *destroy_p,
|
|
bool only_patterns_p);
|
|
|
|
@@ -1893,7 +2021,8 @@ loop_distribution::classify_partition (loop_p loop,
|
|
|
|
bool
|
|
loop_distribution::share_memory_accesses (struct graph *rdg,
|
|
- partition *partition1, partition *partition2)
|
|
+ partition *partition1, partition *partition2,
|
|
+ hash_set <tree> *excluded_arrays)
|
|
{
|
|
unsigned i, j;
|
|
bitmap_iterator bi, bj;
|
|
@@ -1927,7 +2056,10 @@ loop_distribution::share_memory_accesses (struct graph *rdg,
|
|
if (operand_equal_p (DR_BASE_ADDRESS (dr1), DR_BASE_ADDRESS (dr2), 0)
|
|
&& operand_equal_p (DR_OFFSET (dr1), DR_OFFSET (dr2), 0)
|
|
&& operand_equal_p (DR_INIT (dr1), DR_INIT (dr2), 0)
|
|
- && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0))
|
|
+ && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0)
|
|
+ /* An exception, if PARTITION1 and PARTITION2 contain the
|
|
+ temp array we inserted, do not merge them. */
|
|
+ && !excluded_arrays->contains (DR_REF (dr1)))
|
|
return true;
|
|
}
|
|
}
|
|
@@ -1941,14 +2073,14 @@ loop_distribution::share_memory_accesses (struct graph *rdg,
|
|
|
|
void
|
|
loop_distribution::rdg_build_partitions (struct graph *rdg,
|
|
- vec<gimple *> starting_stmts,
|
|
+ vec<gimple *> *starting_stmts,
|
|
vec<partition *> *partitions)
|
|
{
|
|
auto_bitmap processed;
|
|
int i;
|
|
gimple *stmt;
|
|
|
|
- FOR_EACH_VEC_ELT (starting_stmts, i, stmt)
|
|
+ FOR_EACH_VEC_ELT (*starting_stmts, i, stmt)
|
|
{
|
|
int v = rdg_vertex_for_stmt (rdg, stmt);
|
|
|
|
@@ -2912,13 +3044,47 @@ fuse_memset_builtins (vec<struct partition *> *partitions)
|
|
}
|
|
}
|
|
|
|
+void
|
|
+loop_distribution::merge_remaining_partitions
|
|
+ (vec<struct partition *> *partitions,
|
|
+ bitmap producers)
|
|
+{
|
|
+ struct partition *partition = NULL;
|
|
+ struct partition *p1 = NULL, *p2 = NULL;
|
|
+ for (unsigned i = 0; partitions->iterate (i, &partition); i++)
|
|
+ {
|
|
+ if (bitmap_intersect_p (producers, partition->stmts))
|
|
+ {
|
|
+ if (p1 == NULL)
|
|
+ {
|
|
+ p1 = partition;
|
|
+ continue;
|
|
+ }
|
|
+ partition_merge_into (NULL, p1, partition, FUSE_FINALIZE);
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ if (p2 == NULL)
|
|
+ {
|
|
+ p2 = partition;
|
|
+ continue;
|
|
+ }
|
|
+ partition_merge_into (NULL, p2, partition, FUSE_FINALIZE);
|
|
+ }
|
|
+ partitions->unordered_remove (i);
|
|
+ partition_free (partition);
|
|
+ i--;
|
|
+ }
|
|
+}
|
|
+
|
|
void
|
|
loop_distribution::finalize_partitions (class loop *loop,
|
|
vec<struct partition *> *partitions,
|
|
- vec<ddr_p> *alias_ddrs)
|
|
+ vec<ddr_p> *alias_ddrs,
|
|
+ bitmap producers)
|
|
{
|
|
unsigned i;
|
|
- struct partition *partition, *a;
|
|
+ struct partition *partition;
|
|
|
|
if (partitions->length () == 1
|
|
|| alias_ddrs->length () > 0)
|
|
@@ -2950,13 +3116,7 @@ loop_distribution::finalize_partitions (class loop *loop,
|
|
|| (loop->inner == NULL
|
|
&& i >= NUM_PARTITION_THRESHOLD && num_normal > num_builtin))
|
|
{
|
|
- a = (*partitions)[0];
|
|
- for (i = 1; partitions->iterate (i, &partition); ++i)
|
|
- {
|
|
- partition_merge_into (NULL, a, partition, FUSE_FINALIZE);
|
|
- partition_free (partition);
|
|
- }
|
|
- partitions->truncate (1);
|
|
+ merge_remaining_partitions (partitions, producers);
|
|
}
|
|
|
|
/* Fuse memset builtins if possible. */
|
|
@@ -2964,6 +3124,1216 @@ loop_distribution::finalize_partitions (class loop *loop,
|
|
fuse_memset_builtins (partitions);
|
|
}
|
|
|
|
+/* Gimple uids of GIMPLE_DEBUG and GIMPLE_LABEL were changed during function
|
|
+ vect_analyze_loop, reset them to -1. */
|
|
+
|
|
+void
|
|
+loop_distribution::reset_gimple_uid (loop_p loop)
|
|
+{
|
|
+ basic_block *bbs = get_loop_body_in_custom_order (loop, this,
|
|
+ bb_top_order_cmp_r);
|
|
+ for (int i = 0; i < int (loop->num_nodes); i++)
|
|
+ {
|
|
+ basic_block bb = bbs[i];
|
|
+ for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
|
|
+ gsi_next (&gsi))
|
|
+ {
|
|
+ gimple *stmt = gsi_stmt (gsi);
|
|
+ if (is_gimple_debug (stmt) || gimple_code (stmt) == GIMPLE_LABEL)
|
|
+ gimple_set_uid (stmt, -1);
|
|
+ }
|
|
+ }
|
|
+ free (bbs);
|
|
+}
|
|
+
|
|
+bool
|
|
+loop_distribution::check_loop_vectorizable (loop_p loop)
|
|
+{
|
|
+ vec_info_shared shared;
|
|
+ vect_analyze_loop (loop, &shared, true);
|
|
+ loop_vec_info vinfo = loop_vec_info_for_loop (loop);
|
|
+ reset_gimple_uid (loop);
|
|
+ if (vinfo == NULL)
|
|
+ {
|
|
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
|
+ fprintf (dump_file,
|
|
+ "Loop %d no temp array insertion: bad data access pattern,"
|
|
+ " unable to generate loop_vinfo.\n", loop->num);
|
|
+ return false;
|
|
+ }
|
|
+ if (vinfo->vectorizable)
|
|
+ {
|
|
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
|
+ fprintf (dump_file, "Loop %d no temp array insertion: original loop"
|
|
+ " can be vectorized without distribution.\n",
|
|
+ loop->num);
|
|
+ delete vinfo;
|
|
+ loop->aux = NULL;
|
|
+ return false;
|
|
+ }
|
|
+ if (vinfo->grouped_loads.length () == 0)
|
|
+ {
|
|
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
|
+ fprintf (dump_file, "Loop %d no temp array insertion: original loop"
|
|
+ " has no grouped loads.\n" , loop->num);
|
|
+ delete vinfo;
|
|
+ loop->aux = NULL;
|
|
+ return false;
|
|
+ }
|
|
+ return true;
|
|
+}
|
|
+
|
|
+inline void
|
|
+loop_distribution::rebuild_rdg (loop_p loop, struct graph *&rdg,
|
|
+ control_dependences *cd)
|
|
+{
|
|
+ free_rdg (rdg);
|
|
+ rdg = build_rdg (loop, cd);
|
|
+ gcc_checking_assert (rdg != NULL);
|
|
+}
|
|
+
|
|
+bool
|
|
+loop_distribution::may_insert_temp_arrays (loop_p loop, struct graph *&rdg,
|
|
+ control_dependences *cd)
|
|
+{
|
|
+ if (!(flag_tree_slp_transpose_vectorize && flag_tree_loop_vectorize))
|
|
+ return false;
|
|
+
|
|
+ /* Only loops with two basic blocks HEADER and LATCH are supported. HEADER
|
|
+ is the main body of a LOOP and LATCH is the basic block that controls the
|
|
+ LOOP execution. Size of temp array is determined by the loop iteration count,
|
|
+ so it must be a const. */
|
|
+ tree loop_extent = number_of_latch_executions (loop);
|
|
+ if (loop->inner != NULL || loop->num_nodes > 2
|
|
+ || TREE_CODE (loop_extent) != INTEGER_CST)
|
|
+ {
|
|
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
|
+ fprintf (dump_file, "Loop %d: no temp array insertion: bad loop"
|
|
+ " form.\n", loop->num);
|
|
+ return false;
|
|
+ }
|
|
+
|
|
+ if (loop->dont_vectorize)
|
|
+ {
|
|
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
|
+ fprintf (dump_file, "Loop %d: no temp array insertion: this loop"
|
|
+ " should never be vectorized.\n",
|
|
+ loop->num);
|
|
+ return false;
|
|
+ }
|
|
+
|
|
+ /* Do not distribute a LOOP that is able to be vectorized without
|
|
+ distribution. */
|
|
+ if (!check_loop_vectorizable (loop))
|
|
+ {
|
|
+ rebuild_rdg (loop, rdg, cd);
|
|
+ return false;
|
|
+ }
|
|
+
|
|
+ rebuild_rdg (loop, rdg, cd);
|
|
+ return true;
|
|
+}
|
|
+
|
|
+/* Return max grouped loads' length if all groups' lengths satisfy len = 2 ^ n.
|
|
+ Otherwise, return 0. */
|
|
+
|
|
+static unsigned
|
|
+get_max_vf (loop_vec_info vinfo)
|
|
+{
|
|
+ unsigned size = 0;
|
|
+ unsigned max = 0;
|
|
+ stmt_vec_info stmt_info;
|
|
+ unsigned i = 0;
|
|
+ FOR_EACH_VEC_ELT (vinfo->grouped_loads, i, stmt_info)
|
|
+ {
|
|
+ size = stmt_info->size;
|
|
+ if (!pow2p_hwi (size))
|
|
+ return 0;
|
|
+ max = size > max ? size : max;
|
|
+ }
|
|
+ return max;
|
|
+}
|
|
+
|
|
+/* Convert grouped_loads from linked list to vector with length vf. Init
|
|
+ group_info of each stmt in the same group and put them into a vector. And
|
|
+ these vectors make up WORKLISTS. We will re-analyze a group if it is
|
|
+ uncertain, so we regard WORKLISTS as a circular queue. */
|
|
+
|
|
+static unsigned
|
|
+build_queue (loop_vec_info vinfo, unsigned vf,
|
|
+ vec<vec<group_info> *> &worklists)
|
|
+{
|
|
+ stmt_vec_info stmt_info;
|
|
+ unsigned i = 0;
|
|
+ group_info ginfo = NULL;
|
|
+ vec<group_info> *worklist = NULL;
|
|
+ FOR_EACH_VEC_ELT (vinfo->grouped_loads, i, stmt_info)
|
|
+ {
|
|
+ unsigned group_size = stmt_info->size;
|
|
+ stmt_vec_info c_stmt_info = stmt_info;
|
|
+ bool succ = true;
|
|
+ while (group_size >= vf)
|
|
+ {
|
|
+ vec_alloc (worklist, vf);
|
|
+ for (unsigned j = 0; j < vf; ++j)
|
|
+ {
|
|
+ if (c_stmt_info == NULL)
|
|
+ {
|
|
+ succ = false;
|
|
+ break;
|
|
+ }
|
|
+ ginfo = new _group_info ();
|
|
+ ginfo->stmt = c_stmt_info->stmt;
|
|
+ worklist->safe_push (ginfo);
|
|
+ c_stmt_info = c_stmt_info->next_element;
|
|
+ }
|
|
+ if (!succ)
|
|
+ {
|
|
+ unsigned k = 0;
|
|
+ ginfo = NULL;
|
|
+ FOR_EACH_VEC_ELT (*worklist, k, ginfo)
|
|
+ delete ginfo;
|
|
+ vec_free (worklist);
|
|
+ break;
|
|
+ }
|
|
+ worklists.safe_push (worklist);
|
|
+ group_size -= vf;
|
|
+ }
|
|
+ }
|
|
+ return worklists.length ();
|
|
+}
|
|
+
|
|
+static bool
|
|
+check_same_oprand_type (tree op1, tree op2)
|
|
+{
|
|
+ tree type1 = TREE_TYPE (op1);
|
|
+ tree type2 = TREE_TYPE (op2);
|
|
+ if (TREE_CODE (type1) != INTEGER_TYPE && TREE_CODE (type1) != REAL_TYPE)
|
|
+ return false;
|
|
+
|
|
+ return (TREE_CODE (type1) == TREE_CODE (type2)
|
|
+ && TYPE_UNSIGNED (type1) == TYPE_UNSIGNED (type2)
|
|
+ && TYPE_PRECISION (type1) == TYPE_PRECISION (type2));
|
|
+}
|
|
+
|
|
+static bool
|
|
+bit_field_p (gimple *stmt)
|
|
+{
|
|
+ unsigned i = 0;
|
|
+ auto_vec<data_reference_p, 2> datarefs_vec;
|
|
+ data_reference_p dr;
|
|
+ if (!find_data_references_in_stmt (NULL, stmt, &datarefs_vec))
|
|
+ return true;
|
|
+
|
|
+ FOR_EACH_VEC_ELT (datarefs_vec, i, dr)
|
|
+ {
|
|
+ if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
|
|
+ && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
|
|
+ return true;
|
|
+ }
|
|
+ return false;
|
|
+}
|
|
+
|
|
+static inline bool
|
|
+shift_operation (enum tree_code op)
|
|
+{
|
|
+ return op == LSHIFT_EXPR || op == RSHIFT_EXPR || op == LROTATE_EXPR
|
|
+ || op == RROTATE_EXPR;
|
|
+}
|
|
+
|
|
+/* Return relationship between USE_STMT and the first use_stmt of the group.
|
|
+ RHS1 is the lhs of stmt recorded in group_info. If another rhs of use_stmt
|
|
+ is not a constant, return UNCERTAIN and re-check it later. */
|
|
+
|
|
+static unsigned
|
|
+check_isomorphic (gimple *use_stmt, gimple *first,
|
|
+ tree rhs1, vec<tree> &hetero_lhs)
|
|
+{
|
|
+ /* Check same operation. */
|
|
+ enum tree_code rhs_code_first = gimple_assign_rhs_code (first);
|
|
+ enum tree_code rhs_code_current = gimple_assign_rhs_code (use_stmt);
|
|
+ if (rhs_code_first != rhs_code_current)
|
|
+ return HETEROGENEOUS;
|
|
+
|
|
+ /* For shift operations, operands should be equal. */
|
|
+ if (shift_operation (rhs_code_current))
|
|
+ {
|
|
+ tree shift_op_first = gimple_assign_rhs2 (first);
|
|
+ tree shift_op_current = gimple_assign_rhs2 (use_stmt);
|
|
+ if (!operand_equal_p (shift_op_first, shift_op_current, 0)
|
|
+ || !TREE_CONSTANT (shift_op_first))
|
|
+ return HETEROGENEOUS;
|
|
+
|
|
+ return ISOMORPHIC;
|
|
+ }
|
|
+ /* Type conversion expr or assignment. */
|
|
+ if (gimple_num_ops (first) == 2)
|
|
+ return (rhs_code_first == NOP_EXPR || rhs_code_first == CONVERT_EXPR
|
|
+ || rhs_code_first == SSA_NAME) ? ISOMORPHIC : HETEROGENEOUS;
|
|
+
|
|
+ /* We find USE_STMT from lhs of a stmt, denote it as rhs1 of USE_STMT and
|
|
+ the other one as rhs2. Check if define-stmt of current rhs2 is isomorphic
|
|
+ with define-stmt of rhs2 in the first USE_STMT at this group. */
|
|
+ tree rhs2_first = gimple_assign_rhs1 (use_stmt) == rhs1
|
|
+ ? gimple_assign_rhs2 (first) : gimple_assign_rhs1 (first);
|
|
+ tree rhs2_curr = gimple_assign_rhs1 (use_stmt) == rhs1
|
|
+ ? gimple_assign_rhs2 (use_stmt) : gimple_assign_rhs1 (use_stmt);
|
|
+
|
|
+ if (check_same_oprand_type (rhs2_first, rhs2_curr))
|
|
+ {
|
|
+ if (TREE_CONSTANT (rhs2_curr))
|
|
+ return ISOMORPHIC;
|
|
+ else if (hetero_lhs.contains (rhs2_curr))
|
|
+ return HETEROGENEOUS;
|
|
+
|
|
+ /* Provisionally set the stmt as uncertain and analyze the whole group
|
|
+ in function CHECK_UNCERTAIN later if all use_stmts are uncertain. */
|
|
+ return UNCERTAIN;
|
|
+ }
|
|
+ return HETEROGENEOUS;
|
|
+}
|
|
+
|
|
+static bool
|
|
+unsupported_operations (gimple *stmt)
|
|
+{
|
|
+ enum tree_code code = gimple_assign_rhs_code (stmt);
|
|
+ return code == COND_EXPR;
|
|
+}
|
|
+
|
|
+/* Check if the single use_stmt of STMT is isomorphic with the first one's
|
|
+ use_stmt in current group. */
|
|
+
|
|
+static unsigned
|
|
+check_use_stmt (group_info elmt, gimple *&first,
|
|
+ vec<gimple *> &tmp_stmts, vec<tree> &hetero_lhs)
|
|
+{
|
|
+ if (gimple_code (elmt->stmt) != GIMPLE_ASSIGN)
|
|
+ return HETEROGENEOUS;
|
|
+ use_operand_p dummy;
|
|
+ tree lhs = gimple_assign_lhs (elmt->stmt);
|
|
+ gimple *use_stmt = NULL;
|
|
+ single_imm_use (lhs, &dummy, &use_stmt);
|
|
+ /* STMTs with three rhs are not supported, e.g., GIMPLE_COND. */
|
|
+ if (use_stmt == NULL || gimple_code (use_stmt) != GIMPLE_ASSIGN
|
|
+ || unsupported_operations (use_stmt) || bit_field_p (use_stmt))
|
|
+ return HETEROGENEOUS;
|
|
+ tmp_stmts.safe_push (use_stmt);
|
|
+ if (first == NULL)
|
|
+ {
|
|
+ first = use_stmt;
|
|
+ return UNINITIALIZED;
|
|
+ }
|
|
+ /* Check if current use_stmt and the first member's use_stmt in the group
|
|
+ are of the same type. */
|
|
+ tree first_lhs = gimple_assign_lhs (first);
|
|
+ tree curr_lhs = gimple_assign_lhs (use_stmt);
|
|
+ if (!check_same_oprand_type (first_lhs, curr_lhs))
|
|
+ return HETEROGENEOUS;
|
|
+ return check_isomorphic (use_stmt, first, lhs, hetero_lhs);
|
|
+}
|
|
+
|
|
+/* Replace stmt field in group with stmts in TMP_STMTS, and insert their
|
|
+ lhs_info to ISOMER_LHS. */
|
|
+
|
|
+static void
|
|
+update_isomer_lhs (vec<group_info> *group, unsigned group_num,
|
|
+ unsigned iteration, isomer_stmt_lhs &isomer_lhs,
|
|
+ vec<gimple *> &tmp_stmts, int &profit,
|
|
+ vec<unsigned> &merged_groups)
|
|
+{
|
|
+ group_info elmt = NULL;
|
|
+ /* Do not insert temp array if isomorphic stmts from grouped load have
|
|
+ only casting operations. Once isomorphic calculation has 3 operands,
|
|
+ such as plus operation, this group can be regarded as cut point. */
|
|
+ bool operated = (gimple_num_ops (tmp_stmts[0]) == 3);
|
|
+ /* Do not insert temp arrays if search of isomorphic stmts reaches
|
|
+ MEM stmts. */
|
|
+ bool has_vdef = gimple_vdef (tmp_stmts[0]) != NULL;
|
|
+ bool merge = false;
|
|
+ for (unsigned i = 0; i < group->length (); i++)
|
|
+ {
|
|
+ elmt = (*group)[i];
|
|
+ elmt->stmt = has_vdef ? NULL : tmp_stmts[i];
|
|
+ elmt->cut_point = has_vdef ? false : (elmt->cut_point || operated);
|
|
+ elmt->uncertain = false;
|
|
+ elmt->done = has_vdef;
|
|
+ tree lhs = gimple_assign_lhs (tmp_stmts[i]);
|
|
+ if (isomer_lhs.find (lhs) != isomer_lhs.end ())
|
|
+ {
|
|
+ merge = true;
|
|
+ continue;
|
|
+ }
|
|
+ isomer_lhs[lhs] = std::make_pair (group_num, iteration);
|
|
+ }
|
|
+ if (merge)
|
|
+ {
|
|
+ merged_groups.safe_push (group_num);
|
|
+ profit = 0;
|
|
+ return;
|
|
+ }
|
|
+ enum vect_cost_for_stmt kind = scalar_stmt;
|
|
+ int scalar_cost = builtin_vectorization_cost (kind, NULL_TREE, 0);
|
|
+ profit = (tmp_stmts.length () - 1) * scalar_cost;
|
|
+}
|
|
+
|
|
+/* Try to find rhs2 in ISOMER_LHS, if all rhs2 were found and their group_num
|
|
+ and iteration are same, GROUP is isomorphic. */
|
|
+
|
|
+static unsigned
|
|
+check_isomorphic_rhs (vec<group_info> *group, vec<gimple *> &tmp_stmts,
|
|
+ isomer_stmt_lhs &isomer_lhs)
|
|
+{
|
|
+ group_info elmt = NULL;
|
|
+ gimple *stmt = NULL;
|
|
+ unsigned j = 0;
|
|
+ unsigned group_num = -1u;
|
|
+ unsigned iteration = -1u;
|
|
+ tree rhs1 = NULL;
|
|
+ tree rhs2 = NULL;
|
|
+ unsigned status = UNINITIALIZED;
|
|
+ FOR_EACH_VEC_ELT (*group, j, elmt)
|
|
+ {
|
|
+ rhs1 = gimple_assign_lhs (elmt->stmt);
|
|
+ stmt = tmp_stmts[j];
|
|
+ rhs2 = (rhs1 == gimple_assign_rhs1 (stmt))
|
|
+ ? gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
|
|
+ isomer_stmt_lhs::iterator iter = isomer_lhs.find (rhs2);
|
|
+ if (iter != isomer_lhs.end ())
|
|
+ {
|
|
+ if (group_num == -1u)
|
|
+ {
|
|
+ group_num = iter->second.first;
|
|
+ iteration = iter->second.second;
|
|
+ status |= ISOMORPHIC;
|
|
+ continue;
|
|
+ }
|
|
+ if (iter->second.first == group_num
|
|
+ && iter->second.second == iteration)
|
|
+ {
|
|
+ status |= ISOMORPHIC;
|
|
+ continue;
|
|
+ }
|
|
+ return HETEROGENEOUS;
|
|
+ }
|
|
+ else
|
|
+ status |= UNCERTAIN;
|
|
+ }
|
|
+ return status;
|
|
+}
|
|
+
|
|
+/* Update group_info for uncertain groups. */
|
|
+
|
|
+static void
|
|
+update_uncertain_stmts (vec<group_info> *group, unsigned group_num,
|
|
+ unsigned iteration, vec<gimple *> &tmp_stmts)
|
|
+{
|
|
+ unsigned j = 0;
|
|
+ group_info elmt = NULL;
|
|
+ FOR_EACH_VEC_ELT (*group, j, elmt)
|
|
+ {
|
|
+ elmt->uncertain = true;
|
|
+ elmt->done = false;
|
|
+ }
|
|
+}
|
|
+
|
|
+/* Push stmts in TMP_STMTS into HETERO_LHS. */
|
|
+
|
|
+static void
|
|
+set_hetero (vec<group_info> *group, vec<tree> &hetero_lhs,
|
|
+ vec<gimple *> &tmp_stmts)
|
|
+{
|
|
+ group_info elmt = NULL;
|
|
+ unsigned i = 0;
|
|
+ for (i = 0; i < group->length (); i++)
|
|
+ {
|
|
+ elmt = (*group)[i];
|
|
+ elmt->uncertain = false;
|
|
+ elmt->done = true;
|
|
+ }
|
|
+ gimple *stmt = NULL;
|
|
+ FOR_EACH_VEC_ELT (tmp_stmts, i, stmt)
|
|
+ if (stmt != NULL)
|
|
+ hetero_lhs.safe_push (gimple_assign_lhs (stmt));
|
|
+}
|
|
+
|
|
+/* Given an uncertain group, TMP_STMTS are use_stmts of stmts in GROUP.
|
|
+ Rhs1 is the lhs of stmt in GROUP, rhs2 is the other rhs of USE_STMT.
|
|
+
|
|
+ Try to find rhs2 in ISOMER_LHS, if all found rhs2 have same group_num
|
|
+ and iteration, this uncertain group is isomorphic.
|
|
+
|
|
+ If no rhs matched, this GROUP remains uncertain and update group_info.
|
|
+
|
|
+ Otherwise, this GROUP is heterogeneous and return true to end analysis
|
|
+ for this group. */
|
|
+
|
|
+static bool
|
|
+check_uncertain (vec<group_info> *group, unsigned group_num,
|
|
+ unsigned iteration, int &profit,
|
|
+ vec<gimple *> &tmp_stmts, isomer_stmt_lhs &isomer_lhs,
|
|
+ vec<tree> &hetero_lhs, vec<unsigned> &merged_groups)
|
|
+{
|
|
+ unsigned status = check_isomorphic_rhs (group, tmp_stmts, isomer_lhs);
|
|
+ bool done = false;
|
|
+ switch (status)
|
|
+ {
|
|
+ case UNCERTAIN:
|
|
+ update_uncertain_stmts (group, group_num, iteration, tmp_stmts);
|
|
+ break;
|
|
+ case ISOMORPHIC:
|
|
+ update_isomer_lhs (group, group_num, iteration, isomer_lhs,
|
|
+ tmp_stmts, profit, merged_groups);
|
|
+ break;
|
|
+ default:
|
|
+ set_hetero (group, hetero_lhs, tmp_stmts);
|
|
+ done = true;
|
|
+ }
|
|
+ return done;
|
|
+}
|
|
+
|
|
+/* Return false if analysis of this group is not finished, e.g., isomorphic or
|
|
+ uncertain. Calculate the profit if vectorized. */
|
|
+
|
|
+static bool
|
|
+check_group (vec<group_info> *group, unsigned group_num, unsigned iteration,
|
|
+ int &profit, vec<unsigned> &merged_groups,
|
|
+ isomer_stmt_lhs &isomer_lhs, vec<tree> &hetero_lhs)
|
|
+{
|
|
+ unsigned j = 0;
|
|
+ group_info elmt = NULL;
|
|
+ gimple *first = NULL;
|
|
+ unsigned res = 0;
|
|
+ /* Record single use stmts in TMP_STMTS and decide whether replace stmts in
|
|
+ ginfo in succeeding processes. */
|
|
+ auto_vec<gimple *, 12> tmp_stmts;
|
|
+ FOR_EACH_VEC_ELT (*group, j, elmt)
|
|
+ {
|
|
+ if (merged_groups.contains (group_num))
|
|
+ return true;
|
|
+ res |= check_use_stmt (elmt, first, tmp_stmts, hetero_lhs);
|
|
+ }
|
|
+
|
|
+ /* Update each group member according to RES. */
|
|
+ switch (res)
|
|
+ {
|
|
+ case ISOMORPHIC:
|
|
+ update_isomer_lhs (group, group_num, iteration, isomer_lhs,
|
|
+ tmp_stmts, profit, merged_groups);
|
|
+ return false;
|
|
+ case UNCERTAIN:
|
|
+ return check_uncertain (group, group_num, iteration, profit,
|
|
+ tmp_stmts, isomer_lhs, hetero_lhs,
|
|
+ merged_groups);
|
|
+ default:
|
|
+ set_hetero (group, hetero_lhs, tmp_stmts);
|
|
+ return true;
|
|
+ }
|
|
+}
|
|
+
|
|
+/* Return true if all analyses are done except for uncertain groups. */
|
|
+
|
|
+static bool
|
|
+end_of_search (vec<vec<group_info> *> &circular_queue,
|
|
+ vec<unsigned> &merged_groups)
|
|
+{
|
|
+ unsigned i = 0;
|
|
+ vec<group_info> *group = NULL;
|
|
+ group_info elmt = NULL;
|
|
+ FOR_EACH_VEC_ELT (circular_queue, i, group)
|
|
+ {
|
|
+ if (merged_groups.contains (i))
|
|
+ continue;
|
|
+ elmt = (*group)[0];
|
|
+ /* If there is any isomorphic use_stmts, continue analysis of isomorphic
|
|
+ use_stmts. */
|
|
+ if (!elmt->done && !elmt->uncertain)
|
|
+ return false;
|
|
+ }
|
|
+ return true;
|
|
+}
|
|
+
|
|
+/* Push valid stmts to STMTS as cutpoints. */
|
|
+
|
|
+static bool
|
|
+check_any_cutpoints (vec<vec<group_info> *> &circular_queue,
|
|
+ vec<gimple *> *&stmts, vec<unsigned> &merged_groups)
|
|
+{
|
|
+ unsigned front = 0;
|
|
+ vec<group_info> *group = NULL;
|
|
+ group_info elmt = NULL;
|
|
+ unsigned max = circular_queue.length () * circular_queue[0]->length ();
|
|
+ vec_alloc (stmts, max);
|
|
+ while (front < circular_queue.length ())
|
|
+ {
|
|
+ unsigned i = 0;
|
|
+ if (merged_groups.contains (front))
|
|
+ {
|
|
+ front++;
|
|
+ continue;
|
|
+ }
|
|
+ group = circular_queue[front++];
|
|
+ FOR_EACH_VEC_ELT (*group, i, elmt)
|
|
+ if (elmt->stmt != NULL && elmt->done && elmt->cut_point)
|
|
+ stmts->safe_push (elmt->stmt);
|
|
+ }
|
|
+ return stmts->length () != 0;
|
|
+}
|
|
+
|
|
+/* Grouped loads are isomorphic. Make pair for group number and iteration,
|
|
+ map load stmt to this pair. We set iteration 0 here. */
|
|
+
|
|
+static void
|
|
+init_isomer_lhs (vec<vec<group_info> *> &groups, isomer_stmt_lhs &isomer_lhs)
|
|
+{
|
|
+ vec<group_info> *group = NULL;
|
|
+ group_info elmt = NULL;
|
|
+ unsigned i = 0;
|
|
+ FOR_EACH_VEC_ELT (groups, i, group)
|
|
+ {
|
|
+ unsigned j = 0;
|
|
+ FOR_EACH_VEC_ELT (*group, j, elmt)
|
|
+ isomer_lhs[gimple_assign_lhs (elmt->stmt)] = std::make_pair (i, 0);
|
|
+ }
|
|
+}
|
|
+
|
|
+/* It's not a strict analysis of load/store profit. Assume scalar and vector
|
|
+ load/store are of the same cost. The result PROFIT equals profit from
|
|
+ vectorizing of scalar loads/stores minus cost of a vectorized load/store. */
|
|
+
|
|
+static int
|
|
+load_store_profit (unsigned scalar_mem_ops, unsigned vf, unsigned new_mem_ops)
|
|
+{
|
|
+ int profit = 0;
|
|
+ enum vect_cost_for_stmt kind = scalar_load;
|
|
+ int scalar_cost = builtin_vectorization_cost (kind, NULL_TREE, 0);
|
|
+ profit += (scalar_mem_ops - (scalar_mem_ops / vf)) * scalar_cost;
|
|
+ profit -= new_mem_ops / vf * scalar_cost;
|
|
+ kind = scalar_store;
|
|
+ scalar_cost = builtin_vectorization_cost (kind, NULL_TREE, 0);
|
|
+ profit -= new_mem_ops / vf * scalar_cost;
|
|
+ return profit;
|
|
+}
|
|
+
|
|
+/* Breadth-first search the graph consisting of define-use chains, starting
+ from the circular queue initialized by function BUILD_QUEUE. Find the single
+ use of each stmt in a group and check if they are isomorphic. Isomorphic is
+ defined as same rhs type, same operator, and isomorphic calculation of each
+ rhs starting from a load. If another rhs is uncertain to be isomorphic, put
+ it at the end of the circular queue and re-analyze it during the next
+ iteration. If a group shares the same use_stmt with another group, skip one
+ of them in succeeding processes as merged. Iterate the circular queue until
+ all remaining groups are heterogeneous or the search reaches MEM stmts. If
+ all other groups have finished the analysis and the remaining groups are
+ uncertain, return false to avoid an endless loop. */
|
|
+
|
|
+bool
|
|
+bfs_find_isomer_stmts (vec<vec<group_info> *> &circular_queue,
|
|
+ stmts_profit &profit_pair, unsigned vf,
|
|
+ bool &reach_vdef)
|
|
+{
|
|
+ isomer_stmt_lhs isomer_lhs;
|
|
+ auto_vec<tree> hetero_lhs;
|
|
+ auto_vec<unsigned> merged_groups;
|
|
+ vec<group_info> *group = NULL;
|
|
+ /* True if analysis finishes. */
|
|
+ bool done = false;
|
|
+ int profit_sum = 0;
|
|
+ vec<gimple *> *stmts = NULL;
|
|
+ init_isomer_lhs (circular_queue, isomer_lhs);
|
|
+ for (unsigned i = 1; !done; ++i)
|
|
+ {
|
|
+ unsigned front = 0;
|
|
+ /* Re-initialize DONE to TRUE while a new iteration begins. */
|
|
+ done = true;
|
|
+ while (front < circular_queue.length ())
|
|
+ {
|
|
+ int profit = 0;
|
|
+ group = circular_queue[front];
|
|
+ done &= check_group (group, front, i, profit, merged_groups,
|
|
+ isomer_lhs, hetero_lhs);
|
|
+ profit_sum += profit;
|
|
+ if (profit != 0 && (*group)[0]->stmt == NULL)
|
|
+ {
|
|
+ reach_vdef = true;
|
|
+ return false;
|
|
+ }
|
|
+ ++front;
|
|
+ }
|
|
+ /* Uncertain result, return. */
|
|
+ if (!done && end_of_search (circular_queue, merged_groups))
|
|
+ return false;
|
|
+ }
|
|
+ if (check_any_cutpoints (circular_queue, stmts, merged_groups))
|
|
+ {
|
|
+ profit_pair.first = stmts;
|
|
+ unsigned loads = circular_queue.length () * circular_queue[0]->length ();
|
|
+ profit_pair.second = profit_sum + load_store_profit (loads, vf,
|
|
+ stmts->length ());
|
|
+ if (profit_pair.second > 0)
|
|
+ return true;
|
|
+ }
|
|
+ return false;
|
|
+}
|
|
+
|
|
+/* Free memory allocated by ginfo. */
|
|
+
|
|
+static void
|
|
+free_ginfos (vec<vec<group_info> *> &worklists)
|
|
+{
|
|
+ vec<group_info> *worklist;
|
|
+ unsigned i = 0;
|
|
+ while (i < worklists.length ())
|
|
+ {
|
|
+ worklist = worklists[i++];
|
|
+ group_info ginfo;
|
|
+ unsigned j = 0;
|
|
+ FOR_EACH_VEC_ELT (*worklist, j, ginfo)
|
|
+ delete ginfo;
|
|
+ vec_free (worklist);
|
|
+ }
|
|
+}
|
|
+
|
|
+static void
|
|
+release_tmp_stmts (vf_stmts_profit_map &candi_stmts)
|
|
+{
|
|
+ vf_stmts_profit_map::iterator iter;
|
|
+ for (iter = candi_stmts.begin (); iter != candi_stmts.end (); ++iter)
|
|
+ iter->second.first->release ();
|
|
+}
|
|
+
|
|
+/* Choose the group of stmts with maximum profit. */
|
|
+
|
|
+static bool
|
|
+decide_stmts_by_profit (vf_stmts_profit_map &candi_stmts, vec<gimple *> &stmts)
|
|
+{
|
|
+ vf_stmts_profit_map::iterator iter;
|
|
+ int profit = 0;
|
|
+ int max = 0;
|
|
+ vec<gimple *> *tmp = NULL;
|
|
+ for (iter = candi_stmts.begin (); iter != candi_stmts.end (); ++iter)
|
|
+ {
|
|
+ profit = iter->second.second;
|
|
+ if (profit > max)
|
|
+ {
|
|
+ tmp = iter->second.first;
|
|
+ max = profit;
|
|
+ }
|
|
+ }
|
|
+ if (max == 0)
|
|
+ {
|
|
+ release_tmp_stmts (candi_stmts);
|
|
+ return false;
|
|
+ }
|
|
+ unsigned i = 0;
|
|
+ gimple *stmt = NULL;
|
|
+ FOR_EACH_VEC_ELT (*tmp, i, stmt)
|
|
+ stmts.safe_push (stmt);
|
|
+ release_tmp_stmts (candi_stmts);
|
|
+ return stmts.length () != 0;
|
|
+}
|
|
+
|
|
+/* Find isomorphic stmts from grouped loads with vector factor VF.
|
|
+
|
|
+ Given source code as follows and ignore casting.
|
|
+
|
|
+ a0 = (a[0] + b[0]) + ((a[4] - b[4]) << 16);
|
|
+ a1 = (a[1] + b[1]) + ((a[5] - b[5]) << 16);
|
|
+ a2 = (a[2] + b[2]) + ((a[6] - b[6]) << 16);
|
|
+ a3 = (a[3] + b[3]) + ((a[7] - b[7]) << 16);
|
|
+
|
|
+ We get grouped loads in VINFO as
|
|
+
|
|
+ GROUP_1 GROUP_2
|
|
+ _1 = *a _11 = *b
|
|
+ _2 = *(a + 1) _12 = *(b + 1)
|
|
+ _3 = *(a + 2) _13 = *(b + 2)
|
|
+ _4 = *(a + 3) _14 = *(b + 3)
|
|
+ _5 = *(a + 4) _15 = *(b + 4)
|
|
+ _6 = *(a + 5) _16 = *(b + 5)
|
|
+ _7 = *(a + 6) _17 = *(b + 6)
|
|
+ _8 = *(a + 7) _18 = *(b + 7)
|
|
+
|
|
+ First we try VF = 8, we get two worklists
|
|
+
|
|
+ WORKLIST_1 WORKLIST_2
|
|
+ _1 = *a _11 = *b
|
|
+ _2 = *(a + 1) _12 = *(b + 1)
|
|
+ _3 = *(a + 2) _13 = *(b + 2)
|
|
+ _4 = *(a + 3) _14 = *(b + 3)
|
|
+ _5 = *(a + 4) _15 = *(b + 4)
|
|
+ _6 = *(a + 5) _16 = *(b + 5)
|
|
+ _7 = *(a + 6) _17 = *(b + 6)
|
|
+ _8 = *(a + 7) _18 = *(b + 7)
|
|
+
|
|
+ We find _111 = _1 + _11 and _115 = _5 - _15 are not isomorphic,
|
|
+ so we try VF = VF / 2.
|
|
+
|
|
+ GROUP_1 GROUP_2
|
|
+ _1 = *a _5 = *(a + 4)
|
|
+ _2 = *(a + 1) _6 = *(a + 5)
|
|
+ _3 = *(a + 2) _7 = *(a + 6)
|
|
+ _4 = *(a + 3) _8 = *(a + 7)
|
|
+
|
|
+ GROUP_3 GROUP_4
|
|
+ _11 = *b _15 = *(b + 4)
|
|
+ _12 = *(b + 1) _16 = *(b + 5)
|
|
+ _13 = *(b + 2) _17 = *(b + 6)
|
|
+ _14 = *(b + 3) _18 = *(b + 7)
|
|
+
|
|
+ We first analyze group_1, and find all operations are isomorphic, then
|
|
+ replace stmts in group_1 with their use_stmts. Group_2 as well.
|
|
+
|
|
+ GROUP_1 GROUP_2
|
|
+ _111 = _1 + _11 _115 = _5 - _15
|
|
+ _112 = _2 + _12 _116 = _6 - _16
|
|
+ _113 = _3 + _13 _117 = _7 - _17
|
|
+ _114 = _4 + _14 _118 = _8 - _18
|
|
+
|
|
+ When analyzing group_3 and group_4, we find their use_stmts are the same
|
|
+ as group_1 and group_2. So group_3 is regarded as being merged to group_1
|
|
+ and group_4 being merged to group_2. In future procedures, we will skip
|
|
+ group_3 and group_4.
|
|
+
|
|
+ We repeat such processing until operations are not isomorphic or searching
+ reaches MEM stmts. In our given case, searching ends up at a0, a1, a2 and
|
|
+ a3. */
|
|
+
|
|
+static bool
|
|
+find_isomorphic_stmts (loop_vec_info vinfo, vec<gimple *> &stmts)
|
|
+{
|
|
+ unsigned vf = get_max_vf (vinfo);
|
|
+ if (vf == 0)
|
|
+ return false;
|
|
+ auto_vec<vec<group_info> *> circular_queue;
|
|
+ /* Map of vector factor and corresponding vectorizing profit. */
|
|
+ stmts_profit profit_map;
|
|
+ /* Map of cut_points and vector factor. */
|
|
+ vf_stmts_profit_map candi_stmts;
|
|
+ bool reach_vdef = false;
|
|
+ while (vf > 2)
|
|
+ {
|
|
+ if (build_queue (vinfo, vf, circular_queue) == 0)
|
|
+ return false;
|
|
+ if (!bfs_find_isomer_stmts (circular_queue, profit_map, vf, reach_vdef))
|
|
+ {
|
|
+ if (reach_vdef)
|
|
+ {
|
|
+ release_tmp_stmts (candi_stmts);
|
|
+ free_ginfos (circular_queue);
|
|
+ circular_queue.release ();
|
|
+ return false;
|
|
+ }
|
|
+ vf /= 2;
|
|
+ free_ginfos (circular_queue);
|
|
+ circular_queue.release ();
|
|
+ continue;
|
|
+ }
|
|
+ candi_stmts[vf] = profit_map;
|
|
+ free_ginfos (circular_queue);
|
|
+ vf /= 2;
|
|
+ circular_queue.release ();
|
|
+ }
|
|
+ return decide_stmts_by_profit (candi_stmts, stmts);
|
|
+}
|
|
+
|
|
+/* Get iv from SEED_STMTS and make sure each seed_stmt has only one iv as index
|
|
+ and all indices are the same. */
|
|
+
|
|
+static tree
|
|
+find_index (vec<gimple *> seed_stmts)
|
|
+{
|
|
+ if (seed_stmts.length () == 0)
|
|
+ return NULL;
|
|
+ bool found_index = false;
|
|
+ tree index = NULL;
|
|
+ unsigned ui = 0;
|
|
+ for (ui = 0; ui < seed_stmts.length (); ui++)
|
|
+ {
|
|
+ if (!gimple_vdef (seed_stmts[ui]))
|
|
+ return NULL;
|
|
+ tree lhs = gimple_assign_lhs (seed_stmts[ui]);
|
|
+ unsigned num_index = 0;
|
|
+ while (TREE_CODE (lhs) == ARRAY_REF)
|
|
+ {
|
|
+ if (TREE_CODE (TREE_OPERAND (lhs, 1)) == SSA_NAME)
|
|
+ {
|
|
+ num_index++;
|
|
+ if (num_index > 1)
|
|
+ return NULL;
|
|
+ if (index == NULL)
|
|
+ {
|
|
+ index = TREE_OPERAND (lhs, 1);
|
|
+ found_index = true;
|
|
+ }
|
|
+ else if (index != TREE_OPERAND (lhs, 1))
|
|
+ return NULL;
|
|
+ }
|
|
+ lhs = TREE_OPERAND (lhs, 0);
|
|
+ }
|
|
+ if (!found_index)
|
|
+ return NULL;
|
|
+ }
|
|
+ return index;
|
|
+}
|
|
+
|
|
+/* Check if expression of phi is an increment of a const. */
|
|
+
|
|
+static void
|
|
+check_phi_inc (struct vertex *v_phi, struct graph *rdg, bool &found_inc)
|
|
+{
|
|
+ struct graph_edge *e_phi;
|
|
+ for (e_phi = v_phi->succ; e_phi; e_phi = e_phi->succ_next)
|
|
+ {
|
|
+ struct vertex *v_inc = &(rdg->vertices[e_phi->dest]);
|
|
+ if (!is_gimple_assign (RDGV_STMT (v_inc))
|
|
+ || gimple_expr_code (RDGV_STMT (v_inc)) != PLUS_EXPR)
|
|
+ continue;
|
|
+ tree rhs1 = gimple_assign_rhs1 (RDGV_STMT (v_inc));
|
|
+ tree rhs2 = gimple_assign_rhs2 (RDGV_STMT (v_inc));
|
|
+ if (!(integer_onep (rhs1) || integer_onep (rhs2)))
|
|
+ continue;
|
|
+ struct graph_edge *e_inc;
|
|
+ /* Find a cycle with only two vertices inc and phi: inc <--> phi. */
|
|
+ bool found_cycle = false;
|
|
+ for (e_inc = v_inc->succ; e_inc; e_inc = e_inc->succ_next)
|
|
+ {
|
|
+ if (e_inc->dest == e_phi->src)
|
|
+ {
|
|
+ found_cycle = true;
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ if (!found_cycle)
|
|
+ continue;
|
|
+ found_inc = true;
|
|
+ }
|
|
+}
|
|
+
|
|
+/* Check if phi satisfies form like PHI <0, i>. */
|
|
+
|
|
+static inline bool
|
|
+iv_check_phi_stmt (gimple *phi_stmt)
|
|
+{
|
|
+ return gimple_phi_num_args (phi_stmt) == 2
|
|
+ && (integer_zerop (gimple_phi_arg_def (phi_stmt, 0))
|
|
+ || integer_zerop (gimple_phi_arg_def (phi_stmt, 1)));
|
|
+}
|
|
+
|
|
+/* Make sure the iteration variable is a PHI. */
|
|
+
|
|
+static tree
|
|
+get_iv_from_seed (struct graph *flow_only_rdg, vec<gimple *> seed_stmts)
|
|
+{
|
|
+ tree index = find_index (seed_stmts);
|
|
+ if (index == NULL)
|
|
+ return NULL;
|
|
+ for (int i = 0; i < flow_only_rdg->n_vertices; i++)
|
|
+ {
|
|
+ struct vertex *v = &(flow_only_rdg->vertices[i]);
|
|
+ if (RDGV_STMT (v) != seed_stmts[0])
|
|
+ continue;
|
|
+ struct graph_edge *e;
|
|
+ bool found_phi = false;
|
|
+ for (e = v->pred; e; e = e->pred_next)
|
|
+ {
|
|
+ struct vertex *v_phi = &(flow_only_rdg->vertices[e->src]);
|
|
+ gimple *phi_stmt = RDGV_STMT (v_phi);
|
|
+ if (gimple_code (phi_stmt) != GIMPLE_PHI
|
|
+ || gimple_phi_result (phi_stmt) != index)
|
|
+ continue;
|
|
+ if (!iv_check_phi_stmt (phi_stmt))
|
|
+ return NULL;
|
|
+ /* Find the increment expr among the successors of the PHI. */
|
|
+ bool found_inc = false;
|
|
+ check_phi_inc (v_phi, flow_only_rdg, found_inc);
|
|
+ if (!found_inc)
|
|
+ return NULL;
|
|
+ found_phi = true;
|
|
+ break;
|
|
+ }
|
|
+ if (!found_phi)
|
|
+ return NULL;
|
|
+ break;
|
|
+ }
|
|
+ return index;
|
|
+}
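For illustration, a source shape that satisfies both of the checks above might look like the sketch below (an invented example, not taken from the patch or its testsuite): every seed store uses the same single variable array index, and that index is a plain counter that starts at 0 and is incremented by 1.

/* Illustrative sketch, not part of the patch. Both stores share the single
   index i, whose GIMPLE form is roughly i = PHI <0, i + 1>.  */
void
seed_example (unsigned out0[4], unsigned out1[4],
              const unsigned char *p, const unsigned char *q)
{
  for (int i = 0; i < 4; i++)
    {
      out0[i] = p[i] + q[i];   /* seed store 1, index i */
      out1[i] = p[i] - q[i];   /* seed store 2, same index i */
    }
}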
|
|
+
|
|
+/* Do not distribute the loop if vertices in ROOT_MAP have an antidependence within
|
|
+ FLOW_ONLY_RDG. */
|
|
+
|
|
+static bool
|
|
+check_no_dependency (struct graph *flow_only_rdg, bitmap root_map)
|
|
+{
|
|
+ bitmap_iterator bi;
|
|
+ unsigned ui;
|
|
+ auto_vec<unsigned, 16> visited_nodes;
|
|
+ auto_bitmap visited_map;
|
|
+ EXECUTE_IF_SET_IN_BITMAP (root_map, 0, ui, bi)
|
|
+ visited_nodes.safe_push (ui);
|
|
+ for (ui = 0; ui < visited_nodes.length (); ui++)
|
|
+ {
|
|
+ struct vertex *v = &(flow_only_rdg->vertices[visited_nodes[ui]]);
|
|
+ struct graph_edge *e;
|
|
+ for (e = v->succ; e; e = e->succ_next)
|
|
+ {
|
|
+ if (bitmap_bit_p (root_map, e->dest))
|
|
+ return false;
|
|
+ if (bitmap_bit_p (visited_map, e->dest))
|
|
+ continue;
|
|
+ visited_nodes.safe_push (e->dest);
|
|
+ bitmap_set_bit (visited_map, e->dest);
|
|
+ }
|
|
+ }
|
|
+ return true;
|
|
+}
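As a sketch of what this check rejects (an invented example, not from the patch): if one candidate cut point feeds another through memory, the two statements cannot be placed into independent partitions, so no temp arrays are inserted.

/* Illustrative sketch, not part of the patch: the second statement reads
   a[i], which the first statement just stored, so the two candidate cut
   points depend on each other and the loop is not distributed.  */
void
dependent_candidates (int *a, int *b, int n)
{
  for (int i = 0; i < n; i++)
    {
      a[i] = b[i] + 1;   /* candidate cut point 1 */
      b[i] = a[i] * 2;   /* candidate cut point 2, reads a[i] */
    }
}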
|
|
+
|
|
+/* Find isomorphic stmts from GROUPED_LOADS in VINFO and make sure
|
|
+ there is no dependency among the STMTs we found. */
|
|
+
|
|
+static unsigned
|
|
+get_cut_points (struct graph *flow_only_rdg, bitmap cut_points,
|
|
+ loop_vec_info vinfo)
|
|
+{
|
|
+ unsigned n_stmts = 0;
|
|
+
|
|
+ /* STMTS that may be CUT_POINTS. */
|
|
+ auto_vec<gimple *> stmts;
|
|
+ if (!find_isomorphic_stmts (vinfo, stmts))
|
|
+ {
|
|
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
|
+ fprintf (dump_file, "No temp array insertion: no isomorphic stmts"
|
|
+ " were found.\n");
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ for (int i = 0; i < flow_only_rdg->n_vertices; i++)
|
|
+ {
|
|
+ if (stmts.contains (RDG_STMT (flow_only_rdg, i)))
|
|
+ bitmap_set_bit (cut_points, i);
|
|
+ }
|
|
+ n_stmts = bitmap_count_bits (cut_points);
|
|
+
|
|
+ bool succ = check_no_dependency (flow_only_rdg, cut_points);
|
|
+ if (!succ)
|
|
+ {
|
|
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
|
+ fprintf (dump_file, "No temp array inserted: data dependency"
|
|
+ " among isomorphic stmts.\n");
|
|
+ return 0;
|
|
+ }
|
|
+ return n_stmts;
|
|
+}
|
|
+
|
|
+static void
|
|
+build_temp_array (struct vertex *v, gimple_stmt_iterator &gsi,
|
|
+ poly_uint64 array_extent, tree iv,
|
|
+ hash_set<tree> *tmp_array_vars, vec<gimple *> *transformed)
|
|
+{
|
|
+ gimple *stmt = RDGV_STMT (v);
|
|
+ tree lhs = gimple_assign_lhs (stmt);
|
|
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
|
+ {
|
|
+ fprintf (dump_file, "original stmt:\t");
|
|
+ print_gimple_stmt (dump_file, stmt, 0, TDF_VOPS|TDF_MEMSYMS);
|
|
+ }
|
|
+ tree var_ssa = duplicate_ssa_name (lhs, stmt);
|
|
+ gimple_assign_set_lhs (stmt, var_ssa);
|
|
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
|
+ {
|
|
+ fprintf (dump_file, "changed to:\t");
|
|
+ print_gimple_stmt (dump_file, stmt, 0, TDF_VOPS | TDF_MEMSYMS);
|
|
+ }
|
|
+ gimple_set_uid (gsi_stmt (gsi), -1);
|
|
+ tree vect_elt_type = TREE_TYPE (lhs);
|
|
+ tree array_type = build_array_type_nelts (vect_elt_type, array_extent);
|
|
+ tree array = create_tmp_var (array_type);
|
|
+ tree array_ssa = build4 (ARRAY_REF, vect_elt_type, array, iv, NULL, NULL);
|
|
+ tmp_array_vars->add (array_ssa);
|
|
+ gimple *store = gimple_build_assign (array_ssa, var_ssa);
|
|
+ tree new_vdef = make_ssa_name (gimple_vop (cfun), store);
|
|
+ gsi_insert_after (&gsi, store, GSI_NEW_STMT);
|
|
+ gimple_set_vdef (store, new_vdef);
|
|
+ transformed->safe_push (store);
|
|
+ gimple_set_uid (gsi_stmt (gsi), -1);
|
|
+ tree array_ssa2 = build4 (ARRAY_REF, vect_elt_type, array, iv, NULL, NULL);
|
|
+ tmp_array_vars->add (array_ssa2);
|
|
+ gimple *load = gimple_build_assign (lhs, array_ssa2);
|
|
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
|
+ {
|
|
+ fprintf (dump_file, "insert stmt:\t");
|
|
+ print_gimple_stmt (dump_file, store, 0, TDF_VOPS|TDF_MEMSYMS);
|
|
+ fprintf (dump_file, " and stmt:\t");
|
|
+ print_gimple_stmt (dump_file, load, 0, TDF_VOPS|TDF_MEMSYMS);
|
|
+ }
|
|
+ gimple_set_vuse (load, new_vdef);
|
|
+ gsi_insert_after (&gsi, load, GSI_NEW_STMT);
|
|
+ gimple_set_uid (gsi_stmt (gsi), -1);
|
|
+}
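At the source level, the effect of build_temp_array on one isomorphic operation can be pictured roughly as in the sketch below (hand-written, with invented names; not output of the pass): the result of the operation is stored into a per-iteration slot of a temporary array and immediately reloaded, so that the definition (the producer) and its uses (the consumer) can later be distributed into separate loops.

/* Illustrative sketch, not part of the patch; foo, use and tmp are invented.  */

extern int foo (int);
extern void use (int);

/* Before temp-array insertion.  */
void
before (const int *arg, int n)
{
  for (int i = 0; i < n; i++)
    use (foo (arg[i]));
}

/* After temp-array insertion: the result of foo is stored to a
   per-iteration slot of tmp[] (the producer) and reloaded from it
   (the consumer).  */
void
after (const int *arg, int n, int *tmp)
{
  for (int i = 0; i < n; i++)
    {
      int t = foo (arg[i]);
      tmp[i] = t;      /* producer */
      use (tmp[i]);    /* consumer */
    }
}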
|
|
+
|
|
+/* Set bitmap PRODUCERS based on vec TRANSFORMED. */
|
|
+
|
|
+void
|
|
+loop_distribution::build_producers (loop_p loop, bitmap producers,
|
|
+ vec<gimple *> &transformed)
|
|
+{
|
|
+ auto_vec<gimple *, 10> stmts;
|
|
+ stmts_from_loop (loop, &stmts);
|
|
+ int i = 0;
|
|
+ gimple *stmt = NULL;
|
|
+
|
|
+ FOR_EACH_VEC_ELT (stmts, i, stmt)
|
|
+ gimple_set_uid (stmt, i);
|
|
+ i = 0;
|
|
+ FOR_EACH_VEC_ELT (transformed, i, stmt)
|
|
+ bitmap_set_bit (producers, stmt->uid);
|
|
+}
|
|
+
|
|
+/* Transform stmt
|
|
+
|
|
+ A = FOO (ARG_1);
|
|
+
|
|
+ to
|
|
+
|
|
+ STMT_1: A1 = FOO (ARG_1);
|
|
+ STMT_2: X[I] = A1;
|
|
+ STMT_3: A = X[I];
|
|
+
|
|
+ Producer is STMT_2 who defines the temp array and consumer is
|
|
+ STMT_3 who uses the temp array. */
|
|
+
|
|
+void
|
|
+loop_distribution::do_insertion (loop_p loop, struct graph *flow_only_rdg,
|
|
+ tree iv, bitmap cut_points,
|
|
+ hash_set<tree> *tmp_array_vars,
|
|
+ bitmap producers)
|
|
+{
|
|
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
|
+ fprintf (dump_file, "=== do insertion ===\n");
|
|
+
|
|
+ auto_vec<gimple *> transformed;
|
|
+
|
|
+ /* Number of times the loop body executes (latch executions + 1). */
|
|
+ poly_uint64 array_extent
|
|
+ = tree_to_poly_uint64 (number_of_latch_executions (loop)) + 1;
|
|
+
|
|
+ basic_block *bbs = get_loop_body_in_custom_order (loop, this,
|
|
+ bb_top_order_cmp_r);
|
|
+
|
|
+ for (int i = 0; i < int (loop->num_nodes); i++)
|
|
+ {
|
|
+ basic_block bb = bbs[i];
|
|
+
|
|
+ /* Find all cut points in bb and transform them. */
|
|
+ for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
|
|
+ gsi_next (&gsi))
|
|
+ {
|
|
+ unsigned j = gimple_uid (gsi_stmt (gsi));
|
|
+ if (bitmap_bit_p (cut_points, j))
|
|
+ {
|
|
+ struct vertex *v = &(flow_only_rdg->vertices[j]);
|
|
+ build_temp_array (v, gsi, array_extent, iv, tmp_array_vars,
|
|
+ &transformed);
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ build_producers (loop, producers, transformed);
|
|
+ update_ssa (TODO_update_ssa);
|
|
+ free (bbs);
|
|
+}
|
|
+
|
|
+/* After temp array insertion, given stmts
|
|
+ STMT_1: M = FOO (ARG_1);
|
|
+ STMT_2: X[I] = M;
|
|
+ STMT_3: A = X[I];
|
|
+ STMT_2 is the producer, STMT_1 is its prev and STMT_3 is its next.
|
|
+ Replace M with A, and remove STMT_2 and STMT_3. */
|
|
+
|
|
+static void
|
|
+reset_gimple_assign (struct graph *flow_only_rdg, struct partition *partition,
|
|
+ gimple_stmt_iterator &gsi, int j)
|
|
+{
|
|
+ struct vertex *v = &(flow_only_rdg->vertices[j]);
|
|
+ gimple *stmt = RDGV_STMT (v);
|
|
+ gimple *prev = stmt->prev;
|
|
+ gimple *next = stmt->next;
|
|
+ tree n_lhs = gimple_assign_lhs (next);
|
|
+ gimple_assign_set_lhs (prev, n_lhs);
|
|
+ unlink_stmt_vdef (stmt);
|
|
+ if (partition)
|
|
+ bitmap_clear_bit (partition->stmts, gimple_uid (gsi_stmt (gsi)));
|
|
+ gsi_remove (&gsi, true);
|
|
+ release_defs (stmt);
|
|
+ if (partition)
|
|
+ bitmap_clear_bit (partition->stmts, gimple_uid (gsi_stmt (gsi)));
|
|
+ gsi_remove (&gsi, true);
|
|
+}
|
|
+
|
|
+void
|
|
+loop_distribution::remove_insertion (loop_p loop, struct graph *flow_only_rdg,
|
|
+ bitmap producers, struct partition *partition)
|
|
+{
|
|
+ basic_block *bbs = get_loop_body_in_custom_order (loop, this,
|
|
+ bb_top_order_cmp_r);
|
|
+ for (int i = 0; i < int (loop->num_nodes); i++)
|
|
+ {
|
|
+ basic_block bb = bbs[i];
|
|
+ for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
|
|
+ gsi_next (&gsi))
|
|
+ {
|
|
+ unsigned j = gimple_uid (gsi_stmt (gsi));
|
|
+ if (bitmap_bit_p (producers, j))
|
|
+ reset_gimple_assign (flow_only_rdg, partition, gsi, j);
|
|
+ }
|
|
+ }
|
|
+ update_ssa (TODO_update_ssa);
|
|
+ free (bbs);
|
|
+}
|
|
+
|
|
+/* Insert temp arrays if isomorphic computation exists. Temp arrays will be
|
|
+ regarded as SEED_STMTS for building partitions in succeeding processes. */
|
|
+
|
|
+bool
|
|
+loop_distribution::insert_temp_arrays (loop_p loop, vec<gimple *> seed_stmts,
|
|
+ hash_set<tree> *tmp_array_vars, bitmap producers)
|
|
+{
|
|
+ struct graph *flow_only_rdg = build_rdg (loop, NULL);
|
|
+ gcc_checking_assert (flow_only_rdg != NULL);
|
|
+ tree iv = get_iv_from_seed (flow_only_rdg, seed_stmts);
|
|
+ if (iv == NULL)
|
|
+ {
|
|
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
|
+ fprintf (dump_file, "Loop %d no temp array insertion: failed to get"
|
|
+ " iteration variable.\n", loop->num);
|
|
+ free_rdg (flow_only_rdg);
|
|
+ return false;
|
|
+ }
|
|
+ auto_bitmap cut_points;
|
|
+ loop_vec_info vinfo = loop_vec_info_for_loop (loop);
|
|
+ unsigned n_cut_points = get_cut_points (flow_only_rdg, cut_points, vinfo);
|
|
+ delete vinfo;
|
|
+ loop->aux = NULL;
|
|
+ if (n_cut_points == 0)
|
|
+ {
|
|
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
|
+ fprintf (dump_file, "Loop %d no temp array insertion: no cut points"
|
|
+ " found.\n", loop->num);
|
|
+ free_rdg (flow_only_rdg);
|
|
+ return false;
|
|
+ }
|
|
+ do_insertion (loop, flow_only_rdg, iv, cut_points, tmp_array_vars, producers);
|
|
+ if (dump_enabled_p ())
|
|
+ {
|
|
+ dump_user_location_t loc = find_loop_location (loop);
|
|
+ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc, "Insertion done:"
|
|
+ " %d temp arrays inserted in Loop %d.\n",
|
|
+ n_cut_points, loop->num);
|
|
+ }
|
|
+ free_rdg (flow_only_rdg);
|
|
+ return true;
|
|
+}
|
|
+
|
|
+static bool find_seed_stmts_for_distribution (class loop *, vec<gimple *> *);
|
|
+
|
|
/* Distributes the code from LOOP in such a way that producer statements
|
|
are placed before consumer statements. Tries to separate only the
|
|
statements from STMTS into separate loops. Returns the number of
|
|
@@ -2972,7 +4342,7 @@ loop_distribution::finalize_partitions (class loop *loop,
|
|
|
|
int
|
|
loop_distribution::distribute_loop (class loop *loop,
|
|
- const vec<gimple *> &stmts,
|
|
+ vec<gimple *> &stmts,
|
|
control_dependences *cd, int *nb_calls, bool *destroy_p,
|
|
bool only_patterns_p)
|
|
{
|
|
@@ -3021,6 +4391,33 @@ loop_distribution::distribute_loop (class loop *loop,
|
|
return 0;
|
|
}
|
|
|
|
+ /* Try to distribute LOOP if LOOP is simple enough but cannot be vectorized.
|
|
+ If LOOP has grouped loads, recursively find isomorphic stmts and insert
|
|
+ temp arrays, rebuild RDG and call find_seed_stmts_for_distribution
|
|
+ to replace STMTS. */
|
|
+
|
|
+ hash_set<tree> tmp_array_vars;
|
|
+
|
|
+ /* STMTs that define those inserted TMP_ARRAYs. */
|
|
+ auto_bitmap producers;
|
|
+
|
|
+ /* New SEED_STMTS after insertion. */
|
|
+ auto_vec<gimple *> work_list;
|
|
+ bool insert_success = false;
|
|
+ if (may_insert_temp_arrays (loop, rdg, cd))
|
|
+ {
|
|
+ if (insert_temp_arrays (loop, stmts, &tmp_array_vars, producers))
|
|
+ {
|
|
+ if (find_seed_stmts_for_distribution (loop, &work_list))
|
|
+ {
|
|
+ insert_success = true;
|
|
+ }
|
|
+ else
|
|
+ remove_insertion (loop, rdg, producers, NULL);
|
|
+ rebuild_rdg (loop, rdg, cd);
|
|
+ }
|
|
+ }
|
|
+
|
|
data_reference_p dref;
|
|
for (i = 0; datarefs_vec.iterate (i, &dref); ++i)
|
|
dref->aux = (void *) (uintptr_t) i;
|
|
@@ -3029,7 +4426,10 @@ loop_distribution::distribute_loop (class loop *loop,
|
|
dump_rdg (dump_file, rdg);
|
|
|
|
auto_vec<struct partition *, 3> partitions;
|
|
- rdg_build_partitions (rdg, stmts, &partitions);
|
|
+ if (work_list.length () > stmts.length ())
|
|
+ rdg_build_partitions (rdg, &work_list, &partitions);
|
|
+ else
|
|
+ rdg_build_partitions (rdg, &stmts, &partitions);
|
|
|
|
auto_vec<ddr_p> alias_ddrs;
|
|
|
|
@@ -3101,7 +4501,7 @@ loop_distribution::distribute_loop (class loop *loop,
|
|
for (int j = i + 1;
|
|
partitions.iterate (j, &partition); ++j)
|
|
{
|
|
- if (share_memory_accesses (rdg, into, partition))
|
|
+ if (share_memory_accesses (rdg, into, partition, &tmp_array_vars))
|
|
{
|
|
partition_merge_into (rdg, into, partition, FUSE_SHARE_REF);
|
|
partitions.unordered_remove (j);
|
|
@@ -3151,7 +4551,7 @@ loop_distribution::distribute_loop (class loop *loop,
|
|
}
|
|
}
|
|
|
|
- finalize_partitions (loop, &partitions, &alias_ddrs);
|
|
+ finalize_partitions (loop, &partitions, &alias_ddrs, producers);
|
|
|
|
/* If there is a reduction in all partitions make sure the last one
|
|
is not classified for builtin code generation. */
|
|
@@ -3169,6 +4569,24 @@ loop_distribution::distribute_loop (class loop *loop,
|
|
}
|
|
|
|
nbp = partitions.length ();
|
|
+
|
|
+ /* If we have inserted TMP_ARRAYs but only one partition is left in
+ the succeeding processes, revert the insertion and restore the stmts
+ to their original form. */
|
|
+
|
|
+ if (nbp == 1 && insert_success)
|
|
+ {
|
|
+ struct partition *partition = NULL;
|
|
+ partitions.iterate (0, &partition);
|
|
+ remove_insertion (loop, rdg, producers, partition);
|
|
+ if (dump_enabled_p ())
|
|
+ {
|
|
+ dump_user_location_t loc = find_loop_location (loop);
|
|
+ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc, "Insertion removed:"
|
|
+ " unable to distribute loop %d.\n", loop->num);
|
|
+ }
|
|
+ }
|
|
+
|
|
if (nbp == 0
|
|
|| (nbp == 1 && !partition_builtin_p (partitions[0]))
|
|
|| (nbp > 1 && partition_contains_all_rw (rdg, partitions)))
|
|
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
|
|
index 04e68f621..aae7f62f3 100644
|
|
--- a/gcc/tree-vect-data-refs.cc
|
|
+++ b/gcc/tree-vect-data-refs.cc
|
|
@@ -2791,6 +2791,9 @@ vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
|
|
DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
|
|
|
|
DR_GROUP_SIZE (stmt_info) = groupsize;
|
|
+
|
|
+ DR_GROUP_SLP_TRANSPOSE (stmt_info) = false;
|
|
+
|
|
if (dump_enabled_p ())
|
|
{
|
|
dump_printf_loc (MSG_NOTE, vect_location,
|
|
@@ -2820,6 +2823,20 @@ vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
|
|
DR_GROUP_GAP (stmt_info));
|
|
}
|
|
|
|
+ /* SLP: create an SLP data structure for every interleaving group of
|
|
+ loads for further analysis in vect_analyse_slp. */
|
|
+ if (DR_IS_READ (dr) && !slp_impossible)
|
|
+ {
|
|
+ if (loop_vinfo)
|
|
+ {
|
|
+ LOOP_VINFO_GROUPED_LOADS (loop_vinfo).safe_push (stmt_info);
|
|
+ }
|
|
+ if (bb_vinfo)
|
|
+ {
|
|
+ BB_VINFO_GROUPED_LOADS (bb_vinfo).safe_push (stmt_info);
|
|
+ }
|
|
+ }
|
|
+
|
|
/* SLP: create an SLP data structure for every interleaving group of
|
|
stores for further analysis in vect_analyse_slp. */
|
|
if (DR_IS_WRITE (dr) && !slp_impossible)
|
|
@@ -5636,6 +5653,226 @@ vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
|
|
}
|
|
}
|
|
|
|
+/* Encode PERM_MASK_HIGH_FIRST and PERM_MASK_LOW_FIRST for the first stage. */
|
|
+
|
|
+static void
|
|
+vect_indices_encoding_first (tree vectype, unsigned int array_num,
|
|
+ tree &perm_mask_high_first,
|
|
+ tree &perm_mask_low_first)
|
|
+{
|
|
+ unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
|
|
+ vec_perm_builder sel (nelt, nelt, 1);
|
|
+ sel.quick_grow (nelt);
|
|
+ unsigned int group_num = nelt / array_num;
|
|
+ unsigned int index = 0;
|
|
+ unsigned int array = 0;
|
|
+ unsigned int group = 0;
|
|
+
|
|
+ /* The encoding has 1 pattern in the first stage. */
|
|
+ for (array = 0; array < array_num / 2; array++)
|
|
+ {
|
|
+ for (group = 0; group < group_num * 2; group++)
|
|
+ {
|
|
+ sel[index++] = array + array_num * group;
|
|
+ }
|
|
+ }
|
|
+ vec_perm_indices indices (sel, 2, nelt);
|
|
+ perm_mask_high_first = vect_gen_perm_mask_checked (vectype, indices);
|
|
+
|
|
+ index = 0;
|
|
+ for (array = array_num / 2; array < array_num; array++)
|
|
+ {
|
|
+ for (group = 0; group < group_num * 2; group++)
|
|
+ {
|
|
+ sel[index++] = array + array_num * group;
|
|
+ }
|
|
+ }
|
|
+ indices.new_vector (sel, 2, nelt);
|
|
+ perm_mask_low_first = vect_gen_perm_mask_checked (vectype, indices);
|
|
+}
|
|
+
|
|
+/* Encode PERM_MASK_HIGH and PERM_MASK_LOW for the following stages. */
|
|
+
|
|
+static void
|
|
+vect_indices_encoding (tree vectype, unsigned int array_num,
|
|
+ tree &perm_mask_high, tree &perm_mask_low)
|
|
+{
|
|
+ unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
|
|
+ vec_perm_builder sel (nelt, nelt, 1);
|
|
+ sel.quick_grow (nelt);
|
|
+ unsigned int group_num = nelt / array_num;
|
|
+ unsigned int index = 0;
|
|
+ unsigned int array = 0;
|
|
+ unsigned int group = 0;
|
|
+
|
|
+ /* The encoding has 2 patterns in the following stages. */
|
|
+ for (array = 0; array < array_num / 2; array++)
|
|
+ {
|
|
+ for (group = 0; group < group_num; group++)
|
|
+ {
|
|
+ sel[index++] = group + group_num * array;
|
|
+ }
|
|
+ for (group = 0; group < group_num; group++)
|
|
+ {
|
|
+ sel[index++] = nelt + group + group_num * array;
|
|
+ }
|
|
+ }
|
|
+ vec_perm_indices indices (sel, 2, nelt);
|
|
+ perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
|
|
+
|
|
+ index = 0;
|
|
+ for (array = array_num / 2; array < array_num; array++)
|
|
+ {
|
|
+ for (group = 0; group < group_num; group++)
|
|
+ {
|
|
+ sel[index++] = group + group_num * array;
|
|
+ }
|
|
+ for (group = 0; group < group_num; group++)
|
|
+ {
|
|
+ sel[index++] = nelt + group + group_num * array;
|
|
+ }
|
|
+ }
|
|
+ indices.new_vector (sel, 2, nelt);
|
|
+ perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
|
|
+}
|
|
+
|
|
+/* Function vect_transpose_store_chain.
|
|
+
|
|
+ Given a chain of interleaved stores in DR_CHAIN of LENGTH (a power of 2)
+ and ARRAY_NUM. Generate interleave_high/low stmts to reorder
|
|
+ the data correctly for the stores. Return the final references for stores
|
|
+ in RESULT_CHAIN. This function is similar to vect_permute_store_chain (),
|
|
+ we interleave the contents of the vectors in their order.
|
|
+
|
|
+ E.g., LENGTH is 4, the scalar type is short (i.e., VF is 8) and ARRAY_NUM
|
|
+ is 4. That is, the input is 4 vectors each containing 8 elements.
|
|
+ Every 2 (VF / ARRAY_NUM) of the 8 elements come from the same array. We interleave
|
|
+ the contents of the four vectors in their order. We assign a number to each
|
|
+ element, the input sequence is:
|
|
+
|
|
+ 1st vec: 0 1 2 3 4 5 6 7
|
|
+ 2nd vec: 8 9 10 11 12 13 14 15
|
|
+ 3rd vec: 16 17 18 19 20 21 22 23
|
|
+ 4th vec: 24 25 26 27 28 29 30 31
|
|
+
|
|
+ The output sequence should be:
|
|
+
|
|
+ 1st vec: 0 4 8 12 16 20 24 28
|
|
+ 2nd vec: 1 5 9 13 17 21 25 29
|
|
+ 3rd vec: 2 6 10 14 18 22 26 30
|
|
+ 4th vec: 3 7 11 15 19 23 27 31
|
|
+
|
|
+ In our example,
|
|
+ We get 2 (VF / ARRAY_NUM) elements together in every vector.
|
|
+
|
|
+ I1: 0 4 1 5 2 6 3 7
|
|
+ I2: 8 12 9 13 10 14 11 15
|
|
+ I3: 16 20 17 21 18 22 19 23
|
|
+ I4: 24 28 25 29 26 30 27 31
|
|
+
|
|
+ Then, we use interleave_high/low instructions to create such output.
|
|
+ Every 2 (VF / ARRAY_NUM) elements are regarded as a whole. The permutation
|
|
+ is done in log LENGTH stages.
|
|
+
|
|
+ I1: interleave_high (1st vec, 3rd vec)
|
|
+ I2: interleave_low (1st vec, 3rd vec)
|
|
+ I3: interleave_high (2nd vec, 4th vec)
|
|
+ I4: interleave_low (2nd vec, 4th vec)
|
|
+
|
|
+ The first stage of the sequence should be:
|
|
+
|
|
+ I1: 0 4 16 20 1 5 17 21
|
|
+ I2: 2 6 18 22 3 7 19 23
|
|
+ I3: 8 12 24 28 9 13 25 29
|
|
+ I4: 10 14 26 30 11 15 27 31
|
|
+
|
|
+ After the following stage the sequence, i.e. the final result, is:
|
|
+
|
|
+ I1: 0 4 8 12 16 20 24 28
|
|
+ I2: 1 5 9 13 17 21 25 29
|
|
+ I3: 2 6 10 14 18 22 26 30
|
|
+ I4: 3 7 11 15 19 23 27 31. */
|
|
+
|
|
+void
|
|
+vect_transpose_store_chain (vec_info *vinfo, vec<tree> dr_chain,
|
|
+ unsigned int length, unsigned int array_num,
|
|
+ stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
|
|
+ vec<tree> *result_chain)
|
|
+{
|
|
+ gimple *perm_stmt = NULL;
|
|
+ tree vectype = STMT_VINFO_VECTYPE (stmt_info);
|
|
+ tree perm_mask_low_first = NULL;
|
|
+ tree perm_mask_high_first = NULL;
|
|
+ tree perm_mask_low = NULL;
|
|
+ tree perm_mask_high = NULL;
|
|
+ unsigned int log_length = exact_log2 (length);
|
|
+
|
|
+ /* Only power of 2 is supported. */
|
|
+ gcc_assert (pow2p_hwi (length));
|
|
+
|
|
+ /* The encoding has 2 types, one for the grouped pattern in the first stage,
|
|
+ another for the interleaved patterns in the following stages. */
|
|
+ gcc_assert (array_num != 0);
|
|
+
|
|
+ /* Create grouped stmt (in the first stage):
|
|
+ group = nelt / array_num;
|
|
+ high_first = VEC_PERM_EXPR <vect1, vect2,
|
|
+ {0, array_num, 2*array_num, ..., (2*group-1)*array_num,
|
|
+ 1, 1+array_num, 1+2*array_num, ..., 1+(2*group-1)*array_num,
|
|
+ ...,
|
|
+ array_num/2-1, (array_num/2-1)+array_num, ...,
|
|
+ (array_num/2-1)+(2*group-1)*array_num}>
|
|
+ low_first = VEC_PERM_EXPR <vect1, vect2,
|
|
+ {array_num/2, array_num/2+array_num, array_num/2+2*array_num,
|
|
+ ..., array_num/2+(2*group-1)*array_num,
|
|
+ array_num/2+1, array_num/2+1+array_num,
|
|
+ ..., array_num/2+1+(2*group-1)*array_num,
|
|
+ ...,
|
|
+ array_num-1, array_num-1+array_num,
|
|
+ ..., array_num-1+(2*group-1)*array_num}> */
|
|
+ vect_indices_encoding_first (vectype, array_num, perm_mask_high_first,
|
|
+ perm_mask_low_first);
|
|
+
|
|
+ /* Create interleaving stmt (in the following stages):
|
|
+ high = VEC_PERM_EXPR <vect1, vect2, {0, 1, ..., group-1,
|
|
+ nelt, nelt+1, ..., nelt+group-1,
|
|
+ group, group+1, ..., 2*group-1,
|
|
+ nelt+group, nelt+group+1, ..., nelt+2*group-1,
|
|
+ ...}>
|
|
+ low = VEC_PERM_EXPR <vect1, vect2,
|
|
+ {nelt/2, nelt/2+1, ..., nelt/2+group-1,
|
|
+ nelt*3/2, nelt*3/2+1, ..., nelt*3/2+group-1,
|
|
+ nelt/2+group, nelt/2+group+1, ..., nelt/2+2*group-1,
|
|
+ nelt*3/2+group, nelt*3/2+group+1, ..., nelt*3/2+2*group-1,
|
|
+ ...}> */
|
|
+ vect_indices_encoding (vectype, array_num, perm_mask_high, perm_mask_low);
|
|
+
|
|
+ for (unsigned int perm_time = 0; perm_time < log_length; perm_time++)
|
|
+ {
|
|
+ for (unsigned int index = 0; index < length / 2; index++)
|
|
+ {
|
|
+ tree vect1 = dr_chain[index];
|
|
+ tree vect2 = dr_chain[index + length / 2];
|
|
+
|
|
+ tree high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
|
|
+ perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1, vect2,
|
|
+ perm_time == 0 ? perm_mask_high_first
|
|
+ : perm_mask_high);
|
|
+ vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
|
|
+ (*result_chain)[2 * index] = high;
|
|
+
|
|
+ tree low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
|
|
+ perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1, vect2,
|
|
+ perm_time == 0 ? perm_mask_low_first
|
|
+ : perm_mask_low);
|
|
+ vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
|
|
+ (*result_chain)[2 * index + 1] = low;
|
|
+ }
|
|
+ memcpy (dr_chain.address (), result_chain->address (),
|
|
+ length * sizeof (tree));
|
|
+ }
|
|
+}
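As a sanity check on the first-stage encoding, the small standalone program below (illustration only, not part of the patch) recomputes the selector that vect_indices_encoding_first builds for the worked example in the comment above, i.e. nelt = 8 and array_num = 4, hence group_num = 2.

#include <stdio.h>

int
main (void)
{
  unsigned nelt = 8, array_num = 4;
  unsigned group_num = nelt / array_num;   /* 2 */
  unsigned sel[8], index = 0;

  /* Same index computation as the first loop of
     vect_indices_encoding_first.  */
  for (unsigned array = 0; array < array_num / 2; array++)
    for (unsigned group = 0; group < group_num * 2; group++)
      sel[index++] = array + array_num * group;

  /* Prints 0 4 8 12 1 5 9 13. Selecting these positions from the
     concatenation of the 1st vector (elements 0..7) and the 3rd vector
     (elements 16..23) gives 0 4 16 20 1 5 17 21, the I1 row of the
     first stage shown in the comment above.  */
  for (unsigned i = 0; i < nelt; i++)
    printf ("%u ", sel[i]);
  printf ("\n");
  return 0;
}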
|
|
+
|
|
/* Function vect_setup_realignment
|
|
|
|
This function is called when vectorizing an unaligned load using
|
|
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
|
|
index 3435f9378..f296e9415 100644
|
|
--- a/gcc/tree-vect-loop.cc
|
|
+++ b/gcc/tree-vect-loop.cc
|
|
@@ -2856,7 +2856,7 @@ vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
|
|
loop_vec_info main_loop_vinfo,
|
|
const vector_modes &vector_modes, unsigned &mode_i,
|
|
machine_mode &autodetected_vector_mode,
|
|
- bool &fatal)
|
|
+ bool &fatal, bool result_only_p)
|
|
{
|
|
loop_vec_info loop_vinfo
|
|
= vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
|
|
@@ -2865,6 +2865,8 @@ vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
|
|
loop_vinfo->vector_mode = vector_mode;
|
|
unsigned int suggested_unroll_factor = 1;
|
|
|
|
+ /* Loop_vinfo for loop-distribution pass. */
|
|
+ opt_loop_vec_info fail_loop_vinfo = opt_loop_vec_info::success (NULL);
|
|
/* Run the main analysis. */
|
|
opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
|
|
&suggested_unroll_factor);
|
|
@@ -2933,7 +2935,21 @@ vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
|
|
|
|
if (!res)
|
|
{
|
|
- delete loop_vinfo;
|
|
+
|
|
+ /* If the current analysis shows LOOP cannot be vectorized, loop_vinfo
+ will be deleted. If LOOP is under ldist analysis, back it up before it
+ is deleted, and return it if all modes have been analyzed and the loop
+ still fails to vectorize. */
|
|
+ if (result_only_p && (mode_i == vector_modes.length ()
|
|
+ || autodetected_vector_mode == VOIDmode))
|
|
+ {
|
|
+ fail_loop_vinfo = opt_loop_vec_info::success (loop_vinfo);
|
|
+ loop->aux = (loop_vec_info) fail_loop_vinfo;
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ delete loop_vinfo;
|
|
+ }
|
|
if (fatal)
|
|
gcc_checking_assert (main_loop_vinfo == NULL);
|
|
return opt_loop_vec_info::propagate_failure (res);
|
|
@@ -2946,9 +2962,11 @@ vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
|
|
|
|
Apply a set of analyses on LOOP, and create a loop_vec_info struct
|
|
for it. The different analyses will record information in the
|
|
- loop_vec_info struct. */
|
|
+ loop_vec_info struct. When RESULT_ONLY_P is true, quit the analysis
+ once the loop is found vectorizable; otherwise, do not delete the vinfo. */
|
|
opt_loop_vec_info
|
|
-vect_analyze_loop (class loop *loop, vec_info_shared *shared)
|
|
+vect_analyze_loop (class loop *loop, vec_info_shared *shared,
|
|
+ bool result_only_p)
|
|
{
|
|
DUMP_VECT_SCOPE ("analyze_loop_nest");
|
|
|
|
@@ -2996,6 +3014,12 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
|
|
&& !unlimited_cost_model (loop));
|
|
machine_mode autodetected_vector_mode = VOIDmode;
|
|
opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
|
|
+ /* Loop_vinfo for loop-distribution pass. */
|
|
+ opt_loop_vec_info fail_loop_vinfo = opt_loop_vec_info::success (NULL);
|
|
+ if (result_only_p)
|
|
+ {
|
|
+ vect_slp_init ();
|
|
+ }
|
|
unsigned int mode_i = 0;
|
|
unsigned HOST_WIDE_INT simdlen = loop->simdlen;
|
|
|
|
@@ -3019,10 +3043,16 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
|
|
opt_loop_vec_info loop_vinfo
|
|
= vect_analyze_loop_1 (loop, shared, &loop_form_info,
|
|
NULL, vector_modes, mode_i,
|
|
- autodetected_vector_mode, fatal);
|
|
+ autodetected_vector_mode, fatal, result_only_p);
|
|
if (fatal)
|
|
break;
|
|
|
|
+ if (result_only_p && (mode_i == vector_modes.length ()
|
|
+ || autodetected_vector_mode == VOIDmode))
|
|
+ {
|
|
+ return loop_vinfo;
|
|
+ }
|
|
+
|
|
if (loop_vinfo)
|
|
{
|
|
/* Analyzis has been successful so update the VF value. The
|
|
@@ -3132,7 +3162,7 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
|
|
= vect_analyze_loop_1 (loop, shared, &loop_form_info,
|
|
first_loop_vinfo,
|
|
vector_modes, mode_i,
|
|
- autodetected_vector_mode, fatal);
|
|
+ autodetected_vector_mode, fatal, result_only_p);
|
|
if (fatal)
|
|
break;
|
|
|
|
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
|
|
index e1bcab0f7..c0c15773d 100644
|
|
--- a/gcc/tree-vect-patterns.cc
|
|
+++ b/gcc/tree-vect-patterns.cc
|
|
@@ -5632,8 +5632,8 @@ static vect_recog_func vect_vect_recog_func_ptrs[] = {
|
|
internal functions. */
|
|
{ vect_recog_gather_scatter_pattern, "gather_scatter" },
|
|
{ vect_recog_mask_conversion_pattern, "mask_conversion" },
|
|
- { vect_recog_widen_plus_pattern, "widen_plus" },
|
|
- { vect_recog_widen_minus_pattern, "widen_minus" },
|
|
+ // { vect_recog_widen_plus_pattern, "widen_plus" },
|
|
+ // { vect_recog_widen_minus_pattern, "widen_minus" },
|
|
};
|
|
|
|
const unsigned int NUM_PATTERNS = ARRAY_SIZE (vect_vect_recog_func_ptrs);
|
|
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
|
|
index af477c31a..6cbf8085f 100644
|
|
--- a/gcc/tree-vect-slp.cc
|
|
+++ b/gcc/tree-vect-slp.cc
|
|
@@ -49,6 +49,8 @@ along with GCC; see the file COPYING3. If not see
|
|
#include "tree-eh.h"
|
|
#include "tree-cfg.h"
|
|
#include "alloc-pool.h"
|
|
+#include "print-tree.h"
|
|
+#include "gimple-pretty-print.h"
|
|
|
|
static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
|
|
slp_tree, stmt_vector_for_cost *);
|
|
@@ -994,6 +996,21 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
|
|
}
|
|
|
|
gcc_assert (vectype);
|
|
+ if (!STMT_VINFO_VECTYPE (stmt_info))
|
|
+ STMT_VINFO_VECTYPE (stmt_info) = vectype;
|
|
+ if (dump_file)
|
|
+ {
|
|
+ fprintf (dump_file, "vect_build_slp_tree_1: %p\n", (void *) stmt_info);
|
|
+ print_gimple_stmt (dump_file, stmt, 0);
|
|
+ fprintf (dump_file, "vect_build_slp_tree_1: vectype=");
|
|
+ if (vectype)
|
|
+ print_generic_expr (dump_file, vectype);
|
|
+ fprintf (dump_file, "\n");
|
|
+ fprintf (dump_file, "internal vectype=");
|
|
+ if (STMT_VINFO_VECTYPE (stmt_info))
|
|
+ print_generic_expr (dump_file, STMT_VINFO_VECTYPE (stmt_info));
|
|
+ fprintf (dump_file, "\n");
|
|
+ }
|
|
|
|
gcall *call_stmt = dyn_cast <gcall *> (stmt);
|
|
if (call_stmt)
|
|
@@ -1575,10 +1592,10 @@ vect_build_slp_tree (vec_info *vinfo,
|
|
dump_printf_loc (MSG_NOTE, vect_location,
|
|
"SLP discovery for node %p succeeded\n", res);
|
|
gcc_assert (res_ == res);
|
|
- res->max_nunits = this_max_nunits;
|
|
+ res_->max_nunits = this_max_nunits;
|
|
vect_update_max_nunits (max_nunits, this_max_nunits);
|
|
/* Keep a reference for the bst_map use. */
|
|
- SLP_TREE_REF_COUNT (res)++;
|
|
+ SLP_TREE_REF_COUNT (res_)++;
|
|
}
|
|
return res_;
|
|
}
|
|
@@ -3190,8 +3207,10 @@ vect_build_slp_instance (vec_info *vinfo,
|
|
|
|
/* For basic block SLP, try to break the group up into multiples of
|
|
a vector size. */
|
|
+ bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
|
|
if (is_a <bb_vec_info> (vinfo)
|
|
- && (i > 1 && i < group_size))
|
|
+ && (i > 1 && i < group_size)
|
|
+ && !bb_vinfo->transposed)
|
|
{
|
|
tree scalar_type
|
|
= TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
|
|
@@ -3301,84 +3320,1034 @@ vect_analyze_slp_instance (vec_info *vinfo,
|
|
scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
|
|
while (next_info)
|
|
{
|
|
- scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
|
|
- next_info = DR_GROUP_NEXT_ELEMENT (next_info);
|
|
+ scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
|
|
+ next_info = DR_GROUP_NEXT_ELEMENT (next_info);
|
|
+ }
|
|
+ }
|
|
+ else if (kind == slp_inst_kind_reduc_chain)
|
|
+ {
|
|
+ /* Collect the reduction stmts and store them in scalar_stmts. */
|
|
+ scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
|
|
+ while (next_info)
|
|
+ {
|
|
+ scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
|
|
+ next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
|
|
+ }
|
|
+ /* Mark the first element of the reduction chain as reduction to properly
|
|
+ transform the node. In the reduction analysis phase only the last
|
|
+ element of the chain is marked as reduction. */
|
|
+ STMT_VINFO_DEF_TYPE (stmt_info)
|
|
+ = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
|
|
+ STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
|
|
+ = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
|
|
+ }
|
|
+ else if (kind == slp_inst_kind_ctor)
|
|
+ {
|
|
+ tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
|
|
+ tree val;
|
|
+ scalar_stmts.create (CONSTRUCTOR_NELTS (rhs));
|
|
+ FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
|
|
+ {
|
|
+ stmt_vec_info def_info = vinfo->lookup_def (val);
|
|
+ def_info = vect_stmt_to_vectorize (def_info);
|
|
+ scalar_stmts.quick_push (def_info);
|
|
+ }
|
|
+ if (dump_enabled_p ())
|
|
+ dump_printf_loc (MSG_NOTE, vect_location,
|
|
+ "Analyzing vectorizable constructor: %G\n",
|
|
+ stmt_info->stmt);
|
|
+ }
|
|
+ else if (kind == slp_inst_kind_reduc_group)
|
|
+ {
|
|
+ /* Collect reduction statements. */
|
|
+ const vec<stmt_vec_info> &reductions
|
|
+ = as_a <loop_vec_info> (vinfo)->reductions;
|
|
+ scalar_stmts.create (reductions.length ());
|
|
+ for (i = 0; reductions.iterate (i, &next_info); i++)
|
|
+ if ((STMT_VINFO_RELEVANT_P (next_info)
|
|
+ || STMT_VINFO_LIVE_P (next_info))
|
|
+ /* ??? Make sure we didn't skip a conversion around a reduction
|
|
+ path. In that case we'd have to reverse engineer that conversion
|
|
+ stmt following the chain using reduc_idx and from the PHI
|
|
+ using reduc_def. */
|
|
+ && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
|
|
+ scalar_stmts.quick_push (next_info);
|
|
+ /* If less than two were relevant/live there's nothing to SLP. */
|
|
+ if (scalar_stmts.length () < 2)
|
|
+ return false;
|
|
+ }
|
|
+ else
|
|
+ gcc_unreachable ();
|
|
+
|
|
+ vec<stmt_vec_info> roots = vNULL;
|
|
+ if (kind == slp_inst_kind_ctor)
|
|
+ {
|
|
+ roots.create (1);
|
|
+ roots.quick_push (stmt_info);
|
|
+ }
|
|
+ /* Build the tree for the SLP instance. */
|
|
+ bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
|
|
+ roots,
|
|
+ max_tree_size, limit, bst_map,
|
|
+ kind == slp_inst_kind_store
|
|
+ ? stmt_info : NULL);
|
|
+ if (!res)
|
|
+ roots.release ();
|
|
+
|
|
+ /* ??? If this is slp_inst_kind_store and the above succeeded here's
|
|
+ where we should do store group splitting. */
|
|
+
|
|
+ return res;
|
|
+}
|
|
+
|
|
+static inline bool
|
|
+is_const_assign (stmt_vec_info store_elem)
|
|
+{
|
|
+ if (store_elem == NULL)
|
|
+ {
|
|
+ gcc_unreachable ();
|
|
+ }
|
|
+ gimple *stmt = store_elem->stmt;
|
|
+ gimple_rhs_class rhs_class = gimple_assign_rhs_class (stmt);
|
|
+ return rhs_class == GIMPLE_SINGLE_RHS
|
|
+ && TREE_CONSTANT (gimple_assign_rhs1 (store_elem->stmt));
|
|
+}
|
|
+
|
|
+/* Push inits to INNERMOST_INITS and check const assign. */
|
|
+
|
|
+static bool
|
|
+record_innermost (vec<tree> &innermost_inits,
|
|
+ vec<tree> &innermost_offsets,
|
|
+ stmt_vec_info stmt_vinfo)
|
|
+{
|
|
+ if (!stmt_vinfo)
|
|
+ {
|
|
+ return false;
|
|
+ }
|
|
+ stmt_vec_info next_info = stmt_vinfo;
|
|
+ while (next_info)
|
|
+ {
|
|
+ /* No need to vectorize constant assign in a transposed version. */
|
|
+ if (is_const_assign (next_info))
|
|
+ {
|
|
+ if (dump_enabled_p ())
|
|
+ {
|
|
+ dump_printf_loc (MSG_NOTE, vect_location,
|
|
+ "no need to vectorize, store is const assign: %G",
|
|
+ next_info->stmt);
|
|
+ }
|
|
+ return false;
|
|
+ }
|
|
+ innermost_inits.safe_push (STMT_VINFO_DR_INIT (next_info));
|
|
+ innermost_offsets.safe_push (STMT_VINFO_DR_OFFSET (next_info));
|
|
+ next_info = DR_GROUP_NEXT_ELEMENT (next_info);
|
|
+ }
|
|
+ return true;
|
|
+}
|
|
+
|
|
+/* Compare inits to INNERMOST_INITS, return FALSE if inits do not match
|
|
+ the first grouped_store, and check for const assigns meanwhile. */
|
|
+
|
|
+static bool
|
|
+compare_innermost (const vec<tree> &innermost_inits,
|
|
+ const vec<tree> &innermost_offsets,
|
|
+ stmt_vec_info stmt_vinfo)
|
|
+{
|
|
+ if (!stmt_vinfo || innermost_inits.length () != stmt_vinfo->size)
|
|
+ {
|
|
+ return false;
|
|
+ }
|
|
+ stmt_vec_info next_info = stmt_vinfo;
|
|
+ unsigned int i = 0;
|
|
+ while (next_info)
|
|
+ {
|
|
+ if (is_const_assign (next_info))
|
|
+ {
|
|
+ if (dump_enabled_p ())
|
|
+ {
|
|
+ dump_printf_loc (MSG_NOTE, vect_location,
|
|
+ "no need to vectorize, store is const "
|
|
+ "assign: %G", next_info->stmt);
|
|
+ }
|
|
+ return false;
|
|
+ }
|
|
+ if (innermost_inits[i] != STMT_VINFO_DR_INIT (next_info)
|
|
+ || innermost_offsets[i] != STMT_VINFO_DR_OFFSET (next_info))
|
|
+ {
|
|
+ return false;
|
|
+ }
|
|
+ next_info = DR_GROUP_NEXT_ELEMENT (next_info);
|
|
+ i++;
|
|
+ }
|
|
+ return true;
|
|
+}
|
|
+
|
|
+static bool
|
|
+check_same_bb (stmt_vec_info grp1, stmt_vec_info grp2)
|
|
+{
|
|
+ if (grp1->stmt->bb->index == grp2->stmt->bb->index)
|
|
+ {
|
|
+ return true;
|
|
+ }
|
|
+ return false;
|
|
+}
|
|
+
|
|
+/* Check if grouped stores are of the same type.
+ Input: t1/t2 = TREE_TYPE (gimple_assign_lhs (first_element->stmt))
+ Output: 0 if same, 1 or -1 otherwise. */
|
|
+
|
|
+static int
|
|
+tree_type_cmp (const tree t1, const tree t2)
|
|
+{
|
|
+ gcc_checking_assert (t1 != NULL && t2 != NULL);
|
|
+ if (t1 != t2)
|
|
+ {
|
|
+ if (TREE_CODE (t1) != TREE_CODE (t2))
|
|
+ {
|
|
+ return TREE_CODE (t1) > TREE_CODE (t2) ? 1 : -1;
|
|
+ }
|
|
+ if (TYPE_UNSIGNED (t1) != TYPE_UNSIGNED (t2))
|
|
+ {
|
|
+ return TYPE_UNSIGNED (t1) > TYPE_UNSIGNED (t2) ? 1 : -1;
|
|
+ }
|
|
+ if (TYPE_PRECISION (t1) != TYPE_PRECISION (t2))
|
|
+ {
|
|
+ return TYPE_PRECISION (t1) > TYPE_PRECISION (t2) ? 1 : -1;
|
|
+ }
|
|
+ }
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/* Check whether 2 grouped stores are of the same type, so that
+ we can analyze them in one transpose group. */
|
|
+static int
|
|
+check_same_store_type (stmt_vec_info grp1, stmt_vec_info grp2)
|
|
+{
|
|
+ if (grp1 == grp2)
|
|
+ {
|
|
+ return 0;
|
|
+ }
|
|
+ if (grp1->size != grp2->size)
|
|
+ {
|
|
+ return grp1->size > grp2->size ? 1 : -1;
|
|
+ }
|
|
+ tree lhs1 = gimple_assign_lhs (grp1->stmt);
|
|
+ tree lhs2 = gimple_assign_lhs (grp2->stmt);
|
|
+ if (TREE_CODE (lhs1) != TREE_CODE (lhs2))
|
|
+ {
|
|
+ return TREE_CODE (lhs1) > TREE_CODE (lhs2) ? 1 : -1;
|
|
+ }
|
|
+ tree grp_type1 = TREE_TYPE (gimple_assign_lhs (grp1->stmt));
|
|
+ tree grp_type2 = TREE_TYPE (gimple_assign_lhs (grp2->stmt));
|
|
+ int cmp = tree_type_cmp (grp_type1, grp_type2);
|
|
+ return cmp;
|
|
+}
|
|
+
|
|
+/* Sort grouped stores according to group_size and store_type.
|
|
+ output: 0 if same, 1 if grp1 > grp2, -1 otherwise. */
|
|
+
|
|
+static int
|
|
+grouped_store_cmp (const void *grp1_, const void *grp2_)
|
|
+{
|
|
+ stmt_vec_info grp1 = *(stmt_vec_info *)const_cast<void *>(grp1_);
|
|
+ stmt_vec_info grp2 = *(stmt_vec_info *)const_cast<void *>(grp2_);
|
|
+ return check_same_store_type (grp1, grp2);
|
|
+}
|
|
+
|
|
+/* Transposing is based on permutation in registers. Permutation requires
+ the vector length to be a power of 2 and to fit the vector mode. */
|
|
+
|
|
+static inline bool
|
|
+check_filling_reg (stmt_vec_info current_element)
|
|
+{
|
|
+ if (current_element->size == 0)
|
|
+ {
|
|
+ return false;
|
|
+ }
|
|
+ /* If the gimple STMT was already vectorized in the vect pass, transpose
+ analysis cannot be conducted on it, so skip it. */
|
|
+ bool lhs_vectorized
|
|
+ = TREE_CODE (TREE_TYPE (gimple_get_lhs (current_element->stmt)))
|
|
+ == VECTOR_TYPE;
|
|
+ bool rhs_vectorized
|
|
+ = TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (current_element->stmt)))
|
|
+ == VECTOR_TYPE;
|
|
+ if (lhs_vectorized || rhs_vectorized)
|
|
+ {
|
|
+ return false;
|
|
+ }
|
|
+ unsigned int store_precision
|
|
+ = TYPE_PRECISION (TREE_TYPE (gimple_get_lhs (current_element->stmt)));
|
|
+ auto_vector_modes vector_modes;
|
|
+ targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
|
|
+ unsigned min_mode_size = -1u;
|
|
+ for (unsigned i = 0; i < vector_modes.length (); i++)
|
|
+ {
|
|
+ unsigned mode_bit_size = (GET_MODE_BITSIZE (vector_modes[i])).coeffs[0];
|
|
+ min_mode_size = mode_bit_size < min_mode_size
|
|
+ ? mode_bit_size : min_mode_size;
|
|
+ }
|
|
+ return store_precision != 0
|
|
+ && pow2p_hwi (current_element->size)
|
|
+ && (current_element->size * store_precision % min_mode_size == 0);
|
|
+}
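To make the filling condition concrete, here is a tiny standalone sketch of the same arithmetic (illustration only; the 64-bit minimum mode size is just an assumed example value, not queried from the target): a group of 8 stores of 16-bit elements has a power-of-2 size and its 128 bits are a multiple of the minimum vector size, so it can fill vector registers exactly.

#include <stdbool.h>
#include <stdio.h>

int
main (void)
{
  /* Assumed example values, not taken from a real target query.  */
  unsigned group_size = 8;      /* stores in the group */
  unsigned precision = 16;      /* bits per stored element */
  unsigned min_mode_bits = 64;  /* smallest autovectorization mode */

  bool fills = (group_size & (group_size - 1)) == 0   /* power of 2 */
               && (group_size * precision) % min_mode_bits == 0;
  printf ("group fills vector registers: %s\n", fills ? "yes" : "no");
  return 0;
}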
|
|
+
|
|
+/* Check if previous groups are suitable to transpose. If not, set their
|
|
+ group number to -1, reduce grp_num and clear current_groups.
|
|
+ Otherwise, just clear current_groups. */
|
|
+
|
|
+static void
|
|
+check_and_clear_groups (vec<stmt_vec_info> ¤t_groups,
|
|
+ unsigned int &grp_num)
|
|
+{
|
|
+ stmt_vec_info first_element;
|
|
+ if (current_groups.length () == 1
|
|
+ || (current_groups.length () != 0
|
|
+ && !pow2p_hwi (current_groups.length ())))
|
|
+ {
|
|
+ while (current_groups.length () != 0)
|
|
+ {
|
|
+ first_element = current_groups.pop ();
|
|
+ first_element->group_number = -1;
|
|
+ }
|
|
+ grp_num--;
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ while (current_groups.length ())
|
|
+ {
|
|
+ current_groups.pop ();
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+
|
|
+/* Make sure that transpose slp vectorization is conducted only if grouped
|
|
+ stores are one-dimensional array refs. */
|
|
+
|
|
+static bool
|
|
+is_store_one_dim_array (gimple *stmt)
|
|
+{
|
|
+ tree op = gimple_get_lhs (stmt);
|
|
+ if (TREE_CODE (op) != ARRAY_REF)
|
|
+ return false;
|
|
+ return TREE_OPERAND_LENGTH (op) > 0
|
|
+ && TREE_OPERAND_LENGTH (TREE_OPERAND (op, 0)) == 0;
|
|
+}
|
|
+
|
|
+/* Set grouped_stores with similar MEM_REF to the same group and mark their
|
|
+ grp_num. Groups with the same grp_num constitute the minimum unit to analyze
|
|
+ transpose. Return num of such units. */
|
|
+
|
|
+static unsigned
|
|
+vect_prepare_transpose (bb_vec_info bb_vinfo)
|
|
+{
|
|
+ stmt_vec_info current_element = NULL;
|
|
+ stmt_vec_info first_element = NULL;
|
|
+ unsigned int i = 0;
|
|
+ unsigned int grp_num = 0;
|
|
+ /* Use arrays to record MEM_REF data in different GROUPED_STORES. */
|
|
+ auto_vec<tree> innermost_inits;
|
|
+ auto_vec<tree> innermost_offsets;
|
|
+
|
|
+ /* A set of stmt_vec_info with same store type. Analyze them if their size
|
|
+ is suitable to transpose. */
|
|
+ auto_vec<stmt_vec_info> current_groups;
|
|
+
|
|
+ FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, i, current_element)
|
|
+ {
|
|
+ /* Compare current grouped_store to the first one if first_element exists,
|
|
+ push current_element to current_groups if they are similar on innermost
|
|
+ behavior of MEM_REF. */
|
|
+ if (first_element != NULL
|
|
+ && !check_same_store_type (first_element, current_element)
|
|
+ && compare_innermost (innermost_inits, innermost_offsets,
|
|
+ current_element)
|
|
+ && check_same_bb (first_element, current_element))
|
|
+ {
|
|
+ current_groups.safe_push (current_element);
|
|
+ current_element->group_number = grp_num;
|
|
+ /* If current_element is the last element in grouped_stores, continue
|
|
+ will exit the loop and leave the last group unanalyzed. */
|
|
+ if (i == bb_vinfo->grouped_stores.length () - 1)
|
|
+ {
|
|
+ check_and_clear_groups (current_groups, grp_num);
|
|
+ }
|
|
+ continue;
|
|
+ }
|
|
+ check_and_clear_groups (current_groups, grp_num);
|
|
+ innermost_inits.release ();
|
|
+ innermost_offsets.release ();
|
|
+ /* Beginning of a new group: analyze whether its members can form
+ a unit on which to conduct transpose analysis. */
|
|
+ first_element = NULL;
|
|
+ if (is_store_one_dim_array (current_element->stmt)
|
|
+ && check_filling_reg (current_element)
|
|
+ && record_innermost (innermost_inits, innermost_offsets,
|
|
+ current_element))
|
|
+ {
|
|
+ first_element = current_element;
|
|
+ current_groups.safe_push (current_element);
|
|
+ current_element->group_number = ++grp_num;
|
|
+ if (i == bb_vinfo->grouped_stores.length () - 1)
|
|
+ {
|
|
+ check_and_clear_groups (current_groups, grp_num);
|
|
+ }
|
|
+ continue;
|
|
+ }
|
|
+ current_element->group_number = -1;
|
|
+ }
|
|
+ return grp_num;
|
|
+}
|
|
+
|
|
+/* Return a flag to transpose grouped stores before building slp tree.
|
|
+ Add bool may_transpose in class vec_info. */
|
|
+
|
|
+static bool
|
|
+vect_may_transpose (bb_vec_info bb_vinfo)
|
|
+{
|
|
+ if (targetm.vectorize.vec_perm_const == NULL)
|
|
+ {
|
|
+ return false;
|
|
+ }
|
|
+
|
|
+ if (bb_vinfo->grouped_stores.length () < 2)
|
|
+ {
|
|
+ return false;
|
|
+ }
|
|
+
|
|
+ DUMP_VECT_SCOPE ("analyze if grouped stores may transpose to slp");
|
|
+ /* Sort grouped_stores according to size and type for function
|
|
+ vect_prepare_transpose (). */
|
|
+ bb_vinfo->grouped_stores.qsort (grouped_store_cmp);
|
|
+
|
|
+ int groups = vect_prepare_transpose (bb_vinfo);
|
|
+ BB_VINFO_TRANS_GROUPS (bb_vinfo) = groups;
|
|
+ if (dump_enabled_p ())
|
|
+ dump_printf_loc (MSG_NOTE, vect_location,
|
|
+ "%d groups to analyze transposed slp.\n", groups);
|
|
+ return groups != 0;
|
|
+}
|
|
+
|
|
+/* Get the base address of STMT_INFO. */
|
|
+
|
|
+static tree
|
|
+get_op_base_address (stmt_vec_info stmt_info)
|
|
+{
|
|
+ struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
|
|
+ tree op = DR_BASE_ADDRESS (dr);
|
|
+ while (TREE_OPERAND_LENGTH (op) > 0)
|
|
+ {
|
|
+ op = TREE_OPERAND (op, 0);
|
|
+ }
|
|
+ return op;
|
|
+}
|
|
+
|
|
+/* Compare the UID of the two stmt_info STMTINFO_A and STMTINFO_B.
|
|
+ Sorting them in ascending order. */
|
|
+
|
|
+static int
|
|
+dr_group_cmp (const void *stmtinfo_a_, const void *stmtinfo_b_)
|
|
+{
|
|
+ stmt_vec_info stmtinfo_a
|
|
+ = *(stmt_vec_info *) const_cast<void *> (stmtinfo_a_);
|
|
+ stmt_vec_info stmtinfo_b
|
|
+ = *(stmt_vec_info *) const_cast<void *> (stmtinfo_b_);
|
|
+
|
|
+ /* Stabilize sort. */
|
|
+ if (stmtinfo_a == stmtinfo_b)
|
|
+ {
|
|
+ return 0;
|
|
+ }
|
|
+ return gimple_uid (stmtinfo_a->stmt) < gimple_uid (stmtinfo_b->stmt) ? -1 : 1;
|
|
+}
|
|
+
|
|
+/* Find the first elements of the grouped loads which need to be merged. */
|
|
+
|
|
+static void
|
|
+vect_slp_grouped_load_find (bb_vec_info bb_vinfo, vec<bool> &visited,
|
|
+ vec<stmt_vec_info> &res)
|
|
+{
|
|
+ unsigned int i = 0;
|
|
+ stmt_vec_info merge_first_element = NULL;
|
|
+ stmt_vec_info first_element = NULL;
|
|
+ tree opa = NULL;
|
|
+ unsigned int grp_size_a = 0;
|
|
+ FOR_EACH_VEC_ELT (bb_vinfo->grouped_loads, i, first_element)
|
|
+ {
|
|
+ if (visited[i])
|
|
+ {
|
|
+ continue;
|
|
+ }
|
|
+ if (!STMT_VINFO_GROUPED_ACCESS (first_element)
|
|
+ || !pow2p_hwi (DR_GROUP_SIZE (first_element)))
|
|
+ {
|
|
+ /* Non-conforming grouped load should be grouped separately. */
|
|
+ if (merge_first_element == NULL)
|
|
+ {
|
|
+ visited[i] = true;
|
|
+ res.safe_push (first_element);
|
|
+ return;
|
|
+ }
|
|
+ }
|
|
+ if (merge_first_element == NULL)
|
|
+ {
|
|
+ merge_first_element = first_element;
|
|
+ opa = get_op_base_address (first_element);
|
|
+ grp_size_a = DR_GROUP_SIZE (first_element);
|
|
+ res.safe_push (first_element);
|
|
+ visited[i] = true;
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ /* If the two first elements are of the same base address and group size,
|
|
+ these two grouped loads need to be merged. */
|
|
+ tree opb = get_op_base_address (first_element);
|
|
+ unsigned int grp_size_b = DR_GROUP_SIZE (first_element);
|
|
+ if (opa == opb && grp_size_a == grp_size_b)
|
|
+ {
|
|
+ res.safe_push (first_element);
|
|
+ visited[i] = true;
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+/* Merge the grouped loads that are found from
|
|
+ vect_slp_grouped_load_find (). */
|
|
+
|
|
+static stmt_vec_info
|
|
+vect_slp_grouped_load_merge (vec<stmt_vec_info> &res)
|
|
+{
|
|
+ stmt_vec_info stmt_info = res[0];
|
|
+ if (res.length () == 1)
|
|
+ {
|
|
+ return stmt_info;
|
|
+ }
|
|
+ unsigned int i = 0;
|
|
+ unsigned int size = DR_GROUP_SIZE (res[0]);
|
|
+ unsigned int new_group_size = size * res.length ();
|
|
+ stmt_vec_info first_element = NULL;
|
|
+ stmt_vec_info merge_first_element = NULL;
|
|
+ stmt_vec_info last_element = NULL;
|
|
+ FOR_EACH_VEC_ELT (res, i, first_element)
|
|
+ {
|
|
+ if (merge_first_element == NULL)
|
|
+ {
|
|
+ merge_first_element = first_element;
|
|
+ last_element = merge_first_element;
|
|
+ size = DR_GROUP_SIZE (merge_first_element);
|
|
+ }
|
|
+
|
|
+ if (last_element != first_element
|
|
+ && !DR_GROUP_NEXT_ELEMENT (last_element))
|
|
+ {
|
|
+ DR_GROUP_NEXT_ELEMENT (last_element) = first_element;
|
|
+ /* Store the gap from the previous member of the group. If there is
|
|
+ no gap in the access, DR_GROUP_GAP is always 1. */
|
|
+ DR_GROUP_GAP_TRANS (first_element) = DR_GROUP_GAP (first_element);
|
|
+ DR_GROUP_GAP (first_element) = 1;
|
|
+ }
|
|
+ for (stmt_info = first_element; stmt_info;
|
|
+ stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
|
|
+ {
|
|
+ DR_GROUP_FIRST_ELEMENT (stmt_info) = merge_first_element;
|
|
+ DR_GROUP_SIZE_TRANS (stmt_info) = DR_GROUP_SIZE (stmt_info);
|
|
+ DR_GROUP_SIZE (stmt_info) = new_group_size;
|
|
+ last_element = stmt_info;
|
|
+ }
|
|
+ }
|
|
+ DR_GROUP_SIZE (merge_first_element) = new_group_size;
|
|
+ DR_GROUP_SLP_TRANSPOSE (merge_first_element) = true;
|
|
+ DR_GROUP_NEXT_ELEMENT (last_element) = NULL;
|
|
+ return merge_first_element;
|
|
+}
|
|
+
|
|
+/* Merge the grouped loads which have the same base address and group size.
|
|
+ For example, for grouped loads (opa_1, opa_2, opb_1, opb_2):
|
|
+ opa_1: a0->a1->a2->a3
|
|
+ opa_2: a8->a9->a10->a11
|
|
+ opb_1: b0->b1
|
|
+ opb_2: b16->b17
|
|
+ we can probably get two merged grouped loads:
|
|
+ opa: a0->a1->a2->a3->a8->a9->a10->a11
|
|
+ opb: b0->b1->b16->b17. */
|
|
+
|
|
+static bool
|
|
+vect_merge_slp_grouped_loads (bb_vec_info bb_vinfo)
|
|
+{
|
|
+ if (bb_vinfo->grouped_loads.length () <= 0)
|
|
+ {
|
|
+ if (dump_enabled_p ())
|
|
+ {
|
|
+ dump_printf_loc (MSG_NOTE, vect_location,
|
|
+ "The number of grouped loads is 0.\n");
|
|
+ }
|
|
+ return false;
|
|
+ }
|
|
+ bb_vinfo->grouped_loads.qsort (dr_group_cmp);
|
|
+ auto_vec<bool> visited (bb_vinfo->grouped_loads.length ());
|
|
+ auto_vec<stmt_vec_info> grouped_loads_merge;
|
|
+ for (unsigned int i = 0; i < bb_vinfo->grouped_loads.length (); i++)
|
|
+ {
|
|
+ visited.safe_push (false);
|
|
+ }
|
|
+ while (1)
|
|
+ {
|
|
+ /* Find grouped loads which are required to merge. */
|
|
+ auto_vec<stmt_vec_info> res;
|
|
+ vect_slp_grouped_load_find (bb_vinfo, visited, res);
|
|
+ if (res.is_empty ())
|
|
+ {
|
|
+ break;
|
|
+ }
|
|
+ /* Merge the required grouped loads into one group. */
|
|
+ grouped_loads_merge.safe_push (vect_slp_grouped_load_merge (res));
|
|
+ }
|
|
+ if (grouped_loads_merge.length () == bb_vinfo->grouped_loads.length ())
|
|
+ {
|
|
+ if (dump_enabled_p ())
|
|
+ {
|
|
+ dump_printf_loc (MSG_NOTE, vect_location,
|
|
+ "No grouped loads need to be merged.\n");
|
|
+ }
|
|
+ return false;
|
|
+ }
|
|
+ if (dump_enabled_p ())
|
|
+ {
|
|
+ dump_printf_loc (MSG_NOTE, vect_location,
|
|
+ "Merging grouped loads successfully.\n");
|
|
+ }
|
|
+ BB_VINFO_GROUPED_LOADS (bb_vinfo).release ();
|
|
+ for (unsigned int i = 0; i < grouped_loads_merge.length (); i++)
|
|
+ {
|
|
+ BB_VINFO_GROUPED_LOADS (bb_vinfo).safe_push (grouped_loads_merge[i]);
|
|
+ }
|
|
+ return true;
|
|
+}
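A source-level illustration (an invented example, not from the patch or its testsuite) of two grouped loads that share a base address and group size and may therefore be merged as described above:

/* Illustrative sketch, not part of the patch. Both groups of loads use
   the same base pointer p and have group size 4, so they may be merged
   into one group p[0..3], p[8..11] for SLP analysis.  */
int
two_load_groups (const unsigned char *p)
{
  int s = p[0] + p[1] + p[2] + p[3];      /* grouped load 1 */
  int t = p[8] + p[9] + p[10] + p[11];    /* grouped load 2 */
  return s - t;
}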
|
|
+
|
|
+/* Find the first elements of the grouped stores
|
|
+ which need to be transposed and merged. */
|
|
+
|
|
+static void
|
|
+vect_slp_grouped_store_find (bb_vec_info bb_vinfo, vec<bool> &visited,
|
|
+ vec<stmt_vec_info> &res)
|
|
+{
|
|
+ stmt_vec_info first_element = NULL;
|
|
+ stmt_vec_info merge_first_element = NULL;
|
|
+ unsigned int k = 0;
|
|
+ FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, k, first_element)
|
|
+ {
|
|
+ if (visited[k])
|
|
+ {
|
|
+ continue;
|
|
+ }
|
|
+ /* Non-conforming grouped store should be grouped separately. */
|
|
+ if (!STMT_VINFO_GROUPED_ACCESS (first_element)
|
|
+ || first_element->group_number == -1)
|
|
+ {
|
|
+ if (merge_first_element == NULL)
|
|
+ {
|
|
+ visited[k] = true;
|
|
+ res.safe_push (first_element);
|
|
+ return;
|
|
+ }
|
|
+ }
|
|
+ if (first_element->group_number != -1
|
|
+ && merge_first_element == NULL)
|
|
+ {
|
|
+ merge_first_element = first_element;
|
|
+ }
|
|
+ if (merge_first_element->group_number == first_element->group_number)
|
|
+ {
|
|
+ visited[k] = true;
|
|
+ res.safe_push (first_element);
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+/* Transpose and merge the grouped stores that are found from
|
|
+ vect_slp_grouped_store_find (). */
|
|
+
|
|
+static stmt_vec_info
|
|
+vect_slp_grouped_store_transform (vec<stmt_vec_info> &res)
|
|
+{
|
|
+ stmt_vec_info stmt_info = res[0];
|
|
+ if (res.length () == 1)
|
|
+ {
|
|
+ return stmt_info;
|
|
+ }
|
|
+ stmt_vec_info rearrange_first_element = stmt_info;
|
|
+ stmt_vec_info last_element = rearrange_first_element;
|
|
+
|
|
+ unsigned int size = DR_GROUP_SIZE (rearrange_first_element);
|
|
+ unsigned int new_group_size = size * res.length ();
|
|
+ for (unsigned int i = 1; i < res.length (); i++)
|
|
+ {
|
|
+ /* Store the gap from the previous member of the group. If there is no
|
|
+ gap in the access, DR_GROUP_GAP is always 1. */
|
|
+ DR_GROUP_GAP_TRANS (res[i]) = DR_GROUP_GAP (res[i]);
|
|
+ DR_GROUP_GAP (res[i]) = 1;
|
|
+ }
|
|
+ while (!res.is_empty ())
|
|
+ {
|
|
+ stmt_info = res[0];
|
|
+ res.ordered_remove (0);
|
|
+ if (DR_GROUP_NEXT_ELEMENT (stmt_info))
|
|
+ {
|
|
+ res.safe_push (DR_GROUP_NEXT_ELEMENT (stmt_info));
|
|
+ }
|
|
+ DR_GROUP_FIRST_ELEMENT (stmt_info) = rearrange_first_element;
|
|
+ DR_GROUP_NEXT_ELEMENT (last_element) = stmt_info;
|
|
+ DR_GROUP_SIZE_TRANS (stmt_info) = DR_GROUP_SIZE (stmt_info);
|
|
+ DR_GROUP_SIZE (stmt_info) = new_group_size;
|
|
+ last_element = stmt_info;
|
|
+ }
|
|
+
|
|
+ DR_GROUP_SIZE (rearrange_first_element) = new_group_size;
|
|
+ DR_GROUP_SLP_TRANSPOSE (rearrange_first_element) = true;
|
|
+ DR_GROUP_NEXT_ELEMENT (last_element) = NULL;
|
|
+ return rearrange_first_element;
|
|
+}
|
|
+
|
|
+/* Save the STMT_INFO in the grouped stores to BB_VINFO_SCALAR_STORES for
|
|
+ transposing back grouped stores. */
|
|
+
|
|
+static void
|
|
+get_scalar_stores (bb_vec_info bb_vinfo)
|
|
+{
|
|
+ unsigned int k = 0;
|
|
+ stmt_vec_info first_element = NULL;
|
|
+ FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, k, first_element)
|
|
+ {
|
|
+ /* Filter the grouped store which is unnecessary for transposing. */
|
|
+ if (!STMT_VINFO_GROUPED_ACCESS (first_element)
|
|
+ || first_element->group_number == -1)
|
|
+ {
|
|
+ continue;
|
|
+ }
|
|
+ vec<stmt_vec_info> tmp_scalar_store;
|
|
+ tmp_scalar_store.create (DR_GROUP_SIZE (first_element));
|
|
+ for (stmt_vec_info stmt_info = first_element; stmt_info;
|
|
+ stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
|
|
+ {
|
|
+ tmp_scalar_store.safe_push (stmt_info);
|
|
+ }
|
|
+ BB_VINFO_SCALAR_STORES (bb_vinfo).safe_push (tmp_scalar_store);
|
|
+ }
|
|
+}
|
|
+
|
|
+/* Transpose and merge the grouped stores which have the same group number.
|
|
+ For example, for grouped stores (opa_0, opa_1, opa_2, opa_3):
|
|
+ opa_0: a00->a01->a02->a03
|
|
+ opa_1: a10->a11->a12->a13
|
|
+ opa_2: a20->a21->a22->a23
|
|
+ opa_2: a30->a31->a32->a33
|
|
+ we can probably get the merged grouped store:
|
|
+ opa: a00->a10->a20->a30
|
|
+ ->a01->a11->a21->a31
|
|
+ ->a02->a12->a22->a32
|
|
+ ->a03->a13->a23->a33. */
|
|
+
|
|
+static bool
|
|
+vect_transform_slp_grouped_stores (bb_vec_info bb_vinfo)
|
|
+{
|
|
+ if (bb_vinfo->grouped_stores.length () <= 0)
|
|
+ {
|
|
+ if (dump_enabled_p ())
|
|
+ {
|
|
+ dump_printf_loc (MSG_NOTE, vect_location,
|
|
+ "The number of grouped stores is 0.\n");
|
|
+ }
|
|
+ return false;
|
|
+ }
|
|
+
|
|
+ bb_vinfo->grouped_stores.qsort (dr_group_cmp);
|
|
+ auto_vec<stmt_vec_info> grouped_stores_merge;
|
|
+ auto_vec<bool> visited (bb_vinfo->grouped_stores.length ());
|
|
+ unsigned int i = 0;
|
|
+ for (i = 0; i < bb_vinfo->grouped_stores.length (); i++)
|
|
+ {
|
|
+ visited.safe_push (false);
|
|
+ }
|
|
+
|
|
+ /* Get scalar stores for the following transposition recovery. */
|
|
+ get_scalar_stores (bb_vinfo);
|
|
+
|
|
+ while (1)
|
|
+ {
|
|
+ /* Find grouped stores which are required to transpose and merge. */
|
|
+ auto_vec<stmt_vec_info> res;
|
|
+ vect_slp_grouped_store_find (bb_vinfo, visited, res);
|
|
+ if (res.is_empty ())
|
|
+ {
|
|
+ break;
|
|
+ }
|
|
+ /* Transpose and merge the required grouped stores into one group. */
|
|
+ grouped_stores_merge.safe_push (vect_slp_grouped_store_transform (res));
|
|
+ }
|
|
+
|
|
+ BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
|
|
+ for (i = 0; i < grouped_stores_merge.length (); i++)
|
|
+ {
|
|
+ BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (grouped_stores_merge[i]);
|
|
+ }
|
|
+
|
|
+ if (dump_enabled_p ())
|
|
+ {
|
|
+ dump_printf_loc (MSG_NOTE, vect_location,
|
|
+ "Transposing grouped stores successfully.\n");
|
|
+ }
|
|
+ return true;
|
|
+}
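
For intuition, the merged element order described in the comment above vect_transform_slp_grouped_stores can be reproduced with a small standalone program. This is illustrative only and not part of the patch; the index arithmetic is an assumption about the intended layout.

#include <stdio.h>

/* Print the merged order for four 4-element store groups opa_0..opa_3:
   elements are emitted column by column, i.e. a00 a10 a20 a30 a01 a11 ...  */
int
main (void)
{
  const unsigned n_groups = 4, group_size = 4;
  for (unsigned elem = 0; elem < group_size; elem++)
    for (unsigned g = 0; g < n_groups; g++)
      printf ("a%u%u ", g, elem);
  printf ("\n");
  return 0;
}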
|
|
+
|
|
+/* A helper function for vect_transform_back_slp_grouped_stores (). */
|
|
+
|
|
+static auto_vec<stmt_vec_info>
|
|
+vect_transform_back_slp_grouped_store (bb_vec_info bb_vinfo,
|
|
+ stmt_vec_info first_stmt_info)
|
|
+{
|
|
+ auto_vec<stmt_vec_info> grouped_stores_split;
|
|
+ for (unsigned int i = 0; i < bb_vinfo->scalar_stores.length (); i++)
|
|
+ {
|
|
+ vec<stmt_vec_info> scalar_tmp = bb_vinfo->scalar_stores[i];
|
|
+ if (scalar_tmp.length () > 1
|
|
+ && scalar_tmp[0]->group_number != first_stmt_info->group_number)
|
|
+ {
|
|
+ continue;
|
|
+ }
|
|
+ stmt_vec_info cur_stmt_info = NULL;
|
|
+ stmt_vec_info cur_first_stmt_info = NULL;
|
|
+ stmt_vec_info last_stmt_info = NULL;
|
|
+ unsigned int k = 0;
|
|
+ FOR_EACH_VEC_ELT (scalar_tmp, k, cur_stmt_info)
|
|
+ {
|
|
+ if (k == 0)
|
|
+ {
|
|
+ cur_first_stmt_info = cur_stmt_info;
|
|
+ last_stmt_info = cur_stmt_info;
|
|
+ }
|
|
+ DR_GROUP_FIRST_ELEMENT (cur_stmt_info) = cur_first_stmt_info;
|
|
+ DR_GROUP_NEXT_ELEMENT (last_stmt_info) = cur_stmt_info;
|
|
+ last_stmt_info = cur_stmt_info;
|
|
+ }
|
|
+ DR_GROUP_SIZE (cur_first_stmt_info) = k;
|
|
+ DR_GROUP_NEXT_ELEMENT (last_stmt_info) = NULL;
|
|
+ if (first_stmt_info != cur_first_stmt_info)
|
|
+ {
|
|
+ DR_GROUP_GAP (cur_first_stmt_info)
|
|
+ = DR_GROUP_GAP_TRANS (cur_first_stmt_info);
|
|
+ DR_GROUP_SLP_TRANSPOSE (cur_first_stmt_info) = false;
|
|
+ DR_GROUP_NUMBER (cur_first_stmt_info) = -1;
|
|
+ }
|
|
+ grouped_stores_split.safe_push (cur_first_stmt_info);
|
|
+ }
|
|
+ return grouped_stores_split;
|
|
+}
|
|
+
|
|
+/* Transform the grouped store back. */
|
|
+
|
|
+void
|
|
+vect_transform_back_slp_grouped_stores (bb_vec_info bb_vinfo,
|
|
+ stmt_vec_info first_stmt_info)
|
|
+{
|
|
+ if (first_stmt_info->group_number == -1)
|
|
+ {
|
|
+ return;
|
|
+ }
|
|
+ /* Transform back. */
|
|
+ auto_vec<stmt_vec_info> grouped_stores_split
|
|
+ = vect_transform_back_slp_grouped_store (bb_vinfo, first_stmt_info);
|
|
+
|
|
+ /* Add the remaining grouped stores to grouped_stores_split. */
|
|
+ stmt_vec_info first_element = NULL;
|
|
+ unsigned int i = 0;
|
|
+ FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, i, first_element)
|
|
+ {
|
|
+ if (first_element->group_number != first_stmt_info->group_number)
|
|
+ {
|
|
+ grouped_stores_split.safe_push (first_element);
|
|
+ }
|
|
+ }
|
|
+ DR_GROUP_SLP_TRANSPOSE (first_stmt_info) = false;
|
|
+ DR_GROUP_NUMBER (first_stmt_info) = -1;
|
|
+ BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
|
|
+ for (i = 0; i < grouped_stores_split.length (); i++)
|
|
+ {
|
|
+ BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (grouped_stores_split[i]);
|
|
+ }
|
|
+}
|
|
+
|
|
+/* Function check_for_slp_vectype
|
|
+
|
|
+ Restrict grouped stores by checking their vectype.
+ If the vectype of a grouped store has changed, it needs to be transformed back.
+ If all grouped stores need to be transformed back, return FALSE. */
|
|
+
|
|
+static bool
|
|
+check_for_slp_vectype (bb_vec_info bb_vinfo)
|
|
+{
|
|
+ if (dump_file)
|
|
+ fprintf (dump_file, "check_for_slp_vectype: enter\n");
|
|
+ stmt_vec_info first_element = NULL;
|
|
+ unsigned int i = 0;
|
|
+ int count = 0;
|
|
+ auto_vec<stmt_vec_info> grouped_stores_check;
|
|
+ FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, i, first_element)
|
|
+ {
|
|
+ grouped_stores_check.safe_push (first_element);
|
|
+ }
|
|
+ FOR_EACH_VEC_ELT (grouped_stores_check, i, first_element)
|
|
+ {
|
|
+ if (STMT_VINFO_GROUPED_ACCESS (first_element)
|
|
+ && first_element->group_number != -1)
|
|
+ {
|
|
+ unsigned int group_size_b
|
|
+ = DR_GROUP_SIZE_TRANS (first_element);
|
|
+ tree vectype = STMT_VINFO_VECTYPE (first_element);
|
|
+ gimple *stmt = STMT_VINFO_STMT (first_element);
|
|
+ tree lhs = gimple_get_lhs (stmt);
|
|
+ tree type = TREE_TYPE (lhs);
|
|
+#if 0
|
|
+ if (!vectype && !type)
|
|
+ {
|
|
+ if (dump_file)
|
|
+ fprintf (dump_file, "check_for_slp_vectype: no vectype/stmt type\n");
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ if (!vectype)
|
|
+ vectype = type;
|
|
+#endif
|
|
+ if (dump_file)
|
|
+ {
|
|
+ fprintf (dump_file, "check_for_slp_vectype: %p\n", first_element);
|
|
+ print_gimple_stmt (dump_file, stmt, 0);
|
|
+ fprintf (dump_file, "check_for_slp_vectype: vectype=");
|
|
+ if (vectype)
|
|
+ print_generic_expr (dump_file, vectype);
|
|
+ fprintf (dump_file, "\n");
|
|
+ }
|
|
+#if 0
|
|
+ if (!vectype || !VECTOR_TYPE_P (vectype))
|
|
+ continue;
|
|
+#endif
|
|
+ poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
|
|
+ if (nunits.to_constant () > group_size_b)
|
|
+ {
|
|
+ count++;
|
|
+ /* If the vectype is changed, this grouped store needs
+ to be transformed back. */
|
|
+ vect_transform_back_slp_grouped_stores (bb_vinfo, first_element);
|
|
+ if (dump_enabled_p ())
|
|
+ {
|
|
+ dump_printf_loc (MSG_NOTE, vect_location,
|
|
+ "No supported: only supported for"
|
|
+ " group_size geq than nunits.\n");
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ if (count == BB_VINFO_TRANS_GROUPS (bb_vinfo))
|
|
+ {
|
|
+ return false;
|
|
+ }
|
|
+ if (dump_file)
|
|
+ fprintf (dump_file, "check_for_slp_vectype: True\n");
|
|
+ return true;
|
|
+}
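
As a rough sketch of the restriction checked above (values assumed, helper name hypothetical): with a V16QI vectype TYPE_VECTOR_SUBPARTS is 16, so a pre-merge group of 8 stores fails the check and is transformed back, while a group of 16 or more is kept in transposed form.

/* Hypothetical helper mirroring the nunits / group_size_b comparison above.  */
static bool
group_size_fits_vectype (unsigned nunits, unsigned group_size_b)
{
  /* Only groups at least as large as the vector length stay transposed;
     smaller groups are transformed back.  */
  return group_size_b >= nunits;
}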
|
|
+
|
|
+/* Function check_for_dr_alignment
|
|
+
|
|
+ Check the alignment of the slp instance loads.
|
|
+ Return FALSE if a load cannot be vectorized. */
|
|
+
|
|
+static bool
|
|
+check_for_dr_alignment (bb_vec_info bb_vinfo, slp_instance instance)
|
|
+{
|
|
+ slp_tree node = NULL;
|
|
+ unsigned int i = 0;
|
|
+ FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
|
|
+ {
|
|
+ stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
|
|
+ dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
|
|
+ if (dump_file)
|
|
+ {
|
|
+ fprintf (dump_file, "check_for_dr_alignment: %p\n", first_stmt_info);
|
|
+
|
|
+ gimple *stmt = STMT_VINFO_STMT (first_stmt_info);
|
|
+ tree lhs = gimple_get_lhs (stmt);
|
|
+ tree type = TREE_TYPE (lhs);
|
|
+ print_gimple_stmt (dump_file, stmt, 0);
|
|
+ }
|
|
+
|
|
+ tree vectype = STMT_VINFO_VECTYPE (first_stmt_info);
|
|
+ int malign = dr_misalignment (first_dr_info, vectype);
|
|
+ enum dr_alignment_support supportable_dr_alignment
|
|
+ = vect_supportable_dr_alignment (bb_vinfo, first_dr_info,
|
|
+ vectype, malign);
|
|
+ if (supportable_dr_alignment == dr_explicit_realign_optimized
|
|
+ || supportable_dr_alignment == dr_explicit_realign)
|
|
+ {
|
|
+ return false;
|
|
}
|
|
}
|
|
- else if (kind == slp_inst_kind_reduc_chain)
|
|
+ return true;
|
|
+}
|
|
+
|
|
+/* Initialize slp_transpose flag before transposing. */
|
|
+
|
|
+static void
|
|
+init_stmt_info_slp_transpose (bb_vec_info bb_vinfo)
|
|
+{
|
|
+ stmt_vec_info first_element = NULL;
|
|
+ unsigned int k = 0;
|
|
+ FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, k, first_element)
|
|
{
|
|
- /* Collect the reduction stmts and store them in scalar_stmts. */
|
|
- scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
|
|
- while (next_info)
|
|
+ if (STMT_VINFO_GROUPED_ACCESS (first_element))
|
|
{
|
|
- scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
|
|
- next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
|
|
+ DR_GROUP_SLP_TRANSPOSE (first_element) = false;
|
|
}
|
|
- /* Mark the first element of the reduction chain as reduction to properly
|
|
- transform the node. In the reduction analysis phase only the last
|
|
- element of the chain is marked as reduction. */
|
|
- STMT_VINFO_DEF_TYPE (stmt_info)
|
|
- = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
|
|
- STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
|
|
- = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
|
|
}
|
|
- else if (kind == slp_inst_kind_ctor)
|
|
+ FOR_EACH_VEC_ELT (bb_vinfo->grouped_loads, k, first_element)
|
|
{
|
|
- tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
|
|
- tree val;
|
|
- scalar_stmts.create (CONSTRUCTOR_NELTS (rhs));
|
|
- FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
|
|
+ if (STMT_VINFO_GROUPED_ACCESS (first_element))
|
|
{
|
|
- stmt_vec_info def_info = vinfo->lookup_def (val);
|
|
- def_info = vect_stmt_to_vectorize (def_info);
|
|
- scalar_stmts.quick_push (def_info);
|
|
+ DR_GROUP_SLP_TRANSPOSE (first_element) = false;
|
|
}
|
|
- if (dump_enabled_p ())
|
|
- dump_printf_loc (MSG_NOTE, vect_location,
|
|
- "Analyzing vectorizable constructor: %G\n",
|
|
- stmt_info->stmt);
|
|
}
|
|
- else if (kind == slp_inst_kind_reduc_group)
|
|
+}
|
|
+
|
|
+/* Analyze and transpose the stmts before building the SLP tree. */
|
|
+
|
|
+static bool
|
|
+vect_analyze_transpose (bb_vec_info bb_vinfo)
|
|
+{
|
|
+ DUMP_VECT_SCOPE ("vect_analyze_transpose");
|
|
+
|
|
+ if (!vect_may_transpose (bb_vinfo))
|
|
{
|
|
- /* Collect reduction statements. */
|
|
- const vec<stmt_vec_info> &reductions
|
|
- = as_a <loop_vec_info> (vinfo)->reductions;
|
|
- scalar_stmts.create (reductions.length ());
|
|
- for (i = 0; reductions.iterate (i, &next_info); i++)
|
|
- if ((STMT_VINFO_RELEVANT_P (next_info)
|
|
- || STMT_VINFO_LIVE_P (next_info))
|
|
- /* ??? Make sure we didn't skip a conversion around a reduction
|
|
- path. In that case we'd have to reverse engineer that conversion
|
|
- stmt following the chain using reduc_idx and from the PHI
|
|
- using reduc_def. */
|
|
- && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
|
|
- scalar_stmts.quick_push (next_info);
|
|
- /* If less than two were relevant/live there's nothing to SLP. */
|
|
- if (scalar_stmts.length () < 2)
|
|
- return false;
|
|
+ return false;
|
|
}
|
|
- else
|
|
- gcc_unreachable ();
|
|
|
|
- vec<stmt_vec_info> roots = vNULL;
|
|
- if (kind == slp_inst_kind_ctor)
|
|
+ /* For basic block SLP, try to merge the grouped stores and loads
|
|
+ into one group. */
|
|
+ init_stmt_info_slp_transpose (bb_vinfo);
|
|
+ if (vect_transform_slp_grouped_stores (bb_vinfo)
|
|
+ && vect_merge_slp_grouped_loads (bb_vinfo))
|
|
{
|
|
- roots.create (1);
|
|
- roots.quick_push (stmt_info);
|
|
+ if (dump_enabled_p ())
|
|
+ {
|
|
+ dump_printf_loc (MSG_NOTE, vect_location,
|
|
+ "Analysis succeeded with SLP transposed.\n");
|
|
+ }
|
|
+ return true;
|
|
}
|
|
- /* Build the tree for the SLP instance. */
|
|
- bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
|
|
- roots,
|
|
- max_tree_size, limit, bst_map,
|
|
- kind == slp_inst_kind_store
|
|
- ? stmt_info : NULL);
|
|
- if (!res)
|
|
- roots.release ();
|
|
-
|
|
- /* ??? If this is slp_inst_kind_store and the above succeeded here's
|
|
- where we should do store group splitting. */
|
|
-
|
|
- return res;
|
|
+ if (dump_enabled_p ())
|
|
+ {
|
|
+ dump_printf_loc (MSG_NOTE, vect_location,
|
|
+ "Analysis failed with SLP transposed.\n");
|
|
+ }
|
|
+ return false;
|
|
}
|
|
|
|
/* Check if there are stmts in the loop can be vectorized using SLP. Build SLP
|
|
@@ -4963,7 +5932,7 @@ vect_slp_analyze_operations (vec_info *vinfo)
|
|
/* Check we can vectorize the reduction. */
|
|
|| (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
|
|
&& !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
|
|
- {
|
|
+ {
|
|
slp_tree node = SLP_INSTANCE_TREE (instance);
|
|
stmt_vec_info stmt_info;
|
|
if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
|
|
@@ -4975,7 +5944,7 @@ vect_slp_analyze_operations (vec_info *vinfo)
|
|
"removing SLP instance operations starting from: %G",
|
|
stmt_info->stmt);
|
|
vect_free_slp_instance (instance);
|
|
- vinfo->slp_instances.ordered_remove (i);
|
|
+ vinfo->slp_instances.ordered_remove (i);
|
|
cost_vec.release ();
|
|
while (!visited_vec.is_empty ())
|
|
visited.remove (visited_vec.pop ());
|
|
@@ -5204,7 +6173,7 @@ vect_bb_slp_scalar_cost (vec_info *vinfo,
|
|
gimple *orig_stmt = orig_stmt_info->stmt;
|
|
|
|
/* If there is a non-vectorized use of the defs then the scalar
|
|
- stmt is kept live in which case we do not account it or any
|
|
+ stmt is kept live in which case we do not account it or any
|
|
required defs in the SLP children in the scalar cost. This
|
|
way we make the vectorization more costly when compared to
|
|
the scalar cost. */
|
|
@@ -5481,7 +6450,11 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
|
|
|
|
vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
|
|
|
|
- if (dump_enabled_p ())
|
|
+ BB_VINFO_VEC_INSIDE_COST (bb_vinfo) = vec_inside_cost;
|
|
+ BB_VINFO_VEC_OUTSIDE_COST (bb_vinfo) = vec_outside_cost;
|
|
+ BB_VINFO_SCALAR_COST (bb_vinfo) = scalar_cost;
|
|
+
|
|
+ if (!unlimited_cost_model (NULL) && dump_enabled_p ())
|
|
{
|
|
dump_printf_loc (MSG_NOTE, vect_location,
|
|
"Cost model analysis for part in loop %d:\n", sl);
|
|
@@ -5819,7 +6792,7 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
|
|
if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
|
|
{
|
|
if (dump_enabled_p ())
|
|
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
|
|
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
|
|
"not vectorized: unhandled data-ref in basic "
|
|
"block.\n");
|
|
return false;
|
|
@@ -5854,6 +6827,22 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
|
|
|
|
vect_pattern_recog (bb_vinfo);
|
|
|
|
+ /* Transpose grouped stores and loads for better vectorizable version. */
|
|
+ if (bb_vinfo->transposed)
|
|
+ {
|
|
+ if (!vect_analyze_transpose (bb_vinfo))
|
|
+ {
|
|
+ if (dump_enabled_p ())
|
|
+ {
|
|
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
|
|
+ "not vectorized: unhandled slp transposed in "
|
|
+ "basic block.\n");
|
|
+ }
|
|
+ return false;
|
|
+ }
|
|
+ }
|
|
+ bb_vinfo->before_slp = true;
|
|
+
|
|
/* Update store groups from pattern processing. */
|
|
vect_fixup_store_groups_with_patterns (bb_vinfo);
|
|
|
|
@@ -5872,6 +6861,20 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
|
|
return false;
|
|
}
|
|
|
|
+ /* Check if the vectype is suitable for SLP transposed. */
|
|
+ if (bb_vinfo->transposed && !check_for_slp_vectype (bb_vinfo))
|
|
+ {
|
|
+ if (dump_enabled_p ())
|
|
+ {
|
|
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
|
|
+ "Failed to SLP transposed in the basic block.\n");
|
|
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
|
|
+ "not vectorized: vectype is not suitable for "
|
|
+ "SLP transposed in basic block.\n");
|
|
+ }
|
|
+ return false;
|
|
+ }
|
|
+
|
|
/* Optimize permutations. */
|
|
vect_optimize_slp (bb_vinfo);
|
|
|
|
@@ -5914,6 +6917,27 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
|
|
if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
|
|
return false;
|
|
|
|
+ /* Check if the alignment is suitable for SLP transposed. */
|
|
+ if (bb_vinfo->transposed)
|
|
+ {
|
|
+ for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); i++)
|
|
+ {
|
|
+ if (!check_for_dr_alignment (bb_vinfo, instance))
|
|
+ {
|
|
+ if (dump_enabled_p ())
|
|
+ {
|
|
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
|
|
+ "Failed to SLP transposed in the basic "
|
|
+ "block.\n");
|
|
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
|
|
+ "not vectorized: alignment is not suitable "
|
|
+ "for SLP transposed in basic block.\n");
|
|
+ }
|
|
+ return false;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+
|
|
if (!vect_slp_analyze_operations (bb_vinfo))
|
|
{
|
|
if (dump_enabled_p ())
|
|
@@ -5923,7 +6947,88 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
|
|
}
|
|
|
|
vect_bb_partition_graph (bb_vinfo);
|
|
+ return true;
|
|
+}
|
|
+
|
|
+static bool
|
|
+may_new_transpose_bbvinfo (bb_vec_info bb_vinfo_ori, bool res_ori,
|
|
+ loop_p orig_loop)
|
|
+{
|
|
+ /* If the flag is false or the slp analysis is broken before
|
|
+ vect_analyze_slp, we don't try to analyze the transposed SLP version. */
|
|
+ if (!flag_tree_slp_transpose_vectorize
|
|
+ || !BB_VINFO_BEFORE_SLP (bb_vinfo_ori))
|
|
+ {
|
|
+ return false;
|
|
+ }
|
|
+
|
|
+ /* If the original bb_vinfo can't be vectorized, try to create a bb_vinfo
+ for the transposed version. */
|
|
+ if (!res_ori)
|
|
+ {
|
|
+ return true;
|
|
+ }
|
|
+
|
|
+ /* Calculate the cost of the original bb_vinfo. */
|
|
+ if (unlimited_cost_model (NULL))
|
|
+ {
|
|
+ vec<slp_instance> &instances = BB_VINFO_SLP_INSTANCES (bb_vinfo_ori);
|
|
+ vect_bb_vectorization_profitable_p (bb_vinfo_ori, instances, orig_loop);
|
|
+ }
|
|
+ /* If the vector cost and scalar cost do not differ much (here we set the
+ threshold to 4), we try to create a bb_vinfo for the transposed version. */
|
|
+ if (BB_VINFO_SCALAR_COST (bb_vinfo_ori)
|
|
+ < 4 * (BB_VINFO_VEC_INSIDE_COST (bb_vinfo_ori)
|
|
+ + BB_VINFO_VEC_OUTSIDE_COST (bb_vinfo_ori)))
|
|
+ {
|
|
+ return true;
|
|
+ }
|
|
+ return false;
|
|
+}
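
The retry heuristic above can be summarized by a small predicate. The factor-of-four threshold comes from the code above; the helper itself is only an illustrative sketch, not part of the patch.

/* Illustrative only: retry the analysis with transposed SLP when the
   original analysis failed, or when its benefit over scalar code is slim
   (scalar cost below four times the total vector cost).  */
static bool
worth_trying_transposed (bool ori_ok, int scalar_cost,
                         int vec_inside_cost, int vec_outside_cost)
{
  if (!ori_ok)
    return true;
  return scalar_cost < 4 * (vec_inside_cost + vec_outside_cost);
}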
|
|
|
|
+static bool
|
|
+may_choose_transpose_bbvinfo (bb_vec_info bb_vinfo_trans, bool res_trans,
|
|
+ bb_vec_info bb_vinfo_ori, bool res_ori,
|
|
+ loop_p orig_loop)
|
|
+{
|
|
+ /* The original bb_vinfo is chosen if the transposed bb_vinfo
|
|
+ can't be vectorized. */
|
|
+ if (!res_trans)
|
|
+ {
|
|
+ return false;
|
|
+ }
|
|
+ /* Calculate the cost of the transposed bb_vinfo. */
|
|
+ if (unlimited_cost_model (NULL))
|
|
+ {
|
|
+ vec<slp_instance> &instances = BB_VINFO_SLP_INSTANCES (bb_vinfo_trans);
|
|
+ vect_bb_vectorization_profitable_p (bb_vinfo_trans, instances,
|
|
+ orig_loop);
|
|
+ }
|
|
+ int diff_bb_cost = -1;
|
|
+ int diff_bb_cost_trans = -1;
|
|
+ if (res_ori)
|
|
+ {
|
|
+ diff_bb_cost = BB_VINFO_SCALAR_COST (bb_vinfo_ori)
|
|
+ - BB_VINFO_VEC_INSIDE_COST (bb_vinfo_ori)
|
|
+ - BB_VINFO_VEC_OUTSIDE_COST (bb_vinfo_ori);
|
|
+ }
|
|
+ if (res_trans)
|
|
+ {
|
|
+ diff_bb_cost_trans = BB_VINFO_SCALAR_COST (bb_vinfo_trans)
|
|
+ - BB_VINFO_VEC_INSIDE_COST (bb_vinfo_trans)
|
|
+ - BB_VINFO_VEC_OUTSIDE_COST (bb_vinfo_trans);
|
|
+ }
|
|
+ /* The original bb_vinfo is chosen when one of the following conditions
+ is satisfied:
+ 1) The cost of the original version is better than the transposed version.
+ 2) The vector cost is similar to the scalar cost in the transposed version. */
|
|
+ if ((res_ori && res_trans && diff_bb_cost >= diff_bb_cost_trans)
|
|
+ || (res_trans && BB_VINFO_SCALAR_COST (bb_vinfo_trans)
|
|
+ <= (BB_VINFO_VEC_INSIDE_COST (bb_vinfo_trans)
|
|
+ + BB_VINFO_VEC_OUTSIDE_COST (bb_vinfo_trans))))
|
|
+ {
|
|
+ return false;
|
|
+ }
|
|
return true;
|
|
}
|
|
|
|
@@ -5937,6 +7042,7 @@ vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
|
|
loop_p orig_loop)
|
|
{
|
|
bb_vec_info bb_vinfo;
|
|
+ bb_vec_info bb_vinfo_trans = NULL;
|
|
auto_vector_modes vector_modes;
|
|
|
|
/* Autodetect first vector size we try. */
|
|
@@ -5951,6 +7057,10 @@ vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
|
|
{
|
|
bool vectorized = false;
|
|
bool fatal = false;
|
|
+ bool res_bb_vinfo_ori = false;
|
|
+ bool res_bb_vinfo_trans = false;
|
|
+
|
|
+ /* Create a bb_vinfo for the original version. */
|
|
bb_vinfo = new _bb_vec_info (bbs, &shared);
|
|
|
|
bool first_time_p = shared.datarefs.is_empty ();
|
|
@@ -5960,8 +7070,113 @@ vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
|
|
else
|
|
bb_vinfo->shared->check_datarefs ();
|
|
bb_vinfo->vector_mode = next_vector_mode;
|
|
+ bb_vinfo->transposed = false;
|
|
+ bb_vinfo->before_slp = false;
|
|
+
|
|
+ res_bb_vinfo_ori = vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal,
|
|
+ dataref_groups);
|
|
+ auto_vec<slp_instance> profitable_subgraphs;
|
|
+ auto_vec<slp_instance> profitable_subgraphs_trans;
|
|
+ for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
|
|
+ {
|
|
+ if (instance->subgraph_entries.is_empty ())
|
|
+ continue;
|
|
+
|
|
+ vect_location = instance->location ();
|
|
+ if (!unlimited_cost_model (NULL)
|
|
+ && !vect_bb_vectorization_profitable_p
|
|
+ (bb_vinfo, instance->subgraph_entries, orig_loop))
|
|
+ {
|
|
+ if (dump_enabled_p ())
|
|
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
|
|
+ "not vectorized: vectorization is not "
|
|
+ "profitable.\n");
|
|
+ continue;
|
|
+ }
|
|
+ if (res_bb_vinfo_ori)
|
|
+ {
|
|
+ if (!dbg_cnt (vect_slp))
|
|
+ continue;
|
|
+ profitable_subgraphs.safe_push (instance);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* Analyze and create a transposed bb_vinfo. */
|
|
+ if (may_new_transpose_bbvinfo (bb_vinfo, res_bb_vinfo_ori, orig_loop))
|
|
+ {
|
|
+ bool fatal_trans = false;
|
|
+ bb_vinfo_trans
|
|
+ = new _bb_vec_info (bbs, &shared);
|
|
+ bool first_time_p = shared.datarefs.is_empty ();
|
|
+ BB_VINFO_DATAREFS (bb_vinfo_trans) = datarefs;
|
|
+ if (first_time_p)
|
|
+ {
|
|
+ bb_vinfo_trans->shared->save_datarefs ();
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ bb_vinfo_trans->shared->check_datarefs ();
|
|
+ }
|
|
+ bb_vinfo_trans->vector_mode = next_vector_mode;
|
|
+ bb_vinfo_trans->transposed = true;
|
|
+ bb_vinfo_trans->before_slp = false;
|
|
+
|
|
+ res_bb_vinfo_trans
|
|
+ = vect_slp_analyze_bb_1 (bb_vinfo_trans, n_stmts, fatal_trans,
|
|
+ dataref_groups);
|
|
+ if (res_bb_vinfo_trans)
|
|
+ {
|
|
+ for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo_trans))
|
|
+ {
|
|
+ if (instance->subgraph_entries.is_empty ())
|
|
+ continue;
|
|
+
|
|
+ vect_location = instance->location ();
|
|
+ if (!unlimited_cost_model (NULL)
|
|
+ && !vect_bb_vectorization_profitable_p
|
|
+ (bb_vinfo_trans, instance->subgraph_entries, orig_loop))
|
|
+ {
|
|
+ if (dump_enabled_p ())
|
|
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
|
|
+ "not vectorized: transpose vectorization is not "
|
|
+ "profitable.\n");
|
|
+ res_bb_vinfo_trans = false;
|
|
+ continue;
|
|
+ }
|
|
+ if (res_bb_vinfo_trans)
|
|
+ {
|
|
+ if (!dbg_cnt (vect_slp))
|
|
+ continue;
|
|
+ profitable_subgraphs_trans.safe_push (instance);
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ if (may_choose_transpose_bbvinfo (bb_vinfo_trans,
|
|
+ res_bb_vinfo_trans,
|
|
+ bb_vinfo, res_bb_vinfo_ori,
|
|
+ orig_loop))
|
|
+ {
|
|
+ bb_vinfo = bb_vinfo_trans;
|
|
+ fatal = fatal_trans;
|
|
+ if (dump_enabled_p ())
|
|
+ {
|
|
+ dump_printf_loc (MSG_NOTE, vect_location,
|
|
+ "Basic block part vectorized "
|
|
+ "using transposed version.\n");
|
|
+ }
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ if (dump_enabled_p ())
|
|
+ {
|
|
+ dump_printf_loc (MSG_NOTE, vect_location,
|
|
+ "Basic block part vectorized "
|
|
+ "\n");
|
|
+ }
|
|
+ }
|
|
+ }
|
|
|
|
- if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
|
|
+ if (res_bb_vinfo_ori || res_bb_vinfo_trans)
|
|
{
|
|
if (dump_enabled_p ())
|
|
{
|
|
@@ -5972,90 +7187,129 @@ vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
|
|
}
|
|
|
|
bb_vinfo->shared->check_datarefs ();
|
|
-
|
|
- auto_vec<slp_instance> profitable_subgraphs;
|
|
- for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
|
|
+ if (!res_bb_vinfo_trans)
|
|
{
|
|
- if (instance->subgraph_entries.is_empty ())
|
|
- continue;
|
|
-
|
|
- vect_location = instance->location ();
|
|
- if (!unlimited_cost_model (NULL)
|
|
- && !vect_bb_vectorization_profitable_p
|
|
- (bb_vinfo, instance->subgraph_entries, orig_loop))
|
|
+ /* When we're vectorizing an if-converted loop body make sure
|
|
+ we vectorized all if-converted code. */
|
|
+ if (!profitable_subgraphs.is_empty ()
|
|
+ && orig_loop)
|
|
{
|
|
- if (dump_enabled_p ())
|
|
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
|
|
- "not vectorized: vectorization is not "
|
|
- "profitable.\n");
|
|
- continue;
|
|
+ gcc_assert (bb_vinfo->bbs.length () == 1);
|
|
+ for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
|
|
+ !gsi_end_p (gsi); gsi_next (&gsi))
|
|
+ {
|
|
+ /* The costing above left us with DCEable vectorized scalar
|
|
+ stmts having the visited flag set on profitable
|
|
+ subgraphs. Do the delayed clearing of the flag here. */
|
|
+ if (gimple_visited_p (gsi_stmt (gsi)))
|
|
+ {
|
|
+ gimple_set_visited (gsi_stmt (gsi), false);
|
|
+ continue;
|
|
+ }
|
|
+ if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
|
|
+ continue;
|
|
+
|
|
+ if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
|
|
+ if (gimple_assign_rhs_code (ass) == COND_EXPR)
|
|
+ {
|
|
+ if (!profitable_subgraphs.is_empty ()
|
|
+ && dump_enabled_p ())
|
|
+ dump_printf_loc (MSG_NOTE, vect_location,
|
|
+ "not profitable because of "
|
|
+ "unprofitable if-converted scalar "
|
|
+ "code\n");
|
|
+ profitable_subgraphs.truncate (0);
|
|
+ }
|
|
+ }
|
|
}
|
|
|
|
- if (!dbg_cnt (vect_slp))
|
|
- continue;
|
|
+ /* Finally schedule the profitable subgraphs. */
|
|
+ for (slp_instance instance : profitable_subgraphs)
|
|
+ {
|
|
+ if (!vectorized && dump_enabled_p ())
|
|
+ dump_printf_loc (MSG_NOTE, vect_location,
|
|
+ "Basic block will be vectorized "
|
|
+ "using SLP\n");
|
|
+ vectorized = true;
|
|
|
|
- profitable_subgraphs.safe_push (instance);
|
|
- }
|
|
+ vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
|
|
|
|
- /* When we're vectorizing an if-converted loop body make sure
|
|
- we vectorized all if-converted code. */
|
|
- if (!profitable_subgraphs.is_empty ()
|
|
- && orig_loop)
|
|
+ unsigned HOST_WIDE_INT bytes;
|
|
+ if (dump_enabled_p ())
|
|
+ {
|
|
+ if (GET_MODE_SIZE
|
|
+ (bb_vinfo->vector_mode).is_constant (&bytes))
|
|
+ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
|
|
+ "basic block part vectorized using %wu "
|
|
+ "byte vectors\n", bytes);
|
|
+ else
|
|
+ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
|
|
+ "basic block part vectorized using "
|
|
+ "variable length vectors\n");
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ else
|
|
{
|
|
- gcc_assert (bb_vinfo->bbs.length () == 1);
|
|
- for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
|
|
- !gsi_end_p (gsi); gsi_next (&gsi))
|
|
+ if (!profitable_subgraphs_trans.is_empty ()
|
|
+ && orig_loop)
|
|
{
|
|
- /* The costing above left us with DCEable vectorized scalar
|
|
- stmts having the visited flag set on profitable
|
|
- subgraphs. Do the delayed clearing of the flag here. */
|
|
- if (gimple_visited_p (gsi_stmt (gsi)))
|
|
+ gcc_assert (bb_vinfo->bbs.length () == 1);
|
|
+ for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
|
|
+ !gsi_end_p (gsi); gsi_next (&gsi))
|
|
{
|
|
- gimple_set_visited (gsi_stmt (gsi), false);
|
|
- continue;
|
|
+ /* The costing above left us with DCEable vectorized scalar
|
|
+ stmts having the visited flag set on profitable
|
|
+ subgraphs. Do the delayed clearing of the flag here. */
|
|
+ if (gimple_visited_p (gsi_stmt (gsi)))
|
|
+ {
|
|
+ gimple_set_visited (gsi_stmt (gsi), false);
|
|
+ continue;
|
|
+ }
|
|
+ if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
|
|
+ continue;
|
|
+
|
|
+ if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
|
|
+ if (gimple_assign_rhs_code (ass) == COND_EXPR)
|
|
+ {
|
|
+ if (!profitable_subgraphs_trans.is_empty ()
|
|
+ && dump_enabled_p ())
|
|
+ dump_printf_loc (MSG_NOTE, vect_location,
|
|
+ "not profitable because of "
|
|
+ "unprofitable if-converted scalar "
|
|
+ "code\n");
|
|
+ profitable_subgraphs_trans.truncate (0);
|
|
+ }
|
|
}
|
|
- if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
|
|
- continue;
|
|
-
|
|
- if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
|
|
- if (gimple_assign_rhs_code (ass) == COND_EXPR)
|
|
- {
|
|
- if (!profitable_subgraphs.is_empty ()
|
|
- && dump_enabled_p ())
|
|
- dump_printf_loc (MSG_NOTE, vect_location,
|
|
- "not profitable because of "
|
|
- "unprofitable if-converted scalar "
|
|
- "code\n");
|
|
- profitable_subgraphs.truncate (0);
|
|
- }
|
|
}
|
|
- }
|
|
|
|
- /* Finally schedule the profitable subgraphs. */
|
|
- for (slp_instance instance : profitable_subgraphs)
|
|
- {
|
|
- if (!vectorized && dump_enabled_p ())
|
|
- dump_printf_loc (MSG_NOTE, vect_location,
|
|
- "Basic block will be vectorized "
|
|
- "using SLP\n");
|
|
- vectorized = true;
|
|
+ /* Finally schedule the profitable subgraphs. */
|
|
+ for (slp_instance instance : profitable_subgraphs_trans)
|
|
+ {
|
|
+ if (!vectorized && dump_enabled_p ())
|
|
+ dump_printf_loc (MSG_NOTE, vect_location,
|
|
+ "Basic block will be vectorized "
|
|
+ "using SLP\n");
|
|
+ vectorized = true;
|
|
|
|
- vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
|
|
+ vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
|
|
|
|
- unsigned HOST_WIDE_INT bytes;
|
|
- if (dump_enabled_p ())
|
|
- {
|
|
- if (GET_MODE_SIZE
|
|
- (bb_vinfo->vector_mode).is_constant (&bytes))
|
|
- dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
|
|
- "basic block part vectorized using %wu "
|
|
- "byte vectors\n", bytes);
|
|
- else
|
|
- dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
|
|
- "basic block part vectorized using "
|
|
- "variable length vectors\n");
|
|
+ unsigned HOST_WIDE_INT bytes;
|
|
+ if (dump_enabled_p ())
|
|
+ {
|
|
+ if (GET_MODE_SIZE
|
|
+ (bb_vinfo->vector_mode).is_constant (&bytes))
|
|
+ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
|
|
+ "basic block part vectorized using %wu "
|
|
+ "byte vectors\n", bytes);
|
|
+ else
|
|
+ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
|
|
+ "basic block part vectorized using "
|
|
+ "variable length vectors\n");
|
|
+ }
|
|
}
|
|
}
|
|
+
|
|
}
|
|
else
|
|
{
|
|
@@ -6081,6 +7335,10 @@ vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
|
|
}
|
|
|
|
delete bb_vinfo;
|
|
+ if (bb_vinfo_trans)
|
|
+ {
|
|
+ bb_vinfo_trans = NULL;
|
|
+ }
|
|
|
|
if (mode_i < vector_modes.length ()
|
|
&& VECTOR_MODE_P (autodetected_vector_mode)
|
|
@@ -7244,10 +8502,17 @@ vect_schedule_slp_node (vec_info *vinfo,
|
|
ready early, vectorized stores go before the last scalar
|
|
stmt which is where all uses are ready. */
|
|
stmt_vec_info last_stmt_info = NULL;
|
|
- if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
|
|
- last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
|
|
- else /* DR_IS_WRITE */
|
|
- last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
|
|
+
|
|
+ if (DR_GROUP_FIRST_ELEMENT (stmt_info)
|
|
+ && DR_GROUP_SLP_TRANSPOSE (DR_GROUP_FIRST_ELEMENT (stmt_info)))
|
|
+ last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
|
|
+ else
|
|
+ {
|
|
+ if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
|
|
+ last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
|
|
+ else /* DR_IS_WRITE */
|
|
+ last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
|
|
+ }
|
|
si = gsi_for_stmt (last_stmt_info->stmt);
|
|
}
|
|
else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
|
|
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
|
|
index 349200411..3099f6743 100644
|
|
--- a/gcc/tree-vect-stmts.cc
|
|
+++ b/gcc/tree-vect-stmts.cc
|
|
@@ -1369,10 +1369,10 @@ vect_get_load_cost (vec_info *, stmt_vec_info stmt_info, int ncopies,
|
|
|
|
static void
|
|
vect_init_vector_1 (vec_info *vinfo, stmt_vec_info stmt_vinfo, gimple *new_stmt,
|
|
- gimple_stmt_iterator *gsi)
|
|
+ gimple_stmt_iterator *gsi, bool transpose=false)
|
|
{
|
|
if (gsi)
|
|
- vect_finish_stmt_generation (vinfo, stmt_vinfo, new_stmt, gsi);
|
|
+ vect_finish_stmt_generation (vinfo, stmt_vinfo, new_stmt, gsi, transpose);
|
|
else
|
|
vinfo->insert_on_entry (stmt_vinfo, new_stmt);
|
|
|
|
@@ -1393,7 +1393,7 @@ vect_init_vector_1 (vec_info *vinfo, stmt_vec_info stmt_vinfo, gimple *new_stmt,
|
|
|
|
tree
|
|
vect_init_vector (vec_info *vinfo, stmt_vec_info stmt_info, tree val, tree type,
|
|
- gimple_stmt_iterator *gsi)
|
|
+ gimple_stmt_iterator *gsi, bool transpose)
|
|
{
|
|
gimple *init_stmt;
|
|
tree new_temp;
|
|
@@ -1418,7 +1418,7 @@ vect_init_vector (vec_info *vinfo, stmt_vec_info stmt_info, tree val, tree type,
|
|
new_temp = make_ssa_name (TREE_TYPE (type));
|
|
init_stmt = gimple_build_assign (new_temp, COND_EXPR,
|
|
val, true_val, false_val);
|
|
- vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi);
|
|
+ vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi, transpose);
|
|
val = new_temp;
|
|
}
|
|
}
|
|
@@ -1437,7 +1437,7 @@ vect_init_vector (vec_info *vinfo, stmt_vec_info stmt_info, tree val, tree type,
|
|
{
|
|
init_stmt = gsi_stmt (gsi2);
|
|
gsi_remove (&gsi2, false);
|
|
- vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi);
|
|
+ vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi, transpose);
|
|
}
|
|
}
|
|
}
|
|
@@ -1446,7 +1446,7 @@ vect_init_vector (vec_info *vinfo, stmt_vec_info stmt_info, tree val, tree type,
|
|
|
|
new_temp = vect_get_new_ssa_name (type, vect_simple_var, "cst_");
|
|
init_stmt = gimple_build_assign (new_temp, val);
|
|
- vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi);
|
|
+ vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi, transpose);
|
|
return new_temp;
|
|
}
|
|
|
|
@@ -1572,9 +1572,11 @@ vect_get_vec_defs (vec_info *vinfo, stmt_vec_info stmt_info, slp_tree slp_node,
|
|
statement and create and return a stmt_vec_info for it. */
|
|
|
|
static void
|
|
-vect_finish_stmt_generation_1 (vec_info *,
|
|
- stmt_vec_info stmt_info, gimple *vec_stmt)
|
|
+vect_finish_stmt_generation_1 (vec_info *vinfo,
|
|
+ stmt_vec_info stmt_info, gimple *vec_stmt, bool transpose=false)
|
|
{
|
|
+ if (transpose)
|
|
+ stmt_vec_info vec_stmt_info = vinfo->add_pattern_stmt (vec_stmt, NULL);
|
|
if (dump_enabled_p ())
|
|
dump_printf_loc (MSG_NOTE, vect_location, "add new stmt: %G", vec_stmt);
|
|
|
|
@@ -1616,7 +1618,7 @@ vect_finish_replace_stmt (vec_info *vinfo,
|
|
void
|
|
vect_finish_stmt_generation (vec_info *vinfo,
|
|
stmt_vec_info stmt_info, gimple *vec_stmt,
|
|
- gimple_stmt_iterator *gsi)
|
|
+ gimple_stmt_iterator *gsi, bool transpose)
|
|
{
|
|
gcc_assert (!stmt_info || gimple_code (stmt_info->stmt) != GIMPLE_LABEL);
|
|
|
|
@@ -1648,7 +1650,7 @@ vect_finish_stmt_generation (vec_info *vinfo,
|
|
}
|
|
}
|
|
gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
|
|
- vect_finish_stmt_generation_1 (vinfo, stmt_info, vec_stmt);
|
|
+ vect_finish_stmt_generation_1 (vinfo, stmt_info, vec_stmt, transpose);
|
|
}
|
|
|
|
/* We want to vectorize a call to combined function CFN with function
|
|
@@ -2159,6 +2161,173 @@ vector_vector_composition_type (tree vtype, poly_uint64 nelts, tree *ptype)
|
|
return NULL_TREE;
|
|
}
|
|
|
|
+/* Check successor BBs; a BB without loads is regarded as an empty BB. Ignore
+ empty BBs in the DFS. */
|
|
+
|
|
+static unsigned
|
|
+mem_refs_in_bb (basic_block bb, vec<gimple *> &stmts)
|
|
+{
|
|
+ unsigned num = 0;
|
|
+ for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
|
|
+ !gsi_end_p (gsi); gsi_next (&gsi))
|
|
+ {
|
|
+ gimple *stmt = gsi_stmt (gsi);
|
|
+ if (is_gimple_debug (stmt))
|
|
+ continue;
|
|
+ if (is_gimple_assign (stmt) && gimple_has_mem_ops (stmt)
|
|
+ && !gimple_has_volatile_ops (stmt))
|
|
+ {
|
|
+ if (gimple_assign_rhs_code (stmt) == MEM_REF
|
|
+ || gimple_assign_rhs_code (stmt) == ARRAY_REF)
|
|
+ {
|
|
+ stmts.safe_push (stmt);
|
|
+ num++;
|
|
+ }
|
|
+ else if (TREE_CODE (gimple_get_lhs (stmt)) == MEM_REF
|
|
+ || TREE_CODE (gimple_get_lhs (stmt)) == ARRAY_REF)
|
|
+ num++;
|
|
+ }
|
|
+ }
|
|
+ return num;
|
|
+}
|
|
+
|
|
+static bool
|
|
+check_same_base (vec<data_reference_p> *datarefs, data_reference_p dr)
|
|
+{
|
|
+ for (unsigned ui = 0; ui < datarefs->length (); ui++)
|
|
+ {
|
|
+ tree op1 = TREE_OPERAND (DR_BASE_OBJECT (dr), 0);
|
|
+ tree op2 = TREE_OPERAND (DR_BASE_OBJECT ((*datarefs)[ui]), 0);
|
|
+ if (TREE_CODE (op1) != TREE_CODE (op2))
|
|
+ continue;
|
|
+ if (TREE_CODE (op1) == ADDR_EXPR)
|
|
+ {
|
|
+ op1 = TREE_OPERAND (op1, 0);
|
|
+ op2 = TREE_OPERAND (op2, 0);
|
|
+ }
|
|
+ enum tree_code code = TREE_CODE (op1);
|
|
+ switch (code)
|
|
+ {
|
|
+ case VAR_DECL:
|
|
+ if (DECL_NAME (op1) == DECL_NAME (op2)
|
|
+ && DR_IS_READ ((*datarefs)[ui]))
|
|
+ return true;
|
|
+ break;
|
|
+ case SSA_NAME:
|
|
+ if (SSA_NAME_VERSION (op1) == SSA_NAME_VERSION (op2)
|
|
+ && DR_IS_READ ((*datarefs)[ui]))
|
|
+ return true;
|
|
+ break;
|
|
+ default:
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ return false;
|
|
+}
|
|
+
|
|
+/* Iterate over all load STMTS; if a vectorized stmt with the same base is
+ found, return. Otherwise, set SUCCESS to false. */
|
|
+
|
|
+static void
|
|
+check_vec_use (loop_vec_info loop_vinfo, vec<gimple *> &stmts,
|
|
+ stmt_vec_info stmt_info, bool &success)
|
|
+{
|
|
+ if (stmt_info == NULL)
|
|
+ {
|
|
+ success = false;
|
|
+ return;
|
|
+ }
|
|
+ if (DR_IS_READ (stmt_info->dr_aux.dr))
|
|
+ {
|
|
+ success = false;
|
|
+ return;
|
|
+ }
|
|
+ unsigned ui = 0;
|
|
+ gimple *candidate = NULL;
|
|
+ FOR_EACH_VEC_ELT (stmts, ui, candidate)
|
|
+ {
|
|
+ if (TREE_CODE (TREE_TYPE (gimple_get_lhs (candidate))) != VECTOR_TYPE)
|
|
+ continue;
|
|
+
|
|
+ if (candidate->bb != candidate->bb->loop_father->header)
|
|
+ {
|
|
+ success = false;
|
|
+ return;
|
|
+ }
|
|
+ auto_vec<data_reference_p> datarefs;
|
|
+ tree res = find_data_references_in_bb (candidate->bb->loop_father,
|
|
+ candidate->bb, &datarefs);
|
|
+ if (res == chrec_dont_know)
|
|
+ {
|
|
+ success = false;
|
|
+ return;
|
|
+ }
|
|
+ if (check_same_base (&datarefs, stmt_info->dr_aux.dr))
|
|
+ return;
|
|
+ }
|
|
+ success = false;
|
|
+}
|
|
+
|
|
+/* Depth-first search from the present BB. If a successor has load STMTS,
+ stop further searching. */
|
|
+
|
|
+static void
|
|
+dfs_check_bb (loop_vec_info loop_vinfo, basic_block bb, stmt_vec_info stmt_info,
|
|
+ bool &success, vec<basic_block> &visited_bbs)
|
|
+{
|
|
+ if (bb == cfun->cfg->x_exit_block_ptr)
|
|
+ {
|
|
+ success = false;
|
|
+ return;
|
|
+ }
|
|
+ if (!success || visited_bbs.contains (bb) || bb == loop_vinfo->loop->latch)
|
|
+ return;
|
|
+
|
|
+ visited_bbs.safe_push (bb);
|
|
+ auto_vec<gimple *> stmts;
|
|
+ unsigned num = mem_refs_in_bb (bb, stmts);
|
|
+ /* Empty BB. */
|
|
+ if (num == 0)
|
|
+ {
|
|
+ edge e;
|
|
+ edge_iterator ei;
|
|
+ FOR_EACH_EDGE (e, ei, bb->succs)
|
|
+ {
|
|
+ dfs_check_bb (loop_vinfo, e->dest, stmt_info, success, visited_bbs);
|
|
+ if (!success)
|
|
+ return;
|
|
+ }
|
|
+ return;
|
|
+ }
|
|
+ /* Non-empty BB. */
|
|
+ check_vec_use (loop_vinfo, stmts, stmt_info, success);
|
|
+}
|
|
+
|
|
+/* For a grouped store, check whether all successors of the present BB have a
+ vectorized load from the same base as the store. If so, set
+ memory_access_type to VMAT_CONTIGUOUS_PERMUTE instead of
+ VMAT_LOAD_STORE_LANES. */
|
|
+
|
|
+static bool
|
|
+conti_perm (stmt_vec_info stmt_vinfo, loop_vec_info loop_vinfo)
|
|
+{
|
|
+ gimple *stmt = stmt_vinfo->stmt;
|
|
+ if (gimple_code (stmt) != GIMPLE_ASSIGN)
|
|
+ return false;
|
|
+
|
|
+ if (DR_IS_READ (stmt_vinfo->dr_aux.dr))
|
|
+ return false;
|
|
+
|
|
+ basic_block bb = stmt->bb;
|
|
+ bool success = true;
|
|
+ auto_vec<basic_block> visited_bbs;
|
|
+ visited_bbs.safe_push (bb);
|
|
+ edge e;
|
|
+ edge_iterator ei;
|
|
+ FOR_EACH_EDGE (e, ei, bb->succs)
|
|
+ dfs_check_bb (loop_vinfo, e->dest, stmt_vinfo, success, visited_bbs);
|
|
+ return success;
|
|
+}
|
|
+
|
|
/* A subroutine of get_load_store_type, with a subset of the same
|
|
arguments. Handle the case where STMT_INFO is part of a grouped load
|
|
or store.
|
|
@@ -2373,6 +2542,20 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
|
|
*memory_access_type = VMAT_CONTIGUOUS_PERMUTE;
|
|
overrun_p = would_overrun_p;
|
|
}
|
|
+
|
|
+ if (*memory_access_type == VMAT_LOAD_STORE_LANES
|
|
+ && TREE_CODE (loop_vinfo->num_iters) == INTEGER_CST
|
|
+ && maybe_eq (tree_to_shwi (loop_vinfo->num_iters),
|
|
+ loop_vinfo->vectorization_factor)
|
|
+ && conti_perm (stmt_info, loop_vinfo)
|
|
+ && (vls_type == VLS_LOAD
|
|
+ ? vect_grouped_load_supported (vectype, single_element_p,
|
|
+ group_size)
|
|
+ : vect_grouped_store_supported (vectype, group_size)))
|
|
+ {
|
|
+ *memory_access_type = VMAT_CONTIGUOUS_PERMUTE;
|
|
+ overrun_p = would_overrun_p;
|
|
+ }
|
|
}
|
|
|
|
/* As a last resort, trying using a gather load or scatter store.
|
|
@@ -7456,6 +7639,154 @@ vectorizable_scan_store (vec_info *vinfo,
|
|
return true;
|
|
}
|
|
|
|
+/* Function vect_permute_store_chains
|
|
+
|
|
+ Call function vect_permute_store_chain ().
|
|
+ Given a chain of interleaved stores in DR_CHAIN, generate
|
|
+ interleave_high/low stmts to reorder the data correctly.
|
|
+ Return the final references for stores in RESULT_CHAIN. */
|
|
+
|
|
+static void
|
|
+vect_permute_store_chains (vec_info *vinfo, vec<tree> dr_chain,
|
|
+ unsigned int num_each, stmt_vec_info stmt_info,
|
|
+ gimple_stmt_iterator *gsi, vec<tree> *result_chain,
|
|
+ unsigned int group)
|
|
+{
|
|
+ unsigned int k = 0;
|
|
+ unsigned int t = 0;
|
|
+
|
|
+ /* Divide the vectors into GROUP parts and permute every NUM_EACH vectors
+ together. */
|
|
+ for (k = 0; k < group; k++)
|
|
+ {
|
|
+ auto_vec<tree> dr_chain_transposed (num_each);
|
|
+ auto_vec<tree> result_chain_transposed (num_each);
|
|
+ for (t = k; t < dr_chain.length (); t = t + group)
|
|
+ {
|
|
+ dr_chain_transposed.quick_push (dr_chain[t]);
|
|
+ }
|
|
+ vect_permute_store_chain (vinfo, dr_chain_transposed, num_each,
|
|
+ stmt_info, gsi, &result_chain_transposed);
|
|
+ for (t = 0; t < num_each; t++)
|
|
+ {
|
|
+ result_chain->quick_push (result_chain_transposed[t]);
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+/* Function transpose_oprnd_store
|
|
+
|
|
+ Calculate the transposed results from VEC_OPRNDS (VEC_STMT)
|
|
+ for vectorizable_store. */
|
|
+
|
|
+static void
|
|
+transpose_oprnd_store (vec_info *vinfo, vec<tree>vec_oprnds,
|
|
+ vec<tree> *result_chain, unsigned int vec_num,
|
|
+ unsigned int const_nunits, unsigned int array_num,
|
|
+ stmt_vec_info first_stmt_info,
|
|
+ gimple_stmt_iterator *gsi)
|
|
+{
|
|
+ unsigned int group_for_transform = 0;
|
|
+ unsigned int num_each = 0;
|
|
+
|
|
+ /* Transpose back for vec_oprnds. */
|
|
+ /* vec = {vec1, vec2, ...} */
|
|
+ if (array_num < const_nunits
|
|
+ && const_nunits % array_num == 0)
|
|
+ {
|
|
+ vect_transpose_store_chain (vinfo, vec_oprnds,
|
|
+ vec_num, array_num,
|
|
+ first_stmt_info,
|
|
+ gsi, result_chain);
|
|
+ }
|
|
+ /* vec1 = {vec_part1}, vec2 = {vec_part2}, ... */
|
|
+ else if (array_num >= const_nunits
|
|
+ && array_num % const_nunits == 0)
|
|
+ {
|
|
+ group_for_transform = array_num / const_nunits;
|
|
+ num_each = vec_oprnds.length () / group_for_transform;
|
|
+ vect_permute_store_chains (vinfo, vec_oprnds,
|
|
+ num_each, first_stmt_info,
|
|
+ gsi, result_chain,
|
|
+ group_for_transform);
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ gcc_unreachable ();
|
|
+ }
|
|
+}
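
For orientation, with an assumed 4-element vector (const_nunits = 4): two arrays (array_num = 2) take the vect_transpose_store_chain path, where each vector interleaves both arrays; eight arrays (array_num = 8) take the vect_permute_store_chains path with 8 / 4 = 2 permute groups. Other shapes are rejected by the gcc_unreachable above.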
|
|
+
|
|
+static dr_vec_info *
|
|
+get_dr_info (stmt_vec_info stmt_info)
|
|
+{
|
|
+ dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
|
|
+ if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
|
|
+ {
|
|
+ SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
|
|
+ }
|
|
+ return dr_info;
|
|
+}
|
|
+
|
|
+static unsigned
|
|
+dr_align_vect_store (vec_info *vinfo, dr_vec_info *cur_first_dr_info,
|
|
+ tree vectype, unsigned HOST_WIDE_INT &align)
|
|
+{
|
|
+ unsigned misalign = 0;
|
|
+ align = known_alignment (DR_TARGET_ALIGNMENT (cur_first_dr_info));
|
|
+ if (aligned_access_p (cur_first_dr_info, vectype))
|
|
+ {
|
|
+ return misalign;
|
|
+ }
|
|
+ else if (cur_first_dr_info->misalignment == -1)
|
|
+ {
|
|
+ align = dr_alignment (vect_dr_behavior (vinfo, cur_first_dr_info));
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ misalign = cur_first_dr_info->misalignment;
|
|
+ }
|
|
+ return misalign;
|
|
+}
|
|
+
|
|
+static void
|
|
+add_new_stmt_vect_store (vec_info *vinfo, tree vectype, tree dataref_ptr,
|
|
+ tree dataref_offset, tree ref_type,
|
|
+ dr_vec_info *cur_first_dr_info, tree vec_oprnd,
|
|
+ gimple_stmt_iterator *gsi, stmt_vec_info stmt_info)
|
|
+{
|
|
+ /* Data align. */
|
|
+ unsigned HOST_WIDE_INT align;
|
|
+ unsigned misalign = dr_align_vect_store (vinfo, cur_first_dr_info,
|
|
+ vectype, align);
|
|
+
|
|
+ if (dataref_offset == NULL_TREE && TREE_CODE (dataref_ptr) == SSA_NAME)
|
|
+ {
|
|
+ set_ptr_info_alignment (get_ptr_info (dataref_ptr), align, misalign);
|
|
+ }
|
|
+
|
|
+ /* Get data_ref. */
|
|
+ tree offset = dataref_offset ? dataref_offset : build_int_cst (ref_type, 0);
|
|
+ tree data_ref = fold_build2 (MEM_REF, vectype, dataref_ptr, offset);
|
|
+ if (aligned_access_p (cur_first_dr_info, vectype))
|
|
+ {
|
|
+ ;
|
|
+ }
|
|
+ else if (cur_first_dr_info->misalignment == -1)
|
|
+ {
|
|
+ TREE_TYPE (data_ref) = build_aligned_type (TREE_TYPE (data_ref),
|
|
+ align * BITS_PER_UNIT);
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ tree elem_type = TREE_TYPE (vectype);
|
|
+ TREE_TYPE (data_ref) = build_aligned_type (TREE_TYPE (data_ref),
|
|
+ TYPE_ALIGN (elem_type));
|
|
+ }
|
|
+ /* Add new stmt. */
|
|
+ vect_copy_ref_info (data_ref, DR_REF (cur_first_dr_info->dr));
|
|
+ gassign *new_stmt = gimple_build_assign (data_ref, vec_oprnd);
|
|
+ vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi, true);
|
|
+}
|
|
|
|
/* Function vectorizable_store.
|
|
|
|
@@ -8333,6 +8664,16 @@ vectorizable_store (vec_info *vinfo,
|
|
&vec_offsets);
|
|
vec_offset = vec_offsets[0];
|
|
}
|
|
+ /* If the stmt_info needs transpose recovery, dataref_ptr
+ will be calculated later. */
|
|
+ else if (memory_access_type == VMAT_CONTIGUOUS
|
|
+ && is_a <bb_vec_info> (vinfo)
|
|
+ && STMT_VINFO_GROUPED_ACCESS (stmt_info)
|
|
+ && DR_GROUP_SLP_TRANSPOSE (
|
|
+ DR_GROUP_FIRST_ELEMENT (stmt_info)))
|
|
+ {
|
|
+ dataref_ptr = NULL_TREE;
|
|
+ }
|
|
else
|
|
dataref_ptr
|
|
= vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
|
|
@@ -8423,6 +8764,75 @@ vectorizable_store (vec_info *vinfo,
|
|
}
|
|
else
|
|
{
|
|
+ /* group_size: the size of group after transposing and merging.
|
|
+ group_size_b: the size of group before transposing and merging,
|
|
+ and only group_size_b >= const_nunits is supported.
|
|
+ array_num: the number of arrays.
|
|
+ const_nunits: TYPE_VECTOR_SUBPARTS (vectype).
|
|
+ ncontinues: group_size_b / const_nunits, it means the number of
|
|
+ times an array is stored in memory. */
|
|
+ if (slp && is_a <bb_vec_info> (vinfo)
|
|
+ && STMT_VINFO_GROUPED_ACCESS (stmt_info)
|
|
+ && DR_GROUP_SLP_TRANSPOSE (DR_GROUP_FIRST_ELEMENT (stmt_info)))
|
|
+ {
|
|
+ if (dump_enabled_p ())
|
|
+ {
|
|
+ dump_printf_loc (MSG_NOTE, vect_location,
|
|
+ "vectorizable_store for slp transpose.\n");
|
|
+ }
|
|
+ /* Transpose back for grouped stores. */
|
|
+ vect_transform_back_slp_grouped_stores (bb_vinfo,
|
|
+ first_stmt_info);
|
|
+
|
|
+ result_chain.create (vec_oprnds.length ());
|
|
+ unsigned int const_nunits = nunits.to_constant ();
|
|
+ unsigned int group_size_b = DR_GROUP_SIZE_TRANS (first_stmt_info);
|
|
+ unsigned int array_num = group_size / group_size_b;
|
|
+ transpose_oprnd_store (vinfo, vec_oprnds, &result_chain, vec_num,
|
|
+ const_nunits, array_num,
|
|
+ first_stmt_info, gsi);
|
|
+
|
|
+ /* For every store group, not for every vec, because transposing
|
|
+ and merging have changed the data reference access. */
|
|
+ gcc_assert (group_size_b >= const_nunits);
|
|
+ unsigned int ncontinues = group_size_b / const_nunits;
|
|
+
|
|
+ unsigned int k = 0;
|
|
+ for (i = 0; i < array_num; i++)
|
|
+ {
|
|
+ stmt_vec_info first_stmt_b;
|
|
+ BB_VINFO_GROUPED_STORES (vinfo).iterate (i, &first_stmt_b);
|
|
+ bool simd_lane_access_p
|
|
+ = STMT_VINFO_SIMD_LANE_ACCESS_P (first_stmt_b) != 0;
|
|
+ tree ref_type = get_group_alias_ptr_type (first_stmt_b);
|
|
+ dataref_ptr = vect_create_data_ref_ptr (
|
|
+ vinfo, first_stmt_b, aggr_type,
|
|
+ simd_lane_access_p ? loop : NULL,
|
|
+ offset, &dummy, gsi, &ptr_incr,
|
|
+ simd_lane_access_p, bump);
|
|
+ dr_vec_info *cur_first_dr_info = get_dr_info (first_stmt_b);
|
|
+ for (unsigned int t = 0; t < ncontinues; t++)
|
|
+ {
|
|
+ vec_oprnd = result_chain[k];
|
|
+ k++;
|
|
+ if (t > 0)
|
|
+ {
|
|
+ /* Bump the vector pointer. */
|
|
+ dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr,
|
|
+ ptr_incr, gsi,
|
|
+ first_stmt_b, bump);
|
|
+ }
|
|
+ add_new_stmt_vect_store (vinfo, vectype, dataref_ptr,
|
|
+ dataref_offset, ref_type,
|
|
+ cur_first_dr_info, vec_oprnd,
|
|
+ gsi, first_stmt_b);
|
|
+ }
|
|
+ }
|
|
+ oprnds.release ();
|
|
+ result_chain.release ();
|
|
+ vec_oprnds.release ();
|
|
+ return true;
|
|
+ }
|
|
new_stmt = NULL;
|
|
if (grouped_store)
|
|
{
|
|
@@ -8719,6 +9129,451 @@ hoist_defs_of_uses (stmt_vec_info stmt_info, class loop *loop)
|
|
return true;
|
|
}
|
|
|
|
+static tree
|
|
+calculate_new_type (tree vectype, unsigned int const_nunits,
|
|
+ unsigned int group_size_b, unsigned int &nloads,
|
|
+ unsigned int &ncontinues, tree &lvectype)
|
|
+{
|
|
+ tree ltype = TREE_TYPE (vectype);
|
|
+ /* nloads is the number of ARRAYs in a vector.
|
|
+ vectemp = {a[], b[], ...} */
|
|
+ if (group_size_b < const_nunits)
|
|
+ {
|
|
+ tree ptype;
|
|
+ tree vtype
|
|
+ = vector_vector_composition_type (vectype,
|
|
+ const_nunits / group_size_b,
|
|
+ &ptype);
|
|
+ if (vtype != NULL_TREE)
|
|
+ {
|
|
+ nloads = const_nunits / group_size_b;
|
|
+ lvectype = vtype;
|
|
+ ltype = ptype;
|
|
+ ncontinues = 1;
|
|
+ }
|
|
+ }
|
|
+ /* ncontinues is the number of vectors from an ARRAY.
|
|
+ vectemp1 = {a[0], a[1], ...}
|
|
+ ...
|
|
+ vectempm = {a[k], a[k+1], ...} */
|
|
+ else
|
|
+ {
|
|
+ nloads = 1;
|
|
+ ltype = vectype;
|
|
+ ncontinues = group_size_b / const_nunits;
|
|
+ }
|
|
+ ltype = build_aligned_type (ltype, TYPE_ALIGN (TREE_TYPE (vectype)));
|
|
+ return ltype;
|
|
+}
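
For concreteness (sizes assumed purely for illustration): with const_nunits = 16 and group_size_b = 4, the first branch gives nloads = 16 / 4 = 4 and ncontinues = 1, so one vector is composed from four arrays; with const_nunits = 16 and group_size_b = 32, the second branch gives nloads = 1 and ncontinues = 32 / 16 = 2, so each array is loaded as two vectors.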
|
|
+
|
|
+static void
|
|
+generate_old_load_permutations (slp_tree slp_node, unsigned int group_size,
|
|
+ vec<unsigned> &old_load_permutation)
|
|
+{
|
|
+ /* Generate the old load permutations from the slp_node. */
|
|
+ unsigned i = 0;
|
|
+ unsigned k = 0;
|
|
+
|
|
+ /* If SLP_NODE has load_permutation, we copy it to old_load_permutation.
|
|
+ Otherwise, we generate a permutation sequentially. */
|
|
+ if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
|
|
+ {
|
|
+ FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (slp_node), i, k)
|
|
+ {
|
|
+ old_load_permutation.safe_push (k);
|
|
+ }
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ for (unsigned i = 0; i < group_size; i++)
|
|
+ {
|
|
+ old_load_permutation.safe_push (i);
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+static void
|
|
+generate_new_load_permutation_mapping (unsigned slp_node_length,
|
|
+ vec<unsigned> &group_idx,
|
|
+ const vec<unsigned> &load_permutation,
|
|
+ unsigned int group_size_b,
|
|
+ unsigned &new_group_size,
|
|
+ vec<unsigned> &group_from)
|
|
+{
|
|
+ /* group_num_vec: only stores the group_loads IDs which are calculated from
+ load_permutation. */
|
|
+ auto_vec<unsigned> group_num_vec;
|
|
+
|
|
+ /* Calculate which group_loads the stmts in SLP_NODE come from. */
|
|
+ unsigned i = 0;
|
|
+ unsigned k = 0;
|
|
+ FOR_EACH_VEC_ELT (load_permutation, i, k)
|
|
+ {
|
|
+ unsigned int t0 = k / group_size_b;
|
|
+ if (!group_num_vec.contains (t0))
|
|
+ {
|
|
+ group_num_vec.safe_push (t0);
|
|
+ }
|
|
+ group_from.safe_push (t0);
|
|
+ }
|
|
+ group_num_vec.qsort (cmp_for_group_num);
|
|
+ /* n_groups: the number of group_loads. */
|
|
+ unsigned int n_groups = group_num_vec.length ();
|
|
+ new_group_size = n_groups * group_size_b;
|
|
+ for (i = 0; i < n_groups; i++)
|
|
+ {
|
|
+ group_idx.safe_push (group_num_vec[i] * group_size_b);
|
|
+ }
|
|
+ /* A new mapping from group_ind_vec to group_from.
|
|
+ For example:
|
|
+ Origin: group_from = {1,1,3,3,5,5,7,7};
|
|
+ After mapping: group_from = {0,0,1,1,2,2,3,3}; */
|
|
+ auto_vec<unsigned> group_ind_vec (n_groups);
|
|
+ for (k = 0; k < n_groups; k++)
|
|
+ {
|
|
+ group_ind_vec.safe_push (k);
|
|
+ }
|
|
+ for (i = 0; i < slp_node_length; i++)
|
|
+ {
|
|
+ for (k = 0; k < n_groups; k++)
|
|
+ {
|
|
+ if (group_from[i] == group_num_vec[k])
|
|
+ {
|
|
+ group_from[i] = group_ind_vec[k];
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+static void
|
|
+generate_new_load_permutation (vec<unsigned> &new_load_permutation,
|
|
+ const vec<unsigned> &old_load_permutation,
|
|
+ slp_tree slp_node, bool &this_load_permuted,
|
|
+ const vec<unsigned> &group_from,
|
|
+ unsigned int group_size_b)
|
|
+{
|
|
+ unsigned slp_node_length = SLP_TREE_SCALAR_STMTS (slp_node).length ();
|
|
+ /* Generate the new load permutation from the new mapping. */
|
|
+ new_load_permutation.create (slp_node_length);
|
|
+ unsigned i = 0;
|
|
+ unsigned k = 0;
|
|
+ FOR_EACH_VEC_ELT (old_load_permutation, i, k)
|
|
+ {
|
|
+ /* t1 is the new permutation of k in the old permutation.
|
|
+ t1 = base_address + offset:
|
|
+ base_address = group_from[i] * group_size_b;
|
|
+ offset = k % group_size_b. */
|
|
+ unsigned int t1
|
|
+ = group_from[i] * group_size_b + k % group_size_b;
|
|
+ new_load_permutation.safe_push (t1);
|
|
+ if (t1 != k)
|
|
+ {
|
|
+ this_load_permuted = true;
|
|
+ }
|
|
+ }
|
|
+}
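
A standalone sketch of the remapping performed by the two functions above; the permutation values and group_size_b are chosen purely for illustration and are not taken from the patch.

#include <stdio.h>

int
main (void)
{
  /* Old permutation referencing elements of pre-merge groups 1 and 3,
     with group_size_b == 4; the groups get renumbered to 0 and 1.  */
  unsigned old_perm[]   = {4, 5, 6, 7, 12, 13, 14, 15};
  unsigned group_from[] = {0, 0, 0, 0,  1,  1,  1,  1};
  unsigned group_size_b = 4;
  for (unsigned i = 0; i < 8; i++)
    printf ("%u ", group_from[i] * group_size_b + old_perm[i] % group_size_b);
  printf ("\n");   /* prints: 0 1 2 3 4 5 6 7 */
  return 0;
}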
|
|
+
|
|
+static bool
|
|
+is_slp_perm (bool slp_perm, bool this_load_permuted, poly_uint64 nunits,
|
|
+ unsigned int group_size, stmt_vec_info first_stmt_info)
|
|
+{
|
|
+ /* Calculate the unrolling factor based on the smallest type. */
|
|
+ poly_uint64 unrolling_factor
|
|
+ = exact_div (common_multiple (nunits, group_size), group_size);
|
|
+ /* The load requires permutation when unrolling exposes
|
|
+ a gap either because the group is larger than the SLP
|
|
+ group-size or because there is a gap between the groups. */
|
|
+ if (!slp_perm && !this_load_permuted
|
|
+ && (known_eq (unrolling_factor, 1U)
|
|
+ || (group_size == DR_GROUP_SIZE (first_stmt_info)
|
|
+ && DR_GROUP_GAP (first_stmt_info) == 0)))
|
|
+ {
|
|
+ return false;
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ return true;
|
|
+ }
|
|
+}
|
|
+
|
|
+static void
|
|
+generate_load_permutation (slp_tree slp_node, unsigned &new_group_size,
|
|
+ unsigned int group_size, unsigned int group_size_b,
|
|
+ bool &this_load_permuted, vec<unsigned> &group_idx,
|
|
+ vec<unsigned> &new_load_permutation)
|
|
+{
|
|
+ /* Generate the old load permutations from SLP_NODE. */
|
|
+ vec<unsigned> old_load_permutation;
|
|
+ old_load_permutation.create (group_size);
|
|
+ generate_old_load_permutations (slp_node, group_size, old_load_permutation);
|
|
+
|
|
+ /* Calculate which group_loads the stmts in SLP_NODE come from. */
|
|
+ unsigned slp_node_length = SLP_TREE_SCALAR_STMTS (slp_node).length ();
|
|
+ /* group_from: stores the group_loads ID for every stmt in SLP_NODE. */
|
|
+ vec<unsigned> group_from;
|
|
+ group_from.create (slp_node_length);
|
|
+ generate_new_load_permutation_mapping (slp_node_length, group_idx,
|
|
+ old_load_permutation,
|
|
+ group_size_b, new_group_size,
|
|
+ group_from);
|
|
+
|
|
+ /* Generate the new load permutation from the new mapping and calculate
+ the this_load_permuted flag. If this_load_permuted is true, we need to
+ execute the SLP permutation using the new load permutation. */
|
|
+ generate_new_load_permutation (new_load_permutation, old_load_permutation,
|
|
+ slp_node, this_load_permuted, group_from,
|
|
+ group_size_b);
|
|
+ old_load_permutation.release ();
|
|
+ group_from.release ();
|
|
+}
+
+static unsigned int
+dr_align_vect_load (vec_info *vinfo, dr_vec_info *cur_first_dr_info,
+ tree vectype, unsigned HOST_WIDE_INT &align,
+ enum dr_alignment_support alignment_support_scheme)
+{
+ unsigned int misalign = 0;
+
+ align = known_alignment (DR_TARGET_ALIGNMENT (cur_first_dr_info));
+ if (alignment_support_scheme == dr_aligned)
+ {
+ gcc_assert (aligned_access_p (cur_first_dr_info, vectype));
+ }
+ else if (cur_first_dr_info->misalignment == -1)
+ {
+ align = dr_alignment (vect_dr_behavior (vinfo, cur_first_dr_info));
+ }
+ else
+ {
+ misalign = cur_first_dr_info->misalignment;
+ }
+ return misalign;
+}
+
+static stmt_vec_info
+add_new_stmt_vect_load (vec_info *vinfo, tree vectype, tree dataref_ptr,
+ tree dataref_offset, tree ref_type, tree ltype,
+ gassign *(&new_stmt), dr_vec_info *cur_first_dr_info,
+ gimple_stmt_iterator *gsi, stmt_vec_info stmt_info)
+{
+ /* Data align. */
+ int malign = dr_misalignment (cur_first_dr_info, vectype);
+ enum dr_alignment_support alignment_support_scheme
+ = vect_supportable_dr_alignment (vinfo, cur_first_dr_info,
+ vectype, malign);
+ unsigned HOST_WIDE_INT align;
+ unsigned int misalign = dr_align_vect_load (vinfo, cur_first_dr_info,
+ vectype, align,
+ alignment_support_scheme);
+ if (dataref_offset == NULL_TREE && TREE_CODE (dataref_ptr) == SSA_NAME)
+ {
+ set_ptr_info_alignment (get_ptr_info (dataref_ptr), align, misalign);
+ }
+
+ /* Get data_ref. */
+ tree offset = dataref_offset ? dataref_offset : build_int_cst (ref_type, 0);
+ tree data_ref = fold_build2 (MEM_REF, ltype, dataref_ptr, offset);
+ if (alignment_support_scheme == dr_aligned)
+ {
+ ;
+ }
+ else if (cur_first_dr_info->misalignment == -1)
+ {
+ TREE_TYPE (data_ref)
+ = build_aligned_type (TREE_TYPE (data_ref), align * BITS_PER_UNIT);
+ }
+ else
+ {
+ tree elem_type = TREE_TYPE (vectype);
+ TREE_TYPE (data_ref)
+ = build_aligned_type (TREE_TYPE (data_ref), TYPE_ALIGN (elem_type));
+ }
+
+ /* Add new stmt. */
+ vect_copy_ref_info (data_ref, DR_REF (cur_first_dr_info->dr));
+ new_stmt = gimple_build_assign (make_ssa_name (ltype), data_ref);
+ vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi, true);
+ stmt_vec_info vec_stmt_info = vinfo->lookup_stmt (new_stmt);
+ return vec_stmt_info;
+}
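For orientation, add_new_stmt_vect_load emits a single (possibly alignment-adjusted) load of LTYPE; roughly, with invented SSA names, the generated GIMPLE looks like:

  /* Illustrative dump only; names and types are placeholders.  */
  vect_tmp_1 = MEM <ltype> [(char *) dataref_ptr_2 + 0B];

The callers below then either collect several such loads into a CONSTRUCTOR (the nloads path) or emit one such load per vector (the ncontinues path).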
+
+static void
+push_new_stmt_to_dr_chain (bool slp_perm, stmt_vec_info new_stmt_info,
+ vec<tree> dr_chain, slp_tree slp_node)
+{
+ if (slp_perm)
+ dr_chain.quick_push (gimple_assign_lhs (new_stmt_info->stmt));
+ else
+ SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info->stmt);
+}
+
+static stmt_vec_info
+get_first_stmt_info_before_transpose (stmt_vec_info first_stmt_info,
+ unsigned int group_el,
+ unsigned int group_size)
+{
+ stmt_vec_info last_stmt_info = first_stmt_info;
+ unsigned int count = 0;
+ gcc_assert (group_el < group_size);
+ while (count < group_el)
+ {
+ last_stmt_info = DR_GROUP_NEXT_ELEMENT (last_stmt_info);
+ count++;
+ }
+ return last_stmt_info;
+}
+
+static stmt_vec_info
+add_new_stmt_for_nloads_greater_than_one (vec_info *vinfo, tree lvectype,
+ tree vectype,
+ vec<constructor_elt, va_gc> *v,
+ stmt_vec_info stmt_info,
+ gimple_stmt_iterator *gsi)
+{
+ tree vec_inv = build_constructor (lvectype, v);
+ tree new_temp = vect_init_vector (vinfo, stmt_info, vec_inv, lvectype, gsi, true);
+ stmt_vec_info new_stmt_info = vinfo->lookup_def (new_temp);
+ if (lvectype != vectype)
+ {
+ gassign *new_stmt = gimple_build_assign (make_ssa_name (vectype),
+ VIEW_CONVERT_EXPR,
+ build1 (VIEW_CONVERT_EXPR,
+ vectype, new_temp));
+ vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi, true);
+ new_stmt_info = vinfo->lookup_stmt (new_stmt);
+ }
+ return new_stmt_info;
+}
+
+/* Function new_vect_stmt_for_nloads.
+
+ Create a new VEC_STMT when nloads arrays are merged into one vector.
+
+ ncopies is the number of vectors that need to be loaded from memory.
+ nloads is the number of ARRAYs in a vector.
+ vectemp = {a[], b[], ...} */
+
+static void
+new_vect_stmt_for_nloads (vec_info *vinfo, unsigned int ncopies,
+ unsigned int nloads, const vec<unsigned> &group_idx,
+ stmt_vec_info stmt_info, offset_info *offset_info,
+ vectype_info *vectype_info,
+ vect_memory_access_type memory_access_type,
+ bool slp_perm, vec<tree> dr_chain, slp_tree slp_node,
+ gimple_stmt_iterator *gsi)
+{
+ vec<constructor_elt, va_gc> *v = NULL;
+ stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+ unsigned int group_size = DR_GROUP_SIZE (first_stmt_info);
+ stmt_vec_info first_stmt_info_b = NULL;
+ stmt_vec_info new_stmt_info = NULL;
+ tree dataref_ptr = NULL_TREE;
+ tree dummy;
+ gimple *ptr_incr = NULL;
+ unsigned int n = 0;
+ for (unsigned int i = 0; i < ncopies; i++)
+ {
+ vec_alloc (v, nloads);
+ for (unsigned int t = 0; t < nloads; t++)
+ {
+ first_stmt_info_b = get_first_stmt_info_before_transpose (
+ first_stmt_info, group_idx[n++], group_size);
+ dr_vec_info* cur_first_dr_info = get_dr_info (first_stmt_info_b);
+ tree bump = vect_get_data_ptr_increment (vinfo, cur_first_dr_info,
+ vectype_info->ltype,
+ memory_access_type);
+ bool simd_lane_access_p
+ = STMT_VINFO_SIMD_LANE_ACCESS_P (first_stmt_info_b) != 0;
+
+ /* Create dataref_ptr, which points to init_address. */
+ dataref_ptr = vect_create_data_ref_ptr (
+ vinfo, first_stmt_info_b, vectype_info->ltype, NULL,
+ offset_info->offset, &dummy, gsi, &ptr_incr,
+ simd_lane_access_p, bump);
+
+ gassign *new_stmt = NULL;
+ new_stmt_info = add_new_stmt_vect_load (vinfo, vectype_info->vectype, dataref_ptr,
+ offset_info->dataref_offset,
+ vectype_info->ref_type, vectype_info->ltype,
+ new_stmt, cur_first_dr_info, gsi,
+ first_stmt_info_b);
+
+ CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, gimple_assign_lhs (new_stmt));
+ }
+ new_stmt_info = add_new_stmt_for_nloads_greater_than_one (
+ vinfo, vectype_info->lvectype,
+ vectype_info->vectype, v,
+ first_stmt_info_b, gsi);
+ push_new_stmt_to_dr_chain (slp_perm, new_stmt_info,
+ dr_chain, slp_node);
+ }
+}
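The effect of the nloads path can be illustrated outside the vectorizer. The stand-alone C sketch below is not taken from the patch; it assumes const_nunits == 16 and group_size_b == 4 (so nloads == 4) and mirrors what the generated code does: four small loads concatenated into one 16-byte vector.

  typedef unsigned char v4qi  __attribute__ ((vector_size (4)));
  typedef unsigned char v16qi __attribute__ ((vector_size (16)));

  /* vectemp = {a0[], a1[], a2[], a3[]}: one vector built from four arrays.  */
  static v16qi
  merge_four_arrays (const unsigned char *a0, const unsigned char *a1,
                     const unsigned char *a2, const unsigned char *a3)
  {
    union { v4qi part[4]; v16qi whole; } u;
    __builtin_memcpy (&u.part[0], a0, 4);
    __builtin_memcpy (&u.part[1], a1, 4);
    __builtin_memcpy (&u.part[2], a2, 4);
    __builtin_memcpy (&u.part[3], a3, 4);
    return u.whole;
  }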
+
+/* Function new_vect_stmt_for_ncontinues.
+
+ Create new VEC_STMTs when an array is divided into several vectors.
+
+ n_groups is the number of ARRAYs.
+ ncontinues is the number of vectors from an ARRAY.
+ vectemp1 = {a[0], a[1], ...}
+ ...
+ vectempm = {a[k], a[k+1], ...} */
+
+static void
+new_vect_stmt_for_ncontinues (vec_info *vinfo, unsigned int ncontinues,
+ const vec<unsigned> &group_idx,
+ stmt_vec_info stmt_info,
+ offset_info* offset_info,
+ vectype_info* vectype_info,
+ vect_memory_access_type memory_access_type,
+ bool slp_perm, vec<tree> &dr_chain,
+ slp_tree slp_node,
+ gimple_stmt_iterator *gsi)
+{
+ stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+ unsigned int group_size = DR_GROUP_SIZE (first_stmt_info);
+ stmt_vec_info new_stmt_info = NULL;
+ tree dataref_ptr = NULL_TREE;
+ tree dummy;
+ gimple *ptr_incr = NULL;
+ unsigned int n_groups = group_idx.length ();
+ for (unsigned int i = 0; i < n_groups; i++)
+ {
+ stmt_vec_info first_stmt_info_b = get_first_stmt_info_before_transpose (
+ first_stmt_info, group_idx[i], group_size);
+ dr_vec_info* cur_first_dr_info = get_dr_info (first_stmt_info_b);
+ tree bump = vect_get_data_ptr_increment (vinfo, cur_first_dr_info,
+ vectype_info->ltype, memory_access_type);
+ bool simd_lane_access_p
+ = STMT_VINFO_SIMD_LANE_ACCESS_P (first_stmt_info_b) != 0;
+ for (unsigned int k = 0; k < ncontinues; k++)
+ {
+ /* Create dataref_ptr, which points to init_address. */
+ if (k == 0)
+ {
+ dataref_ptr = vect_create_data_ref_ptr (
+ vinfo, first_stmt_info_b, vectype_info->ltype, NULL,
+ offset_info->offset, &dummy, gsi, &ptr_incr,
+ simd_lane_access_p, bump);
+ }
+ else
+ {
+ dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
+ gsi, first_stmt_info_b, bump);
+ }
+ gassign *new_stmt = NULL;
+ new_stmt_info = add_new_stmt_vect_load (vinfo, vectype_info->vectype, dataref_ptr,
+ offset_info->dataref_offset,
+ vectype_info->ref_type, vectype_info->ltype,
+ new_stmt, cur_first_dr_info, gsi,
+ first_stmt_info_b);
+ push_new_stmt_to_dr_chain (slp_perm, new_stmt_info,
+ dr_chain, slp_node);
+ }
+ }
+}
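Correspondingly, the ncontinues path splits a single array across several vectors. A stand-alone sketch (illustrative only, assuming group_size_b == 32 and const_nunits == 16, so ncontinues == 2):

  typedef unsigned char v16qi __attribute__ ((vector_size (16)));

  /* vectemp1 = {a[0], ..., a[15]}, vectemp2 = {a[16], ..., a[31]}.  */
  static void
  split_one_array (const unsigned char *a, v16qi *out0, v16qi *out1)
  {
    __builtin_memcpy (out0, a, 16);
    __builtin_memcpy (out1, a + 16, 16);
  }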
+
/* vectorizable_load.

Check if STMT_INFO reads a non scalar data-ref (array/pointer/structure)

@@ -9338,6 +10193,8 @@ vectorizable_load (vec_info *vinfo,
if (bb_vinfo)
first_stmt_info_for_drptr
= vect_find_first_scalar_stmt_in_slp (slp_node);
+ // first_stmt_info_for_drptr = SLP_TREE_SCALAR_STMTS (slp_node)[0];
+

/* Check if the chain of loads is already vectorized. */
if (STMT_VINFO_VEC_STMTS (first_stmt_info).exists ()
@@ -9601,6 +10458,9 @@ vectorizable_load (vec_info *vinfo,
}
tree vec_mask = NULL_TREE;
poly_uint64 group_elt = 0;
+ unsigned new_group_size = 0;
+ vec<unsigned> new_load_permutation;
+
for (j = 0; j < ncopies; j++)
{
/* 1. Create the vector or array pointer update chain. */
@@ -9621,6 +10481,15 @@ vectorizable_load (vec_info *vinfo,
dataref_ptr = unshare_expr (DR_BASE_ADDRESS (first_dr_info->dr));
dataref_offset = build_int_cst (ref_type, 0);
}
+ /* If the stmt_info needs to be recovered from a transposed group,
+ dataref_ptr will be calculated later. */
+ else if (slp && is_a <bb_vec_info> (vinfo)
+ && STMT_VINFO_GROUPED_ACCESS (stmt_info)
+ && DR_GROUP_SLP_TRANSPOSE (
+ DR_GROUP_FIRST_ELEMENT (stmt_info)))
+ {
+ dataref_ptr = NULL_TREE;
+ }
else if (diff_first_stmt_info)
{
dataref_ptr
@@ -9731,6 +10600,63 @@ vectorizable_load (vec_info *vinfo,
/* Record that VEC_ARRAY is now dead. */
vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
}
+ else if (slp && is_a <bb_vec_info> (vinfo)
+ && STMT_VINFO_GROUPED_ACCESS (stmt_info)
+ && DR_GROUP_SLP_TRANSPOSE (DR_GROUP_FIRST_ELEMENT (stmt_info)))
+ {
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "vectorizable_load for slp transpose.\n");
+ }
+ /* group_size: the size of the group after merging.
+ group_size_b: the size of the group before merging.
+ const_nunits: TYPE_VECTOR_SUBPARTS (vectype), i.e. the number of
+ elements in a vector.
+ nloads: const_nunits / group_size_b or 1, i.e. the number
+ of ARRAYs in a vector.
+ ncontinues: group_size_b / const_nunits or 1, i.e. the number
+ of vectors from an ARRAY. */
+ unsigned int group_size_b = DR_GROUP_SIZE_TRANS (first_stmt_info);
+ unsigned int const_nunits = nunits.to_constant ();
+ unsigned int nloads = const_nunits;
+ unsigned int ncontinues = group_size_b;
+ tree lvectype = vectype;
+ tree ltype = calculate_new_type (vectype, const_nunits,
+ group_size_b, nloads,
+ ncontinues, lvectype);
+ bool this_load_permuted = false;
+ auto_vec<unsigned> group_idx;
+ generate_load_permutation (slp_node, new_group_size, group_size,
+ group_size_b, this_load_permuted,
+ group_idx, new_load_permutation);
+ slp_perm = is_slp_perm (slp_perm, this_load_permuted, nunits,
+ group_size, first_stmt_info);
+
+ /* ncopies: the number of vectors that need to be loaded from
+ memory. */
+ unsigned int ncopies = new_group_size / const_nunits;
+ offset_info offset_info = {offset, NULL_TREE, dataref_offset};
+ vectype_info vectype_info = {vectype, ltype, lvectype, ref_type};
+ if (slp_perm)
+ {
+ dr_chain.create (ncopies);
+ }
+ if (nloads > 1 && ncontinues == 1)
+ {
+ new_vect_stmt_for_nloads (vinfo, ncopies, nloads, group_idx,
+ stmt_info, &offset_info, &vectype_info,
+ memory_access_type, slp_perm, dr_chain,
+ slp_node, gsi);
+ }
+ else
+ {
+ new_vect_stmt_for_ncontinues (vinfo, ncontinues, group_idx,
+ stmt_info, &offset_info,
+ &vectype_info, memory_access_type,
+ slp_perm, dr_chain, slp_node, gsi);
+ }
+ }
else
{
for (i = 0; i < vec_num; i++)
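To make the dispatch in the transposed-load branch above concrete (values assumed for illustration, following the nloads/ncontinues definitions in its comment): with V16QI vectors const_nunits is 16, so a pre-merge group of group_size_b == 4 yields nloads == 4 and ncontinues == 1 and takes the new_vect_stmt_for_nloads path, while group_size_b == 32 yields nloads == 1 and ncontinues == 2 and takes the new_vect_stmt_for_ncontinues path.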
@@ -10177,7 +11103,32 @@ vectorizable_load (vec_info *vinfo,
if (slp && !slp_perm)
continue;

- if (slp_perm)
+ /* Use the new load permutation to generate vector permute statements
+ from a list of loads in DR_CHAIN. */
+ if (slp && slp_perm && is_a <bb_vec_info> (vinfo)
+ && STMT_VINFO_GROUPED_ACCESS (stmt_info)
+ && DR_GROUP_SLP_TRANSPOSE (DR_GROUP_FIRST_ELEMENT (stmt_info)))
+ {
+ unsigned n_perms;
+ stmt_vec_info stmt_info_ = SLP_TREE_SCALAR_STMTS (slp_node)[0];
+ unsigned int old_size = DR_GROUP_SIZE (stmt_info);
+ DR_GROUP_SIZE (stmt_info_) = new_group_size;
+ vec<unsigned> old_load_permutation
+ = SLP_TREE_LOAD_PERMUTATION (slp_node);
+ SLP_TREE_LOAD_PERMUTATION (slp_node) = new_load_permutation;
+ bool perm_load_success = vect_transform_slp_perm_load (
+ vinfo, slp_node, dr_chain, gsi, vf,
+ false, &n_perms);
+ DR_GROUP_SIZE (stmt_info_) = old_size;
+ SLP_TREE_LOAD_PERMUTATION (slp_node) = old_load_permutation;
+ new_load_permutation.release ();
+ if (!perm_load_success)
+ {
+ dr_chain.release ();
+ return false;
+ }
+ }
+ else if (slp_perm)
{
unsigned n_perms;
/* For SLP we know we've seen all possible uses of dr_chain so
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 642eb0aeb..e13bc6c99 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -412,6 +412,21 @@ public:
vec<ddr_p> ddrs;
};

+/* Information about the offset in vectorizable_load. */
+struct offset_info {
+ tree offset;
+ tree byte_offset;
+ tree dataref_offset;
+};
+
+/* Information about the vector types in vectorizable_load. */
+struct vectype_info {
+ tree vectype;
+ tree ltype;
+ tree lvectype;
+ tree ref_type;
+};
+
/* Vectorizer state common between loop and basic-block vectorization. */
class vec_info {
public:
@@ -455,6 +470,14 @@ public:
stmt in the chain. */
auto_vec<stmt_vec_info> grouped_stores;

+ /* All interleaving chains of loads, represented by the first
+ stmt in the chain. */
+ auto_vec<stmt_vec_info> grouped_loads;
+
+ /* All interleaving chains of stores (before transposing), represented by
+ all stmts in the chain. */
+ auto_vec<vec<stmt_vec_info> > scalar_stores;
+
/* The set of vector modes used in the vectorized region. */
mode_set used_vector_modes;

@@ -899,6 +922,8 @@ public:
#define LOOP_VINFO_CHECK_NONZERO(L) (L)->check_nonzero
#define LOOP_VINFO_LOWER_BOUNDS(L) (L)->lower_bounds
#define LOOP_VINFO_GROUPED_STORES(L) (L)->grouped_stores
+#define LOOP_VINFO_GROUPED_LOADS(L) (L)->grouped_loads
+#define LOOP_VINFO_SCALAR_STORES(L) (L)->scalar_stores
#define LOOP_VINFO_SLP_INSTANCES(L) (L)->slp_instances
#define LOOP_VINFO_SLP_UNROLLING_FACTOR(L) (L)->slp_unrolling_factor
#define LOOP_VINFO_REDUCTIONS(L) (L)->reductions
@@ -982,6 +1007,25 @@ public:
vec<basic_block> bbs;

vec<slp_root> roots;
+
+ /* True if bb_vinfo can proceed to vect_analyze_slp. */
+ bool before_slp;
+
+ /* True if bb_vinfo is a transposed version. */
+ bool transposed;
+
+ /* The number of transposed groups. */
+ int transposed_group;
+
+ /* The cost of the scalar iterations. */
+ int scalar_cost;
+
+ /* The cost of the vector prologue and epilogue, including peeled
+ iterations and set-up code. */
+ int vec_outside_cost;
+
+ /* The cost of the vector loop body. */
+ int vec_inside_cost;
} *bb_vec_info;

#define BB_VINFO_BB(B) (B)->bb
@@ -989,6 +1033,14 @@ public:
#define BB_VINFO_SLP_INSTANCES(B) (B)->slp_instances
#define BB_VINFO_DATAREFS(B) (B)->shared->datarefs
#define BB_VINFO_DDRS(B) (B)->shared->ddrs
+#define BB_VINFO_GROUPED_LOADS(B) (B)->grouped_loads
+#define BB_VINFO_SCALAR_STORES(B) (B)->scalar_stores
+#define BB_VINFO_VEC_OUTSIDE_COST(B) (B)->vec_outside_cost
+#define BB_VINFO_VEC_INSIDE_COST(B) (B)->vec_inside_cost
+#define BB_VINFO_SCALAR_COST(B) (B)->scalar_cost
+#define BB_VINFO_SLP_TRANSPOSED(B) (B)->transposed
+#define BB_VINFO_BEFORE_SLP(B) (B)->before_slp
+#define BB_VINFO_TRANS_GROUPS(B) (B)->transposed_group

/*-----------------------------------------------------------------*/
/* Info on vectorized defs. */
@@ -1219,6 +1271,17 @@ public:
stmt_vec_info next_element;
/* The size of the group. */
unsigned int size;
+
+ /* The size of the group before transposing. */
+ unsigned int size_before_transpose;
+
+ /* If true, the stmt_info is SLP transposed. */
+ bool slp_transpose;
+
+ /* The group store number, used to rebuild the interleaving chain
+ during the transpose phase. Value -1 means the group cannot be
+ transposed. */
+ int group_number;
+
/* For stores, number of stores from this group seen. We vectorize the last
one. */
unsigned int store_count;
@@ -1226,6 +1289,9 @@ public:
is 1. */
unsigned int gap;

+ /* The gap before transposing. */
+ unsigned int gap_before_transpose;
+
/* The minimum negative dependence distance this stmt participates in
or zero if none. */
unsigned int min_neg_dist;
@@ -1427,6 +1493,12 @@ struct gather_scatter_info {
#define STMT_VINFO_SLP_VECT_ONLY(S) (S)->slp_vect_only_p
#define STMT_VINFO_SLP_VECT_ONLY_PATTERN(S) (S)->slp_vect_pattern_only_p

+#define DR_GROUP_SLP_TRANSPOSE(S) \
+ (gcc_checking_assert ((S)->dr_aux.dr), (S)->slp_transpose)
+#define DR_GROUP_SIZE_TRANS(S) \
+ (gcc_checking_assert ((S)->dr_aux.dr), (S)->size_before_transpose)
+#define DR_GROUP_NUMBER(S) \
+ (gcc_checking_assert ((S)->dr_aux.dr), (S)->group_number)
#define DR_GROUP_FIRST_ELEMENT(S) \
(gcc_checking_assert ((S)->dr_aux.dr), (S)->first_element)
#define DR_GROUP_NEXT_ELEMENT(S) \
@@ -1437,6 +1509,8 @@ struct gather_scatter_info {
(gcc_checking_assert ((S)->dr_aux.dr), (S)->store_count)
#define DR_GROUP_GAP(S) \
(gcc_checking_assert ((S)->dr_aux.dr), (S)->gap)
+#define DR_GROUP_GAP_TRANS(S) \
+ (gcc_checking_assert ((S)->dr_aux.dr), (S)->gap_before_transpose)

#define REDUC_GROUP_FIRST_ELEMENT(S) \
(gcc_checking_assert (!(S)->dr_aux.dr), (S)->first_element)
@@ -2033,6 +2107,17 @@ vect_get_scalar_dr_size (dr_vec_info *dr_info)
return tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_info->dr))));
}

+/* Compare two unsigned ints A and B,
+ sorting them in ascending order. */
+
+static inline int
+cmp_for_group_num (const void *a_, const void *b_)
+{
+ unsigned int a = *(unsigned int *)const_cast<void *>(a_);
+ unsigned int b = *(unsigned int *)const_cast<void *>(b_);
+ return a < b ? -1 : 1;
+}
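A hypothetical usage sketch for the comparator above (the real call sites are elsewhere in this patch), using the qsort-style interface of vec:

  auto_vec<unsigned int> group_nums;
  group_nums.safe_push (3);
  group_nums.safe_push (1);
  group_nums.safe_push (2);
  group_nums.qsort (cmp_for_group_num);   /* group_nums is now 1, 2, 3.  */

Note that the comparator never returns 0, so the relative order of equal keys is unspecified; that is harmless for plain unsigned values.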
+
/* Return true if LOOP_VINFO requires a runtime check for whether the
vector loop is profitable. */

@@ -2152,7 +2237,7 @@ record_stmt_cost (stmt_vector_for_cost *body_cost_vec, int count,

extern void vect_finish_replace_stmt (vec_info *, stmt_vec_info, gimple *);
extern void vect_finish_stmt_generation (vec_info *, stmt_vec_info, gimple *,
- gimple_stmt_iterator *);
+ gimple_stmt_iterator *, bool transpose = false);
extern opt_result vect_mark_stmts_to_be_vectorized (loop_vec_info, bool *);
extern tree vect_get_store_rhs (stmt_vec_info);
void vect_get_vec_defs_for_operand (vec_info *vinfo, stmt_vec_info, unsigned,
@@ -2168,7 +2253,7 @@ void vect_get_vec_defs (vec_info *, stmt_vec_info, slp_tree, unsigned,
tree = NULL, vec<tree> * = NULL, tree = NULL,
tree = NULL, vec<tree> * = NULL, tree = NULL);
extern tree vect_init_vector (vec_info *, stmt_vec_info, tree, tree,
- gimple_stmt_iterator *);
+ gimple_stmt_iterator *, bool transpose = false);
extern tree vect_get_slp_vect_def (slp_tree, unsigned);
extern bool vect_transform_stmt (vec_info *, stmt_vec_info,
gimple_stmt_iterator *,
@@ -2235,6 +2320,9 @@ extern bool vect_load_lanes_supported (tree, unsigned HOST_WIDE_INT, bool);
extern void vect_permute_store_chain (vec_info *, vec<tree> &,
unsigned int, stmt_vec_info,
gimple_stmt_iterator *, vec<tree> *);
+extern void vect_transpose_store_chain (vec_info *, vec<tree>, unsigned int,
+ unsigned int, stmt_vec_info,
+ gimple_stmt_iterator *, vec<tree> *);
extern tree vect_setup_realignment (vec_info *,
stmt_vec_info, gimple_stmt_iterator *,
tree *, enum dr_alignment_support, tree,
@@ -2262,7 +2350,8 @@ extern bool check_reduction_path (dump_user_location_t, loop_p, gphi *, tree,
enum tree_code);
extern bool needs_fold_left_reduction_p (tree, code_helper);
/* Drive for loop analysis stage. */
-extern opt_loop_vec_info vect_analyze_loop (class loop *, vec_info_shared *);
+extern opt_loop_vec_info vect_analyze_loop (class loop *, vec_info_shared *,
+ bool result_only_p = false);
extern tree vect_build_loop_niters (loop_vec_info, bool * = NULL);
extern void vect_gen_vector_loop_niters (loop_vec_info, tree, tree *,
tree *, bool);
@@ -2331,6 +2420,7 @@ extern bool vect_transform_slp_perm_load (vec_info *, slp_tree, const vec<tree>
gimple_stmt_iterator *, poly_uint64,
bool, unsigned *,
unsigned * = nullptr, bool = false);
+extern void vect_transform_back_slp_grouped_stores (bb_vec_info, stmt_vec_info);
extern bool vect_slp_analyze_operations (vec_info *);
extern void vect_schedule_slp (vec_info *, const vec<slp_instance> &);
extern opt_result vect_analyze_slp (vec_info *, unsigned);
--
2.33.0