Blame - lib/CodeGen/InterleavedLoadCombinePass.cpp - platform_external_llvm80

blob: 989fa164ad2dd2ef3887e1353f8647019b909aac [file] [log] [blame]

Martin Elshuber	5e067bb	2018-11-19 14:26:10 +0000	[diff] [blame]	1	//===- InterleavedLoadCombine.cpp - Combine Interleaved Loads ---- C++ --===//
				2	//
				3	// The LLVM Compiler Infrastructure
				4	//
				5	// This file is distributed under the University of Illinois Open Source
				6	// License. See LICENSE.TXT for details.
				7	//
				8	//===----------------------------------------------------------------------===//
				9	//
				10	// \file
				11	//
				12	// This file defines the interleaved-load-combine pass. The pass searches for
				13	// ShuffleVectorInst instructions that perform interleaving loads. If a
				14	// matching pattern is found, it adds a combined load and further instructions
				15	// in a pattern that is detectable by InterleavedAccessPass. The old
				16	// instructions are left dead to be removed later. The pass is specifically
				17	// designed to be executed just before InterleavedAccessPass to find any
				18	// left-over instances that were not detected by earlier passes.
				19	//
				20	//===----------------------------------------------------------------------===//
				21
				22	#include "llvm/ADT/Statistic.h"
				23	#include "llvm/Analysis/MemoryLocation.h"
				24	#include "llvm/Analysis/MemorySSA.h"
				25	#include "llvm/Analysis/MemorySSAUpdater.h"
				26	#include "llvm/Analysis/OptimizationRemarkEmitter.h"
				27	#include "llvm/Analysis/TargetTransformInfo.h"
				28	#include "llvm/CodeGen/Passes.h"
				29	#include "llvm/CodeGen/TargetLowering.h"
				30	#include "llvm/CodeGen/TargetPassConfig.h"
				31	#include "llvm/CodeGen/TargetSubtargetInfo.h"
				32	#include "llvm/IR/DataLayout.h"
				33	#include "llvm/IR/Dominators.h"
				34	#include "llvm/IR/Function.h"
				35	#include "llvm/IR/Instructions.h"
				36	#include "llvm/IR/LegacyPassManager.h"
				37	#include "llvm/IR/Module.h"
				38	#include "llvm/Pass.h"
				39	#include "llvm/Support/Debug.h"
				40	#include "llvm/Support/ErrorHandling.h"
				41	#include "llvm/Support/raw_ostream.h"
				42	#include "llvm/Target/TargetMachine.h"
Martin Elshuber	5e067bb	2018-11-19 14:26:10 +0000	[diff] [blame]	43
				44	#include <algorithm>
				45	#include <cassert>
				46	#include <list>
				47
				48	using namespace llvm;
				49
				50	#define DEBUG_TYPE "interleaved-load-combine"
				51
				52	namespace {
				53
				54	/// Statistic counter
				55	STATISTIC(NumInterleavedLoadCombine, "Number of combined loads");
				56
				57	/// Option to disable the pass
				58	static cl::opt<bool> DisableInterleavedLoadCombine(
				59	"disable-" DEBUG_TYPE, cl::init(false), cl::Hidden,
				60	cl::desc("Disable combining of interleaved loads"));
				61
				62	struct VectorInfo;
				63
				64	struct InterleavedLoadCombineImpl {
				65	public:
				66	InterleavedLoadCombineImpl(Function &F, DominatorTree &DT, MemorySSA &MSSA,
				67	TargetMachine &TM)
				68	: F(F), DT(DT), MSSA(MSSA),
				69	TLI(*TM.getSubtargetImpl(F)->getTargetLowering()),
				70	TTI(TM.getTargetTransformInfo(F)) {}
				71
				72	/// Scan the function for interleaved load candidates and execute the
				73	/// replacement if applicable.
				74	bool run();
				75
				76	private:
				77	/// Function this pass is working on
				78	Function &F;
				79
				80	/// Dominator Tree Analysis
				81	DominatorTree &DT;
				82
				83	/// Memory Alias Analyses
				84	MemorySSA &MSSA;
				85
				86	/// Target Lowering Information
				87	const TargetLowering &TLI;
				88
				89	/// Target Transform Information
				90	const TargetTransformInfo TTI;
				91
				92	/// Find the instruction in sets LIs that dominates all others, return nullptr
				93	/// if there is none.
				94	LoadInst findFirstLoad(const std::set<LoadInst > &LIs);
				95
				96	/// Replace interleaved load candidates. It does additional
				97	/// analyses if this makes sense. Returns true on success and false
				98	/// of nothing has been changed.
				99	bool combine(std::list<VectorInfo> &InterleavedLoad,
				100	OptimizationRemarkEmitter &ORE);
				101
				102	/// Given a set of VectorInfo containing candidates for a given interleave
				103	/// factor, find a set that represents a 'factor' interleaved load.
				104	bool findPattern(std::list<VectorInfo> &Candidates,
				105	std::list<VectorInfo> &InterleavedLoad, unsigned Factor,
				106	const DataLayout &DL);
				107	}; // InterleavedLoadCombine
				108
				109	/// First Order Polynomial on an n-Bit Integer Value
				110	///
				111	/// Polynomial(Value) = Value * B + A + E*2^(n-e)
				112	///
				113	/// A and B are the coefficients. E*2^(n-e) is an error within 'e' most
				114	/// significant bits. It is introduced if an exact computation cannot be proven
				115	/// (e.g. division by 2).
				116	///
				117	/// As part of this optimization multiple loads will be combined. It is necessary
				118	/// to prove that loads are within some relative offset to each other. This
				119	/// class is used to prove relative offsets of values loaded from memory.
				120	///
				121	/// Representing an integer in this form is sound since addition in two's
				122	/// complement is associative (trivial) and multiplication distributes over the
				123	/// addition (see Proof(1) in Polynomial::mul). Further, both operations
				124	/// commute.
				125	//
				126	// Example:
				127	// declare @fn(i64 %IDX, <4 x float>* %PTR) {
				128	// %Pa1 = add i64 %IDX, 2
				129	// %Pa2 = lshr i64 %Pa1, 1
				130	// %Pa3 = getelementptr inbounds <4 x float>, <4 x float>* %PTR, i64 %Pa2
				131	// %Va = load <4 x float>, <4 x float>* %Pa3
				132	//
				133	// %Pb1 = add i64 %IDX, 4
				134	// %Pb2 = lshr i64 %Pb1, 1
				135	// %Pb3 = getelementptr inbounds <4 x float>, <4 x float>* %PTR, i64 %Pb2
				136	// %Vb = load <4 x float>, <4 x float>* %Pb3
				137	// ... }
				138	//
				139	// The goal is to prove that two loads load consecutive addresses.
				140	//
				141	// In this case the polynomials are constructed by the following
				142	// steps.
				143	//
				144	// The number tag #e specifies the error bits.
				145	//
				146	// Pa_0 = %IDX #0
				147	// Pa_1 = %IDX + 2 #0 | add 2
				148	// Pa_2 = %IDX/2 + 1 #1 | lshr 1
				149	// Pa_3 = %IDX/2 + 1 #1 | GEP, step signext to i64
				150	// Pa_4 = (%IDX/2)*16 + 16 #0 | GEP, multiply index by sizeof(4) for floats
				151	// Pa_5 = (%IDX/2)*16 + 16 #0 | GEP, add offset of leading components
				152	//
				153	// Pb_0 = %IDX #0
				154	// Pb_1 = %IDX + 4 #0 | add 4
				155	// Pb_2 = %IDX/2 + 2 #1 | lshr 1
				156	// Pb_3 = %IDX/2 + 2 #1 | GEP, step signext to i64
				157	// Pb_4 = (%IDX/2)*16 + 32 #0 | GEP, multiply index by sizeof(4) for floats
				158	// Pb_5 = (%IDX/2)*16 + 32 #0 | GEP, add offset of leading components
				159	//
				160	// Pb_5 - Pa_5 = 16 #0 | subtract to get the offset
				161	//
				162	// Remark: %PTR is not maintained within this class. So in this instance the
				163	// offset of 16 can only be assumed if the pointers are equal.
				164	//
				165	class Polynomial {
				166	/// Operations on B
				167	enum BOps {
				168	LShr,
				169	Mul,
				170	SExt,
				171	Trunc,
				172	};
				173
				174	/// Number of Error Bits e
				175	unsigned ErrorMSBs;
				176
				177	/// Value
				178	Value *V;
				179
				180	/// Coefficient B
				181	SmallVector<std::pair<BOps, APInt>, 4> B;
				182
				183	/// Coefficient A
				184	APInt A;
				185
				186	public:
				187	Polynomial(Value *V) : ErrorMSBs((unsigned)-1), V(V), B(), A() {
				188	IntegerType *Ty = dyn_cast<IntegerType>(V->getType());
				189	if (Ty) {
				190	ErrorMSBs = 0;
				191	this->V = V;
				192	A = APInt(Ty->getBitWidth(), 0);
				193	}
				194	}
				195
				196	Polynomial(const APInt &A, unsigned ErrorMSBs = 0)
				197	: ErrorMSBs(ErrorMSBs), V(NULL), B(), A(A) {}
				198
				199	Polynomial(unsigned BitWidth, uint64_t A, unsigned ErrorMSBs = 0)
				200	: ErrorMSBs(ErrorMSBs), V(NULL), B(), A(BitWidth, A) {}
				201
				202	Polynomial() : ErrorMSBs((unsigned)-1), V(NULL), B(), A() {}
				203
				204	/// Increment and clamp the number of undefined bits.
				205	void incErrorMSBs(unsigned amt) {
				206	if (ErrorMSBs == (unsigned)-1)
				207	return;
				208
				209	ErrorMSBs += amt;
				210	if (ErrorMSBs > A.getBitWidth())
				211	ErrorMSBs = A.getBitWidth();
				212	}
				213
				214	/// Decrement and clamp the number of undefined bits.
				215	void decErrorMSBs(unsigned amt) {
				216	if (ErrorMSBs == (unsigned)-1)
				217	return;
				218
				219	if (ErrorMSBs > amt)
				220	ErrorMSBs -= amt;
				221	else
				222	ErrorMSBs = 0;
				223	}
				224
				225	/// Apply an add on the polynomial
				226	Polynomial &add(const APInt &C) {
				227	// Note: Addition is associative in two's complement even when in case of
				228	// signed overflow.
				229	//
				230	// Error bits can only propagate into higher significant bits. As these are
				231	// already regarded as undefined, there is no change.
				232	//
				233	// Theorem: Adding a constant to a polynomial does not change the error
				234	// term.
				235	//
				236	// Proof:
				237	//
				238	// Since the addition is associative and commutes:
				239	//
				240	// (B + A + E2^(n-e)) + C = B + (A + C) + E2^(n-e)
				241	// [qed]
				242
				243	if (C.getBitWidth() != A.getBitWidth()) {
				244	ErrorMSBs = (unsigned)-1;
				245	return *this;
				246	}
				247
				248	A += C;
				249	return *this;
				250	}
				251
				252	/// Apply a multiplication onto the polynomial.
				253	Polynomial &mul(const APInt &C) {
				254	// Note: Multiplication distributes over the addition
				255	//
				256	// Theorem: Multiplication distributes over the addition
				257	//
				258	// Proof(1):
				259	//
				260	// (B+A)*C =-
				261	// = (B + A) + (B + A) + .. {C Times}
				262	// addition is associative and commutes, hence
				263	// = B + B + .. {C Times} .. + A + A + .. {C times}
				264	// = BC + AC
				265	// (see (function add) for signed values and overflows)
				266	// [qed]
				267	//
				268	// Theorem: If C has c trailing zeros, errors bits in A or B are shifted out
				269	// to the left.
				270	//
				271	// Proof(2):
				272	//
				273	// Let B' and A' be the n-Bit inputs with some unknown errors EA,
				274	// EB at e leading bits. B' and A' can be written down as:
				275	//
				276	// B' = B + 2^(n-e)*EB
				277	// A' = A + 2^(n-e)*EA
				278	//
				279	// Let C' be an input with c trailing zero bits. C' can be written as
				280	//
				281	// C' = C*2^c
				282	//
				283	// Therefore we can compute the result by using distributivity and
				284	// commutativity.
				285	//
				286	// (B'C' + A'C') = [B + 2^(n-e)EB] C' + [A + 2^(n-e)EA] C' =
				287	// = [B + 2^(n-e)EB + A + 2^(n-e)EA] * C' =
				288	// = (B'+A') * C' =
				289	// = [B + 2^(n-e)EB + A + 2^(n-e)EA] * C' =
				290	// = [B + A + 2^(n-e)EB + 2^(n-e)EA] * C' =
				291	// = (B + A) * C' + [2^(n-e)EB + 2^(n-e)EA)] * C' =
				292	// = (B + A) * C' + [2^(n-e)EB + 2^(n-e)EA)] * C*2^c =
				293	// = (B + A) * C' + C(EB + EA)2^(n-e)*2^c =
				294	//
				295	// Let EC be the final error with EC = C*(EB + EA)
				296	//
				297	// = (B + A)C' + EC2^(n-e)*2^c =
				298	// = (B + A)C' + EC2^(n-(e-c))
				299	//
				300	// Since EC is multiplied by 2^(n-(e-c)) the resulting error contains c
				301	// less error bits than the input. c bits are shifted out to the left.
				302	// [qed]
				303
				304	if (C.getBitWidth() != A.getBitWidth()) {
				305	ErrorMSBs = (unsigned)-1;
				306	return *this;
				307	}
				308
				309	// Multiplying by one is a no-op.
				310	if (C.isOneValue()) {
				311	return *this;
				312	}
				313
				314	// Multiplying by zero removes the coefficient B and defines all bits.
				315	if (C.isNullValue()) {
				316	ErrorMSBs = 0;
				317	deleteB();
				318	}
				319
				320	// See Proof(2): Trailing zero bits indicate a left shift. This removes
				321	// leading bits from the result even if they are undefined.
				322	decErrorMSBs(C.countTrailingZeros());
				323
				324	A *= C;
				325	pushBOperation(Mul, C);
				326	return *this;
				327	}
				328
				329	/// Apply a logical shift right on the polynomial
				330	Polynomial &lshr(const APInt &C) {
				331	// Theorem(1): (B + A + E2^(n-e)) >> 1 => (B >> 1) + (A >> 1) + E'2^(n-e')
				332	// where
				333	// e' = e + 1,
				334	// E is a e-bit number,
				335	// E' is a e'-bit number,
				336	// holds under the following precondition:
				337	// pre(1): A % 2 = 0
				338	// pre(2): e < n, (see Theorem(2) for the trivial case with e=n)
				339	// where >> expresses a logical shift to the right, with adding zeros.
				340	//
				341	// We need to show that for every, E there is a E'
				342	//
				343	// B = b_h * 2^(n-1) + b_m * 2 + b_l
				344	// A = a_h * 2^(n-1) + a_m * 2 (pre(1))
				345	//
				346	// where a_h, b_h, b_l are single bits, and a_m, b_m are (n-2) bit numbers
				347	//
				348	// Let X = (B + A + E*2^(n-e)) >> 1
				349	// Let Y = (B >> 1) + (A >> 1) + E*2^(n-e) >> 1
				350	//
				351	// X = [B + A + E*2^(n-e)] >> 1 =
				352	// = [ b_h * 2^(n-1) + b_m * 2 + b_l +
				353	// + a_h * 2^(n-1) + a_m * 2 +
				354	// + E * 2^(n-e) ] >> 1 =
				355	//
				356	// The sum is built by putting the overflow of [a_m + b+n] into the term
				357	// 2^(n-1). As there are no more bits beyond 2^(n-1) the overflow within
				358	// this bit is discarded. This is expressed by % 2.
				359	//
				360	// The bit in position 0 cannot overflow into the term (b_m + a_m).
				361	//
				362	// = [ ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-1) +
				363	// + ((b_m + a_m) % 2^(n-2)) * 2 +
				364	// + b_l + E * 2^(n-e) ] >> 1 =
				365	//
				366	// The shift is computed by dividing the terms by 2 and by cutting off
				367	// b_l.
				368	//
				369	// = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
				370	// + ((b_m + a_m) % 2^(n-2)) +
				371	// + E * 2^(n-(e+1)) =
				372	//
				373	// by the definition in the Theorem e+1 = e'
				374	//
				375	// = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
				376	// + ((b_m + a_m) % 2^(n-2)) +
				377	// + E * 2^(n-e') =
				378	//
				379	// Compute Y by applying distributivity first
				380	//
				381	// Y = (B >> 1) + (A >> 1) + E*2^(n-e') =
				382	// = (b_h * 2^(n-1) + b_m * 2 + b_l) >> 1 +
				383	// + (a_h * 2^(n-1) + a_m * 2) >> 1 +
				384	// + E * 2^(n-e) >> 1 =
				385	//
				386	// Again, the shift is computed by dividing the terms by 2 and by cutting
				387	// off b_l.
				388	//
				389	// = b_h * 2^(n-2) + b_m +
				390	// + a_h * 2^(n-2) + a_m +
				391	// + E * 2^(n-(e+1)) =
				392	//
				393	// Again, the sum is built by putting the overflow of [a_m + b+n] into
				394	// the term 2^(n-1). But this time there is room for a second bit in the
				395	// term 2^(n-2) we add this bit to a new term and denote it o_h in a
				396	// second step.
				397	//
				398	// = ([b_h + a_h + (b_m + a_m) >> (n-2)] >> 1) * 2^(n-1) +
				399	// + ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
				400	// + ((b_m + a_m) % 2^(n-2)) +
				401	// + E * 2^(n-(e+1)) =
				402	//
				403	// Let o_h = [b_h + a_h + (b_m + a_m) >> (n-2)] >> 1
				404	// Further replace e+1 by e'.
				405	//
				406	// = o_h * 2^(n-1) +
				407	// + ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
				408	// + ((b_m + a_m) % 2^(n-2)) +
				409	// + E * 2^(n-e') =
				410	//
				411	// Move o_h into the error term and construct E'. To ensure that there is
				412	// no 2^x with negative x, this step requires pre(2) (e < n).
				413	//
				414	// = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
				415	// + ((b_m + a_m) % 2^(n-2)) +
				416	// + o_h * 2^(e'-1) * 2^(n-e') + \| pre(2), move 2^(e'-1)
				417	// \| out of the old exponent
				418	// + E * 2^(n-e') =
				419	// = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
				420	// + ((b_m + a_m) % 2^(n-2)) +
				421	// + [o_h * 2^(e'-1) + E] * 2^(n-e') + \| move 2^(e'-1) out of
				422	// \| the old exponent
				423	//
				424	// Let E' = o_h * 2^(e'-1) + E
				425	//
				426	// = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
				427	// + ((b_m + a_m) % 2^(n-2)) +
				428	// + E' * 2^(n-e')
				429	//
				430	// Because X and Y are distinct only in there error terms and E' can be
				431	// constructed as shown the theorem holds.
				432	// [qed]
				433	//
				434	// For completeness in case of the case e=n it is also required to show that
				435	// distributivity can be applied.
				436	//
				437	// In this case Theorem(1) transforms to (the pre-condition on A can also be
				438	// dropped)
				439	//
				440	// Theorem(2): (B + A + E) >> 1 => (B >> 1) + (A >> 1) + E'
				441	// where
				442	// A, B, E, E' are two's complement numbers with the same bit
				443	// width
				444	//
				445	// Let A + B + E = X
				446	// Let (B >> 1) + (A >> 1) = Y
				447	//
				448	// Therefore we need to show that for every X and Y there is an E' which
				449	// makes the equation
				450	//
				451	// X = Y + E'
				452	//
				453	// hold. This is trivially the case for E' = X - Y.
				454	//
				455	// [qed]
				456	//
				457	// Remark: Distributing lshr with and arbitrary number n can be expressed as
				458	// ((((B + A) lshr 1) lshr 1) ... ) {n times}.
				459	// This construction induces n additional error bits at the left.
				460
				461	if (C.getBitWidth() != A.getBitWidth()) {
				462	ErrorMSBs = (unsigned)-1;
				463	return *this;
				464	}
				465
				466	if (C.isNullValue())
				467	return *this;
				468
				469	// Test if the result will be zero
				470	unsigned shiftAmt = C.getZExtValue();
				471	if (shiftAmt >= C.getBitWidth())
				472	return mul(APInt(C.getBitWidth(), 0));
				473
				474	// The proof that shiftAmt LSBs are zero for at least one summand is only
				475	// possible for the constant number.
				476	//
				477	// If this can be proven add shiftAmt to the error counter
				478	// `ErrorMSBs`. Otherwise set all bits as undefined.
				479	if (A.countTrailingZeros() < shiftAmt)
				480	ErrorMSBs = A.getBitWidth();
				481	else
				482	incErrorMSBs(shiftAmt);
				483
				484	// Apply the operation.
				485	pushBOperation(LShr, C);
				486	A = A.lshr(shiftAmt);
				487
				488	return *this;
				489	}
				490
				491	/// Apply a sign-extend or truncate operation on the polynomial.
				492	Polynomial &sextOrTrunc(unsigned n) {
				493	if (n < A.getBitWidth()) {
				494	// Truncate: Clearly undefined Bits on the MSB side are removed
				495	// if there are any.
				496	decErrorMSBs(A.getBitWidth() - n);
				497	A = A.trunc(n);
				498	pushBOperation(Trunc, APInt(sizeof(n) * 8, n));
				499	}
				500	if (n > A.getBitWidth()) {
				501	// Extend: Clearly extending first and adding later is different
				502	// to adding first and extending later in all extended bits.
				503	incErrorMSBs(n - A.getBitWidth());
				504	A = A.sext(n);
				505	pushBOperation(SExt, APInt(sizeof(n) * 8, n));
				506	}
				507
				508	return *this;
				509	}
				510
				511	/// Test if there is a coefficient B.
				512	bool isFirstOrder() const { return V != nullptr; }
				513
				514	/// Test coefficient B of two Polynomials are equal.
				515	bool isCompatibleTo(const Polynomial &o) const {
				516	// The polynomial use different bit width.
				517	if (A.getBitWidth() != o.A.getBitWidth())
				518	return false;
				519
				520	// If neither Polynomial has the Coefficient B.
				521	if (!isFirstOrder() && !o.isFirstOrder())
				522	return true;
				523
				524	// The index variable is different.
				525	if (V != o.V)
				526	return false;
				527
				528	// Check the operations.
				529	if (B.size() != o.B.size())
				530	return false;
				531
				532	auto ob = o.B.begin();
				533	for (auto &b : B) {
				534	if (b != *ob)
				535	return false;
				536	ob++;
				537	}
				538
				539	return true;
				540	}
				541
				542	/// Subtract two polynomials, return an undefined polynomial if
				543	/// subtraction is not possible.
				544	Polynomial operator-(const Polynomial &o) const {
				545	// Return an undefined polynomial if incompatible.
				546	if (!isCompatibleTo(o))
				547	return Polynomial();
				548
				549	// If the polynomials are compatible (meaning they have the same
				550	// coefficient on B), B is eliminated. Thus a polynomial solely
				551	// containing A is returned
				552	return Polynomial(A - o.A, std::max(ErrorMSBs, o.ErrorMSBs));
				553	}
				554
				555	/// Subtract a constant from a polynomial,
				556	Polynomial operator-(uint64_t C) const {
				557	Polynomial Result(*this);
				558	Result.A -= C;
				559	return Result;
				560	}
				561
				562	/// Add a constant to a polynomial,
				563	Polynomial operator+(uint64_t C) const {
				564	Polynomial Result(*this);
				565	Result.A += C;
				566	return Result;
				567	}
				568
				569	/// Returns true if it can be proven that two Polynomials are equal.
				570	bool isProvenEqualTo(const Polynomial &o) {
				571	// Subtract both polynomials and test if it is fully defined and zero.
				572	Polynomial r = *this - o;
				573	return (r.ErrorMSBs == 0) && (!r.isFirstOrder()) && (r.A.isNullValue());
				574	}
				575
				576	/// Print the polynomial into a stream.
				577	void print(raw_ostream &OS) const {
				578	OS << "[{#ErrBits:" << ErrorMSBs << "} ";
				579
				580	if (V) {
				581	for (auto b : B)
				582	OS << "(";
				583	OS << "(" << *V << ") ";
				584
				585	for (auto b : B) {
				586	switch (b.first) {
				587	case LShr:
				588	OS << "LShr ";
				589	break;
				590	case Mul:
				591	OS << "Mul ";
				592	break;
				593	case SExt:
				594	OS << "SExt ";
				595	break;
				596	case Trunc:
				597	OS << "Trunc ";
				598	break;
				599	}
				600
				601	OS << b.second << ") ";
				602	}
				603	}
				604
				605	OS << "+ " << A << "]";
				606	}
				607
				608	private:
				609	void deleteB() {
				610	V = nullptr;
				611	B.clear();
				612	}
				613
				614	void pushBOperation(const BOps Op, const APInt &C) {
				615	if (isFirstOrder()) {
				616	B.push_back(std::make_pair(Op, C));
				617	return;
				618	}
				619	}
				620	};
				621
Simon Pilgrim	7e9b96e	2018-11-19 18:57:49 +0000	[diff] [blame]	622	#ifndef NDEBUG
				623	static raw_ostream &operator<<(raw_ostream &OS, const Polynomial &S) {
				624	S.print(OS);
				625	return OS;
				626	}
				627	#endif
				628
Martin Elshuber	5e067bb	2018-11-19 14:26:10 +0000	[diff] [blame]	629	/// VectorInfo stores the following abstract information for each vector
				630	/// element:
				631	///
				632	/// 1) the memory address loaded into the element as a Polynomial,
				633	/// 2) a set of load instruction necessary to construct the vector,
				634	/// 3) a set of all other instructions that are necessary to create the vector and
				635	/// 4) a pointer value that can be used as relative base for all elements.
				636	struct VectorInfo {
				637	private:
				638	VectorInfo(const VectorInfo &c) : VTy(c.VTy) {
				639	llvm_unreachable(
				640	"Copying VectorInfo is neither implemented nor necessary,");
				641	}
				642
				643	public:
				644	/// Information of a Vector Element
				645	struct ElementInfo {
				646	/// Offset Polynomial.
				647	Polynomial Ofs;
				648
				649	/// The Load Instruction used to Load the entry. LI is null if the pointer
				650	/// of the load instruction does not point on to the entry
				651	LoadInst *LI;
				652
				653	ElementInfo(Polynomial Offset = Polynomial(), LoadInst *LI = nullptr)
				654	: Ofs(Offset), LI(LI) {}
				655	};
				656
				657	/// Basic-block the load instructions are within
				658	BasicBlock *BB;
				659
				660	/// Pointer value of all participation load instructions
				661	Value *PV;
				662
				663	/// Participating load instructions
				664	std::set<LoadInst *> LIs;
				665
				666	/// Participating instructions
				667	std::set<Instruction *> Is;
				668
				669	/// Final shuffle-vector instruction
				670	ShuffleVectorInst *SVI;
				671
				672	/// Information of the offset for each vector element
				673	ElementInfo *EI;
				674
				675	/// Vector Type
				676	VectorType *const VTy;
				677
				678	VectorInfo(VectorType *VTy)
				679	: BB(nullptr), PV(nullptr), LIs(), Is(), SVI(nullptr), VTy(VTy) {
				680	EI = new ElementInfo[VTy->getNumElements()];
				681	}
				682
				683	virtual ~VectorInfo() { delete[] EI; }
				684
				685	unsigned getDimension() const { return VTy->getNumElements(); }
				686
				687	/// Test if the VectorInfo can be part of an interleaved load with the
				688	/// specified factor.
				689	///
				690	/// \param Factor of the interleave
				691	/// \param DL Targets Datalayout
				692	///
				693	/// \returns true if this is possible and false if not
				694	bool isInterleaved(unsigned Factor, const DataLayout &DL) const {
				695	unsigned Size = DL.getTypeAllocSize(VTy->getElementType());
				696	for (unsigned i = 1; i < getDimension(); i++) {
				697	if (!EI[i].Ofs.isProvenEqualTo(EI[0].Ofs + i * Factor * Size)) {
				698	return false;
				699	}
				700	}
				701	return true;
				702	}
				703
				704	/// Recursively computes the vector information stored in V.
				705	///
				706	/// This function delegates the work to specialized implementations
				707	///
				708	/// \param V Value to operate on
				709	/// \param Result Result of the computation
				710	///
				711	/// \returns false if no sensible information can be gathered.
				712	static bool compute(Value *V, VectorInfo &Result, const DataLayout &DL) {
				713	ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(V);
				714	if (SVI)
				715	return computeFromSVI(SVI, Result, DL);
				716	LoadInst *LI = dyn_cast<LoadInst>(V);
				717	if (LI)
				718	return computeFromLI(LI, Result, DL);
				719	BitCastInst *BCI = dyn_cast<BitCastInst>(V);
				720	if (BCI)
				721	return computeFromBCI(BCI, Result, DL);
				722	return false;
				723	}
				724
				725	/// BitCastInst specialization to compute the vector information.
				726	///
				727	/// \param BCI BitCastInst to operate on
				728	/// \param Result Result of the computation
				729	///
				730	/// \returns false if no sensible information can be gathered.
				731	static bool computeFromBCI(BitCastInst *BCI, VectorInfo &Result,
				732	const DataLayout &DL) {
				733	Instruction *Op = dyn_cast<Instruction>(BCI->getOperand(0));
				734
				735	if (!Op)
				736	return false;
				737
				738	VectorType *VTy = dyn_cast<VectorType>(Op->getType());
				739	if (!VTy)
				740	return false;
				741
				742	// We can only cast from large to smaller vectors
				743	if (Result.VTy->getNumElements() % VTy->getNumElements())
				744	return false;
				745
				746	unsigned Factor = Result.VTy->getNumElements() / VTy->getNumElements();
				747	unsigned NewSize = DL.getTypeAllocSize(Result.VTy->getElementType());
				748	unsigned OldSize = DL.getTypeAllocSize(VTy->getElementType());
				749
				750	if (NewSize * Factor != OldSize)
				751	return false;
				752
				753	VectorInfo Old(VTy);
				754	if (!compute(Op, Old, DL))
				755	return false;
				756
				757	for (unsigned i = 0; i < Result.VTy->getNumElements(); i += Factor) {
				758	for (unsigned j = 0; j < Factor; j++) {
				759	Result.EI[i + j] =
				760	ElementInfo(Old.EI[i / Factor].Ofs + j * NewSize,
				761	j == 0 ? Old.EI[i / Factor].LI : nullptr);
				762	}
				763	}
				764
				765	Result.BB = Old.BB;
				766	Result.PV = Old.PV;
				767	Result.LIs.insert(Old.LIs.begin(), Old.LIs.end());
				768	Result.Is.insert(Old.Is.begin(), Old.Is.end());
				769	Result.Is.insert(BCI);
				770	Result.SVI = nullptr;
				771
				772	return true;
				773	}
				774
				775	/// ShuffleVectorInst specialization to compute vector information.
				776	///
				777	/// \param SVI ShuffleVectorInst to operate on
				778	/// \param Result Result of the computation
				779	///
				780	/// Compute the left and the right side vector information and merge them by
				781	/// applying the shuffle operation. This function also ensures that the left
				782	/// and right side have compatible loads. This means that all loads are with
				783	/// in the same basic block and are based on the same pointer.
				784	///
				785	/// \returns false if no sensible information can be gathered.
				786	static bool computeFromSVI(ShuffleVectorInst *SVI, VectorInfo &Result,
				787	const DataLayout &DL) {
				788	VectorType *ArgTy = dyn_cast<VectorType>(SVI->getOperand(0)->getType());
				789	assert(ArgTy && "ShuffleVector Operand is not a VectorType");
				790
				791	// Compute the left hand vector information.
				792	VectorInfo LHS(ArgTy);
				793	if (!compute(SVI->getOperand(0), LHS, DL))
				794	LHS.BB = nullptr;
				795
				796	// Compute the right hand vector information.
				797	VectorInfo RHS(ArgTy);
				798	if (!compute(SVI->getOperand(1), RHS, DL))
				799	RHS.BB = nullptr;
				800
				801	// Neither operand produced sensible results?
				802	if (!LHS.BB && !RHS.BB)
				803	return false;
				804	// Only RHS produced sensible results?
				805	else if (!LHS.BB) {
				806	Result.BB = RHS.BB;
				807	Result.PV = RHS.PV;
				808	}
				809	// Only LHS produced sensible results?
				810	else if (!RHS.BB) {
				811	Result.BB = LHS.BB;
				812	Result.PV = LHS.PV;
				813	}
				814	// Both operands produced sensible results?
Martin Elshuber	418c4bb	2018-11-19 18:35:31 +0000	[diff] [blame]	815	else if ((LHS.BB == RHS.BB) && (LHS.PV == RHS.PV)) {
Martin Elshuber	5e067bb	2018-11-19 14:26:10 +0000	[diff] [blame]	816	Result.BB = LHS.BB;
				817	Result.PV = LHS.PV;
				818	}
				819	// Both operands produced sensible results but they are incompatible.
				820	else {
				821	return false;
				822	}
				823
				824	// Merge and apply the operation on the offset information.
				825	if (LHS.BB) {
				826	Result.LIs.insert(LHS.LIs.begin(), LHS.LIs.end());
				827	Result.Is.insert(LHS.Is.begin(), LHS.Is.end());
				828	}
				829	if (RHS.BB) {
				830	Result.LIs.insert(RHS.LIs.begin(), RHS.LIs.end());
				831	Result.Is.insert(RHS.Is.begin(), RHS.Is.end());
				832	}
				833	Result.Is.insert(SVI);
				834	Result.SVI = SVI;
				835
				836	int j = 0;
				837	for (int i : SVI->getShuffleMask()) {
				838	assert((i < 2 * (signed)ArgTy->getNumElements()) &&
				839	"Invalid ShuffleVectorInst (index out of bounds)");
				840
				841	if (i < 0)
				842	Result.EI[j] = ElementInfo();
				843	else if (i < (signed)ArgTy->getNumElements()) {
				844	if (LHS.BB)
				845	Result.EI[j] = LHS.EI[i];
				846	else
				847	Result.EI[j] = ElementInfo();
				848	} else {
				849	if (RHS.BB)
				850	Result.EI[j] = RHS.EI[i - ArgTy->getNumElements()];
				851	else
				852	Result.EI[j] = ElementInfo();
				853	}
				854	j++;
				855	}
				856
				857	return true;
				858	}
				859
				860	/// LoadInst specialization to compute vector information.
				861	///
				862	/// This function also acts as abort condition to the recursion.
				863	///
				864	/// \param LI LoadInst to operate on
				865	/// \param Result Result of the computation
				866	///
				867	/// \returns false if no sensible information can be gathered.
				868	static bool computeFromLI(LoadInst *LI, VectorInfo &Result,
				869	const DataLayout &DL) {
				870	Value *BasePtr;
				871	Polynomial Offset;
				872
				873	if (LI->isVolatile())
				874	return false;
				875
				876	if (LI->isAtomic())
				877	return false;
				878
				879	// Get the base polynomial
				880	computePolynomialFromPointer(*LI->getPointerOperand(), Offset, BasePtr, DL);
				881
				882	Result.BB = LI->getParent();
				883	Result.PV = BasePtr;
				884	Result.LIs.insert(LI);
				885	Result.Is.insert(LI);
				886
				887	for (unsigned i = 0; i < Result.getDimension(); i++) {
				888	Value *Idx[2] = {
				889	ConstantInt::get(Type::getInt32Ty(LI->getContext()), 0),
				890	ConstantInt::get(Type::getInt32Ty(LI->getContext()), i),
				891	};
				892	int64_t Ofs = DL.getIndexedOffsetInType(Result.VTy, makeArrayRef(Idx, 2));
				893	Result.EI[i] = ElementInfo(Offset + Ofs, i == 0 ? LI : nullptr);
				894	}
				895
				896	return true;
				897	}
				898
				899	/// Recursively compute polynomial of a value.
				900	///
				901	/// \param BO Input binary operation
				902	/// \param Result Result polynomial
				903	static void computePolynomialBinOp(BinaryOperator &BO, Polynomial &Result) {
				904	Value *LHS = BO.getOperand(0);
				905	Value *RHS = BO.getOperand(1);
				906
				907	// Find the RHS Constant if any
				908	ConstantInt *C = dyn_cast<ConstantInt>(RHS);
				909	if ((!C) && BO.isCommutative()) {
				910	C = dyn_cast<ConstantInt>(LHS);
				911	if (C)
				912	std::swap(LHS, RHS);
				913	}
				914
				915	switch (BO.getOpcode()) {
				916	case Instruction::Add:
				917	if (!C)
				918	break;
				919
				920	computePolynomial(*LHS, Result);
				921	Result.add(C->getValue());
				922	return;
				923
				924	case Instruction::LShr:
				925	if (!C)
				926	break;
				927
				928	computePolynomial(*LHS, Result);
				929	Result.lshr(C->getValue());
				930	return;
				931
				932	default:
				933	break;
				934	}
				935
				936	Result = Polynomial(&BO);
				937	}
				938
				939	/// Recursively compute polynomial of a value
				940	///
				941	/// \param V input value
				942	/// \param Result result polynomial
				943	static void computePolynomial(Value &V, Polynomial &Result) {
				944	if (isa<BinaryOperator>(&V))
				945	computePolynomialBinOp(*dyn_cast<BinaryOperator>(&V), Result);
				946	else
				947	Result = Polynomial(&V);
				948	}
				949
				950	/// Compute the Polynomial representation of a Pointer type.
				951	///
				952	/// \param Ptr input pointer value
				953	/// \param Result result polynomial
				954	/// \param BasePtr pointer the polynomial is based on
				955	/// \param DL Datalayout of the target machine
				956	static void computePolynomialFromPointer(Value &Ptr, Polynomial &Result,
				957	Value *&BasePtr,
				958	const DataLayout &DL) {
				959	// Not a pointer type? Return an undefined polynomial
				960	PointerType *PtrTy = dyn_cast<PointerType>(Ptr.getType());
				961	if (!PtrTy) {
				962	Result = Polynomial();
				963	BasePtr = nullptr;
				964	}
				965	unsigned PointerBits =
				966	DL.getIndexSizeInBits(PtrTy->getPointerAddressSpace());
				967
				968	/// Skip pointer casts. Return Zero polynomial otherwise
				969	if (isa<CastInst>(&Ptr)) {
				970	CastInst &CI = *cast<CastInst>(&Ptr);
				971	switch (CI.getOpcode()) {
				972	case Instruction::BitCast:
				973	computePolynomialFromPointer(*CI.getOperand(0), Result, BasePtr, DL);
				974	break;
				975	default:
				976	BasePtr = &Ptr;
				977	Polynomial(PointerBits, 0);
				978	break;
				979	}
				980	}
				981	/// Resolve GetElementPtrInst.
				982	else if (isa<GetElementPtrInst>(&Ptr)) {
				983	GetElementPtrInst &GEP = *cast<GetElementPtrInst>(&Ptr);
				984
				985	APInt BaseOffset(PointerBits, 0);
				986
				987	// Check if we can compute the Offset with accumulateConstantOffset
				988	if (GEP.accumulateConstantOffset(DL, BaseOffset)) {
				989	Result = Polynomial(BaseOffset);
				990	BasePtr = GEP.getPointerOperand();
				991	return;
				992	} else {
				993	// Otherwise we allow that the last index operand of the GEP is
				994	// non-constant.
				995	unsigned idxOperand, e;
				996	SmallVector<Value *, 4> Indices;
				997	for (idxOperand = 1, e = GEP.getNumOperands(); idxOperand < e;
				998	idxOperand++) {
				999	ConstantInt *IDX = dyn_cast<ConstantInt>(GEP.getOperand(idxOperand));
				1000	if (!IDX)
				1001	break;
				1002	Indices.push_back(IDX);
				1003	}
				1004
				1005	// It must also be the last operand.
				1006	if (idxOperand + 1 != e) {
				1007	Result = Polynomial();
				1008	BasePtr = nullptr;
				1009	return;
				1010	}
				1011
				1012	// Compute the polynomial of the index operand.
				1013	computePolynomial(*GEP.getOperand(idxOperand), Result);
				1014
				1015	// Compute base offset from zero based index, excluding the last
				1016	// variable operand.
				1017	BaseOffset =
				1018	DL.getIndexedOffsetInType(GEP.getSourceElementType(), Indices);
				1019
				1020	// Apply the operations of GEP to the polynomial.
				1021	unsigned ResultSize = DL.getTypeAllocSize(GEP.getResultElementType());
				1022	Result.sextOrTrunc(PointerBits);
				1023	Result.mul(APInt(PointerBits, ResultSize));
				1024	Result.add(BaseOffset);
				1025	BasePtr = GEP.getPointerOperand();
				1026	}
				1027	}
				1028	// All other instructions are handled by using the value as base pointer and
				1029	// a zero polynomial.
				1030	else {
				1031	BasePtr = &Ptr;
				1032	Polynomial(DL.getIndexSizeInBits(PtrTy->getPointerAddressSpace()), 0);
				1033	}
				1034	}
				1035
				1036	#ifndef NDEBUG
				1037	void print(raw_ostream &OS) const {
				1038	if (PV)
				1039	OS << *PV;
				1040	else
				1041	OS << "(none)";
				1042	OS << " + ";
				1043	for (unsigned i = 0; i < getDimension(); i++)
				1044	OS << ((i == 0) ? "[" : ", ") << EI[i].Ofs;
				1045	OS << "]";
				1046	}
				1047	#endif
				1048	};
				1049
Martin Elshuber	5e067bb	2018-11-19 14:26:10 +0000	[diff] [blame]	1050	} // anonymous namespace
				1051
				1052	bool InterleavedLoadCombineImpl::findPattern(
				1053	std::list<VectorInfo> &Candidates, std::list<VectorInfo> &InterleavedLoad,
				1054	unsigned Factor, const DataLayout &DL) {
				1055	for (auto C0 = Candidates.begin(), E0 = Candidates.end(); C0 != E0; ++C0) {
				1056	unsigned i;
				1057	// Try to find an interleaved load using the front of Worklist as first line
				1058	unsigned Size = DL.getTypeAllocSize(C0->VTy->getElementType());
				1059
				1060	// List containing iterators pointing to the VectorInfos of the candidates
				1061	std::vector<std::list<VectorInfo>::iterator> Res(Factor, Candidates.end());
				1062
				1063	for (auto C = Candidates.begin(), E = Candidates.end(); C != E; C++) {
				1064	if (C->VTy != C0->VTy)
				1065	continue;
				1066	if (C->BB != C0->BB)
				1067	continue;
				1068	if (C->PV != C0->PV)
				1069	continue;
				1070
				1071	// Check the current value matches any of factor - 1 remaining lines
				1072	for (i = 1; i < Factor; i++) {
				1073	if (C->EI[0].Ofs.isProvenEqualTo(C0->EI[0].Ofs + i * Size)) {
				1074	Res[i] = C;
				1075	}
				1076	}
				1077
				1078	for (i = 1; i < Factor; i++) {
				1079	if (Res[i] == Candidates.end())
				1080	break;
				1081	}
				1082	if (i == Factor) {
				1083	Res[0] = C0;
				1084	break;
				1085	}
				1086	}
				1087
				1088	if (Res[0] != Candidates.end()) {
				1089	// Move the result into the output
				1090	for (unsigned i = 0; i < Factor; i++) {
				1091	InterleavedLoad.splice(InterleavedLoad.end(), Candidates, Res[i]);
				1092	}
				1093
				1094	return true;
				1095	}
				1096	}
				1097	return false;
				1098	}
				1099
				1100	LoadInst *
				1101	InterleavedLoadCombineImpl::findFirstLoad(const std::set<LoadInst *> &LIs) {
				1102	assert(!LIs.empty() && "No load instructions given.");
				1103
				1104	// All LIs are within the same BB. Select the first for a reference.
				1105	BasicBlock BB = (LIs.begin())->getParent();
				1106	BasicBlock::iterator FLI =
				1107	std::find_if(BB->begin(), BB->end(), [&LIs](Instruction &I) -> bool {
				1108	return is_contained(LIs, &I);
				1109	});
				1110	assert(FLI != BB->end());
				1111
				1112	return cast<LoadInst>(FLI);
				1113	}
				1114
				1115	bool InterleavedLoadCombineImpl::combine(std::list<VectorInfo> &InterleavedLoad,
				1116	OptimizationRemarkEmitter &ORE) {
				1117	LLVM_DEBUG(dbgs() << "Checking interleaved load\n");
Martin Elshuber	5e067bb	2018-11-19 14:26:10 +0000	[diff] [blame]	1118
				1119	// The insertion point is the LoadInst which loads the first values. The
				1120	// following tests are used to proof that the combined load can be inserted
				1121	// just before InsertionPoint.
				1122	LoadInst *InsertionPoint = InterleavedLoad.front().EI[0].LI;
				1123
				1124	// Test if the offset is computed
				1125	if (!InsertionPoint)
				1126	return false;
				1127
				1128	std::set<LoadInst *> LIs;
				1129	std::set<Instruction *> Is;
				1130	std::set<Instruction *> SVIs;
				1131
				1132	unsigned InterleavedCost;
				1133	unsigned InstructionCost = 0;
				1134
				1135	// Get the interleave factor
				1136	unsigned Factor = InterleavedLoad.size();
				1137
				1138	// Merge all input sets used in analysis
				1139	for (auto &VI : InterleavedLoad) {
				1140	// Generate a set of all load instructions to be combined
				1141	LIs.insert(VI.LIs.begin(), VI.LIs.end());
				1142
				1143	// Generate a set of all instructions taking part in load
				1144	// interleaved. This list excludes the instructions necessary for the
				1145	// polynomial construction.
				1146	Is.insert(VI.Is.begin(), VI.Is.end());
				1147
				1148	// Generate the set of the final ShuffleVectorInst.
				1149	SVIs.insert(VI.SVI);
				1150	}
				1151
				1152	// There is nothing to combine.
				1153	if (LIs.size() < 2)
				1154	return false;
				1155
				1156	// Test if all participating instruction will be dead after the
				1157	// transformation. If intermediate results are used, no performance gain can
				1158	// be expected. Also sum the cost of the Instructions beeing left dead.
				1159	for (auto &I : Is) {
				1160	// Compute the old cost
				1161	InstructionCost +=
				1162	TTI.getInstructionCost(I, TargetTransformInfo::TCK_Latency);
				1163
				1164	// The final SVIs are allowed not to be dead, all uses will be replaced
				1165	if (SVIs.find(I) != SVIs.end())
				1166	continue;
				1167
				1168	// If there are users outside the set to be eliminated, we abort the
				1169	// transformation. No gain can be expected.
				1170	for (const auto &U : I->users()) {
				1171	if (Is.find(dyn_cast<Instruction>(U)) == Is.end())
				1172	return false;
				1173	}
				1174	}
				1175
				1176	// We know that all LoadInst are within the same BB. This guarantees that
				1177	// either everything or nothing is loaded.
				1178	LoadInst *First = findFirstLoad(LIs);
				1179
				1180	// To be safe that the loads can be combined, iterate over all loads and test
				1181	// that the corresponding defining access dominates first LI. This guarantees
				1182	// that there are no aliasing stores in between the loads.
				1183	auto FMA = MSSA.getMemoryAccess(First);
				1184	for (auto LI : LIs) {
				1185	auto MADef = MSSA.getMemoryAccess(LI)->getDefiningAccess();
				1186	if (!MSSA.dominates(MADef, FMA))
				1187	return false;
				1188	}
				1189	assert(!LIs.empty() && "There are no LoadInst to combine");
				1190
				1191	// It is necessary that insertion point dominates all final ShuffleVectorInst.
				1192	for (auto &VI : InterleavedLoad) {
				1193	if (!DT.dominates(InsertionPoint, VI.SVI))
				1194	return false;
				1195	}
				1196
				1197	// All checks are done. Add instructions detectable by InterleavedAccessPass
				1198	// The old instruction will are left dead.
				1199	IRBuilder<> Builder(InsertionPoint);
				1200	Type *ETy = InterleavedLoad.front().SVI->getType()->getElementType();
				1201	unsigned ElementsPerSVI =
				1202	InterleavedLoad.front().SVI->getType()->getNumElements();
				1203	VectorType ILTy = VectorType::get(ETy, Factor ElementsPerSVI);
				1204
				1205	SmallVector<unsigned, 4> Indices;
				1206	for (unsigned i = 0; i < Factor; i++)
				1207	Indices.push_back(i);
				1208	InterleavedCost = TTI.getInterleavedMemoryOpCost(
				1209	Instruction::Load, ILTy, Factor, Indices, InsertionPoint->getAlignment(),
				1210	InsertionPoint->getPointerAddressSpace());
				1211
				1212	if (InterleavedCost >= InstructionCost) {
				1213	return false;
				1214	}
				1215
				1216	// Create a pointer cast for the wide load.
				1217	auto CI = Builder.CreatePointerCast(InsertionPoint->getOperand(0),
				1218	ILTy->getPointerTo(),
				1219	"interleaved.wide.ptrcast");
				1220
				1221	// Create the wide load and update the MemorySSA.
				1222	auto LI = Builder.CreateAlignedLoad(CI, InsertionPoint->getAlignment(),
				1223	"interleaved.wide.load");
				1224	auto MSSAU = MemorySSAUpdater(&MSSA);
				1225	MemoryUse *MSSALoad = cast<MemoryUse>(MSSAU.createMemoryAccessBefore(
				1226	LI, nullptr, MSSA.getMemoryAccess(InsertionPoint)));
				1227	MSSAU.insertUse(MSSALoad);
				1228
				1229	// Create the final SVIs and replace all uses.
				1230	int i = 0;
				1231	for (auto &VI : InterleavedLoad) {
				1232	SmallVector<uint32_t, 4> Mask;
				1233	for (unsigned j = 0; j < ElementsPerSVI; j++)
				1234	Mask.push_back(i + j * Factor);
				1235
				1236	Builder.SetInsertPoint(VI.SVI);
				1237	auto SVI = Builder.CreateShuffleVector(LI, UndefValue::get(LI->getType()),
				1238	Mask, "interleaved.shuffle");
				1239	VI.SVI->replaceAllUsesWith(SVI);
				1240	i++;
				1241	}
				1242
				1243	NumInterleavedLoadCombine++;
				1244	ORE.emit([&]() {
				1245	return OptimizationRemark(DEBUG_TYPE, "Combined Interleaved Load", LI)
				1246	<< "Load interleaved combined with factor "
				1247	<< ore::NV("Factor", Factor);
				1248	});
				1249
				1250	return true;
				1251	}
				1252
				1253	bool InterleavedLoadCombineImpl::run() {
				1254	OptimizationRemarkEmitter ORE(&F);
				1255	bool changed = false;
				1256	unsigned MaxFactor = TLI.getMaxSupportedInterleaveFactor();
				1257
				1258	auto &DL = F.getParent()->getDataLayout();
				1259
				1260	// Start with the highest factor to avoid combining and recombining.
				1261	for (unsigned Factor = MaxFactor; Factor >= 2; Factor--) {
				1262	std::list<VectorInfo> Candidates;
				1263
				1264	for (BasicBlock &BB : F) {
				1265	for (Instruction &I : BB) {
				1266	if (auto SVI = dyn_cast<ShuffleVectorInst>(&I)) {
				1267
				1268	Candidates.emplace_back(SVI->getType());
				1269
				1270	if (!VectorInfo::computeFromSVI(SVI, Candidates.back(), DL)) {
				1271	Candidates.pop_back();
				1272	continue;
				1273	}
				1274
				1275	if (!Candidates.back().isInterleaved(Factor, DL)) {
				1276	Candidates.pop_back();
				1277	}
				1278	}
				1279	}
				1280	}
				1281
				1282	std::list<VectorInfo> InterleavedLoad;
				1283	while (findPattern(Candidates, InterleavedLoad, Factor, DL)) {
				1284	if (combine(InterleavedLoad, ORE)) {
				1285	changed = true;
				1286	} else {
				1287	// Remove the first element of the Interleaved Load but put the others
				1288	// back on the list and continue searching
				1289	Candidates.splice(Candidates.begin(), InterleavedLoad,
				1290	std::next(InterleavedLoad.begin()),
				1291	InterleavedLoad.end());
				1292	}
				1293	InterleavedLoad.clear();
				1294	}
				1295	}
				1296
				1297	return changed;
				1298	}
				1299
				1300	namespace {
				1301	/// This pass combines interleaved loads into a pattern detectable by
				1302	/// InterleavedAccessPass.
				1303	struct InterleavedLoadCombine : public FunctionPass {
				1304	static char ID;
				1305
				1306	InterleavedLoadCombine() : FunctionPass(ID) {
				1307	initializeInterleavedLoadCombinePass(*PassRegistry::getPassRegistry());
				1308	}
				1309
				1310	StringRef getPassName() const override {
				1311	return "Interleaved Load Combine Pass";
				1312	}
				1313
				1314	bool runOnFunction(Function &F) override {
				1315	if (DisableInterleavedLoadCombine)
				1316	return false;
				1317
				1318	auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
				1319	if (!TPC)
				1320	return false;
				1321
				1322	LLVM_DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName()
				1323	<< "\n");
				1324
				1325	return InterleavedLoadCombineImpl(
				1326	F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
				1327	getAnalysis<MemorySSAWrapperPass>().getMSSA(),
				1328	TPC->getTM<TargetMachine>())
				1329	.run();
				1330	}
				1331
				1332	void getAnalysisUsage(AnalysisUsage &AU) const override {
				1333	AU.addRequired<MemorySSAWrapperPass>();
				1334	AU.addRequired<DominatorTreeWrapperPass>();
				1335	FunctionPass::getAnalysisUsage(AU);
				1336	}
				1337
				1338	private:
				1339	};
				1340	} // anonymous namespace
				1341
				// Definition of the static pass ID declared in InterleavedLoadCombine; it is
				// the value handed to the FunctionPass constructor.
				1342	char InterleavedLoadCombine::ID = 0;
				1343
				// Pass registration under DEBUG_TYPE. The two dependencies listed here are
				// the same analyses that getAnalysisUsage() marks as required.
				// NOTE(review): the trailing `false, false` arguments are presumably the
				// CFG-only / is-analysis flags of INITIALIZE_PASS_* - confirm against the
				// macro definition.
				1344	INITIALIZE_PASS_BEGIN(
				1345	InterleavedLoadCombine, DEBUG_TYPE,
				1346	"Combine interleaved loads into wide loads and shufflevector instructions",
				1347	false, false)
				1348	INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
				1349	INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
				1350	INITIALIZE_PASS_END(
				1351	InterleavedLoadCombine, DEBUG_TYPE,
				1352	"Combine interleaved loads into wide loads and shufflevector instructions",
				1353	false, false)
				1354
				1355	FunctionPass *
				1356	llvm::createInterleavedLoadCombinePass() {
				1357	auto P = new InterleavedLoadCombine();
				1358	return P;
				1359	}