@@ -69,6 +69,7 @@ STATISTIC(NumMemSetInfer, "Number of memsets inferred");
STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy");
STATISTIC(NumCpyToSet, "Number of memcpys converted to memset");
STATISTIC(NumCallSlot, "Number of call slot optimizations performed");
+STATISTIC(NumStackMove, "Number of stack-move optimizations performed");

namespace {
@@ -730,6 +731,23 @@ bool MemCpyOptPass::processStoreOfLoad(StoreInst *SI, LoadInst *LI,
    return true;
  }

+  // If this is a load-store pair from a stack slot to a stack slot, we
+  // might be able to perform the stack-move optimization just as we do for
+  // memcpys from an alloca to an alloca.
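+  // A minimal illustrative shape (placeholder names, assuming both pointers
+  // are allocas):
+  //   %val = load %T, ptr %src
+  //   store %T %val, ptr %dest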
+  if (auto *DestAlloca = dyn_cast<AllocaInst>(SI->getPointerOperand())) {
+    if (auto *SrcAlloca = dyn_cast<AllocaInst>(LI->getPointerOperand())) {
+      if (performStackMoveOptzn(LI, SI, DestAlloca, SrcAlloca,
+                                DL.getTypeStoreSize(T), BAA)) {
+        // Avoid invalidating the iterator.
+        BBI = SI->getNextNonDebugInstruction()->getIterator();
+        eraseInstruction(SI);
+        eraseInstruction(LI);
+        ++NumMemCpyInstr;
+        return true;
+      }
+    }
+  }
+
  return false;
}
@@ -1408,6 +1426,217 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
  return true;
}

+// Attempts to optimize the pattern whereby memory is copied from an alloca to
+// another alloca, where the two allocas don't have conflicting mod/ref. If
+// successful, the two allocas can be merged into one and the transfer can be
+// deleted. This pattern is generated frequently in Rust, due to the ubiquity of
+// move operations in that language.
+//
+// Once we determine that the optimization is safe to perform, we replace all
+// uses of the destination alloca with the source alloca. We also "shrink wrap"
+// the lifetime markers of the single merged alloca to before the first use
+// and after the last use. Note that the "shrink wrapping" procedure is a safe
+// transformation only because we restrict the scope of this optimization to
+// allocas that aren't captured.
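+//
+// As an illustrative sketch (placeholder names, not taken from a real test),
+// the input pattern looks roughly like:
+//
+//   %src  = alloca %T, align 8
+//   %dest = alloca %T, align 8
+//   ...                             ; %src is initialized and used
+//   call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 <size>, i1 false)
+//   ...                             ; only %dest is used afterwards
+//
+// When the safety checks below succeed, every use of %dest is rewritten to use
+// %src and the memcpy is deleted.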
+bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
+                                          AllocaInst *DestAlloca,
+                                          AllocaInst *SrcAlloca, uint64_t Size,
+                                          BatchAAResults &BAA) {
+  LLVM_DEBUG(dbgs() << "Stack Move: Attempting to optimize:\n"
+                    << *Store << "\n");
+
+  // Make sure the two allocas are in the same address space.
+  if (SrcAlloca->getAddressSpace() != DestAlloca->getAddressSpace()) {
+    LLVM_DEBUG(dbgs() << "Stack Move: Address space mismatch\n");
+    return false;
+  }
+
+  // 1. Check that the copy is full. Calculate the static size of the allocas
+  // to be merged; bail out if we can't.
+  const DataLayout &DL = DestAlloca->getModule()->getDataLayout();
+  std::optional<TypeSize> SrcSize = SrcAlloca->getAllocationSize(DL);
+  if (!SrcSize || SrcSize->isScalable() || Size != SrcSize->getFixedValue()) {
+    LLVM_DEBUG(dbgs() << "Stack Move: Source alloca size mismatch\n");
+    return false;
+  }
+  std::optional<TypeSize> DestSize = DestAlloca->getAllocationSize(DL);
+  if (!DestSize || DestSize->isScalable() ||
+      Size != DestSize->getFixedValue()) {
+    LLVM_DEBUG(dbgs() << "Stack Move: Destination alloca size mismatch\n");
+    return false;
+  }
+
+  // 2-1. Check that src and dest are static allocas, which are not affected by
+  // stacksave/stackrestore.
+  if (!SrcAlloca->isStaticAlloca() || !DestAlloca->isStaticAlloca() ||
+      SrcAlloca->getParent() != Load->getParent() ||
+      SrcAlloca->getParent() != Store->getParent())
+    return false;
+
+  // 2-2. Check that src and dest are never captured, unescaped allocas. Also
+  // collect the lifetime markers and the first/last users in order to shrink
+  // wrap the lifetimes, and instructions with noalias metadata to remove them.
+
+  SmallVector<Instruction *, 4> LifetimeMarkers;
+  Instruction *FirstUser = nullptr, *LastUser = nullptr;
+  SmallSet<Instruction *, 4> NoAliasInstrs;
+
+  // Recursively track the users and check whether a modifying alias exists.
+  auto IsDereferenceableOrNull = [](Value *V, const DataLayout &DL) -> bool {
+    bool CanBeNull, CanBeFreed;
+    return V->getPointerDereferenceableBytes(DL, CanBeNull, CanBeFreed);
+  };
+
+  auto CaptureTrackingWithModRef =
+      [&](Instruction *AI,
+          function_ref<bool(Instruction *)> ModRefCallback) -> bool {
+    SmallVector<Instruction *, 8> Worklist;
+    Worklist.push_back(AI);
+    unsigned MaxUsesToExplore = getDefaultMaxUsesToExploreForCaptureTracking();
+    Worklist.reserve(MaxUsesToExplore);
+    SmallSet<const Use *, 20> Visited;
+    while (!Worklist.empty()) {
+      Instruction *I = Worklist.back();
+      Worklist.pop_back();
+      for (const Use &U : I->uses()) {
+        if (Visited.size() >= MaxUsesToExplore) {
+          LLVM_DEBUG(
+              dbgs()
+              << "Stack Move: Exceeded max uses to see ModRef, bailing\n");
+          return false;
+        }
+        if (!Visited.insert(&U).second)
+          continue;
+        switch (DetermineUseCaptureKind(U, IsDereferenceableOrNull)) {
+        case UseCaptureKind::MAY_CAPTURE:
+          return false;
+        case UseCaptureKind::PASSTHROUGH:
+          // Instructions cannot have non-instruction users.
+          Worklist.push_back(cast<Instruction>(U.getUser()));
+          continue;
+        case UseCaptureKind::NO_CAPTURE: {
+          auto *UI = cast<Instruction>(U.getUser());
+          if (DestAlloca->getParent() != UI->getParent())
+            return false;
+          if (!FirstUser || UI->comesBefore(FirstUser))
+            FirstUser = UI;
+          if (!LastUser || LastUser->comesBefore(UI))
+            LastUser = UI;
+          if (UI->isLifetimeStartOrEnd()) {
+            // We note the locations of these intrinsic calls so that we can
+            // delete them later if the optimization succeeds; this is safe
+            // since both llvm.lifetime.start and llvm.lifetime.end intrinsics
+            // conceptually fill all the bytes of the alloca with an undefined
+            // value.
+            int64_t Size = cast<ConstantInt>(UI->getOperand(0))->getSExtValue();
+            if (Size < 0 || Size == DestSize) {
+              LifetimeMarkers.push_back(UI);
+              continue;
+            }
+          }
+          if (UI->hasMetadata(LLVMContext::MD_noalias))
+            NoAliasInstrs.insert(UI);
+          if (!ModRefCallback(UI))
+            return false;
+        }
+        }
+      }
+    }
+    return true;
+  };
+
+  // 3. Check that dest has no Mod/Ref, except full-size lifetime intrinsics,
+  // from the alloca to the Store.
+  ModRefInfo DestModRef = ModRefInfo::NoModRef;
+  MemoryLocation DestLoc(DestAlloca, LocationSize::precise(Size));
+  auto DestModRefCallback = [&](Instruction *UI) -> bool {
+    // We don't care about the store itself.
+    if (UI == Store)
+      return true;
+    ModRefInfo Res = BAA.getModRefInfo(UI, DestLoc);
+    // FIXME: For multi-BB cases, we need to see reachability from it to
+    // store.
+    // Bail out if Dest may have any ModRef before the Store.
+    if (UI->comesBefore(Store) && isModOrRefSet(Res))
+      return false;
+    DestModRef |= BAA.getModRefInfo(UI, DestLoc);
+
+    return true;
+  };
+
+  if (!CaptureTrackingWithModRef(DestAlloca, DestModRefCallback))
+    return false;
+
+  // 4. Check that, from after the Load to the end of the BB,
+  // 4-1. if the dest has any Mod, src has no Ref, and
+  // 4-2. if the dest has any Ref, src has no Mod except full-sized lifetimes.
+  MemoryLocation SrcLoc(SrcAlloca, LocationSize::precise(Size));
+
+  auto SrcModRefCallback = [&](Instruction *UI) -> bool {
+    // Any ModRef before the Load doesn't matter; the Load and the Store can
+    // also be ignored.
+    if (UI->comesBefore(Load) || UI == Load || UI == Store)
+      return true;
+    ModRefInfo Res = BAA.getModRefInfo(UI, SrcLoc);
+    if ((isModSet(DestModRef) && isRefSet(Res)) ||
+        (isRefSet(DestModRef) && isModSet(Res)))
+      return false;
+
+    return true;
+  };
+
+  if (!CaptureTrackingWithModRef(SrcAlloca, SrcModRefCallback))
+    return false;
+
+  // We can do the transformation. First, align the allocas appropriately.
+  SrcAlloca->setAlignment(
+      std::max(SrcAlloca->getAlign(), DestAlloca->getAlign()));
+
+  // Merge the two allocas.
+  DestAlloca->replaceAllUsesWith(SrcAlloca);
+  eraseInstruction(DestAlloca);
+
+  // Drop metadata on the source alloca.
+  SrcAlloca->dropUnknownNonDebugMetadata();
+
+ // Do "shrink wrap" the lifetimes, if the original lifetime intrinsics exists.
1603
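+  // The intended result is roughly (placeholder names):
+  //   call void @llvm.lifetime.start.p0(i64 <size>, ptr %src) ; before first user
+  //   ...remaining users of the merged alloca...
+  //   call void @llvm.lifetime.end.p0(i64 <size>, ptr %src)   ; after last user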
+  if (!LifetimeMarkers.empty()) {
+    LLVMContext &C = SrcAlloca->getContext();
+    IRBuilder<> Builder(C);
+
+    ConstantInt *AllocaSize = ConstantInt::get(Type::getInt64Ty(C), Size);
+    // Create a new lifetime start marker before the first user of src or
+    // alloca users.
+    Builder.SetInsertPoint(FirstUser->getParent(), FirstUser->getIterator());
+    Builder.CreateLifetimeStart(SrcAlloca, AllocaSize);
+
+    // Create a new lifetime end marker after the last user of src or alloca
+    // users.
+    // FIXME: If the last user is the terminator for the bb, we can insert a
+    // lifetime.end marker into the immediate post-dominator, but currently we
+    // do nothing.
+    if (!LastUser->isTerminator()) {
+      Builder.SetInsertPoint(LastUser->getParent(), ++LastUser->getIterator());
+      Builder.CreateLifetimeEnd(SrcAlloca, AllocaSize);
+    }
+
+    // Remove all other lifetime markers.
+    for (Instruction *I : LifetimeMarkers)
+      eraseInstruction(I);
+  }
+
+  // As this transformation can cause memory accesses that didn't previously
+  // alias to begin to alias one another, we remove !noalias metadata from any
+  // uses of either alloca. This is conservative, but more precision doesn't
+  // seem worthwhile right now.
+  for (Instruction *I : NoAliasInstrs)
+    I->setMetadata(LLVMContext::MD_noalias, nullptr);
+
+  LLVM_DEBUG(dbgs() << "Stack Move: Performed stack-move optimization\n");
+  NumStackMove++;
+  return true;
+}
+
/// Perform simplification of memcpy's. If we have memcpy A
/// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite
/// B to be a memcpy from X to Z (or potentially a memmove, depending on
@@ -1464,13 +1693,14 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
  MemoryAccess *SrcClobber = MSSA->getWalker()->getClobberingMemoryAccess(
      AnyClobber, MemoryLocation::getForSource(M), BAA);

-  // There are four possible optimizations we can do for memcpy:
+  // There are five possible optimizations we can do for memcpy:
  // a) memcpy-memcpy xform which exposes redundance for DSE.
  // b) call-memcpy xform for return slot optimization.
  // c) memcpy from freshly alloca'd space or space that has just started
  //    its lifetime copies undefined data, and we can therefore eliminate
  //    the memcpy in favor of the data that was already at the destination.
  // d) memcpy from a just-memset'd source can be turned into memset.
+  // e) elimination of memcpy via stack-move optimization.
  if (auto *MD = dyn_cast<MemoryDef>(SrcClobber)) {
    if (Instruction *MI = MD->getMemoryInst()) {
      if (auto *CopySize = dyn_cast<ConstantInt>(M->getLength())) {
@@ -1489,7 +1719,8 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
        }
      }
      if (auto *MDep = dyn_cast<MemCpyInst>(MI))
-        return processMemCpyMemCpyDependence(M, MDep, BAA);
+        if (processMemCpyMemCpyDependence(M, MDep, BAA))
+          return true;
      if (auto *MDep = dyn_cast<MemSetInst>(MI)) {
        if (performMemCpyToMemSetOptzn(M, MDep, BAA)) {
          LLVM_DEBUG(dbgs() << "Converted memcpy to memset\n");
@@ -1508,6 +1739,27 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
    }
  }

+  // If the transfer is from a stack slot to a stack slot, then we may be able
+  // to perform the stack-move optimization. See the comments in
+  // performStackMoveOptzn() for more details.
+  auto *DestAlloca = dyn_cast<AllocaInst>(M->getDest());
+  if (!DestAlloca)
+    return false;
+  auto *SrcAlloca = dyn_cast<AllocaInst>(M->getSource());
+  if (!SrcAlloca)
+    return false;
+  ConstantInt *Len = dyn_cast<ConstantInt>(M->getLength());
+  if (Len == nullptr)
+    return false;
+  if (performStackMoveOptzn(M, M, DestAlloca, SrcAlloca, Len->getZExtValue(),
+                            BAA)) {
+    // Avoid invalidating the iterator.
+    BBI = M->getNextNonDebugInstruction()->getIterator();
+    eraseInstruction(M);
+    ++NumMemCpyInstr;
+    return true;
+  }
+
  return false;
}