On 10/12/19 12:18 AM, Jiong Wang wrote:
> Currently, the BPF backend performs truncation elimination. If a
> truncation is performed on a value defined by narrow loads, it could be
> redundant, given that BPF loads implicitly zero extend the destination
> register.
>
> When the definition of the truncated value is a merging value (PHI node)
> that could come from different code paths, then checks need to be done
> on all possible code paths.
>
> The optimization described above was introduced in r306685; however, it
> doesn't work when there is a back-edge, for example when a loop is used
> inside BPF code.
>
> For the following code, for example, a zero-extended value should be
> stored into b[i], but the "and reg, 0xffff" is wrongly eliminated, which
> then generates corrupted data.
>
>   void cal1(unsigned short *a, unsigned long *b, unsigned int k)
>   {
>     unsigned short e;
>
>     e = *a;
>     for (unsigned int i = 0; i < k; i++) {
>       b[i] = e;
>       e = ~e;
>     }
>   }
>
> The reason is that r306685 was trying to do the PHI node checks inside
> the isel DAG2DAG phase, and the checks were done on MachineInstr. This
> is actually wrong, because MachineInstrs are still being built during
> the isel phase and the associated information is not complete yet. A
> quick search shows that no target other than BPF accesses MachineInstr
> info during the isel phase.
>
> For a PHI node, when it is reached during the isel phase, it may have
> all predecessors linked, but not successors. It seems successors are
> linked to a PHI node only in SelectionDAGISel::FinishBasicBlock, and
> this happens later than the PreprocessISelDAG hook.
>
> Previously, BPF programs didn't allow loops, which is probably the
> reason why this bug was not exposed.

Thanks for the fix. Yes, previously BPF programs did not allow loops, so I
did an implementation in the SelectionDAGISel phase taking advantage of
this fact. I do agree that a later phase with SSA available is better for
handling all cases (with or without loops). Will look at the patch details
a little bit later.

> This patch therefore fixes the bug with the following approach:
>   - The existing truncation elimination code and the associated
>     "load_to_vreg_" records are removed.
>   - Instead, truncation elimination is implemented as a MachineSSA pass;
>     this is where all the information is available, and it keeps the
>     pass together with other similar peephole optimizations inside
>     BPFMIPeephole.cpp. The redundant move elimination logic is updated
>     accordingly.
>   - Unit testcase included + no compilation errors for kernel BPF
>     selftest.
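To make the zero-extension property concrete for other readers: the AND is
only redundant when every definition reaching the truncation is a narrow
load. A minimal sketch in C (my own illustration, not part of the patch;
the function name is made up):

    /* BPF LDH zero extends the destination register, so in straight-line
     * code the mask really is redundant and can be removed: */
    unsigned long narrow_load_only(unsigned short *a)
    {
        unsigned short e = *a;  /* lowered to LDH; upper 48 bits are 0 */
        return e;               /* "and r0, 0xffff" may be dropped */
    }

In cal1() above, by contrast, the zero-extended value is a PHI merging the
LDH result with the XOR result (~e) arriving over the loop back-edge; the
XOR input is not zero extended, so the "and reg, 0xffff" must stay.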
>
> Reported-by: David Beckett <david.beckett@xxxxxxxxxxxxx>
> Signed-off-by: Jiong Wang <jiong.wang@xxxxxxxxxxxxx>
> ---
>  lib/Target/BPF/BPF.h                  |   2 +
>  lib/Target/BPF/BPFISelDAGToDAG.cpp    | 169 +++---------------------
>  lib/Target/BPF/BPFMIPeephole.cpp      | 187 +++++++++++++++++++++++++-
>  lib/Target/BPF/BPFTargetMachine.cpp   |  12 ++-
>  test/CodeGen/BPF/remove_truncate_6.ll |  80 +++++++++++++++
>  5 files changed, 283 insertions(+), 167 deletions(-)
>  create mode 100644 test/CodeGen/BPF/remove_truncate_6.ll
>
> diff --git a/lib/Target/BPF/BPF.h b/lib/Target/BPF/BPF.h
> index ba21503..6e4f35f 100644
> --- a/lib/Target/BPF/BPF.h
> +++ b/lib/Target/BPF/BPF.h
> @@ -20,12 +20,14 @@ ModulePass *createBPFAbstractMemberAccess(BPFTargetMachine *TM);
>  FunctionPass *createBPFISelDag(BPFTargetMachine &TM);
>  FunctionPass *createBPFMISimplifyPatchablePass();
>  FunctionPass *createBPFMIPeepholePass();
> +FunctionPass *createBPFMIPeepholeTruncElimPass();
>  FunctionPass *createBPFMIPreEmitPeepholePass();
>  FunctionPass *createBPFMIPreEmitCheckingPass();
>
>  void initializeBPFAbstractMemberAccessPass(PassRegistry&);
>  void initializeBPFMISimplifyPatchablePass(PassRegistry&);
>  void initializeBPFMIPeepholePass(PassRegistry&);
> +void initializeBPFMIPeepholeTruncElimPass(PassRegistry&);
>  void initializeBPFMIPreEmitPeepholePass(PassRegistry&);
>  void initializeBPFMIPreEmitCheckingPass(PassRegistry&);
>  }
> diff --git a/lib/Target/BPF/BPFISelDAGToDAG.cpp b/lib/Target/BPF/BPFISelDAGToDAG.cpp
> index 85fa1f2..f2be0ff 100644
> --- a/lib/Target/BPF/BPFISelDAGToDAG.cpp
> +++ b/lib/Target/BPF/BPFISelDAGToDAG.cpp
> @@ -45,9 +45,7 @@ class BPFDAGToDAGISel : public SelectionDAGISel {
>
>  public:
>    explicit BPFDAGToDAGISel(BPFTargetMachine &TM)
> -      : SelectionDAGISel(TM), Subtarget(nullptr) {
> -    curr_func_ = nullptr;
> -  }
> +      : SelectionDAGISel(TM), Subtarget(nullptr) {}
>
>    StringRef getPassName() const override {
>      return "BPF DAG->DAG Pattern Instruction Selection";
> @@ -92,14 +90,8 @@ private:
>                              val_vec_type &Vals, int Offset);
>    bool getConstantFieldValue(const GlobalAddressSDNode *Node, uint64_t Offset,
>                               uint64_t Size, unsigned char *ByteSeq);
> -  bool checkLoadDef(unsigned DefReg, unsigned match_load_op);
> -
>    // Mapping from ConstantStruct global value to corresponding byte-list values
>    std::map<const void *, val_vec_type> cs_vals_;
> -  // Mapping from vreg to load memory opcode
>-  std::map<unsigned, unsigned> load_to_vreg_;
> -  // Current function
> -  const Function *curr_func_;
>  };
>  } // namespace
>
> @@ -325,32 +317,13 @@ void BPFDAGToDAGISel::PreprocessLoad(SDNode *Node,
>  }
>
>  void BPFDAGToDAGISel::PreprocessISelDAG() {
> -  // Iterate through all nodes, interested in the following cases:
> +  // Iterate through all nodes, interested in the following case:
>    //
>    //  . loads from ConstantStruct or ConstantArray of constructs
>    //    which can be turns into constant itself, with this we can
>    //    avoid reading from read-only section at runtime.
>    //
> -  //  . reg truncating is often the result of 8/16/32bit->64bit or
> -  //    8/16bit->32bit conversion. If the reg value is loaded with
> -  //    masked byte width, the AND operation can be removed since
> -  //    BPF LOAD already has zero extension.
> -  //
> -  //    This also solved a correctness issue.
> -  //    In BPF socket-related program, e.g., __sk_buff->{data, data_end}
> -  //    are 32-bit registers, but later on, kernel verifier will rewrite
> -  //    it with 64-bit value. Therefore, truncating the value after the
> -  //    load will result in incorrect code.
> -
> -  // clear the load_to_vreg_ map so that we have a clean start
> -  // for this function.
> -  if (!curr_func_) {
> -    curr_func_ = FuncInfo->Fn;
> -  } else if (curr_func_ != FuncInfo->Fn) {
> -    load_to_vreg_.clear();
> -    curr_func_ = FuncInfo->Fn;
> -  }
> -
> +  // . Removing redundant AND for intrinsic narrow loads.
>    for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
>                                         E = CurDAG->allnodes_end();
>         I != E;) {
> @@ -358,8 +331,6 @@ void BPFDAGToDAGISel::PreprocessISelDAG() {
>      unsigned Opcode = Node->getOpcode();
>      if (Opcode == ISD::LOAD)
>        PreprocessLoad(Node, I);
> -    else if (Opcode == ISD::CopyToReg)
> -      PreprocessCopyToReg(Node);
>      else if (Opcode == ISD::AND)
>        PreprocessTrunc(Node, I);
>    }
> @@ -491,36 +462,6 @@ bool BPFDAGToDAGISel::fillConstantStruct(const DataLayout &DL,
>    return true;
>  }
>
> -void BPFDAGToDAGISel::PreprocessCopyToReg(SDNode *Node) {
> -  const RegisterSDNode *RegN = dyn_cast<RegisterSDNode>(Node->getOperand(1));
> -  if (!RegN || !Register::isVirtualRegister(RegN->getReg()))
> -    return;
> -
> -  const LoadSDNode *LD = dyn_cast<LoadSDNode>(Node->getOperand(2));
> -  if (!LD)
> -    return;
> -
> -  // Assign a load value to a virtual register. record its load width
> -  unsigned mem_load_op = 0;
> -  switch (LD->getMemOperand()->getSize()) {
> -  default:
> -    return;
> -  case 4:
> -    mem_load_op = BPF::LDW;
> -    break;
> -  case 2:
> -    mem_load_op = BPF::LDH;
> -    break;
> -  case 1:
> -    mem_load_op = BPF::LDB;
> -    break;
> -  }
> -
> -  LLVM_DEBUG(dbgs() << "Find Load Value to VReg "
> -                    << Register::virtReg2Index(RegN->getReg()) << '\n');
> -  load_to_vreg_[RegN->getReg()] = mem_load_op;
> -}
> -
>  void BPFDAGToDAGISel::PreprocessTrunc(SDNode *Node,
>                                        SelectionDAG::allnodes_iterator &I) {
>    ConstantSDNode *MaskN = dyn_cast<ConstantSDNode>(Node->getOperand(1));
> @@ -534,112 +475,26 @@ void BPFDAGToDAGISel::PreprocessTrunc(SDNode *Node,
>    // which the generic optimizer doesn't understand their results are
>    // zero extended.
>    SDValue BaseV = Node->getOperand(0);
> -  if (BaseV.getOpcode() == ISD::INTRINSIC_W_CHAIN) {
> -    unsigned IntNo = cast<ConstantSDNode>(BaseV->getOperand(1))->getZExtValue();
> -    uint64_t MaskV = MaskN->getZExtValue();
> -
> -    if (!((IntNo == Intrinsic::bpf_load_byte && MaskV == 0xFF) ||
> -          (IntNo == Intrinsic::bpf_load_half && MaskV == 0xFFFF) ||
> -          (IntNo == Intrinsic::bpf_load_word && MaskV == 0xFFFFFFFF)))
> -      return;
> -
> -    LLVM_DEBUG(dbgs() << "Remove the redundant AND operation in: ";
> -               Node->dump(); dbgs() << '\n');
> -
> -    I--;
> -    CurDAG->ReplaceAllUsesWith(SDValue(Node, 0), BaseV);
> -    I++;
> -    CurDAG->DeleteNode(Node);
> -
> -    return;
> -  }
> -
> -  // Multiple basic blocks case.
> -  if (BaseV.getOpcode() != ISD::CopyFromReg)
> +  if (BaseV.getOpcode() != ISD::INTRINSIC_W_CHAIN)
>      return;
>
> -  unsigned match_load_op = 0;
> -  switch (MaskN->getZExtValue()) {
> -  default:
> -    return;
> -  case 0xFFFFFFFF:
> -    match_load_op = BPF::LDW;
> -    break;
> -  case 0xFFFF:
> -    match_load_op = BPF::LDH;
> -    break;
> -  case 0xFF:
> -    match_load_op = BPF::LDB;
> -    break;
> -  }
> +  unsigned IntNo = cast<ConstantSDNode>(BaseV->getOperand(1))->getZExtValue();
> +  uint64_t MaskV = MaskN->getZExtValue();
>
> -  const RegisterSDNode *RegN =
> -      dyn_cast<RegisterSDNode>(BaseV.getNode()->getOperand(1));
> -  if (!RegN || !Register::isVirtualRegister(RegN->getReg()))
> +  if (!((IntNo == Intrinsic::bpf_load_byte && MaskV == 0xFF) ||
> +        (IntNo == Intrinsic::bpf_load_half && MaskV == 0xFFFF) ||
> +        (IntNo == Intrinsic::bpf_load_word && MaskV == 0xFFFFFFFF)))
>      return;
> -  unsigned AndOpReg = RegN->getReg();
> -  LLVM_DEBUG(dbgs() << "Examine " << printReg(AndOpReg) << '\n');
> -
> -  // Examine the PHI insns in the MachineBasicBlock to found out the
> -  // definitions of this virtual register. At this stage (DAG2DAG
> -  // transformation), only PHI machine insns are available in the machine basic
> -  // block.
> -  MachineBasicBlock *MBB = FuncInfo->MBB;
> -  MachineInstr *MII = nullptr;
> -  for (auto &MI : *MBB) {
> -    for (unsigned i = 0; i < MI.getNumOperands(); ++i) {
> -      const MachineOperand &MOP = MI.getOperand(i);
> -      if (!MOP.isReg() || !MOP.isDef())
> -        continue;
> -      Register Reg = MOP.getReg();
> -      if (Register::isVirtualRegister(Reg) && Reg == AndOpReg) {
> -        MII = &MI;
> -        break;
> -      }
> -    }
> -  }
> -
> -  if (MII == nullptr) {
> -    // No phi definition in this block.
> -    if (!checkLoadDef(AndOpReg, match_load_op))
> -      return;
> -  } else {
> -    // The PHI node looks like:
> -    //   %2 = PHI %0, <%bb.1>, %1, <%bb.3>
> -    // Trace each incoming definition, e.g., (%0, %bb.1) and (%1, %bb.3)
> -    // The AND operation can be removed if both %0 in %bb.1 and %1 in
> -    // %bb.3 are defined with a load matching the MaskN.
> -    LLVM_DEBUG(dbgs() << "Check PHI Insn: "; MII->dump(); dbgs() << '\n');
> -    unsigned PrevReg = -1;
> -    for (unsigned i = 0; i < MII->getNumOperands(); ++i) {
> -      const MachineOperand &MOP = MII->getOperand(i);
> -      if (MOP.isReg()) {
> -        if (MOP.isDef())
> -          continue;
> -        PrevReg = MOP.getReg();
> -        if (!Register::isVirtualRegister(PrevReg))
> -          return;
> -        if (!checkLoadDef(PrevReg, match_load_op))
> -          return;
> -      }
> -    }
> -  }
>
> -  LLVM_DEBUG(dbgs() << "Remove the redundant AND operation in: "; Node->dump();
> -             dbgs() << '\n');
> +  LLVM_DEBUG(dbgs() << "Remove the redundant AND operation in: ";
> +             Node->dump(); dbgs() << '\n');
>
>    I--;
>    CurDAG->ReplaceAllUsesWith(SDValue(Node, 0), BaseV);
>    I++;
>    CurDAG->DeleteNode(Node);
> -}
> -
> -bool BPFDAGToDAGISel::checkLoadDef(unsigned DefReg, unsigned match_load_op) {
> -  auto it = load_to_vreg_.find(DefReg);
> -  if (it == load_to_vreg_.end())
> -    return false; // The definition of register is not exported yet.
> -
> -  return it->second == match_load_op;
> +  return;
>  }
>
>  FunctionPass *llvm::createBPFISelDag(BPFTargetMachine &TM) {
> diff --git a/lib/Target/BPF/BPFMIPeephole.cpp b/lib/Target/BPF/BPFMIPeephole.cpp
> index fafd2f7..2b52ce0 100644
> --- a/lib/Target/BPF/BPFMIPeephole.cpp
> +++ b/lib/Target/BPF/BPFMIPeephole.cpp
> @@ -71,7 +71,7 @@ void BPFMIPeephole::initialize(MachineFunction &MFParm) {
>    MF = &MFParm;
>    MRI = &MF->getRegInfo();
>    TII = MF->getSubtarget<BPFSubtarget>().getInstrInfo();
> -  LLVM_DEBUG(dbgs() << "*** BPF MachineSSA peephole pass ***\n\n");
> +  LLVM_DEBUG(dbgs() << "*** BPF MachineSSA ZEXT Elim peephole pass ***\n\n");
>  }
>
>  bool BPFMIPeephole::isMovFrom32Def(MachineInstr *MovMI)
> @@ -186,7 +186,8 @@ bool BPFMIPeephole::eliminateZExtSeq(void) {
>  } // end default namespace
>
>  INITIALIZE_PASS(BPFMIPeephole, DEBUG_TYPE,
> -                "BPF MachineSSA Peephole Optimization", false, false)
> +                "BPF MachineSSA Peephole Optimization For ZEXT Eliminate",
> +                false, false)
>
>  char BPFMIPeephole::ID = 0;
>  FunctionPass* llvm::createBPFMIPeepholePass() { return new BPFMIPeephole(); }
> @@ -253,12 +254,16 @@ bool BPFMIPreEmitPeephole::eliminateRedundantMov(void) {
>        // enabled. The special type cast insn MOV_32_64 involves different
>        // register class on src (i32) and dst (i64), RA could generate useless
>        // instruction due to this.
> -      if (MI.getOpcode() == BPF::MOV_32_64) {
> +      unsigned Opcode = MI.getOpcode();
> +      if (Opcode == BPF::MOV_32_64 ||
> +          Opcode == BPF::MOV_rr || Opcode == BPF::MOV_rr_32) {
>          Register dst = MI.getOperand(0).getReg();
> -        Register dst_sub = TRI->getSubReg(dst, BPF::sub_32);
>          Register src = MI.getOperand(1).getReg();
>
> -        if (dst_sub != src)
> +        if (Opcode == BPF::MOV_32_64)
> +          dst = TRI->getSubReg(dst, BPF::sub_32);
> +
> +        if (dst != src)
>            continue;
>
>          ToErase = &MI;
> @@ -281,3 +286,175 @@ FunctionPass* llvm::createBPFMIPreEmitPeepholePass()
>  {
>    return new BPFMIPreEmitPeephole();
>  }
> +
> +STATISTIC(TruncElemNum, "Number of truncation eliminated");
> +
> +namespace {
> +
> +struct BPFMIPeepholeTruncElim : public MachineFunctionPass {
> +
> +  static char ID;
> +  const BPFInstrInfo *TII;
> +  MachineFunction *MF;
> +  MachineRegisterInfo *MRI;
> +
> +  BPFMIPeepholeTruncElim() : MachineFunctionPass(ID) {
> +    initializeBPFMIPeepholeTruncElimPass(*PassRegistry::getPassRegistry());
> +  }
> +
> +private:
> +  // Initialize class variables.
> +  void initialize(MachineFunction &MFParm);
> +
> +  bool eliminateTruncSeq(void);
> +
> +public:
> +
> +  // Main entry point for this pass.
> +  bool runOnMachineFunction(MachineFunction &MF) override {
> +    if (skipFunction(MF.getFunction()))
> +      return false;
> +
> +    initialize(MF);
> +
> +    return eliminateTruncSeq();
> +  }
> +};
> +
> +static bool TruncSizeCompatible(int TruncSize, unsigned opcode)
> +{
> +  if (TruncSize == 1)
> +    return opcode == BPF::LDB || opcode == BPF::LDB32;
> +
> +  if (TruncSize == 2)
> +    return opcode == BPF::LDH || opcode == BPF::LDH32;
> +
> +  if (TruncSize == 4)
> +    return opcode == BPF::LDW || opcode == BPF::LDW32;
> +
> +  return false;
> +}
> +
> +// Initialize class variables.
> +void BPFMIPeepholeTruncElim::initialize(MachineFunction &MFParm) {
> +  MF = &MFParm;
> +  MRI = &MF->getRegInfo();
> +  TII = MF->getSubtarget<BPFSubtarget>().getInstrInfo();
> +  LLVM_DEBUG(dbgs() << "*** BPF MachineSSA TRUNC Elim peephole pass ***\n\n");
> +}
> +
> +// Reg truncating is often the result of 8/16/32bit->64bit or
> +// 8/16bit->32bit conversion. If the reg value is loaded with
> +// masked byte width, the AND operation can be removed since
> +// BPF LOAD already has zero extension.
> +//
> +// This also solved a correctness issue.
> +// In BPF socket-related program, e.g., __sk_buff->{data, data_end}
> +// are 32-bit registers, but later on, kernel verifier will rewrite
> +// it with 64-bit value. Therefore, truncating the value after the
> +// load will result in incorrect code.
> +bool BPFMIPeepholeTruncElim::eliminateTruncSeq(void) {
> +  MachineInstr* ToErase = nullptr;
> +  bool Eliminated = false;
> +
> +  for (MachineBasicBlock &MBB : *MF) {
> +    for (MachineInstr &MI : MBB) {
> +      // The second insn to remove if the elimination candidate is a pair.
> +      MachineInstr *MI2 = nullptr;
> +      Register DstReg, SrcReg;
> +      MachineInstr *DefMI;
> +      int TruncSize = -1;
> +
> +      // If the previous instruction was marked for elimination, remove it
> +      // now.
> +      if (ToErase) {
> +        ToErase->eraseFromParent();
> +        ToErase = nullptr;
> +      }
> +
> +      // AND A, 0xFFFFFFFF will be turned into a SLL/SRL pair because the
> +      // immediate for BPF ANDI is i32; this case only happens on ALU64.
> +      if (MI.getOpcode() == BPF::SRL_ri &&
> +          MI.getOperand(2).getImm() == 32) {
> +        SrcReg = MI.getOperand(1).getReg();
> +        MI2 = MRI->getVRegDef(SrcReg);
> +        DstReg = MI.getOperand(0).getReg();
> +
> +        if (!MI2 ||
> +            MI2->getOpcode() != BPF::SLL_ri ||
> +            MI2->getOperand(2).getImm() != 32)
> +          continue;
> +
> +        // Update SrcReg.
> +        SrcReg = MI2->getOperand(1).getReg();
> +        DefMI = MRI->getVRegDef(SrcReg);
> +        if (DefMI)
> +          TruncSize = 4;
> +      } else if (MI.getOpcode() == BPF::AND_ri ||
> +                 MI.getOpcode() == BPF::AND_ri_32) {
> +        SrcReg = MI.getOperand(1).getReg();
> +        DstReg = MI.getOperand(0).getReg();
> +        DefMI = MRI->getVRegDef(SrcReg);
> +
> +        if (!DefMI)
> +          continue;
> +
> +        int64_t imm = MI.getOperand(2).getImm();
> +        if (imm == 0xff)
> +          TruncSize = 1;
> +        else if (imm == 0xffff)
> +          TruncSize = 2;
> +      }
> +
> +      if (TruncSize == -1)
> +        continue;
> +
> +      // The definition is a PHI node: check all of its inputs.
> +      if (DefMI->isPHI()) {
> +        bool CheckFail = false;
> +
> +        for (unsigned i = 1, e = DefMI->getNumOperands(); i < e; i += 2) {
> +          MachineOperand &opnd = DefMI->getOperand(i);
> +          if (!opnd.isReg())
> +            return false;
> +
> +          MachineInstr *PhiDef = MRI->getVRegDef(opnd.getReg());
> +          if (!PhiDef || PhiDef->isPHI() ||
> +              !TruncSizeCompatible(TruncSize, PhiDef->getOpcode())) {
> +            CheckFail = true;
> +            break;
> +          }
> +        }
> +
> +        if (CheckFail)
> +          continue;
> +      } else if (!TruncSizeCompatible(TruncSize, DefMI->getOpcode())) {
> +        continue;
> +      }
> +
> +      BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(BPF::MOV_rr), DstReg)
> +          .addReg(SrcReg);
> +
> +      if (MI2)
> +        MI2->eraseFromParent();
> +
> +      // Mark it in ToErase; it is erased at the start of the next iteration.
> +      ToErase = &MI;
> +      TruncElemNum++;
> +      Eliminated = true;
> +    }
> +  }
> +
> +  return Eliminated;
> +}
> +
> +} // end default namespace
> +
> +INITIALIZE_PASS(BPFMIPeepholeTruncElim, "bpf-mi-trunc-elim",
> +                "BPF MachineSSA Peephole Optimization For TRUNC Eliminate",
> +                false, false)
> +
> +char BPFMIPeepholeTruncElim::ID = 0;
> +FunctionPass* llvm::createBPFMIPeepholeTruncElimPass()
> +{
> +  return new BPFMIPeepholeTruncElim();
> +}
> diff --git a/lib/Target/BPF/BPFTargetMachine.cpp b/lib/Target/BPF/BPFTargetMachine.cpp
> index d940ac9..0c4f2c7 100644
> --- a/lib/Target/BPF/BPFTargetMachine.cpp
> +++ b/lib/Target/BPF/BPFTargetMachine.cpp
> @@ -36,6 +36,7 @@ extern "C" void LLVMInitializeBPFTarget() {
>    PassRegistry &PR = *PassRegistry::getPassRegistry();
>    initializeBPFAbstractMemberAccessPass(PR);
>    initializeBPFMIPeepholePass(PR);
> +  initializeBPFMIPeepholeTruncElimPass(PR);
>  }
>
>  // DataLayout: little or big endian
> @@ -115,15 +116,16 @@ void BPFPassConfig::addMachineSSAOptimization() {
>    TargetPassConfig::addMachineSSAOptimization();
>
>    const BPFSubtarget *Subtarget = getBPFTargetMachine().getSubtargetImpl();
> -  if (Subtarget->getHasAlu32() && !DisableMIPeephole)
> -    addPass(createBPFMIPeepholePass());
> +  if (!DisableMIPeephole) {
> +    if (Subtarget->getHasAlu32())
> +      addPass(createBPFMIPeepholePass());
> +    addPass(createBPFMIPeepholeTruncElimPass());
> +  }
>  }
>
>  void BPFPassConfig::addPreEmitPass() {
> -  const BPFSubtarget *Subtarget = getBPFTargetMachine().getSubtargetImpl();
> -
>    addPass(createBPFMIPreEmitCheckingPass());
>    if (getOptLevel() != CodeGenOpt::None)
> -    if (Subtarget->getHasAlu32() && !DisableMIPeephole)
> +    if (!DisableMIPeephole)
>        addPass(createBPFMIPreEmitPeepholePass());
>  }
> diff --git a/test/CodeGen/BPF/remove_truncate_6.ll b/test/CodeGen/BPF/remove_truncate_6.ll
> new file mode 100644
> index 0000000..6577afb
> --- /dev/null
> +++ b/test/CodeGen/BPF/remove_truncate_6.ll
> @@ -0,0 +1,80 @@
> +; RUN: llc < %s -march=bpf -verify-machineinstrs | FileCheck %s
> +; RUN: llc < %s -march=bpf -mattr=+alu32 -verify-machineinstrs | FileCheck --check-prefix=CHECK-32 %s
> +;
> +; void cal1(unsigned short *a, unsigned long *b, unsigned int k)
> +; {
> +;   unsigned short e;
> +;
> +;   e = *a;
> +;   for (unsigned int i = 0; i < k; i++) {
> +;     b[i] = e;
> +;     e = ~e;
> +;   }
> +; }
> +;
> +; void cal2(unsigned short *a, unsigned int *b, unsigned int k)
> +; {
> +;   unsigned short e;
> +;
> +;   e = *a;
> +;   for (unsigned int i = 0; i < k; i++) {
> +;     b[i] = e;
> +;     e = ~e;
> +;   }
> +; }
> +
> +; Function Attrs: nofree norecurse nounwind optsize
> +define dso_local void @cal1(i16* nocapture readonly %a, i64* nocapture %b, i32 %k) local_unnamed_addr #0 {
> +entry:
> +  %cmp8 = icmp eq i32 %k, 0
> +  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
> +
> +for.body.preheader:                               ; preds = %entry
> +  %0 = load i16, i16* %a, align 2
> +  %wide.trip.count = zext i32 %k to i64
> +  br label %for.body
> +
> +for.cond.cleanup:                                 ; preds = %for.body, %entry
> +  ret void
> +
> +for.body:                                         ; preds = %for.body, %for.body.preheader
> +  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
> +  %e.09 = phi i16 [ %0, %for.body.preheader ], [ %neg, %for.body ]
> +  %conv = zext i16 %e.09 to i64
> +  %arrayidx = getelementptr inbounds i64, i64* %b, i64 %indvars.iv
> +; CHECK: r{{[0-9]+}} &= 65535
> +; CHECK-32: r{{[0-9]+}} &= 65535
> +  store i64 %conv, i64* %arrayidx, align 8
> +  %neg = xor i16 %e.09, -1
> +  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
> +  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
> +  br i1 %exitcond, label %for.cond.cleanup, label %for.body
> +}
> +
> +; Function Attrs: nofree norecurse nounwind optsize
> +define dso_local void @cal2(i16* nocapture readonly %a, i32* nocapture %b, i32 %k) local_unnamed_addr #0 {
> +entry:
> +  %cmp8 = icmp eq i32 %k, 0
> +  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
> +
> +for.body.preheader:                               ; preds = %entry
> +  %0 = load i16, i16* %a, align 2
> +  %wide.trip.count = zext i32 %k to i64
> +  br label %for.body
> +
> +for.cond.cleanup:                                 ; preds = %for.body, %entry
> +  ret void
> +
> +for.body:                                         ; preds = %for.body, %for.body.preheader
> +  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
> +  %e.09 = phi i16 [ %0, %for.body.preheader ], [ %neg, %for.body ]
> +  %conv = zext i16 %e.09 to i32
> +  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
> +; CHECK: r{{[0-9]+}} &= 65535
> +; CHECK-32: w{{[0-9]+}} &= 65535
> +  store i32 %conv, i32* %arrayidx, align 4
> +  %neg = xor i16 %e.09, -1
> +  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
> +  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
> +  br i1 %exitcond, label %for.cond.cleanup, label %for.body
> +}
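A side note on coverage: the SLL_ri/SRL_ri arm of eliminateTruncSeq() is
not exercised by this testcase. As the comment in the pass says, on ALU64
a 32-bit truncation cannot be expressed as AND_ri (the immediate is i32),
so it appears as a shift-left/shift-right-by-32 pair. A hedged C sketch of
source that I would expect to hit that arm (my example, untested, not part
of the patch):

    /* Both incoming values of the PHI are LDW results, i.e. already zero
     * extended, so the "r <<= 32; r >>= 32" pair emitted for the i32->i64
     * zext of the merged value should be rewritten to a plain MOV_rr. */
    unsigned long pick(unsigned int *a, unsigned int *b, int c)
    {
        unsigned int e = c ? *a : *b;  /* LDW on every incoming path */
        return e;                      /* zext of the PHI value */
    }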