Remove use of llvm.codegen intrinsic for GPGPU codegen

We used the llvm.codegen intrinsic to generate code for embedded LLVM-IR strings. The reason we introduced such an intrinsic was that the previous clang/opt tools were NOT linked with the various LLVM targets and their AsmParsers and AsmPrinters. Since clang/opt are now linked with all the needed libraries, we no longer need the llvm.codegen intrinsic. llvm-svn: 211573
2014-06-24 08:11:36 +00:00 · 2014-06-24 08:11:36 +00:00 · cc91169fd7
parent ce8245b5fd
commit cc91169fd7
6 changed files with 192 additions and 526 deletions
--- a/polly/include/polly/CodeGen/PTXGenerator.h
+++ b/polly/include/polly/CodeGen/PTXGenerator.h
@ -159,13 +159,6 @@ private:
  ///                     will be copied from host to device.
  Function *createSubfunctionDefinition(int NumArgs);

-  /// @brief Extract all the ptx related subfunctions into a new module.
-  ///
-  /// @param M            Current module.
-  /// @return             The generated module containing only gpu related
-  ///                     subfunctions.
-  Module *extractPTXFunctionsFromModule(const Module *M);
-
  /// @brief Get the Value of CUDA block width.
  Value *getCUDABlockWidth();

--- a/polly/lib/CodeGen/PTXGenerator.cpp
+++ b/polly/lib/CodeGen/PTXGenerator.cpp
@ -22,10 +22,12 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Module.h"
+#include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Cloning.h"
@ -507,44 +509,98 @@ Value *PTXGenerator::getOutputArraySizeInBytes() {
  return ConstantInt::get(getInt64Type(), OutputBytes);
 }

+static Module *extractPTXFunctionsFromModule(const Module *M,
+                                             const StringRef &Triple) {
+  llvm::ValueToValueMapTy VMap;
+  Module *New = new Module("TempGPUModule", M->getContext());
+  New->setTargetTriple(Triple::normalize(Triple));
+
+  // Loop over the functions in the module, making external functions as before
+  for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I) {
+    if (!I->isDeclaration() &&
+        (I->getCallingConv() == CallingConv::PTX_Device ||
+         I->getCallingConv() == CallingConv::PTX_Kernel)) {
+      Function *NF =
+          Function::Create(cast<FunctionType>(I->getType()->getElementType()),
+                           I->getLinkage(), I->getName(), New);
+      NF->copyAttributesFrom(I);
+      VMap[I] = NF;
+
+      Function::arg_iterator DestI = NF->arg_begin();
+      for (Function::const_arg_iterator J = I->arg_begin(); J != I->arg_end();
+           ++J) {
+        DestI->setName(J->getName());
+        VMap[J] = DestI++;
+      }
+      SmallVector<ReturnInst *, 8> Returns; // Ignore returns cloned.
+      CloneFunctionInto(NF, I, VMap, /*ModuleLevelChanges=*/true, Returns);
+    }
+  }
+
+  return New;
+}
+
+static bool createASMAsString(Module *New, const StringRef &Triple,
+                              const StringRef &MCPU, const StringRef &Features,
+                              std::string &ASM) {
+  llvm::Triple TheTriple(Triple::normalize(Triple));
+  std::string ErrMsg;
+  const Target *TheTarget =
+      TargetRegistry::lookupTarget(TheTriple.getTriple(), ErrMsg);
+  if (!TheTarget) {
+    errs() << ErrMsg << "\n";
+    return false;
+  }
+
+  TargetOptions Options;
+  std::unique_ptr<TargetMachine> target(TheTarget->createTargetMachine(
+      TheTriple.getTriple(), MCPU, Features, Options));
+  assert(target.get() && "Could not allocate target machine!");
+  TargetMachine &Target = *target.get();
+
+  // Build up all of the passes that we want to do to the module.
+  PassManager PM;
+
+  TargetLibraryInfo *TLI = new TargetLibraryInfo(TheTriple);
+  PM.add(TLI);
+
+  PM.add(new DataLayoutPass(*Target.getDataLayout()));
+  Target.addAnalysisPasses(PM);
+
+  {
+    raw_string_ostream NameROS(ASM);
+    formatted_raw_ostream FOS(NameROS);
+
+    // Ask the target to add backend passes as necessary.
+    int UseVerifier = true;
+    if (Target.addPassesToEmitFile(PM, FOS, TargetMachine::CGFT_AssemblyFile,
+                                   UseVerifier)) {
+      errs() << "The target does not support generation of this file type!\n";
+      return false;
+    }
+
+    PM.run(*New);
+    FOS.flush();
+  }
+
+  return true;
+}
+
 Value *PTXGenerator::createPTXKernelFunction(Function *SubFunction) {
  Module *M = getModule();
+  Module *GPUModule = extractPTXFunctionsFromModule(M, GPUTriple);
  std::string LLVMKernelStr;
-  raw_string_ostream NameROS(LLVMKernelStr);
-  formatted_raw_ostream FOS(NameROS);
-  FOS << "target triple = \"" << GPUTriple << "\"\n";
-  SubFunction->print(FOS);
-
-  // Insert ptx intrinsics into the kernel string.
-  for (Module::iterator I = M->begin(), E = M->end(); I != E;) {
-    Function *F = I++;
-    // Function must be a prototype and unused.
-    if (F->isDeclaration() && F->isIntrinsic()) {
-      switch (F->getIntrinsicID()) {
-      case Intrinsic::ptx_read_nctaid_x:
-      case Intrinsic::ptx_read_nctaid_y:
-      case Intrinsic::ptx_read_ctaid_x:
-      case Intrinsic::ptx_read_ctaid_y:
-      case Intrinsic::ptx_read_ntid_x:
-      case Intrinsic::ptx_read_ntid_y:
-      case Intrinsic::ptx_read_tid_x:
-      case Intrinsic::ptx_read_tid_y:
-        F->print(FOS);
-        break;
-      default:
-        break;
-      }
-    }
+  if (!createASMAsString(GPUModule, GPUTriple, "sm_20" /*MCPU*/,
+                         "" /*Features*/, LLVMKernelStr)) {
+    errs() << "Generate ptx string failed!\n";
+    return NULL;
  }

  Value *LLVMKernel =
      Builder.CreateGlobalStringPtr(LLVMKernelStr, "llvm_kernel");
-  Value *MCPU = Builder.CreateGlobalStringPtr("sm_10", "mcpu");
-  Value *Features = Builder.CreateGlobalStringPtr("", "cpu_features");

-  Function *GetDeviceKernel = Intrinsic::getDeclaration(M, Intrinsic::codegen);
-
-  return Builder.CreateCall3(GetDeviceKernel, LLVMKernel, MCPU, Features);
+  delete GPUModule;
+  return LLVMKernel;
 }

 Value *PTXGenerator::getPTXKernelEntryName(Function *SubFunction) {
--- a/polly/test/Cloog/CodeGen/GPGPU/1d_parallel.ll
+++ b/polly/test/Cloog/CodeGen/GPGPU/1d_parallel.ll
@ -0,0 +1,72 @@
+; REQUIRES: nvptx-registered-target
+; RUN: opt %loadPolly -basicaa -polly-import-jscop -polly-import-jscop-dir=%S -polly-import-jscop-postfix=transformed+gpu -enable-polly-gpgpu -polly-gpgpu-triple=nvptx64-unknown-unknown -polly-codegen < %s -S | FileCheck %s
+
+;int A[1024];
+
+;int gpu() {
+;  int i;
+;
+;  for(i = 0; i < 1024; i++)
+;    A[i] = i*128 + 508;
+;
+;  return 0;
+;}
+;
+;int main() {
+;  int b = gpu();
+;  return 0;
+;}
+
+; ModuleID = '1d_parallel.s'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = common global [1024 x i32] zeroinitializer, align 16
+
+; Function Attrs: nounwind uwtable
+define i32 @gpu() #0 {
+  br label %.split
+
+.split:                                           ; preds = %0
+  br label %1
+
+; <label>:1                                       ; preds = %.split, %1
+  %indvar = phi i64 [ 0, %.split ], [ %indvar.next, %1 ]
+  %2 = mul i64 %indvar, 128
+  %3 = add i64 %2, 508
+  %4 = trunc i64 %3 to i32
+  %scevgep = getelementptr [1024 x i32]* @A, i64 0, i64 %indvar
+  store i32 %4, i32* %scevgep, align 4
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp ne i64 %indvar.next, 1024
+  br i1 %exitcond, label %1, label %5
+
+; <label>:5                                       ; preds = %1
+  ret i32 0
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @main() #0 {
+  br label %.split
+
+.split:                                           ; preds = %0
+  %1 = tail call i32 @gpu()
+  ret i32 0
+}
+
+; CHECK:  call void @polly_initDevice
+; CHECK:  call void @polly_getPTXModule
+; CHECK:  call void @polly_getPTXKernelEntry
+; CHECK:  call void @polly_allocateMemoryForHostAndDevice
+; CHECK:  call void @polly_setKernelParameters
+; CHECK:  call void @polly_startTimerByCudaEvent
+; CHECK:  call void @polly_launchKernel
+; CHECK:  call void @polly_copyFromDeviceToHost
+; CHECK:  call void @polly_stopTimerByCudaEvent
+; CHECK:  call void @polly_cleanupGPGPUResources
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"clang version 3.5.0 "}
--- a/polly/test/Cloog/CodeGen/GPGPU/gpu___%1---%5.jscop
+++ b/polly/test/Cloog/CodeGen/GPGPU/gpu___%1---%5.jscop
@ -0,0 +1,17 @@
+{
+   "context" : "{  :  }",
+   "name" : "%1 => %5",
+   "statements" : [
+      {
+         "accesses" : [
+            {
+               "kind" : "write",
+               "relation" : "{ Stmt_1[i0] -> MemRef_A[i0] }"
+            }
+         ],
+         "domain" : "{ Stmt_1[i0] : i0 >= 0 and i0 <= 1023 }",
+         "name" : "Stmt_1",
+         "schedule" : "{ Stmt_1[i0] -> scattering[0, i0, 0] }"
+      }
+   ]
+}
--- a/polly/test/Cloog/CodeGen/GPGPU/gpu___%1---%5.jscop.transformed+gpu
+++ b/polly/test/Cloog/CodeGen/GPGPU/gpu___%1---%5.jscop.transformed+gpu
@ -0,0 +1,17 @@
+{
+   "context" : "{  :  }",
+   "name" : "%1 => %5",
+   "statements" : [
+      {
+         "accesses" : [
+            {
+               "kind" : "write",
+               "relation" : "{ Stmt_1[i0] -> MemRef_A[i0] }"
+            }
+         ],
+         "domain" : "{ Stmt_1[i0] : i0 >= 0 and i0 <= 1023 }",
+         "name" : "Stmt_1",
+         "schedule" : "{ Stmt_1[i0] -> scattering[0, o0, o1, o2, o3] : o0 >= 0 and o0 <= 1 and o1 >= 0 and o1 <= 1 and o2 >= 0 and o2 <= 15 and o3 >= 0 and o3 <= 15 and i0 = 512o0 + 256o1 + 16o2 + o3 }"
+      }
+   ]
+}
--- a/polly/utils/0001-Add-llvm.codegen-intrinsic.patch
+++ b/polly/utils/0001-Add-llvm.codegen-intrinsic.patch
@ -1,489 +0,0 @@
-From 7e36390f24f6ceaea7bc2ba4adcd55d06cf73439 Mon Sep 17 00:00:00 2001
-From: Yabin Hu <yabin.hwu@gmail.com>
-Date: Thu, 29 Nov 2012 16:08:29 +0800
-Subject: [PATCH] Add llvm.codegen intrinsic.
-
-The llvm.codegen intrinsic generates code for embedded LLVM-IR
-strings. Each call to the intrinsic is replaced by a pointer to
-the newly generated target code. The code generation target can be
-different to the one of the parent module.
---
- docs/LangRef.html                                  |   36 +++
- include/llvm/CodeGen/Passes.h                      |    3 +
- include/llvm/InitializePasses.h                    |    1 +
- include/llvm/Intrinsics.td                         |    4 +
- lib/CodeGen/CMakeLists.txt                         |    1 +
- lib/CodeGen/CodeGen.cpp                            |    1 +
- lib/CodeGen/CodeGenIntrinsic.cpp                   |  227 ++++++++++++++++++++
- lib/CodeGen/Passes.cpp                             |    3 +
- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp   |    2 +
- lib/Target/LLVMBuild.txt                           |    2 +-
- lib/VMCore/Verifier.cpp                            |   10 +
- .../CodeGen/X86/EmbeddedCG/embedded-codegen-ptx.ll |   28 +++
- test/CodeGen/X86/EmbeddedCG/lit.local.cfg          |    5 +
- 13 files changed, 322 insertions(+), 1 deletions(-)
- create mode 100644 lib/CodeGen/CodeGenIntrinsic.cpp
- create mode 100644 test/CodeGen/X86/EmbeddedCG/embedded-codegen-ptx.ll
- create mode 100644 test/CodeGen/X86/EmbeddedCG/lit.local.cfg
-
-diff --git a/docs/LangRef.html b/docs/LangRef.html
-index cfc1c7d..eae069e 100644
--- a/docs/LangRef.html
-+++ b/docs/LangRef.html
-@@ -243,6 +243,7 @@
-           <li><a href="#int_prefetch">'<tt>llvm.prefetch</tt>' Intrinsic</a></li>
-           <li><a href="#int_pcmarker">'<tt>llvm.pcmarker</tt>' Intrinsic</a></li>
-           <li><a href="#int_readcyclecounter">'<tt>llvm.readcyclecounter</tt>' Intrinsic</a></li>
-+          <li><a href="#int_codegen_intrinsic">'<tt>llvm.codegen</tt>' Intrinsic</a></li>
-         </ol>
-       </li>
-       <li><a href="#int_libc">Standard C Library Intrinsics</a>
-@@ -7249,6 +7250,41 @@ LLVM</a>.</p>
- 
- </div>
- 
-+<!-- _______________________________________________________________________ -->
-+<h4>
-+  <a name="int_codegen_intrinsic">'<tt>llvm.codegen</tt>' Intrinsic</a>
-+</h4>
-+
-+<div>
-+
-+<h5>Syntax:</h5>
-+<pre>
-+  declare i8* @llvm.codegen(i8* &lt;IRString&gt;, i8* &lt;MCPU&gt;, i8* &lt;
-+  Features&gt;)
-+</pre>
-+
-+<h5>Overview:</h5>
-+<p>The '<tt>llvm.codegen</tt>' intrinsic uses the LLVM back ends to generate
-+   code for embedded LLVM-IR strings. The code generation target can be
-+   different to the one of the parent module.</p>
-+
-+<h5>Arguments:</h5>
-+<p><tt>IRString</tt> is a string containing LLVM-IR.</p>
-+<p><tt>MCPU</tt> is the name of the target CPU.</p>
-+<p><tt>Features</tt> is the string representation of the additional target
-+   features.</p>
-+
-+<h5>Semantics:</h5>
-+<p>The '<tt>llvm.codegen</tt>' intrinsic transforms a string containing LLVM IR
-+   to target assembly code. Calls to the intrinsic are replaced by a pointer to
-+   the newly generated target code. In case LLVM can not generate code (e.g. the
-+   target is not available), the call to the intrinsic is replaced by a i8 NULL
-+   pointer.Users of this intrinsic should make sure the target triple is
-+   properly set in the &lt;IRString&gt;. Inputs to both &lt;MCPU&gt; and
-+   &lt;Features&gt; parameters can be null pointers.</p>
-+
-+</div>
-+
- </div>
- 
- <!-- ======================================================================= -->
-diff --git a/include/llvm/CodeGen/Passes.h b/include/llvm/CodeGen/Passes.h
-index 44c9676..57b3aa2 100644
--- a/include/llvm/CodeGen/Passes.h
-+++ b/include/llvm/CodeGen/Passes.h
-@@ -432,6 +432,9 @@ namespace llvm {
-   /// branch folding).
-   extern char &GCMachineCodeAnalysisID;
- 
-+  /// CodeGenIntrinsic Pass - Create target code for embedded LLVM-IR strings.
-+  FunctionPass *createCodeGenIntrinsicPass();
-+
-   /// Deleter Pass - Releases GC metadata.
-   ///
-   FunctionPass *createGCInfoDeleter();
-diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h
-index e06b892..fe8655e 100644
--- a/include/llvm/InitializePasses.h
-+++ b/include/llvm/InitializePasses.h
-@@ -93,6 +93,7 @@ void initializeCorrelatedValuePropagationPass(PassRegistry&);
- void initializeDAEPass(PassRegistry&);
- void initializeDAHPass(PassRegistry&);
- void initializeDCEPass(PassRegistry&);
-+void initializeCodeGenIntrinsicPass(PassRegistry&);
- void initializeDSEPass(PassRegistry&);
- void initializeDeadInstEliminationPass(PassRegistry&);
- void initializeDeadMachineInstructionElimPass(PassRegistry&);
-diff --git a/include/llvm/Intrinsics.td b/include/llvm/Intrinsics.td
-index d3a548c..a60d2bb 100644
--- a/include/llvm/Intrinsics.td
-+++ b/include/llvm/Intrinsics.td
-@@ -238,6 +238,10 @@ def int_readcyclecounter : Intrinsic<[llvm_i64_ty]>;
- // guard to the correct place on the stack frame.
- def int_stackprotector : Intrinsic<[], [llvm_ptr_ty, llvm_ptrptr_ty], []>;
- 
-+//===----------------- Code Generation for Embedded LLVM-IR ---------------===//
-+def int_codegen  : Intrinsic<[llvm_ptr_ty],
-+                              [llvm_ptr_ty, llvm_ptr_ty, llvm_ptr_ty]>;
-+
- //===------------------- Standard C Library Intrinsics --------------------===//
- //
- 
-diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt
-index 7a20ff6..8e1ab9a 100644
--- a/lib/CodeGen/CMakeLists.txt
-+++ b/lib/CodeGen/CMakeLists.txt
-@@ -6,6 +6,7 @@ add_llvm_library(LLVMCodeGen
-   CalcSpillWeights.cpp
-   CallingConvLower.cpp
-   CodeGen.cpp
-+  CodeGenIntrinsic.cpp
-   CodePlacementOpt.cpp
-   CriticalAntiDepBreaker.cpp
-   DeadMachineInstructionElim.cpp
-diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp
-index a53f6f8..702ee18 100644
--- a/lib/CodeGen/CodeGen.cpp
-+++ b/lib/CodeGen/CodeGen.cpp
-@@ -21,6 +21,7 @@ using namespace llvm;
- void llvm::initializeCodeGen(PassRegistry &Registry) {
-   initializeBranchFolderPassPass(Registry);
-   initializeCalculateSpillWeightsPass(Registry);
-+  initializeCodeGenIntrinsicPass(Registry);
-   initializeCodePlacementOptPass(Registry);
-   initializeDeadMachineInstructionElimPass(Registry);
-   initializeEarlyIfConverterPass(Registry);
-diff --git a/lib/CodeGen/CodeGenIntrinsic.cpp b/lib/CodeGen/CodeGenIntrinsic.cpp
-new file mode 100644
-index 0000000..cf8aa54
--- /dev/null
-+++ b/lib/CodeGen/CodeGenIntrinsic.cpp
-@@ -0,0 +1,227 @@
-+//===-- CodeGenIntrinsic.cpp - CodeGen Intrinsic --------------------------===//
-+//
-+//                     The LLVM Compiler Infrastructure
-+//
-+// This file is distributed under the University of Illinois Open Source
-+// License. See LICENSE.TXT for details.
-+//
-+//===----------------------------------------------------------------------===//
-+//
-+// This file implements the llvm.codegen intrinsic.
-+//
-+//===----------------------------------------------------------------------===//
-+
-+#include "llvm/CodeGen/Passes.h"
-+#include "llvm/CallingConv.h"
-+#include "llvm/IRBuilder.h"
-+#include "llvm/IntrinsicInst.h"
-+#include "llvm/LLVMContext.h"
-+#include "llvm/Module.h"
-+#include "llvm/PassManager.h"
-+#include "llvm/AsmParser/Parser.h"
-+#include "llvm/Target/TargetMachine.h"
-+#include "llvm/Target/TargetRegisterInfo.h"
-+#include "llvm/Support/Debug.h"
-+#include "llvm/Support/ErrorHandling.h"
-+#include "llvm/Support/FormattedStream.h"
-+#include "llvm/Support/Host.h"
-+#include "llvm/Support/raw_ostream.h"
-+#include "llvm/Support/SourceMgr.h"
-+#include "llvm/Support/TargetRegistry.h"
-+#include "llvm/ADT/Triple.h"
-+
-+using namespace llvm;
-+
-+namespace {
-+  /// ASMGenerator generates target-specific assembly code from LLVM IR.
-+  class ASMGenerator {
-+  public:
-+    ASMGenerator() {}
-+
-+    /// generate - Generates a target code string from a LLVM IR Value.
-+    bool generate(Value *IRStr, Value *MCPUStr, Value *FeaturesStr,
-+                  std::string &ASM);
-+
-+  private:
-+    bool getStringFromConstantExpr(Value *ConstData, std::string &Out) const;
-+  };
-+
-+  /// CodeGenIntrinsic - This pass replaces each call to the llvm.codegen
-+  /// intrinsic with a string generated by ASMGenerator.
-+  class CodeGenIntrinsic : public FunctionPass {
-+  public:
-+    static char ID;
-+
-+    CodeGenIntrinsic();
-+    const char *getPassName() const;
-+    virtual bool runOnFunction(Function &F);
-+  };
-+}
-+
-+// -----------------------------------------------------------------------------
-+static bool getTargetMachineFromModule(Module *M, const StringRef &TripleStr,
-+                                       const StringRef &MCPU,
-+                                       const StringRef &Features,
-+                                       TargetMachine *&TM) {
-+  std::string ErrMsg;
-+  const Target *TheTarget = TargetRegistry::lookupTarget(TripleStr, ErrMsg);
-+  if (!TheTarget) {
-+    errs() << ErrMsg << "\n";
-+    return false;
-+  }
-+
-+  TargetOptions Options;
-+  TM = TheTarget->createTargetMachine(TripleStr, MCPU, Features, Options);
-+  assert(TM && "Could not allocate target machine!");
-+  return true;
-+}
-+
-+static bool createASMAsString(Module *New, const StringRef &Triple,
-+                              const StringRef &MCPU, const StringRef &Features,
-+                              std::string &ASM) {
-+  TargetMachine *Target;
-+  if (!getTargetMachineFromModule(New, Triple, MCPU, Features, Target)) {
-+    return false;
-+  }
-+
-+  // Build up all of the passes that we want to do to the module.
-+  PassManager PM;
-+
-+  // Get the data layout of the new module. If it is empty, return false.
-+  const std::string &ModuleDataLayout = New->getDataLayout();
-+  if (ModuleDataLayout.empty())
-+    return false;
-+
-+  {
-+    raw_string_ostream NameROS(ASM);
-+    formatted_raw_ostream FOS(NameROS);
-+
-+    // Ask the target to add backend passes as necessary.
-+    int UseVerifier = true;
-+    if (Target->addPassesToEmitFile(PM, FOS, TargetMachine::CGFT_AssemblyFile,
-+                                    UseVerifier)) {
-+      errs() << "CodeGen Intrinsic: target does not support generation of this "
-+             << "file type!\n";
-+
-+      return false;
-+    }
-+
-+    PM.run(*New);
-+    FOS.flush();
-+  }
-+
-+  delete Target;
-+  return true;
-+}
-+
-+bool ASMGenerator::getStringFromConstantExpr(Value *ConstData,
-+                                             std::string &Out) const {
-+  bool Result = false;
-+  if (ConstantExpr *U = dyn_cast<ConstantExpr>(ConstData)) {
-+    Value *R = U->getOperand(0);
-+    if (GlobalVariable *GV = dyn_cast<GlobalVariable>(R)) {
-+      Constant *C = GV->getInitializer();
-+      if (ConstantDataArray *CA = dyn_cast<ConstantDataArray>(C)) {
-+        Out = CA->getAsString();
-+        Result = true;
-+      }
-+    }
-+  }
-+  return Result;
-+}
-+
-+bool ASMGenerator::generate(Value *IRStr, Value *MCPUStr, Value *FeaturesStr,
-+                            std::string &ASM) {
-+  std::string Kernel;
-+  if (!getStringFromConstantExpr(IRStr, Kernel))
-+    return false;
-+
-+  std::string MCPU;
-+  if (!getStringFromConstantExpr(MCPUStr, MCPU))
-+    MCPU = "";
-+
-+  std::string Features;
-+  if (!getStringFromConstantExpr(FeaturesStr, Features))
-+    Features = "";
-+
-+  SMDiagnostic ErrorMessage;
-+  LLVMContext Context;
-+  std::auto_ptr<Module> TempModule(
-+    ParseAssemblyString(Kernel.c_str(), 0, ErrorMessage, Context));
-+
-+  Triple TheTriple(TempModule->getTargetTriple());
-+  const std::string TripleStr = TheTriple.getTriple();
-+  if(TripleStr.empty()) {
-+    errs() << "error: Target triple isn't set correctly for the new module.\n";
-+    return false;
-+  }
-+
-+  return createASMAsString(TempModule.get(), TripleStr.data(), MCPU.data(),
-+                           Features.data(), ASM);
-+}
-+
-+// -----------------------------------------------------------------------------
-+INITIALIZE_PASS(CodeGenIntrinsic, "codegen-intrinsic", "CodeGen Intrinsic",
-+                false, false)
-+
-+FunctionPass *llvm::createCodeGenIntrinsicPass() {
-+  return new CodeGenIntrinsic();
-+}
-+
-+char CodeGenIntrinsic::ID = 0;
-+
-+CodeGenIntrinsic::CodeGenIntrinsic()
-+  : FunctionPass(ID) {
-+}
-+
-+const char *CodeGenIntrinsic::getPassName() const {
-+  return "Lowering CodeGen Intrinsic.";
-+}
-+
-+bool CodeGenIntrinsic::runOnFunction(Function &F) {
-+  bool MadeChange = false;
-+  Module *M = F.getParent();
-+  if (Function *CG = M->getFunction("llvm.codegen")) {
-+    for (Function::use_iterator I = CG->use_begin(), E = CG->use_end();
-+         I != E; ++I) {
-+      if (CallInst *CI = dyn_cast<CallInst>(*I)) {
-+        if (&F != CI->getParent()->getParent())
-+          continue;
-+
-+        std::string ASM;
-+        ASMGenerator *Generator = new ASMGenerator();
-+        IRBuilder<> Builder(CI->getParent(), CI);
-+        Value *St;
-+        if (!Generator->generate(CI->getArgOperand(0), CI->getArgOperand(1),
-+                                 CI->getArgOperand(2), ASM)) {
-+          Type *Ty= CG->getReturnType();
-+          St = Constant::getNullValue(Ty);
-+        } else {
-+          // FIXME: Is this actually dangerous as WritingAnLLVMPass.html claims?
-+          //        Seems that, short of multithreaded LLVM, it should be safe;
-+          //        all that is necessary is that a simple Module::iterator loop
-+          //        not be invalidated. Appending to the GlobalVariable list is
-+          //        safe in that sense.
-+          //
-+          //        All the output passes emit globals last. The ExecutionEngine
-+          //        explicitly supports adding globals to the module after
-+          //        initialization.
-+          //
-+          //        Still, if it isn't deemed acceptable, then this
-+          //        transformation needs to be a ModulePass (which means it
-+          //        cannot be in the  'llc' pipeline  (which uses a
-+          //        FunctionPassManager (which segfaults (not asserts) if
-+          //        provided a ModulePass))).
-+          St = Builder.CreateGlobalStringPtr(ASM, "ASM");
-+        }
-+        CI->replaceAllUsesWith(St);
-+        CI->eraseFromParent();
-+        // We should erase the unused globals from current module. But we
-+        // can't do this within a FunctionPass.
-+        MadeChange = true;
-+      }
-+    }
-+  }
-+
-+  return MadeChange;
-+}
-diff --git a/lib/CodeGen/Passes.cpp b/lib/CodeGen/Passes.cpp
-index 526d994..1de0c63 100644
--- a/lib/CodeGen/Passes.cpp
-+++ b/lib/CodeGen/Passes.cpp
-@@ -369,6 +369,9 @@ void TargetPassConfig::addIRPasses() {
- 
-   addPass(createGCLoweringPass());
- 
-+  // Generate target code for embedded LLVM-IR strings.
-+  addPass(createCodeGenIntrinsicPass());
-+
-   // Make sure that no unreachable blocks are instruction selected.
-   addPass(createUnreachableBlockEliminationPass());
- }
-diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
-index 56e774c..97006c0 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
-+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
-@@ -5169,6 +5169,8 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
-   case Intrinsic::donothing:
-     // ignore
-     return 0;
-+  case Intrinsic::codegen:
-+    llvm_unreachable("failed to lower codegen intrinsic!");
-   }
- }
- 
-diff --git a/lib/Target/LLVMBuild.txt b/lib/Target/LLVMBuild.txt
-index eb6c779..a54f57c 100644
--- a/lib/Target/LLVMBuild.txt
-+++ b/lib/Target/LLVMBuild.txt
-@@ -45,7 +45,7 @@ parent = Libraries
- type = Library
- name = Target
- parent = Libraries
-required_libraries = Core MC Support
-+required_libraries = Core MC Support AsmParser
- 
- ; This is a special group whose required libraries are extended (by llvm-build)
- ; with every built target, which makes it easy for tools to include every
-diff --git a/lib/VMCore/Verifier.cpp b/lib/VMCore/Verifier.cpp
-index 3782957..896772a 100644
--- a/lib/VMCore/Verifier.cpp
-+++ b/lib/VMCore/Verifier.cpp
-@@ -1952,6 +1952,16 @@ void Verifier::visitIntrinsicFunctionCall(Intrinsic::ID ID, CallInst &CI) {
-     Assert1(isa<ConstantInt>(CI.getArgOperand(1)),
-             "llvm.invariant.end parameter #2 must be a constant integer", &CI);
-     break;
-+  case Intrinsic::codegen:
-+    Assert1(isa<ConstantExpr>(CI.getArgOperand(0)),
-+            "llvm.codegen parameter #1 must be a constant expression", &CI);
-+    Assert1(isa<ConstantExpr>(CI.getArgOperand(1)) ||
-+            isa<ConstantPointerNull>(CI.getArgOperand(1)),
-+            "llvm.codegen parameter #2 must be a constant expression", &CI);
-+    Assert1(isa<ConstantExpr>(CI.getArgOperand(2)) ||
-+            isa<ConstantPointerNull>(CI.getArgOperand(2)),
-+            "llvm.codegen parameter #3 must be a constant expression", &CI);
-+    break;
-   }
- }
- 
-diff --git a/test/CodeGen/X86/EmbeddedCG/embedded-codegen-ptx.ll b/test/CodeGen/X86/EmbeddedCG/embedded-codegen-ptx.ll
-new file mode 100644
-index 0000000..73d34e1
--- /dev/null
-+++ b/test/CodeGen/X86/EmbeddedCG/embedded-codegen-ptx.ll
-@@ -0,0 +1,28 @@
-+; RUN: llc < %s -march=x86 | FileCheck %s
-+
-+; ModuleID = 'embedded-codegen-ptx.ll'
-+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
-+target triple = "i386-pc-linux-gnu"
-+
-+@llvm_kernel = private unnamed_addr constant [1940 x i8] c"target triple = \22nvptx-unknown-unknown\22\0A\0Adefine internal ptx_kernel void @gpu_codegen.ptx_subfn(i8* %ptx.Array) {\0Aptx.setup:\0A  %0 = bitcast i8* %ptx.Array to [128 x [128 x i32]]*\0A  %1 = call i32 @llvm.ptx.read.nctaid.x()\0A  %2 = zext i32 %1 to i64\0A  %3 = call i32 @llvm.ptx.read.nctaid.y()\0A  %4 = zext i32 %3 to i64\0A  %5 = call i32 @llvm.ptx.read.ntid.x()\0A  %6 = zext i32 %5 to i64\0A  %7 = call i32 @llvm.ptx.read.ntid.y()\0A  %8 = zext i32 %7 to i64\0A  %9 = call i32 @llvm.ptx.read.ctaid.x()\0A  %10 = zext i32 %9 to i64\0A  %11 = call i32 @llvm.ptx.read.ctaid.y()\0A  %12 = zext i32 %11 to i64\0A  %13 = call i32 @llvm.ptx.read.tid.x()\0A  %14 = zext i32 %13 to i64\0A  %15 = call i32 @llvm.ptx.read.tid.y()\0A  %16 = zext i32 %15 to i64\0A  br label %ptx.loop_body\0A\0Aptx.exit:                                         ; preds = %polly.stmt.for.body3\0A  ret void\0A\0Aptx.loop_body:                                    ; preds = %ptx.setup\0A  %p_gpu_index_i = mul i64 %12, %2\0A  %17 = add i64 %p_gpu_index_i, %10\0A  %p_gpu_index_j = mul i64 %16, %6\0A  %18 = add i64 %p_gpu_index_j, %14\0A  br label %polly.stmt.for.body3\0A\0Apolly.stmt.for.body3:                             ; preds = %ptx.loop_body\0A  %19 = trunc i64 %17 to i32\0A  %p_mul = shl nsw i32 %19, 7\0A  %20 = trunc i64 %18 to i32\0A  %p_add = add nsw i32 %p_mul, %20\0A  %21 = trunc i64 %17 to i32\0A  %22 = trunc i64 %18 to i32\0A  %p_arrayidx4 = getelementptr inbounds [128 x [128 x i32]]* %0, i32 0, i32 %21, i32 %22\0A  store i32 %p_add, i32* %p_arrayidx4\0A  br label %ptx.exit\0A}\0A\0Adeclare i32 @llvm.ptx.read.nctaid.x() nounwind readnone\0A\0Adeclare i32 @llvm.ptx.read.nctaid.y() nounwind readnone\0A\0Adeclare i32 @llvm.ptx.read.ctaid.x() nounwind readnone\0A\0Adeclare i32 @llvm.ptx.read.ctaid.y() nounwind readnone\0A\0Adeclare i32 @llvm.ptx.read.ntid.x() nounwind readnone\0A\0Adeclare i32 @llvm.ptx.read.ntid.y() nounwind 
readnone\0A\0Adeclare i32 @llvm.ptx.read.tid.x() nounwind readnone\0A\0Adeclare i32 @llvm.ptx.read.tid.y() nounwind readnone\0A\00"
-+
-+@.str = private unnamed_addr constant [3 x i8] c"%s\00", align 1
-+
-+define i32 @gpu_codegen() nounwind {
-+entry:
-+  %0 = call i8* @llvm.codegen(i8* getelementptr inbounds ([1940 x i8]* @llvm_kernel, i32 0, i32 0), i8* null, i8* null)
-+  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([3 x i8]* @.str, i32 0, i32 0), i8* %0)
-+  ret i32 0
-+}
-+
-+define i32 @main() nounwind {
-+entry:
-+  %call = call i32 @gpu_codegen()
-+  ret i32 0
-+}
-+
-+declare i8* @llvm.codegen(i8*, i8*, i8*) nounwind
-+
-+declare i32 @printf(i8*, ...) nounwind
-+
-+; CHECK: gpu_codegen_2E_ptx_subfn
-diff --git a/test/CodeGen/X86/EmbeddedCG/lit.local.cfg b/test/CodeGen/X86/EmbeddedCG/lit.local.cfg
-new file mode 100644
-index 0000000..7180c84
--- /dev/null
-+++ b/test/CodeGen/X86/EmbeddedCG/lit.local.cfg
-@@ -0,0 +1,5 @@
-+config.suffixes = ['.ll', '.c', '.cpp']
-+
-+targets = set(config.root.targets_to_build.split())
-+if not 'NVPTX' in targets:
-+    config.unsupported = True
-- 
-1.7.6.5
-