Skip to content
Jin Wang edited this page Mar 27, 2015 · 1 revision

How to add a new pass onto Ocelot

A compiler works like a stream system: we have a chain of filters, and the program, which we can see as a stream of instructions, is fed to each of these filters sequentially. Each of these filters is a pass. A pass can analyze the program, or it can transform the program. The divergence analysis that we have is just an analysis: it does not change the program at all. On the other hand, the register allocators do change the program, inserting load and store instructions into the original code. This page shows how to add a new pass to Ocelot. The example is a very simple pass that only prints the instructions in the Dataflow Graph that describes a program.

Writing the pass

The first step is, naturally, to write the pass itself. The pass is composed of two files, a .cpp and a .h. The .h file contains the declarations of the constructors, destructors and functions of the classes of your pass, which are all implemented in the .cpp file, in standard C++ fashion. It’s important to note that the classes you create have to extend the class KernelPass, as in the example below (the .cpp file is shown first, followed by the .h file).

#include "ocelot/transforms/interface/PrettyPrinterPass.h"
#include "ocelot/analysis/interface/DivergenceAnalysis.h"

#include "iostream"
#include <cassert>
#include <list>

 namespace transforms {

   PrettyPrinterPass::PrettyPrinterPass()
     : KernelPass(Analysis::DivergenceAnalysis, "PrettyPrinterPass") {
   }

   void PrettyPrinter::runOnKernel( ir::IRKernel& k ) {
     Analysis* dfg_structure = getAnalysis(Analysis::DataflowGraphAnalysis);
     assert(dfg_structure != 0);

     analysis::DataflowGraph& dfg =
       *static_cast(dfg_structure);

     analysis::DataflowGraph::iterator block = ++dfg.begin();
     analysis::DataflowGraph::iterator blockEnd = --dfg.end();
     for (; block != blockEnd; block++) {
       std::cout << "New Basic Block:\n";
       std::_List_iterator i = block->block()->instructions.begin();
       std::_List_iterator e = block->block()->instructions.end();
       while (i != e) {
         ir::PTXInstruction* inst = static_cast(*i);
         std::cout <toString() << std::endl;
         i++;
       }
     }
  }
}

#ifndef PRETTY_PRINTER_PASS_H_
#define PRETTY_PRINTER_PASS_H_

#include "ocelot/transforms/interface/Pass.h"
namespace transforms {
/*! \brief This pass prints the instructions in the Dataflow Graph

  It is a KernelPass: runOnKernel() is the only hook with real work, while
  initialize() and finalize() are intentionally empty.
 */
class PrettyPrinterPass: public KernelPass
{
public:
  /*! \brief Default constructor (defined in the .cpp file) */
  PrettyPrinterPass();
  virtual ~PrettyPrinterPass() {}
  /*! \brief Nothing to set up for a module */
  virtual void initialize( const ir::Module& m ) {}
  /*! \brief Print the instructions of kernel k (defined in the .cpp file) */
  virtual void runOnKernel( ir::IRKernel& k );
  /*! \brief Nothing to tear down */
  virtual void finalize() {}
};

}

#endif /* PRETTY_PRINTER_PASS_H_ */

Adding the pass to the Ocelot toolchain

To add the pass to the Ocelot toolchain, a few small additions have to be made to certain key parts of Ocelot. First, changes have to be made to ocelot/tools/PTXOptimizer.cpp and ocelot/tools/PTXOptimizer.h. In the .cpp file, your header file has to be included, as in the example below:

/*! \file PTXOptimizer.cpp
  \date Thursday December 31, 2009
  \author Gregory Diamos
  \brief The source file for the Ocelot PTX optimizer
*/

#ifndef PTX_OPTIMIZER_CPP_INCLUDED
#define PTX_OPTIMIZER_CPP_INCLUDED

// Ocelot Includes
#include "ocelot/tools/PTXOptimizer.h"
#include "ocelot/transforms/interface/PassManager.h"
#include "ocelot/transforms/interface/RemoveBarrierPass.h"
#include "ocelot/transforms/interface/StructuralTransform.h"
#include "ocelot/transforms/interface/ConvertPredicationToSelectPass.h"
//Include your header here.

Also, you have to add your pass to the if chains in the functions PTXOptimizer::optimize() and parsePassTypes( const std::string& passList ). The first function adds your pass to the pass manager when its flag is set, and the second maps a command-line shortcut of your choosing to that flag, so that you can select your pass from the command line.

   // Excerpt from PTXOptimizer::optimize(). Each if below tests one flag
   // bit of `passes`; when the bit is set, the matching pass object is
   // allocated with new and handed to `manager` by reference.
   // NOTE(review): the excerpt does not show who deletes these objects;
   // confirm the ownership rules of the pass manager.
   if( passes & SplitBasicBlocks )
    {
      transforms::Pass* pass = new transforms::SplitBasicBlockPass(
        basicBlockSize );
      manager.addPass( *pass );
    }

    if( passes & SyncElimination )
    {
      transforms::Pass* pass = new transforms::SyncEliminationPass;
      manager.addPass( *pass );
    }

    // The pass written in this tutorial, enabled by the PrettyPrint flag.
    if( passes & PrettyPrint )
    {
      transforms::Pass* pass = new transforms::PrettyPrinterPass;
      manager.addPass( *pass );
    }
    // Add the if block for your own pass here, following the same pattern.

    // Excerpt from parsePassTypes( const std::string& passList ). Each
    // branch compares the current pass name (*pass) with one shortcut
    // string, reports the match and ORs the matching PassType flag into
    // `types`.
    else if( *pass == "split-blocks" )
    {
      report( "  Matched split-blocks." );
      types |= tools::PTXOptimizer::SplitBasicBlocks;
    }
    else if( *pass == "sync-elimination" )
    {
      report( "  Matched sync-elimination." );
      types |= tools::PTXOptimizer::SyncElimination;
    }
    // Shortcut for the pass written in this tutorial.
    else if( *pass == "pretty-print" )
    {
      report( "  Matched pretty-print." );
      types |= tools::PTXOptimizer::PrettyPrint;
    }
    // Add the branch for your own pass here, above the final
    // else if( !pass->empty() ), which warns about any non-empty name that
    // matched none of the branches.
    else if( !pass->empty() )
    {
      std::cout << "==Ocelot== Warning: Unknown pass name - '" << *pass
        << "'\n";
    }

Besides that, there is an addition to be made in ocelot/tools/PTXOptimizer.h: in enum PassType, you have to add an enumerator representing your pass and assign it a hexadecimal value that is not yet in use. Every value has exactly one bit set, so use the next power of two:

     /*! \brief Bit flags naming the possible PTX to PTX passes

       Every pass owns exactly one bit, so several passes can be combined
       in a single integer with | and tested individually with &.
     */
      enum PassType
      {
        InvalidPassType      = 0,
        RemoveBarriers       = 1 << 0,
        ReverseIfConversion  = 1 << 1,
        SubkernelFormation   = 1 << 2,
        StructuralTransform  = 1 << 3,
        MIMDThreadScheduling = 1 << 4,
        DeadCodeElimination  = 1 << 5,
        SplitBasicBlocks     = 1 << 6,
        SyncElimination      = 1 << 7,
        PrettyPrint          = 1 << 8
        // Add the flag for your own pass here, using the next free bit.
      };

The last change that needs to be made is in ocelot/api/interface/OcelotRuntime.h, where you should include the header of your pass and declare a member object of your pass class:

#include "ocelot/api/interface/OcelotConfiguration.h"
#include "ocelot/trace/interface/MemoryChecker.h"
#include "ocelot/trace/interface/MemoryRaceDetector.h"
#include "ocelot/trace/interface/InteractiveDebugger.h"

#include "ocelot/transforms/interface/StructuralTransform.h"
#include "ocelot/transforms/interface/ConvertPredicationToSelectPass.h"
#include "ocelot/transforms/interface/LinearScanRegisterAllocationPass.h"
#include "ocelot/transforms/interface/MIMDThreadSchedulingPass.h"
#include "ocelot/transforms/interface/SyncEliminationPass.h"
#include "ocelot/transforms/interface/PrettyPrinterPass.h"

namespace ocelot
{
  /*! \brief This is an interface for managing state associated with Ocelot */
  class OcelotRuntime {
  private:
    // Objects from the trace namespace held by the runtime
    trace::MemoryChecker _memoryChecker;
    trace::MemoryRaceDetector _raceDetector;
    trace::InteractiveDebugger _debugger;

    // One instance of each transformation pass held by the runtime
    transforms::StructuralTransform _structuralTransform;
    transforms::ConvertPredicationToSelectPass _predicationToSelect;
    transforms::LinearScanRegisterAllocationPass _linearScanAllocation;
    transforms::MIMDThreadSchedulingPass _mimdThreadScheduling;
    transforms::SyncEliminationPass _syncElimination;
    // The pass written in this tutorial
    transforms::PrettyPrinterPass _prettyPrinterPass;
    // Declare a member object for your own pass here.

    // NOTE(review): presumably records whether the runtime has been set up;
    // the code that reads and writes it is not part of this excerpt.
    bool _initialized;

  public:
    //! \brief initializes Ocelot runtime state
    OcelotRuntime();

    //! \brief initializes the Ocelot runtime object with the
    //         Ocelot configuration object
    void configure( const api::OcelotConfiguration &c );

  };
}

#endif

Testing the pass

The last step is to test your pass. To do that, you can run it on the benchmark of your choice, for example, compute_20_ORI_babda9cd76164acf. With that benchmark chosen, execute PTXOptimizer -i compute_20_ORI_babda9cd76164acf -p pass-shortcut-here -o out.ptx in the /home/ocelot/tests/ok/sdk3.1/ directory. Below we present a small benchmark example:

.version 2.2
   .target sm_20
   .file 1 ""
   .file 2 "/tmp/tmpxft_000013cc_00000000-6_vectorAdd.cudafe2.gpu"
   .file 3 "/usr/lib/gcc/i486-linux-gnu/4.4.3/include/stddef.h"
   .file 4 "/usr/local/cuda/include/crt/device_runtime.h"
   .file 5 "/usr/local/cuda/include/host_defines.h"
   .file 6 "/usr/local/cuda/include/builtin_types.h"
   .file 7 "/usr/local/cuda/include/device_types.h"
   .file 8 "/usr/local/cuda/include/driver_types.h"
   .file 9 "/usr/local/cuda/include/surface_types.h"
   .file 10  "/usr/local/cuda/include/texture_types.h"
   .file 11  "/usr/local/cuda/include/vector_types.h"
   .file 12  "/usr/local/cuda/include/device_launch_parameters.h"
   .file 13  "/usr/local/cuda/include/crt/storage_class.h"
   .file 14  "/usr/include/bits/types.h"
   .file 15  "/usr/include/time.h"
   .file 16  "/usr/local/cuda/include/texture_fetch_functions.h"
   .file 17  "/usr/local/cuda/include/common_functions.h"
   .file 18  "/usr/local/cuda/include/math_functions.h"
   .file 19  "/usr/local/cuda/include/math_constants.h"
   .file 20  "/usr/local/cuda/include/device_functions.h"
   .file 21  "/usr/local/cuda/include/sm_11_atomic_functions.h"
   .file 22  "/usr/local/cuda/include/sm_12_atomic_functions.h"
   .file 23  "/usr/local/cuda/include/sm_13_double_functions.h"
   .file 24  "/usr/local/cuda/include/sm_20_atomic_functions.h"
   .file 25  "/usr/local/cuda/include/sm_20_intrinsics.h"
   .file 26  "/usr/local/cuda/include/surface_functions.h"
   .file 27  "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
   .file 28  "vectorAdd.cu"
   // Vector addition kernel: C[i] = A[i] + B[i] for every index i < N.
   // A, B and C are passed as 32-bit addresses, N as a signed 32-bit count.
   .entry _Z6VecAddPKfS0_Pfi (
     .param .u32 __cudaparm__Z6VecAddPKfS0_Pfi_A,
     .param .u32 __cudaparm__Z6VecAddPKfS0_Pfi_B,
     .param .u32 __cudaparm__Z6VecAddPKfS0_Pfi_C,
     .param .s32 __cudaparm__Z6VecAddPKfS0_Pfi_N)
   {
   .reg .u32 %r<15>;
   .reg .f32 %f<5>;
   .reg .pred %p<3>;
   .loc  28  43  0
 $LDWbegin__Z6VecAddPKfS0_Pfi:
   // %r5 = ctaid.x * ntid.x + tid.x, the index handled by this thread
   mov.u32   %r1, %ctaid.x;
   mov.u32   %r2, %ntid.x;
   mul.lo.u32  %r3, %r1, %r2;
   mov.u32   %r4, %tid.x;
   add.u32   %r5, %r4, %r3;
   // Skip the addition when N <= index
   ld.param.s32  %r6, [__cudaparm__Z6VecAddPKfS0_Pfi_N];
   setp.le.s32   %p1, %r6, %r5;
   @%p1 bra  $Lt_0_1026;
   .loc  28  47  0
   // %r7 = index * 4, the byte offset of a 32-bit float element
   mul.lo.u32  %r7, %r5, 4;
   ld.param.u32  %r8, [__cudaparm__Z6VecAddPKfS0_Pfi_A];
   add.u32   %r9, %r8, %r7;
   ld.global.f32   %f1, [%r9+0];
   ld.param.u32  %r10, [__cudaparm__Z6VecAddPKfS0_Pfi_B];
   add.u32   %r11, %r10, %r7;
   ld.global.f32   %f2, [%r11+0];
   add.f32   %f3, %f1, %f2;
   ld.param.u32  %r12, [__cudaparm__Z6VecAddPKfS0_Pfi_C];
   add.u32   %r13, %r12, %r7;
   st.global.f32   [%r13+0], %f3;
 $Lt_0_1026:
   .loc  28  48  0
   exit;
 $LDWend__Z6VecAddPKfS0_Pfi:
   } // _Z6VecAddPKfS0_Pfi

Reference

Thanks to Alberto de Sá Cavalcanti de Albuquerque for providing this tutorial; the original version may be found here