-
Notifications
You must be signed in to change notification settings - Fork 69
AddingPasses
A compiler works like a stream system: we have a chain of filters, and the program, which we can see as a stream of instructions, is fed to each of these filters sequentially. A pass can analyze the program, or it can transform the program. The divergence analysis that we have is just an analysis: it does not change the program at all. On the other hand, the register allocators do change the program, inserting load and store instructions into the original code. On this page I will show you how to add a new pass to Ocelot. This is a very simple pass that only prints the instructions in the Dataflow Graph that describes a program.

Writing the pass
The first step is, naturally, to write the pass itself. The pass is composed of two files, a .cpp and a .h. The header declares the constructors, destructors and member functions of the classes of your pass, which are all implemented in the .cpp file, in standard C++ fashion. It is important to note that the classes you create have to extend the class KernelPass, as in the example below (the .cpp file first, followed by the .h file).
#include "ocelot/transforms/interface/PrettyPrinterPass.h"
#include "ocelot/analysis/interface/DivergenceAnalysis.h"
#include "iostream"
#include <cassert>
#include <list>
namespace transforms {
PrettyPrinterPass::PrettyPrinterPass()
: KernelPass(Analysis::DivergenceAnalysis, "PrettyPrinterPass") {
}
void PrettyPrinter::runOnKernel( ir::IRKernel& k ) {
Analysis* dfg_structure = getAnalysis(Analysis::DataflowGraphAnalysis);
assert(dfg_structure != 0);
analysis::DataflowGraph& dfg =
*static_cast(dfg_structure);
analysis::DataflowGraph::iterator block = ++dfg.begin();
analysis::DataflowGraph::iterator blockEnd = --dfg.end();
for (; block != blockEnd; block++) {
std::cout << "New Basic Block:\n";
std::_List_iterator i = block->block()->instructions.begin();
std::_List_iterator e = block->block()->instructions.end();
while (i != e) {
ir::PTXInstruction* inst = static_cast(*i);
std::cout <toString() << std::endl;
i++;
}
}
}
}
#ifndef PRETTY_PRINTER_PASS_H_
#define PRETTY_PRINTER_PASS_H_

#include "ocelot/transforms/interface/Pass.h"

namespace transforms {

/*! \brief This pass prints the instructions in the Dataflow Graph
*/
class PrettyPrinterPass: public KernelPass
{
public:
	/*! \brief Construct the pass (defined in the .cpp file) */
	PrettyPrinterPass();
	virtual ~PrettyPrinterPass() {}

	/*! \brief Intentionally empty: the pass does no per-module work */
	virtual void initialize( const ir::Module& m ) {}
	/*! \brief Print the instructions of kernel k (defined in the .cpp file) */
	virtual void runOnKernel( ir::IRKernel& k );
	/*! \brief Intentionally empty: the pass has nothing to clean up */
	virtual void finalize() {}
};

}

#endif /* PRETTY_PRINTER_PASS_H_ */
To add the pass to the Ocelot toolchain, a few small additions have to be made to certain key parts of Ocelot. First, changes have to be made to ocelot/tools/PTXOptimizer.cpp and ocelot/tools/PTXOptimizer.h. In the .cpp file, your header file has to be included, as in the example below:
/*! \file PTXOptimzer.cpp
\date Thursday December 31, 2009
\author Gregory Diamos
\brief The source file for the Ocelot PTX optimizer
*/
#ifndef PTX_OPTIMIZER_CPP_INCLUDED
#define PTX_OPTIMIZER_CPP_INCLUDED
// Ocelot Includes
#include "ocelot/tools/PTXOptimizer.h"
#include "ocelot/transforms/interface/PassManager.h"
#include "ocelot/transforms/interface/RemoveBarrierPass.h"
#include "ocelot/transforms/interface/StructuralTransform.h"
#include "ocelot/transforms/interface/ConvertPredicationToSelectPass.h"
//Include your header here.
Also, you have to add your pass to the if chains in the functions PTXOptimizer::optimize() and parsePassTypes( const std::string& passList ). The first addition makes sure that your pass is created and added to the pass manager when it is selected, and the second makes sure you will be able to run your pass by passing a new, customized shortcut name on the command line.
// Excerpt from the PTXOptimizer::optimize() function.
// Each if tests one PassType bit in `passes`; when it is set, the matching
// pass object is created and added to the pass manager.
if( passes & SplitBasicBlocks )
{
transforms::Pass* pass = new transforms::SplitBasicBlockPass(
basicBlockSize );
manager.addPass( *pass );
}
if( passes & SyncElimination )
{
transforms::Pass* pass = new transforms::SyncEliminationPass;
manager.addPass( *pass );
}
// The pass written in this tutorial.
if( passes & PrettyPrint )
{
transforms::Pass* pass = new transforms::PrettyPrinterPass;
manager.addPass( *pass );
}
// The if for your own pass should go here, in the same format as the ones above.
// Excerpt from the parsePassTypes( const std::string& passList ) function.
// Each branch compares the current pass name with a command-line shortcut
// and, on a match, sets the corresponding PassType bit in `types`.
else if( *pass == "split-blocks" )
{
report( " Matched split-blocks." );
types |= tools::PTXOptimizer::SplitBasicBlocks;
}
else if( *pass == "sync-elimination" )
{
report( " Matched sync-elimination." );
types |= tools::PTXOptimizer::SyncElimination;
}
// The shortcut for the pass written in this tutorial.
else if( *pass == "pretty-print" )
{
report( " Matched pretty-print." );
types |= tools::PTXOptimizer::PrettyPrint;
}
// The branch for your own pass should go here, above the
// else if( !pass->empty() ) below.
// Fallback: any non-empty name that matched nothing above only produces a warning.
else if( !pass->empty() )
{
std::cout << "==Ocelot== Warning: Unknown pass name - '" << *pass
<< "'\n";
}
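Besides that, there is an addition to be made in ocelot/tools/PTXOptimizer.h: in enum PassType, you have to assign a hexadecimal value to an enumerator representing your pass. The values are combined with |= and tested with &, so each pass needs its own bit; use the next unused power of two: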
/*! \brief The possible PTX to PTX passes

	Every enumerator other than InvalidPassType has exactly one bit set.
*/
enum PassType
{
	InvalidPassType      = 0,
	RemoveBarriers       = 1 << 0,
	ReverseIfConversion  = 1 << 1,
	SubkernelFormation   = 1 << 2,
	StructuralTransform  = 1 << 3,
	MIMDThreadScheduling = 1 << 4,
	DeadCodeElimination  = 1 << 5,
	SplitBasicBlocks     = 1 << 6,
	SyncElimination      = 1 << 7,
	PrettyPrint          = 1 << 8
	// Insert the enumerator for your own pass here, using the next free bit
	// (1 << 9, i.e. 0x200), and add a comma after the enumerator above.
};
The last change that needs to be made is in ocelot/api/interface/OcelotRuntime.h, where you should include the header of your pass and add the declaration of the member variable representing your pass:
#include "ocelot/api/interface/OcelotConfiguration.h"
#include "ocelot/trace/interface/MemoryChecker.h"
#include "ocelot/trace/interface/MemoryRaceDetector.h"
#include "ocelot/trace/interface/InteractiveDebugger.h"
#include "ocelot/transforms/interface/StructuralTransform.h"
#include "ocelot/transforms/interface/ConvertPredicationToSelectPass.h"
#include "ocelot/transforms/interface/LinearScanRegisterAllocationPass.h"
#include "ocelot/transforms/interface/MIMDThreadSchedulingPass.h"
#include "ocelot/transforms/interface/SyncEliminationPass.h"
#include "ocelot/transforms/interface/PrettyPrinterPass.h"
namespace ocelot
{
/*! \brief This is an interface for managing state associated with Ocelot */
class OcelotRuntime {
private:
// Members whose types come from the trace namespace.
trace::MemoryChecker _memoryChecker;
trace::MemoryRaceDetector _raceDetector;
trace::InteractiveDebugger _debugger;
// Members whose types come from the transforms namespace; each pass is
// stored by value, one member per pass.
transforms::StructuralTransform _structuralTransform;
transforms::ConvertPredicationToSelectPass _predicationToSelect;
transforms::LinearScanRegisterAllocationPass _linearScanAllocation;
transforms::MIMDThreadSchedulingPass _mimdThreadScheduling;
transforms::SyncEliminationPass _syncElimination;
// The pass written in this tutorial.
transforms::PrettyPrinterPass _prettyPrinterPass;
// Insert the member declaration for your own pass here.
// NOTE(review): presumably records whether the runtime has been set up -
// confirm against the implementation file.
bool _initialized;
public:
//! \brief initializes Ocelot runtime state
OcelotRuntime();
//! \brief initializes the Ocelot runtime object with the
// Ocelot configuration object
//! \param c the Ocelot configuration object
void configure( const api::OcelotConfiguration &c );
};
}
#endif
The last step is to test your pass. To do that, you can run it on the benchmark of your choice, for example compute_20_ORI_babda9cd76164acf. With that benchmark chosen, execute PTXOptimizer -i compute_20_ORI_babda9cd76164acf -p pass-shortcut-here -o out.ptx in the /home/ocelot/tests/ok/sdk3.1/ directory. Below we present a small benchmark example:
.version 2.2
.target sm_20
.file 1 ""
.file 2 "/tmp/tmpxft_000013cc_00000000-6_vectorAdd.cudafe2.gpu"
.file 3 "/usr/lib/gcc/i486-linux-gnu/4.4.3/include/stddef.h"
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
.file 5 "/usr/local/cuda/include/host_defines.h"
.file 6 "/usr/local/cuda/include/builtin_types.h"
.file 7 "/usr/local/cuda/include/device_types.h"
.file 8 "/usr/local/cuda/include/driver_types.h"
.file 9 "/usr/local/cuda/include/surface_types.h"
.file 10 "/usr/local/cuda/include/texture_types.h"
.file 11 "/usr/local/cuda/include/vector_types.h"
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
.file 14 "/usr/include/bits/types.h"
.file 15 "/usr/include/time.h"
.file 16 "/usr/local/cuda/include/texture_fetch_functions.h"
.file 17 "/usr/local/cuda/include/common_functions.h"
.file 18 "/usr/local/cuda/include/math_functions.h"
.file 19 "/usr/local/cuda/include/math_constants.h"
.file 20 "/usr/local/cuda/include/device_functions.h"
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
.file 26 "/usr/local/cuda/include/surface_functions.h"
.file 27 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
.file 28 "vectorAdd.cu"
// Kernel _Z6VecAddPKfS0_Pfi, the mangled name of
// VecAdd(const float*, const float*, float*, int).
// A thread whose global index i is below N stores A[i] + B[i] into C[i].
.entry _Z6VecAddPKfS0_Pfi (
	.param .u32 __cudaparm__Z6VecAddPKfS0_Pfi_A,
	.param .u32 __cudaparm__Z6VecAddPKfS0_Pfi_B,
	.param .u32 __cudaparm__Z6VecAddPKfS0_Pfi_C,
	.param .s32 __cudaparm__Z6VecAddPKfS0_Pfi_N)
{
	// Parameterized register names: %r<15> declares %r0..%r14, and so on.
	.reg .u32 %r<15>;
	.reg .f32 %f<5>;
	.reg .pred %p<3>;
	.loc 28 43 0
$LDWbegin__Z6VecAddPKfS0_Pfi:
	// %r5 = %tid.x + %ctaid.x * %ntid.x
	mov.u32 %r1, %ctaid.x;
	mov.u32 %r2, %ntid.x;
	mul.lo.u32 %r3, %r1, %r2;
	mov.u32 %r4, %tid.x;
	add.u32 %r5, %r4, %r3;
	// Jump straight to the exit when N <= %r5.
	ld.param.s32 %r6, [__cudaparm__Z6VecAddPKfS0_Pfi_N];
	setp.le.s32 %p1, %r6, %r5;
	@%p1 bra $Lt_0_1026;
	.loc 28 47 0
	// %r7 = %r5 * 4, the byte offset of a 4-byte f32 element.
	mul.lo.u32 %r7, %r5, 4;
	// %f1 = value loaded from A + offset
	ld.param.u32 %r8, [__cudaparm__Z6VecAddPKfS0_Pfi_A];
	add.u32 %r9, %r8, %r7;
	ld.global.f32 %f1, [%r9+0];
	// %f2 = value loaded from B + offset
	ld.param.u32 %r10, [__cudaparm__Z6VecAddPKfS0_Pfi_B];
	add.u32 %r11, %r10, %r7;
	ld.global.f32 %f2, [%r11+0];
	add.f32 %f3, %f1, %f2;
	// Store the sum to C + offset.
	ld.param.u32 %r12, [__cudaparm__Z6VecAddPKfS0_Pfi_C];
	add.u32 %r13, %r12, %r7;
	st.global.f32 [%r13+0], %f3;
$Lt_0_1026:
	.loc 28 48 0
	exit;
$LDWend__Z6VecAddPKfS0_Pfi:
} // _Z6VecAddPKfS0_Pfi
Thanks to Alberto de Sá Cavalcanti de Albuquerque for providing this tutorial; the original version may be found here.