|
GTPin
|
The Funtime tool counts the cycles it takes for each kernel to execute from beginning to end.
To run the Funtime tool in its default configuration, use this command:
Profilers/Bin/gtpin -t funtime -- app
When you run the in-house GTPin Funtime tool in its default configuration, the directory GTPIN_PROFILE_FUNTIME0 is generated. GTPin saves the profiling results in the file GTPIN_PROFILE_FUNTIME0\Session_Final\funtime.txt. The results are presented in the following format:
### Kernel/Shader execution-time profile generated by GTPin ###
Legend:
NA - kernel was not instrumented.
Name HashID SIMD Type Freq. Total-Cycle Avg-Cycles Skipped Platform Execution descriptor
L3_SLM_8x8_8x16 f54af91315561f54 8 CS 512 164965992 322199 0 OpenCL 0 0
L3_SLM_8x8_8x16 f54af91315561f54 8 CS 512 164442942 321177 0 OpenCL 0 1
L3_SLM_8x8_8x16 f54af91315561f54 8 CS 512 165458952 323162 0 OpenCL 0 2
L3_SLM_8x8_8x16 f54af91315561f54 8 CS 512 168150199 328418 0 OpenCL 0 3
L3_SLM_8x8_8x16 f54af91315561f54 8 CS 512 165136147 322531 0 OpenCL 0 4
L3_SLM_8x8_8x16 f54af91315561f54 8 CS 512 167331603 326819 0 OpenCL 0 5
Each line represents a single run (dispatch to the HW device) of a specific kernel, where the fields have the following meaning:
Freq. is the total number of executions; Total-Cycle is the total number of cycles; Skipped is the total number of skipped executions; Avg-Cycles is Total-Cycle/Freq. If the name of the kernel is not known to GTPin, GTPin creates an artificial name in the format CS_asmf54af91315561f54_simd8, where the prefix indicates the kernel type, the suffix indicates the SIMD width for which the kernel was compiled, and the 16-digit hexadecimal number is the hash ID of the IR representation of the kernel.
(Back to the list of all GTPin Sample Tools)
00001 /*========================== begin_copyright_notice ============================ 00002 Copyright (C) 2018-2022 Intel Corporation 00003 00004 SPDX-License-Identifier: MIT 00005 ============================= end_copyright_notice ===========================*/ 00006 00007 /*! 00008 * @file Funtime tool definitions 00009 */ 00010 00011 #ifndef FUNTIME_H_ 00012 #define FUNTIME_H_ 00013 00014 #include <list> 00015 #include <map> 00016 #include <set> 00017 #include <string> 00018 00019 #include "gtpin_api.h" 00020 #include "gtpin_tool_utils.h" 00021 00022 using namespace gtpin; 00023 00024 /* ============================================================================================= */ 00025 // Struct FuntimeRecord 00026 /* ============================================================================================= */ 00027 /*! 00028 * Layout of records collected in profile buffer by the funtime tool 00029 */ 00030 struct FuntimeRecord 00031 { 00032 uint64_t cycles; ///< Total number of cycles 00033 uint32_t freq; ///< Total number of executions 00034 uint32_t skipped; ///< Total number of skipped executions 00035 }; 00036 00037 /* ============================================================================================= */ 00038 // Class FuntimeDispatchProfile 00039 /* ============================================================================================= */ 00040 /*! 
00041 * Profiling data collected during a single kernel dispatch 00042 */ 00043 struct FuntimeDispatchProfile 00044 { 00045 explicit FuntimeDispatchProfile(const IGtKernelDispatch& kernelDispatch, uint32_t tile = 0); 00046 void Accumulate(const FuntimeRecord& record); 00047 00048 GtKernelExecDesc kernelExecDesc; ///< Kernel execution descriptor 00049 uint32_t tileId; ///< Identifier of the subdevice (tile) assigned to this kernel dispatch 00050 uint64_t cycles; ///< Total number of cycles 00051 uint64_t freq; ///< Total number of executions 00052 uint64_t skipped; ///< Total number of skipped executions 00053 }; 00054 00055 /* ============================================================================================= */ 00056 // Class FuntimeKernelProfile 00057 /* ============================================================================================= */ 00058 /*! 00059 * Aggregated profile of all instrumented kernel dispatches 00060 */ 00061 class FuntimeKernelProfile 00062 { 00063 public: 00064 FuntimeKernelProfile(const IGtKernel& kernel, const GtProfileArray& profileArray); 00065 00066 /// Add new dispatched kernel instance, and return reference to its (empty) profile 00067 FuntimeDispatchProfile& AddKernelDispatch(const IGtKernelDispatch& kernelDispatch, uint32_t tile = 0); 00068 00069 std::string ToString() const; ///< @return Text representation of the profile data 00070 const GtProfileArray& GetProfileArray() const { return _profileArray; }///< @return Profile buffer accessor 00071 00072 private: 00073 std::string _name; ///< Kernel's name 00074 GtKernelType _type; ///< Kernel's type 00075 GtGpuPlatform _platform; ///< Kernel's platform 00076 uint64_t _hashId; ///< Kernel's hash identifier 00077 GtSimdWidth _simd; ///< Kernel's SIMD width 00078 GtProfileArray _profileArray; ///< Profile buffer accessor 00079 std::list<FuntimeDispatchProfile> _dispatchProfiles; ///< Profiles per kernel dispatch 00080 }; 00081 00082 /* 
============================================================================================= */ 00083 // Class Funtime 00084 /* ============================================================================================= */ 00085 /*! 00086 * Implementation of the IGtTool interface for the funtime tool 00087 */ 00088 class Funtime : public GtTool 00089 { 00090 public: 00091 /// Implementation of the IGtTool interface 00092 const char* Name() const { return "funtime"; } 00093 00094 void OnKernelBuild(IGtKernelInstrument& instrumentor); 00095 void OnKernelRun(IGtKernelDispatch& dispatcher); 00096 void OnKernelComplete(IGtKernelDispatch& dispatcher); 00097 00098 public: 00099 static void OnFini(); ///< Callback function registered with atexit() 00100 std::string ToString() const; ///< @return Text representation of the profile data 00101 00102 static Funtime* Instance(); ///< @return Single instance of this class 00103 00104 private: 00105 Funtime() = default; 00106 Funtime(const Funtime&) = delete; 00107 Funtime& operator = (const Funtime&) = delete; 00108 ~Funtime() = default; 00109 00110 /// Generate code at entry/exits of the kernel 00111 void GeneratePreCode(GtGenProcedure& proc, const IGtGenCoder& coder); 00112 void GeneratePostCode(GtGenProcedure& proc, const IGtGenCoder& coder, const GtProfileArray& profileArray); 00113 00114 /// @return true/false - use 64-bit/32-bit integer for the cycle counter 00115 static bool Use64BitCounters(const IGtGenCoder& coder); 00116 00117 private: 00118 std::map<GtKernelId, FuntimeKernelProfile> _kernels; ///< Collection of kernel profiles 00119 00120 GtReg _addrReg; ///< Virtual register that holds address within profile buffer 00121 GtReg _dataReg; ///< Virtual register that holds data to be read from/written to profile buffer 00122 GtReg _timeReg; ///< Virtual timer register 00123 GtReg _tmpReg32; ///< Virtual 32-bit scratch register 00124 }; 00125 00126 #endif
00001 /*========================== begin_copyright_notice ============================ 00002 Copyright (C) 2015-2025 Intel Corporation 00003 00004 SPDX-License-Identifier: MIT 00005 ============================= end_copyright_notice ===========================*/ 00006 00007 /*! 00008 * @file Implementation of the Funtime tool 00009 */ 00010 00011 #include <fstream> 00012 #include <sstream> 00013 #include <iomanip> 00014 #include <algorithm> 00015 00016 #include "funtime.h" 00017 00018 using namespace gtpin; 00019 using namespace std; 00020 00021 /* ============================================================================================= */ 00022 // Configuration 00023 /* ============================================================================================= */ 00024 Knob<int> knobNumThreadBuckets("num_thread_buckets", 32, "Number of thread buckets. Default - 32, zero - maximum thread buckets"); 00025 Knob<bool> knobPerTileProfiling("per_tile_profiling", false, "Enable per-tile (subdevice) profiling"); 00026 Knob<bool> knobSkipZeroResults("skip_zero_results", false, "Skip zero results in the Funtime output"); 00027 00028 /* ============================================================================================= */ 00029 // Funtime implementation 00030 /* ============================================================================================= */ 00031 Funtime* Funtime::Instance() 00032 { 00033 static Funtime instance; 00034 return &instance; 00035 } 00036 00037 void Funtime::OnKernelBuild(IGtKernelInstrument& instrumentor) 00038 { 00039 const IGtKernel& kernel = instrumentor.Kernel(); 00040 const IGtCfg& cfg = instrumentor.Cfg(); 00041 const IGtGenCoder& coder = instrumentor.Coder(); 00042 const IGtGenArch& genArch = GTPin_GetCore()->GenArch(); 00043 const IGtGenModel& genModel = kernel.GenModel(); 00044 IGtProfileBufferAllocator& allocator = instrumentor.ProfileBufferAllocator(); 00045 IGtVregFactory& vregs = coder.VregFactory(); 00046 bool 
is64BitCounter = Use64BitCounters(coder); 00047 00048 // Allocate the profile buffer. It will hold single FuntimeRecord per each thread bucket 00049 uint32_t numThreadBuckets = (knobNumThreadBuckets == 0) ? genModel.MaxThreadBuckets() : knobNumThreadBuckets; 00050 uint32_t numTiles = (knobPerTileProfiling && coder.IsTileIdSupported()) ? genArch.MaxTiles(kernel.GpuPlatform()) : 1; 00051 GtProfileArray profileArray(sizeof(FuntimeRecord), numTiles, numThreadBuckets); 00052 profileArray.Allocate(allocator); 00053 00054 // Initialize virtual registers 00055 _timeReg = vregs.Make(VREG_TYPE_DWORD); 00056 _tmpReg32 = vregs.MakeScratch(); 00057 _addrReg = vregs.MakeMsgAddrScratch(); 00058 _dataReg = vregs.MakeMsgDataScratch(is64BitCounter? VREG_TYPE_QWORD : VREG_TYPE_DWORD); 00059 00060 // Generate code that starts/stops timer at entry/exit of the kernel 00061 GtGenProcedure preCode; GeneratePreCode(preCode, coder); 00062 GtGenProcedure postCode; GeneratePostCode(postCode, coder, profileArray); 00063 00064 // Instrument kernel entries 00065 instrumentor.InstrumentEntries(preCode); 00066 00067 // Instrument kernel exits 00068 for (auto bblPtr : cfg.ExitBbls()) 00069 { 00070 const IGtIns& ins = bblPtr->LastIns(); GTPIN_ASSERT(ins.IsEot()); 00071 GtGenProcedure fakeConsumers; 00072 coder.GenerateFakeSrcConsumers(fakeConsumers, ins); 00073 instrumentor.InstrumentInstruction(ins, GtIpoint::Before(), fakeConsumers); 00074 instrumentor.InstrumentInstruction(ins, GtIpoint::Before(), postCode); 00075 } 00076 00077 _kernels.emplace(kernel.Id(), FuntimeKernelProfile(kernel, profileArray)); 00078 } 00079 00080 void Funtime::OnKernelRun(IGtKernelDispatch& dispatcher) 00081 { 00082 bool isProfileEnabled = false; 00083 00084 const IGtKernel& kernel = dispatcher.Kernel(); 00085 GtKernelExecDesc execDesc; dispatcher.GetExecDescriptor(execDesc); 00086 if (kernel.IsInstrumented() && IsKernelExecProfileEnabled(execDesc, kernel.GpuPlatform(), kernel.Name().Get())) 00087 { 00088 auto it = 
_kernels.find(kernel.Id()); 00089 00090 if (it != _kernels.end()) 00091 { 00092 IGtProfileBuffer* buffer = dispatcher.CreateProfileBuffer(); GTPIN_ASSERT(buffer); 00093 FuntimeKernelProfile& kernelProfile = it->second; 00094 const GtProfileArray& profileArray = kernelProfile.GetProfileArray(); 00095 if (profileArray.Initialize(*buffer)) 00096 { 00097 isProfileEnabled = true; 00098 } 00099 else 00100 { 00101 GTPIN_ERROR_MSG(string("FUNTIME : ") + string(kernel.Name()) + " : Failed to write into memory buffer"); 00102 } 00103 } 00104 } 00105 dispatcher.SetProfilingMode(isProfileEnabled); 00106 } 00107 00108 void Funtime::OnKernelComplete(IGtKernelDispatch& dispatcher) 00109 { 00110 const IGtKernel& kernel = dispatcher.Kernel(); 00111 GtKernelExecDesc execDesc; dispatcher.GetExecDescriptor(execDesc); 00112 bool isProfilingEnabled = dispatcher.IsProfilingEnabled(); 00113 if (!isProfilingEnabled || !IsKernelExecProfileEnabled(execDesc, kernel.GpuPlatform(), kernel.Name().Get())) 00114 { 00115 return; // Do nothing with unprofiled kernel dispatches 00116 } 00117 00118 auto it = _kernels.find(kernel.Id()); 00119 00120 if (it != _kernels.end()) 00121 { 00122 const IGtProfileBuffer* buffer = dispatcher.GetProfileBuffer(); GTPIN_ASSERT(buffer); 00123 FuntimeKernelProfile& kernelProfile = it->second; 00124 const GtProfileArray& profileArray = kernelProfile.GetProfileArray(); 00125 00126 uint32_t numTiles = profileArray.NumRecords(); // There is a single record for each tile in each thread bucket 00127 for (uint32_t tileId = 0; tileId < numTiles; tileId++) 00128 { 00129 FuntimeDispatchProfile& dispatchProfile = kernelProfile.AddKernelDispatch(dispatcher, tileId); 00130 00131 for (uint32_t threadBucket = 0; threadBucket < profileArray.NumThreadBuckets(); ++threadBucket) 00132 { 00133 FuntimeRecord record; 00134 if (!profileArray.Read(*buffer, &record, tileId, 1, threadBucket)) 00135 { 00136 GTPIN_ERROR_MSG(string("FUNTIME : ") + string(kernel.Name()) + " : Failed to read from 
memory buffer"); 00137 } 00138 else 00139 { 00140 dispatchProfile.Accumulate(record); 00141 } 00142 } 00143 } 00144 } 00145 } 00146 00147 void Funtime::OnFini() 00148 { 00149 Funtime& me = *Instance(); 00150 string profileDir = GTPin_GetCore()->ProfileDir(); 00151 string filePath = JoinPath(profileDir, "funtime.txt"); 00152 00153 ofstream fs(filePath); 00154 if (fs.is_open()) 00155 { 00156 fs << me.ToString(); 00157 fs.close(); 00158 } 00159 else 00160 { 00161 GTPIN_WARNING("FUNTIME : could not create file: " + filePath); 00162 } 00163 } 00164 00165 string Funtime::ToString() const 00166 { 00167 ostringstream ostr; 00168 ostr << "### Kernel/Shader execution-time profile generated by GTPin ###" << endl << endl; 00169 ostr << "Legend:" << endl; 00170 ostr << "NA - kernel was not instrumented." << endl << endl; 00171 ostr << setw(30) << "Name" << setw(20) << "HashID" << setw(10) << "SIMD" << setw(10) << "Type"; 00172 ostr << setw(15) << "Freq." << setw(15) << "Total-Cycle" << setw(15) << "Avg-Cycles" << setw(15) << "Skipped"; 00173 ostr << setw(20) << "Platform"; 00174 if (knobPerTileProfiling) 00175 { 00176 ostr << setw(10) << "Tile"; 00177 } 00178 ostr << " " << setw(35) << "Execution descriptor"; 00179 ostr << endl; 00180 for (const auto& kernelEntry : _kernels) 00181 { 00182 ostr << kernelEntry.second.ToString(); 00183 } 00184 return ostr.str(); 00185 } 00186 00187 bool Funtime::Use64BitCounters(const IGtGenCoder& coder) 00188 { 00189 return coder.InstructionFactory().CanAccessAtomically(GED_DATA_TYPE_uq); 00190 } 00191 00192 void Funtime::GeneratePreCode(GtGenProcedure& proc, const IGtGenCoder& coder) 00193 { 00194 coder.StartTimer(proc, _timeReg); 00195 if (!proc.empty()) { proc.front()->AppendAnnotation(__func__); } 00196 } 00197 00198 void Funtime::GeneratePostCode(GtGenProcedure& proc, const IGtGenCoder& coder, const GtProfileArray& profileArray) 00199 { 00200 IGtInsFactory& insF = coder.InstructionFactory(); 00201 bool is64BitCounter = 
Use64BitCounters(coder); 00202 GtReg flagReg = FlagReg(0); 00203 GtReg dataRegL = {_dataReg, sizeof(uint32_t), 0}; // Low 32-bits of the data payload register 00204 00205 // Generate code that computes elapsed time, and sets flagReg in case of timer overflow 00206 coder.StopTimerExt(proc, _timeReg); 00207 00208 // _addrReg = address of the current thread's FuntimeRecord in the profile buffer 00209 if (profileArray.NumRecords() > 1) 00210 { 00211 // The record number = tile ID 00212 GtReg& offsetReg = _tmpReg32; 00213 coder.LoadTileId(proc, offsetReg); 00214 proc += insF.MakeMul(offsetReg, offsetReg, sizeof(FuntimeRecord)); 00215 profileArray.ComputeAddress(coder, proc, _addrReg, offsetReg); 00216 } 00217 else 00218 { 00219 profileArray.ComputeAddress(coder, proc, _addrReg); 00220 } 00221 00222 int32_t base = 0; 00223 int32_t offset; 00224 00225 // cycles += _timeReg 00226 offset = offsetof(FuntimeRecord, cycles) - base; 00227 profileArray.ComputeRelAddress(coder, proc, _addrReg, _addrReg, offset); base += offset; 00228 proc += insF.MakeMov(dataRegL, _timeReg); // Move timer value to the low 32-bits of the data register 00229 if (is64BitCounter) 00230 { 00231 // Clear the high 32-bits of the data payload register 00232 GtReg dataRegH = {_dataReg, sizeof(uint32_t), 1}; 00233 proc += insF.MakeMov(dataRegH, 0); 00234 } 00235 proc += insF.MakeAtomicAdd(NullReg(), _addrReg, _dataReg, (is64BitCounter? 
GED_DATA_TYPE_uq : GED_DATA_TYPE_ud)); 00236 00237 // freq++ 00238 offset = offsetof(FuntimeRecord, freq) - base; 00239 profileArray.ComputeRelAddress(coder, proc, _addrReg, _addrReg, offset); base += offset; 00240 proc += insF.MakeAtomicInc(NullReg(), _addrReg, GED_DATA_TYPE_ud); 00241 00242 // if (flagReg) skipped++ 00243 offset = offsetof(FuntimeRecord, skipped) - base; 00244 profileArray.ComputeRelAddress(coder, proc, _addrReg, _addrReg, offset); base += offset; 00245 proc += insF.MakeAtomicInc(NullReg(), _addrReg, GED_DATA_TYPE_ud).SetPredicate(flagReg); 00246 00247 if (!proc.empty()) { proc.front()->AppendAnnotation(__func__); } 00248 } 00249 00250 /* ============================================================================================= */ 00251 // FuntimeDispatchProfile implementation 00252 /* ============================================================================================= */ 00253 FuntimeDispatchProfile::FuntimeDispatchProfile(const IGtKernelDispatch& kernelDispatch, uint32_t tile) : 00254 tileId(tile), cycles(0), freq(0), skipped(0) 00255 { 00256 kernelDispatch.GetExecDescriptor(kernelExecDesc); 00257 } 00258 00259 void FuntimeDispatchProfile::Accumulate(const FuntimeRecord& record) 00260 { 00261 cycles += record.cycles; 00262 freq += record.freq; 00263 skipped += record.skipped; 00264 } 00265 00266 /* ============================================================================================= */ 00267 // FuntimeKernelProfile implementation 00268 /* ============================================================================================= */ 00269 FuntimeKernelProfile::FuntimeKernelProfile(const IGtKernel& kernel, const GtProfileArray& profileArray) : 00270 _name(GlueString(kernel.Name())), _type(kernel.Type()), _platform(kernel.GpuPlatform()), _hashId(kernel.HashId()), 00271 _simd(kernel.SimdWidth()), _profileArray(profileArray) {} 00272 00273 FuntimeDispatchProfile& FuntimeKernelProfile::AddKernelDispatch(const IGtKernelDispatch& 
kernelDispatch, const uint32_t tile) 00274 { 00275 _dispatchProfiles.emplace_back(kernelDispatch, tile); 00276 return _dispatchProfiles.back(); 00277 } 00278 00279 string FuntimeKernelProfile::ToString() const 00280 { 00281 ostringstream ostr; 00282 if (!_dispatchProfiles.empty()) 00283 { 00284 for (const auto& dp: _dispatchProfiles) 00285 { 00286 if (knobSkipZeroResults && (dp.freq == 0)) 00287 { 00288 continue; // Skip zero results if the knob is set 00289 } 00290 00291 uint64_t avgCycles = (dp.freq ? (dp.cycles / dp.freq) : 0); 00292 00293 ostr << setw(30) << _name << setw(20) << hex << _hashId << dec << setw(10) << _simd << setw(10) << _type.ToString(); 00294 ostr << setw(15) << dp.freq << setw(15) << dp.cycles << setw(15) << avgCycles << setw(15) << dp.skipped; 00295 ostr << setw(20) << _platform.ToString(); 00296 if (knobPerTileProfiling) 00297 { 00298 ostr << setw(10) << dp.tileId; 00299 } 00300 ostr << " " << setw(35) << dp.kernelExecDesc.ToString(_platform, ExecDescAlignedFormat()); 00301 ostr << endl; 00302 } 00303 } 00304 else 00305 { 00306 ostr << setw(30) << _name << setw(20) << hex << _hashId << dec << setw(10) << _simd << setw(10) << _type.ToString(); 00307 ostr << setw(15) << "NA" << setw(15) << "NA" << setw(15) << "NA" << setw(15) << "NA" << setw(20) << "NA"; 00308 if (knobPerTileProfiling) 00309 { 00310 ostr << setw(10) << "NA"; 00311 } 00312 ostr << " " << setw(35) << "NA"; 00313 } 00314 ostr << endl; 00315 return ostr.str(); 00316 } 00317 00318 /* ============================================================================================= */ 00319 // GTPin_Entry 00320 /* ============================================================================================= */ 00321 EXPORT_C_FUNC void GTPin_Entry(int argc, const char *argv[]) 00322 { 00323 ConfigureGTPin(argc, argv); 00324 Funtime::Instance()->Register(); 00325 atexit(Funtime::OnFini); 00326 }
(Back to the list of all GTPin Sample Tools)
Copyright (C) 2013-2025 Intel Corporation
SPDX-License-Identifier: MIT
1.7.4