|
GTPin
|
The Toggle tool counts the number of bits toggled by the kernel, i.e. destination bits whose value changed 0->1 or 1->0.
To run the Toggle tool in its default configuration, use this command:
Profilers\GTReplay\intel64\gtreplay.exe -t toggle -- path-to-the-directory-containing-the-trace
(Back to the list of all GTReplay Sample Tools)
00001 /*========================== begin_copyright_notice ============================ 00002 Copyright (C) 2021-2022 Intel Corporation 00003 00004 SPDX-License-Identifier: MIT 00005 ============================= end_copyright_notice ===========================*/ 00006 00007 /******************************************************************************************************* 00008 * TOGGLE tool 00009 * 00010 * Count dynamic amount of toggling bits - the ones that changed their values 0->1 and 1->0 00011 * 00012 * NOTE: the tool callbacks might be called from different threads. 00013 */ 00014 #include <stdio.h> 00015 #include <string.h> 00016 #include <vector> 00017 #ifdef TARGET_WINDOWS 00018 #include "intrin.h" 00019 #endif 00020 #ifdef TARGET_LINUX 00021 #include "x86intrin.h" 00022 #endif 00023 00024 #include "gtreplay_assert.h" 00025 #include "gtreplay_client.h" 00026 #include "knob_parser.h" 00027 00028 // Structure definitions 00029 typedef union { 00030 uint8_t byte[32]; 00031 uint16_t word[32]; 00032 uint32_t dword[32]; 00033 uint64_t qword[32]; 00034 int8_t sbyte[32]; 00035 int16_t sword[32]; 00036 int32_t sdword[32]; 00037 int64_t sqword[32]; 00038 float spfloat[32]; 00039 double dpfloat[32]; 00040 } Operand; 00041 00042 typedef struct { 00043 uint32_t dword[8]; 00044 } FullReg32; 00045 00046 typedef struct { 00047 uint32_t dword[16]; 00048 } FullReg64; 00049 00050 typedef union { 00051 FullReg32 reg32[16]; 00052 FullReg64 reg64[16]; 00053 } SendDest; 00054 00055 // Global variables 00056 uint32_t gMaxNumOfHwThreads = 0; 00057 uint32_t gMaxNumOfTiles = 0; 00058 uint32_t gRegWidth = 32; 00059 00060 uint64_t total_icount = 0; 00061 uint64_t total_toggle_bits = 0; 00062 std::vector<std::vector<uint64_t>> icount; 00063 std::vector<std::vector<SendDest>> sendRegsBefore; 00064 std::vector<std::vector<Operand>> dstBefore; 00065 std::vector<std::vector<uint32_t>> execMask; 00066 std::vector<std::vector<uint64_t>> toggledBits; 00067 std::string kernelName; 00068 
00069 void HandleSendBefore(uint32_t tileId, uint32_t tid, GTReplayIns ins, GTReplayState state) 00070 { 00071 uint32_t numOfElements = 0; 00072 uint32_t elementWidth = 0; 00073 00074 // Obtain and save the registers 00075 GTReplay_GetSendDestination(ins, state, (uint8_t*)&sendRegsBefore[tileId][tid], &numOfElements, &elementWidth); 00076 } 00077 00078 void HandleSendAfter(uint32_t tileId, uint32_t tid, GTReplayIns ins, GTReplayState state) 00079 { 00080 SendDest dst; 00081 uint32_t numOfElements = 0; 00082 uint32_t elementWidth = 0; 00083 00084 // Obtain the registers 00085 GTReplay_GetSendDestination(ins, state, (uint8_t*)&dst, &numOfElements, &elementWidth); 00086 00087 if (numOfElements == 0) 00088 { 00089 return; 00090 } 00091 00092 uint32_t count = 0; 00093 00094 if (gRegWidth == 32) 00095 { 00096 // go over all registers 00097 for (uint32_t i = 0; i < numOfElements; i++) 00098 { 00099 FullReg32 afreg = dst.reg32[i], bfreg = sendRegsBefore[tileId][tid].reg32[i]; 00100 00101 // go over all elements 00102 for (uint32_t j = 0; j < 8; j++) 00103 { 00104 // compute the amount of toggled bits 00105 uint32_t tmp = bfreg.dword[j] ^ afreg.dword[j]; 00106 00107 count += _mm_popcnt_u32(tmp); 00108 } 00109 } 00110 } 00111 else 00112 { 00113 // go over all registers 00114 for (uint32_t i = 0; i < numOfElements; i++) 00115 { 00116 FullReg64 afreg = dst.reg64[i], bfreg = sendRegsBefore[tileId][tid].reg64[i]; 00117 00118 // go over all elements 00119 for (uint32_t j = 0; j < 16; j++) 00120 { 00121 // compute the amount of toggled bits 00122 uint32_t tmp = bfreg.dword[j] ^ afreg.dword[j]; 00123 00124 count += _mm_popcnt_u32(tmp); 00125 } 00126 } 00127 } 00128 00129 toggledBits[tileId][tid] += count; 00130 } 00131 00132 /* 00133 * BeforeInsCallback - callback called before instruction execution 00134 * 00135 * @params[in] tid - the ID of the GPU HW thread for which the callback is called 00136 * @params[in] ins - a handle to the current instruction 00137 * @params[in] state - 
a handle to the HW Thread state corresponding to tid 00138 */ 00139 void BeforeInsCallback(uint32_t tileId, uint32_t tid, GTReplayIns ins, GTReplayState state, void*) 00140 { 00141 GTREPLAY_ASSERT(tileId < gMaxNumOfTiles && tid < gMaxNumOfHwThreads); 00142 // Update the instruction counter corresponding to the current HW thread 00143 icount[tileId][tid]++; 00144 00145 // Check whether the instruction has destination 00146 if (!GTReplay_HasDestination(ins)) 00147 { 00148 // If not, there is nothing to do 00149 return; 00150 } 00151 00152 // Check whether the instruction is a SEND instruction 00153 if (GTReplay_IsSend(ins)) 00154 { 00155 // Is yes, handle SEND instruction separately 00156 HandleSendBefore(tileId, tid, ins, state); 00157 return; 00158 } 00159 00160 // Obtain exec mask 00161 execMask[tileId][tid] = GTReplay_DynamicExecMask(ins, state); 00162 00163 uint32_t numOfElements = 0; 00164 uint32_t elementWidth = 0; 00165 00166 // Obtain and save destination before 00167 GTReplay_GetDestination(ins, state, execMask[tileId][tid], (uint8_t*)&dstBefore[tileId][tid], &numOfElements, &elementWidth); 00168 } 00169 00170 /* 00171 * AfterInsCallback - callback called after instruction execution 00172 * 00173 * @params[in] tid - the ID of the GPU HW thread for which the callback is called 00174 * @params[in] ins - a handle to the current instruction 00175 * @params[in] state - a handle to the HW Thread state corresponding to tid 00176 */ 00177 void AfterInsCallback(uint32_t tileId, uint32_t tid, GTReplayIns ins, GTReplayState state, void*) 00178 { 00179 GTREPLAY_ASSERT(tileId < gMaxNumOfTiles&& tid < gMaxNumOfHwThreads); 00180 00181 // Check whether the instruction has destination 00182 if (!GTReplay_HasDestination(ins)) 00183 { 00184 // If not, there is nothing to do 00185 return; 00186 } 00187 00188 // Check whether the instruction is a SEND instruction 00189 if (GTReplay_IsSend(ins)) 00190 { 00191 // Is yes, handle SEND instruction separately 00192 
HandleSendAfter(tileId, tid, ins, state); 00193 return; 00194 } 00195 00196 Operand dstAfter = {}; 00197 uint32_t numOfElements = 0; 00198 uint32_t elementWidth = 0; 00199 00200 // Obtain destination after 00201 GTReplay_GetDestination(ins, state, execMask[tileId][tid], (uint8_t*)&dstAfter, &numOfElements, &elementWidth); 00202 00203 uint32_t count = 0; 00204 00205 // Iterate over all elements 00206 for (uint32_t i = 0; i < numOfElements; i++) 00207 { 00208 // Compute the amount of toggled bits 00209 switch (elementWidth) { 00210 case 1: count += _mm_popcnt_u32(dstBefore[tileId][tid].byte[i] ^ dstAfter.byte[i]); break; 00211 case 2: count += _mm_popcnt_u32(dstBefore[tileId][tid].word[i] ^ dstAfter.word[i]); break; 00212 case 4: count += _mm_popcnt_u32(dstBefore[tileId][tid].dword[i] ^ dstAfter.dword[i]); break; 00213 case 8: 00214 { 00215 uint64_t tmp = dstBefore[tileId][tid].qword[i] ^ dstAfter.qword[i]; 00216 00217 count += _mm_popcnt_u32((uint32_t)(tmp & 0xFFFFFFFF)); 00218 count += _mm_popcnt_u32((uint32_t)(tmp >> 32)); 00219 00220 break; 00221 } 00222 default: break; 00223 } 00224 } 00225 00226 toggledBits[tileId][tid] += count; 00227 } 00228 00229 /* 00230 * OnKernelComplete - callback called upon kernel completion 00231 * 00232 * @params[in] kernel - a handle to the kernel 00233 */ 00234 void OnKernelComplete(GTReplayKernel kernel) 00235 { 00236 total_icount = 0; 00237 total_toggle_bits = 0; 00238 00239 // Accumulate counters from all HW threads 00240 for (uint32_t tileId = 0; tileId < gMaxNumOfTiles; tileId++) 00241 { 00242 for (uint32_t t = 0; t < gMaxNumOfHwThreads; t++) 00243 { 00244 total_icount += icount[tileId][t]; 00245 total_toggle_bits += toggledBits[tileId][t]; 00246 } 00247 } 00248 00249 // Print the results 00250 std::cout << "\n\n=================\n"; 00251 std::cout << "BIT TOGGLING TOOL\n"; 00252 std::cout << "=================\n\n"; 00253 std::cout.imbue(std::locale("")); 00254 std::cout << "Kernel: " << kernelName << "\n\n"; 00255 std::cout 
<< "TOTAL ICOUNT = " << total_icount << "\n\n"; 00256 std::cout << "TOTAL TOGGLED BITS = " << total_toggle_bits << "\n\n"; 00257 } 00258 00259 /* 00260 * OnKernelBuild - callback called before kernel execution 00261 * The purpose of this callback is to traverse the kernel binary and instrument callbacks 00262 * 00263 * @params[in] kernel - a handle to the kernel 00264 */ 00265 void OnKernelBuild(GTReplayKernel kernel) 00266 { 00267 uint32_t gModelId = GTReplay_GetModel(kernel); 00268 00269 gMaxNumOfHwThreads = GTReplay_MaxNumOfHWThreads(gModelId); 00270 00271 gMaxNumOfTiles = GTReplay_MaxNumOfTiles(kernel); 00272 GTREPLAY_ASSERT(gMaxNumOfTiles); 00273 00274 gRegWidth = GTReplay_RegisterWidth(gModelId); 00275 00276 // Traverse all the basic blocks 00277 for (GTReplayBbl bbl = GTReplay_BblHead(kernel); GTReplay_BblValid(bbl); bbl = GTReplay_BblNext(bbl)) 00278 { 00279 // Traverse all the instruction within the basic blocks 00280 for (GTReplayIns ins = GTReplay_InsHead(bbl); GTReplay_InsValid(ins); ins = GTReplay_InsNext(ins)) 00281 { 00282 // Register callback to be called before instruction execution 00283 GTReplay_RegisterCallbackBeforeIns(kernel, ins, BeforeInsCallback, NULL); 00284 // Register callback to be called after instruction execution 00285 GTReplay_RegisterCallbackAfterIns(kernel, ins, AfterInsCallback, NULL); 00286 } 00287 } 00288 00289 // Allocate and initialize buffers 00290 icount.resize(gMaxNumOfTiles); 00291 toggledBits.resize(gMaxNumOfTiles); 00292 execMask.resize(gMaxNumOfTiles); 00293 sendRegsBefore.resize(gMaxNumOfTiles); 00294 dstBefore.resize(gMaxNumOfTiles); 00295 for (uint32_t i = 0; i < gMaxNumOfTiles; i++) 00296 { 00297 icount[i].resize(gMaxNumOfHwThreads, 0); 00298 toggledBits[i].resize(gMaxNumOfHwThreads, 0); 00299 execMask[i].resize(gMaxNumOfHwThreads, 0); 00300 sendRegsBefore[i].resize(gMaxNumOfHwThreads); 00301 dstBefore[i].resize(gMaxNumOfHwThreads); 00302 } 00303 00304 uint32_t kernelNameSize = 0; 00305 
GTReplay_GetKernelName(kernel, &kernelNameSize, nullptr); 00306 00307 char* buf = new char[kernelNameSize + 1](); 00308 GTReplay_GetKernelName(kernel, &kernelNameSize, buf); 00309 00310 kernelName = std::string(buf); 00311 00312 delete[] buf; 00313 } 00314 00315 /* 00316 * GTReplay_Entry - tool entry point 00317 */ 00318 extern "C" 00319 DLLEXP void FASTCALL GTReplay_Entry(int argc, const char *argv[]) 00320 { 00321 // configure GTReplay 00322 ConfigureGTReplay(argc, argv); 00323 00324 // register OnKernelBuild and OnKernelComplete callbacks 00325 GTReplay_RegisterOnKernelBuildCallback(OnKernelBuild); 00326 GTReplay_RegisterOnKernelCompleteCallback(OnKernelComplete); 00327 00328 // Start GTReplay 00329 GTReplay_Start(); 00330 }
(Back to the list of all GTReplay Sample Tools)
Copyright (C) 2013-2025 Intel Corporation
SPDX-License-Identifier: MIT
1.7.4