|
GTPin
|
The Latency tool counts the cycles it takes for each basic block (BBL) of each kernel to execute from beginning to end.
The latency tool supports two modes of operation:
--mode 0 - Default mode, which measures the cycles (latency) at basic block granularity.
--mode 1 - Measures the cycles (latency) of memory read instructions.
To run the Latency tool (in its default configuration), use the following command:
Profilers/Bin/gtpin -t latency [--mode 0] -- app
To run the Latency tool in mode 1, use the following command:
Profilers/Bin/gtpin -t latency --mode 1 -- app
NOTE: GTPin divides the kernel binary code into basic blocks (BBLs). Each BBL is a sequence of instructions that has a single entry point and a single exit point. Usually, the starting points of basic blocks created by GTPin are targets of control flow instructions. A control flow instruction must always be the last instruction of a BBL. Therefore, if a BBL starts with a control flow instruction, it is a single-instruction BBL. In addition, EOT instructions form single-instruction basic blocks. In mode 0, the Latency tool measures only basic blocks that have at least one non-control-flow instruction.
When you run the in-house GTPin Latency tool in its default configuration, GTPin generates the directory: GTPIN_PROFILE_LATENCY0. GTPin saves the profiling results in the file: GTPIN_PROFILE_LATENCY0\Session_Final\latency.out. The profiling results are presented in the following format:
### Kernel/Shader execution-time profile generated by GTPin ###
Legend:
NA - kernel was not instrumented.
Name HashID SIMD Type Ins. ID Freq. Total-latency(%) Total-latency Avg-Cycles Platform Execution descriptor
BitonicSort f641279bbb4bc39f 32 CS 0 262144 0.03 112702896 429.00 OpenCL 0 0
BitonicSort f641279bbb4bc39f 32 CS 14 0 0.00 0 0.00 OpenCL 0 0
BitonicSort f641279bbb4bc39f 32 CS 17 0 0.00 0 0.00 OpenCL 0 0
BitonicSort f641279bbb4bc39f 32 CS 96 0 0.00 0 0.00 OpenCL 0 0
BitonicSort f641279bbb4bc39f 32 CS 101 0 0.00 0 0.00 OpenCL 0 0
BitonicSort f641279bbb4bc39f 32 CS 107 0 0.00 0 0.00 OpenCL 0 0
BitonicSort f641279bbb4bc39f 32 CS 131 0 0.00 0 0.00 OpenCL 0 0
BitonicSort f641279bbb4bc39f 32 CS 150 0 0.00 0 0.00 OpenCL 0 0
BitonicSort f641279bbb4bc39f 32 CS 170 262144 0.01 58853503 224.00 OpenCL 0 0
BitonicSort f641279bbb4bc39f 32 CS 183 262144 0.16 718844611 2742.00 OpenCL 0 0
BitonicSort f641279bbb4bc39f 32 CS 192 0 0.00 0 0.00 OpenCL 0 0
BitonicSort f641279bbb4bc39f 32 CS 200 262144 0.01 37169385 141.00 OpenCL 0 0
BitonicSort f641279bbb4bc39f 32 CS 209 262144 0.01 30225183 115.00 OpenCL 0 0
BitonicSort f641279bbb4bc39f 32 CS 228 262144 0.21 918534816 3503.00 OpenCL 0 0
BitonicSort f641279bbb4bc39f 32 CS 247 262144 0.00 8497472 32.00 OpenCL 0 0
Each line corresponds to a single BBL (mode 0) or to a single memory read instruction (mode 1) in a single run (dispatch to a HW device) of a specific kernel. The fields have the following meaning:
If the name of the kernel is not known to GTPin, then GTPin creates an artificial name in the format: CS_asmf54af91315561f54_simd8, where the prefix indicates the kernel type; the suffix indicates the SIMD width to which this kernel was compiled; and the 16-digit number represents the hash ID of the IR representation of this kernel.
To see exactly which instruction or basic block a line refers to, look at the assembly dump of the corresponding kernel, which is saved in the folder: GTPIN_PROFILE_LATENCY0\ASM. For example:
// kernel name: BitonicSort // BBL0 [ 0] (W) mov (8|M0) r100.0<1>:ud r0.0<1;1,0>:ud [ 1] (W) or (1|M0) cr0.0<1>:ud cr0.0<0;1,0>:ud 0x4C0:uw {Switch} [ 2] (W) mul (1|M0) r8.0<1>:d r9.0<0;1,0>:d r100.1<0;1,0>:d {Compacted} [ 3] (W) cmp (16|M0) (eq)f1.0 null<1>:d r8.2<0;1,0>:d 0:w [ 4] (W) cmp (16|M16) (eq)f1.0 null<1>:d r8.2<0;1,0>:d 0:w [ 5] add (8|M0) r3.0<1>:q r1.0<8;8,1>:uw r8.0<0;1,0>:ud [ 6] add (8|M8) r5.0<1>:q r1.8<8;8,1>:uw r8.0<0;1,0>:ud [ 7] add (8|M16) r11.0<1>:q r2.0<8;8,1>:uw r8.0<0;1,0>:ud [ 8] add (8|M24) r9.0<1>:q r2.8<8;8,1>:uw r8.0<0;1,0>:ud [ 9] add (8|M0) r60.0<1>:q r3.0<4;4,1>:q r7.0<0;1,0>:ud [ 10] add (8|M8) r58.0<1>:q r5.0<4;4,1>:q r7.0<0;1,0>:ud [ 11] add (8|M16) r4.0<1>:q r11.0<4;4,1>:q r7.0<0;1,0>:ud [ 12] add (8|M24) r2.0<1>:q r9.0<4;4,1>:q r7.0<0;1,0>:ud [ 13] (W&f1.0) jmpi 2296 // BBL1 [ 14] (W) cmp (16|M0) (eq)f0.0 null<1>:d r8.3<0;1,0>:d 0:w [ 15] (W) cmp (16|M16) (eq)f0.0 null<1>:d r8.3<0;1,0>:d 0:w [ 16] (W&f0.0) jmpi 1376 // BBL2 [ 17] (W) add (1|M0) r8.0<1>:d r8.3<0;1,0>:d 31:w [ 18] (W) mov (1|M0) r6.0<1>:w 1:w [ 19] (W) add (1|M0) r8.7<1>:d r8.3<0;1,0>:d 63:w [ 20] (W) and (1|M0) r8.6<1>:d r8.3<0;1,0>:d 63:w [ 21] (W) and (1|M0) r8.1<1>:d r8.0<0;1,0>:d 31:w [ 22] (W) and (1|M0) r8.0<1>:d r8.7<0;1,0>:d 63:w [ 23] (W) shl (1|M0) r8.3<1>:d r6.0<0;1,0>:w r8.1<0;1,0>:d [ 24] shr (8|M0) r6.0<1>:q r60.0<4;4,1>:uq r8.0<0;1,0>:ud [ 25] shr (8|M8) r9.0<1>:q r58.0<4;4,1>:uq r8.0<0;1,0>:ud [ 26] shr (8|M16) r11.0<1>:q r4.0<4;4,1>:uq r8.0<0;1,0>:ud [ 27] shr (8|M24) r13.0<1>:q r2.0<4;4,1>:uq r8.0<0;1,0>:ud [ 28] (W) add (1|M0) r8.0<1>:q r8.3<0;1,0>:d -1:w [ 29] shl (8|M0) r25.0<1>:q r6.0<4;4,1>:q r8.6<0;1,0>:ud [ 30] shl (8|M8) r23.0<1>:q r9.0<4;4,1>:q r8.6<0;1,0>:ud [ 31] shl (8|M16) r21.0<1>:q r11.0<4;4,1>:q r8.6<0;1,0>:ud [ 32] shl (8|M24) r19.0<1>:q r13.0<4;4,1>:q r8.6<0;1,0>:ud [ 33] and (8|M0) r6.0<1>:q r60.0<4;4,1>:q r8.0<0;1,0>:q [ 34] and (8|M8) r9.0<1>:q r58.0<4;4,1>:q r8.0<0;1,0>:q [ 35] and (8|M16) r15.0<1>:q r4.0<4;4,1>:q 
r8.0<0;1,0>:q [ 36] and (8|M24) r17.0<1>:q r2.0<4;4,1>:q r8.0<0;1,0>:q [ 37] add (8|M0) r13.0<1>:q r25.0<4;4,1>:q r6.0<4;4,1>:q [ 38] add (8|M8) r11.0<1>:q r23.0<4;4,1>:q r9.0<4;4,1>:q [ 39] add (8|M16) r9.0<1>:q r21.0<4;4,1>:q r15.0<4;4,1>:q [ 40] add (8|M24) r6.0<1>:q r19.0<4;4,1>:q r17.0<4;4,1>:q [ 41] (W) add (1|M0) r8.0<1>:d r8.2<0;1,0>:d 63:w [ 42] add (8|M0) r15.0<2>:d r13.0<4;4,1>:q r8.3<0;1,0>:d [ 43] add (8|M8) r19.0<2>:d r11.0<4;4,1>:q r8.3<0;1,0>:d [ 44] add (8|M16) r17.0<2>:d r9.0<4;4,1>:q r8.3<0;1,0>:d [ 45] add (8|M24) r21.0<2>:d r6.0<4;4,1>:q r8.3<0;1,0>:d [ 46] mov (8|M0) r62.0<1>:d r13.0<2;1,0>:d [ 47] mov (8|M8) r63.0<1>:d r11.0<2;1,0>:d [ 48] mov (8|M16) r64.0<1>:d r9.0<2;1,0>:d [ 49] mov (8|M24) r65.0<1>:d r6.0<2;1,0>:d [ 50] mov (8|M0) r6.0<1>:d r15.0<2;1,0>:d [ 51] mov (8|M8) r7.0<1>:d r19.0<2;1,0>:d [ 52] mov (8|M16) r66.0<1>:d r17.0<2;1,0>:d [ 53] mov (8|M24) r67.0<1>:d r21.0<2;1,0>:d [ 54] shl (16|M0) r62.0<1>:d r62.0<8;8,1>:d 4:w [ 55] shl (16|M16) r64.0<1>:d r64.0<8;8,1>:d 4:w [ 56] shl (16|M0) r6.0<1>:d r6.0<8;8,1>:d 4:w [ 57] shl (16|M16) r66.0<1>:d r66.0<8;8,1>:d 4:w [ 58] add (16|M0) r62.0<1>:d r62.0<8;8,1>:d r8.5<0;1,0>:d {Compacted} [ 59] add (16|M16) r64.0<1>:d r64.0<8;8,1>:d r8.5<0;1,0>:d [ 60] add (16|M0) r6.0<1>:d r6.0<8;8,1>:d r8.5<0;1,0>:d {Compacted} [ 61] add (16|M16) r66.0<1>:d r66.0<8;8,1>:d r8.5<0;1,0>:d [ 62] send (16|M0) r18:w r62 0xC 0x4805000 [ 63] send (16|M16) r50:w r64 0xC 0x4805000 [ 64] send (16|M0) r10:w r6 0xC 0x4805000 [ 65] send (16|M16) r42:w r66 0xC 0x4805000 [ 66] (W) and (1|M0) r8.0<1>:d r8.0<0;1,0>:d 63:w [ 67] shr (8|M0) r26.0<1>:q r60.0<4;4,1>:uq r8.0<0;1,0>:ud [ 68] shr (8|M8) r28.0<1>:q r58.0<4;4,1>:uq r8.0<0;1,0>:ud [ 69] shr (8|M16) r34.0<1>:q r4.0<4;4,1>:uq r8.0<0;1,0>:ud [ 70] shr (8|M24) r36.0<1>:q r2.0<4;4,1>:uq r8.0<0;1,0>:ud [ 71] and (8|M0) r32.0<1>:q r26.0<4;4,1>:q 1:w [ 72] and (8|M8) r30.0<1>:q r28.0<4;4,1>:q 1:w [ 73] and (8|M16) r28.0<1>:q r34.0<4;4,1>:q 1:w [ 74] and (8|M24) 
r26.0<1>:q r36.0<4;4,1>:q 1:w [ 75] cmp (8|M0) (eq)f0.0 null<1>:q r32.0<4;4,1>:q r8.4<0;1,0>:ud [ 76] cmp (8|M8) (eq)f0.0 null<1>:q r30.0<4;4,1>:q r8.4<0;1,0>:ud [ 77] cmp (8|M16) (eq)f0.0 null<1>:q r28.0<4;4,1>:q r8.4<0;1,0>:ud [ 78] cmp (8|M24) (eq)f0.0 null<1>:q r26.0<4;4,1>:q r8.4<0;1,0>:ud [ 79] sel (16|M0) (lt)f0.0 r34.0<1>:d r18.0<8;8,1>:d r10.0<8;8,1>:d {Compacted} [ 80] sel (16|M0) (lt)f0.0 r36.0<1>:d r20.0<8;8,1>:d r12.0<8;8,1>:d {Compacted} [ 81] sel (16|M0) (lt)f0.0 r38.0<1>:d r22.0<8;8,1>:d r14.0<8;8,1>:d {Compacted} [ 82] sel (16|M0) (lt)f0.0 r40.0<1>:d r24.0<8;8,1>:d r16.0<8;8,1>:d {Compacted} [ 83] sel (16|M0) (ge)f0.0 r26.0<1>:d r10.0<8;8,1>:d r18.0<8;8,1>:d {Compacted} [ 84] sel (16|M0) (ge)f0.0 r28.0<1>:d r12.0<8;8,1>:d r20.0<8;8,1>:d {Compacted} [ 85] sel (16|M0) (ge)f0.0 r30.0<1>:d r14.0<8;8,1>:d r22.0<8;8,1>:d {Compacted} [ 86] sel (16|M0) (ge)f0.0 r32.0<1>:d r16.0<8;8,1>:d r24.0<8;8,1>:d {Compacted} [ 87] sel (16|M16) (lt)f0.0 r17.0<1>:d r50.0<8;8,1>:d r42.0<8;8,1>:d [ 88] sel (16|M16) (lt)f0.0 r19.0<1>:d r52.0<8;8,1>:d r44.0<8;8,1>:d [ 89] sel (16|M16) (lt)f0.0 r21.0<1>:d r54.0<8;8,1>:d r46.0<8;8,1>:d [ 90] sel (16|M16) (lt)f0.0 r23.0<1>:d r56.0<8;8,1>:d r48.0<8;8,1>:d [ 91] sel (16|M16) (ge)f0.0 r9.0<1>:d r42.0<8;8,1>:d r50.0<8;8,1>:d [ 92] sel (16|M16) (ge)f0.0 r11.0<1>:d r44.0<8;8,1>:d r52.0<8;8,1>:d [ 93] sel (16|M16) (ge)f0.0 r13.0<1>:d r46.0<8;8,1>:d r54.0<8;8,1>:d [ 94] sel (16|M16) (ge)f0.0 r15.0<1>:d r48.0<8;8,1>:d r56.0<8;8,1>:d // BBL3 [ 95] (~f0.0) if (32|M0) 96 160 // BBL4 [ 96] sends (16|M0) null:w r62 r34 0x20C 0x4025000 [ 97] sends (16|M16) null:w r64 r17 0x20C 0x4025000 [ 98] sends (16|M0) null:w r6 r26 0x20C 0x4025000 [ 99] sends (16|M16) null:w r66 r9 0x20C 0x4025000 // BBL5 [100] else (32|M0) 80 80 // BBL6 [101] sends (16|M0) null:w r6 r34 0x20C 0x4025000 [102] sends (16|M16) null:w r66 r17 0x20C 0x4025000 [103] sends (16|M0) null:w r62 r26 0x20C 0x4025000 [104] sends (16|M16) null:w r64 r9 0x20C 0x4025000 // BBL7 [105] 
endif (32|M0) 16 // BBL8 [106] (W) jmpi 872 // BBL9 [107] mov (8|M0) r6.0<1>:d r60.0<2;1,0>:d [108] mov (8|M8) r7.0<1>:d r58.0<2;1,0>:d [109] mov (8|M16) r42.0<1>:d r4.0<2;1,0>:d [110] mov (8|M24) r43.0<1>:d r2.0<2;1,0>:d [111] (W) and (1|M0) r8.0<1>:d r8.2<0;1,0>:d 63:w [112] shl (16|M0) r6.0<1>:d r6.0<8;8,1>:d 4:w [113] shl (16|M16) r42.0<1>:d r42.0<8;8,1>:d 4:w [114] shr (8|M0) r9.0<1>:q r60.0<4;4,1>:uq r8.0<0;1,0>:ud [115] shr (8|M8) r11.0<1>:q r58.0<4;4,1>:uq r8.0<0;1,0>:ud [116] shr (8|M16) r18.0<1>:q r4.0<4;4,1>:uq r8.0<0;1,0>:ud [117] add (16|M0) r6.0<1>:d r6.0<8;8,1>:d r8.5<0;1,0>:d {Compacted} [118] add (16|M16) r42.0<1>:d r42.0<8;8,1>:d r8.5<0;1,0>:d [119] shr (8|M24) r26.0<1>:q r2.0<4;4,1>:uq r8.0<0;1,0>:ud [120] and (8|M0) r24.0<1>:q r9.0<4;4,1>:q 1:w [121] and (8|M8) r22.0<1>:q r11.0<4;4,1>:q 1:w [122] send (16|M0) r10:w r6 0xC 0x4805000 [123] send (16|M16) r34:w r42 0xC 0x4805000 [124] and (8|M16) r20.0<1>:q r18.0<4;4,1>:q 1:w [125] and (8|M24) r18.0<1>:q r26.0<4;4,1>:q 1:w [126] cmp (8|M0) (eq)f1.0 null<1>:q r24.0<4;4,1>:q r8.4<0;1,0>:ud [127] cmp (8|M8) (eq)f1.0 null<1>:q r22.0<4;4,1>:q r8.4<0;1,0>:ud [128] cmp (8|M16) (eq)f1.0 null<1>:q r20.0<4;4,1>:q r8.4<0;1,0>:ud [129] cmp (8|M24) (eq)f1.0 null<1>:q r18.0<4;4,1>:q r8.4<0;1,0>:ud // BBL10 [130] (~f1.0) if (32|M0) 256 480 // BBL11 [131] sel (16|M0) (lt)f0.0 r30.0<1>:d r10.0<8;8,1>:d r14.0<8;8,1>:d {Compacted} [132] sel (16|M0) (lt)f0.0 r28.0<1>:d r12.0<8;8,1>:d r16.0<8;8,1>:d {Compacted} [133] sel (16|M0) (ge)f0.0 r18.0<1>:d r14.0<8;8,1>:d r10.0<8;8,1>:d {Compacted} [134] sel (16|M0) (ge)f0.0 r10.0<1>:d r16.0<8;8,1>:d r12.0<8;8,1>:d {Compacted} [135] sel (16|M16) (lt)f0.0 r22.0<1>:d r34.0<8;8,1>:d r38.0<8;8,1>:d [136] sel (16|M16) (lt)f0.0 r20.0<1>:d r36.0<8;8,1>:d r40.0<8;8,1>:d [137] sel (16|M16) (ge)f0.0 r44.0<1>:d r38.0<8;8,1>:d r34.0<8;8,1>:d [138] sel (16|M16) (ge)f0.0 r24.0<1>:d r40.0<8;8,1>:d r36.0<8;8,1>:d [139] sel (16|M0) (lt)f0.0 r26.0<1>:d r30.0<8;8,1>:d r28.0<8;8,1>:d {Compacted} 
[140] sel (16|M0) (ge)f0.0 r28.0<1>:d r30.0<8;8,1>:d r28.0<8;8,1>:d {Compacted} [141] sel (16|M0) (lt)f0.0 r30.0<1>:d r18.0<8;8,1>:d r10.0<8;8,1>:d {Compacted} [142] sel (16|M0) (ge)f0.0 r32.0<1>:d r18.0<8;8,1>:d r10.0<8;8,1>:d {Compacted} [143] sel (16|M16) (lt)f0.0 r18.0<1>:d r22.0<8;8,1>:d r20.0<8;8,1>:d [144] sel (16|M16) (ge)f0.0 r20.0<1>:d r22.0<8;8,1>:d r20.0<8;8,1>:d [145] sel (16|M16) (lt)f0.0 r22.0<1>:d r44.0<8;8,1>:d r24.0<8;8,1>:d [146] sel (16|M16) (ge)f0.0 r24.0<1>:d r44.0<8;8,1>:d r24.0<8;8,1>:d [147] sends (16|M0) null:w r6 r26 0x20C 0x4025000 [148] sends (16|M16) null:w r42 r18 0x20C 0x4025000 // BBL12 [149] else (32|M0) 240 240 // BBL13 [150] sel (16|M0) (ge)f0.0 r22.0<1>:d r14.0<8;8,1>:d r10.0<8;8,1>:d {Compacted} [151] sel (16|M0) (ge)f0.0 r20.0<1>:d r16.0<8;8,1>:d r12.0<8;8,1>:d {Compacted} [152] sel (16|M0) (lt)f0.0 r30.0<1>:d r10.0<8;8,1>:d r14.0<8;8,1>:d {Compacted} [153] sel (16|M0) (lt)f0.0 r28.0<1>:d r12.0<8;8,1>:d r16.0<8;8,1>:d {Compacted} [154] sel (16|M16) (ge)f0.0 r13.0<1>:d r38.0<8;8,1>:d r34.0<8;8,1>:d [155] sel (16|M16) (ge)f0.0 r11.0<1>:d r40.0<8;8,1>:d r36.0<8;8,1>:d [156] sel (16|M16) (lt)f0.0 r25.0<1>:d r34.0<8;8,1>:d r38.0<8;8,1>:d [157] sel (16|M16) (lt)f0.0 r15.0<1>:d r36.0<8;8,1>:d r40.0<8;8,1>:d [158] sel (16|M0) (ge)f0.0 r17.0<1>:d r20.0<8;8,1>:d r22.0<8;8,1>:d {Compacted} [159] sel (16|M0) (lt)f0.0 r19.0<1>:d r20.0<8;8,1>:d r22.0<8;8,1>:d {Compacted} [160] sel (16|M0) (ge)f0.0 r21.0<1>:d r28.0<8;8,1>:d r30.0<8;8,1>:d {Compacted} [161] sel (16|M0) (lt)f0.0 r23.0<1>:d r28.0<8;8,1>:d r30.0<8;8,1>:d {Compacted} [162] sel (16|M16) (ge)f0.0 r9.0<1>:d r11.0<8;8,1>:d r13.0<8;8,1>:d [163] sel (16|M16) (lt)f0.0 r11.0<1>:d r11.0<8;8,1>:d r13.0<8;8,1>:d [164] sel (16|M16) (ge)f0.0 r13.0<1>:d r15.0<8;8,1>:d r25.0<8;8,1>:d [165] sel (16|M16) (lt)f0.0 r15.0<1>:d r15.0<8;8,1>:d r25.0<8;8,1>:d [166] sends (16|M0) null:w r6 r17 0x20C 0x4025000 [167] sends (16|M16) null:w r42 r9 0x20C 0x4025000 // BBL14 [168] endif (32|M0) 16 // BBL15 
[169] (W) jmpi 1048 // BBL16 [170] mov (8|M0) r6.0<1>:d r60.0<2;1,0>:d [171] mov (8|M8) r7.0<1>:d r58.0<2;1,0>:d [172] mov (8|M16) r26.0<1>:d r4.0<2;1,0>:d [173] mov (8|M24) r27.0<1>:d r2.0<2;1,0>:d [174] (W) cmp (16|M0) (eq)f1.0 null<1>:d r8.4<0;1,0>:d 0:w [175] (W) cmp (16|M16) (eq)f1.0 null<1>:d r8.4<0;1,0>:d 0:w [176] shl (16|M0) r6.0<1>:d r6.0<8;8,1>:d 4:w [177] shl (16|M16) r26.0<1>:d r26.0<8;8,1>:d 4:w [178] add (16|M0) r6.0<1>:d r6.0<8;8,1>:d r8.5<0;1,0>:d {Compacted} [179] add (16|M16) r26.0<1>:d r26.0<8;8,1>:d r8.5<0;1,0>:d [180] send (16|M0) r10:w r6 0xC 0x4805000 [181] send (16|M16) r18:w r26 0xC 0x4805000 [182] (W&f1.0) jmpi 128 // BBL17 [183] sel (16|M0) (lt)f0.0 r32.0<1>:d r10.0<8;8,1>:d r12.0<8;8,1>:d {Compacted} [184] sel (16|M16) (lt)f0.0 r42.0<1>:d r18.0<8;8,1>:d r20.0<8;8,1>:d [185] sel (16|M0) (ge)f0.0 r30.0<1>:d r12.0<8;8,1>:d r10.0<8;8,1>:d {Compacted} [186] sel (16|M16) (ge)f0.0 r38.0<1>:d r20.0<8;8,1>:d r18.0<8;8,1>:d [187] sel (16|M0) (ge)f0.0 r28.0<1>:d r14.0<8;8,1>:d r16.0<8;8,1>:d {Compacted} [188] sel (16|M16) (ge)f0.0 r40.0<1>:d r22.0<8;8,1>:d r24.0<8;8,1>:d [189] sel (16|M0) (lt)f0.0 r34.0<1>:d r16.0<8;8,1>:d r14.0<8;8,1>:d {Compacted} [190] sel (16|M16) (lt)f0.0 r36.0<1>:d r24.0<8;8,1>:d r22.0<8;8,1>:d [191] (W) jmpi 112 // BBL18 [192] sel (16|M0) (ge)f0.0 r32.0<1>:d r12.0<8;8,1>:d r10.0<8;8,1>:d {Compacted} [193] sel (16|M16) (ge)f0.0 r42.0<1>:d r20.0<8;8,1>:d r18.0<8;8,1>:d [194] sel (16|M0) (lt)f0.0 r30.0<1>:d r10.0<8;8,1>:d r12.0<8;8,1>:d {Compacted} [195] sel (16|M16) (lt)f0.0 r38.0<1>:d r18.0<8;8,1>:d r20.0<8;8,1>:d [196] sel (16|M0) (lt)f0.0 r28.0<1>:d r16.0<8;8,1>:d r14.0<8;8,1>:d {Compacted} [197] sel (16|M16) (lt)f0.0 r40.0<1>:d r24.0<8;8,1>:d r22.0<8;8,1>:d [198] sel (16|M0) (ge)f0.0 r34.0<1>:d r14.0<8;8,1>:d r16.0<8;8,1>:d {Compacted} [199] sel (16|M16) (ge)f0.0 r36.0<1>:d r22.0<8;8,1>:d r24.0<8;8,1>:d // BBL19 [200] and (8|M0) r11.0<1>:q r60.0<4;4,1>:q 1:w [201] and (8|M8) r9.0<1>:q r58.0<4;4,1>:q 1:w [202] and (8|M16) 
r4.0<1>:q r4.0<4;4,1>:q 1:w [203] and (8|M24) r2.0<1>:q r2.0<4;4,1>:q 1:w [204] cmp (8|M0) (eq)f0.0 null<1>:q r11.0<4;4,1>:q r8.4<0;1,0>:ud [205] cmp (8|M8) (eq)f0.0 null<1>:q r9.0<4;4,1>:q r8.4<0;1,0>:ud [206] cmp (8|M16) (eq)f0.0 null<1>:q r4.0<4;4,1>:q r8.4<0;1,0>:ud [207] cmp (8|M24) (eq)f0.0 null<1>:q r2.0<4;4,1>:q r8.4<0;1,0>:ud // BBL20 [208] (~f0.0) if (32|M0) 256 480 // BBL21 [209] sel (16|M0) (lt)f0.0 r20.0<1>:d r32.0<8;8,1>:d r28.0<8;8,1>:d {Compacted} [210] sel (16|M0) (lt)f0.0 r18.0<1>:d r30.0<8;8,1>:d r34.0<8;8,1>:d {Compacted} [211] sel (16|M0) (ge)f0.0 r8.0<1>:d r28.0<8;8,1>:d r32.0<8;8,1>:d {Compacted} [212] sel (16|M0) (ge)f0.0 r4.0<1>:d r34.0<8;8,1>:d r30.0<8;8,1>:d {Compacted} [213] sel (16|M16) (lt)f0.0 r12.0<1>:d r42.0<8;8,1>:d r40.0<8;8,1>:d [214] sel (16|M16) (lt)f0.0 r10.0<1>:d r38.0<8;8,1>:d r36.0<8;8,1>:d [215] sel (16|M16) (ge)f0.0 r14.0<1>:d r40.0<8;8,1>:d r42.0<8;8,1>:d [216] sel (16|M16) (ge)f0.0 r2.0<1>:d r36.0<8;8,1>:d r38.0<8;8,1>:d [217] sel (16|M0) (lt)f0.0 r16.0<1>:d r20.0<8;8,1>:d r18.0<8;8,1>:d {Compacted} [218] sel (16|M0) (ge)f0.0 r18.0<1>:d r20.0<8;8,1>:d r18.0<8;8,1>:d {Compacted} [219] sel (16|M0) (lt)f0.0 r20.0<1>:d r8.0<8;8,1>:d r4.0<8;8,1>:d {Compacted} [220] sel (16|M0) (ge)f0.0 r22.0<1>:d r8.0<8;8,1>:d r4.0<8;8,1>:d {Compacted} [221] sel (16|M16) (lt)f0.0 r8.0<1>:d r12.0<8;8,1>:d r10.0<8;8,1>:d [222] sel (16|M16) (ge)f0.0 r10.0<1>:d r12.0<8;8,1>:d r10.0<8;8,1>:d [223] sel (16|M16) (lt)f0.0 r12.0<1>:d r14.0<8;8,1>:d r2.0<8;8,1>:d [224] sel (16|M16) (ge)f0.0 r14.0<1>:d r14.0<8;8,1>:d r2.0<8;8,1>:d [225] sends (16|M0) null:w r6 r16 0x20C 0x4025000 [226] sends (16|M16) null:w r26 r8 0x20C 0x4025000 // BBL22 [227] else (32|M0) 240 240 // BBL23 [228] sel (16|M0) (ge)f0.0 r20.0<1>:d r28.0<8;8,1>:d r32.0<8;8,1>:d {Compacted} [229] sel (16|M0) (ge)f0.0 r18.0<1>:d r34.0<8;8,1>:d r30.0<8;8,1>:d {Compacted} [230] sel (16|M0) (lt)f0.0 r14.0<1>:d r32.0<8;8,1>:d r28.0<8;8,1>:d {Compacted} [231] sel (16|M0) (lt)f0.0 r8.0<1>:d 
r30.0<8;8,1>:d r34.0<8;8,1>:d {Compacted} [232] sel (16|M16) (ge)f0.0 r12.0<1>:d r40.0<8;8,1>:d r42.0<8;8,1>:d [233] sel (16|M16) (ge)f0.0 r10.0<1>:d r36.0<8;8,1>:d r38.0<8;8,1>:d [234] sel (16|M16) (lt)f0.0 r4.0<1>:d r42.0<8;8,1>:d r40.0<8;8,1>:d [235] sel (16|M16) (lt)f0.0 r2.0<1>:d r38.0<8;8,1>:d r36.0<8;8,1>:d [236] sel (16|M0) (ge)f0.0 r16.0<1>:d r18.0<8;8,1>:d r20.0<8;8,1>:d {Compacted} [237] sel (16|M0) (lt)f0.0 r18.0<1>:d r18.0<8;8,1>:d r20.0<8;8,1>:d {Compacted} [238] sel (16|M0) (ge)f0.0 r20.0<1>:d r8.0<8;8,1>:d r14.0<8;8,1>:d {Compacted} [239] sel (16|M0) (lt)f0.0 r22.0<1>:d r8.0<8;8,1>:d r14.0<8;8,1>:d {Compacted} [240] sel (16|M16) (ge)f0.0 r8.0<1>:d r10.0<8;8,1>:d r12.0<8;8,1>:d [241] sel (16|M16) (lt)f0.0 r10.0<1>:d r10.0<8;8,1>:d r12.0<8;8,1>:d [242] sel (16|M16) (ge)f0.0 r12.0<1>:d r2.0<8;8,1>:d r4.0<8;8,1>:d [243] sel (16|M16) (lt)f0.0 r14.0<1>:d r2.0<8;8,1>:d r4.0<8;8,1>:d [244] sends (16|M0) null:w r6 r16 0x20C 0x4025000 [245] sends (16|M16) null:w r26 r8 0x20C 0x4025000 // BBL24 [246] endif (32|M0) 16 // BBL25 [247] (W) mov (8|M0) r112.0<1>:ud r100.0<8;8,1>:ud {Compacted} // BBL26 [248] (W) send (8|M0) null r112 0x27 0x2000010 {EOT}
(Back to the list of all GTPin Sample Tools)
00001 /*========================== begin_copyright_notice ============================ 00002 Copyright (C) 2017-2022 Intel Corporation 00003 00004 SPDX-License-Identifier: MIT 00005 ============================= end_copyright_notice ===========================*/ 00006 00007 /*! 00008 * @file Latency tool definitions 00009 */ 00010 00011 #ifndef LATENCY_H_ 00012 #define LATENCY_H_ 00013 00014 #include <list> 00015 #include <map> 00016 #include <set> 00017 #include <string> 00018 00019 #include "gtpin_api.h" 00020 #include "gtpin_tool_utils.h" 00021 00022 using namespace gtpin; 00023 00024 /*! 00025 * Identifier of the instrumentation site; depends on the latency measurment mode: 00026 * - Basic block latency : ID of the first instruction of the basic block 00027 * - Memory read latency : ID of the memory instruction 00028 */ 00029 using InstrumentSiteId = InsId; 00030 00031 /// Array of instrumentation site IDs, indexed by ordinal numbers of corresponding records in the profile buffer 00032 using InstrumentSites = std::vector<InstrumentSiteId>; 00033 00034 /* ============================================================================================= */ 00035 // Struct LatencyRecord 00036 /* ============================================================================================= */ 00037 /*! 00038 * Layout of records collected in profile buffer by the Latency tool 00039 */ 00040 struct alignas(uint64_t) LatencyRecord 00041 { 00042 uint64_t cycles; ///< Total number of cycles 00043 uint32_t freq; ///< Total number of executions 00044 }; 00045 00046 /* ============================================================================================= */ 00047 // Class LatencyDispatchProfile 00048 /* ============================================================================================= */ 00049 /*! 
00050 * Profiling data collected during a single kernel dispatch per a single instrumentation site 00051 */ 00052 struct LatencyDispatchProfile 00053 { 00054 explicit LatencyDispatchProfile(const IGtKernelDispatch& kernelDispatch, InstrumentSiteId siteId); 00055 void Accumulate(const LatencyRecord& record); 00056 00057 GtKernelExecDesc kernelExecDesc; ///< Kernel execution descriptor 00058 InstrumentSiteId siteId; ///< Identifier of the instrumentation site: BBL ID or instruction ID 00059 uint64_t cycles; ///< Total number of cycles 00060 uint64_t freq; ///< Total number of executions 00061 }; 00062 00063 /* ============================================================================================= */ 00064 // Class LatencyKernelProfile 00065 /* ============================================================================================= */ 00066 /*! 00067 * Aggregated profile of all instrumented kernel dispatches 00068 */ 00069 class LatencyKernelProfile 00070 { 00071 public: 00072 LatencyKernelProfile(const IGtKernel& kernel, const IGtCfg& cfg, const GtProfileArray& profileArray, 00073 InstrumentSites&& instrumentSites); 00074 00075 /// Add the specified profile of a kernel dispatch 00076 LatencyDispatchProfile& AddDispatchProfile(const LatencyDispatchProfile& dispathProfile); 00077 00078 std::string ToString() const; ///< @return Text representation of the profile data 00079 void DumpAsm() const; ///< Dump kernel's assembly text to file 00080 std::string GetName() const { return _name; } ///< @return Kernel's name 00081 const GtProfileArray& GetProfileArray() const { return _profileArray; } ///< @return Profile buffer accessor 00082 00083 /// Given a record number in the profile buffer, return corresponding ID of the instrumentation site 00084 InstrumentSiteId GetSiteId(uint32_t recordNum) const; 00085 00086 /// @return Collection of profiles per kernel dispatch 00087 typedef std::list<LatencyDispatchProfile> DispatchProfiles; 00088 const DispatchProfiles& 
GetDispatchProfiles() const { return _dispatchProfiles; } 00089 00090 private: 00091 std::string _name; ///< Kernel's name 00092 std::string _uniqueName; ///< Kernel's unique name (IGC style) 00093 GtKernelType _type; ///< Kernel's type 00094 GtGpuPlatform _platform; ///< Kernel's platform 00095 uint64_t _hashId; ///< Kernel's hash identifier 00096 GtSimdWidth _simd; ///< Kernel's SIMD width 00097 std::string _asmText; ///< Kernel's assembly text 00098 GtProfileArray _profileArray; ///< Profile buffer accessor 00099 InstrumentSites _instrumentSites; ///< Array of instrumentation site IDs 00100 DispatchProfiles _dispatchProfiles; ///< Profiles per kernel dispatch 00101 std::map<std::string, uint64_t> _execTotalCycles; ///< Total number of cycles per execution descriptor 00102 }; 00103 00104 /* ============================================================================================= */ 00105 // Class Latency 00106 /* ============================================================================================= */ 00107 /*! 
00108 * Implementation of the IGtTool interface for the Latency tool 00109 */ 00110 class Latency : public GtTool 00111 { 00112 public: 00113 /// Implementation of the IGtTool interface 00114 const char* Name() const { return "latency"; } 00115 00116 void OnKernelBuild(IGtKernelInstrument& instrumentor); 00117 void OnKernelRun(IGtKernelDispatch& dispatcher); 00118 void OnKernelComplete(IGtKernelDispatch& dispatcher); 00119 00120 public: 00121 std::string ToString() const; ///< @return Text representation of the profile data 00122 void DumpAsm() const; ///< Dump assembly text of profiled kernels to files 00123 00124 static Latency* Instance(); ///< @return Single instance of this class 00125 static void OnFini() { Instance()->Fini(); } ///< Callback function registered with atexit() 00126 protected: 00127 Latency() = default; 00128 Latency(const Latency&) = delete; 00129 Latency& operator = (const Latency&) = delete; 00130 ~Latency() = default; 00131 00132 /// @return Collection of kernel profiles 00133 typedef std::map<GtKernelId, LatencyKernelProfile> KernelProfiles; 00134 const KernelProfiles& GetKernelProfiles() const { return _kernels; } 00135 00136 /// Post process and dump profiling data 00137 void Fini(); 00138 private: 00139 /// Generate latency instrumentation for basic blocks 00140 void InstrumentBasicBlocks(IGtKernelInstrument& instrumentor); 00141 00142 /// Generate latency instrumentation for memory instructions 00143 void InstrumentMemoryInstructions(IGtKernelInstrument& instrumentor); 00144 00145 /// Generate instrumentation to be inserted before the code fragment whose latency is measured 00146 void GeneratePreCode(GtGenProcedure& proc, const IGtGenCoder& coder); 00147 00148 /*! 00149 * Generate instrumentation that computes latency and stores the result in the profile buffer. 00150 * - GeneratePostCodeMemBound - Generate memory-bound instrumentation 00151 * - GeneratePostCodeRegBound - Generate register-bound instrumentation. 
Return success/failure status 00152 * 00153 * @param[in, out] postProc Procedure to be inserted after the code fragment whose latency is measured 00154 * @param[in, out] finiProc Procedure to be inserted before each EOT 00155 * @param[in] coder GEN code generator 00156 * @param[in] profileArray Array of 'LatencyRecord's in the profile buffer 00157 * @param[in] recordNum Index of the record in 'profileArray' associated with the instrumentation site 00158 */ 00159 void GeneratePostCodeMemBound(GtGenProcedure& postProc, const IGtGenCoder& coder, 00160 const GtProfileArray& profileArray, uint32_t recordNum); 00161 bool GeneratePostCodeRegBound(GtGenProcedure& postProc, GtGenProcedure& finiProc, const IGtGenCoder& coder, 00162 const GtProfileArray& profileArray, uint32_t recordNum); 00163 00164 /// @return true/false - use 64-bit/32-bit integer for the cycle counter 00165 static bool Use64BitCounters(const IGtGenCoder& coder); 00166 00167 private: 00168 KernelProfiles _kernels; ///< Collection of kernel profiles 00169 00170 GtReg _addrReg; ///< Virtual register that holds address within profile buffer 00171 GtReg _dataReg; ///< Virtual register that holds data to be read from/written to profile buffer 00172 GtReg _timeReg; ///< Virtual timer register 00173 }; 00174 00175 #endif
00001 /*========================== begin_copyright_notice ============================ 00002 Copyright (C) 2016-2026 Intel Corporation 00003 00004 SPDX-License-Identifier: MIT 00005 ============================= end_copyright_notice ===========================*/ 00006 00007 /*! 00008 * @file Implementation of the Latency tool 00009 */ 00010 00011 #include <fstream> 00012 #include <sstream> 00013 #include <iomanip> 00014 #include <algorithm> 00015 00016 #include "latency.h" 00017 00018 using namespace gtpin; 00019 using namespace std; 00020 00021 /* ============================================================================================= */ 00022 // Configuration 00023 /* ============================================================================================= */ 00024 Knob<int> knobMode("mode", 0, "Latency measurment mode: 0 - basic block latency, 1 - memory read latency"); 00025 Knob<int> knobNumThreadBuckets("num_thread_buckets", 0, "Number of thread buckets. 0 (default) - maximum thread buckets"); 00026 Knob<bool> knobUseRegInstrument("no_reg_instrument", true, "Disable register-bound instrumentation"); 00027 Knob<int> knobMaxFreq("max_freq", 100, "Maximum amount of instrument executions for bbl per kernel execution. 
100 (default) - Applied only when use_conditional_instrument is on"); 00028 Knob<bool> knobUseConditionalInstrument("use_conditional_instrument", false, "Enable conditional instrumentation"); 00029 Knob<bool> knobSkipZeroResults("skip_zero_results", false, "Skip zero results in the Latency output"); 00030 00031 /* ============================================================================================= */ 00032 // Latency implementation 00033 /* ============================================================================================= */ 00034 Latency* Latency::Instance() 00035 { 00036 static Latency instance; 00037 return &instance; 00038 } 00039 00040 void Latency::OnKernelBuild(IGtKernelInstrument& instrumentor) 00041 { 00042 /* 00043 * Register allocation policy 00044 * ====================================================================================================== 00045 * To ensure accuracy of latency measurments, the instrumentation overhead should be as low as possible. 00046 * It is believed that register-bound procedures incur lower overhead than memory-bound procedures. 00047 * Assuming that, we will try to use register-bound instrumentation for as many sites as possible, and 00048 * apply memory-bound instrumentation to the rest of the sites. 00049 * 00050 * To check and guarantee availability of free physical registers we pre-allocate registers 00051 * in the register-bound procedures created by the GeneratePostCodeRegBound() function. 00052 * For this purpose we use the IGtRegAllocator::ReserveVregOperands() interface. 00053 * 00054 * For memory-bound procedures, pre-allocation of registers is not needed, and even can be harmful, as 00055 * explained in the IGtRegAllocator description. On the other hand, excessive pre-allocation of registers 00056 * in the register-bound procedures may cause multiple spills/fills in memory-bound procedures. 
To avoid 00057 * this situation, we do the following: 00058 * A) RESERVE registers in memory-bound procedures BEFORE generating register-bound procedures using 00059 * IGtRegAllocator::Reserve() 00060 * B) RELEASE registers in memory-bound procedures AFTER generating register-bound procedures using 00061 * IGtRegAllocator::ReleaseReserved() 00062 */ 00063 00064 const IGtGenCoder& coder = instrumentor.Coder(); 00065 IGtVregFactory& vregs = coder.VregFactory(); 00066 IGtRegAllocator& ra = coder.RegAllocator(); 00067 bool is64BitCounter = Use64BitCounters(coder); 00068 00069 // Initialize virtual registers 00070 _timeReg = vregs.Make(VREG_TYPE_DWORD); 00071 _addrReg = vregs.MakeMsgAddrScratch(); 00072 _dataReg = vregs.MakeMsgDataScratch(is64BitCounter? VREG_TYPE_QWORD : VREG_TYPE_DWORD); 00073 00074 // A) Reserve registers used in the memory-bound procedures 00075 ra.Reserve(_dataReg.VregNumber()); 00076 ra.Reserve(_addrReg.VregNumber()); 00077 ra.Reserve(vregs.GetProfileBufferAddrVreg().Num()); 00078 00079 switch (knobMode) 00080 { 00081 case 0: InstrumentBasicBlocks(instrumentor); break; 00082 case 1: InstrumentMemoryInstructions(instrumentor); break; 00083 default: GTPIN_ERROR_MSG("LATENCY : Invalid value of the 'mode' option"); 00084 } 00085 00086 // B) Release registers used in the memory-bound procedures 00087 ra.ReleaseReserved(_dataReg.VregNumber()); 00088 ra.ReleaseReserved(_addrReg.VregNumber()); 00089 ra.ReleaseReserved(vregs.GetProfileBufferAddrVreg().Num()); 00090 } 00091 /* Mode 0: times each selected basic block. The timer is started before the block and stopped after it (or before the EOT instruction for EOT blocks). Register-bound accumulation is used until GeneratePostCodeRegBound fails once; from then on memory-bound code is generated. */ 00092 void Latency::InstrumentBasicBlocks(IGtKernelInstrument& instrumentor) 00093 { 00094 const IGtKernel& kernel = instrumentor.Kernel(); 00095 const IGtCfg& cfg = instrumentor.Cfg(); 00096 const IGtGenCoder& coder = instrumentor.Coder(); 00097 const IGtGenModel& genModel = kernel.GenModel(); 00098 IGtProfileBufferAllocator& allocator = instrumentor.ProfileBufferAllocator(); 00099 00100 // Select basic blocks to be instrumented. 
Exclude BBLs whose only instruction is a branch 00101 vector<const IGtBbl*> bbls; 00102 for (auto bblPtr : cfg.Bbls()) 00103 { 00104 if (!bblPtr->IsEmpty() && !bblPtr->FirstIns().IsChangingIP()) 00105 { 00106 bbls.push_back(bblPtr); 00107 } 00108 } 00109 00110 // Allocate the profile buffer. It will hold single LatencyRecord per each instrumentation site in each thread bucket 00111 uint32_t numThreadBuckets = (knobNumThreadBuckets == 0) ? genModel.MaxThreadBuckets() : knobNumThreadBuckets; 00112 uint32_t numRecords = (uint32_t)bbls.size(); 00113 GtProfileArray profileArray(sizeof(LatencyRecord), numRecords, numThreadBuckets); 00114 profileArray.Allocate(allocator); 00115 00116 00117 // Instrument selected basic blocks 00118 GtGenProcedure finiCode; // Procedure that stores results of register-bound measurements in the profile buffer 00119 bool tryRegInstrument = knobUseRegInstrument; // true - try to apply register-bound instrumentation 00120 for (uint32_t idx = 0; idx != bbls.size(); ++idx) 00121 { 00122 const IGtBbl& bbl = *bbls[idx]; 00123 00124 int32_t bblId = bbl.Id(); 00125 00126 if (bblId < knobMinInstrumentBbl || bblId > knobMaxInstrumentBbl) 00127 { 00128 continue; 00129 } 00130 00131 // Insert code that starts timer at BBL's entry 00132 GtGenProcedure preCode; 00133 GeneratePreCode(preCode, coder); 00134 InstrumentBbl(instrumentor, bbl, GtIpoint::Before(), preCode); 00135 00136 // Insert code that stops timer at BBL's exit and stores result in the profile buffer 00137 GtGenProcedure postCode; 00138 if (bbl.IsEot()) 00139 { 00140 // Insert fake consumers of source registers to expose hidden latency of EOT instructions 00141 const IGtIns& eotIns = bbl.LastIns(); 00142 coder.GenerateFakeSrcConsumers(postCode, eotIns); 00143 GeneratePostCodeMemBound(postCode, coder, profileArray, idx); 00144 InstrumentInstruction(instrumentor, eotIns, GtIpoint::Before(), postCode); 00145 } 00146 else 00147 { 00148 if (tryRegInstrument) 00149 { 00150 tryRegInstrument = 
GeneratePostCodeRegBound(postCode, finiCode, coder, profileArray, idx); 00151 } 00152 if (!tryRegInstrument) 00153 { 00154 GeneratePostCodeMemBound(postCode, coder, profileArray, idx); 00155 } 00156 InstrumentBbl(instrumentor, bbl, GtIpoint::After(), postCode); 00157 } 00158 } 00159 00160 // Insert 'finiCode' at all exits of the kernel - before each EOT instruction 00161 instrumentor.InstrumentExits(finiCode); 00162 00163 // Transform 'bbls' into array of instrumentation sites 00164 InstrumentSites sites(bbls.size()); 00165 std::transform(bbls.begin(), bbls.end(), sites.begin(), [](const IGtBbl* b) -> InsId { return b->FirstIns().Id(); }); 00166 00167 // Create LatencyKernelProfile object that represents profile of this kernel 00168 _kernels.emplace(kernel.Id(), LatencyKernelProfile(kernel, cfg, profileArray, std::move(sites))); 00169 } 00170 /* Mode 1: times every memory-read instruction that is not an EOT instruction, always with memory-bound instrumentation. One profile record is allocated per selected instruction. */ 00171 void Latency::InstrumentMemoryInstructions(IGtKernelInstrument& instrumentor) 00172 { 00173 const IGtKernel& kernel = instrumentor.Kernel(); 00174 const IGtCfg& cfg = instrumentor.Cfg(); 00175 const IGtGenCoder& coder = instrumentor.Coder(); 00176 const IGtGenModel& genModel = kernel.GenModel(); 00177 IGtProfileBufferAllocator& allocator = instrumentor.ProfileBufferAllocator(); 00178 00179 // Select memory instructions to be instrumented 00180 InstrumentSites sites; 00181 for (auto bblPtr : cfg.Bbls()) 00182 { 00183 for (auto insPtr : bblPtr->Instructions()) 00184 { 00185 if (insPtr->IsMemRead() && !insPtr->IsEot()) 00186 { 00187 sites.push_back(insPtr->Id()); 00188 } 00189 } 00190 } 00191 00192 // Allocate the profile buffer. It will hold single LatencyRecord per each instrumentation site in each thread bucket 00193 uint32_t numThreadBuckets = (knobNumThreadBuckets == 0) ? 
genModel.MaxThreadBuckets() : knobNumThreadBuckets; 00194 uint32_t numRecords = (uint32_t)sites.size(); 00195 GtProfileArray profileArray(sizeof(LatencyRecord), numRecords, numThreadBuckets); 00196 profileArray.Allocate(allocator); 00197 00198 // Instrument each instruction whose ID appears in 'sites': 00199 // - Before instruction, insert fake consumers of source registers + procedure that starts timer 00200 // - After instruction, insert fake consumers of destination registers + procedure that stops timer 00201 // Use memory-bound instrumentation because, in case of a single instruction latency, the instrumentation overhead does 00202 // not compromise accuracy, while the register allocation flexibility is important. 00203 for (uint32_t idx = 0; idx != sites.size(); ++idx) 00204 { 00205 const IGtIns& ins = cfg.GetInstruction(sites[idx]); 00206 00207 GtGenProcedure preCode; 00208 coder.GenerateFakeSrcConsumers(preCode, ins); 00209 GeneratePreCode(preCode, coder); 00210 InstrumentInstruction(instrumentor, ins, GtIpoint::Before(), preCode); 00211 00212 GtGenProcedure postCode; 00213 coder.GenerateFakeDstConsumers(postCode, ins); 00214 GeneratePostCodeMemBound(postCode, coder, profileArray, idx); 00215 InstrumentInstruction(instrumentor, ins, GtIpoint::After(), postCode); 00216 } 00217 00218 // Create LatencyKernelProfile object that represents profile of this kernel 00219 _kernels.emplace(kernel.Id(), LatencyKernelProfile(kernel, cfg, profileArray, std::move(sites))); 00220 } 00221 /* Dispatch start callback: for an instrumented kernel with a known profile, creates the profile buffer and initializes the profile array in it. Profiling mode is enabled only when that initialization succeeds. */ 00222 void Latency::OnKernelRun(IGtKernelDispatch& dispatcher) 00223 { 00224 bool isProfileEnabled = false; 00225 00226 const IGtKernel& kernel = dispatcher.Kernel(); 00227 GtKernelExecDesc execDesc; dispatcher.GetExecDescriptor(execDesc); 00228 if (kernel.IsInstrumented() && IsKernelExecProfileEnabled(execDesc, kernel.GpuPlatform(), kernel.Name().Get())) 00229 { 00230 auto it = _kernels.find(kernel.Id()); 00231 00232 if (it != _kernels.end()) 00233 { 00234 IGtProfileBuffer* buffer = 
dispatcher.CreateProfileBuffer(); GTPIN_ASSERT(buffer); 00235 LatencyKernelProfile& kernelProfile = it->second; 00236 const GtProfileArray& profileArray = kernelProfile.GetProfileArray(); 00237 00238 if (profileArray.Size() != 0) 00239 { 00240 if (profileArray.Initialize(*buffer)) 00241 { 00242 isProfileEnabled = true; 00243 } 00244 else 00245 { 00246 GTPIN_ERROR_MSG(string("LATENCY : ") + string(kernel.Name()) + " : Failed to write into memory buffer"); 00247 } 00248 } 00249 } 00250 } 00251 dispatcher.SetProfilingMode(isProfileEnabled); 00252 } 00253 /* Dispatch completion callback: for a profiled dispatch, reads one LatencyRecord per record number and thread bucket, sums the buckets into a LatencyDispatchProfile, and appends that profile to the kernel profile. Failed reads are reported and skipped. */ 00254 void Latency::OnKernelComplete(IGtKernelDispatch& dispatcher) 00255 { 00256 const IGtKernel& kernel = dispatcher.Kernel(); 00257 GtKernelExecDesc execDesc; dispatcher.GetExecDescriptor(execDesc); 00258 bool isProfilingEnabled = dispatcher.IsProfilingEnabled(); 00259 if (!isProfilingEnabled || !IsKernelExecProfileEnabled(execDesc, kernel.GpuPlatform(), kernel.Name().Get())) 00260 { 00261 return; // Do nothing with unprofiled kernel dispatches 00262 } 00263 00264 auto it = _kernels.find(kernel.Id()); 00265 00266 if (it != _kernels.end()) 00267 { 00268 const IGtProfileBuffer* buffer = dispatcher.GetProfileBuffer(); GTPIN_ASSERT(buffer); 00269 LatencyKernelProfile& kernelProfile = it->second; 00270 const GtProfileArray& profileArray = kernelProfile.GetProfileArray(); 00271 00272 for (uint32_t recordNum = 0; recordNum != profileArray.NumRecords(); ++recordNum) 00273 { 00274 LatencyDispatchProfile dispatchProfile(dispatcher, kernelProfile.GetSiteId(recordNum)); 00275 00276 for (uint32_t threadBucket = 0; threadBucket < profileArray.NumThreadBuckets(); ++threadBucket) 00277 { 00278 LatencyRecord record; 00279 if (!profileArray.Read(*buffer, &record, recordNum, 1, threadBucket)) 00280 { 00281 GTPIN_ERROR_MSG(string("LATENCY : ") + string(kernel.Name()) + " : Failed to read from memory buffer"); 00282 } 00283 else 00284 { 00285 dispatchProfile.Accumulate(record); 00286 } 00287 } 00288 
kernelProfile.AddDispatchProfile(dispatchProfile); 00289 } 00290 } 00291 } 00292 /* Returns true when the instruction factory reports that GED_DATA_TYPE_uq can be accessed atomically; callers use this to choose QWORD over DWORD cycle counters. */ 00293 bool Latency::Use64BitCounters(const IGtGenCoder& coder) 00294 { 00295 return coder.InstructionFactory().CanAccessAtomically(GED_DATA_TYPE_uq); 00296 } 00297 /* Appends the timer-start code for _timeReg to proc and tags the first instruction of proc with this function name. */ 00298 void Latency::GeneratePreCode(GtGenProcedure& proc, const IGtGenCoder& coder) 00299 { 00300 coder.StartTimer(proc, _timeReg); 00301 if (!proc.empty()) { proc.front()->AppendAnnotation(__func__); } 00302 } 00303 /* Appends code that stops the timer, atomically adds the elapsed cycles to record recordNum in the profile buffer, and atomically increments the freq field of that record. NOTE(review): the increment is predicated on the negation of FlagReg(0); the reason is not visible here - confirm. */ 00304 void Latency::GeneratePostCodeMemBound(GtGenProcedure& proc, const IGtGenCoder& coder, 00305 const GtProfileArray& profileArray, uint32_t recordNum) 00306 { 00307 IGtInsFactory& insF = coder.InstructionFactory(); 00308 bool is64BitCounter = Use64BitCounters(coder); 00309 GtReg dataRegL = {_dataReg, sizeof(uint32_t), 0}; // Low 32-bits of the data payload register 00310 GtPredicate pred(FlagReg(0)); 00311 00312 // Generate code that computes elapsed time 00313 coder.StopTimerExt(proc, _timeReg); 00314 00315 // cycles += _timeReg 00316 proc += insF.MakeMov(dataRegL, _timeReg); // Move timer value to the low 32-bits of the data register 00317 if (is64BitCounter) 00318 { 00319 // Clear the high 32-bits of the data payload register 00320 GtReg dataRegH = {_dataReg, sizeof(uint32_t), 1}; 00321 proc += insF.MakeMov(dataRegH, 0); 00322 } 00323 00324 // _addrReg = address of the current thread's LatencyRecord in the profile buffer 00325 profileArray.ComputeAddress(coder, proc, _addrReg, recordNum); 00326 proc += insF.MakeAtomicAdd(NullReg(), _addrReg, _dataReg, (is64BitCounter? 
GED_DATA_TYPE_uq : GED_DATA_TYPE_ud)); 00327 00328 // freq++ 00329 profileArray.ComputeRelAddress(coder, proc, _addrReg, _addrReg, offsetof(LatencyRecord, freq)); 00330 proc += insF.MakeAtomicInc(NullReg(), _addrReg, GED_DATA_TYPE_ud).SetPredicate(!pred);; 00331 00332 if (knobUseConditionalInstrument) 00333 { 00334 /* 00335 Since we don't have a way to atomically read-modify-write a memory location with a condition, 00336 print a warning that the conditional instrumentation is not supported in memory-bound mode. 00337 */ 00338 GTPIN_WARNING("LATENCY : Skipping Conditional instrumentation for record number: " + DecStr(recordNum) + ", since it is not supported in memory-bound mode."); 00339 } 00340 00341 if (!proc.empty()) { proc.front()->AppendAnnotation(__func__); } 00342 } 00343 /* Register-bound variant: postProc receives code that stops the timer and accumulates cycles and frequency in counter registers; finiProc receives code that adds those counters to record recordNum in the profile buffer. Returns false, leaving both procedures untouched, when ReserveVregOperands fails. NOTE(review): proc is reused to build the fini code after std::move(proc) was passed to MoveAfter - confirm MoveAfter leaves it empty. */ 00344 bool Latency::GeneratePostCodeRegBound(GtGenProcedure& postProc, GtGenProcedure& finiProc, const IGtGenCoder& coder, 00345 const GtProfileArray& profileArray, uint32_t recordNum) 00346 { 00347 IGtInsFactory& insF = coder.InstructionFactory(); 00348 IGtVregFactory& vregs = coder.VregFactory(); 00349 IGtRegAllocator& ra = coder.RegAllocator(); 00350 bool is64BitCounter = Use64BitCounters(coder); 00351 00352 GtReg flagReg = FlagReg(0); 00353 GtReg freqReg = vregs.MakeCounter(VREG_TYPE_DWORD); // Frequency counter 00354 GtReg cycleReg = vregs.MakeCounter(is64BitCounter ? 
VREG_TYPE_QWORD : VREG_TYPE_DWORD); // Cycle counter 00355 GtReg cycleRegL = {cycleReg, sizeof(uint32_t), 0}; // Low 32-bits of cycle counter 00356 GtReg dataRegL = {_dataReg, sizeof(uint32_t), 0}; // Low 32-bits of '_dataReg' 00357 00358 // Generate procedure that computes and aggregates cycles and frequency 00359 GtGenProcedure proc; 00360 coder.StopTimerExt(proc, _timeReg); 00361 00362 // Generate procedure that updates registers 00363 GtGenProcedure updateResProc; 00364 updateResProc += insF.MakeAdd(cycleRegL, cycleRegL, _timeReg); // Add elapsed time to lower 32-bits of 'cycleReg' 00365 if (is64BitCounter) 00366 { 00367 // If cycleRegL overflowed, increment the high 32-bits of 'cycleReg' 00368 GtReg cycleRegH = {cycleReg, sizeof(uint32_t), 1}; 00369 updateResProc += insF.MakeCmp(GED_COND_MODIFIER_l, flagReg, cycleRegL, _timeReg); 00370 updateResProc += insF.MakeAdd(cycleRegH, cycleRegH, 1).SetPredicate(flagReg); 00371 } 00372 updateResProc += insF.MakeAdd(freqReg, freqReg, 1); // freq++ 00373 00374 if (!ra.ReserveVregOperands(updateResProc)) 00375 { 00376 return false; // No more free registers 00377 } 00378 updateResProc.front()->AppendAnnotation(__func__); 00379 00380 // Convert to conditional instrumentation if knob is turned on. Otherwise - append original procedure. 00381 if (knobUseConditionalInstrument) 00382 { 00383 /* 00384 * This code demonstrates a use of the conditional instrumentation feature. 00385 * It converts the updating results procedure into conditional, its execution will be determined during runtime 00386 * when the condition is met. In this case - if the value of freqReg is smaller than max_freq for BBL. 00387 * 00388 * It has two steps: 00389 * 1. Creates procedure that calculates the condition's value. 00390 * 2. Converts original procedure into conditional based on the condition's value stored in a virtual register. 
00391 * 00392 */ 00393 00394 // Generate condition procedure for register bound basic block - check if freqReg is lower than knobMaxFreq 00395 GtGenProcedure condPostCode; 00396 coder.GenerateConditionalProcedure(updateResProc, condPostCode, GED_COND_MODIFIER_l, freqReg, knobMaxFreq.GetValue()); 00397 00398 proc += condPostCode; 00399 } 00400 else 00401 { 00402 proc += updateResProc; 00403 } 00404 00405 postProc.MoveAfter(std::move(proc)); 00406 00407 // Generate 'finiProc' procedure that stores aggregated cycles and frequency counters in the profile buffer 00408 profileArray.ComputeAddress(coder, proc, _addrReg, recordNum); 00409 proc += insF.MakeRegMov(_dataReg, cycleReg); 00410 proc += insF.MakeAtomicAdd(NullReg(), _addrReg, _dataReg, (is64BitCounter? GED_DATA_TYPE_uq : GED_DATA_TYPE_ud)); 00411 00412 profileArray.ComputeRelAddress(coder, proc, _addrReg, _addrReg, offsetof(LatencyRecord, freq)); 00413 proc += insF.MakeRegMov(dataRegL, freqReg); 00414 proc += insF.MakeAtomicAdd(NullReg(), _addrReg, _dataReg, GED_DATA_TYPE_ud); 00415 00416 proc.front()->AppendAnnotation("GenerateFiniCode"); 00417 finiProc.MoveAfter(std::move(proc)); 00418 00419 return true; 00420 } 00421 /* Builds the textual report: banner, legend, column header row, then the ToString output of every kernel profile in _kernels. */ 00422 string Latency::ToString() const 00423 { 00424 ostringstream ostr; 00425 ostr << "### Kernel/Shader execution-time profile generated by GTPin ###" << endl << endl; 00426 ostr << "Legend:" << endl; 00427 ostr << "NA - kernel was not instrumented." << endl << endl; 00428 ostr << setw(30) << "Name" << setw(20) << "HashID" << setw(10) << "SIMD" << setw(10) << "Type"; 00429 ostr << setw(10) << "Ins. ID"; 00430 ostr << setw(15) << "Freq." 
<< setw(20) << "Total-latency(%)" << setw(20) << "Total-latency" << setw(20) << "Avg-Cycles"; 00431 ostr << setw(20) << "Platform" << " " << setw(35) << "Execution descriptor"; 00432 ostr << endl; 00433 for (const auto& kernelEntry : _kernels) 00434 { 00435 ostr << kernelEntry.second.ToString(); 00436 } 00437 return ostr.str(); 00438 } 00439 /* Calls DumpAsm on every kernel profile in _kernels. */ 00440 void Latency::DumpAsm() const 00441 { 00442 for (const auto& kernelEntry : _kernels) 00443 { 00444 kernelEntry.second.DumpAsm(); 00445 } 00446 } 00447 /* Writes the report to latency.txt inside the profile directory (a warning is issued when the file cannot be opened), then dumps kernel assembly. NOTE(review): the tool description names the output file latency.out - confirm which name is intended. */ 00448 void Latency::Fini() 00449 { 00450 string profileDir = GTPin_GetCore()->ProfileDir(); 00451 string filePath = JoinPath(profileDir, "latency.txt"); 00452 00453 ofstream fs(filePath); 00454 if (fs.is_open()) 00455 { 00456 fs << ToString(); 00457 fs.close(); 00458 } 00459 else 00460 { 00461 GTPIN_WARNING("LATENCY : could not create file: " + filePath); 00462 } 00463 DumpAsm(); 00464 } 00465 00466 /* ============================================================================================= */ 00467 // LatencyDispatchProfile implementation 00468 /* ============================================================================================= */ 00469 LatencyDispatchProfile::LatencyDispatchProfile(const IGtKernelDispatch& kernelDispatch, InstrumentSiteId id) : 00470 siteId(id), cycles(0), freq(0) 00471 { 00472 kernelDispatch.GetExecDescriptor(kernelExecDesc); 00473 } 00474 /* Adds the cycles and freq fields of record to the running totals of this dispatch profile. */ 00475 void LatencyDispatchProfile::Accumulate(const LatencyRecord& record) 00476 { 00477 cycles += record.cycles; 00478 freq += record.freq; 00479 } 00480 00481 /* ============================================================================================= */ 00482 // LatencyKernelProfile implementation 00483 /* ============================================================================================= */ 00484 LatencyKernelProfile::LatencyKernelProfile(const IGtKernel& kernel, 00485 const IGtCfg& cfg, 00486 const GtProfileArray& profileArray, 00487 InstrumentSites&& instrumentSites) : 00488 
_name(GlueString(kernel.Name())), _uniqueName(kernel.UniqueName()), _type(kernel.Type()), _platform(kernel.GpuPlatform()), _hashId(kernel.HashId()), 00489 _simd(kernel.SimdWidth()), _asmText(CfgAsmText(cfg)), _profileArray(profileArray), 00490 _instrumentSites(std::move(instrumentSites)) {} 00491 /* Returns the instrumentation site ID stored for record recordNum; asserts that recordNum is within _instrumentSites. */ 00492 InstrumentSiteId LatencyKernelProfile::GetSiteId(uint32_t recordNum) const 00493 { 00494 GTPIN_ASSERT(recordNum < _instrumentSites.size()); 00495 return _instrumentSites[recordNum]; 00496 } 00497 /* Stores a copy of the dispatch profile, adds its cycles to the total kept for its execution-descriptor string, and returns a reference to the stored copy. */ 00498 LatencyDispatchProfile& LatencyKernelProfile::AddDispatchProfile(const LatencyDispatchProfile& dispathProfile) 00499 { 00500 _dispatchProfiles.push_back(dispathProfile); 00501 string execDesStr = dispathProfile.kernelExecDesc.ToString(_platform); 00502 _execTotalCycles[execDesStr] += dispathProfile.cycles; 00503 return _dispatchProfiles.back(); 00504 } 00505 /* Formats one report row per stored dispatch profile, or a single row of NA fields when there are none. Rows with freq == 0 are omitted when knobSkipZeroResults is set. NOTE(review): avgCycles is an integer quotient, so setprecision(2) does not apply to it - confirm this is the intended Avg-Cycles format. */ 00506 string LatencyKernelProfile::ToString() const 00507 { 00508 ostringstream ostr; 00509 if (!_dispatchProfiles.empty()) 00510 { 00511 for (const auto& dp: _dispatchProfiles) 00512 { 00513 if (knobSkipZeroResults && (dp.freq == 0)) 00514 { 00515 continue; // Skip zero results if the knob is set 00516 } 00517 00518 string execDesStr = dp.kernelExecDesc.ToString(_platform); 00519 uint64_t execTotalCycles = _execTotalCycles.at(execDesStr); 00520 double dispatchToExecTotal = (execTotalCycles ? ((100.0 * (double)dp.cycles) / execTotalCycles) : 0.0); 00521 uint64_t avgCycles = (dp.freq ? 
(dp.cycles / dp.freq) : 0); 00522 00523 00524 ostr << setw(30) << _name << setw(20) << hex << _hashId << dec << setw(10) << _simd << setw(10) << _type.ToString(); 00525 ostr << setw(10) << uint32_t(dp.siteId) << setw(15) << dp.freq; 00526 ostr << fixed << setprecision(2) << setw(20) << dispatchToExecTotal << setw(20) << dp.cycles << setw(20) << avgCycles; 00527 ostr << setw(20) << _platform.ToString() << " " << setw(35) << dp.kernelExecDesc.ToString(_platform, ExecDescAlignedFormat()); 00528 ostr << endl; 00529 } 00530 } 00531 else 00532 { 00533 ostr << setw(30) << _name << setw(20) << hex << _hashId << dec << setw(10) << _simd << setw(10) << _type.ToString(); 00534 ostr << setw(10) << "NA" << setw(15) << "NA"; 00535 ostr << setw(20) << "NA" << setw(20) << "NA" << setw(20) << "NA"; 00536 ostr << setw(20) << "NA" << " " << setw(35) << "NA"; 00537 } 00538 ostr << endl; 00539 return ostr.str(); 00540 } 00541 /* Passes the kernel name, unique name and stored assembly text to DumpKernelAsmText. */ 00542 void LatencyKernelProfile::DumpAsm() const 00543 { 00544 DumpKernelAsmText(_name, _uniqueName, _asmText); 00545 } 00546 00547 // Define DETACHED_LATENCY to use Latency functionality in a different tool 00548 #if !defined (DETACHED_LATENCY) 00549 /* ============================================================================================= */ 00550 // GTPin_Entry 00551 /* ============================================================================================= */ 00552 EXPORT_C_FUNC void GTPin_Entry(int argc, const char *argv[]) 00553 { 00554 ConfigureGTPin(argc, argv); 00555 Latency::Instance()->Register(); 00556 atexit(Latency::OnFini); 00557 } 00558 #endif
(Back to the list of all GTPin Sample Tools)
Copyright (C) 2013-2025 Intel Corporation
SPDX-License-Identifier: MIT
1.7.4