Skip to content

Latest commit

 

History

History
136 lines (127 loc) · 8.62 KB

README.md

File metadata and controls

136 lines (127 loc) · 8.62 KB

GPU PerfMon Read

Overview

This sample is a simple LD_PRELOAD-based tool that collects performance monitoring (PM) and timestamp register values per basic block for each kernel executed on Intel(R) Processor Graphics.

As a result, an assembly listing will be printed for each kernel, annotated per basic block with the percentage of cycles in which the PM register was incremented (e.g., for the EU Stall event, it will be the percentage of cycles the EU was stalled):

=== GEMM (runs 4 times) ===
[   0.00%] 0x0000: (W)      mov (8|M0)               r5.0<1>:ud    r0.0<1;1,0>:ud
[       -] 0x0010: (W)      or (1|M0)                cr0.0<1>:ud   cr0.0<0;1,0>:ud   0x4C0:uw         {Switch}
[       -] 0x0020: (W)      mul (1|M0)               r6.0<1>:d     r9.3<0;1,0>:d     r5.6<0;1,0>:d
[       -] 0x0030: (W)      mul (1|M0)               r6.1<1>:d     r9.2<0;1,0>:d     r5.1<0;1,0>:d
[       -] 0x0040: (W)      cmp (16|M0)   (gt)f1.0   null<1>:d     r8.6<0;1,0>:d     0:w
[       -] 0x0050: (W)      cmp (16|M16)  (gt)f1.0   null<1>:d     r8.6<0;1,0>:d     0:w
[       -] 0x0060:          add (16|M0)              r10.0<1>:d    r6.0<0;1,0>:d     r3.0<16;16,1>:uw
[       -] 0x0070:          add (16|M16)             r3.0<1>:d     r6.0<0;1,0>:d     r4.0<16;16,1>:uw
[       -] 0x0080:          add (16|M0)              r14.0<1>:d    r6.1<0;1,0>:d     r1.0<16;16,1>:uw
[       -] 0x0090:          add (16|M16)             r12.0<1>:d    r6.1<0;1,0>:d     r2.0<16;16,1>:uw
[       -] 0x00A0:          add (16|M0)              r10.0<1>:d    r10.0<8;8,1>:d    r7.1<0;1,0>:d    {Compacted}
[       -] 0x00A8:          add (16|M16)             r3.0<1>:d     r3.0<8;8,1>:d     r7.1<0;1,0>:d
[       -] 0x00B8:          add (16|M0)              r26.0<1>:d    r14.0<8;8,1>:d    r7.0<0;1,0>:d    {Compacted}
[       -] 0x00C0:          add (16|M16)             r16.0<1>:d    r12.0<8;8,1>:d    r7.0<0;1,0>:d
[       -] 0x00D0:          mul (16|M0)              r20.0<1>:d    r10.0<8;8,1>:d    r8.6<0;1,0>:d    {Compacted}
[       -] 0x00D8:          mul (16|M16)             r14.0<1>:d    r3.0<8;8,1>:d     r8.6<0;1,0>:d
[       -] 0x00E8: (W&f1.0) jmpi                                 L296

[   0.00%] 0x00F8:          mov (16|M0)              r116.0<1>:f   0x0:f
[       -] 0x0108:          mov (16|M16)             r18.0<1>:f    0x0:f
[       -] 0x0118: (W)      jmpi                                 L656

[   0.00%] 0x0128:          mov (16|M0)              r116.0<1>:f   0x0:f
[       -] 0x0138:          mov (16|M16)             r18.0<1>:f    0x0:f
[       -] 0x0148: (W)      mov (1|M0)               r4.0<1>:d     0:w

[  26.23%] 0x0158: (W)      mul (1|M0)               r4.1<1>:d     r4.0<0;1,0>:d     r8.6<0;1,0>:d
[       -] 0x0168:          add (16|M0)              r10.0<1>:d    r20.0<8;8,1>:d    r4.0<0;1,0>:d    {Compacted}
[       -] 0x0170:          add (16|M16)             r2.0<1>:d     r14.0<8;8,1>:d    r4.0<0;1,0>:d
[       -] 0x0180: (W)      add (1|M0)               r4.0<1>:d     r4.0<0;1,0>:d     1:w
[       -] 0x0190:          add (16|M0)              r12.0<1>:d    r4.1<0;1,0>:d     r26.0<8;8,1>:d   {Compacted}
[       -] 0x0198:          add (16|M16)             r6.0<1>:d     r4.1<0;1,0>:d     r16.0<8;8,1>:d
[       -] 0x01A8:          shl (16|M0)              r10.0<1>:d    r10.0<8;8,1>:d    2:w
[       -] 0x01B8:          shl (16|M16)             r2.0<1>:d     r2.0<8;8,1>:d     2:w
[       -] 0x01C8:          shl (16|M0)              r12.0<1>:d    r12.0<8;8,1>:d    2:w
[       -] 0x01D8:          shl (16|M16)             r6.0<1>:d     r6.0<8;8,1>:d     2:w
[       -] 0x01E8:          add (16|M0)              r10.0<1>:d    r10.0<8;8,1>:d    r8.7<0;1,0>:d    {Compacted}
[       -] 0x01F0:          add (16|M16)             r2.0<1>:d     r2.0<8;8,1>:d     r8.7<0;1,0>:d
[       -] 0x0200:          add (16|M0)              r12.0<1>:d    r12.0<8;8,1>:d    r9.0<0;1,0>:d    {Compacted}
[       -] 0x0208:          add (16|M16)             r6.0<1>:d     r6.0<8;8,1>:d     r9.0<0;1,0>:d
[       -] 0x0218:          send (16|M0)             r22:w    r10     0xC         0x4205E00  //  wr:2+?, rd:2, Untyped Surface Read msc:30, to bti 0
[       -] 0x0228: (W)      cmp (16|M0)   (lt)f0.0   null<1>:d     r4.0<0;1,0>:d     r8.6<0;1,0>:d    {Compacted}
[       -] 0x0230:          send (16|M16)            r24:w    r2      0xC         0x4205E00  //  wr:2+?, rd:2, Untyped Surface Read msc:30, to bti 0
[       -] 0x0240:          send (16|M0)             r109:w   r12     0xC         0x4205E01  //  wr:2+?, rd:2, Untyped Surface Read msc:30, to bti 1
[       -] 0x0250:          send (16|M16)            r107:w   r6      0xC         0x4205E01  //  wr:2+?, rd:2, Untyped Surface Read msc:30, to bti 1
[       -] 0x0260: (W)      cmp (16|M16)  (lt)f0.0   null<1>:d     r4.0<0;1,0>:d     r8.6<0;1,0>:d
[       -] 0x0270:          mad (16|M0)              r116.0<1>:f   r116.0<2;1>:f     r22.0<2;1>:f      r109.0<1>:f      {Compacted}
[       -] 0x0278:          mad (16|M16)             r18.0<1>:f    r18.0<2;1>:f      r24.0<2;1>:f      r107.0<1>:f      {Compacted}
[       -] 0x0280: (W&f0.0) jmpi                                 L344

[   0.00%] 0x0290:          add (16|M0)              r6.0<1>:d     r20.0<8;8,1>:d    r26.0<8;8,1>:d   {Compacted}
[       -] 0x0298:          add (16|M16)             r2.0<1>:d     r14.0<8;8,1>:d    r16.0<8;8,1>:d
[       -] 0x02A8: (W)      mov (8|M0)               r112.0<1>:ud  r5.0<8;8,1>:ud                   {Compacted}
[       -] 0x02B0:          shl (16|M0)              r6.0<1>:d     r6.0<8;8,1>:d     2:w
[       -] 0x02C0:          shl (16|M16)             r2.0<1>:d     r2.0<8;8,1>:d     2:w
[       -] 0x02D0:          add (16|M0)              r6.0<1>:d     r6.0<8;8,1>:d     r9.1<0;1,0>:d    {Compacted}
[       -] 0x02D8:          add (16|M16)             r2.0<1>:d     r2.0<8;8,1>:d     r9.1<0;1,0>:d
[       -] 0x02E8:          sends (16|M0)            null:w   r6      r116    0x8C        0x4025E02  //  wr:2+2, rd:0, Untyped Surface Write msc:30, to bti 2
[       -] 0x02F8:          sends (16|M16)           null:w   r2      r18     0x8C        0x4025E02  //  wr:2+2, rd:0, Untyped Surface Write msc:30, to bti 2
[       -] 0x0308: (W)      send (8|M0)              null     r112    0x27        0x2000010  {EOT} //  wr:1+?, rd:0,  end of thread
[       -] 0x0318:          illegal
[       -] 0x0328:          illegal
[       -] 0x0338:          illegal
[       -] 0x0348:          illegal
[       -] 0x0358:          illegal
[       -] 0x0368:          illegal
[       -] 0x0378:          illegal
[       -] 0x0388:          illegal
[       -] 0x0398:          illegal
[       -] 0x03A8:          illegal
Total PM percentage: 26.24%

Supported OS

  • Linux
  • Windows (under development)

Prerequisites

Build and Run

Linux

Run the following commands to build the sample:

cd <pti>/samples/gpu_perfmon_read
mkdir build
cd build
cmake -DCMAKE_BUILD_TYPE=Release [-DGTPIN_PATH=<gtpin>/Profilers] ..
make

Use this command line to run the tool:

./gpu_perfmon_read <target_application>

One may use cl_gemm as target application:

./gpu_perfmon_read ../../cl_gemm/build/cl_gemm

Windows

Use the Microsoft* Visual Studio x64 command prompt to run the following commands and build the sample:

cd <pti>\samples\gpu_perfmon_read
mkdir build
cd build
cmake -G "NMake Makefiles" -DCMAKE_BUILD_TYPE=Release -DGTPIN_PATH=<gtpin>\Profilers -DCMAKE_LIBRARY_PATH=<iga_lib_path> ..
nmake

Use this command line to run the tool:

set PATH=%PATH%;<gtpin>\Profilers\Lib\intel64
gpu_perfmon_read.exe <target_application>

One may use cl_gemm as target application:

set PATH=%PATH%;<gtpin>\Profilers\Lib\intel64
gpu_perfmon_read.exe ..\..\cl_gemm\build\cl_gemm.exe

Note: to build this sample, one may need to generate a *.lib file from the IGA *.dll (see here for details) and pass the path to this *.lib to cmake with -DCMAKE_LIBRARY_PATH.

One may also need to add the actual path to the IGA *.dll to PATH before running the sample, e.g.:

set PATH=%PATH%;<gtpin>\Profilers\Lib\intel64
set PATH=%PATH%;<iga_dll_path>
gpu_perfmon_read.exe ..\..\cl_gemm\build\cl_gemm.exe

One may use the gpu_perfmon_set utility to configure the PM register to collect a particular event before running this tool; two terminals open at the same time may be required (not currently supported on Windows).