copy metrics descriptions to analysis_configs YAML files

Signed-off-by: Peter Park <[email protected]>
ROCm · Jan 13, 2025 · 92a3f11 · 92a3f11
1 parent 7e7d99d
commit 92a3f11
Show file tree

Hide file tree

Showing 11 changed files with 898 additions and 126 deletions.
diff --git a/src/rocprof_compute_soc/analysis_configs/gfx90a/0200_system-speed-of-light.yaml b/src/rocprof_compute_soc/analysis_configs/gfx90a/0200_system-speed-of-light.yaml
@@ -2,7 +2,131 @@
 # Add description/tips for each metric in this section.
 # So it could be shown in hover.
 Metric Description:
-  SALU: &SALU_anchor Scalar Arithmetic Logic Unit
+  Speed-of-Light:
+    VALU FLOPs: &VALU_FLOPs_desc >-
+      The total floating-point operations executed per second on the VALU. This is also presented as
+      a percent of the peak theoretical FLOPs achievable on the specific accelerator. Note: this
+      does not include any floating-point operations from MFMA instructions.
+    VALU IOPs: &VALU_IOPs_desc >-
+      The total integer operations executed per second on the VALU. This is also presented as a
+      percent of the peak theoretical IOPs achievable on the specific accelerator. Note: this does
+      not include any integer operations from MFMA instructions.
+    MFMA FLOPs (BF16): &MFMA_FLOPs_BF16_desc >-
+      The total number of 16-bit brain floating point MFMA operations executed per second. Note:
+      this does not include any 16-bit brain floating point operations from VALU instructions. This
+      is also presented as a percent of the peak theoretical BF16 MFMA operations achievable on the
+      specific accelerator.
+    MFMA FLOPs (F16): &MFMA_FLOPs_F16_desc >-
+      The total number of 16-bit floating point MFMA operations executed per second. Note: this does
+      not include any 16-bit floating point operations from VALU instructions. This is also
+      presented as a percent of the peak theoretical F16 MFMA operations achievable on the specific
+      accelerator.
+    MFMA FLOPs (F32): &MFMA_FLOPs_F32_desc >-
+      The total number of 32-bit floating point MFMA operations executed per second. Note: this does
+      not include any 32-bit floating point operations from VALU instructions. This is also
+      presented as a percent of the peak theoretical F32 MFMA operations achievable on the specific
+      accelerator.
+    MFMA FLOPs (F64): &MFMA_FLOPs_F64_desc >-
+      The total number of 64-bit floating point MFMA operations executed per second. Note: this does
+      not include any 64-bit floating point operations from VALU instructions. This is also
+      presented as a percent of the peak theoretical F64 MFMA operations achievable on the specific
+      accelerator.
+    MFMA IOPs (INT8): &MFMA_IOPs_INT8_desc >-
+      The total number of 8-bit integer MFMA operations executed per second. Note: this does not
+      include any 8-bit integer operations from VALU instructions. This is also presented as a
+      percent of the peak theoretical INT8 MFMA operations achievable on the specific accelerator.
+    SALU Utilization: &SALU_Utilization_desc >-
+      Indicates what percent of the kernel's duration the SALU was busy executing instructions.
+      Computed as the ratio of the total number of cycles spent by the scheduler issuing SALU or
+      SMEM instructions over the total CU cycles.
+    VALU Utilization: &VALU_Utilization_desc >-
+      Indicates what percent of the kernel's duration the VALU was busy executing instructions. Does
+      not include VMEM operations. Computed as the ratio of the total number of cycles spent by the
+      scheduler issuing VALU instructions over the total CU cycles.
+    MFMA Utilization: &MFMA_Utilization_desc >-
+      Indicates what percent of the kernel's duration the MFMA unit was busy executing instructions.
+      Computed as the ratio of the total number of cycles the MFMA was busy over the total CU
+      cycles.
+    VMEM Utilization: &VMEM_Utilization_desc >-
+      Indicates what percent of the kernel's duration the VMEM unit was busy executing instructions,
+      including both global/generic and spill/scratch operations (see the VMEM instruction count
+      metrics for more detail). Does not include VALU operations. Computed as the ratio of the
+      total number of cycles spent by the scheduler issuing VMEM instructions over the total CU
+      cycles.
+    Branch Utilization: &Branch_Utilization_desc >-
+      Indicates what percent of the kernel's duration the branch unit was busy executing
+      instructions. Computed as the ratio of the total number of cycles spent by the scheduler
+      issuing branch instructions over the total CU cycles.
+    VALU Active Threads: &VALU_Active_Threads_desc >-
+      Indicates the average level of divergence within a wavefront over the lifetime of the kernel.
+      The number of work-items that were active in a wavefront during execution of each VALU
+      instruction, time-averaged over all VALU instructions run on all wavefronts in the kernel.
+    IPC: &IPC_desc >-
+      The ratio of the total number of instructions executed on the CU over the total active CU
+      cycles. This is also presented as a percent of the peak theoretical IPC achievable on
+      the specific accelerator.
+    Wavefront Occupancy: &Wavefront_Occupancy_desc >-
+      The time-averaged number of wavefronts resident on the accelerator over the lifetime of the
+      kernel. Note: this metric may be inaccurate for short-running kernels (less than 1 ms). This is
+      also presented as a percent of the peak theoretical occupancy achievable on the specific
+      accelerator.
+    Theoretical LDS Bandwidth: &Theoretical_LDS_Bandwidth_desc >-
+      Indicates the maximum amount of bytes that could have been loaded from, stored to, or
+      atomically updated in the LDS per unit time (see LDS Bandwidth example for more detail). This
+      is also presented as a percent of the peak theoretical LDS bandwidth achievable on the
+      specific accelerator.
+    LDS Bank Conflicts/Access: &LDS_Bank_Conflicts_Access_desc >-
+      The ratio of the number of cycles spent in the LDS scheduler due to bank conflicts (as
+      determined by the conflict resolution hardware) to the base number of cycles that would be
+      spent in the LDS scheduler in a completely uncontended case. This is also presented in
+      normalized form (i.e., the Bank Conflict Rate).
+    vL1D Cache Hit Rate: &vL1D_Cache_Hit_Rate_desc >-
+      The ratio of the number of vL1D cache line requests that hit in vL1D cache over the total
+      number of cache line requests to the vL1D cache RAM.
+    vL1D Cache BW: &vL1D_Cache_BW_desc >-
+      The number of bytes looked up in the vL1D cache as a result of VMEM instructions per unit
+      time. The number of bytes is calculated as the number of cache lines requested multiplied by
+      the cache line size. This value does not consider partial requests; so, for example, if only a
+      single value is requested in a cache line, the data movement will still be counted as a full
+      cache line. This is also presented as a percent of the peak theoretical bandwidth achievable
+      on the specific accelerator.
+    L2 Cache Hit Rate: &L2_Cache_Hit_Rate_desc >-
+      The ratio of the number of L2 cache line requests that hit in the L2 cache over the total
+      number of incoming cache line requests to the L2 cache.
+    L2 Cache BW: &L2_Cache_BW_desc >-
+      The number of bytes looked up in the L2 cache per unit time. The number of bytes is calculated
+      as the number of cache lines requested multiplied by the cache line size. This value does not
+      consider partial requests; so, for example, if only a single value is requested in a cache
+      line, the data movement will still be counted as a full cache line. This is also presented as
+      a percent of the peak theoretical bandwidth achievable on the specific accelerator.
+    L2-Fabric Read BW: &L2-Fabric_Read_BW_desc >-
+      The number of bytes read by the L2 over the Infinity Fabric™ interface per unit time. This is
+      also presented as a percent of the peak theoretical bandwidth achievable on the specific
+      accelerator.
+    L2-Fabric Write BW: &L2-Fabric_Write_BW_desc >-
+      The number of bytes sent by the L2 over the Infinity Fabric interface by write and atomic
+      operations per unit time. This is also presented as a percent of the peak theoretical
+      bandwidth achievable on the specific accelerator.
+    L2-Fabric Read Latency: &L2-Fabric_Read_Latency_desc >-
+      The time-averaged number of cycles read requests spent in Infinity Fabric before data was
+      returned to the L2.
+    L2-Fabric Write Latency: &L2-Fabric_Write_Latency_desc >-
+      The time-averaged number of cycles write requests spent in Infinity Fabric before a completion
+      acknowledgement was returned to the L2.
+    sL1D Cache Hit Rate: &sL1D_Cache_Hit_Rate_desc >-
+      The percent of sL1D requests that hit on a previously loaded line in the cache. Calculated as the
+      ratio of the number of sL1D requests that hit over the number of all sL1D requests.
+    sL1D Cache BW: &sL1D_Cache_BW_desc >-
+      The number of bytes looked up in the sL1D cache per unit time. This is also presented as a
+      percent of the peak theoretical bandwidth achievable on the specific accelerator.
+    L1I Hit Rate: &L1I_Hit_Rate_desc >-
+      The percent of L1I requests that hit on a previously loaded line in the cache. Calculated as the
+      ratio of the number of L1I requests that hit over the number of all L1I requests.
+    L1I BW: &L1I_BW_desc >-
+      The number of bytes looked up in the L1I cache per unit time. This is also presented as a
+      percent of the peak theoretical bandwidth achievable on the specific accelerator.
+    L1I Fetch Latency: &L1I_Fetch_Latency_desc >-
+      The average number of cycles spent to fetch instructions to a CU.
 
 # Define the panel properties and properties of each metric in the panel.
 Panel Config:

diff --git a/src/rocprof_compute_soc/analysis_configs/gfx90a/0500_command-processor.yaml b/src/rocprof_compute_soc/analysis_configs/gfx90a/0500_command-processor.yaml
@@ -2,6 +2,37 @@
 # Add description/tips for each metric in this section.
 # So it could be shown in hover.
 Metric Description:
+  Command Processor Fetcher:
+    CPF Utilization: &CPF_Utilization_desc >-
+      Percent of total cycles where the CPF was busy actively doing any work. The ratio of CPF busy
+      cycles over total cycles counted by the CPF.
+    CPF Stall: &CPF_Stall_desc >-
+      Percent of CPF busy cycles where the CPF was stalled for any reason.
+    CPF-L2 Utilization: &CPF-L2_Utilization_desc >-
+      Percent of total cycles counted by the CPF-L2 interface where the CPF-L2 interface was active
+      doing any work. The ratio of CPF-L2 busy cycles over total cycles counted by the CPF-L2.
+    CPF-L2 Stall: &CPF-L2_Stall_desc >-
+      Percent of CPF-L2 busy cycles where the CPF-L2 interface was stalled for any reason.
+    CPF-UTCL1 Stall: &CPF-UTCL1_Stall_desc >-
+      Percent of CPF busy cycles where the CPF was stalled by address translation.
+  Packet Processor:
+    CPC Utilization: &CPC_Utilization_desc >-
+      Percent of total cycles where the CPC was busy actively doing any work. The ratio of CPC busy
+      cycles over total cycles counted by the CPC.
+    CPC Stall Rate: &CPC_Stall_Rate_desc >-
+      Percent of CPC busy cycles where the CPC was stalled for any reason.
+    CPC Packet Decoding Utilization: &CPC_Packet_Decoding_Utilization_desc >-
+      Percent of CPC busy cycles spent decoding commands for processing.
+    CPC-Workgroup Manager Utilization: &CPC-Workgroup_Manager_Utilization_desc >-
+      Percent of CPC busy cycles spent dispatching workgroups to the workgroup manager.
+    CPC-L2 Utilization: &CPC-L2_Utilization_desc >-
+      Percent of total cycles counted by the CPC-L2 interface where the CPC-L2 interface was active
+      doing any work.
+    CPC-UTCL1 Stall: &CPC-UTCL1_Stall_desc >-
+      Percent of CPC busy cycles where the CPC was stalled by address translation.
+    CPC-UTCL2 Utilization: &CPC-UTCL2_Utilization_desc >-
+      Percent of total cycles counted by the CPC's L2 address translation interface where the CPC
+      was busy doing address translation work.
 
 # Define the panel properties and properties of each metric in the panel.
 Panel Config:
@@ -27,7 +58,7 @@ Panel Config:
             max: MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
               if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None))
             unit: pct
-            tips:
+            tips: *CPF_Utilization_desc
           CPF Stall:
             avg: AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
               != 0) else None))
@@ -36,7 +67,7 @@ Panel Config:
             max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
               != 0) else None))
             unit: pct
-            tips:
+            tips: *CPF_Stall_desc
           CPF-L2 Utilization:
             avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
               if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None))
@@ -45,7 +76,7 @@ Panel Config:
             max: MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
               if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None))
             unit: pct
-            tips:
+            tips: *CPF-L2_Utilization_desc
           CPF-L2 Stall:
             avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY
               != 0) else None))
@@ -54,7 +85,7 @@ Panel Config:
             max: MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY
               != 0) else None))
             unit: pct
-            tips:
+            tips: *CPF-L2_Stall_desc
           CPF-UTCL1 Stall:
             avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
               != 0) else None)
@@ -63,7 +94,7 @@ Panel Config:
             max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
               != 0) else None)
             unit: pct
-            tips:
+            tips: *CPF-UTCL1_Stall_desc
 
     - metric_table:
         id: 502
@@ -84,7 +115,7 @@ Panel Config:
             max: MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
               if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
             unit: pct
-            tips:
+            tips: *CPC_Utilization_desc
           CPC Stall Rate:
             avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
               != 0) else None))
@@ -93,19 +124,19 @@ Panel Config:
             max: MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
               != 0) else None))
             unit: pct
-            tips:
+            tips: *CPC_Stall_Rate_desc
           CPC Packet Decoding Utilization:
             avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None)
             min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None)
             max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None)
             unit: pct
-            tips:
+            tips: *CPC_Packet_Decoding_Utilization_desc
           CPC-Workgroup Manager Utilization:
             avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None)
             min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None)
             max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None)
             unit: Pct
-            tips:
+            tips: *CPC-Workgroup_Manager_Utilization_desc
           CPC-L2 Utilization:
             avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
               if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
@@ -114,7 +145,7 @@ Panel Config:
             max: MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
               if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
             unit: pct
-            tips:
+            tips: *CPC-L2_Utilization_desc
           CPC-UTCL1 Stall:
             avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
               != 0) else None)
@@ -123,7 +154,7 @@ Panel Config:
             max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
               != 0) else None)
             unit: pct
-            tips:
+            tips: *CPC-UTCL1_Stall_desc
           CPC-UTCL2 Utilization:
             avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
               if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
@@ -132,4 +163,4 @@ Panel Config:
             max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
               if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
             unit: pct
-            tips:
+            tips: *CPC-UTCL2_Utilization_desc