add ta and td descriptions

ROCm · Nov 7, 2024 · ab47f7b · ab47f7b
1 parent 4cd346b
commit ab47f7b
Show file tree

Hide file tree

Showing 6 changed files with 568 additions and 136 deletions.
diff --git a/src/rocprof_compute_soc/analysis_configs/gfx906/1500_TA_and_TD.yaml b/src/rocprof_compute_soc/analysis_configs/gfx906/1500_TA_and_TD.yaml
@@ -2,6 +2,78 @@
 # Add description/tips for each metric in this section.
 # So it could be shown in hover.
 Metric Description:
+  Address Processing Unit:
+    Busy: &TA_Busy_tip >-
+      Percent of the total CU cycles the address processor was busy.
+    Address Stall: &TA_Address_Stall_tip >-
+      Percent of the total CU cycles the address processor was stalled from sending address requests
+      further into the vL1D pipeline.
+    Data Stall: &TA_Data_Stall_tip >-
+      Percent of the total CU cycles the address processor was stalled from sending write/atomic
+      data further into the vL1D pipeline.
+    Data-Processor → Address Stall: &TA_Data-Processor_Address_Stall_tip >-
+      Percent of the total CU cycles the address processor was stalled waiting to send command data to
+      the data processor.
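+    Total Instructions: &TA_Total_Instructions_tip "The total number of vL1D memory instructions executed on all compute units on the accelerator, per normalization unit."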
+    Global/Generic Instructions: &TA_Global_Generic_Instructions_tip >-
+      The total number of global and generic memory instructions executed on all compute units on
+      the accelerator, per normalization unit.
+    Global/Generic Read Instructions: &TA_Global_Generic_Read_tip >-
+      The total number of global and generic memory read instructions executed on all compute units
+      on the accelerator, per normalization unit.
+    Global/Generic Write Instructions: &TA_Global_Generic_Write_tip >-
+      The total number of global and generic memory write instructions executed on all compute units
+      on the accelerator, per normalization unit.
+    Global/Generic Atomic Instructions: &TA_Global_Generic_Atomic_tip >-
+      The total number of global and generic memory atomic (with and without return) instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Spill/Stack Instructions: &TA_Spill_Stack_Instructions_tip >-
+      The total number of spill/stack memory instructions executed on all compute units on the
+      accelerator, per normalization unit.
+    Spill/Stack Read Instructions: &TA_Spill_Stack_Read_tip >-
+      The total number of spill/stack memory read instructions executed on all compute units on the
+      accelerator, per normalization unit.
+    Spill/Stack Write Instructions: &TA_Spill_Stack_Write_tip >-
+      The total number of spill/stack memory write instructions executed on all compute units on the
+      accelerator, per normalization unit.
+    Spill/Stack Atomic Instructions: &TA_Spill_Stack_Atomic_tip >-
+      The total number of spill/stack memory atomic (with and without return) instructions executed
+      on all compute units on the accelerator, per normalization unit. Typically unused, as these
+      memory operations are generally used to implement thread-local storage.
+    Spill/Stack Total Cycles: &TA_Spill_Stack_Total_Cycles_tip >-
+      The number of cycles the address processing unit spent working on spill/stack instructions,
+      per normalization unit.
+    Spill/Stack Coalesced Read: &TA_Spill_Stack_Coalesced_Read_tip >-
+      The number of cycles the address processing unit spent working on coalesced spill/stack read
+      instructions, per normalization unit.
+    Spill/Stack Coalesced Write: &TA_Spill_Stack_Coalesced_Write_tip >-
+      The number of cycles the address processing unit spent working on coalesced spill/stack write
+      instructions, per normalization unit.
+  Data-Return Path:
+    Data-Return Busy: &TD_Busy_tip >-
+      Percent of the total CU cycles the data-return unit was busy processing or waiting on data to
+      return to the CU.
+    Cache RAM → Data-Return Stall: &TD_Cache_RAM_Data-Return_Stall_tip >-
+      Percent of the total CU cycles the data-return unit was stalled on data to be returned from
+      the vL1D Cache RAM.
+    Workgroup Manager → Data-Return Stall: &TD_Workgroup_Manager_Data-Return_Stall_tip >-
+      Percent of the total CU cycles the data-return unit was stalled by the workgroup manager due
+      to initialization of registers as a part of launching new workgroups.
+    Coalescable Instructions: &TD_Coalescable_Instructions_tip >-
+      The number of instructions submitted to the data-return unit by the address processor that
+      were found to be coalescable, per normalization unit.
+    Read Instructions: &TD_Read_tip >-
+      The number of read instructions submitted to the data-return unit by the address processor
+      summed over all compute units on the accelerator, per normalization unit. This is expected to
+      be the sum of global/generic and spill/stack reads in the address processor.
+    Write Instructions: &TD_Write_tip >-
+      The number of write instructions submitted to the data-return unit by the address processor
+      summed over all compute units on the accelerator, per normalization unit. This is expected to
+      be the sum of global/generic and spill/stack writes in the address processor.
+    Atomic Instructions: &TD_Atomic_tip >-
+      The number of atomic instructions submitted to the data-return unit by the address processor
+      summed over all compute units on the accelerator, per normalization unit. This is expected to
+      be the sum of global/generic and spill/stack atomics in the address processor.
 
 # Define the panel properties and properties of each metric in the panel.
 Panel Config:
@@ -24,97 +96,97 @@ Panel Config:
             min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
             max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
             unit: pct
-            tips: 
+            tips: *TA_Busy_tip
           Address Stall:
             avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
             min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
             max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
             unit: pct
-            tips: 
+            tips: *TA_Address_Stall_tip
           Data Stall:
             avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
             min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
             max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
             unit: pct
-            tips: 
+            tips: *TA_Data_Stall_tip
           Data-Processor → Address Stall:
             avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
             min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
             max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
             unit: pct
-            tips: 
+            tips: *TA_Data-Processor_Address_Stall_tip
           Total Instructions:
             avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom))
             min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom))
             max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom))
             unit: (Instructions  + $normUnit)
-            tips: 
+            tips: *TA_Total_Instructions_tip
           Global/Generic Instructions:
             avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
             min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
             max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
             unit: (Instructions  + $normUnit)
-            tips: 
+            tips: *TA_Global_Generic_Instructions_tip
           Global/Generic Read Instructions:
             avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
             min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
             max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
             unit: (Instructions  + $normUnit)
-            tips: 
+            tips: *TA_Global_Generic_Read_tip
           Global/Generic Write Instructions:
             avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
             min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
             max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
             unit: (Instructions  + $normUnit)
-            tips: 
+            tips: *TA_Global_Generic_Write_tip
           Global/Generic Atomic Instructions:
             avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
             min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
             max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
             unit: (Instructions  + $normUnit)
-            tips: 
+            tips: *TA_Global_Generic_Atomic_tip
           Spill/Stack Instructions:
             avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
             min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
             max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
             unit: (Instructions  + $normUnit)
-            tips: 
+            tips: *TA_Spill_Stack_Instructions_tip
           Spill/Stack Read Instructions:
             avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
             min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
             max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
             unit: (Instructions  + $normUnit)
-            tips: 
+            tips: *TA_Spill_Stack_Read_tip
           Spill/Stack Write Instructions:
             avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
             min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
             max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
             unit: (Instructions  + $normUnit)
-            tips: 
+            tips: *TA_Spill_Stack_Write_tip
           Spill/Stack Atomic Instructions:
             avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
             min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
             max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
             unit: (Instructions  + $normUnit)
-            tips: 
+            tips: *TA_Spill_Stack_Atomic_tip
           Spill/Stack Total Cycles:
             avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
             min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
             max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
             unit: (Cycles  + $normUnit)
-            tips: 
+            tips: *TA_Spill_Stack_Total_Cycles_tip
           Spill/Stack Coalesced Read:
             avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
             min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
             max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
             unit: (Cycles  + $normUnit)
-            tips: 
+            tips: *TA_Spill_Stack_Coalesced_Read_tip
           Spill/Stack Coalesced Write:
             avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
             min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
             max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
             unit: (Cycles  + $normUnit)
-            tips:
+            tips: *TA_Spill_Stack_Coalesced_Write_tip
 
     - metric_table:
         id: 1502
@@ -132,13 +204,13 @@ Panel Config:
             min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
             max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
             unit: pct
-            tips: 
+            tips: *TD_Busy_tip
           Cache RAM → Data-Return Stall:
             avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
             min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
             max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
             unit: pct
-            tips: 
+            tips: *TD_Cache_RAM_Data-Return_Stall_tip
           Workgroup manager → Data-Return Stall:
             avg: # No perf counter
             min: # No perf counter
@@ -150,7 +222,7 @@ Panel Config:
             min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom))
             max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom))
             unit: (Instructions  + $normUnit)
-            tips: 
+            tips: *TD_Coalescable_Instructions_tip
           Read Instructions:
             avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
               / $denom))
@@ -159,16 +231,16 @@ Panel Config:
             max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
               / $denom))
             unit: (Instructions  + $normUnit)
-            tips: 
+            tips: *TD_Read_tip
           Write Instructions:
             avg: AVG((TD_STORE_WAVEFRONT_sum / $denom))
             min: MIN((TD_STORE_WAVEFRONT_sum / $denom))
             max: MAX((TD_STORE_WAVEFRONT_sum / $denom))
             unit: (Instructions  + $normUnit)
-            tips: 
+            tips: *TD_Write_tip
           Atomic Instructions:
             avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom))
             min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom))
             max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom))
             unit: (Instructions  + $normUnit)
-            tips: 
+            tips: *TD_Atomic_tip