From ba71b21c3eeb461ed82b4c92709c654a06eb00f7 Mon Sep 17 00:00:00 2001
From: Pablo Gonzalez
Date: Tue, 21 Jan 2025 12:05:12 -0500
Subject: [PATCH 1/2] Low latency checks to submission checker and report low latency (#2017)

* Add low latency checks to submission checker

* Set llama2-70b-low-latency as a different model to avoid uncommenting config

* Replace: low-latency->interactive

* Include interactive mode in llama2 reference + README

* Update README to have interactive server scenario + remove unnecessary changes

* Update test-submission-checker.yml

* Update test-submission-checker.yml

---------

Co-authored-by: Arjun Suresh
---
 .github/workflows/test-submission-checker.yml |   4 +-
 language/llama2-70b/README.md                 |  18 +++
 language/llama2-70b/main.py                   |  11 +-
 language/llama2-70b/user.conf                 |   2 -
 loadgen/mlperf.conf                           |   8 ++
 tools/submission/generate_final_report.py     |   4 +
 tools/submission/submission_checker.py        | 114 +++++++++++++++---
 7 files changed, 135 insertions(+), 26 deletions(-)

diff --git a/.github/workflows/test-submission-checker.yml b/.github/workflows/test-submission-checker.yml
index b7ab88ddb..5b7637d7d 100644
--- a/.github/workflows/test-submission-checker.yml
+++ b/.github/workflows/test-submission-checker.yml
@@ -32,7 +32,7 @@ jobs:
     - name: Install dependencies
       run: |
        python3 -m pip install cm4mlops
-       git clone https://github.com/mlcommons/inference_results_v4.1 --depth 1
+       git clone https://github.com/mlcommons/mlperf_inference_unofficial_submissions_v5.0 --depth 1
     - name: Test MLPerf inference submission checker
       run: |
-       cm run script --tags=run,mlperf,inference,submission,checker --adr.inference-src.tags=_branch.$PR_HEAD_REF,_repo.${{ github.event.pull_request.head.repo.html_url }} --adr.inference-src.version=custom --input=`pwd`/inference_results_v4.1 --src_version=v4.1 --extra_args=" --skip-extra-accuracy-files-check" --quiet
+       cm run script --tags=run,mlperf,inference,submission,checker --adr.inference-src.tags=_branch.$PR_HEAD_REF,_repo.${{ github.event.pull_request.head.repo.html_url }} --adr.inference-src.version=custom --input=`pwd`/mlperf_inference_unofficial_submissions_v5.0 --src_version=v5.0 --extra_args=" --skip-extra-files-in-root-check --skip-extra-accuracy-files-check" --quiet
diff --git a/language/llama2-70b/README.md b/language/llama2-70b/README.md
index 07d355344..5268863ac 100644
--- a/language/llama2-70b/README.md
+++ b/language/llama2-70b/README.md
@@ -245,3 +245,21 @@ scale from a 0.0-1.0 scale):
 - Tokens per sample: 294.45
 
 This was run on a DGX-H100 node. Total runtime was ~4.5 days.
+
+# Run llama2-70b-interactive benchmark
+
+For official Llama2-70b submissions it is also possible to submit in the interactive category. This sets stricter latency requirements for Time to First Token (TTFT) and Time per Output Token (TPOT). Specifically, the interactive category requires loadgen to enforce `ttft <= 450ms` and `tpot <= 40ms`.
+
+In order to run the interactive category, it is sufficient to set the flag `--lg-model-name` to `llama2-70b-interactive` when calling `main.py` to run the benchmark. For example, to run the server scenario in interactive mode:
+
+```
+python -u main.py --scenario Server \
+        --model-path ${CHECKPOINT_PATH} \
+        --mlperf-conf mlperf.conf \
+        --user-conf user.conf \
+        --total-sample-count 24576 \
+        --device cpu \
+        --dataset-path ${DATASET_PATH} \
+        --output-log-dir server-logs \
+        --lg-model-name llama2-70b-interactive
+```
diff --git a/language/llama2-70b/main.py b/language/llama2-70b/main.py
index 1d795685f..84ccf849a 100644
--- a/language/llama2-70b/main.py
+++ b/language/llama2-70b/main.py
@@ -120,7 +120,13 @@ def get_args():
         default=None,
         help="Specify an api endpoint call to use api mode",
     )
-
+    parser.add_argument(
+        "--lg-model-name",
+        type=str,
+        default="llama2-70b",
+        choices=["llama2-70b", "llama2-70b-interactive"],
+        help="Model name (specified in the LLM server)",
+    )
     args = parser.parse_args()
     return args
 
@@ -146,8 +152,7 @@ def main():
     settings = lg.TestSettings()
     settings.scenario = scenario_map[args.scenario.lower()]
     # mlperf.conf is automatically loaded by the loadgen
-    # settings.FromConfig(args.mlperf_conf, "llama2-70b", args.scenario)
-    settings.FromConfig(args.user_conf, "llama2-70b", args.scenario)
+    settings.FromConfig(args.user_conf, args.lg_model_name, args.scenario)
 
     if args.accuracy:
         settings.mode = lg.TestMode.AccuracyOnly
diff --git a/language/llama2-70b/user.conf b/language/llama2-70b/user.conf
index 945082fe9..bb97c437a 100644
--- a/language/llama2-70b/user.conf
+++ b/language/llama2-70b/user.conf
@@ -9,5 +9,3 @@
 *.Server.target_qps = 0.5
 *.Server.min_duration = 120000
 *.Server.min_query_count = 100
-
-llama2-70b.Server.sample_concatenate_permutation = 1
\ No newline at end of file
diff --git a/loadgen/mlperf.conf b/loadgen/mlperf.conf
index 836b42564..582381bbd 100644
--- a/loadgen/mlperf.conf
+++ b/loadgen/mlperf.conf
@@ -15,6 +15,7 @@ rnnt.*.performance_sample_count_override = 2513
 gptj.*.performance_sample_count_override = 13368
 mixtral-8x7b.*.performance_sample_count_override = 15000
 llama2-70b.*.performance_sample_count_override = 24576
+llama2-70b-interactive.*.performance_sample_count_override = 24576
 llama3_1-405b.*.performance_sample_count_override = 8313
 stable-diffusion-xl.*.performance_sample_count_override = 5000
 rgat.*.performance_sample_count_override = 788379
@@ -49,6 +50,7 @@ rgat.*.sample_concatenate_permutation = 1
 # LLM benchmarks have non-uniform inputs and outputs, and use equal issue mode for all latency scenario
 gptj.*.sample_concatenate_permutation = 1
 llama2-70b.*.sample_concatenate_permutation = 1
+llama2-70b-interactive.*.sample_concatenate_permutation = 1
 mixtral-8x7b.*.sample_concatenate_permutation = 1
 llama3_1-405b.*.sample_concatenate_permutation = 1
 
@@ -66,6 +68,7 @@ gptj.Server.target_latency = 20000
 stable-diffusion-xl.Server.target_latency = 20000
 # Benchmarks that measure token latencies
 llama2-70b.*.use_token_latencies = 1
+llama2-70b-interactive.*.use_token_latencies = 1
 mixtral-8x7b.*.use_token_latencies = 1
 llama3_1-405b.*.use_token_latencies = 1
 # gptj benchmark infers token latencies
@@ -76,6 +79,11 @@ llama2-70b.Server.target_latency = 0
 llama2-70b.Server.ttft_latency = 2000
 llama2-70b.Server.tpot_latency = 200
 
+# Target Latencies for interactive setting
+llama2-70b-interactive.Server.target_latency = 0
+llama2-70b-interactive.Server.ttft_latency = 450
+llama2-70b-interactive.Server.tpot_latency = 40
+
 mixtral-8x7b.Server.target_latency = 0
 mixtral-8x7b.Server.ttft_latency = 2000
 mixtral-8x7b.Server.tpot_latency = 200
diff --git a/tools/submission/generate_final_report.py b/tools/submission/generate_final_report.py
index aa5b36983..da965ef3e 100644
--- a/tools/submission/generate_final_report.py
+++ b/tools/submission/generate_final_report.py
@@ -147,6 +147,8 @@ def main():
             "stable-diffusion-xl",
             "llama2-70b-99",
             "llama2-70b-99.9",
+            "llama2-70b-interactive-99",
+            "llama2-70b-interactive-99.9",
             "mixtral-8x7b",
         ],
         ["SingleStream", "MultiStream", "Server", "Offline"],
@@ -209,6 +211,8 @@ def main():
         "stable-diffusion-xl": ["Server", "Offline"],
         "llama2-70b-99": ["Server", "Offline"],
         "llama2-70b-99.9": ["Server", "Offline"],
+        "llama2-70b-interactive-99": ["Server", "Offline"],
+        "llama2-70b-interactive-99.9": ["Server", "Offline"],
         "mixtral-8x7b": ["Server", "Offline"],
         "rgat": ["Offline"],
         "llama3.1-405b": ["Offline", "Server"]
diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py
index 91cba0f31..264c9b373 100755
--- a/tools/submission/submission_checker.py
+++ b/tools/submission/submission_checker.py
@@ -194,7 +194,6 @@
             "ssd-resnet34": "retinanet",
             "mobilenet": "resnet",
             "resnet50": "resnet",
-            "llama3_1-405b": "llama3.1-405b"
         },
         "seeds": {
             "qsl_rng_seed": 3066443479025735752,
@@ -266,6 +265,8 @@
             "gptj-99.9",
             "llama2-70b-99",
             "llama2-70b-99.9",
+            "llama2-70b-interactive-99",
+            "llama2-70b-interactive-99.9",
             "stable-diffusion-xl",
             "mixtral-8x7b",
             "llama3.1-405b",
@@ -283,6 +284,8 @@
             "gptj-99.9": ["Server", "Offline"],
             "llama2-70b-99": ["Server", "Offline"],
             "llama2-70b-99.9": ["Server", "Offline"],
+            "llama2-70b-interactive-99": ["Server", "Offline"],
+            "llama2-70b-interactive-99.9": ["Server", "Offline"],
             "stable-diffusion-xl": ["Server", "Offline"],
             "mixtral-8x7b": ["Server", "Offline"],
             "llama3.1-405b": ["Server", "Offline"],
@@ -314,6 +317,8 @@
             "gptj-99.9": ["SingleStream", "Offline", "Server"],
             "llama2-70b-99": ["Server", "Offline"],
             "llama2-70b-99.9": ["Server", "Offline"],
+            "llama2-70b-interactive-99": ["Server", "Offline"],
+            "llama2-70b-interactive-99.9": ["Server", "Offline"],
             "stable-diffusion-xl": ["SingleStream", "Offline", "Server"],
             "mixtral-8x7b": ["Server", "Offline"],
             "llama3.1-405b": ["Server", "Offline"],
@@ -370,6 +375,26 @@
                 "TOKENS_PER_SAMPLE",
                 294.45 * 0.9,
             ),
+            "llama2-70b-interactive-99": (
+                "ROUGE1",
+                44.4312 * 0.99,
+                "ROUGE2",
+                22.0352 * 0.99,
+                "ROUGEL",
+                28.6162 * 0.99,
+                "TOKENS_PER_SAMPLE",
+                294.45 * 0.9,
+            ),
+            "llama2-70b-interactive-99.9": (
+                "ROUGE1",
+                44.4312 * 0.999,
+                "ROUGE2",
+                22.0352 * 0.999,
+                "ROUGEL",
+                28.6162 * 0.999,
+                "TOKENS_PER_SAMPLE",
+                294.45 * 0.9,
+            ),
             "stable-diffusion-xl": (
                 "CLIP_SCORE",
                 31.68631873,
@@ -409,6 +434,8 @@
             ),
             "llama2-70b-99": ("TOKENS_PER_SAMPLE", 294.45 * 1.1),
             "llama2-70b-99.9": ("TOKENS_PER_SAMPLE", 294.45 * 1.1),
+            "llama2-70b-interactive-99": ("TOKENS_PER_SAMPLE", 294.45 * 1.1),
+            "llama2-70b-interactive-99.9": ("TOKENS_PER_SAMPLE", 294.45 * 1.1),
             "mixtral-8x7b": ("TOKENS_PER_SAMPLE", 145.9 * 1.1),
             "llama3.1-405b": ("TOKENS_PER_SAMPLE", 684.68 * 1.1),
         },
@@ -428,6 +455,8 @@
             "gptj-99.9": 13368,
             "llama2-70b-99": 24576,
             "llama2-70b-99.9": 24576,
+            "llama2-70b-interactive-99": 24576,
+            "llama2-70b-interactive-99.9": 24576,
             "stable-diffusion-xl": 5000,
             "mixtral-8x7b": 15000,
             "llama3.1-405b": 8313,
@@ -439,8 +468,10 @@
         # not really needed
         "model_mapping": {
             # map model names to the official mlperf model class
+            "ssd-resnet34": "retinanet",
             "mobilenet": "resnet",
             "resnet50": "resnet",
+            "llama3_1-405b": "llama3.1-405b",
         },
         "seeds": {
             # TODO: Update random seeds
@@ -459,6 +490,8 @@
             "stable-diffusion-xl": {"Server": 20000000000},
"llama2-70b-99": {"Server": 20000000000}, "llama2-70b-99.9": {"Server": 20000000000}, + "llama2-70b-interactive-99": {"Server": 20000000000}, + "llama2-70b-interactive-99.9": {"Server": 20000000000}, "mixtral-8x7b": {"Server": 20000000000}, "llama3.1-405b": {"Server": 60000000000} }, @@ -485,6 +518,8 @@ "gptj-99.9": {"SingleStream": 1024, "Server": 270336, "Offline": 1}, "llama2-70b-99": {"SingleStream": 1024, "Server": 270336, "Offline": 1}, "llama2-70b-99.9": {"SingleStream": 1024, "Server": 270336, "Offline": 1}, + "llama2-70b-interactive-99": {"SingleStream": 1024, "Server": 270336, "Offline": 1}, + "llama2-70b-interactive-99.9": {"SingleStream": 1024, "Server": 270336, "Offline": 1}, "stable-diffusion-xl": { "SingleStream": 1024, "Server": 270336, @@ -578,6 +613,8 @@ "gptj-99.9": 13368, "llama2-70b-99": 24576, "llama2-70b-99.9": 24576, + "llama2-70b-interactive-99": 24576, + "llama2-70b-interactive-99.9": 24576, "stable-diffusion-xl": 5000, "mixtral-8x7b": 15000, "llama3.1-405b": 8313, @@ -645,6 +682,14 @@ "Offline": "result_tokens_per_second", "Server": "result_completed_tokens_per_second", }, + "llama2-70b-interactive-99": { + "Offline": "result_tokens_per_second", + "Server": "result_completed_tokens_per_second", + }, + "llama2-70b-interactive-99.9": { + "Offline": "result_tokens_per_second", + "Server": "result_completed_tokens_per_second", + }, "gptj-99": { "Offline": "result_inferred_tokens_per_second", "Server": "result_inferred_completed_tokens_per_second", @@ -666,14 +711,20 @@ LLM_LATENCY_LIMITS = { "llama2-70b-99": { - "conversational": {"ttft": 2000 * 1000000, "tpot": 200 * 1000000} + "ttft": 2000 * 1000000, "tpot": 200 * 1000000 }, "llama2-70b-99.9": { - "conversational": {"ttft": 2000 * 1000000, "tpot": 200 * 1000000} + "ttft": 2000 * 1000000, "tpot": 200 * 1000000 + }, + "llama2-70b-interactive-99": { + "ttft": 450 * 1000000, "tpot": 40 * 1000000 + }, + "llama2-70b-interactive-99.9": { + "ttft": 450 * 1000000, "tpot": 40 * 1000000 }, - "mixtral-8x7b": {"conversational": {"ttft": 2000 * 1000000, "tpot": 200 * 1000000}}, + "mixtral-8x7b": {"ttft": 2000 * 1000000, "tpot": 200 * 1000000}, "llama3.1-405b": { - "conversational": {"ttft": 6000 * 1000000, "tpot": 175 * 1000000} + "ttft": 6000 * 1000000, "tpot": 175 * 1000000 }, } @@ -956,6 +1007,8 @@ def requires_equal_issue(self, model, division): "gptj-99.9", "llama2-70b-99", "llama2-70b-99.9", + "llama2-70b-interactive-99", + "llama2-70b-interactive-99.9", "mixtral-8x7b", "llama3.1-405b", "rgat", @@ -1253,25 +1306,29 @@ def extra_check_llm(mlperf_log, scenario, model): if mlperf_log["requested_use_token_latencies"]: if scenario == "Offline": # For offline no further checks are necessary - return None, True + return True else: - for constraint, limits in LLM_LATENCY_LIMITS[model].items(): - if ( - mlperf_log["result_first_token_99.00_percentile_latency_ns"] - < limits["ttft"] - and mlperf_log["result_time_per_output_token_99.00_percentile_ns"] - < limits["tpot"] - ): - return constraint, True + limits = LLM_LATENCY_LIMITS[model] + if ( + mlperf_log["result_first_token_99.00_percentile_latency_ns"] + < limits["ttft"] + and mlperf_log["result_time_per_output_token_99.00_percentile_ns"] + < limits["tpot"] + ): + return True else: log.error( f"use_token_latencies flag needs to be enabled for Llama2 benchmark") - return None, False + return False log.error( - f'Failed Llama2 extra check for TTFT and TPOT. 
+        'Failed extra check for TTFT and TPOT. Obtained: TTFT 99-tile: %.4f, TPOT 99-tile: %.4f. Required: TTFT 99-tile: %.4f, TPOT 99-tile: %.4f',
+        mlperf_log["result_first_token_99.00_percentile_latency_ns"],
+        mlperf_log["result_time_per_output_token_99.00_percentile_ns"],
+        limits["ttft"],
+        limits["tpot"]
     )
-    return None, False
+    return False
 
 
 def get_performance_metric(
@@ -1340,9 +1397,11 @@ def check_performance_dir(
     )
 
     if model in ["llama2-70b-99", "llama2-70b-99.9",
+                 "llama2-70b-interactive-99", "llama2-70b-interactive-99.9",
                  "mixtral-8x7b", "llama3.1-405b"]:
-        llama_constraint, is_valid = extra_check_llm(
+        llm_is_valid = extra_check_llm(
             mlperf_log, scenario_fixed, model)
+        is_valid = (llm_is_valid and is_valid)
 
     latency_99_percentile = mlperf_log["result_99.00_percentile_latency_ns"]
     latency_mean = mlperf_log["result_mean_latency_ns"]
@@ -1874,6 +1933,18 @@ def log_result(
             "Offline": "Tokens/s",
             "Server": "Tokens/s",
         },
+        "llama2-70b-interactive-99": {
+            "SingleStream": "Latency (ms)",
+            "MultiStream": "Latency (ms)",
+            "Offline": "Tokens/s",
+            "Server": "Tokens/s",
+        },
+        "llama2-70b-interactive-99.9": {
+            "SingleStream": "Latency (ms)",
+            "MultiStream": "Latency (ms)",
+            "Offline": "Tokens/s",
+            "Server": "Tokens/s",
+        },
         "mixtral-8x7b": {
             "SingleStream": "Latency (ms)",
             "MultiStream": "Latency (ms)",
@@ -2398,7 +2469,7 @@ def log_result(
                 perf_path,
                 scenario_fixed,
                 division,
-                system_json,
+                system_json
             )
             if is_inferred:
                 inferred = 1
@@ -2966,6 +3037,8 @@ def check_compliance_dir(
         "gptj-99.9",
         "llama2-70b-99",
         "llama2-70b-99.9",
+        "llama2-70b-interactive-99",
+        "llama2-70b-interactive-99.9",
         "mixtral-8x7b",
         "llama3.1-405b",
         "rgat",
@@ -2987,6 +3060,8 @@ def check_compliance_dir(
         "gptj-99.9",
         "llama2-70b-99",
         "llama2-70b-99.9",
+        "llama2-70b-interactive-99",
+        "llama2-70b-interactive-99.9",
         "mixtral-8x7b",
         "llama3.1-405b",
     ]:
         test_list.remove("TEST01")
@@ -2997,6 +3072,7 @@ def check_compliance_dir(
         test_list.remove("TEST04")
 
     if model in ["llama2-70b-99", "llama2-70b-99.9",
+                 "llama2-70b-interactive-99", "llama2-70b-interactive-99.9",
                  "mixtral-8x7b", "llama3.1-405b"]:
         test_list.append("TEST06")
 

From 2ed0a3e0c8a973ad4e9cb43e4c419d1ac35f4ff7 Mon Sep 17 00:00:00 2001
From: mrmhodak
Date: Tue, 21 Jan 2025 17:05:26 +0000
Subject: [PATCH 2/2] Increment version to 5.0.12

---
 loadgen/VERSION.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/loadgen/VERSION.txt b/loadgen/VERSION.txt
index 175775885..718db1c46 100644
--- a/loadgen/VERSION.txt
+++ b/loadgen/VERSION.txt
@@ -1 +1 @@
-5.0.11
+5.0.12
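
As a rough local sanity check, the interactive TTFT/TPOT gate enforced by the submission checker above can be approximated against a single Server run's detail log before packaging a submission. The snippet below is only an illustrative sketch, not part of the patch: it assumes the `MLPerfLog` helper from `tools/submission/log_parser.py` is importable (e.g. `tools/submission` on `PYTHONPATH`) and that the detail log sits under the `server-logs` directory used in the README example; the limit values mirror `llama2-70b-interactive.Server.ttft_latency` and `tpot_latency` from `mlperf.conf`.

```python
# Illustrative sketch: replicate the llama2-70b-interactive Server latency gate
# locally. Assumes tools/submission is on PYTHONPATH and the run was done with
# --output-log-dir server-logs, as in the README example above.
from log_parser import MLPerfLog

TTFT_LIMIT_NS = 450 * 1_000_000  # llama2-70b-interactive.Server.ttft_latency (ms -> ns)
TPOT_LIMIT_NS = 40 * 1_000_000   # llama2-70b-interactive.Server.tpot_latency (ms -> ns)

mlperf_log = MLPerfLog("server-logs/mlperf_log_detail.txt")
ttft = mlperf_log["result_first_token_99.00_percentile_latency_ns"]
tpot = mlperf_log["result_time_per_output_token_99.00_percentile_ns"]

print(f"TTFT p99: {ttft / 1e6:.2f} ms (limit 450 ms)")
print(f"TPOT p99: {tpot / 1e6:.2f} ms (limit 40 ms)")
print("PASS" if ttft < TTFT_LIMIT_NS and tpot < TPOT_LIMIT_NS else "FAIL")
```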