Low latency checks to submission checker and report low latency (#2017)
* Add low latency checks to submission checker

* Set llama2-70b-low-latency as a different model to avoid uncommenting config

* Replace: low-latency->interactive

* Include interactive mode in llama2 reference + README

* Update README to have interactive server scenario + remove unnecessary changes

* Update test-submission-checker.yml

* Update test-submission-checker.yml

---------

Co-authored-by: Arjun Suresh <[email protected]>
pgmpablo157321 and arjunsuresh authored Jan 21, 2025
1 parent dcd0c3e commit ba71b21
Showing 7 changed files with 135 additions and 26 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/test-submission-checker.yml
@@ -32,7 +32,7 @@ jobs:
- name: Install dependencies
run: |
python3 -m pip install cm4mlops
git clone https://github.com/mlcommons/inference_results_v4.1 --depth 1
git clone https://github.com/mlcommons/mlperf_inference_unofficial_submissions_v5.0 --depth 1
- name: Test MLPerf inference submission checker
run: |
cm run script --tags=run,mlperf,inference,submission,checker --adr.inference-src.tags=_branch.$PR_HEAD_REF,_repo.${{ github.event.pull_request.head.repo.html_url }} --adr.inference-src.version=custom --input=`pwd`/inference_results_v4.1 --src_version=v4.1 --extra_args=" --skip-extra-accuracy-files-check" --quiet
cm run script --tags=run,mlperf,inference,submission,checker --adr.inference-src.tags=_branch.$PR_HEAD_REF,_repo.${{ github.event.pull_request.head.repo.html_url }} --adr.inference-src.version=custom --input=`pwd`/mlperf_inference_unofficial_submissions_v5.0 --src_version=v5.0 --extra_args=" --skip-extra-files-in-root-check --skip-extra-accuracy-files-check" --quiet
18 changes: 18 additions & 0 deletions language/llama2-70b/README.md
@@ -245,3 +245,21 @@ scale from a 0.0-1.0 scale):
- Tokens per sample: 294.45

This was run on a DGX-H100 node. Total runtime was ~4.5 days.

# Run llama2-70b-interactive benchmark

For official Llama2-70b submissions it is also possible to submit in the interactive category. This sets stricter latency requirements for Time to First Token (ttft) and Time per Output Token (tpot). Specifically, the interactive category requires loadgen to enforce `ttft <= 450ms` and `tpot <= 40ms`.

To run in the interactive category, it is sufficient to set the flag `--lg-model-name` to `llama2-70b-interactive` when calling `main.py`. For example, to run the Server scenario in interactive mode:

```
python -u main.py --scenario Server \
--model-path ${CHECKPOINT_PATH} \
--mlperf-conf mlperf.conf \
--user-conf user.conf \
--total-sample-count 24576 \
--device cpu \
--dataset-path ${DATASET_PATH} \
--output-log-dir server-logs \
--lg-model-name llama2-70b-interactive
```
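
The 450 ms TTFT and 40 ms TPOT limits are enforced by loadgen from the `llama2-70b-interactive` entries in `loadgen/mlperf.conf`, so `user.conf` only carries the usual per-run tunables. A minimal `user.conf` sketch for an interactive Server run, reusing the keys already present in `language/llama2-70b/user.conf` (the `target_qps` value is purely illustrative and must be tuned per system):

```
# Hypothetical user.conf sketch for an interactive Server run.
# Keys mirror language/llama2-70b/user.conf; tune target_qps per system.
*.Server.target_qps = 0.5
*.Server.min_duration = 120000
*.Server.min_query_count = 100
```

The submission checker updated in this commit then verifies the reported 99th-percentile TTFT and TPOT from the mlperf log against these limits.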
11 changes: 8 additions & 3 deletions language/llama2-70b/main.py
@@ -120,7 +120,13 @@ def get_args():
default=None,
help="Specify an api endpoint call to use api mode",
)

parser.add_argument(
"--lg-model-name",
type=str,
default="llama2-70b",
choices=["llama2-70b", "llama2-70b-interactive"],
help="Model name (specified in llm server)",
)
args = parser.parse_args()
return args

@@ -146,8 +152,7 @@ def main():
settings = lg.TestSettings()
settings.scenario = scenario_map[args.scenario.lower()]
# mlperf.conf is automatically loaded by the loadgen
# settings.FromConfig(args.mlperf_conf, "llama2-70b", args.scenario)
settings.FromConfig(args.user_conf, "llama2-70b", args.scenario)
settings.FromConfig(args.user_conf, args.lg_model_name, args.scenario)

if args.accuracy:
settings.mode = lg.TestMode.AccuracyOnly
2 changes: 0 additions & 2 deletions language/llama2-70b/user.conf
@@ -9,5 +9,3 @@
*.Server.target_qps = 0.5
*.Server.min_duration = 120000
*.Server.min_query_count = 100

llama2-70b.Server.sample_concatenate_permutation = 1
8 changes: 8 additions & 0 deletions loadgen/mlperf.conf
@@ -15,6 +15,7 @@ rnnt.*.performance_sample_count_override = 2513
gptj.*.performance_sample_count_override = 13368
mixtral-8x7b.*.performance_sample_count_override = 15000
llama2-70b.*.performance_sample_count_override = 24576
llama2-70b-interactive.*.performance_sample_count_override = 24576
llama3_1-405b.*.performance_sample_count_override = 8313
stable-diffusion-xl.*.performance_sample_count_override = 5000
rgat.*.performance_sample_count_override = 788379
@@ -49,6 +50,7 @@ rgat.*.sample_concatenate_permutation = 1
# LLM benchmarks have non-uniform inputs and outputs, and use equal issue mode for all latency scenario
gptj.*.sample_concatenate_permutation = 1
llama2-70b.*.sample_concatenate_permutation = 1
llama2-70b-interactive.*.sample_concatenate_permutation = 1
mixtral-8x7b.*.sample_concatenate_permutation = 1
llama3_1-405b.*.sample_concatenate_permutation = 1

@@ -66,6 +68,7 @@ gptj.Server.target_latency = 20000
stable-diffusion-xl.Server.target_latency = 20000
# Benchmarks that measure token latencies
llama2-70b.*.use_token_latencies = 1
llama2-70b-interactive.*.use_token_latencies = 1
mixtral-8x7b.*.use_token_latencies = 1
llama3_1-405b.*.use_token_latencies = 1
# gptj benchmark infers token latencies
@@ -76,6 +79,11 @@ llama2-70b.Server.target_latency = 0
llama2-70b.Server.ttft_latency = 2000
llama2-70b.Server.tpot_latency = 200

# Target latencies for the interactive (low-latency) setting
llama2-70b-interactive.Server.target_latency = 0
llama2-70b-interactive.Server.ttft_latency = 450
llama2-70b-interactive.Server.tpot_latency = 40

mixtral-8x7b.Server.target_latency = 0
mixtral-8x7b.Server.ttft_latency = 2000
mixtral-8x7b.Server.tpot_latency = 200
4 changes: 4 additions & 0 deletions tools/submission/generate_final_report.py
@@ -147,6 +147,8 @@ def main():
"stable-diffusion-xl",
"llama2-70b-99",
"llama2-70b-99.9",
"llama2-70b-interactive-99",
"llama2-70b-interactive-99.9",
"mixtral-8x7b",
],
["SingleStream", "MultiStream", "Server", "Offline"],
@@ -209,6 +211,8 @@ def main():
"stable-diffusion-xl": ["Server", "Offline"],
"llama2-70b-99": ["Server", "Offline"],
"llama2-70b-99.9": ["Server", "Offline"],
"llama2-70b-interactive-99": ["Server", "Offline"],
"llama2-70b-interactive-99.9": ["Server", "Offline"],
"mixtral-8x7b": ["Server", "Offline"],
"rgat": ["Offline"],
"llama3.1-405b": ["Offline", "Server"]
114 changes: 95 additions & 19 deletions tools/submission/submission_checker.py
@@ -194,7 +194,6 @@
"ssd-resnet34": "retinanet",
"mobilenet": "resnet",
"resnet50": "resnet",
"llama3_1-405b": "llama3.1-405b"
},
"seeds": {
"qsl_rng_seed": 3066443479025735752,
@@ -266,6 +265,8 @@
"gptj-99.9",
"llama2-70b-99",
"llama2-70b-99.9",
"llama2-70b-interactive-99",
"llama2-70b-interactive-99.9",
"stable-diffusion-xl",
"mixtral-8x7b",
"llama3.1-405b",
@@ -283,6 +284,8 @@
"gptj-99.9": ["Server", "Offline"],
"llama2-70b-99": ["Server", "Offline"],
"llama2-70b-99.9": ["Server", "Offline"],
"llama2-70b-interactive-99": ["Server", "Offline"],
"llama2-70b-interactive-99.9": ["Server", "Offline"],
"stable-diffusion-xl": ["Server", "Offline"],
"mixtral-8x7b": ["Server", "Offline"],
"llama3.1-405b": ["Server", "Offline"],
@@ -314,6 +317,8 @@
"gptj-99.9": ["SingleStream", "Offline", "Server"],
"llama2-70b-99": ["Server", "Offline"],
"llama2-70b-99.9": ["Server", "Offline"],
"llama2-70b-interactive-99": ["Server", "Offline"],
"llama2-70b-interactive-99.9": ["Server", "Offline"],
"stable-diffusion-xl": ["SingleStream", "Offline", "Server"],
"mixtral-8x7b": ["Server", "Offline"],
"llama3.1-405b": ["Server", "Offline"],
@@ -370,6 +375,26 @@
"TOKENS_PER_SAMPLE",
294.45 * 0.9,
),
"llama2-70b-interactive-99": (
"ROUGE1",
44.4312 * 0.99,
"ROUGE2",
22.0352 * 0.99,
"ROUGEL",
28.6162 * 0.99,
"TOKENS_PER_SAMPLE",
294.45 * 0.9,
),
"llama2-70b-interactive-99.9": (
"ROUGE1",
44.4312 * 0.999,
"ROUGE2",
22.0352 * 0.999,
"ROUGEL",
28.6162 * 0.999,
"TOKENS_PER_SAMPLE",
294.45 * 0.9,
),
"stable-diffusion-xl": (
"CLIP_SCORE",
31.68631873,
@@ -409,6 +434,8 @@
),
"llama2-70b-99": ("TOKENS_PER_SAMPLE", 294.45 * 1.1),
"llama2-70b-99.9": ("TOKENS_PER_SAMPLE", 294.45 * 1.1),
"llama2-70b-interactive-99": ("TOKENS_PER_SAMPLE", 294.45 * 1.1),
"llama2-70b-interactive-99.9": ("TOKENS_PER_SAMPLE", 294.45 * 1.1),
"mixtral-8x7b": ("TOKENS_PER_SAMPLE", 145.9 * 1.1),
"llama3.1-405b": ("TOKENS_PER_SAMPLE", 684.68 * 1.1),
},
@@ -428,6 +455,8 @@
"gptj-99.9": 13368,
"llama2-70b-99": 24576,
"llama2-70b-99.9": 24576,
"llama2-70b-interactive-99": 24576,
"llama2-70b-interactive-99.9": 24576,
"stable-diffusion-xl": 5000,
"mixtral-8x7b": 15000,
"llama3.1-405b": 8313,
@@ -439,8 +468,10 @@
# not really needed
"model_mapping": {
# map model names to the official mlperf model class
"ssd-resnet34": "retinanet",
"mobilenet": "resnet",
"resnet50": "resnet",
"llama3_1-405b": "llama3.1-405b",
},
"seeds": {
# TODO: Update random seeds
@@ -459,6 +490,8 @@
"stable-diffusion-xl": {"Server": 20000000000},
"llama2-70b-99": {"Server": 20000000000},
"llama2-70b-99.9": {"Server": 20000000000},
"llama2-70b-interactive-99": {"Server": 20000000000},
"llama2-70b-interactive-99.9": {"Server": 20000000000},
"mixtral-8x7b": {"Server": 20000000000},
"llama3.1-405b": {"Server": 60000000000}
},
@@ -485,6 +518,8 @@
"gptj-99.9": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
"llama2-70b-99": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
"llama2-70b-99.9": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
"llama2-70b-interactive-99": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
"llama2-70b-interactive-99.9": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
"stable-diffusion-xl": {
"SingleStream": 1024,
"Server": 270336,
@@ -578,6 +613,8 @@
"gptj-99.9": 13368,
"llama2-70b-99": 24576,
"llama2-70b-99.9": 24576,
"llama2-70b-interactive-99": 24576,
"llama2-70b-interactive-99.9": 24576,
"stable-diffusion-xl": 5000,
"mixtral-8x7b": 15000,
"llama3.1-405b": 8313,
@@ -645,6 +682,14 @@
"Offline": "result_tokens_per_second",
"Server": "result_completed_tokens_per_second",
},
"llama2-70b-interactive-99": {
"Offline": "result_tokens_per_second",
"Server": "result_completed_tokens_per_second",
},
"llama2-70b-interactive-99.9": {
"Offline": "result_tokens_per_second",
"Server": "result_completed_tokens_per_second",
},
"gptj-99": {
"Offline": "result_inferred_tokens_per_second",
"Server": "result_inferred_completed_tokens_per_second",
Expand All @@ -666,14 +711,20 @@

LLM_LATENCY_LIMITS = {
"llama2-70b-99": {
"conversational": {"ttft": 2000 * 1000000, "tpot": 200 * 1000000}
"ttft": 2000 * 1000000, "tpot": 200 * 1000000
},
"llama2-70b-99.9": {
"conversational": {"ttft": 2000 * 1000000, "tpot": 200 * 1000000}
"ttft": 2000 * 1000000, "tpot": 200 * 1000000
},
"llama2-70b-interactive-99": {
"ttft": 450 * 1000000, "tpot": 40 * 1000000
},
"llama2-70b-interactive-99.9": {
"ttft": 450 * 1000000, "tpot": 40 * 1000000
},
"mixtral-8x7b": {"conversational": {"ttft": 2000 * 1000000, "tpot": 200 * 1000000}},
"mixtral-8x7b": {"ttft": 2000 * 1000000, "tpot": 200 * 1000000},
"llama3.1-405b": {
"conversational": {"ttft": 6000 * 1000000, "tpot": 175 * 1000000}
"ttft": 6000 * 1000000, "tpot": 175 * 1000000
},
}

@@ -956,6 +1007,8 @@ def requires_equal_issue(self, model, division):
"gptj-99.9",
"llama2-70b-99",
"llama2-70b-99.9",
"llama2-70b-interactive-99",
"llama2-70b-interactive-99.9",
"mixtral-8x7b",
"llama3.1-405b",
"rgat",
@@ -1253,25 +1306,29 @@ def extra_check_llm(mlperf_log, scenario, model):
if mlperf_log["requested_use_token_latencies"]:
if scenario == "Offline":
# For offline no further checks are necessary
return None, True
return True
else:
for constraint, limits in LLM_LATENCY_LIMITS[model].items():
if (
mlperf_log["result_first_token_99.00_percentile_latency_ns"]
< limits["ttft"]
and mlperf_log["result_time_per_output_token_99.00_percentile_ns"]
< limits["tpot"]
):
return constraint, True
limits = LLM_LATENCY_LIMITS[model]
if (
mlperf_log["result_first_token_99.00_percentile_latency_ns"]
< limits["ttft"]
and mlperf_log["result_time_per_output_token_99.00_percentile_ns"]
< limits["tpot"]
):
return True
else:
log.error(
f"use_token_latencies flag needs to be enabled for Llama2 benchmark")
return None, False
return False

log.error(
f'Failed Llama2 extra check for TTFT and TPOT. TTFT 99-tile: {mlperf_log["result_first_token_99.00_percentile_latency_ns"]}, TPOT 99-tile: {mlperf_log["result_time_per_output_token_99.00_percentile_ns"]}'
'Failed extra check for TTFT and TPOT. Obtained: TTFT 99-tile: %.4f, TPOT 99-tile: %.4f. Required: TTFT 99-tile: %.4f, TPOT 99-tile: %.4f',
mlperf_log["result_first_token_99.00_percentile_latency_ns"],
mlperf_log["result_time_per_output_token_99.00_percentile_ns"],
limits["ttft"],
limits["tpot"]
)
return None, False
return False


def get_performance_metric(
@@ -1340,9 +1397,11 @@ def check_performance_dir(
)

if model in ["llama2-70b-99", "llama2-70b-99.9",
"llama2-70b-interactive-99", "llama2-70b-interactive-99.9",
"mixtral-8x7b", "llama3.1-405b"]:
llama_constraint, is_valid = extra_check_llm(
llm_is_valid = extra_check_llm(
mlperf_log, scenario_fixed, model)
is_valid = (llm_is_valid and is_valid)

latency_99_percentile = mlperf_log["result_99.00_percentile_latency_ns"]
latency_mean = mlperf_log["result_mean_latency_ns"]
@@ -1874,6 +1933,18 @@ def log_result(
"Offline": "Tokens/s",
"Server": "Tokens/s",
},
"llama2-70b-interactive-99": {
"SingleStream": "Latency (ms)",
"MultiStream": "Latency (ms)",
"Offline": "Tokens/s",
"Server": "Tokens/s",
},
"llama2-70b-interactive-99.9": {
"SingleStream": "Latency (ms)",
"MultiStream": "Latency (ms)",
"Offline": "Tokens/s",
"Server": "Tokens/s",
},
"mixtral-8x7b": {
"SingleStream": "Latency (ms)",
"MultiStream": "Latency (ms)",
@@ -2398,7 +2469,7 @@ def log_result(
perf_path,
scenario_fixed,
division,
system_json,
system_json
)
if is_inferred:
inferred = 1
@@ -2966,6 +3037,8 @@ def check_compliance_dir(
"gptj-99.9",
"llama2-70b-99",
"llama2-70b-99.9",
"llama2-70b-interactive-99",
"llama2-70b-interactive-99.9",
"mixtral-8x7b",
"llama3.1-405b",
"rgat",
@@ -2987,6 +3060,8 @@
"gptj-99.9",
"llama2-70b-99",
"llama2-70b-99.9",
"llama2-70b-interactive-99",
"llama2-70b-interactive-99.9",
"mixtral-8x7b",
"llama3.1-405b",
]:
@@ -2997,6 +3072,7 @@
test_list.remove("TEST04")

if model in ["llama2-70b-99", "llama2-70b-99.9",
"llama2-70b-interactive-99", "llama2-70b-interactive-99.9",
"mixtral-8x7b", "llama3.1-405b"]:
test_list.append("TEST06")

