From ba71b21c3eeb461ed82b4c92709c654a06eb00f7 Mon Sep 17 00:00:00 2001
From: Pablo Gonzalez
Date: Tue, 21 Jan 2025 12:05:12 -0500
Subject: [PATCH 1/2] Low latency checks to submission checker and report low latency (#2017)

* Add low latency checks to submission checker

* Set llama2-70b-low-latency as a different model to avoid uncommenting config

* Replace: low-latency->interactive

* Include interactive mode in llama2 reference + README

* Update README to have interactive server scenario + remove unnecessary changes

* Update test-submission-checker.yml

* Update test-submission-checker.yml

---------

Co-authored-by: Arjun Suresh
---
 .github/workflows/test-submission-checker.yml |   4 +-
 language/llama2-70b/README.md                 |  18 +++
 language/llama2-70b/main.py                   |  11 +-
 language/llama2-70b/user.conf                 |   2 -
 loadgen/mlperf.conf                           |   8 ++
 tools/submission/generate_final_report.py     |   4 +
 tools/submission/submission_checker.py        | 114 +++++++++++++++---
 7 files changed, 135 insertions(+), 26 deletions(-)

diff --git a/.github/workflows/test-submission-checker.yml b/.github/workflows/test-submission-checker.yml
index b7ab88ddb..5b7637d7d 100644
--- a/.github/workflows/test-submission-checker.yml
+++ b/.github/workflows/test-submission-checker.yml
@@ -32,7 +32,7 @@ jobs:
     - name: Install dependencies
       run: |
        python3 -m pip install cm4mlops
-       git clone https://github.com/mlcommons/inference_results_v4.1 --depth 1
+       git clone https://github.com/mlcommons/mlperf_inference_unofficial_submissions_v5.0 --depth 1
     - name: Test MLPerf inference submission checker
       run: |
-       cm run script --tags=run,mlperf,inference,submission,checker --adr.inference-src.tags=_branch.$PR_HEAD_REF,_repo.${{ github.event.pull_request.head.repo.html_url }} --adr.inference-src.version=custom --input=`pwd`/inference_results_v4.1 --src_version=v4.1 --extra_args=" --skip-extra-accuracy-files-check" --quiet
+       cm run script --tags=run,mlperf,inference,submission,checker --adr.inference-src.tags=_branch.$PR_HEAD_REF,_repo.${{ github.event.pull_request.head.repo.html_url }} --adr.inference-src.version=custom --input=`pwd`/mlperf_inference_unofficial_submissions_v5.0 --src_version=v5.0 --extra_args=" --skip-extra-files-in-root-check --skip-extra-accuracy-files-check" --quiet
diff --git a/language/llama2-70b/README.md b/language/llama2-70b/README.md
index 07d355344..5268863ac 100644
--- a/language/llama2-70b/README.md
+++ b/language/llama2-70b/README.md
@@ -245,3 +245,21 @@ scale from a 0.0-1.0 scale):
 - Tokens per sample: 294.45
 
 This was run on a DGX-H100 node. Total runtime was ~4.5 days.
+
+# Run llama2-70b-interactive benchmark
+
+For official Llama2-70b submissions it is also possible to submit in the interactive category. This sets stricter latency requirements for Time to First Token (TTFT) and Time per Output Token (TPOT). Specifically, the interactive category requires loadgen to enforce `ttft <= 450ms` and `tpot <= 40ms`.
+
+In order to run the interactive category, it is sufficient to set the flag `--lg-model-name` to `llama2-70b-interactive` when calling `main.py` to run the benchmark. For example, to run the server scenario in interactive mode:
+
+```
+python -u main.py --scenario Server \
+        --model-path ${CHECKPOINT_PATH} \
+        --mlperf-conf mlperf.conf \
+        --user-conf user.conf \
+        --total-sample-count 24576 \
+        --device cpu \
+        --dataset-path ${DATASET_PATH} \
+        --output-log-dir server-logs \
+        --lg-model-name llama2-70b-interactive
+```
diff --git a/language/llama2-70b/main.py b/language/llama2-70b/main.py
index 1d795685f..84ccf849a 100644
--- a/language/llama2-70b/main.py
+++ b/language/llama2-70b/main.py
@@ -120,7 +120,13 @@ def get_args():
         default=None,
         help="Specify an api endpoint call to use api mode",
     )
-
+    parser.add_argument(
+        "--lg-model-name",
+        type=str,
+        default="llama2-70b",
+        choices=["llama2-70b", "llama2-70b-interactive"],
+        help="Model name (specified in the LLM server)",
+    )
     args = parser.parse_args()
     return args
 
@@ -146,8 +152,7 @@ def main():
     settings = lg.TestSettings()
     settings.scenario = scenario_map[args.scenario.lower()]
     # mlperf.conf is automatically loaded by the loadgen
-    # settings.FromConfig(args.mlperf_conf, "llama2-70b", args.scenario)
-    settings.FromConfig(args.user_conf, "llama2-70b", args.scenario)
+    settings.FromConfig(args.user_conf, args.lg_model_name, args.scenario)
 
     if args.accuracy:
         settings.mode = lg.TestMode.AccuracyOnly
diff --git a/language/llama2-70b/user.conf b/language/llama2-70b/user.conf
index 945082fe9..bb97c437a 100644
--- a/language/llama2-70b/user.conf
+++ b/language/llama2-70b/user.conf
@@ -9,5 +9,3 @@
 *.Server.target_qps = 0.5
 *.Server.min_duration = 120000
 *.Server.min_query_count = 100
-
-llama2-70b.Server.sample_concatenate_permutation = 1
\ No newline at end of file
diff --git a/loadgen/mlperf.conf b/loadgen/mlperf.conf
index 836b42564..582381bbd 100644
--- a/loadgen/mlperf.conf
+++ b/loadgen/mlperf.conf
@@ -15,6 +15,7 @@ rnnt.*.performance_sample_count_override = 2513
 gptj.*.performance_sample_count_override = 13368
 mixtral-8x7b.*.performance_sample_count_override = 15000
 llama2-70b.*.performance_sample_count_override = 24576
+llama2-70b-interactive.*.performance_sample_count_override = 24576
 llama3_1-405b.*.performance_sample_count_override = 8313
 stable-diffusion-xl.*.performance_sample_count_override = 5000
 rgat.*.performance_sample_count_override = 788379
@@ -49,6 +50,7 @@ rgat.*.sample_concatenate_permutation = 1
 # LLM benchmarks have non-uniform inputs and outputs, and use equal issue mode for all latency scenario
 gptj.*.sample_concatenate_permutation = 1
 llama2-70b.*.sample_concatenate_permutation = 1
+llama2-70b-interactive.*.sample_concatenate_permutation = 1
 mixtral-8x7b.*.sample_concatenate_permutation = 1
 llama3_1-405b.*.sample_concatenate_permutation = 1
 
@@ -66,6 +68,7 @@ gptj.Server.target_latency = 20000
 stable-diffusion-xl.Server.target_latency = 20000
 # Benchmarks that measure token latencies
 llama2-70b.*.use_token_latencies = 1
+llama2-70b-interactive.*.use_token_latencies = 1
 mixtral-8x7b.*.use_token_latencies = 1
 llama3_1-405b.*.use_token_latencies = 1
 # gptj benchmark infers token latencies
@@ -76,6 +79,11 @@ llama2-70b.Server.target_latency = 0
 llama2-70b.Server.ttft_latency = 2000
 llama2-70b.Server.tpot_latency = 200
 
+# Target Latencies for interactive setting
+llama2-70b-interactive.Server.target_latency = 0
+llama2-70b-interactive.Server.ttft_latency = 450
+llama2-70b-interactive.Server.tpot_latency = 40
+
 mixtral-8x7b.Server.target_latency = 0
 mixtral-8x7b.Server.ttft_latency = 2000
 mixtral-8x7b.Server.tpot_latency = 200
diff --git a/tools/submission/generate_final_report.py b/tools/submission/generate_final_report.py
index aa5b36983..da965ef3e 100644
--- a/tools/submission/generate_final_report.py
+++ b/tools/submission/generate_final_report.py
@@ -147,6 +147,8 @@ def main():
             "stable-diffusion-xl",
             "llama2-70b-99",
             "llama2-70b-99.9",
+            "llama2-70b-interactive-99",
+            "llama2-70b-interactive-99.9",
             "mixtral-8x7b",
         ],
         ["SingleStream", "MultiStream", "Server", "Offline"],
@@ -209,6 +211,8 @@ def main():
         "stable-diffusion-xl": ["Server", "Offline"],
         "llama2-70b-99": ["Server", "Offline"],
         "llama2-70b-99.9": ["Server", "Offline"],
+        "llama2-70b-interactive-99": ["Server", "Offline"],
+        "llama2-70b-interactive-99.9": ["Server", "Offline"],
         "mixtral-8x7b": ["Server", "Offline"],
         "rgat": ["Offline"],
         "llama3.1-405b": ["Offline", "Server"]
diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py
index 91cba0f31..264c9b373 100755
--- a/tools/submission/submission_checker.py
+++ b/tools/submission/submission_checker.py
@@ -194,7 +194,6 @@
             "ssd-resnet34": "retinanet",
             "mobilenet": "resnet",
             "resnet50": "resnet",
-            "llama3_1-405b": "llama3.1-405b"
         },
         "seeds": {
             "qsl_rng_seed": 3066443479025735752,
@@ -266,6 +265,8 @@
             "gptj-99.9",
             "llama2-70b-99",
             "llama2-70b-99.9",
+            "llama2-70b-interactive-99",
+            "llama2-70b-interactive-99.9",
             "stable-diffusion-xl",
             "mixtral-8x7b",
             "llama3.1-405b",
@@ -283,6 +284,8 @@
             "gptj-99.9": ["Server", "Offline"],
             "llama2-70b-99": ["Server", "Offline"],
             "llama2-70b-99.9": ["Server", "Offline"],
+            "llama2-70b-interactive-99": ["Server", "Offline"],
+            "llama2-70b-interactive-99.9": ["Server", "Offline"],
             "stable-diffusion-xl": ["Server", "Offline"],
             "mixtral-8x7b": ["Server", "Offline"],
             "llama3.1-405b": ["Server", "Offline"],
@@ -314,6 +317,8 @@
             "gptj-99.9": ["SingleStream", "Offline", "Server"],
             "llama2-70b-99": ["Server", "Offline"],
             "llama2-70b-99.9": ["Server", "Offline"],
+            "llama2-70b-interactive-99": ["Server", "Offline"],
+            "llama2-70b-interactive-99.9": ["Server", "Offline"],
             "stable-diffusion-xl": ["SingleStream", "Offline", "Server"],
             "mixtral-8x7b": ["Server", "Offline"],
             "llama3.1-405b": ["Server", "Offline"],
@@ -370,6 +375,26 @@
                 "TOKENS_PER_SAMPLE",
                 294.45 * 0.9,
             ),
+            "llama2-70b-interactive-99": (
+                "ROUGE1",
+                44.4312 * 0.99,
+                "ROUGE2",
+                22.0352 * 0.99,
+                "ROUGEL",
+                28.6162 * 0.99,
+                "TOKENS_PER_SAMPLE",
+                294.45 * 0.9,
+            ),
+            "llama2-70b-interactive-99.9": (
+                "ROUGE1",
+                44.4312 * 0.999,
+                "ROUGE2",
+                22.0352 * 0.999,
+                "ROUGEL",
+                28.6162 * 0.999,
+                "TOKENS_PER_SAMPLE",
+                294.45 * 0.9,
+            ),
             "stable-diffusion-xl": (
                 "CLIP_SCORE",
                 31.68631873,
@@ -409,6 +434,8 @@
             ),
             "llama2-70b-99": ("TOKENS_PER_SAMPLE", 294.45 * 1.1),
             "llama2-70b-99.9": ("TOKENS_PER_SAMPLE", 294.45 * 1.1),
+            "llama2-70b-interactive-99": ("TOKENS_PER_SAMPLE", 294.45 * 1.1),
+            "llama2-70b-interactive-99.9": ("TOKENS_PER_SAMPLE", 294.45 * 1.1),
             "mixtral-8x7b": ("TOKENS_PER_SAMPLE", 145.9 * 1.1),
             "llama3.1-405b": ("TOKENS_PER_SAMPLE", 684.68 * 1.1),
         },
@@ -428,6 +455,8 @@
             "gptj-99.9": 13368,
             "llama2-70b-99": 24576,
             "llama2-70b-99.9": 24576,
+            "llama2-70b-interactive-99": 24576,
+            "llama2-70b-interactive-99.9": 24576,
             "stable-diffusion-xl": 5000,
             "mixtral-8x7b": 15000,
             "llama3.1-405b": 8313,
@@ -439,8 +468,10 @@
         # not really needed
         "model_mapping": {
             # map model names to the official mlperf model class
+            "ssd-resnet34": "retinanet",
             "mobilenet": "resnet",
             "resnet50": "resnet",
+            "llama3_1-405b": "llama3.1-405b",
         },
         "seeds": {
             # TODO: Update random seeds
@@ -459,6 +490,8 @@
             "stable-diffusion-xl": {"Server": 20000000000},
"llama2-70b-99": {"Server": 20000000000}, "llama2-70b-99.9": {"Server": 20000000000}, + "llama2-70b-interactive-99": {"Server": 20000000000}, + "llama2-70b-interactive-99.9": {"Server": 20000000000}, "mixtral-8x7b": {"Server": 20000000000}, "llama3.1-405b": {"Server": 60000000000} }, @@ -485,6 +518,8 @@ "gptj-99.9": {"SingleStream": 1024, "Server": 270336, "Offline": 1}, "llama2-70b-99": {"SingleStream": 1024, "Server": 270336, "Offline": 1}, "llama2-70b-99.9": {"SingleStream": 1024, "Server": 270336, "Offline": 1}, + "llama2-70b-interactive-99": {"SingleStream": 1024, "Server": 270336, "Offline": 1}, + "llama2-70b-interactive-99.9": {"SingleStream": 1024, "Server": 270336, "Offline": 1}, "stable-diffusion-xl": { "SingleStream": 1024, "Server": 270336, @@ -578,6 +613,8 @@ "gptj-99.9": 13368, "llama2-70b-99": 24576, "llama2-70b-99.9": 24576, + "llama2-70b-interactive-99": 24576, + "llama2-70b-interactive-99.9": 24576, "stable-diffusion-xl": 5000, "mixtral-8x7b": 15000, "llama3.1-405b": 8313, @@ -645,6 +682,14 @@ "Offline": "result_tokens_per_second", "Server": "result_completed_tokens_per_second", }, + "llama2-70b-interactive-99": { + "Offline": "result_tokens_per_second", + "Server": "result_completed_tokens_per_second", + }, + "llama2-70b-interactive-99.9": { + "Offline": "result_tokens_per_second", + "Server": "result_completed_tokens_per_second", + }, "gptj-99": { "Offline": "result_inferred_tokens_per_second", "Server": "result_inferred_completed_tokens_per_second", @@ -666,14 +711,20 @@ LLM_LATENCY_LIMITS = { "llama2-70b-99": { - "conversational": {"ttft": 2000 * 1000000, "tpot": 200 * 1000000} + "ttft": 2000 * 1000000, "tpot": 200 * 1000000 }, "llama2-70b-99.9": { - "conversational": {"ttft": 2000 * 1000000, "tpot": 200 * 1000000} + "ttft": 2000 * 1000000, "tpot": 200 * 1000000 + }, + "llama2-70b-interactive-99": { + "ttft": 450 * 1000000, "tpot": 40 * 1000000 + }, + "llama2-70b-interactive-99.9": { + "ttft": 450 * 1000000, "tpot": 40 * 1000000 }, - "mixtral-8x7b": {"conversational": {"ttft": 2000 * 1000000, "tpot": 200 * 1000000}}, + "mixtral-8x7b": {"ttft": 2000 * 1000000, "tpot": 200 * 1000000}, "llama3.1-405b": { - "conversational": {"ttft": 6000 * 1000000, "tpot": 175 * 1000000} + "ttft": 6000 * 1000000, "tpot": 175 * 1000000 }, } @@ -956,6 +1007,8 @@ def requires_equal_issue(self, model, division): "gptj-99.9", "llama2-70b-99", "llama2-70b-99.9", + "llama2-70b-interactive-99", + "llama2-70b-interactive-99.9", "mixtral-8x7b", "llama3.1-405b", "rgat", @@ -1253,25 +1306,29 @@ def extra_check_llm(mlperf_log, scenario, model): if mlperf_log["requested_use_token_latencies"]: if scenario == "Offline": # For offline no further checks are necessary - return None, True + return True else: - for constraint, limits in LLM_LATENCY_LIMITS[model].items(): - if ( - mlperf_log["result_first_token_99.00_percentile_latency_ns"] - < limits["ttft"] - and mlperf_log["result_time_per_output_token_99.00_percentile_ns"] - < limits["tpot"] - ): - return constraint, True + limits = LLM_LATENCY_LIMITS[model] + if ( + mlperf_log["result_first_token_99.00_percentile_latency_ns"] + < limits["ttft"] + and mlperf_log["result_time_per_output_token_99.00_percentile_ns"] + < limits["tpot"] + ): + return True else: log.error( f"use_token_latencies flag needs to be enabled for Llama2 benchmark") - return None, False + return False log.error( - f'Failed Llama2 extra check for TTFT and TPOT. 
+        'Failed extra check for TTFT and TPOT. Obtained: TTFT 99-tile: %.4f, TPOT 99-tile: %.4f. Required: TTFT 99-tile: %.4f, TPOT 99-tile: %.4f',
+        mlperf_log["result_first_token_99.00_percentile_latency_ns"],
+        mlperf_log["result_time_per_output_token_99.00_percentile_ns"],
+        limits["ttft"],
+        limits["tpot"]
     )
-    return None, False
+    return False
 
 
 def get_performance_metric(
@@ -1340,9 +1397,11 @@ def check_performance_dir(
     )
 
     if model in ["llama2-70b-99", "llama2-70b-99.9",
+                 "llama2-70b-interactive-99", "llama2-70b-interactive-99.9",
                  "mixtral-8x7b", "llama3.1-405b"]:
-        llama_constraint, is_valid = extra_check_llm(
+        llm_is_valid = extra_check_llm(
             mlperf_log, scenario_fixed, model)
+        is_valid = (llm_is_valid and is_valid)
 
     latency_99_percentile = mlperf_log["result_99.00_percentile_latency_ns"]
     latency_mean = mlperf_log["result_mean_latency_ns"]
@@ -1874,6 +1933,18 @@ def log_result(
             "Offline": "Tokens/s",
             "Server": "Tokens/s",
         },
+        "llama2-70b-interactive-99": {
+            "SingleStream": "Latency (ms)",
+            "MultiStream": "Latency (ms)",
+            "Offline": "Tokens/s",
+            "Server": "Tokens/s",
+        },
+        "llama2-70b-interactive-99.9": {
+            "SingleStream": "Latency (ms)",
+            "MultiStream": "Latency (ms)",
+            "Offline": "Tokens/s",
+            "Server": "Tokens/s",
+        },
         "mixtral-8x7b": {
             "SingleStream": "Latency (ms)",
             "MultiStream": "Latency (ms)",
@@ -2398,7 +2469,7 @@ def log_result(
                 perf_path,
                 scenario_fixed,
                 division,
-                system_json,
+                system_json
             )
             if is_inferred:
                 inferred = 1
@@ -2966,6 +3037,8 @@ def check_compliance_dir(
         "gptj-99.9",
         "llama2-70b-99",
         "llama2-70b-99.9",
+        "llama2-70b-interactive-99",
+        "llama2-70b-interactive-99.9",
         "mixtral-8x7b",
         "llama3.1-405b",
         "rgat",
@@ -2987,6 +3060,8 @@ def check_compliance_dir(
         "gptj-99.9",
         "llama2-70b-99",
         "llama2-70b-99.9",
+        "llama2-70b-interactive-99",
+        "llama2-70b-interactive-99.9",
         "mixtral-8x7b",
         "llama3.1-405b",
     ]:
         test_list.remove("TEST01")
@@ -2997,6 +3072,7 @@ def check_compliance_dir(
         test_list.remove("TEST04")
 
     if model in ["llama2-70b-99", "llama2-70b-99.9",
+                 "llama2-70b-interactive-99", "llama2-70b-interactive-99.9",
                  "mixtral-8x7b", "llama3.1-405b"]:
         test_list.append("TEST06")
 

From 2ed0a3e0c8a973ad4e9cb43e4c419d1ac35f4ff7 Mon Sep 17 00:00:00 2001
From: mrmhodak
Date: Tue, 21 Jan 2025 17:05:26 +0000
Subject: [PATCH 2/2] Increment version to 5.0.12

---
 loadgen/VERSION.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/loadgen/VERSION.txt b/loadgen/VERSION.txt
index 175775885..718db1c46 100644
--- a/loadgen/VERSION.txt
+++ b/loadgen/VERSION.txt
@@ -1 +1 @@
-5.0.11
+5.0.12
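
As a rough local sanity check, the interactive TTFT/TPOT gate enforced by the submission checker above can be approximated against a single Server run's detail log before packaging a submission. The snippet below is only an illustrative sketch, not part of the patch: it assumes the `MLPerfLog` helper from `tools/submission/log_parser.py` is importable (e.g. `tools/submission` on `PYTHONPATH`) and that the detail log sits under the `server-logs` directory used in the README example; the limit values mirror `llama2-70b-interactive.Server.ttft_latency` and `tpot_latency` from `mlperf.conf`.

```python
# Illustrative sketch: replicate the llama2-70b-interactive Server latency gate
# locally. Assumes tools/submission is on PYTHONPATH and the run was done with
# --output-log-dir server-logs, as in the README example above.
from log_parser import MLPerfLog

TTFT_LIMIT_NS = 450 * 1_000_000  # llama2-70b-interactive.Server.ttft_latency (ms -> ns)
TPOT_LIMIT_NS = 40 * 1_000_000   # llama2-70b-interactive.Server.tpot_latency (ms -> ns)

mlperf_log = MLPerfLog("server-logs/mlperf_log_detail.txt")
ttft = mlperf_log["result_first_token_99.00_percentile_latency_ns"]
tpot = mlperf_log["result_time_per_output_token_99.00_percentile_ns"]

print(f"TTFT p99: {ttft / 1e6:.2f} ms (limit 450 ms)")
print(f"TPOT p99: {tpot / 1e6:.2f} ms (limit 40 ms)")
print("PASS" if ttft < TTFT_LIMIT_NS and tpot < TPOT_LIMIT_NS else "FAIL")
```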