Low latency checks to submission checker and report low latency (#2017)
* Add low latency checks to submission checker

* Set llama2-70b-low-latency as a different model to avoid uncommenting config

* Replace: low-latency->interactive

* Include interactive mode in llama2 reference + README

* Update README to have interactive server scenario + remove unnecessary changes

* Update test-submission-checker.yml

* Update test-submission-checker.yml

---------

Co-authored-by: Arjun Suresh <[email protected]>
pgmpablo157321 and arjunsuresh authored Jan 21, 2025
1 parent dcd0c3e commit ba71b21
Showing 7 changed files with 135 additions and 26 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/test-submission-checker.yml
@@ -32,7 +32,7 @@ jobs:
- name: Install dependencies
run: |
python3 -m pip install cm4mlops
git clone https://github.com/mlcommons/inference_results_v4.1 --depth 1
git clone https://github.com/mlcommons/mlperf_inference_unofficial_submissions_v5.0 --depth 1
- name: Test MLPerf inference submission checker
run: |
cm run script --tags=run,mlperf,inference,submission,checker --adr.inference-src.tags=_branch.$PR_HEAD_REF,_repo.${{ github.event.pull_request.head.repo.html_url }} --adr.inference-src.version=custom --input=`pwd`/inference_results_v4.1 --src_version=v4.1 --extra_args=" --skip-extra-accuracy-files-check" --quiet
cm run script --tags=run,mlperf,inference,submission,checker --adr.inference-src.tags=_branch.$PR_HEAD_REF,_repo.${{ github.event.pull_request.head.repo.html_url }} --adr.inference-src.version=custom --input=`pwd`/mlperf_inference_unofficial_submissions_v5.0 --src_version=v5.0 --extra_args=" --skip-extra-files-in-root-check --skip-extra-accuracy-files-check" --quiet
18 changes: 18 additions & 0 deletions language/llama2-70b/README.md
@@ -245,3 +245,21 @@ scale from a 0.0-1.0 scale):
- Tokens per sample: 294.45

This was run on a DGX-H100 node. Total runtime was ~4.5 days.

# Run llama2-70b-interactive benchmark

For official Llama2-70b submissions it is also possible to submit in the interactive category. This sets stricter latency requirements for Time to First Token (ttft) and Time per Output Token (tpot). Specifically, the interactive category requires loadgen to enforce `ttft <= 450ms` and `tpot <= 40ms`.

To run in the interactive category, it is sufficient to set the flag `--lg-model-name` to `llama2-70b-interactive` when calling `main.py`. For example, to run the Server scenario in interactive mode:

```
python -u main.py --scenario Server \
--model-path ${CHECKPOINT_PATH} \
--mlperf-conf mlperf.conf \
--user-conf user.conf \
--total-sample-count 24576 \
--device cpu \
--dataset-path ${DATASET_PATH} \
--output-log-dir server-logs \
--lg-model-name llama2-70b-interactive
```
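
The 450 ms TTFT and 40 ms TPOT limits are enforced by loadgen from the `llama2-70b-interactive` entries in `loadgen/mlperf.conf`, so `user.conf` only carries the usual per-run tunables. A minimal `user.conf` sketch for an interactive Server run, reusing the keys already present in `language/llama2-70b/user.conf` (the `target_qps` value is purely illustrative and must be tuned per system):

```
# Hypothetical user.conf sketch for an interactive Server run.
# Keys mirror language/llama2-70b/user.conf; tune target_qps per system.
*.Server.target_qps = 0.5
*.Server.min_duration = 120000
*.Server.min_query_count = 100
```

The submission checker updated in this commit then verifies the reported 99th-percentile TTFT and TPOT from the mlperf log against these limits.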
11 changes: 8 additions & 3 deletions language/llama2-70b/main.py
@@ -120,7 +120,13 @@ def get_args():
default=None,
help="Specify an api endpoint call to use api mode",
)

parser.add_argument(
"--lg-model-name",
type=str,
default="llama2-70b",
choices=["llama2-70b", "llama2-70b-interactive"],
help="Model name (specified in llm server)",
)
args = parser.parse_args()
return args

@@ -146,8 +152,7 @@ def main():
settings = lg.TestSettings()
settings.scenario = scenario_map[args.scenario.lower()]
# mlperf.conf is automatically loaded by the loadgen
# settings.FromConfig(args.mlperf_conf, "llama2-70b", args.scenario)
settings.FromConfig(args.user_conf, "llama2-70b", args.scenario)
settings.FromConfig(args.user_conf, args.lg_model_name, args.scenario)

if args.accuracy:
settings.mode = lg.TestMode.AccuracyOnly
2 changes: 0 additions & 2 deletions language/llama2-70b/user.conf
@@ -9,5 +9,3 @@
*.Server.target_qps = 0.5
*.Server.min_duration = 120000
*.Server.min_query_count = 100

llama2-70b.Server.sample_concatenate_permutation = 1
8 changes: 8 additions & 0 deletions loadgen/mlperf.conf
@@ -15,6 +15,7 @@ rnnt.*.performance_sample_count_override = 2513
gptj.*.performance_sample_count_override = 13368
mixtral-8x7b.*.performance_sample_count_override = 15000
llama2-70b.*.performance_sample_count_override = 24576
llama2-70b-interactive.*.performance_sample_count_override = 24576
llama3_1-405b.*.performance_sample_count_override = 8313
stable-diffusion-xl.*.performance_sample_count_override = 5000
rgat.*.performance_sample_count_override = 788379
@@ -49,6 +50,7 @@ rgat.*.sample_concatenate_permutation = 1
# LLM benchmarks have non-uniform inputs and outputs, and use equal issue mode for all latency scenario
gptj.*.sample_concatenate_permutation = 1
llama2-70b.*.sample_concatenate_permutation = 1
llama2-70b-interactive.*.sample_concatenate_permutation = 1
mixtral-8x7b.*.sample_concatenate_permutation = 1
llama3_1-405b.*.sample_concatenate_permutation = 1

@@ -66,6 +68,7 @@ gptj.Server.target_latency = 20000
stable-diffusion-xl.Server.target_latency = 20000
# Benchmarks that measure token latencies
llama2-70b.*.use_token_latencies = 1
llama2-70b-interactive.*.use_token_latencies = 1
mixtral-8x7b.*.use_token_latencies = 1
llama3_1-405b.*.use_token_latencies = 1
# gptj benchmark infers token latencies
@@ -76,6 +79,11 @@ llama2-70b.Server.target_latency = 0
llama2-70b.Server.ttft_latency = 2000
llama2-70b.Server.tpot_latency = 200

# Target latencies for the interactive (low-latency) setting
llama2-70b-interactive.Server.target_latency = 0
llama2-70b-interactive.Server.ttft_latency = 450
llama2-70b-interactive.Server.tpot_latency = 40

mixtral-8x7b.Server.target_latency = 0
mixtral-8x7b.Server.ttft_latency = 2000
mixtral-8x7b.Server.tpot_latency = 200
4 changes: 4 additions & 0 deletions tools/submission/generate_final_report.py
@@ -147,6 +147,8 @@ def main():
"stable-diffusion-xl",
"llama2-70b-99",
"llama2-70b-99.9",
"llama2-70b-interactive-99",
"llama2-70b-interactive-99.9",
"mixtral-8x7b",
],
["SingleStream", "MultiStream", "Server", "Offline"],
@@ -209,6 +211,8 @@ def main():
"stable-diffusion-xl": ["Server", "Offline"],
"llama2-70b-99": ["Server", "Offline"],
"llama2-70b-99.9": ["Server", "Offline"],
"llama2-70b-interactive-99": ["Server", "Offline"],
"llama2-70b-interactive-99.9": ["Server", "Offline"],
"mixtral-8x7b": ["Server", "Offline"],
"rgat": ["Offline"],
"llama3.1-405b": ["Offline", "Server"]
114 changes: 95 additions & 19 deletions tools/submission/submission_checker.py
@@ -194,7 +194,6 @@
"ssd-resnet34": "retinanet",
"mobilenet": "resnet",
"resnet50": "resnet",
"llama3_1-405b": "llama3.1-405b"
},
"seeds": {
"qsl_rng_seed": 3066443479025735752,
@@ -266,6 +265,8 @@
"gptj-99.9",
"llama2-70b-99",
"llama2-70b-99.9",
"llama2-70b-interactive-99",
"llama2-70b-interactive-99.9",
"stable-diffusion-xl",
"mixtral-8x7b",
"llama3.1-405b",
@@ -283,6 +284,8 @@
"gptj-99.9": ["Server", "Offline"],
"llama2-70b-99": ["Server", "Offline"],
"llama2-70b-99.9": ["Server", "Offline"],
"llama2-70b-interactive-99": ["Server", "Offline"],
"llama2-70b-interactive-99.9": ["Server", "Offline"],
"stable-diffusion-xl": ["Server", "Offline"],
"mixtral-8x7b": ["Server", "Offline"],
"llama3.1-405b": ["Server", "Offline"],
@@ -314,6 +317,8 @@
"gptj-99.9": ["SingleStream", "Offline", "Server"],
"llama2-70b-99": ["Server", "Offline"],
"llama2-70b-99.9": ["Server", "Offline"],
"llama2-70b-interactive-99": ["Server", "Offline"],
"llama2-70b-interactive-99.9": ["Server", "Offline"],
"stable-diffusion-xl": ["SingleStream", "Offline", "Server"],
"mixtral-8x7b": ["Server", "Offline"],
"llama3.1-405b": ["Server", "Offline"],
@@ -370,6 +375,26 @@
"TOKENS_PER_SAMPLE",
294.45 * 0.9,
),
"llama2-70b-interactive-99": (
"ROUGE1",
44.4312 * 0.99,
"ROUGE2",
22.0352 * 0.99,
"ROUGEL",
28.6162 * 0.99,
"TOKENS_PER_SAMPLE",
294.45 * 0.9,
),
"llama2-70b-interactive-99.9": (
"ROUGE1",
44.4312 * 0.999,
"ROUGE2",
22.0352 * 0.999,
"ROUGEL",
28.6162 * 0.999,
"TOKENS_PER_SAMPLE",
294.45 * 0.9,
),
"stable-diffusion-xl": (
"CLIP_SCORE",
31.68631873,
@@ -409,6 +434,8 @@
),
"llama2-70b-99": ("TOKENS_PER_SAMPLE", 294.45 * 1.1),
"llama2-70b-99.9": ("TOKENS_PER_SAMPLE", 294.45 * 1.1),
"llama2-70b-interactive-99": ("TOKENS_PER_SAMPLE", 294.45 * 1.1),
"llama2-70b-interactive-99.9": ("TOKENS_PER_SAMPLE", 294.45 * 1.1),
"mixtral-8x7b": ("TOKENS_PER_SAMPLE", 145.9 * 1.1),
"llama3.1-405b": ("TOKENS_PER_SAMPLE", 684.68 * 1.1),
},
@@ -428,6 +455,8 @@
"gptj-99.9": 13368,
"llama2-70b-99": 24576,
"llama2-70b-99.9": 24576,
"llama2-70b-interactive-99": 24576,
"llama2-70b-interactive-99.9": 24576,
"stable-diffusion-xl": 5000,
"mixtral-8x7b": 15000,
"llama3.1-405b": 8313,
@@ -439,8 +468,10 @@
# not really needed
"model_mapping": {
# map model names to the official mlperf model class
"ssd-resnet34": "retinanet",
"mobilenet": "resnet",
"resnet50": "resnet",
"llama3_1-405b": "llama3.1-405b",
},
"seeds": {
# TODO: Update random seeds
@@ -459,6 +490,8 @@
"stable-diffusion-xl": {"Server": 20000000000},
"llama2-70b-99": {"Server": 20000000000},
"llama2-70b-99.9": {"Server": 20000000000},
"llama2-70b-interactive-99": {"Server": 20000000000},
"llama2-70b-interactive-99.9": {"Server": 20000000000},
"mixtral-8x7b": {"Server": 20000000000},
"llama3.1-405b": {"Server": 60000000000}
},
@@ -485,6 +518,8 @@
"gptj-99.9": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
"llama2-70b-99": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
"llama2-70b-99.9": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
"llama2-70b-interactive-99": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
"llama2-70b-interactive-99.9": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
"stable-diffusion-xl": {
"SingleStream": 1024,
"Server": 270336,
@@ -578,6 +613,8 @@
"gptj-99.9": 13368,
"llama2-70b-99": 24576,
"llama2-70b-99.9": 24576,
"llama2-70b-interactive-99": 24576,
"llama2-70b-interactive-99.9": 24576,
"stable-diffusion-xl": 5000,
"mixtral-8x7b": 15000,
"llama3.1-405b": 8313,
@@ -645,6 +682,14 @@
"Offline": "result_tokens_per_second",
"Server": "result_completed_tokens_per_second",
},
"llama2-70b-interactive-99": {
"Offline": "result_tokens_per_second",
"Server": "result_completed_tokens_per_second",
},
"llama2-70b-interactive-99.9": {
"Offline": "result_tokens_per_second",
"Server": "result_completed_tokens_per_second",
},
"gptj-99": {
"Offline": "result_inferred_tokens_per_second",
"Server": "result_inferred_completed_tokens_per_second",
Expand All @@ -666,14 +711,20 @@

LLM_LATENCY_LIMITS = {
"llama2-70b-99": {
"conversational": {"ttft": 2000 * 1000000, "tpot": 200 * 1000000}
"ttft": 2000 * 1000000, "tpot": 200 * 1000000
},
"llama2-70b-99.9": {
"conversational": {"ttft": 2000 * 1000000, "tpot": 200 * 1000000}
"ttft": 2000 * 1000000, "tpot": 200 * 1000000
},
"llama2-70b-interactive-99": {
"ttft": 450 * 1000000, "tpot": 40 * 1000000
},
"llama2-70b-interactive-99.9": {
"ttft": 450 * 1000000, "tpot": 40 * 1000000
},
"mixtral-8x7b": {"conversational": {"ttft": 2000 * 1000000, "tpot": 200 * 1000000}},
"mixtral-8x7b": {"ttft": 2000 * 1000000, "tpot": 200 * 1000000},
"llama3.1-405b": {
"conversational": {"ttft": 6000 * 1000000, "tpot": 175 * 1000000}
"ttft": 6000 * 1000000, "tpot": 175 * 1000000
},
}

@@ -956,6 +1007,8 @@ def requires_equal_issue(self, model, division):
"gptj-99.9",
"llama2-70b-99",
"llama2-70b-99.9",
"llama2-70b-interactive-99",
"llama2-70b-interactive-99.9",
"mixtral-8x7b",
"llama3.1-405b",
"rgat",
@@ -1253,25 +1306,29 @@ def extra_check_llm(mlperf_log, scenario, model):
if mlperf_log["requested_use_token_latencies"]:
if scenario == "Offline":
# For offline no further checks are necessary
return None, True
return True
else:
for constraint, limits in LLM_LATENCY_LIMITS[model].items():
if (
mlperf_log["result_first_token_99.00_percentile_latency_ns"]
< limits["ttft"]
and mlperf_log["result_time_per_output_token_99.00_percentile_ns"]
< limits["tpot"]
):
return constraint, True
limits = LLM_LATENCY_LIMITS[model]
if (
mlperf_log["result_first_token_99.00_percentile_latency_ns"]
< limits["ttft"]
and mlperf_log["result_time_per_output_token_99.00_percentile_ns"]
< limits["tpot"]
):
return True
else:
log.error(
f"use_token_latencies flag needs to be enabled for Llama2 benchmark")
return None, False
return False

log.error(
f'Failed Llama2 extra check for TTFT and TPOT. TTFT 99-tile: {mlperf_log["result_first_token_99.00_percentile_latency_ns"]}, TPOT 99-tile: {mlperf_log["result_time_per_output_token_99.00_percentile_ns"]}'
'Failed extra check for TTFT and TPOT. Obtained: TTFT 99-tile: %.4f, TPOT 99-tile: %.4f. Required: TTFT 99-tile: %.4f, TPOT 99-tile: %.4f',
mlperf_log["result_first_token_99.00_percentile_latency_ns"],
mlperf_log["result_time_per_output_token_99.00_percentile_ns"],
limits["ttft"],
limits["tpot"]
)
return None, False
return False


def get_performance_metric(
@@ -1340,9 +1397,11 @@ def check_performance_dir(
)

if model in ["llama2-70b-99", "llama2-70b-99.9",
"llama2-70b-interactive-99", "llama2-70b-interactive-99.9",
"mixtral-8x7b", "llama3.1-405b"]:
llama_constraint, is_valid = extra_check_llm(
llm_is_valid = extra_check_llm(
mlperf_log, scenario_fixed, model)
is_valid = (llm_is_valid and is_valid)

latency_99_percentile = mlperf_log["result_99.00_percentile_latency_ns"]
latency_mean = mlperf_log["result_mean_latency_ns"]
@@ -1874,6 +1933,18 @@ def log_result(
"Offline": "Tokens/s",
"Server": "Tokens/s",
},
"llama2-70b-interactive-99": {
"SingleStream": "Latency (ms)",
"MultiStream": "Latency (ms)",
"Offline": "Tokens/s",
"Server": "Tokens/s",
},
"llama2-70b-interactive-99.9": {
"SingleStream": "Latency (ms)",
"MultiStream": "Latency (ms)",
"Offline": "Tokens/s",
"Server": "Tokens/s",
},
"mixtral-8x7b": {
"SingleStream": "Latency (ms)",
"MultiStream": "Latency (ms)",
@@ -2398,7 +2469,7 @@ def log_result(
perf_path,
scenario_fixed,
division,
system_json,
system_json
)
if is_inferred:
inferred = 1
@@ -2966,6 +3037,8 @@ def check_compliance_dir(
"gptj-99.9",
"llama2-70b-99",
"llama2-70b-99.9",
"llama2-70b-interactive-99",
"llama2-70b-interactive-99.9",
"mixtral-8x7b",
"llama3.1-405b",
"rgat",
@@ -2987,6 +3060,8 @@
"gptj-99.9",
"llama2-70b-99",
"llama2-70b-99.9",
"llama2-70b-interactive-99",
"llama2-70b-interactive-99.9",
"mixtral-8x7b",
"llama3.1-405b",
]:
@@ -2997,6 +3072,7 @@
test_list.remove("TEST04")

if model in ["llama2-70b-99", "llama2-70b-99.9",
"llama2-70b-interactive-99", "llama2-70b-interactive-99.9",
"mixtral-8x7b", "llama3.1-405b"]:
test_list.append("TEST06")

