diff --git a/language/llama3-405b/README.md b/language/llama3-405b/README.md
index b7909d7ea3..c3359fa365 100644
--- a/language/llama3-405b/README.md
+++ b/language/llama3-405b/README.md
@@ -152,17 +152,6 @@ if [ -e ${ACCURACY_LOG_FILE} ]; then
         python evaluate-accuracy.py --checkpoint-path ${CHECKPOINT_PATH} \
                 --mlperf-accuracy-file ${ACCURACY_LOG_FILE} --dataset-file ${DATASET_PATH} --dtype int32
 fi
-
-# Optional: Create a pickled pandas DataFrame that is the original dataset with extra columns with output data from the
-# accuracy run. The following columns will be added:
-# - "gen_output_tok_id": A list of ints representing the tokenized output sequence.
-# - "gen_output_text": A str representing the untokenized output sequence.
-# - "gen_output_tok_len": An int representing the number of output tokens.
-# - "rouge1": The rouge1 score for this sample
-# - "rouge2": The rouge2 score for this sample
-# - "rougeL": The rougeL score for this sample
-# This file will by default be saved to 'full_output.pkl'. You can modify this with --output-pkl-path.
-python consolidate_results.py --dataset-path ${DATASET_PATH} --model-dir ${CHECKPOINT_PATH}
 ```
 
 For the GPU run - The above steps have been automated in `run_accuracy.sh`. You can also modify this script to use
diff --git a/language/llama3-405b/consolidate_results.py b/language/llama3-405b/consolidate_results.py
deleted file mode 100644
index 645b7bc57b..0000000000
--- a/language/llama3-405b/consolidate_results.py
+++ /dev/null
@@ -1,137 +0,0 @@
-import argparse
-import evaluate
-import glob
-import nltk
-import numpy as np
-import os
-import pandas as pd
-import pickle
-
-from pathlib import Path
-from transformers import LlamaTokenizerFast
-from tqdm import tqdm
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--dataset-path",
-        type=str,
-        default=None,
-        help="Path to .pkl generated by processorca.py",
-    )
-    parser.add_argument(
-        "--run-outputs",
-        type=str,
-        default="run_outputs",
-        help="Output dir generated by accuracy run.",
-    )
-    parser.add_argument(
-        "--model-dir",
-        type=str,
-        default=None,
-        help="Path to Llamav2 HuggingFace repo clone",
-    )
-    parser.add_argument(
-        "--output-pkl-path",
-        type=str,
-        default="full_output.pkl",
-        help="Path to dump output to",
-    )
-    args = parser.parse_args()
-    return args
-
-
-def load_dataset(p: os.PathLike):
-    print(f"Loading from {p}...")
-    return pd.read_pickle(p)
-
-
-def load_run_outputs(p: os.PathLike):
-    g = glob.glob(str(Path(p) / "q*.pkl"))
-
-    by_query_idx = dict()
-    for pkl_file in g:
-        print(f"Loading from {pkl_file}...")
-        with open(pkl_file, "rb") as f:
-            d = pickle.load(f)
-        assert len(d["query_ids"]) == len(d["outputs"])
-
-        for i in range(len(d["query_ids"])):
-            qid = d["query_ids"][i]
-            assert qid not in by_query_idx
-            by_query_idx[qid] = d["outputs"][i]
-
-    return by_query_idx
-
-
-def main(args):
-    # Set up decode and evaluation objects
-    tokenizer = LlamaTokenizerFast.from_pretrained(args.model_dir)
-    metric = evaluate.load("rouge")
-    nltk.download("punkt")
-
-    # Load Data
-    df = load_dataset(args.dataset_path)
-    run_outputs = load_run_outputs(args.run_outputs)
-    assert len(run_outputs) == 24576
-
-    # Set up columns to add
-    output_tok_ids_col = [None] * 24576
-    output_text_col = [None] * 24576
-    output_lens = [None] * 24576
-
-    # Process data
-    no_eos_ids = []
-    for qid, output in tqdm(run_outputs.items()):
-        L = list(output)
-        # Prune trailing 2s (EOS token)
-        try:
-            first2 = L.index(2)
-            L = L[:first2]
-        except ValueError:
-            # Do nothing
-            no_eos_ids.append(qid)
-
-        assert L[-1] != 2
-        output_tok_ids_col[qid] = L
-        output_lens[qid] = len(L)
-
-        # Decode tokens
-        output_text_col[qid] = tokenizer.decode(
-            output_tok_ids_col[qid], skip_special_tokens=True
-        )
-    print(f"Found {len(no_eos_ids)} samples with no EOS token")
-
-    print("Calculating rouge scores...")
-    def _preproc(s): return "\n".join(nltk.sent_tokenize(s.strip()))
-    preds = list(map(_preproc, output_text_col))
-    targets = list(map(_preproc, list(df["output"])))
-    rouge_scores = metric.compute(
-        predictions=preds, references=targets, use_stemmer=True, use_aggregator=False
-    )
-
-    assert len(rouge_scores["rouge1"]) == 24576
-    assert len(rouge_scores["rouge2"]) == 24576
-    assert len(rouge_scores["rougeL"]) == 24576
-
-    agg = {k: round(np.mean(v) * 100, 4) for k, v in rouge_scores.items()}
-    print(agg)
-    print("Avg output seqlen:", np.mean(output_lens))
-
-    # Set columns
-    df["gen_output_tok_id"] = output_tok_ids_col
-    df["gen_output_text"] = output_text_col
-    df["gen_output_tok_len"] = output_lens
-    df["rouge1"] = rouge_scores["rouge1"]
-    df["rouge2"] = rouge_scores["rouge2"]
-    df["rougeL"] = rouge_scores["rougeL"]
-
-    p = Path(args.output_pkl_path)
-    p.parent.mkdir(exist_ok=True)
-    df.to_pickle(p)
-    print(f"Dumped to {p}")
-
-
-if __name__ == "__main__":
-    main(get_args())
diff --git a/language/llama3-405b/dataset.py b/language/llama3-405b/dataset.py
index 3c24619151..ff95557c7b 100644
--- a/language/llama3-405b/dataset.py
+++ b/language/llama3-405b/dataset.py
@@ -80,16 +80,6 @@ def postProcess(
         output_seq = out_tokens
         assert len(query_id_list) == len(output_seq)
 
-        # Save outputs
-        if not os.path.exists("run_outputs"):
-            os.makedirs("run_outputs")
-        fname = "q" + "_".join([str(i) for i in query_id_list])
-        fname = f"run_outputs/{fname}.pkl"
-        with open(fname, mode="wb") as f:
-            d = {"query_ids": query_id_list, "outputs": output_seq}
-            log.info(f"Saving outputs to {fname}")
-            pickle.dump(d, f)
-
         return np.asarray(output_seq, dtype=np.int32)
 
     def LoadSamplesToRam(self, sample_list):
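Follow-up note (not part of the patch): since `dataset.py` no longer dumps `run_outputs/q*.pkl` files, `consolidate_results.py` had nothing left to consume, and `evaluate-accuracy.py` remains the single accuracy entry point. Anyone who still wants the per-sample ROUGE numbers the deleted script produced can recover the core of its logic with the sketch below. This is a minimal, hypothetical reconstruction, not repo code: `preds` and `refs` are stand-in names for equal-length lists of decoded output and reference strings, and it uses only the `evaluate` and `nltk` calls the deleted script itself made.

```python
import evaluate
import nltk

nltk.download("punkt")
metric = evaluate.load("rouge")


def _preproc(s):
    # Same normalization as the deleted script: one sentence per line.
    return "\n".join(nltk.sent_tokenize(s.strip()))


def per_sample_rouge(preds, refs):
    # use_aggregator=False yields one score per sample (lists, not means),
    # which is what consolidate_results.py stored as DataFrame columns.
    return metric.compute(
        predictions=[_preproc(p) for p in preds],
        references=[_preproc(r) for r in refs],
        use_stemmer=True,
        use_aggregator=False,
    )


if __name__ == "__main__":
    # Toy usage: scores["rouge1"], scores["rouge2"], scores["rougeL"]
    # are per-sample lists aligned with the inputs.
    scores = per_sample_rouge(["The cat sat."], ["A cat was sitting."])
    print(scores)
```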