diff --git a/language/llama3-405b/README.md b/language/llama3-405b/README.md
index b7909d7ea3..c3359fa365 100644
--- a/language/llama3-405b/README.md
+++ b/language/llama3-405b/README.md
@@ -152,17 +152,6 @@ if [ -e ${ACCURACY_LOG_FILE} ]; then
         python evaluate-accuracy.py --checkpoint-path ${CHECKPOINT_PATH} \
                 --mlperf-accuracy-file ${ACCURACY_LOG_FILE} --dataset-file ${DATASET_PATH} --dtype int32
 fi
-
-# Optional: Create a pickled pandas DataFrame that is the original dataset with extra columns with output data from the
-# accuracy run. The following columns will be added:
-# - "gen_output_tok_id": A list of ints representing the tokenized output sequence.
-# - "gen_output_text": A str representing the untokenized output sequence.
-# - "gen_output_tok_len": An int representing the number of output tokens.
-# - "rouge1": The rouge1 score for this sample
-# - "rouge2": The rouge2 score for this sample
-# - "rougeL": The rougeL score for this sample
-# This file will by default be saved to 'full_output.pkl'. You can modify this with --output-pkl-path.
-python consolidate_results.py --dataset-path ${DATASET_PATH} --model-dir ${CHECKPOINT_PATH}
 ```
 
 For the GPU run - The above steps have been automated in `run_accuracy.sh`. You can also modify this script to use
diff --git a/language/llama3-405b/consolidate_results.py b/language/llama3-405b/consolidate_results.py
deleted file mode 100644
index 645b7bc57b..0000000000
--- a/language/llama3-405b/consolidate_results.py
+++ /dev/null
@@ -1,137 +0,0 @@
-import argparse
-import evaluate
-import glob
-import nltk
-import numpy as np
-import os
-import pandas as pd
-import pickle
-
-from pathlib import Path
-from transformers import LlamaTokenizerFast
-from tqdm import tqdm
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--dataset-path",
-        type=str,
-        default=None,
-        help="Path to .pkl generated by processorca.py",
-    )
-    parser.add_argument(
-        "--run-outputs",
-        type=str,
-        default="run_outputs",
-        help="Output dir generated by accuracy run.",
-    )
-    parser.add_argument(
-        "--model-dir",
-        type=str,
-        default=None,
-        help="Path to Llamav2 HuggingFace repo clone",
-    )
-    parser.add_argument(
-        "--output-pkl-path",
-        type=str,
-        default="full_output.pkl",
-        help="Path to dump output to",
-    )
-    args = parser.parse_args()
-    return args
-
-
-def load_dataset(p: os.PathLike):
-    print(f"Loading from {p}...")
-    return pd.read_pickle(p)
-
-
-def load_run_outputs(p: os.PathLike):
-    g = glob.glob(str(Path(p) / "q*.pkl"))
-
-    by_query_idx = dict()
-    for pkl_file in g:
-        print(f"Loading from {pkl_file}...")
-        with open(pkl_file, "rb") as f:
-            d = pickle.load(f)
-        assert len(d["query_ids"]) == len(d["outputs"])
-
-        for i in range(len(d["query_ids"])):
-            qid = d["query_ids"][i]
-            assert qid not in by_query_idx
-            by_query_idx[qid] = d["outputs"][i]
-
-    return by_query_idx
-
-
-def main(args):
-    # Set up decode and evaluation objects
-    tokenizer = LlamaTokenizerFast.from_pretrained(args.model_dir)
-    metric = evaluate.load("rouge")
-    nltk.download("punkt")
-
-    # Load Data
-    df = load_dataset(args.dataset_path)
-    run_outputs = load_run_outputs(args.run_outputs)
-    assert len(run_outputs) == 24576
-
-    # Set up columns to add
-    output_tok_ids_col = [None] * 24576
-    output_text_col = [None] * 24576
-    output_lens = [None] * 24576
-
-    # Process data
-    no_eos_ids = []
-    for qid, output in tqdm(run_outputs.items()):
-        L = list(output)
-        # Prune trailing 2s (EOS token)
-        try:
-            first2 = L.index(2)
-            L = L[:first2]
-        except ValueError:
-            # Do nothing
-            no_eos_ids.append(qid)
-
-        assert L[-1] != 2
-        output_tok_ids_col[qid] = L
-        output_lens[qid] = len(L)
-
-        # Decode tokens
-        output_text_col[qid] = tokenizer.decode(
-            output_tok_ids_col[qid], skip_special_tokens=True
-        )
-    print(f"Found {len(no_eos_ids)} samples with no EOS token")
-
-    print("Calculating rouge scores...")
-    def _preproc(s): return "\n".join(nltk.sent_tokenize(s.strip()))
-    preds = list(map(_preproc, output_text_col))
-    targets = list(map(_preproc, list(df["output"])))
-    rouge_scores = metric.compute(
-        predictions=preds, references=targets, use_stemmer=True, use_aggregator=False
-    )
-
-    assert len(rouge_scores["rouge1"]) == 24576
-    assert len(rouge_scores["rouge2"]) == 24576
-    assert len(rouge_scores["rougeL"]) == 24576
-
-    agg = {k: round(np.mean(v) * 100, 4) for k, v in rouge_scores.items()}
-    print(agg)
-    print("Avg output seqlen:", np.mean(output_lens))
-
-    # Set columns
-    df["gen_output_tok_id"] = output_tok_ids_col
-    df["gen_output_text"] = output_text_col
-    df["gen_output_tok_len"] = output_lens
-    df["rouge1"] = rouge_scores["rouge1"]
-    df["rouge2"] = rouge_scores["rouge2"]
-    df["rougeL"] = rouge_scores["rougeL"]
-
-    p = Path(args.output_pkl_path)
-    p.parent.mkdir(exist_ok=True)
-    df.to_pickle(p)
-    print(f"Dumped to {p}")
-
-
-if __name__ == "__main__":
-    main(get_args())
diff --git a/language/llama3-405b/dataset.py b/language/llama3-405b/dataset.py
index 3c24619151..ff95557c7b 100644
--- a/language/llama3-405b/dataset.py
+++ b/language/llama3-405b/dataset.py
@@ -80,16 +80,6 @@ def postProcess(
         output_seq = out_tokens
         assert len(query_id_list) == len(output_seq)
 
-        # Save outputs
-        if not os.path.exists("run_outputs"):
-            os.makedirs("run_outputs")
-        fname = "q" + "_".join([str(i) for i in query_id_list])
-        fname = f"run_outputs/{fname}.pkl"
-        with open(fname, mode="wb") as f:
-            d = {"query_ids": query_id_list, "outputs": output_seq}
-            log.info(f"Saving outputs to {fname}")
-            pickle.dump(d, f)
-
         return np.asarray(output_seq, dtype=np.int32)
 
     def LoadSamplesToRam(self, sample_list):
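Follow-up note (not part of the patch): since `dataset.py` no longer dumps `run_outputs/q*.pkl` files, `consolidate_results.py` had nothing left to consume, and `evaluate-accuracy.py` remains the single accuracy entry point. Anyone who still wants the per-sample ROUGE numbers the deleted script produced can recover the core of its logic with the sketch below. This is a minimal, hypothetical reconstruction, not repo code: `preds` and `refs` are stand-in names for equal-length lists of decoded output and reference strings, and it uses only the `evaluate` and `nltk` calls the deleted script itself made.

```python
import evaluate
import nltk

nltk.download("punkt")
metric = evaluate.load("rouge")


def _preproc(s):
    # Same normalization as the deleted script: one sentence per line.
    return "\n".join(nltk.sent_tokenize(s.strip()))


def per_sample_rouge(preds, refs):
    # use_aggregator=False yields one score per sample (lists, not means),
    # which is what consolidate_results.py stored as DataFrame columns.
    return metric.compute(
        predictions=[_preproc(p) for p in preds],
        references=[_preproc(r) for r in refs],
        use_stemmer=True,
        use_aggregator=False,
    )


if __name__ == "__main__":
    # Toy usage: scores["rouge1"], scores["rouge2"], scores["rougeL"]
    # are per-sample lists aligned with the inputs.
    scores = per_sample_rouge(["The cat sat."], ["A cat was sitting."])
    print(scores)
```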