Skip to content

Commit

Permalink
Introduce data mixing recipe yaml files
Browse files Browse the repository at this point in the history
This introduces Recipe yaml files, which are used both as an input
into the data mixing process and as an output of the process.

As an input, we have some default recipe files that specify any
precomputed datasets that should be mixed with data from new skills
when generating the overall mix of samples that will be sent to the
training process.

If a downstream user/packager wants to add default recipes (and
datasets), they should install them to a path like
`/usr/share/instructlab/sdg` (varies by platform, uses Python's
`platformdirs.PlatformDirs` to respect platform conventions).

Recipes should be in sdg/default_data_recipes/{knowledge,skills}.yaml

Datasets should be in sdg/datasets but this location is not enforced.

Currently we are not shipping any default recipe files in the upstream,
but there is a unit test in place to ensure the functionality to load
default recipes from disk works once we decide how we want to ship a
precomputed dataset to our upstream users.

As an output of the data generation process, we write recipe yamls to
document which datasets were mixed together and in what proportions
along with the system prompt that was used during the
generation. Here's an example of a recipe yaml put into the output
directory after running data generation:

```yaml
datasets:
- path: node_datasets_2024-07-25T17_49_46/knowledge_tonsils_overview_e2e-tonsils_p10.jsonl
  sampling_size: 1.0
metadata:
  sys_prompt: "I am, Red Hat\xAE Instruct Model based on Granite 7B, an AI language\
    \ model developed by Red Hat and IBM Research, based on the Granite-7b-base language\
    \ model. My primary function is to be a chat assistant."
```

Datasets may be referenced by relative paths, which are relative to the
recipe's own directory. Or, they may use absolute filesystem paths.

Anything written out under the metadata section (currently just
sys_prompt) is purely informational for the user and ignored when
loading recipes.

Parts of this are extracted and rebased from
aakankshaduggal#4
aakankshaduggal#20

Refs instructlab#162, instructlab#171, instructlab#185, instructlab#201.

Co-authored-by: shivchander <[email protected]>
Co-authored-by: Khaled Sulayman <[email protected]>
Co-authored-by: abhi1092 <[email protected]>
Co-authored-by: Aakanksha Duggal <[email protected]>
Co-authored-by: Mark McLoughlin <[email protected]>
Signed-off-by: Ben Browning <[email protected]>
  • Loading branch information
6 people committed Jul 25, 2024
1 parent afadfd5 commit fcfacfc
Show file tree
Hide file tree
Showing 7 changed files with 227 additions and 34 deletions.
129 changes: 97 additions & 32 deletions src/instructlab/sdg/datamixing.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

# Third Party
from datasets import Dataset, concatenate_datasets, load_dataset
import yaml

# First Party
from instructlab.sdg.logger_config import setup_logger
Expand All @@ -27,18 +28,12 @@ def _adjust_train_sample_size(ds: Dataset, num_samples: int):
return pandas.dataset_from_pandas_dataframe(df)


def _load_ds(path, sampling_size, num_proc):
def _sample_ds(dataset, sampling_size, num_proc):
"""
Load a dataset from the given file path and select sampling_size
number/ratio of samples from it, ensuring the loaded dataset has only
ALLOWED_COLS columns in it with any additional columns moved to the
metadata section.
Select sampling_size number/ratio of samples from a dataset, ensuring
the returned dataset has only ALLOWED_COLS columns in it with any
additional columns moved to the metadata section.
"""
logger.info(f"Loading dataset from {path} ...")
dataset = load_dataset("json", data_files=path, split="train")
logger.info(f"Dataset columns: {dataset.column_names}")
logger.info(f"Dataset loaded with {len(dataset)} samples")

if sampling_size != 1.0:
if isinstance(sampling_size, int):
num_samples = sampling_size
Expand Down Expand Up @@ -94,29 +89,63 @@ class Recipe:
"""

def __init__(
self, initial_datasets: Optional[list] = None, sys_prompt: Optional[str] = ""
self, recipe_path: Optional[str] = None, sys_prompt: Optional[str] = ""
):
self.recipe = {
"datasets": initial_datasets or [],
"sys_prompt": sys_prompt,
}
self.sys_prompt = self.recipe.get("sys_prompt", "")
self.recipe_path = recipe_path or ""
self.sys_prompt = sys_prompt

# Defaults if no recipe path given or these values don't
# exist in the given recipe file
self.datasets = []
if recipe_path is not None:
recipe = self._load_recipe()
if "datasets" in recipe:
self.datasets = recipe["datasets"]

self.dataset_added = False

def _load_recipe(self):
with open(self.recipe_path, encoding="utf-8") as fp:
return yaml.safe_load(fp)

def _load_ds(self, path):
"""
Load a dataset from the given location. If a jsonl file is
given, we load the dataset from disk. Otherwise, we load the
path given from HuggingFace. Relative paths are resolved
respective to the directory the recipe yaml itself resides in.
"""
if not os.path.isabs(path):
path = os.path.join(os.path.dirname(self.recipe_path), path)
logger.info(f"Loading dataset from {path} ...")
dataset = load_dataset("json", data_files=path, split="train")
logger.info(f"Dataset columns: {dataset.column_names}")
logger.info(f"Dataset loaded with {len(dataset)} samples")
return dataset

def _load_and_sample_datasets(self, num_proc):
"""
Load and sample all the datasets in this recipe, taking
into account the desired sampling size from each individual
dataset to control the overall mix of samples in the final
dataset.
"""
return [
_sample_ds(
self._load_ds(dataset["path"]), dataset["sampling_size"], num_proc
)
for dataset in self.datasets
]

def _create_mixed_dataset(self, num_proc):
"""
Create the mixed dataset from its list of included datasets, taking
into account the desired sampling size from each individual dataset
to control the overall mix of samples in the final dataset.
Create the final mixed dataset by loading, sampling, and
concatenating all datasets in this recipe
"""
if not self.dataset_added:
logger.error("No dataset added to the recipe")

mixed_ds = [
_load_ds(dataset["path"], dataset["sampling_size"], num_proc)
for dataset in self.recipe["datasets"]
]

mixed_ds = self._load_and_sample_datasets(num_proc)
mixed_ds = concatenate_datasets(mixed_ds)
mixed_ds = mixed_ds.map(
_add_system_message,
Expand All @@ -143,7 +172,20 @@ def add_dataset(self, path, sampling_size):
of the samples, and so on.
"""
self.dataset_added = True
self.recipe["datasets"].append({"path": path, "sampling_size": sampling_size})
self.datasets.append({"path": path, "sampling_size": sampling_size})

def save_recipe(self, output_path):
recipe = {
"datasets": self.datasets,
"metadata": {"sys_prompt": self.sys_prompt},
}
with open(output_path, "w", encoding="utf-8") as fp:
yaml.dump(recipe, fp)
# Update this instance's recipe_path to reflect the path we
# just saved it to so that any subsequent loading of datasets
# (like via save_mixed_dataset) pulls from relative to the
# saved recipe_path.
self.recipe_path = output_path

def save_mixed_dataset(self, output_path, num_proc):
"""
Expand Down Expand Up @@ -398,18 +440,34 @@ class DataMixer:
# once.
NUM_SYNTH_SKILLS = 30

def __init__(self, output_dir, date_suffix, sys_prompt, num_procs):
def __init__(self, data_dirs, output_dir, date_suffix, sys_prompt, num_procs):
self.data_dirs = data_dirs
self.output_dir = output_dir
self.sys_prompt = sys_prompt
self.date_suffix = date_suffix
self.num_procs = num_procs

self.knowledge_recipe = Recipe(sys_prompt=self.sys_prompt)
self.skills_recipe = Recipe(sys_prompt=self.sys_prompt)
self.knowledge_recipe = self._load_default_recipe("knowledge.yaml")
self.skills_recipe = self._load_default_recipe("skills.yaml")

self.output_file_knowledge_recipe = f"knowledge_recipe_{date_suffix}.yaml"
self.output_file_skills_recipe = f"skills_recipe_{date_suffix}.yaml"
self.output_file_mixed_knowledge = f"knowledge_train_msgs_{date_suffix}.jsonl"
self.output_file_mixed_skills = f"skills_train_msgs_{date_suffix}.jsonl"

def _load_default_recipe(self, yaml_basename):
"""
Load a default system recipe from e.g. /usr/share/instructlab/sdg/default_data_recipes
if it exists, otherwise return an empty recipe.
"""
for d in self.data_dirs:
default_recipe_path = os.path.join(d, "default_data_recipes", yaml_basename)
if os.path.exists(default_recipe_path):
return Recipe(
recipe_path=default_recipe_path, sys_prompt=self.sys_prompt
)
return Recipe(sys_prompt=self.sys_prompt)

def _gen_leaf_node_data(
self, leaf_node_data, recipe, output_file_leaf_node, sampling_size=1.0
):
Expand All @@ -420,7 +478,7 @@ def _gen_leaf_node_data(
"""
output_file = os.path.join(self.output_dir, output_file_leaf_node)
leaf_node_data.to_json(output_file, orient="records", lines=True)
recipe.add_dataset(output_file, sampling_size)
recipe.add_dataset(output_file_leaf_node, sampling_size)

def collect(self, leaf_node_path, new_generated_data, is_knowledge):
if is_knowledge:
Expand Down Expand Up @@ -459,20 +517,27 @@ def collect(self, leaf_node_path, new_generated_data, is_knowledge):
sampling_size=self.NUM_SYNTH_SKILLS,
)

def _gen_mixed_data(self, recipe, output_file_mixed):
def _gen_mixed_data(self, recipe, output_file_recipe, output_file_data):
"""
Mix the generated leaf node data into a single dataset and write it to
disk. The heavy lifting is delegated to the Recipe class.
"""
if recipe.dataset_added:
full_recipe_path = os.path.join(self.output_dir, output_file_recipe)
recipe.save_recipe(full_recipe_path)
recipe.save_mixed_dataset(
os.path.join(self.output_dir, output_file_mixed),
os.path.join(self.output_dir, output_file_data),
self.num_procs,
)

def generate(self):
self._gen_mixed_data(self.knowledge_recipe, self.output_file_mixed_knowledge)
self._gen_mixed_data(
self.knowledge_recipe,
self.output_file_knowledge_recipe,
self.output_file_mixed_knowledge,
)
self._gen_mixed_data(
self.skills_recipe,
self.output_file_skills_recipe,
self.output_file_mixed_skills,
)
16 changes: 15 additions & 1 deletion src/instructlab/sdg/generate_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,20 @@ def load_pipeline(yaml_basename):
)


def _mixer_init(ctx, output_dir, date_suffix):
pd = platformdirs.PlatformDirs(
appname=os.path.join("instructlab", "sdg"), multipath=True
)
data_dirs = list(pd.iter_data_dirs())
return DataMixer(
data_dirs,
output_dir,
date_suffix,
_SYS_PROMPT,
ctx.dataset_num_procs,
)


# This is part of the public API, and used by instructlab.
# TODO - parameter removal needs to be done in sync with a CLI change.
# pylint: disable=unused-argument
Expand Down Expand Up @@ -341,7 +355,7 @@ def generate_data(

mmlu_bench_pipe = mmlubench_pipe_init(ctx)

mixer = DataMixer(output_dir, date_suffix, _SYS_PROMPT, ctx.dataset_num_procs)
mixer = _mixer_init(ctx, output_dir, date_suffix)

if console_output:
logger.info(
Expand Down
105 changes: 104 additions & 1 deletion tests/test_datamixing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,31 @@
Unit tests for the top-level datamixing module.
"""

# Standard
from importlib import resources
from unittest.mock import patch
import os

# Third Party
from datasets import Dataset

# First Party
from instructlab.sdg.datamixing import _add_extra_contexts_to_samples
from instructlab.sdg.datamixing import DataMixer, Recipe, _add_extra_contexts_to_samples

# We mock out the actual things that use num_procs anyway, but just
# for a consistent value in the tests...
TEST_NUM_PROCS = 4
TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "testdata")
TEST_RECIPE_PATH = os.path.join(TEST_DATA_DIR, "relative_path_recipe.yaml")
TEST_SAMPLES_ABS_PATH = os.path.join(TEST_DATA_DIR, "datasets/samples.jsonl")


def _empty_recipe(self):
return {}


def _noop_sample(dataset, _sampling_size, _num_procs):
return dataset


def _fake_context(msg_id):
Expand All @@ -18,6 +38,89 @@ def _fake_context(msg_id):
}


def test_datamixer_can_load_default_recipes():
"""
Test that DataMixer can load default recipe files by pointing
it at a simple set of test recipe files under the testdata/
directory.
"""
date_suffix = "2024-07-25T15_52_10"
prompt = "You are a useful AI assistant."
mixer = DataMixer(
[TEST_DATA_DIR], TEST_DATA_DIR, date_suffix, prompt, TEST_NUM_PROCS
)
assert mixer.knowledge_recipe.datasets[0]["path"] == "test/knowledge.jsonl"
assert mixer.skills_recipe.datasets[0]["path"] == "test/skills.jsonl"


def test_recipe_init_with_empty_params_adds_dataset():
"""
Test that an empty-initialized recipe can add datasets
"""
recipe = Recipe()
recipe.add_dataset("testdata/datasets/samples.jsonl", 1.0)
assert recipe.dataset_added


def test_recipe_init_with_empty_params_loads_abs_dataset():
"""
Test that an empty-initialized recipe can load datasets from
absolute file paths.
"""
recipe = Recipe()
dataset = recipe._load_ds(TEST_SAMPLES_ABS_PATH)
assert dataset is not None


def test_recipe_init_with_empty_params_loads_rel_dataset():
"""
Test that an empty-initialized recipe looks for dataset files relative
to the current working directory (as opposed to blowing up because of
no recipe_path given).
"""
recipe = Recipe()
rel_path = os.path.relpath(TEST_SAMPLES_ABS_PATH)
dataset = recipe._load_ds(rel_path)
assert dataset is not None


@patch.object(Recipe, "_load_recipe", _empty_recipe)
def test_init_with_empty_recipe_files():
"""
Test that we can initialize a Recipe that points to a recipe
file that does not contain one or more of our expected keys, and
that instead of blowing up (like with a KeyError) we just use sane
defaults.
"""
recipe = Recipe(recipe_path=TEST_RECIPE_PATH)
assert len(recipe.datasets) == 0
assert recipe.sys_prompt == ""


@patch("instructlab.sdg.datamixing._sample_ds", _noop_sample)
def test_load_ds_with_relative_jsonl_path():
"""
Test that the _load_ds function can load from datasets from jsonl
files referenced with a path relative to the recipe file
"""
recipe = Recipe(recipe_path=TEST_RECIPE_PATH)
dataset = recipe._load_and_sample_datasets(TEST_NUM_PROCS)
assert dataset is not None


@patch("instructlab.sdg.datamixing._sample_ds", _noop_sample)
def test_load_ds_with_absolute_jsonl_path():
"""
Test that the _load_ds function can load from datasets from jsonl
files referenced with an absolute dataset path
"""
recipe = Recipe(recipe_path=TEST_RECIPE_PATH)
# Patch an absolute path into our recipe before loading it
recipe.datasets[0]["path"] = TEST_SAMPLES_ABS_PATH
dataset = recipe._load_and_sample_datasets(TEST_NUM_PROCS)
assert dataset is not None


def test_add_extra_contexts_to_samples_with_one_sample():
"""
Test _add_extra_contexts_to_samples doesn't error out when
Expand Down
1 change: 1 addition & 0 deletions tests/testdata/datasets/samples.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"id": "abc123", "messages": [], "metadata": {}}
3 changes: 3 additions & 0 deletions tests/testdata/default_data_recipes/knowledge.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
datasets:
- path: test/knowledge.jsonl
sampling_size: 1.0
3 changes: 3 additions & 0 deletions tests/testdata/default_data_recipes/skills.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
datasets:
- path: test/skills.jsonl
sampling_size: 1.0
4 changes: 4 additions & 0 deletions tests/testdata/relative_path_recipe.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
datasets:
- path: datasets/samples.jsonl
sampling_size: 1.0
sys_prompt: I am a reliable AI assistant.

0 comments on commit fcfacfc

Please sign in to comment.