From 01b8112ab5ffa46b271193f508ee965164421757 Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Fri, 21 Apr 2023 02:30:25 +0800 Subject: [PATCH 01/34] ignore .DS_Store --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 31afbff..ac8a99b 100644 --- a/.gitignore +++ b/.gitignore @@ -129,3 +129,4 @@ dmypy.json # Pyre type checker .pyre/ +.DS_Store \ No newline at end of file From fe320037e1668ec5071707fcd50bceee82f8b8ce Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Fri, 21 Apr 2023 02:31:05 +0800 Subject: [PATCH 02/34] data stat generator --- data/data_stat.py | 342 +++++++++++++++++++++++++++++++++++++++++++ scripts/data_stat.sh | 5 + 2 files changed, 347 insertions(+) create mode 100644 data/data_stat.py create mode 100644 scripts/data_stat.sh diff --git a/data/data_stat.py b/data/data_stat.py new file mode 100644 index 0000000..0bc2d8e --- /dev/null +++ b/data/data_stat.py @@ -0,0 +1,342 @@ +import json +import datasets +import argparse + +# huggingface dataset signature with configs +SERIES_A_DATASET_NAME_DICT = { + "udhr": None, + "AmazonScience/mintaka": None, + "xcsr": [ + 'X-CSQA-en', + 'X-CSQA-zh', + 'X-CSQA-de', + 'X-CSQA-es', + 'X-CSQA-fr', + 'X-CSQA-it', + 'X-CSQA-jap', + 'X-CSQA-nl', + 'X-CSQA-pl', + 'X-CSQA-pt', + 'X-CSQA-ru', + 'X-CSQA-ar', + 'X-CSQA-vi', + 'X-CSQA-hi', + 'X-CSQA-sw', + 'X-CSQA-ur', + 'X-CODAH-en', + 'X-CODAH-zh', + 'X-CODAH-de', + 'X-CODAH-es', + 'X-CODAH-fr', + 'X-CODAH-it', + 'X-CODAH-jap', + 'X-CODAH-nl', + 'X-CODAH-pl', + 'X-CODAH-pt', + 'X-CODAH-ru', + 'X-CODAH-ar', + 'X-CODAH-vi', + 'X-CODAH-hi', + 'X-CODAH-sw', + 'X-CODAH-ur' + ], + "shmuhammad/AfriSenti-twitter-sentiment": [ + 'amh', + 'hau', + 'ibo', + 'arq', + 'ary', + 'yor', + 'por', + 'twi', + 'tso', + 'tir', + 'pcm', + 'kin', + 'swa' + ], # orm is not workin + "indonlp/NusaX-senti": [ + 'ace', + 'ban', + 'bjn', + 'bug', + 'eng', + 'ind', + 'jav', + 'mad', + 'min', + 'nij', + 'sun', + 'bbc' + ], + "sbmaruf/forai_ml-masakhane-news": [ + 'amh', + 'eng', + 'fra', + 'hau', + 'ibo', + 'lin', + 'lug', + 'orm', + 'pcm', + 'run', + 'sna', + 'som', + 'swa', + 'tir', + 'xho', + 'yor' + ], + "papluca/language-identification": [ + 'wikipedia-zero-shot', + 'wikipedia-zero-shot.af', + 'wikipedia-zero-shot.ar', + 'wikipedia-zero-shot.be', + 'wikipedia-zero-shot.bg', + 'wikipedia-zero-shot.bn', + 'wikipedia-zero-shot.ca', + 'wikipedia-zero-shot.cs', + 'wikipedia-zero-shot.da', + 'wikipedia-zero-shot.de', + 'wikipedia-zero-shot.el', + 'wikipedia-zero-shot.en', + 'wikipedia-zero-shot.es', + 'wikipedia-zero-shot.fa', + 'wikipedia-zero-shot.fi', + 'wikipedia-zero-shot.fr', + 'wikipedia-zero-shot.he', + 'wikipedia-zero-shot.hi', + 'wikipedia-zero-shot.hu', + 'wikipedia-zero-shot.id', + 'wikipedia-zero-shot.it', + 'wikipedia-zero-shot.ja', + 'wikipedia-zero-shot.ko', + 'wikipedia-zero-shot.ml', + 'wikipedia-zero-shot.mr', + 'wikipedia-zero-shot.ms', + 'wikipedia-zero-shot.nl', + 'wikipedia-zero-shot.no', + 'wikipedia-zero-shot.pl', + 'wikipedia-zero-shot.pt', + 'wikipedia-zero-shot.ro', + 'wikipedia-zero-shot.ru', + 'wikipedia-zero-shot.si', + 'wikipedia-zero-shot.sk', + 'wikipedia-zero-shot.sl', + 'wikipedia-zero-shot.sr', + 'wikipedia-zero-shot.sv', + 'wikipedia-zero-shot.sw', + 'wikipedia-zero-shot.ta', + 'wikipedia-zero-shot.te', + 'wikipedia-zero-shot.th', + 'wikipedia-zero-shot.tr', + 'wikipedia-zero-shot.uk', + 'wikipedia-zero-shot.vi', + 'wikipedia-zero-shot.zh', + 'wikinews-zero-shot', + 'wikinews-zero-shot.ar', + 'wikinews-zero-shot.cs', + 
'wikinews-zero-shot.de', + 'wikinews-zero-shot.en', + 'wikinews-zero-shot.es', + 'wikinews-zero-shot.fi', + 'wikinews-zero-shot.fr', + 'wikinews-zero-shot.it', + 'wikinews-zero-shot.ja', + 'wikinews-zero-shot.ko', + 'wikinews-zero-shot.nl', + 'wikinews-zero-shot.no', + 'wikinews-zero-shot.pl', + 'wikinews-zero-shot.pt', + 'wikinews-zero-shot.ru', + 'wikinews-zero-shot.sr', + 'wikinews-zero-shot.sv', + 'wikinews-zero-shot.ta', + 'wikinews-zero-shot.tr', + 'wikinews-zero-shot.uk', + 'wikinews-zero-shot.zh', + 'wikinews-cross-domain', + 'wikinews-cross-domain.ar', + 'wikinews-cross-domain.bg', + 'wikinews-cross-domain.ca', + 'wikinews-cross-domain.cs', + 'wikinews-cross-domain.de', + 'wikinews-cross-domain.el', + 'wikinews-cross-domain.en', + 'wikinews-cross-domain.es', + 'wikinews-cross-domain.fi', + 'wikinews-cross-domain.fr', + 'wikinews-cross-domain.he', + 'wikinews-cross-domain.hu', + 'wikinews-cross-domain.it', + 'wikinews-cross-domain.ja', + 'wikinews-cross-domain.ko', + 'wikinews-cross-domain.nl', + 'wikinews-cross-domain.no', + 'wikinews-cross-domain.pl', + 'wikinews-cross-domain.pt', + 'wikinews-cross-domain.ro', + 'wikinews-cross-domain.ru', + 'wikinews-cross-domain.sr', + 'wikinews-cross-domain.sv', + 'wikinews-cross-domain.ta', + 'wikinews-cross-domain.tr', + 'wikinews-cross-domain.uk', + 'wikinews-cross-domain.zh' + ], + "adithya7/xlel_wd": [ + 'wikipedia-zero-shot', + 'wikipedia-zero-shot.af', + 'wikipedia-zero-shot.ar', + 'wikipedia-zero-shot.be', + 'wikipedia-zero-shot.bg', + 'wikipedia-zero-shot.bn', + 'wikipedia-zero-shot.ca', + 'wikipedia-zero-shot.cs', + 'wikipedia-zero-shot.da', + 'wikipedia-zero-shot.de', + 'wikipedia-zero-shot.el', + 'wikipedia-zero-shot.en', + 'wikipedia-zero-shot.es', + 'wikipedia-zero-shot.fa', + 'wikipedia-zero-shot.fi', + 'wikipedia-zero-shot.fr', + 'wikipedia-zero-shot.he', + 'wikipedia-zero-shot.hi', + 'wikipedia-zero-shot.hu', + 'wikipedia-zero-shot.id', + 'wikipedia-zero-shot.it', + 'wikipedia-zero-shot.ja', + 'wikipedia-zero-shot.ko', + 'wikipedia-zero-shot.ml', + 'wikipedia-zero-shot.mr', + 'wikipedia-zero-shot.ms', + 'wikipedia-zero-shot.nl', + 'wikipedia-zero-shot.no', + 'wikipedia-zero-shot.pl', + 'wikipedia-zero-shot.pt', + 'wikipedia-zero-shot.ro', + 'wikipedia-zero-shot.ru', + 'wikipedia-zero-shot.si', + 'wikipedia-zero-shot.sk', + 'wikipedia-zero-shot.sl', + 'wikipedia-zero-shot.sr', + 'wikipedia-zero-shot.sv', + 'wikipedia-zero-shot.sw', + 'wikipedia-zero-shot.ta', + 'wikipedia-zero-shot.te', + 'wikipedia-zero-shot.th', + 'wikipedia-zero-shot.tr', + 'wikipedia-zero-shot.uk', + 'wikipedia-zero-shot.vi', + 'wikipedia-zero-shot.zh', + 'wikinews-zero-shot', + 'wikinews-zero-shot.ar', + 'wikinews-zero-shot.cs', + 'wikinews-zero-shot.de', + 'wikinews-zero-shot.en', + 'wikinews-zero-shot.es', + 'wikinews-zero-shot.fi', + 'wikinews-zero-shot.fr', + 'wikinews-zero-shot.it', + 'wikinews-zero-shot.ja', + 'wikinews-zero-shot.ko', + 'wikinews-zero-shot.nl', + 'wikinews-zero-shot.no', + 'wikinews-zero-shot.pl', + 'wikinews-zero-shot.pt', + 'wikinews-zero-shot.ru', + 'wikinews-zero-shot.sr', + 'wikinews-zero-shot.sv', + 'wikinews-zero-shot.ta', + 'wikinews-zero-shot.tr', + 'wikinews-zero-shot.uk', + 'wikinews-zero-shot.zh', + 'wikinews-cross-domain', + 'wikinews-cross-domain.ar', + 'wikinews-cross-domain.bg', + 'wikinews-cross-domain.ca', + 'wikinews-cross-domain.cs', + 'wikinews-cross-domain.de', + 'wikinews-cross-domain.el', + 'wikinews-cross-domain.en', + 'wikinews-cross-domain.es', + 'wikinews-cross-domain.fi', + 'wikinews-cross-domain.fr', + 
'wikinews-cross-domain.he', + 'wikinews-cross-domain.hu', + 'wikinews-cross-domain.it', + 'wikinews-cross-domain.ja', + 'wikinews-cross-domain.ko', + 'wikinews-cross-domain.nl', + 'wikinews-cross-domain.no', + 'wikinews-cross-domain.pl', + 'wikinews-cross-domain.pt', + 'wikinews-cross-domain.ro', + 'wikinews-cross-domain.ru', + 'wikinews-cross-domain.sr', + 'wikinews-cross-domain.sv', + 'wikinews-cross-domain.ta', + 'wikinews-cross-domain.tr', + 'wikinews-cross-domain.uk', + 'wikinews-cross-domain.zh' + ], + "ted_talks_iwslt": [ + 'eu_ca_2014', + 'eu_ca_2015', + 'eu_ca_2016', + 'nl_en_2014', + 'nl_en_2015', + 'nl_en_2016', + 'nl_hi_2014', + 'nl_hi_2015', + 'nl_hi_2016', + 'de_ja_2014', + 'de_ja_2015', + 'de_ja_2016', + 'fr-ca_hi_2014', + 'fr-ca_hi_2015', + 'fr-ca_hi_2016' + ] +} + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--dataset-names", + nargs="+", + default=None, + help="Print the stat of the dataset. If `None` it will print stat of all the used data." + ) + args = parser.parse_args() + stat_dict = {} + if args.dataset_names is None: + args.dataset_names = list(SERIES_A_DATASET_NAME_DICT.keys()) + for dataset_name, subset_names in SERIES_A_DATASET_NAME_DICT.items(): + if dataset_name not in args.dataset_names: + continue + stat_dict[dataset_name] = {} + if subset_names is None: + stat_dict[dataset_name]['Subset(None)'] = {} + dt = datasets.load_dataset(dataset_name, ignore_verifications=True) + for split in dt.keys(): + stat_dict[dataset_name]['Subset(None)'][split] = { + "size": len(dt[split]), + "column": list(dt[split].column_names), + } + else: + for subset in subset_names: + assert subset not in stat_dict[dataset_name] + stat_dict[dataset_name][subset] = {} + dt = datasets.load_dataset(dataset_name, name=subset, ignore_verifications=True) + for split in dt.keys(): + stat_dict[dataset_name][subset][split] = { + "size": len(dt[split]), + "column": list(dt[split].column_names), + } + + print(f"{json.dumps(stat_dict, indent=4)}") + +if __name__ == "__main__": + main() + diff --git a/scripts/data_stat.sh b/scripts/data_stat.sh new file mode 100644 index 0000000..4a22414 --- /dev/null +++ b/scripts/data_stat.sh @@ -0,0 +1,5 @@ +# to see all the stat +python data/data_stat.py + +# to see select dataset stat +python data/data_stat.py --dataset-names udhr \ No newline at end of file From 76a150b3a0fff564bf4be6eda347ffb008909412 Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Fri, 21 Apr 2023 02:36:34 +0800 Subject: [PATCH 03/34] download data from google sheet. --- data/check_prompts.py | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 data/check_prompts.py diff --git a/data/check_prompts.py b/data/check_prompts.py new file mode 100644 index 0000000..e8e4a00 --- /dev/null +++ b/data/check_prompts.py @@ -0,0 +1,40 @@ +import os +import csv +import argparse +import subprocess + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--form_path", + type=str, + default=None, + help="Path of the google sheet." + ) + parser.add_argument( + "--overwrite", + action="store_true", + help="Overwrite eexisting prompt file prompts.csv." + ) + parser.add_argument( + "--prompt-dir", + type=str, + default="data/", + help="Overwrite eexisting prompt file prompts.csv." + ) + args = parser.parse_args() + prompt_file_path = f"{args.prompt_dir}/prompts.csv" + if os.path.exists(prompt_file_path) and args.overwrite: # if file exists, it may be from prev. run/download. 
+ subprocess.check_output(f"mv {prompt_file_path} {prompt_file_path}.old", shell=True) + subprocess.check_output("curl -L https://docs.google.com/spreadsheets/d/10bCwOhM8zKNkqKi54gIvdwrR44YlWQFV9fpGm7acHv8/export?format=csv > ./data/prompts.csv", shell=True) + + + with open('data/prompts.csv', 'r') as csvfile: + csvreader = csv.reader(csvfile) + next(iter(csvreader)) + for row in csvreader: + print(row) + +if __name__ == "__main__": + main() \ No newline at end of file From 8c691a7f663fe4e34d5894a9b6064c91d0b352bb Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Fri, 21 Apr 2023 18:00:39 +0800 Subject: [PATCH 04/34] update official masakhane/masakhanews --- data/data_stat.py | 132 +++++++++++++++++++++++----------------------- 1 file changed, 66 insertions(+), 66 deletions(-) diff --git a/data/data_stat.py b/data/data_stat.py index 0bc2d8e..cc8b6ab 100644 --- a/data/data_stat.py +++ b/data/data_stat.py @@ -4,72 +4,72 @@ # huggingface dataset signature with configs SERIES_A_DATASET_NAME_DICT = { - "udhr": None, - "AmazonScience/mintaka": None, - "xcsr": [ - 'X-CSQA-en', - 'X-CSQA-zh', - 'X-CSQA-de', - 'X-CSQA-es', - 'X-CSQA-fr', - 'X-CSQA-it', - 'X-CSQA-jap', - 'X-CSQA-nl', - 'X-CSQA-pl', - 'X-CSQA-pt', - 'X-CSQA-ru', - 'X-CSQA-ar', - 'X-CSQA-vi', - 'X-CSQA-hi', - 'X-CSQA-sw', - 'X-CSQA-ur', - 'X-CODAH-en', - 'X-CODAH-zh', - 'X-CODAH-de', - 'X-CODAH-es', - 'X-CODAH-fr', - 'X-CODAH-it', - 'X-CODAH-jap', - 'X-CODAH-nl', - 'X-CODAH-pl', - 'X-CODAH-pt', - 'X-CODAH-ru', - 'X-CODAH-ar', - 'X-CODAH-vi', - 'X-CODAH-hi', - 'X-CODAH-sw', - 'X-CODAH-ur' - ], - "shmuhammad/AfriSenti-twitter-sentiment": [ - 'amh', - 'hau', - 'ibo', - 'arq', - 'ary', - 'yor', - 'por', - 'twi', - 'tso', - 'tir', - 'pcm', - 'kin', - 'swa' - ], # orm is not workin - "indonlp/NusaX-senti": [ - 'ace', - 'ban', - 'bjn', - 'bug', - 'eng', - 'ind', - 'jav', - 'mad', - 'min', - 'nij', - 'sun', - 'bbc' - ], - "sbmaruf/forai_ml-masakhane-news": [ + # "udhr": None, + # "AmazonScience/mintaka": None, + # "xcsr": [ + # 'X-CSQA-en', + # 'X-CSQA-zh', + # 'X-CSQA-de', + # 'X-CSQA-es', + # 'X-CSQA-fr', + # 'X-CSQA-it', + # 'X-CSQA-jap', + # 'X-CSQA-nl', + # 'X-CSQA-pl', + # 'X-CSQA-pt', + # 'X-CSQA-ru', + # 'X-CSQA-ar', + # 'X-CSQA-vi', + # 'X-CSQA-hi', + # 'X-CSQA-sw', + # 'X-CSQA-ur', + # 'X-CODAH-en', + # 'X-CODAH-zh', + # 'X-CODAH-de', + # 'X-CODAH-es', + # 'X-CODAH-fr', + # 'X-CODAH-it', + # 'X-CODAH-jap', + # 'X-CODAH-nl', + # 'X-CODAH-pl', + # 'X-CODAH-pt', + # 'X-CODAH-ru', + # 'X-CODAH-ar', + # 'X-CODAH-vi', + # 'X-CODAH-hi', + # 'X-CODAH-sw', + # 'X-CODAH-ur' + # ], + # "shmuhammad/AfriSenti-twitter-sentiment": [ + # 'amh', + # 'hau', + # 'ibo', + # 'arq', + # 'ary', + # 'yor', + # 'por', + # 'twi', + # 'tso', + # 'tir', + # 'pcm', + # 'kin', + # 'swa' + # ], # orm is not workin + # "indonlp/NusaX-senti": [ + # 'ace', + # 'ban', + # 'bjn', + # 'bug', + # 'eng', + # 'ind', + # 'jav', + # 'mad', + # 'min', + # 'nij', + # 'sun', + # 'bbc' + # ], + "masakhane/masakhanews": [ 'amh', 'eng', 'fra', From 76a50d40daefa64fb7616cfa575b9498d83dfd68 Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Sun, 23 Apr 2023 02:48:43 +0800 Subject: [PATCH 05/34] update jinja prompt loader --- data/check_prompts.py | 20 +++++++++++++++++++- scripts/check_prompt.sh | 0 2 files changed, 19 insertions(+), 1 deletion(-) create mode 100644 scripts/check_prompt.sh diff --git a/data/check_prompts.py b/data/check_prompts.py index e8e4a00..ed4b9bb 100644 --- a/data/check_prompts.py +++ b/data/check_prompts.py @@ -1,8 +1,27 @@ import os import csv +import 
json import argparse import subprocess +from promptsource.templates import Template +from .data_stat import SERIES_A_DATASET_NAME_DICT +def check( + json_example, + template_name, + jinja_template, + template_reference=None, + answer_choices=None +): + json_example = json.loads(json_example) + template = Template( + template_name, + jinja_template, + template_reference, + answer_choices=answer_choices + ) + lm_io = template.apply(json_example, highlight_variables=False) + return lm_io def main(): parser = argparse.ArgumentParser() @@ -29,7 +48,6 @@ def main(): subprocess.check_output(f"mv {prompt_file_path} {prompt_file_path}.old", shell=True) subprocess.check_output("curl -L https://docs.google.com/spreadsheets/d/10bCwOhM8zKNkqKi54gIvdwrR44YlWQFV9fpGm7acHv8/export?format=csv > ./data/prompts.csv", shell=True) - with open('data/prompts.csv', 'r') as csvfile: csvreader = csv.reader(csvfile) next(iter(csvreader)) diff --git a/scripts/check_prompt.sh b/scripts/check_prompt.sh new file mode 100644 index 0000000..e69de29 From 2f1f316cdd841da79f0ef115b12f9827b29c0957 Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Sun, 23 Apr 2023 02:51:49 +0800 Subject: [PATCH 06/34] update data source --- data/data_stat.py | 154 ++++++++++++++++++++++++++-------------------- 1 file changed, 88 insertions(+), 66 deletions(-) diff --git a/data/data_stat.py b/data/data_stat.py index cc8b6ab..68c8479 100644 --- a/data/data_stat.py +++ b/data/data_stat.py @@ -4,71 +4,71 @@ # huggingface dataset signature with configs SERIES_A_DATASET_NAME_DICT = { - # "udhr": None, - # "AmazonScience/mintaka": None, - # "xcsr": [ - # 'X-CSQA-en', - # 'X-CSQA-zh', - # 'X-CSQA-de', - # 'X-CSQA-es', - # 'X-CSQA-fr', - # 'X-CSQA-it', - # 'X-CSQA-jap', - # 'X-CSQA-nl', - # 'X-CSQA-pl', - # 'X-CSQA-pt', - # 'X-CSQA-ru', - # 'X-CSQA-ar', - # 'X-CSQA-vi', - # 'X-CSQA-hi', - # 'X-CSQA-sw', - # 'X-CSQA-ur', - # 'X-CODAH-en', - # 'X-CODAH-zh', - # 'X-CODAH-de', - # 'X-CODAH-es', - # 'X-CODAH-fr', - # 'X-CODAH-it', - # 'X-CODAH-jap', - # 'X-CODAH-nl', - # 'X-CODAH-pl', - # 'X-CODAH-pt', - # 'X-CODAH-ru', - # 'X-CODAH-ar', - # 'X-CODAH-vi', - # 'X-CODAH-hi', - # 'X-CODAH-sw', - # 'X-CODAH-ur' - # ], - # "shmuhammad/AfriSenti-twitter-sentiment": [ - # 'amh', - # 'hau', - # 'ibo', - # 'arq', - # 'ary', - # 'yor', - # 'por', - # 'twi', - # 'tso', - # 'tir', - # 'pcm', - # 'kin', - # 'swa' - # ], # orm is not workin - # "indonlp/NusaX-senti": [ - # 'ace', - # 'ban', - # 'bjn', - # 'bug', - # 'eng', - # 'ind', - # 'jav', - # 'mad', - # 'min', - # 'nij', - # 'sun', - # 'bbc' - # ], + "udhr": None, + "AmazonScience/mintaka": None, + "xcsr": [ + 'X-CSQA-en', + 'X-CSQA-zh', + 'X-CSQA-de', + 'X-CSQA-es', + 'X-CSQA-fr', + 'X-CSQA-it', + 'X-CSQA-jap', + 'X-CSQA-nl', + 'X-CSQA-pl', + 'X-CSQA-pt', + 'X-CSQA-ru', + 'X-CSQA-ar', + 'X-CSQA-vi', + 'X-CSQA-hi', + 'X-CSQA-sw', + 'X-CSQA-ur', + 'X-CODAH-en', + 'X-CODAH-zh', + 'X-CODAH-de', + 'X-CODAH-es', + 'X-CODAH-fr', + 'X-CODAH-it', + 'X-CODAH-jap', + 'X-CODAH-nl', + 'X-CODAH-pl', + 'X-CODAH-pt', + 'X-CODAH-ru', + 'X-CODAH-ar', + 'X-CODAH-vi', + 'X-CODAH-hi', + 'X-CODAH-sw', + 'X-CODAH-ur' + ], + "shmuhammad/AfriSenti-twitter-sentiment": [ + 'amh', + 'hau', + 'ibo', + 'arq', + 'ary', + 'yor', + 'por', + 'twi', + 'tso', + 'tir', + 'pcm', + 'kin', + 'swa' + ], # orm is not workin + "indonlp/NusaX-senti": [ + 'ace', + 'ban', + 'bjn', + 'bug', + 'eng', + 'ind', + 'jav', + 'mad', + 'min', + 'nij', + 'sun', + 'bbc' + ], "masakhane/masakhanews": [ 'amh', 'eng', @@ -281,7 +281,7 @@ 'wikinews-cross-domain.uk', 
'wikinews-cross-domain.zh' ], - "ted_talks_iwslt": [ + "sbmaruf/forai_ml-ted_talk_iwslt": [ 'eu_ca_2014', 'eu_ca_2015', 'eu_ca_2016', @@ -297,6 +297,28 @@ 'fr-ca_hi_2014', 'fr-ca_hi_2015', 'fr-ca_hi_2016' + ], + "sbmaruf/forai_ml_masakhane_mafand":[ + 'en-amh', + 'en-hau', + 'en-ibo', + 'en-kin', + 'en-lug', + 'en-nya', + 'en-pcm', + 'en-sna', + 'en-swa', + 'en-tsn', + 'en-twi', + 'en-xho', + 'en-yor', + 'en-zul', + 'fr-bam', + 'fr-bbj', + 'fr-ewe', + 'fr-fon', + 'fr-mos', + 'fr-wol' ] } From bbcb48704d7fa28beefe93047d0885deaf9eaf9e Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Sun, 23 Apr 2023 04:39:54 +0800 Subject: [PATCH 07/34] sanity check of scsqa structure --- data/data_stat.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/data/data_stat.py b/data/data_stat.py index 68c8479..f8905bb 100644 --- a/data/data_stat.py +++ b/data/data_stat.py @@ -356,6 +356,11 @@ def main(): "size": len(dt[split]), "column": list(dt[split].column_names), } + if "X-CSQA" in subset: + for sample in dt[split]: + assert len(sample['question']['choices']['label']) == 5 + + print(f"{json.dumps(stat_dict, indent=4)}") From 3f01f470f4be42da007f8422ae6b7af03bca9e77 Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Sun, 30 Apr 2023 20:06:06 +0800 Subject: [PATCH 08/34] adding more datasets and output formatting --- data/data_stat.py | 73 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 70 insertions(+), 3 deletions(-) diff --git a/data/data_stat.py b/data/data_stat.py index f8905bb..fcb5ae3 100644 --- a/data/data_stat.py +++ b/data/data_stat.py @@ -1,3 +1,5 @@ +import os +import csv import json import datasets import argparse @@ -319,6 +321,48 @@ 'fr-fon', 'fr-mos', 'fr-wol' + ], + "exams":[ + 'alignments', + 'multilingual', + 'multilingual_with_para', + 'crosslingual_test', + 'crosslingual_with_para_test', + 'crosslingual_bg', + 'crosslingual_with_para_bg', + 'crosslingual_hr', + 'crosslingual_with_para_hr', + 'crosslingual_hu', + 'crosslingual_with_para_hu', + 'crosslingual_it', + 'crosslingual_with_para_it', + 'crosslingual_mk', + 'crosslingual_with_para_mk', + 'crosslingual_pl', + 'crosslingual_with_para_pl', + 'crosslingual_pt', + 'crosslingual_with_para_pt', + 'crosslingual_sq', + 'crosslingual_with_para_sq', + 'crosslingual_sr', + 'crosslingual_with_para_sr', + 'crosslingual_tr', + 'crosslingual_with_para_tr', + 'crosslingual_vi', + 'crosslingual_with_para_vi' + ], + "allenai/soda": None, + "arabic_billion_words":[ + 'Alittihad', + 'Almasryalyoum', + 'Almustaqbal', + 'Alqabas', + 'Echoroukonline', + 'Ryiadh', + 'Sabanews', + 'SaudiYoum', + 'Techreen', + 'Youm7' ] } @@ -330,6 +374,17 @@ def main(): default=None, help="Print the stat of the dataset. If `None` it will print stat of all the used data." ) + parser.add_argument( + "--export-format", + choices=['json', "csv"], + default=".json", + help="Which format you want to export." + ) + parser.add_argument( + "--output-dir", + default=None, + help="The path to the folder where stat will be saved." 
+ ) args = parser.parse_args() stat_dict = {} if args.dataset_names is None: @@ -360,9 +415,21 @@ def main(): for sample in dt[split]: assert len(sample['question']['choices']['label']) == 5 - - - print(f"{json.dumps(stat_dict, indent=4)}") + if args.output_dir != 'None': + file_name = os.path.join(args.output_dir, "stat") + f".{args.export_format}" + if args.export_format == "json": + with open(file_name, "w") as file_ptr: + file_ptr.write(f"{json.dumps(stat_dict, indent=4)}\n") + elif args.export_format == "csv": + # with open(file_name, mode='w') as file_ptr: + # writer = csv.writer(file_ptr) + # for dataset_name, subset_name, in SERIES_A_DATASET_NAME_DICT.keys(): + # row = [f"{dataset_name}"] + + # writer.writerow(stat_dict.values()) + pass + else: + raise NotImplementedError if __name__ == "__main__": main() From bf8f2c064c44d9d012bdf4d36a833727d0701d3a Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Sun, 30 Apr 2023 20:13:10 +0800 Subject: [PATCH 09/34] refactoring --- data/data_stat.py | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/data/data_stat.py b/data/data_stat.py index fcb5ae3..3ea52cb 100644 --- a/data/data_stat.py +++ b/data/data_stat.py @@ -392,28 +392,22 @@ def main(): for dataset_name, subset_names in SERIES_A_DATASET_NAME_DICT.items(): if dataset_name not in args.dataset_names: continue + assert dataset_name not in stat_dict stat_dict[dataset_name] = {} - if subset_names is None: - stat_dict[dataset_name]['Subset(None)'] = {} - dt = datasets.load_dataset(dataset_name, ignore_verifications=True) + subset_names = [None] if subset_names is None else subset_names + for subset in subset_names: + assert subset not in stat_dict[dataset_name] + stat_dict[dataset_name][subset] = {} + dt = datasets.load_dataset(dataset_name, name=subset, ignore_verifications=True) for split in dt.keys(): - stat_dict[dataset_name]['Subset(None)'][split] = { + stat_dict[dataset_name][subset][split] = { "size": len(dt[split]), "column": list(dt[split].column_names), } - else: - for subset in subset_names: - assert subset not in stat_dict[dataset_name] - stat_dict[dataset_name][subset] = {} - dt = datasets.load_dataset(dataset_name, name=subset, ignore_verifications=True) - for split in dt.keys(): - stat_dict[dataset_name][subset][split] = { - "size": len(dt[split]), - "column": list(dt[split].column_names), - } - if "X-CSQA" in subset: - for sample in dt[split]: - assert len(sample['question']['choices']['label']) == 5 + # re-valuation of hypothesis considered in prompt template + if subset is not None and "X-CSQA" in subset: + for sample in dt[split]: + assert len(sample['question']['choices']['label']) == 5 if args.output_dir != 'None': file_name = os.path.join(args.output_dir, "stat") + f".{args.export_format}" From e6488ddb7426a361b8ebe04dc05c781ef3a3a166 Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Sun, 30 Apr 2023 20:26:56 +0800 Subject: [PATCH 10/34] doc string --- data/check_prompts.py | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/data/check_prompts.py b/data/check_prompts.py index ed4b9bb..d3b2f93 100644 --- a/data/check_prompts.py +++ b/data/check_prompts.py @@ -3,16 +3,33 @@ import json import argparse import subprocess +from typing import Tuple, Optional from promptsource.templates import Template from .data_stat import SERIES_A_DATASET_NAME_DICT def check( - json_example, - template_name, - jinja_template, - template_reference=None, - answer_choices=None -): + json_example: str, 
+ template_name: str, + jinja_template: str, + template_reference: Optional[str] = None, + answer_choices: Optional[str] = None +)-> Tuple[str, str]: + """ + Given a + Args: + json_example (str): a string contains json object. The json object is loaded + by `json.loads()`. Typically this is a sample from + huggingface dataset converted to a string by a `json.dumps()`. + template_name: unique name (per dataset) for template + jinja_template: template expressed in Jinja + template_reference: string describing author or paper reference for template + answer_choices: Jinja expression for answer choices. Should produce + a ||| delimited string of choices that enumerates + the possible completions for templates that should + be evaluated as ranked completions. If None, then + the template is open-ended. This list is accessible + from within Jinja as the variable `answer_choices`. + """ json_example = json.loads(json_example) template = Template( template_name, From 7b9f1eb7954a0cba0ac0836e6ff6907b78c1b7c4 Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Sun, 30 Apr 2023 20:37:48 +0800 Subject: [PATCH 11/34] add metadata --- data/check_prompts.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/data/check_prompts.py b/data/check_prompts.py index d3b2f93..421c0a1 100644 --- a/data/check_prompts.py +++ b/data/check_prompts.py @@ -4,7 +4,7 @@ import argparse import subprocess from typing import Tuple, Optional -from promptsource.templates import Template +from promptsource.templates import Template, Metadata from .data_stat import SERIES_A_DATASET_NAME_DICT def check( @@ -12,10 +12,12 @@ def check( template_name: str, jinja_template: str, template_reference: Optional[str] = None, + metadata: Optional[Metadata] = None, answer_choices: Optional[str] = None )-> Tuple[str, str]: """ - Given a + Given an example (`json_example`) from a huggingface dataset and prompt template (`jinja_template`), + the objective is to check if we can project the example in language model i/o format. Args: json_example (str): a string contains json object. The json object is loaded by `json.loads()`. Typically this is a sample from @@ -23,6 +25,8 @@ def check( template_name: unique name (per dataset) for template jinja_template: template expressed in Jinja template_reference: string describing author or paper reference for template + metadata: A Metadata object with template annotations. + Follow [here](https://github.com/bigscience-workshop/promptsource/blob/main/promptsource/templates.py#L417) for more details. answer_choices: Jinja expression for answer choices. 
Should produce a ||| delimited string of choices that enumerates the possible completions for templates that should @@ -35,6 +39,7 @@ def check( template_name, jinja_template, template_reference, + metadata=metadata, answer_choices=answer_choices ) lm_io = template.apply(json_example, highlight_variables=False) From 48244c0613f053d3db99dc686f9974dcf4a86afa Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Wed, 10 May 2023 07:10:24 +0800 Subject: [PATCH 12/34] prompt checker pipeline --- data/check_prompts.py | 101 ++++++++++++++++++++++++++++++++++------ scripts/check_prompt.sh | 3 ++ 2 files changed, 90 insertions(+), 14 deletions(-) diff --git a/data/check_prompts.py b/data/check_prompts.py index 421c0a1..d611b29 100644 --- a/data/check_prompts.py +++ b/data/check_prompts.py @@ -1,18 +1,27 @@ import os import csv +import copy import json import argparse +import datasets import subprocess -from typing import Tuple, Optional -from promptsource.templates import Template, Metadata +from typing import Tuple, Optional, List +from promptsource.templates import Template, LANGUAGES from .data_stat import SERIES_A_DATASET_NAME_DICT +dataset_mapper = { + "AfriSenti-twitter-sentiment https://huggingface.co/datasets/shmuhammad/AfriSenti-twitter-sentiment": "shmuhammad/AfriSenti-twitter-sentiment" +} + def check( json_example: str, template_name: str, jinja_template: str, template_reference: Optional[str] = None, - metadata: Optional[Metadata] = None, + original_task: Optional[str] = None, + choices_in_prompt: Optional[bool] = None, + metrics: Optional[List[str]] = None, + languages: Optional[List[str]] = None, answer_choices: Optional[str] = None )-> Tuple[str, str]: """ @@ -25,8 +34,12 @@ def check( template_name: unique name (per dataset) for template jinja_template: template expressed in Jinja template_reference: string describing author or paper reference for template - metadata: A Metadata object with template annotations. - Follow [here](https://github.com/bigscience-workshop/promptsource/blob/main/promptsource/templates.py#L417) for more details. + original_task: If True, this prompt asks a model to perform the original task designed for + this dataset. + choices_in_prompt: If True, the answer choices are included in the templates such that models + see those choices in the input. Only applicable to classification tasks. + metrics: List of strings denoting metrics to use for evaluation + languages: List of strings denoting languages used in the prompt (not the associated dataset!) answer_choices: Jinja expression for answer choices. Should produce a ||| delimited string of choices that enumerates the possible completions for templates that should @@ -35,6 +48,12 @@ def check( from within Jinja as the variable `answer_choices`. 
""" json_example = json.loads(json_example) + metadata = Template.Metadata( + original_task, + choices_in_prompt, + metrics, + languages + ) template = Template( template_name, jinja_template, @@ -45,12 +64,56 @@ def check( lm_io = template.apply(json_example, highlight_variables=False) return lm_io +def validate(prompt_template_data): + print(json.dumps(prompt_template_data, indent=4)) + dataset_info = prompt_template_data['What dataset do you pick?'] + dataset_signature = dataset_mapper[dataset_info] + dataset_subsets = SERIES_A_DATASET_NAME_DICT[dataset_signature] + for dataset_subset in dataset_subsets: + dataset = datasets.load_dataset(dataset_signature, dataset_subset) + splits = dataset.keys() + for split in splits: + data = dataset[split] + model_input = prompt_template_data['Input to the model'] + model_exp_output = prompt_template_data['Model\'s expected output'] + for sample in data: + lm_io = check( + json_example = json.dumps(sample), + template_name = prompt_template_data['Name'], + jinja_template = f"{model_input} ||| {model_exp_output}", + template_reference = prompt_template_data['Discord username'], + ) + if len(lm_io) == 2: + print(f"Validating dataset_signature:dataset_subset:split={dataset_signature}:{dataset_subset}:{split} with prompt template... [DONE]") + else: + print(f"Validating dataset_signature:dataset_subset:split={dataset_signature}:{dataset_subset}:{split} with prompt template... [FAILED]") + raise ValueError("Templating Error.") + break + print(dataset_signature, dataset_subsets) + +def parse(prompt_file_path, validate_rows): + print(validate_rows) + _prmompt_dict, dt_structure, idx_to_header = {}, {}, {} + with open(prompt_file_path, 'r') as csvfile: + csvreader = csv.reader(csvfile) + for row_idx, row in enumerate(csvreader): + if row_idx == 0: + for idx, dt in enumerate(row): + dt_structure[dt] = {} + idx_to_header[idx] = dt + if row_idx+1 in validate_rows: + sample = copy.deepcopy(dt_structure) + for idx, dt in enumerate(row): + sample[idx_to_header[idx]] = dt + _prmompt_dict[ row_idx+1 ] = sample + return _prmompt_dict + def main(): parser = argparse.ArgumentParser() parser.add_argument( "--form_path", type=str, - default=None, + default="https://docs.google.com/spreadsheets/d/10bCwOhM8zKNkqKi54gIvdwrR44YlWQFV9fpGm7acHv8/export?format=csv&id=10bCwOhM8zKNkqKi54gIvdwrR44YlWQFV9fpGm7acHv8&gid=726399306", help="Path of the google sheet." ) parser.add_argument( @@ -62,19 +125,29 @@ def main(): "--prompt-dir", type=str, default="data/", + help="Overwrite existing prompt file prompts.csv." + ) + parser.add_argument( + "--validate-rows", + nargs='*', + default=[3], + type=int, help="Overwrite eexisting prompt file prompts.csv." ) args = parser.parse_args() prompt_file_path = f"{args.prompt_dir}/prompts.csv" if os.path.exists(prompt_file_path) and args.overwrite: # if file exists, it may be from prev. run/download. 
subprocess.check_output(f"mv {prompt_file_path} {prompt_file_path}.old", shell=True) - subprocess.check_output("curl -L https://docs.google.com/spreadsheets/d/10bCwOhM8zKNkqKi54gIvdwrR44YlWQFV9fpGm7acHv8/export?format=csv > ./data/prompts.csv", shell=True) - - with open('data/prompts.csv', 'r') as csvfile: - csvreader = csv.reader(csvfile) - next(iter(csvreader)) - for row in csvreader: - print(row) - + if not os.path.exists(prompt_file_path): + cmd = f"curl -L '{args.form_path}' -o {prompt_file_path}" + subprocess.check_output(cmd, shell=True) + + prompt_dict = parse(prompt_file_path, args.validate_rows) + for row_id, prompt_template_data in prompt_dict.items(): + print(f"Validating row {row_id} ...") + validate(prompt_template_data) + + + if __name__ == "__main__": main() \ No newline at end of file diff --git a/scripts/check_prompt.sh b/scripts/check_prompt.sh index e69de29..729dc5d 100644 --- a/scripts/check_prompt.sh +++ b/scripts/check_prompt.sh @@ -0,0 +1,3 @@ +python3 -m data.check_prompts \ +--prompt-dir data \ +--validate-rows 3 \ No newline at end of file From 0f159f90375d023d358d97bf61842745dbf0f152 Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Wed, 10 May 2023 07:11:25 +0800 Subject: [PATCH 13/34] type --- data/check_prompts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/check_prompts.py b/data/check_prompts.py index d611b29..dc33922 100644 --- a/data/check_prompts.py +++ b/data/check_prompts.py @@ -132,7 +132,7 @@ def main(): nargs='*', default=[3], type=int, - help="Overwrite eexisting prompt file prompts.csv." + help="Overwrite existing prompt file prompts.csv." ) args = parser.parse_args() prompt_file_path = f"{args.prompt_dir}/prompts.csv" From 2ef6ba132cbe53a620d86741ffc40109f6ddf71c Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Wed, 10 May 2023 07:15:18 +0800 Subject: [PATCH 14/34] code formatting & doc string added --- data/check_prompts.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/data/check_prompts.py b/data/check_prompts.py index dc33922..cf65f53 100644 --- a/data/check_prompts.py +++ b/data/check_prompts.py @@ -64,7 +64,11 @@ def check( lm_io = template.apply(json_example, highlight_variables=False) return lm_io + def validate(prompt_template_data): + """ + Validate a prompt template + """ print(json.dumps(prompt_template_data, indent=4)) dataset_info = prompt_template_data['What dataset do you pick?'] dataset_signature = dataset_mapper[dataset_info] @@ -89,10 +93,12 @@ def validate(prompt_template_data): print(f"Validating dataset_signature:dataset_subset:split={dataset_signature}:{dataset_subset}:{split} with prompt template... [FAILED]") raise ValueError("Templating Error.") break - print(dataset_signature, dataset_subsets) + def parse(prompt_file_path, validate_rows): - print(validate_rows) + """ + Parse list of rows menntioned in validate_rows. 
+ """ _prmompt_dict, dt_structure, idx_to_header = {}, {}, {} with open(prompt_file_path, 'r') as csvfile: csvreader = csv.reader(csvfile) @@ -101,13 +107,14 @@ def parse(prompt_file_path, validate_rows): for idx, dt in enumerate(row): dt_structure[dt] = {} idx_to_header[idx] = dt - if row_idx+1 in validate_rows: + if row_idx+1 in validate_rows: # 1 based indexing sample = copy.deepcopy(dt_structure) for idx, dt in enumerate(row): sample[idx_to_header[idx]] = dt _prmompt_dict[ row_idx+1 ] = sample return _prmompt_dict + def main(): parser = argparse.ArgumentParser() parser.add_argument( @@ -132,7 +139,7 @@ def main(): nargs='*', default=[3], type=int, - help="Overwrite existing prompt file prompts.csv." + help="List of row indices (1-based indexing ). The row mentioned here will indicate the row of `--form_path` spreadsheet." ) args = parser.parse_args() prompt_file_path = f"{args.prompt_dir}/prompts.csv" From a75519a79b9ed8f64b73539902cd955f26b65aa4 Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Mon, 22 May 2023 23:59:15 +0800 Subject: [PATCH 15/34] Add all dataset info --- data/check_prompts.py | 77 +++++++++++++++++++++++++++---------------- 1 file changed, 48 insertions(+), 29 deletions(-) diff --git a/data/check_prompts.py b/data/check_prompts.py index cf65f53..8b22502 100644 --- a/data/check_prompts.py +++ b/data/check_prompts.py @@ -10,7 +10,20 @@ from .data_stat import SERIES_A_DATASET_NAME_DICT dataset_mapper = { - "AfriSenti-twitter-sentiment https://huggingface.co/datasets/shmuhammad/AfriSenti-twitter-sentiment": "shmuhammad/AfriSenti-twitter-sentiment" + "AfriSenti-twitter-sentiment https://huggingface.co/datasets/shmuhammad/AfriSenti-twitter-sentiment": "shmuhammad/AfriSenti-twitter-sentiment", + "Joke-explanation https://huggingface.co/datasets/theblackcat102/joke_explaination": "theblackcat102/joke_explaination", + "Language Identification https://huggingface.co/datasets/papluca/language-identification": "papluca/language-identification", + "Mafand - a machine translation task https://huggingface.co/datasets/masakhane/mafand": "sbmaruf/forai_ml_masakhane_mafand", + "Masakhanews https://github.com/masakhane-io/masakhane-news": "masakhane/masakhanews", + "Mintaka https://huggingface.co/datasets/AmazonScience/mintaka":"AmazonScience/mintaka", + "NarrativeQA https://huggingface.co/datasets/narrativeqa": "narrativeqa", + "NusaX - sentiment classification https://huggingface.co/datasets/indonlp/NusaX-senti": "indonlp/NusaX-senti", + "qrecc https://huggingface.co/datasets/svakulenk0/qrecc": "svakulenk0/qrecc", + "SODA https://huggingface.co/datasets/allenai/soda": "allenai/soda", + "TED https://huggingface.co/datasets/ted_talks_iwslt": "sbmaruf/forai_ml-ted_talk_iwslt", + "WikiCatSum https://huggingface.co/datasets/GEM/wiki_cat_sum": "GEM/wiki_cat_sum", + "X-CSQA https://huggingface.co/datasets/xcsr": "xcsr", + "xlel_wd https://huggingface.co/datasets/adithya7/xlel_wd": "adithya7/xlel_wd" } def check( @@ -65,36 +78,42 @@ def check( return lm_io -def validate(prompt_template_data): +def validate(prompt_template_data, row_id): """ Validate a prompt template """ - print(json.dumps(prompt_template_data, indent=4)) - dataset_info = prompt_template_data['What dataset do you pick?'] - dataset_signature = dataset_mapper[dataset_info] - dataset_subsets = SERIES_A_DATASET_NAME_DICT[dataset_signature] - for dataset_subset in dataset_subsets: - dataset = datasets.load_dataset(dataset_signature, dataset_subset) - splits = dataset.keys() - for split in splits: - data = dataset[split] - 
model_input = prompt_template_data['Input to the model'] - model_exp_output = prompt_template_data['Model\'s expected output'] - for sample in data: - lm_io = check( - json_example = json.dumps(sample), - template_name = prompt_template_data['Name'], - jinja_template = f"{model_input} ||| {model_exp_output}", - template_reference = prompt_template_data['Discord username'], - ) - if len(lm_io) == 2: - print(f"Validating dataset_signature:dataset_subset:split={dataset_signature}:{dataset_subset}:{split} with prompt template... [DONE]") - else: - print(f"Validating dataset_signature:dataset_subset:split={dataset_signature}:{dataset_subset}:{split} with prompt template... [FAILED]") - raise ValueError("Templating Error.") - break - - + try: + print(json.dumps(prompt_template_data, indent=4)) + dataset_info = prompt_template_data['What dataset do you pick?'] + if dataset_info not in dataset_mapper: + dataset_signature = dataset_info.split()[0].lower() + else: + dataset_signature = dataset_mapper[dataset_info] + dataset_subsets = SERIES_A_DATASET_NAME_DICT[dataset_signature] + for dataset_subset in dataset_subsets: + dataset = datasets.load_dataset(dataset_signature, dataset_subset) + splits = dataset.keys() + for split in splits: + data = dataset[split] + model_input = prompt_template_data['Input to the model'] + model_exp_output = prompt_template_data['Model\'s expected output'] + for sample in data: + lm_io = check( + json_example = json.dumps(sample), + template_name = prompt_template_data['Name'], + jinja_template = f"{model_input} ||| {model_exp_output}", + template_reference = prompt_template_data['Discord username'], + ) + if len(lm_io) == 2: + print(f"Validating dataset_signature:dataset_subset:split={dataset_signature}:{dataset_subset}:{split} with prompt template... [DONE]") + else: + print(f"Validating dataset_signature:dataset_subset:split={dataset_signature}:{dataset_subset}:{split} with prompt template... [FAILED]") + raise ValueError("Templating Error.") + break + except: + print(f"Error in row {row_id}") + raise + def parse(prompt_file_path, validate_rows): """ Parse list of rows menntioned in validate_rows. 
@@ -152,7 +171,7 @@ def main(): prompt_dict = parse(prompt_file_path, args.validate_rows) for row_id, prompt_template_data in prompt_dict.items(): print(f"Validating row {row_id} ...") - validate(prompt_template_data) + validate(prompt_template_data, row_id) From d778277aaf0b69225deaba4c8bd2bb74d86bf970 Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Fri, 26 May 2023 00:20:05 +0800 Subject: [PATCH 16/34] update naming --- data/check_prompts.py | 179 ---------------------------------------- scripts/check_prompt.sh | 3 - 2 files changed, 182 deletions(-) delete mode 100644 data/check_prompts.py delete mode 100644 scripts/check_prompt.sh diff --git a/data/check_prompts.py b/data/check_prompts.py deleted file mode 100644 index 8b22502..0000000 --- a/data/check_prompts.py +++ /dev/null @@ -1,179 +0,0 @@ -import os -import csv -import copy -import json -import argparse -import datasets -import subprocess -from typing import Tuple, Optional, List -from promptsource.templates import Template, LANGUAGES -from .data_stat import SERIES_A_DATASET_NAME_DICT - -dataset_mapper = { - "AfriSenti-twitter-sentiment https://huggingface.co/datasets/shmuhammad/AfriSenti-twitter-sentiment": "shmuhammad/AfriSenti-twitter-sentiment", - "Joke-explanation https://huggingface.co/datasets/theblackcat102/joke_explaination": "theblackcat102/joke_explaination", - "Language Identification https://huggingface.co/datasets/papluca/language-identification": "papluca/language-identification", - "Mafand - a machine translation task https://huggingface.co/datasets/masakhane/mafand": "sbmaruf/forai_ml_masakhane_mafand", - "Masakhanews https://github.com/masakhane-io/masakhane-news": "masakhane/masakhanews", - "Mintaka https://huggingface.co/datasets/AmazonScience/mintaka":"AmazonScience/mintaka", - "NarrativeQA https://huggingface.co/datasets/narrativeqa": "narrativeqa", - "NusaX - sentiment classification https://huggingface.co/datasets/indonlp/NusaX-senti": "indonlp/NusaX-senti", - "qrecc https://huggingface.co/datasets/svakulenk0/qrecc": "svakulenk0/qrecc", - "SODA https://huggingface.co/datasets/allenai/soda": "allenai/soda", - "TED https://huggingface.co/datasets/ted_talks_iwslt": "sbmaruf/forai_ml-ted_talk_iwslt", - "WikiCatSum https://huggingface.co/datasets/GEM/wiki_cat_sum": "GEM/wiki_cat_sum", - "X-CSQA https://huggingface.co/datasets/xcsr": "xcsr", - "xlel_wd https://huggingface.co/datasets/adithya7/xlel_wd": "adithya7/xlel_wd" -} - -def check( - json_example: str, - template_name: str, - jinja_template: str, - template_reference: Optional[str] = None, - original_task: Optional[str] = None, - choices_in_prompt: Optional[bool] = None, - metrics: Optional[List[str]] = None, - languages: Optional[List[str]] = None, - answer_choices: Optional[str] = None -)-> Tuple[str, str]: - """ - Given an example (`json_example`) from a huggingface dataset and prompt template (`jinja_template`), - the objective is to check if we can project the example in language model i/o format. - Args: - json_example (str): a string contains json object. The json object is loaded - by `json.loads()`. Typically this is a sample from - huggingface dataset converted to a string by a `json.dumps()`. - template_name: unique name (per dataset) for template - jinja_template: template expressed in Jinja - template_reference: string describing author or paper reference for template - original_task: If True, this prompt asks a model to perform the original task designed for - this dataset. 
- choices_in_prompt: If True, the answer choices are included in the templates such that models - see those choices in the input. Only applicable to classification tasks. - metrics: List of strings denoting metrics to use for evaluation - languages: List of strings denoting languages used in the prompt (not the associated dataset!) - answer_choices: Jinja expression for answer choices. Should produce - a ||| delimited string of choices that enumerates - the possible completions for templates that should - be evaluated as ranked completions. If None, then - the template is open-ended. This list is accessible - from within Jinja as the variable `answer_choices`. - """ - json_example = json.loads(json_example) - metadata = Template.Metadata( - original_task, - choices_in_prompt, - metrics, - languages - ) - template = Template( - template_name, - jinja_template, - template_reference, - metadata=metadata, - answer_choices=answer_choices - ) - lm_io = template.apply(json_example, highlight_variables=False) - return lm_io - - -def validate(prompt_template_data, row_id): - """ - Validate a prompt template - """ - try: - print(json.dumps(prompt_template_data, indent=4)) - dataset_info = prompt_template_data['What dataset do you pick?'] - if dataset_info not in dataset_mapper: - dataset_signature = dataset_info.split()[0].lower() - else: - dataset_signature = dataset_mapper[dataset_info] - dataset_subsets = SERIES_A_DATASET_NAME_DICT[dataset_signature] - for dataset_subset in dataset_subsets: - dataset = datasets.load_dataset(dataset_signature, dataset_subset) - splits = dataset.keys() - for split in splits: - data = dataset[split] - model_input = prompt_template_data['Input to the model'] - model_exp_output = prompt_template_data['Model\'s expected output'] - for sample in data: - lm_io = check( - json_example = json.dumps(sample), - template_name = prompt_template_data['Name'], - jinja_template = f"{model_input} ||| {model_exp_output}", - template_reference = prompt_template_data['Discord username'], - ) - if len(lm_io) == 2: - print(f"Validating dataset_signature:dataset_subset:split={dataset_signature}:{dataset_subset}:{split} with prompt template... [DONE]") - else: - print(f"Validating dataset_signature:dataset_subset:split={dataset_signature}:{dataset_subset}:{split} with prompt template... [FAILED]") - raise ValueError("Templating Error.") - break - except: - print(f"Error in row {row_id}") - raise - -def parse(prompt_file_path, validate_rows): - """ - Parse list of rows menntioned in validate_rows. - """ - _prmompt_dict, dt_structure, idx_to_header = {}, {}, {} - with open(prompt_file_path, 'r') as csvfile: - csvreader = csv.reader(csvfile) - for row_idx, row in enumerate(csvreader): - if row_idx == 0: - for idx, dt in enumerate(row): - dt_structure[dt] = {} - idx_to_header[idx] = dt - if row_idx+1 in validate_rows: # 1 based indexing - sample = copy.deepcopy(dt_structure) - for idx, dt in enumerate(row): - sample[idx_to_header[idx]] = dt - _prmompt_dict[ row_idx+1 ] = sample - return _prmompt_dict - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--form_path", - type=str, - default="https://docs.google.com/spreadsheets/d/10bCwOhM8zKNkqKi54gIvdwrR44YlWQFV9fpGm7acHv8/export?format=csv&id=10bCwOhM8zKNkqKi54gIvdwrR44YlWQFV9fpGm7acHv8&gid=726399306", - help="Path of the google sheet." - ) - parser.add_argument( - "--overwrite", - action="store_true", - help="Overwrite eexisting prompt file prompts.csv." 
- ) - parser.add_argument( - "--prompt-dir", - type=str, - default="data/", - help="Overwrite existing prompt file prompts.csv." - ) - parser.add_argument( - "--validate-rows", - nargs='*', - default=[3], - type=int, - help="List of row indices (1-based indexing ). The row mentioned here will indicate the row of `--form_path` spreadsheet." - ) - args = parser.parse_args() - prompt_file_path = f"{args.prompt_dir}/prompts.csv" - if os.path.exists(prompt_file_path) and args.overwrite: # if file exists, it may be from prev. run/download. - subprocess.check_output(f"mv {prompt_file_path} {prompt_file_path}.old", shell=True) - if not os.path.exists(prompt_file_path): - cmd = f"curl -L '{args.form_path}' -o {prompt_file_path}" - subprocess.check_output(cmd, shell=True) - - prompt_dict = parse(prompt_file_path, args.validate_rows) - for row_id, prompt_template_data in prompt_dict.items(): - print(f"Validating row {row_id} ...") - validate(prompt_template_data, row_id) - - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/scripts/check_prompt.sh b/scripts/check_prompt.sh deleted file mode 100644 index 729dc5d..0000000 --- a/scripts/check_prompt.sh +++ /dev/null @@ -1,3 +0,0 @@ -python3 -m data.check_prompts \ ---prompt-dir data \ ---validate-rows 3 \ No newline at end of file From a9f210caa68dc105bb74d3836e1a37773d307059 Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Fri, 26 May 2023 00:20:25 +0800 Subject: [PATCH 17/34] add split language --- data/data_stat.py | 658 +++++++++++++++++++++------------------------- 1 file changed, 296 insertions(+), 362 deletions(-) diff --git a/data/data_stat.py b/data/data_stat.py index 3ea52cb..0fb7337 100644 --- a/data/data_stat.py +++ b/data/data_stat.py @@ -6,364 +6,299 @@ # huggingface dataset signature with configs SERIES_A_DATASET_NAME_DICT = { - "udhr": None, - "AmazonScience/mintaka": None, - "xcsr": [ - 'X-CSQA-en', - 'X-CSQA-zh', - 'X-CSQA-de', - 'X-CSQA-es', - 'X-CSQA-fr', - 'X-CSQA-it', - 'X-CSQA-jap', - 'X-CSQA-nl', - 'X-CSQA-pl', - 'X-CSQA-pt', - 'X-CSQA-ru', - 'X-CSQA-ar', - 'X-CSQA-vi', - 'X-CSQA-hi', - 'X-CSQA-sw', - 'X-CSQA-ur', - 'X-CODAH-en', - 'X-CODAH-zh', - 'X-CODAH-de', - 'X-CODAH-es', - 'X-CODAH-fr', - 'X-CODAH-it', - 'X-CODAH-jap', - 'X-CODAH-nl', - 'X-CODAH-pl', - 'X-CODAH-pt', - 'X-CODAH-ru', - 'X-CODAH-ar', - 'X-CODAH-vi', - 'X-CODAH-hi', - 'X-CODAH-sw', - 'X-CODAH-ur' - ], - "shmuhammad/AfriSenti-twitter-sentiment": [ - 'amh', - 'hau', - 'ibo', - 'arq', - 'ary', - 'yor', - 'por', - 'twi', - 'tso', - 'tir', - 'pcm', - 'kin', - 'swa' - ], # orm is not workin - "indonlp/NusaX-senti": [ - 'ace', - 'ban', - 'bjn', - 'bug', - 'eng', - 'ind', - 'jav', - 'mad', - 'min', - 'nij', - 'sun', - 'bbc' - ], - "masakhane/masakhanews": [ - 'amh', - 'eng', - 'fra', - 'hau', - 'ibo', - 'lin', - 'lug', - 'orm', - 'pcm', - 'run', - 'sna', - 'som', - 'swa', - 'tir', - 'xho', - 'yor' - ], - "papluca/language-identification": [ - 'wikipedia-zero-shot', - 'wikipedia-zero-shot.af', - 'wikipedia-zero-shot.ar', - 'wikipedia-zero-shot.be', - 'wikipedia-zero-shot.bg', - 'wikipedia-zero-shot.bn', - 'wikipedia-zero-shot.ca', - 'wikipedia-zero-shot.cs', - 'wikipedia-zero-shot.da', - 'wikipedia-zero-shot.de', - 'wikipedia-zero-shot.el', - 'wikipedia-zero-shot.en', - 'wikipedia-zero-shot.es', - 'wikipedia-zero-shot.fa', - 'wikipedia-zero-shot.fi', - 'wikipedia-zero-shot.fr', - 'wikipedia-zero-shot.he', - 'wikipedia-zero-shot.hi', - 'wikipedia-zero-shot.hu', - 'wikipedia-zero-shot.id', - 'wikipedia-zero-shot.it', - 
'wikipedia-zero-shot.ja', - 'wikipedia-zero-shot.ko', - 'wikipedia-zero-shot.ml', - 'wikipedia-zero-shot.mr', - 'wikipedia-zero-shot.ms', - 'wikipedia-zero-shot.nl', - 'wikipedia-zero-shot.no', - 'wikipedia-zero-shot.pl', - 'wikipedia-zero-shot.pt', - 'wikipedia-zero-shot.ro', - 'wikipedia-zero-shot.ru', - 'wikipedia-zero-shot.si', - 'wikipedia-zero-shot.sk', - 'wikipedia-zero-shot.sl', - 'wikipedia-zero-shot.sr', - 'wikipedia-zero-shot.sv', - 'wikipedia-zero-shot.sw', - 'wikipedia-zero-shot.ta', - 'wikipedia-zero-shot.te', - 'wikipedia-zero-shot.th', - 'wikipedia-zero-shot.tr', - 'wikipedia-zero-shot.uk', - 'wikipedia-zero-shot.vi', - 'wikipedia-zero-shot.zh', - 'wikinews-zero-shot', - 'wikinews-zero-shot.ar', - 'wikinews-zero-shot.cs', - 'wikinews-zero-shot.de', - 'wikinews-zero-shot.en', - 'wikinews-zero-shot.es', - 'wikinews-zero-shot.fi', - 'wikinews-zero-shot.fr', - 'wikinews-zero-shot.it', - 'wikinews-zero-shot.ja', - 'wikinews-zero-shot.ko', - 'wikinews-zero-shot.nl', - 'wikinews-zero-shot.no', - 'wikinews-zero-shot.pl', - 'wikinews-zero-shot.pt', - 'wikinews-zero-shot.ru', - 'wikinews-zero-shot.sr', - 'wikinews-zero-shot.sv', - 'wikinews-zero-shot.ta', - 'wikinews-zero-shot.tr', - 'wikinews-zero-shot.uk', - 'wikinews-zero-shot.zh', - 'wikinews-cross-domain', - 'wikinews-cross-domain.ar', - 'wikinews-cross-domain.bg', - 'wikinews-cross-domain.ca', - 'wikinews-cross-domain.cs', - 'wikinews-cross-domain.de', - 'wikinews-cross-domain.el', - 'wikinews-cross-domain.en', - 'wikinews-cross-domain.es', - 'wikinews-cross-domain.fi', - 'wikinews-cross-domain.fr', - 'wikinews-cross-domain.he', - 'wikinews-cross-domain.hu', - 'wikinews-cross-domain.it', - 'wikinews-cross-domain.ja', - 'wikinews-cross-domain.ko', - 'wikinews-cross-domain.nl', - 'wikinews-cross-domain.no', - 'wikinews-cross-domain.pl', - 'wikinews-cross-domain.pt', - 'wikinews-cross-domain.ro', - 'wikinews-cross-domain.ru', - 'wikinews-cross-domain.sr', - 'wikinews-cross-domain.sv', - 'wikinews-cross-domain.ta', - 'wikinews-cross-domain.tr', - 'wikinews-cross-domain.uk', - 'wikinews-cross-domain.zh' - ], - "adithya7/xlel_wd": [ - 'wikipedia-zero-shot', - 'wikipedia-zero-shot.af', - 'wikipedia-zero-shot.ar', - 'wikipedia-zero-shot.be', - 'wikipedia-zero-shot.bg', - 'wikipedia-zero-shot.bn', - 'wikipedia-zero-shot.ca', - 'wikipedia-zero-shot.cs', - 'wikipedia-zero-shot.da', - 'wikipedia-zero-shot.de', - 'wikipedia-zero-shot.el', - 'wikipedia-zero-shot.en', - 'wikipedia-zero-shot.es', - 'wikipedia-zero-shot.fa', - 'wikipedia-zero-shot.fi', - 'wikipedia-zero-shot.fr', - 'wikipedia-zero-shot.he', - 'wikipedia-zero-shot.hi', - 'wikipedia-zero-shot.hu', - 'wikipedia-zero-shot.id', - 'wikipedia-zero-shot.it', - 'wikipedia-zero-shot.ja', - 'wikipedia-zero-shot.ko', - 'wikipedia-zero-shot.ml', - 'wikipedia-zero-shot.mr', - 'wikipedia-zero-shot.ms', - 'wikipedia-zero-shot.nl', - 'wikipedia-zero-shot.no', - 'wikipedia-zero-shot.pl', - 'wikipedia-zero-shot.pt', - 'wikipedia-zero-shot.ro', - 'wikipedia-zero-shot.ru', - 'wikipedia-zero-shot.si', - 'wikipedia-zero-shot.sk', - 'wikipedia-zero-shot.sl', - 'wikipedia-zero-shot.sr', - 'wikipedia-zero-shot.sv', - 'wikipedia-zero-shot.sw', - 'wikipedia-zero-shot.ta', - 'wikipedia-zero-shot.te', - 'wikipedia-zero-shot.th', - 'wikipedia-zero-shot.tr', - 'wikipedia-zero-shot.uk', - 'wikipedia-zero-shot.vi', - 'wikipedia-zero-shot.zh', - 'wikinews-zero-shot', - 'wikinews-zero-shot.ar', - 'wikinews-zero-shot.cs', - 'wikinews-zero-shot.de', - 'wikinews-zero-shot.en', - 'wikinews-zero-shot.es', - 
'wikinews-zero-shot.fi', - 'wikinews-zero-shot.fr', - 'wikinews-zero-shot.it', - 'wikinews-zero-shot.ja', - 'wikinews-zero-shot.ko', - 'wikinews-zero-shot.nl', - 'wikinews-zero-shot.no', - 'wikinews-zero-shot.pl', - 'wikinews-zero-shot.pt', - 'wikinews-zero-shot.ru', - 'wikinews-zero-shot.sr', - 'wikinews-zero-shot.sv', - 'wikinews-zero-shot.ta', - 'wikinews-zero-shot.tr', - 'wikinews-zero-shot.uk', - 'wikinews-zero-shot.zh', - 'wikinews-cross-domain', - 'wikinews-cross-domain.ar', - 'wikinews-cross-domain.bg', - 'wikinews-cross-domain.ca', - 'wikinews-cross-domain.cs', - 'wikinews-cross-domain.de', - 'wikinews-cross-domain.el', - 'wikinews-cross-domain.en', - 'wikinews-cross-domain.es', - 'wikinews-cross-domain.fi', - 'wikinews-cross-domain.fr', - 'wikinews-cross-domain.he', - 'wikinews-cross-domain.hu', - 'wikinews-cross-domain.it', - 'wikinews-cross-domain.ja', - 'wikinews-cross-domain.ko', - 'wikinews-cross-domain.nl', - 'wikinews-cross-domain.no', - 'wikinews-cross-domain.pl', - 'wikinews-cross-domain.pt', - 'wikinews-cross-domain.ro', - 'wikinews-cross-domain.ru', - 'wikinews-cross-domain.sr', - 'wikinews-cross-domain.sv', - 'wikinews-cross-domain.ta', - 'wikinews-cross-domain.tr', - 'wikinews-cross-domain.uk', - 'wikinews-cross-domain.zh' - ], - "sbmaruf/forai_ml-ted_talk_iwslt": [ - 'eu_ca_2014', - 'eu_ca_2015', - 'eu_ca_2016', - 'nl_en_2014', - 'nl_en_2015', - 'nl_en_2016', - 'nl_hi_2014', - 'nl_hi_2015', - 'nl_hi_2016', - 'de_ja_2014', - 'de_ja_2015', - 'de_ja_2016', - 'fr-ca_hi_2014', - 'fr-ca_hi_2015', - 'fr-ca_hi_2016' - ], - "sbmaruf/forai_ml_masakhane_mafand":[ - 'en-amh', - 'en-hau', - 'en-ibo', - 'en-kin', - 'en-lug', - 'en-nya', - 'en-pcm', - 'en-sna', - 'en-swa', - 'en-tsn', - 'en-twi', - 'en-xho', - 'en-yor', - 'en-zul', - 'fr-bam', - 'fr-bbj', - 'fr-ewe', - 'fr-fon', - 'fr-mos', - 'fr-wol' - ], - "exams":[ - 'alignments', - 'multilingual', - 'multilingual_with_para', - 'crosslingual_test', - 'crosslingual_with_para_test', - 'crosslingual_bg', - 'crosslingual_with_para_bg', - 'crosslingual_hr', - 'crosslingual_with_para_hr', - 'crosslingual_hu', - 'crosslingual_with_para_hu', - 'crosslingual_it', - 'crosslingual_with_para_it', - 'crosslingual_mk', - 'crosslingual_with_para_mk', - 'crosslingual_pl', - 'crosslingual_with_para_pl', - 'crosslingual_pt', - 'crosslingual_with_para_pt', - 'crosslingual_sq', - 'crosslingual_with_para_sq', - 'crosslingual_sr', - 'crosslingual_with_para_sr', - 'crosslingual_tr', - 'crosslingual_with_para_tr', - 'crosslingual_vi', - 'crosslingual_with_para_vi' - ], - "allenai/soda": None, - "arabic_billion_words":[ - 'Alittihad', - 'Almasryalyoum', - 'Almustaqbal', - 'Alqabas', - 'Echoroukonline', - 'Ryiadh', - 'Sabanews', - 'SaudiYoum', - 'Techreen', - 'Youm7' - ] + "udhr": { + None: "mixed" + }, + "AmazonScience/mintaka": { + "ar": "ar", + "de": "de", + "en": "en", + "es": "es", + "fr": "fr", + "hi": "hi", + "it": "it", + "ja": "ja", + "pt": "pt", + }, + "xcsr": { + 'X-CSQA-en': "en", + 'X-CSQA-zh': "zh", + 'X-CSQA-de': "de", + 'X-CSQA-es': "es", + 'X-CSQA-fr': "fr", + 'X-CSQA-it': "it", + 'X-CSQA-jap': "ja", + 'X-CSQA-nl': "nl", + 'X-CSQA-pl': "pl", + 'X-CSQA-pt': "pt", + 'X-CSQA-ru': "ru", + 'X-CSQA-ar': "ar", + 'X-CSQA-vi': "vi", + 'X-CSQA-hi': "hi", + 'X-CSQA-sw': "sw", + 'X-CSQA-ur': "ur", + # 'X-CODAH-en': "en", + # 'X-CODAH-zh': "zh", + # 'X-CODAH-de': "de", + # 'X-CODAH-es': "es", + # 'X-CODAH-fr': "fr", + # 'X-CODAH-it': "it", + # 'X-CODAH-jap': "ja", + # 'X-CODAH-nl': "nl", + # 'X-CODAH-pl': "pl", + # 'X-CODAH-pt': "pt", + # 
'X-CODAH-ru': "ru", + # 'X-CODAH-ar': "ar", + # 'X-CODAH-vi': "vi", + # 'X-CODAH-hi': "hi", + # 'X-CODAH-sw': "sw", + # 'X-CODAH-ur': "ur", + }, + "shmuhammad/AfriSenti-twitter-sentiment": { + 'amh':'amh', + 'hau':'hau', + 'ibo':'ibo', + 'arq':'arq', + 'ary':'ary', + # 'yor':'yor', + 'por':'por', + 'twi':'twi', + 'tso':'tso', + 'tir':'tir', + 'pcm':'pcm', + 'kin':'kin', + 'swa': 'swa', + # 'orm': 'orm', + }, + "indonlp/NusaX-senti": { + 'ace':'ace', + 'ban':'ban', + 'bjn':'bjn', + # 'bug':'bug', + 'eng':'eng', + 'ind':'ind', + # 'jav':'jav', + 'mad':'mad', + 'min':'min', + 'nij':'nij', + 'sun':'sun', + 'bbc':'bbc', + }, + "masakhane/masakhanews": { + 'amh':'amh', + 'eng':'eng', + 'fra':'fra', + 'hau':'hau', + 'ibo':'ibo', + 'lin':'lin', + 'lug':'lug', + 'orm':'orm', + 'pcm':'pcm', + 'run':'run', + 'sna':'sna', + 'som':'som', + 'swa':'swa', + 'tir':'tir', + 'xho':'xho', + 'yor':'yor', + }, + "papluca/language-identification": { + None: "mixed", + }, + "adithya7/xlel_wd": { + 'wikipedia-zero-shot': "mixed", + 'wikinews-zero-shot': "mixed", + 'wikinews-cross-domain': "mixed", + 'wikipedia-zero-shot.af': 'af', + 'wikipedia-zero-shot.ar': 'ar', + 'wikipedia-zero-shot.be': 'be', + 'wikipedia-zero-shot.bg': 'bg', + 'wikipedia-zero-shot.bn': 'bn', + 'wikipedia-zero-shot.ca': 'ca', + 'wikipedia-zero-shot.cs': 'cs', + 'wikipedia-zero-shot.da': 'da', + 'wikipedia-zero-shot.de': 'de', + 'wikipedia-zero-shot.el': 'el', + 'wikipedia-zero-shot.en': 'en', + 'wikipedia-zero-shot.es': 'es', + 'wikipedia-zero-shot.fa': 'fa', + 'wikipedia-zero-shot.fi': 'fi', + 'wikipedia-zero-shot.fr': 'fr', + 'wikipedia-zero-shot.he': 'he', + 'wikipedia-zero-shot.hi': 'hi', + 'wikipedia-zero-shot.hu': 'hu', + 'wikipedia-zero-shot.id': 'id', + 'wikipedia-zero-shot.it': 'it', + 'wikipedia-zero-shot.ja': 'ja', + 'wikipedia-zero-shot.ko': 'ko', + 'wikipedia-zero-shot.ml': 'ml', + 'wikipedia-zero-shot.mr': 'mr', + 'wikipedia-zero-shot.ms': 'ms', + 'wikipedia-zero-shot.nl': 'nl', + 'wikipedia-zero-shot.no': 'no', + 'wikipedia-zero-shot.pl': 'pl', + 'wikipedia-zero-shot.pt': 'pt', + 'wikipedia-zero-shot.ro': 'ro', + 'wikipedia-zero-shot.ru': 'ru', + 'wikipedia-zero-shot.si': 'si', + 'wikipedia-zero-shot.sk': 'sk', + 'wikipedia-zero-shot.sl': 'sl', + 'wikipedia-zero-shot.sr': 'sr', + 'wikipedia-zero-shot.sv': 'sv', + 'wikipedia-zero-shot.sw': 'sw', + 'wikipedia-zero-shot.ta': 'ta', + 'wikipedia-zero-shot.te': 'te', + 'wikipedia-zero-shot.th': 'th', + 'wikipedia-zero-shot.tr': 'tr', + 'wikipedia-zero-shot.uk': 'uk', + 'wikipedia-zero-shot.vi': 'vi', + 'wikipedia-zero-shot.zh': 'zh', + 'wikinews-zero-shot.ar': 'ar', + 'wikinews-zero-shot.cs': 'cs', + 'wikinews-zero-shot.de': 'de', + 'wikinews-zero-shot.en': 'en', + 'wikinews-zero-shot.es': 'es', + 'wikinews-zero-shot.fi': 'fi', + 'wikinews-zero-shot.fr': 'fr', + 'wikinews-zero-shot.it': 'it', + 'wikinews-zero-shot.ja': 'ja', + 'wikinews-zero-shot.ko': 'ko', + 'wikinews-zero-shot.nl': 'nl', + 'wikinews-zero-shot.no': 'no', + 'wikinews-zero-shot.pl': 'pl', + 'wikinews-zero-shot.pt': 'pt', + 'wikinews-zero-shot.ru': 'ru', + 'wikinews-zero-shot.sr': 'sr', + 'wikinews-zero-shot.sv': 'sv', + 'wikinews-zero-shot.ta': 'ta', + # 'wikinews-zero-shot.tr': 'tr', + 'wikinews-zero-shot.uk': 'uk', + 'wikinews-zero-shot.zh': 'zh', + 'wikinews-cross-domain.ar': 'ar', + 'wikinews-cross-domain.bg': 'bg', + 'wikinews-cross-domain.ca': 'ca', + 'wikinews-cross-domain.cs': 'cs', + 'wikinews-cross-domain.de': 'de', + 'wikinews-cross-domain.el': 'el', + 'wikinews-cross-domain.en': 'en', + 
+        'wikinews-cross-domain.es': 'es',
+        'wikinews-cross-domain.fi': 'fi',
+        'wikinews-cross-domain.fr': 'fr',
+        'wikinews-cross-domain.he': 'he',
+        'wikinews-cross-domain.hu': 'hu',
+        'wikinews-cross-domain.it': 'it',
+        'wikinews-cross-domain.ja': 'ja',
+        'wikinews-cross-domain.ko': 'ko',
+        'wikinews-cross-domain.nl': 'nl',
+        'wikinews-cross-domain.no': 'no',
+        'wikinews-cross-domain.pl': 'pl',
+        'wikinews-cross-domain.pt': 'pt',
+        'wikinews-cross-domain.ro': 'ro',
+        'wikinews-cross-domain.ru': 'ru',
+        'wikinews-cross-domain.sr': 'sr',
+        'wikinews-cross-domain.sv': 'sv',
+        'wikinews-cross-domain.ta': 'ta',
+        'wikinews-cross-domain.tr': 'tr',
+        'wikinews-cross-domain.uk': 'uk',
+        'wikinews-cross-domain.zh': 'zh',
+    },
+    "sbmaruf/forai_ml-ted_talk_iwslt": {
+        'eu_ca_2014': 'eu_ca',
+        'eu_ca_2015': 'eu_ca',
+        'eu_ca_2016': 'eu_ca',
+        'nl_en_2014': 'nl_en',
+        'nl_en_2015': 'nl_en',
+        'nl_en_2016': 'nl_en',
+        'nl_hi_2014': 'nl_hi',
+        'nl_hi_2015': 'nl_hi',
+        'nl_hi_2016': 'nl_hi',
+        'de_ja_2014': 'de_ja',
+        'de_ja_2015': 'de_ja',
+        'de_ja_2016': 'de_ja',
+        'fr-ca_hi_2014': 'fr_hi',
+        'fr-ca_hi_2015': 'fr_hi',
+        'fr-ca_hi_2016': 'fr_hi',
+    },
+    "sbmaruf/forai_ml_masakhane_mafand":{
+        'en-amh': 'en-amh',
+        'en-hau': 'en-hau',
+        'en-ibo': 'en-ibo',
+        'en-kin': 'en-kin',
+        'en-lug': 'en-lug',
+        'en-nya': 'en-nya',
+        'en-pcm': 'en-pcm',
+        'en-sna': 'en-sna',
+        'en-swa': 'en-swa',
+        'en-tsn': 'en-tsn',
+        'en-twi': 'en-twi',
+        'en-xho': 'en-xho',
+        'en-yor': 'en-yor',
+        'en-zul': 'en-zul',
+        'fr-bam': 'fr-bam',
+        'fr-bbj': 'fr-bbj',
+        'fr-ewe': 'fr-ewe',
+        'fr-fon': 'fr-fon',
+        'fr-mos': 'fr-mos',
+        'fr-wol': 'fr-wol',
+    },
+    "exams":{
+        # 'alignments': 'mixed',
+        'multilingual': 'mixed',
+        'multilingual_with_para': 'mixed',
+        'crosslingual_test': 'mixed',
+        'crosslingual_with_para_test': 'mixed',
+        'crosslingual_bg': "bg",
+        'crosslingual_with_para_bg': "bg",
+        'crosslingual_hr': "hr",
+        'crosslingual_with_para_hr': "hr",
+        'crosslingual_hu': "hu",
+        'crosslingual_with_para_hu': "hu",
+        'crosslingual_it': "it",
+        'crosslingual_with_para_it': "it",
+        'crosslingual_mk': "mk",
+        'crosslingual_with_para_mk': "mk",
+        'crosslingual_pl': "pl",
+        'crosslingual_with_para_pl': "pl",
+        'crosslingual_pt': "pt",
+        'crosslingual_with_para_pt': "pt",
+        'crosslingual_sq': "sq",
+        'crosslingual_with_para_sq': "sq",
+        'crosslingual_sr': "sr",
+        'crosslingual_with_para_sr': "sr",
+        'crosslingual_tr': "tr",
+        'crosslingual_with_para_tr': "tr",
+        'crosslingual_vi': "vi",
+        'crosslingual_with_para_vi': "vi",
+    },
+    "allenai/soda": {
+        None: "en",
+    },
+    "arabic_billion_words": {
+        'Alittihad': "Alittihad",
+        'Almasryalyoum': "Almasryalyoum",
+        'Almustaqbal': "Almustaqbal",
+        'Alqabas': "Alqabas",
+        'Echoroukonline': "Echoroukonline",
+        'Ryiadh': "Ryiadh",
+        'Sabanews': "Sabanews",
+        'SaudiYoum': "SaudiYoum",
+        'Techreen': "Techreen",
+        'Youm7': "Youm7",
+    },
+    "theblackcat102/joke_explaination": {
+        None: "en",
+    },
+    "narrativeqa": {
+        None: "en",
+    },
+    "svakulenk0/qrecc": {
+        None: "en",
+    },
+    "GEM/wiki_cat_sum": {
+        "animal": "en",
+        "company": "en",
+        "film": "en",
+    }
 }
 
 def main():
@@ -389,16 +324,15 @@ def main():
     stat_dict = {}
     if args.dataset_names is None:
         args.dataset_names = list(SERIES_A_DATASET_NAME_DICT.keys())
-    for dataset_name, subset_names in SERIES_A_DATASET_NAME_DICT.items():
+    for dataset_name, subset_dict in SERIES_A_DATASET_NAME_DICT.items():
         if dataset_name not in args.dataset_names:
             continue
         assert dataset_name not in stat_dict
         stat_dict[dataset_name] = {}
-        subset_names = [None] if subset_names is None else subset_names
-        for subset in subset_names:
+        for subset, subset_lang in subset_dict.items():
             assert subset not in stat_dict[dataset_name]
             stat_dict[dataset_name][subset] = {}
-            dt = datasets.load_dataset(dataset_name, name=subset, ignore_verifications=True)
+            dt = datasets.load_dataset(dataset_name, name=subset, verification_mode="no_checks")
             for split in dt.keys():
                 stat_dict[dataset_name][subset][split] = {
                     "size": len(dt[split]),

From 023b257771c0ef2cc8a2dbc2d33f27c22f2f6aa6 Mon Sep 17 00:00:00 2001
From: M Saiful Bari
Date: Fri, 26 May 2023 00:21:17 +0800
Subject: [PATCH 18/34] Automatic script running

---
 data/validate_and_generate.py    | 324 +++++++++++++++++++++++++++++++
 scripts/validate_and_generate.sh |   6 +
 2 files changed, 330 insertions(+)
 create mode 100644 data/validate_and_generate.py
 create mode 100644 scripts/validate_and_generate.sh

diff --git a/data/validate_and_generate.py b/data/validate_and_generate.py
new file mode 100644
index 0000000..ffc5f71
--- /dev/null
+++ b/data/validate_and_generate.py
@@ -0,0 +1,324 @@
+import os
+import csv
+import copy
+import json
+import tqdm
+import argparse
+import datasets
+import subprocess
+from datetime import date
+import concurrent.futures
+from typing import Tuple, Optional, List
+from promptsource.templates import Template
+from .data_stat import SERIES_A_DATASET_NAME_DICT
+datasets.logging.set_verbosity_error()
+
+mt5_langs_name_pair = [
+    ("Afrikaans", "af"), ("Albanian", "sq"), ("Amharic", "am"), ("Arabic", "ar"), ("Armenian", "hy"), ("Azerbaijani", "az"),
+    ("Basque", "eu"), ("Belarusian", "be"), ("Bengali", "bn"), ("Bulgarian","bg"), ("Burmese", "my"),
+    ("Catalan", "ca"), ("Cebuano", "ceb"), ("Chichewa", "ny"), ("Chinese", "zh"), ("Corsican", "co"), ("Czech", "cs"),
+    ("Danish", "da"), ("Dutch", "nl"),
+    ("English", "en"), ("Esperanto", "eo"), ("Estonian", "et"),
+    ("Filipino", "fil"), ("Finnish", "fi"), ("French", "fr"),
+    ("Galician", "gl"), ("Georgian", "ka"), ("German", "de"), ("Greek", "el"), ("Gujarati", "gu"),
+    ("Haitian Creole", "ht"), ("Hausa", "ha"), ("Hawaiian", "haw"), ("Hebrew", "iw"), ("Hindi", "hi"), ("Hmong", "hmn"), ("Hungarian", "hu"),
+    ("Icelandic", "is"), ("Igbo", "ig"), ("Indonesian", "id"), ("Irish", "ga"), ("Italian", "it"),
+    ("Japanese", "ja"), ("Javanese", "jv"),
+    ("Kannada", "kn"), ("Kazakh", "kk"), ("Khmer", "km"), ("Korean", "ko"), ("Kurdish", "ku"), ("Kyrgyz", "ky"),
+    ("Lao", "lo"), ("Latin", "la"), ("Latvian", "lv"), ("Lithuanian", "lt"), ("Luxembourgish", "lb"),
+    ("Macedonian", "mk"), ("Malagasy", "mg"), ("Malay", "ms"), ("Malayalam", "ml"), ("Maltese", "mt"), ("Maori", "mi"), ("Marathi", "mr"), ("Mongolian", "mn"),
+    ("Nepali", "ne"), ("Norwegian", "no"),
+    ("Pashto", "ps"), ("Persian", "fa"), ("Polish", "pl"), ("Portuguese", "pt"), ("Punjabi", "pa"),
+    ("Romanian", "ro"), ("Russian", "ru"),
+    ("Samoan", "sm"), ("Scottish Gaelic", "gd"), ("Serbian", "sr"), ("Shona", "sn"), ("Sindhi", "sd"), ("Sinhala", "si"), ("Slovak","sk"), ("Slovenian", "sl"), ("Somali", "so"), ("Sotho", "st"), ("Spanish", "es"), ("Sundanese", "su"), ("Swahili", "sw"), ("Swedish", "sv"),
+    ("Tajik", "tg"), ("Tamil", "ta"), ("Telugu", "te"), ("Thai", "th"), ("Turkish", "tr"),
+    ("Ukrainian", "uk"), ("Urdu", "ur"), ("Uzbek", "uz"),
+    ("Vietnamese", "vi"),
+    ("Welsh", "cy"), ("West Frisian", "fy"),
+    ("Xhosa", "xh"),
+    ("Yiddish", "yi"), ("Yoruba", "yo"), ("Zulu", "zu")
+]
+mt5_langs_full_name_to_iso_name = { full_name: iso_name for full_name, iso_name in mt5_langs_name_pair}
+
+dataset_mapper = {
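+    # Assumed reading of this mapping (added note, not in the original patch): each key is a
+    # human-readable task label of the form "name + source URL", and each value is the
+    # huggingface dataset signature that also appears as a key in SERIES_A_DATASET_NAME_DICT.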
"AfriSenti-twitter-sentiment https://huggingface.co/datasets/shmuhammad/AfriSenti-twitter-sentiment": "shmuhammad/AfriSenti-twitter-sentiment", + "Joke-explanation https://huggingface.co/datasets/theblackcat102/joke_explaination": "theblackcat102/joke_explaination", + "Language Identification https://huggingface.co/datasets/papluca/language-identification": "papluca/language-identification", + "Mafand - a machine translation task https://huggingface.co/datasets/masakhane/mafand": "sbmaruf/forai_ml_masakhane_mafand", + "Masakhanews https://github.com/masakhane-io/masakhane-news": "masakhane/masakhanews", + "Mintaka https://huggingface.co/datasets/AmazonScience/mintaka":"AmazonScience/mintaka", + "NarrativeQA https://huggingface.co/datasets/narrativeqa": "narrativeqa", + "NusaX - sentiment classification https://huggingface.co/datasets/indonlp/NusaX-senti": "indonlp/NusaX-senti", + "qrecc https://huggingface.co/datasets/svakulenk0/qrecc": "svakulenk0/qrecc", + "SODA https://huggingface.co/datasets/allenai/soda": "allenai/soda", + "TED https://huggingface.co/datasets/ted_talks_iwslt": "sbmaruf/forai_ml-ted_talk_iwslt", + "WikiCatSum https://huggingface.co/datasets/GEM/wiki_cat_sum": "GEM/wiki_cat_sum", + "X-CSQA https://huggingface.co/datasets/xcsr": "xcsr", + "xlel_wd https://huggingface.co/datasets/adithya7/xlel_wd": "adithya7/xlel_wd" +} + +IGNORE_TASKS = [ + "arabic_billion_words", + "narrativeqa", + "svakulenk0/qrecc" +] +def check( + json_example: str, + template_name: str, + jinja_template: str, + template_reference: Optional[str] = None, + original_task: Optional[str] = None, + choices_in_prompt: Optional[bool] = None, + metrics: Optional[List[str]] = None, + languages: Optional[List[str]] = None, + answer_choices: Optional[str] = None +)-> Tuple[str, str]: + """ + Given an example (`json_example`) from a huggingface dataset and prompt template (`jinja_template`), + the objective is to check if we can project the example in language model i/o format. + Args: + json_example (str): a string contains json object. The json object is loaded + by `json.loads()`. Typically this is a sample from + huggingface dataset converted to a string by a `json.dumps()`. + template_name: unique name (per dataset) for template + jinja_template: template expressed in Jinja + template_reference: string describing author or paper reference for template + original_task: If True, this prompt asks a model to perform the original task designed for + this dataset. + choices_in_prompt: If True, the answer choices are included in the templates such that models + see those choices in the input. Only applicable to classification tasks. + metrics: List of strings denoting metrics to use for evaluation + languages: List of strings denoting languages used in the prompt (not the associated dataset!) + answer_choices: Jinja expression for answer choices. Should produce + a ||| delimited string of choices that enumerates + the possible completions for templates that should + be evaluated as ranked completions. If None, then + the template is open-ended. This list is accessible + from within Jinja as the variable `answer_choices`. 
+ """ + json_example = json.loads(json_example) + metadata = Template.Metadata( + original_task, + choices_in_prompt, + metrics, + languages + ) + template = Template( + template_name, + jinja_template, + template_reference, + metadata=metadata, + answer_choices=answer_choices + ) + lm_io = template.apply(json_example, highlight_variables=False) + return lm_io + + +def create_name_with_hierarchy(output_dir, dataset_signature, dataset_subset, split_name, template_name, template_lang): + """ + /__/template-generation/