From 01b8112ab5ffa46b271193f508ee965164421757 Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Fri, 21 Apr 2023 02:30:25 +0800 Subject: [PATCH 01/34] ignore .DS_Store --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 31afbff..ac8a99b 100644 --- a/.gitignore +++ b/.gitignore @@ -129,3 +129,4 @@ dmypy.json # Pyre type checker .pyre/ +.DS_Store \ No newline at end of file From fe320037e1668ec5071707fcd50bceee82f8b8ce Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Fri, 21 Apr 2023 02:31:05 +0800 Subject: [PATCH 02/34] data stat generator --- data/data_stat.py | 342 +++++++++++++++++++++++++++++++++++++++++++ scripts/data_stat.sh | 5 + 2 files changed, 347 insertions(+) create mode 100644 data/data_stat.py create mode 100644 scripts/data_stat.sh diff --git a/data/data_stat.py b/data/data_stat.py new file mode 100644 index 0000000..0bc2d8e --- /dev/null +++ b/data/data_stat.py @@ -0,0 +1,342 @@ +import json +import datasets +import argparse + +# huggingface dataset signature with configs +SERIES_A_DATASET_NAME_DICT = { + "udhr": None, + "AmazonScience/mintaka": None, + "xcsr": [ + 'X-CSQA-en', + 'X-CSQA-zh', + 'X-CSQA-de', + 'X-CSQA-es', + 'X-CSQA-fr', + 'X-CSQA-it', + 'X-CSQA-jap', + 'X-CSQA-nl', + 'X-CSQA-pl', + 'X-CSQA-pt', + 'X-CSQA-ru', + 'X-CSQA-ar', + 'X-CSQA-vi', + 'X-CSQA-hi', + 'X-CSQA-sw', + 'X-CSQA-ur', + 'X-CODAH-en', + 'X-CODAH-zh', + 'X-CODAH-de', + 'X-CODAH-es', + 'X-CODAH-fr', + 'X-CODAH-it', + 'X-CODAH-jap', + 'X-CODAH-nl', + 'X-CODAH-pl', + 'X-CODAH-pt', + 'X-CODAH-ru', + 'X-CODAH-ar', + 'X-CODAH-vi', + 'X-CODAH-hi', + 'X-CODAH-sw', + 'X-CODAH-ur' + ], + "shmuhammad/AfriSenti-twitter-sentiment": [ + 'amh', + 'hau', + 'ibo', + 'arq', + 'ary', + 'yor', + 'por', + 'twi', + 'tso', + 'tir', + 'pcm', + 'kin', + 'swa' + ], # orm is not workin + "indonlp/NusaX-senti": [ + 'ace', + 'ban', + 'bjn', + 'bug', + 'eng', + 'ind', + 'jav', + 'mad', + 'min', + 'nij', + 'sun', + 'bbc' + ], + "sbmaruf/forai_ml-masakhane-news": [ + 'amh', + 'eng', + 'fra', + 'hau', + 'ibo', + 'lin', + 'lug', + 'orm', + 'pcm', + 'run', + 'sna', + 'som', + 'swa', + 'tir', + 'xho', + 'yor' + ], + "papluca/language-identification": [ + 'wikipedia-zero-shot', + 'wikipedia-zero-shot.af', + 'wikipedia-zero-shot.ar', + 'wikipedia-zero-shot.be', + 'wikipedia-zero-shot.bg', + 'wikipedia-zero-shot.bn', + 'wikipedia-zero-shot.ca', + 'wikipedia-zero-shot.cs', + 'wikipedia-zero-shot.da', + 'wikipedia-zero-shot.de', + 'wikipedia-zero-shot.el', + 'wikipedia-zero-shot.en', + 'wikipedia-zero-shot.es', + 'wikipedia-zero-shot.fa', + 'wikipedia-zero-shot.fi', + 'wikipedia-zero-shot.fr', + 'wikipedia-zero-shot.he', + 'wikipedia-zero-shot.hi', + 'wikipedia-zero-shot.hu', + 'wikipedia-zero-shot.id', + 'wikipedia-zero-shot.it', + 'wikipedia-zero-shot.ja', + 'wikipedia-zero-shot.ko', + 'wikipedia-zero-shot.ml', + 'wikipedia-zero-shot.mr', + 'wikipedia-zero-shot.ms', + 'wikipedia-zero-shot.nl', + 'wikipedia-zero-shot.no', + 'wikipedia-zero-shot.pl', + 'wikipedia-zero-shot.pt', + 'wikipedia-zero-shot.ro', + 'wikipedia-zero-shot.ru', + 'wikipedia-zero-shot.si', + 'wikipedia-zero-shot.sk', + 'wikipedia-zero-shot.sl', + 'wikipedia-zero-shot.sr', + 'wikipedia-zero-shot.sv', + 'wikipedia-zero-shot.sw', + 'wikipedia-zero-shot.ta', + 'wikipedia-zero-shot.te', + 'wikipedia-zero-shot.th', + 'wikipedia-zero-shot.tr', + 'wikipedia-zero-shot.uk', + 'wikipedia-zero-shot.vi', + 'wikipedia-zero-shot.zh', + 'wikinews-zero-shot', + 'wikinews-zero-shot.ar', + 'wikinews-zero-shot.cs', + 
'wikinews-zero-shot.de', + 'wikinews-zero-shot.en', + 'wikinews-zero-shot.es', + 'wikinews-zero-shot.fi', + 'wikinews-zero-shot.fr', + 'wikinews-zero-shot.it', + 'wikinews-zero-shot.ja', + 'wikinews-zero-shot.ko', + 'wikinews-zero-shot.nl', + 'wikinews-zero-shot.no', + 'wikinews-zero-shot.pl', + 'wikinews-zero-shot.pt', + 'wikinews-zero-shot.ru', + 'wikinews-zero-shot.sr', + 'wikinews-zero-shot.sv', + 'wikinews-zero-shot.ta', + 'wikinews-zero-shot.tr', + 'wikinews-zero-shot.uk', + 'wikinews-zero-shot.zh', + 'wikinews-cross-domain', + 'wikinews-cross-domain.ar', + 'wikinews-cross-domain.bg', + 'wikinews-cross-domain.ca', + 'wikinews-cross-domain.cs', + 'wikinews-cross-domain.de', + 'wikinews-cross-domain.el', + 'wikinews-cross-domain.en', + 'wikinews-cross-domain.es', + 'wikinews-cross-domain.fi', + 'wikinews-cross-domain.fr', + 'wikinews-cross-domain.he', + 'wikinews-cross-domain.hu', + 'wikinews-cross-domain.it', + 'wikinews-cross-domain.ja', + 'wikinews-cross-domain.ko', + 'wikinews-cross-domain.nl', + 'wikinews-cross-domain.no', + 'wikinews-cross-domain.pl', + 'wikinews-cross-domain.pt', + 'wikinews-cross-domain.ro', + 'wikinews-cross-domain.ru', + 'wikinews-cross-domain.sr', + 'wikinews-cross-domain.sv', + 'wikinews-cross-domain.ta', + 'wikinews-cross-domain.tr', + 'wikinews-cross-domain.uk', + 'wikinews-cross-domain.zh' + ], + "adithya7/xlel_wd": [ + 'wikipedia-zero-shot', + 'wikipedia-zero-shot.af', + 'wikipedia-zero-shot.ar', + 'wikipedia-zero-shot.be', + 'wikipedia-zero-shot.bg', + 'wikipedia-zero-shot.bn', + 'wikipedia-zero-shot.ca', + 'wikipedia-zero-shot.cs', + 'wikipedia-zero-shot.da', + 'wikipedia-zero-shot.de', + 'wikipedia-zero-shot.el', + 'wikipedia-zero-shot.en', + 'wikipedia-zero-shot.es', + 'wikipedia-zero-shot.fa', + 'wikipedia-zero-shot.fi', + 'wikipedia-zero-shot.fr', + 'wikipedia-zero-shot.he', + 'wikipedia-zero-shot.hi', + 'wikipedia-zero-shot.hu', + 'wikipedia-zero-shot.id', + 'wikipedia-zero-shot.it', + 'wikipedia-zero-shot.ja', + 'wikipedia-zero-shot.ko', + 'wikipedia-zero-shot.ml', + 'wikipedia-zero-shot.mr', + 'wikipedia-zero-shot.ms', + 'wikipedia-zero-shot.nl', + 'wikipedia-zero-shot.no', + 'wikipedia-zero-shot.pl', + 'wikipedia-zero-shot.pt', + 'wikipedia-zero-shot.ro', + 'wikipedia-zero-shot.ru', + 'wikipedia-zero-shot.si', + 'wikipedia-zero-shot.sk', + 'wikipedia-zero-shot.sl', + 'wikipedia-zero-shot.sr', + 'wikipedia-zero-shot.sv', + 'wikipedia-zero-shot.sw', + 'wikipedia-zero-shot.ta', + 'wikipedia-zero-shot.te', + 'wikipedia-zero-shot.th', + 'wikipedia-zero-shot.tr', + 'wikipedia-zero-shot.uk', + 'wikipedia-zero-shot.vi', + 'wikipedia-zero-shot.zh', + 'wikinews-zero-shot', + 'wikinews-zero-shot.ar', + 'wikinews-zero-shot.cs', + 'wikinews-zero-shot.de', + 'wikinews-zero-shot.en', + 'wikinews-zero-shot.es', + 'wikinews-zero-shot.fi', + 'wikinews-zero-shot.fr', + 'wikinews-zero-shot.it', + 'wikinews-zero-shot.ja', + 'wikinews-zero-shot.ko', + 'wikinews-zero-shot.nl', + 'wikinews-zero-shot.no', + 'wikinews-zero-shot.pl', + 'wikinews-zero-shot.pt', + 'wikinews-zero-shot.ru', + 'wikinews-zero-shot.sr', + 'wikinews-zero-shot.sv', + 'wikinews-zero-shot.ta', + 'wikinews-zero-shot.tr', + 'wikinews-zero-shot.uk', + 'wikinews-zero-shot.zh', + 'wikinews-cross-domain', + 'wikinews-cross-domain.ar', + 'wikinews-cross-domain.bg', + 'wikinews-cross-domain.ca', + 'wikinews-cross-domain.cs', + 'wikinews-cross-domain.de', + 'wikinews-cross-domain.el', + 'wikinews-cross-domain.en', + 'wikinews-cross-domain.es', + 'wikinews-cross-domain.fi', + 'wikinews-cross-domain.fr', + 
'wikinews-cross-domain.he', + 'wikinews-cross-domain.hu', + 'wikinews-cross-domain.it', + 'wikinews-cross-domain.ja', + 'wikinews-cross-domain.ko', + 'wikinews-cross-domain.nl', + 'wikinews-cross-domain.no', + 'wikinews-cross-domain.pl', + 'wikinews-cross-domain.pt', + 'wikinews-cross-domain.ro', + 'wikinews-cross-domain.ru', + 'wikinews-cross-domain.sr', + 'wikinews-cross-domain.sv', + 'wikinews-cross-domain.ta', + 'wikinews-cross-domain.tr', + 'wikinews-cross-domain.uk', + 'wikinews-cross-domain.zh' + ], + "ted_talks_iwslt": [ + 'eu_ca_2014', + 'eu_ca_2015', + 'eu_ca_2016', + 'nl_en_2014', + 'nl_en_2015', + 'nl_en_2016', + 'nl_hi_2014', + 'nl_hi_2015', + 'nl_hi_2016', + 'de_ja_2014', + 'de_ja_2015', + 'de_ja_2016', + 'fr-ca_hi_2014', + 'fr-ca_hi_2015', + 'fr-ca_hi_2016' + ] +} + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--dataset-names", + nargs="+", + default=None, + help="Print the stat of the dataset. If `None` it will print stat of all the used data." + ) + args = parser.parse_args() + stat_dict = {} + if args.dataset_names is None: + args.dataset_names = list(SERIES_A_DATASET_NAME_DICT.keys()) + for dataset_name, subset_names in SERIES_A_DATASET_NAME_DICT.items(): + if dataset_name not in args.dataset_names: + continue + stat_dict[dataset_name] = {} + if subset_names is None: + stat_dict[dataset_name]['Subset(None)'] = {} + dt = datasets.load_dataset(dataset_name, ignore_verifications=True) + for split in dt.keys(): + stat_dict[dataset_name]['Subset(None)'][split] = { + "size": len(dt[split]), + "column": list(dt[split].column_names), + } + else: + for subset in subset_names: + assert subset not in stat_dict[dataset_name] + stat_dict[dataset_name][subset] = {} + dt = datasets.load_dataset(dataset_name, name=subset, ignore_verifications=True) + for split in dt.keys(): + stat_dict[dataset_name][subset][split] = { + "size": len(dt[split]), + "column": list(dt[split].column_names), + } + + print(f"{json.dumps(stat_dict, indent=4)}") + +if __name__ == "__main__": + main() + diff --git a/scripts/data_stat.sh b/scripts/data_stat.sh new file mode 100644 index 0000000..4a22414 --- /dev/null +++ b/scripts/data_stat.sh @@ -0,0 +1,5 @@ +# to see all the stat +python data/data_stat.py + +# to see select dataset stat +python data/data_stat.py --dataset-names udhr \ No newline at end of file From 76a150b3a0fff564bf4be6eda347ffb008909412 Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Fri, 21 Apr 2023 02:36:34 +0800 Subject: [PATCH 03/34] download data from google sheet. --- data/check_prompts.py | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 data/check_prompts.py diff --git a/data/check_prompts.py b/data/check_prompts.py new file mode 100644 index 0000000..e8e4a00 --- /dev/null +++ b/data/check_prompts.py @@ -0,0 +1,40 @@ +import os +import csv +import argparse +import subprocess + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--form_path", + type=str, + default=None, + help="Path of the google sheet." + ) + parser.add_argument( + "--overwrite", + action="store_true", + help="Overwrite eexisting prompt file prompts.csv." + ) + parser.add_argument( + "--prompt-dir", + type=str, + default="data/", + help="Overwrite eexisting prompt file prompts.csv." + ) + args = parser.parse_args() + prompt_file_path = f"{args.prompt_dir}/prompts.csv" + if os.path.exists(prompt_file_path) and args.overwrite: # if file exists, it may be from prev. run/download. 
+ subprocess.check_output(f"mv {prompt_file_path} {prompt_file_path}.old", shell=True) + subprocess.check_output("curl -L https://docs.google.com/spreadsheets/d/10bCwOhM8zKNkqKi54gIvdwrR44YlWQFV9fpGm7acHv8/export?format=csv > ./data/prompts.csv", shell=True) + + + with open('data/prompts.csv', 'r') as csvfile: + csvreader = csv.reader(csvfile) + next(iter(csvreader)) + for row in csvreader: + print(row) + +if __name__ == "__main__": + main() \ No newline at end of file From 8c691a7f663fe4e34d5894a9b6064c91d0b352bb Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Fri, 21 Apr 2023 18:00:39 +0800 Subject: [PATCH 04/34] update official masakhane/masakhanews --- data/data_stat.py | 132 +++++++++++++++++++++++----------------------- 1 file changed, 66 insertions(+), 66 deletions(-) diff --git a/data/data_stat.py b/data/data_stat.py index 0bc2d8e..cc8b6ab 100644 --- a/data/data_stat.py +++ b/data/data_stat.py @@ -4,72 +4,72 @@ # huggingface dataset signature with configs SERIES_A_DATASET_NAME_DICT = { - "udhr": None, - "AmazonScience/mintaka": None, - "xcsr": [ - 'X-CSQA-en', - 'X-CSQA-zh', - 'X-CSQA-de', - 'X-CSQA-es', - 'X-CSQA-fr', - 'X-CSQA-it', - 'X-CSQA-jap', - 'X-CSQA-nl', - 'X-CSQA-pl', - 'X-CSQA-pt', - 'X-CSQA-ru', - 'X-CSQA-ar', - 'X-CSQA-vi', - 'X-CSQA-hi', - 'X-CSQA-sw', - 'X-CSQA-ur', - 'X-CODAH-en', - 'X-CODAH-zh', - 'X-CODAH-de', - 'X-CODAH-es', - 'X-CODAH-fr', - 'X-CODAH-it', - 'X-CODAH-jap', - 'X-CODAH-nl', - 'X-CODAH-pl', - 'X-CODAH-pt', - 'X-CODAH-ru', - 'X-CODAH-ar', - 'X-CODAH-vi', - 'X-CODAH-hi', - 'X-CODAH-sw', - 'X-CODAH-ur' - ], - "shmuhammad/AfriSenti-twitter-sentiment": [ - 'amh', - 'hau', - 'ibo', - 'arq', - 'ary', - 'yor', - 'por', - 'twi', - 'tso', - 'tir', - 'pcm', - 'kin', - 'swa' - ], # orm is not workin - "indonlp/NusaX-senti": [ - 'ace', - 'ban', - 'bjn', - 'bug', - 'eng', - 'ind', - 'jav', - 'mad', - 'min', - 'nij', - 'sun', - 'bbc' - ], - "sbmaruf/forai_ml-masakhane-news": [ + # "udhr": None, + # "AmazonScience/mintaka": None, + # "xcsr": [ + # 'X-CSQA-en', + # 'X-CSQA-zh', + # 'X-CSQA-de', + # 'X-CSQA-es', + # 'X-CSQA-fr', + # 'X-CSQA-it', + # 'X-CSQA-jap', + # 'X-CSQA-nl', + # 'X-CSQA-pl', + # 'X-CSQA-pt', + # 'X-CSQA-ru', + # 'X-CSQA-ar', + # 'X-CSQA-vi', + # 'X-CSQA-hi', + # 'X-CSQA-sw', + # 'X-CSQA-ur', + # 'X-CODAH-en', + # 'X-CODAH-zh', + # 'X-CODAH-de', + # 'X-CODAH-es', + # 'X-CODAH-fr', + # 'X-CODAH-it', + # 'X-CODAH-jap', + # 'X-CODAH-nl', + # 'X-CODAH-pl', + # 'X-CODAH-pt', + # 'X-CODAH-ru', + # 'X-CODAH-ar', + # 'X-CODAH-vi', + # 'X-CODAH-hi', + # 'X-CODAH-sw', + # 'X-CODAH-ur' + # ], + # "shmuhammad/AfriSenti-twitter-sentiment": [ + # 'amh', + # 'hau', + # 'ibo', + # 'arq', + # 'ary', + # 'yor', + # 'por', + # 'twi', + # 'tso', + # 'tir', + # 'pcm', + # 'kin', + # 'swa' + # ], # orm is not workin + # "indonlp/NusaX-senti": [ + # 'ace', + # 'ban', + # 'bjn', + # 'bug', + # 'eng', + # 'ind', + # 'jav', + # 'mad', + # 'min', + # 'nij', + # 'sun', + # 'bbc' + # ], + "masakhane/masakhanews": [ 'amh', 'eng', 'fra', From 76a50d40daefa64fb7616cfa575b9498d83dfd68 Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Sun, 23 Apr 2023 02:48:43 +0800 Subject: [PATCH 05/34] update jinja prompt loader --- data/check_prompts.py | 20 +++++++++++++++++++- scripts/check_prompt.sh | 0 2 files changed, 19 insertions(+), 1 deletion(-) create mode 100644 scripts/check_prompt.sh diff --git a/data/check_prompts.py b/data/check_prompts.py index e8e4a00..ed4b9bb 100644 --- a/data/check_prompts.py +++ b/data/check_prompts.py @@ -1,8 +1,27 @@ import os import csv +import 
json import argparse import subprocess +from promptsource.templates import Template +from .data_stat import SERIES_A_DATASET_NAME_DICT +def check( + json_example, + template_name, + jinja_template, + template_reference=None, + answer_choices=None +): + json_example = json.loads(json_example) + template = Template( + template_name, + jinja_template, + template_reference, + answer_choices=answer_choices + ) + lm_io = template.apply(json_example, highlight_variables=False) + return lm_io def main(): parser = argparse.ArgumentParser() @@ -29,7 +48,6 @@ def main(): subprocess.check_output(f"mv {prompt_file_path} {prompt_file_path}.old", shell=True) subprocess.check_output("curl -L https://docs.google.com/spreadsheets/d/10bCwOhM8zKNkqKi54gIvdwrR44YlWQFV9fpGm7acHv8/export?format=csv > ./data/prompts.csv", shell=True) - with open('data/prompts.csv', 'r') as csvfile: csvreader = csv.reader(csvfile) next(iter(csvreader)) diff --git a/scripts/check_prompt.sh b/scripts/check_prompt.sh new file mode 100644 index 0000000..e69de29 From 2f1f316cdd841da79f0ef115b12f9827b29c0957 Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Sun, 23 Apr 2023 02:51:49 +0800 Subject: [PATCH 06/34] update data source --- data/data_stat.py | 154 ++++++++++++++++++++++++++-------------------- 1 file changed, 88 insertions(+), 66 deletions(-) diff --git a/data/data_stat.py b/data/data_stat.py index cc8b6ab..68c8479 100644 --- a/data/data_stat.py +++ b/data/data_stat.py @@ -4,71 +4,71 @@ # huggingface dataset signature with configs SERIES_A_DATASET_NAME_DICT = { - # "udhr": None, - # "AmazonScience/mintaka": None, - # "xcsr": [ - # 'X-CSQA-en', - # 'X-CSQA-zh', - # 'X-CSQA-de', - # 'X-CSQA-es', - # 'X-CSQA-fr', - # 'X-CSQA-it', - # 'X-CSQA-jap', - # 'X-CSQA-nl', - # 'X-CSQA-pl', - # 'X-CSQA-pt', - # 'X-CSQA-ru', - # 'X-CSQA-ar', - # 'X-CSQA-vi', - # 'X-CSQA-hi', - # 'X-CSQA-sw', - # 'X-CSQA-ur', - # 'X-CODAH-en', - # 'X-CODAH-zh', - # 'X-CODAH-de', - # 'X-CODAH-es', - # 'X-CODAH-fr', - # 'X-CODAH-it', - # 'X-CODAH-jap', - # 'X-CODAH-nl', - # 'X-CODAH-pl', - # 'X-CODAH-pt', - # 'X-CODAH-ru', - # 'X-CODAH-ar', - # 'X-CODAH-vi', - # 'X-CODAH-hi', - # 'X-CODAH-sw', - # 'X-CODAH-ur' - # ], - # "shmuhammad/AfriSenti-twitter-sentiment": [ - # 'amh', - # 'hau', - # 'ibo', - # 'arq', - # 'ary', - # 'yor', - # 'por', - # 'twi', - # 'tso', - # 'tir', - # 'pcm', - # 'kin', - # 'swa' - # ], # orm is not workin - # "indonlp/NusaX-senti": [ - # 'ace', - # 'ban', - # 'bjn', - # 'bug', - # 'eng', - # 'ind', - # 'jav', - # 'mad', - # 'min', - # 'nij', - # 'sun', - # 'bbc' - # ], + "udhr": None, + "AmazonScience/mintaka": None, + "xcsr": [ + 'X-CSQA-en', + 'X-CSQA-zh', + 'X-CSQA-de', + 'X-CSQA-es', + 'X-CSQA-fr', + 'X-CSQA-it', + 'X-CSQA-jap', + 'X-CSQA-nl', + 'X-CSQA-pl', + 'X-CSQA-pt', + 'X-CSQA-ru', + 'X-CSQA-ar', + 'X-CSQA-vi', + 'X-CSQA-hi', + 'X-CSQA-sw', + 'X-CSQA-ur', + 'X-CODAH-en', + 'X-CODAH-zh', + 'X-CODAH-de', + 'X-CODAH-es', + 'X-CODAH-fr', + 'X-CODAH-it', + 'X-CODAH-jap', + 'X-CODAH-nl', + 'X-CODAH-pl', + 'X-CODAH-pt', + 'X-CODAH-ru', + 'X-CODAH-ar', + 'X-CODAH-vi', + 'X-CODAH-hi', + 'X-CODAH-sw', + 'X-CODAH-ur' + ], + "shmuhammad/AfriSenti-twitter-sentiment": [ + 'amh', + 'hau', + 'ibo', + 'arq', + 'ary', + 'yor', + 'por', + 'twi', + 'tso', + 'tir', + 'pcm', + 'kin', + 'swa' + ], # orm is not workin + "indonlp/NusaX-senti": [ + 'ace', + 'ban', + 'bjn', + 'bug', + 'eng', + 'ind', + 'jav', + 'mad', + 'min', + 'nij', + 'sun', + 'bbc' + ], "masakhane/masakhanews": [ 'amh', 'eng', @@ -281,7 +281,7 @@ 'wikinews-cross-domain.uk', 
'wikinews-cross-domain.zh' ], - "ted_talks_iwslt": [ + "sbmaruf/forai_ml-ted_talk_iwslt": [ 'eu_ca_2014', 'eu_ca_2015', 'eu_ca_2016', @@ -297,6 +297,28 @@ 'fr-ca_hi_2014', 'fr-ca_hi_2015', 'fr-ca_hi_2016' + ], + "sbmaruf/forai_ml_masakhane_mafand":[ + 'en-amh', + 'en-hau', + 'en-ibo', + 'en-kin', + 'en-lug', + 'en-nya', + 'en-pcm', + 'en-sna', + 'en-swa', + 'en-tsn', + 'en-twi', + 'en-xho', + 'en-yor', + 'en-zul', + 'fr-bam', + 'fr-bbj', + 'fr-ewe', + 'fr-fon', + 'fr-mos', + 'fr-wol' ] } From bbcb48704d7fa28beefe93047d0885deaf9eaf9e Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Sun, 23 Apr 2023 04:39:54 +0800 Subject: [PATCH 07/34] sanity check of scsqa structure --- data/data_stat.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/data/data_stat.py b/data/data_stat.py index 68c8479..f8905bb 100644 --- a/data/data_stat.py +++ b/data/data_stat.py @@ -356,6 +356,11 @@ def main(): "size": len(dt[split]), "column": list(dt[split].column_names), } + if "X-CSQA" in subset: + for sample in dt[split]: + assert len(sample['question']['choices']['label']) == 5 + + print(f"{json.dumps(stat_dict, indent=4)}") From 3f01f470f4be42da007f8422ae6b7af03bca9e77 Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Sun, 30 Apr 2023 20:06:06 +0800 Subject: [PATCH 08/34] adding more datasets and output formatting --- data/data_stat.py | 73 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 70 insertions(+), 3 deletions(-) diff --git a/data/data_stat.py b/data/data_stat.py index f8905bb..fcb5ae3 100644 --- a/data/data_stat.py +++ b/data/data_stat.py @@ -1,3 +1,5 @@ +import os +import csv import json import datasets import argparse @@ -319,6 +321,48 @@ 'fr-fon', 'fr-mos', 'fr-wol' + ], + "exams":[ + 'alignments', + 'multilingual', + 'multilingual_with_para', + 'crosslingual_test', + 'crosslingual_with_para_test', + 'crosslingual_bg', + 'crosslingual_with_para_bg', + 'crosslingual_hr', + 'crosslingual_with_para_hr', + 'crosslingual_hu', + 'crosslingual_with_para_hu', + 'crosslingual_it', + 'crosslingual_with_para_it', + 'crosslingual_mk', + 'crosslingual_with_para_mk', + 'crosslingual_pl', + 'crosslingual_with_para_pl', + 'crosslingual_pt', + 'crosslingual_with_para_pt', + 'crosslingual_sq', + 'crosslingual_with_para_sq', + 'crosslingual_sr', + 'crosslingual_with_para_sr', + 'crosslingual_tr', + 'crosslingual_with_para_tr', + 'crosslingual_vi', + 'crosslingual_with_para_vi' + ], + "allenai/soda": None, + "arabic_billion_words":[ + 'Alittihad', + 'Almasryalyoum', + 'Almustaqbal', + 'Alqabas', + 'Echoroukonline', + 'Ryiadh', + 'Sabanews', + 'SaudiYoum', + 'Techreen', + 'Youm7' ] } @@ -330,6 +374,17 @@ def main(): default=None, help="Print the stat of the dataset. If `None` it will print stat of all the used data." ) + parser.add_argument( + "--export-format", + choices=['json', "csv"], + default=".json", + help="Which format you want to export." + ) + parser.add_argument( + "--output-dir", + default=None, + help="The path to the folder where stat will be saved." 
+ ) args = parser.parse_args() stat_dict = {} if args.dataset_names is None: @@ -360,9 +415,21 @@ def main(): for sample in dt[split]: assert len(sample['question']['choices']['label']) == 5 - - - print(f"{json.dumps(stat_dict, indent=4)}") + if args.output_dir != 'None': + file_name = os.path.join(args.output_dir, "stat") + f".{args.export_format}" + if args.export_format == "json": + with open(file_name, "w") as file_ptr: + file_ptr.write(f"{json.dumps(stat_dict, indent=4)}\n") + elif args.export_format == "csv": + # with open(file_name, mode='w') as file_ptr: + # writer = csv.writer(file_ptr) + # for dataset_name, subset_name, in SERIES_A_DATASET_NAME_DICT.keys(): + # row = [f"{dataset_name}"] + + # writer.writerow(stat_dict.values()) + pass + else: + raise NotImplementedError if __name__ == "__main__": main() From bf8f2c064c44d9d012bdf4d36a833727d0701d3a Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Sun, 30 Apr 2023 20:13:10 +0800 Subject: [PATCH 09/34] refactoring --- data/data_stat.py | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/data/data_stat.py b/data/data_stat.py index fcb5ae3..3ea52cb 100644 --- a/data/data_stat.py +++ b/data/data_stat.py @@ -392,28 +392,22 @@ def main(): for dataset_name, subset_names in SERIES_A_DATASET_NAME_DICT.items(): if dataset_name not in args.dataset_names: continue + assert dataset_name not in stat_dict stat_dict[dataset_name] = {} - if subset_names is None: - stat_dict[dataset_name]['Subset(None)'] = {} - dt = datasets.load_dataset(dataset_name, ignore_verifications=True) + subset_names = [None] if subset_names is None else subset_names + for subset in subset_names: + assert subset not in stat_dict[dataset_name] + stat_dict[dataset_name][subset] = {} + dt = datasets.load_dataset(dataset_name, name=subset, ignore_verifications=True) for split in dt.keys(): - stat_dict[dataset_name]['Subset(None)'][split] = { + stat_dict[dataset_name][subset][split] = { "size": len(dt[split]), "column": list(dt[split].column_names), } - else: - for subset in subset_names: - assert subset not in stat_dict[dataset_name] - stat_dict[dataset_name][subset] = {} - dt = datasets.load_dataset(dataset_name, name=subset, ignore_verifications=True) - for split in dt.keys(): - stat_dict[dataset_name][subset][split] = { - "size": len(dt[split]), - "column": list(dt[split].column_names), - } - if "X-CSQA" in subset: - for sample in dt[split]: - assert len(sample['question']['choices']['label']) == 5 + # re-valuation of hypothesis considered in prompt template + if subset is not None and "X-CSQA" in subset: + for sample in dt[split]: + assert len(sample['question']['choices']['label']) == 5 if args.output_dir != 'None': file_name = os.path.join(args.output_dir, "stat") + f".{args.export_format}" From e6488ddb7426a361b8ebe04dc05c781ef3a3a166 Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Sun, 30 Apr 2023 20:26:56 +0800 Subject: [PATCH 10/34] doc string --- data/check_prompts.py | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/data/check_prompts.py b/data/check_prompts.py index ed4b9bb..d3b2f93 100644 --- a/data/check_prompts.py +++ b/data/check_prompts.py @@ -3,16 +3,33 @@ import json import argparse import subprocess +from typing import Tuple, Optional from promptsource.templates import Template from .data_stat import SERIES_A_DATASET_NAME_DICT def check( - json_example, - template_name, - jinja_template, - template_reference=None, - answer_choices=None -): + json_example: str, 
+ template_name: str, + jinja_template: str, + template_reference: Optional[str] = None, + answer_choices: Optional[str] = None +)-> Tuple[str, str]: + """ + Given a + Args: + json_example (str): a string contains json object. The json object is loaded + by `json.loads()`. Typically this is a sample from + huggingface dataset converted to a string by a `json.dumps()`. + template_name: unique name (per dataset) for template + jinja_template: template expressed in Jinja + template_reference: string describing author or paper reference for template + answer_choices: Jinja expression for answer choices. Should produce + a ||| delimited string of choices that enumerates + the possible completions for templates that should + be evaluated as ranked completions. If None, then + the template is open-ended. This list is accessible + from within Jinja as the variable `answer_choices`. + """ json_example = json.loads(json_example) template = Template( template_name, From 7b9f1eb7954a0cba0ac0836e6ff6907b78c1b7c4 Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Sun, 30 Apr 2023 20:37:48 +0800 Subject: [PATCH 11/34] add metadata --- data/check_prompts.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/data/check_prompts.py b/data/check_prompts.py index d3b2f93..421c0a1 100644 --- a/data/check_prompts.py +++ b/data/check_prompts.py @@ -4,7 +4,7 @@ import argparse import subprocess from typing import Tuple, Optional -from promptsource.templates import Template +from promptsource.templates import Template, Metadata from .data_stat import SERIES_A_DATASET_NAME_DICT def check( @@ -12,10 +12,12 @@ def check( template_name: str, jinja_template: str, template_reference: Optional[str] = None, + metadata: Optional[Metadata] = None, answer_choices: Optional[str] = None )-> Tuple[str, str]: """ - Given a + Given an example (`json_example`) from a huggingface dataset and prompt template (`jinja_template`), + the objective is to check if we can project the example in language model i/o format. Args: json_example (str): a string contains json object. The json object is loaded by `json.loads()`. Typically this is a sample from @@ -23,6 +25,8 @@ def check( template_name: unique name (per dataset) for template jinja_template: template expressed in Jinja template_reference: string describing author or paper reference for template + metadata: A Metadata object with template annotations. + Follow [here](https://github.com/bigscience-workshop/promptsource/blob/main/promptsource/templates.py#L417) for more details. answer_choices: Jinja expression for answer choices. 
Should produce a ||| delimited string of choices that enumerates the possible completions for templates that should @@ -35,6 +39,7 @@ def check( template_name, jinja_template, template_reference, + metadata=metadata, answer_choices=answer_choices ) lm_io = template.apply(json_example, highlight_variables=False) From 48244c0613f053d3db99dc686f9974dcf4a86afa Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Wed, 10 May 2023 07:10:24 +0800 Subject: [PATCH 12/34] prompt checker pipeline --- data/check_prompts.py | 101 ++++++++++++++++++++++++++++++++++------ scripts/check_prompt.sh | 3 ++ 2 files changed, 90 insertions(+), 14 deletions(-) diff --git a/data/check_prompts.py b/data/check_prompts.py index 421c0a1..d611b29 100644 --- a/data/check_prompts.py +++ b/data/check_prompts.py @@ -1,18 +1,27 @@ import os import csv +import copy import json import argparse +import datasets import subprocess -from typing import Tuple, Optional -from promptsource.templates import Template, Metadata +from typing import Tuple, Optional, List +from promptsource.templates import Template, LANGUAGES from .data_stat import SERIES_A_DATASET_NAME_DICT +dataset_mapper = { + "AfriSenti-twitter-sentiment https://huggingface.co/datasets/shmuhammad/AfriSenti-twitter-sentiment": "shmuhammad/AfriSenti-twitter-sentiment" +} + def check( json_example: str, template_name: str, jinja_template: str, template_reference: Optional[str] = None, - metadata: Optional[Metadata] = None, + original_task: Optional[str] = None, + choices_in_prompt: Optional[bool] = None, + metrics: Optional[List[str]] = None, + languages: Optional[List[str]] = None, answer_choices: Optional[str] = None )-> Tuple[str, str]: """ @@ -25,8 +34,12 @@ def check( template_name: unique name (per dataset) for template jinja_template: template expressed in Jinja template_reference: string describing author or paper reference for template - metadata: A Metadata object with template annotations. - Follow [here](https://github.com/bigscience-workshop/promptsource/blob/main/promptsource/templates.py#L417) for more details. + original_task: If True, this prompt asks a model to perform the original task designed for + this dataset. + choices_in_prompt: If True, the answer choices are included in the templates such that models + see those choices in the input. Only applicable to classification tasks. + metrics: List of strings denoting metrics to use for evaluation + languages: List of strings denoting languages used in the prompt (not the associated dataset!) answer_choices: Jinja expression for answer choices. Should produce a ||| delimited string of choices that enumerates the possible completions for templates that should @@ -35,6 +48,12 @@ def check( from within Jinja as the variable `answer_choices`. 
""" json_example = json.loads(json_example) + metadata = Template.Metadata( + original_task, + choices_in_prompt, + metrics, + languages + ) template = Template( template_name, jinja_template, @@ -45,12 +64,56 @@ def check( lm_io = template.apply(json_example, highlight_variables=False) return lm_io +def validate(prompt_template_data): + print(json.dumps(prompt_template_data, indent=4)) + dataset_info = prompt_template_data['What dataset do you pick?'] + dataset_signature = dataset_mapper[dataset_info] + dataset_subsets = SERIES_A_DATASET_NAME_DICT[dataset_signature] + for dataset_subset in dataset_subsets: + dataset = datasets.load_dataset(dataset_signature, dataset_subset) + splits = dataset.keys() + for split in splits: + data = dataset[split] + model_input = prompt_template_data['Input to the model'] + model_exp_output = prompt_template_data['Model\'s expected output'] + for sample in data: + lm_io = check( + json_example = json.dumps(sample), + template_name = prompt_template_data['Name'], + jinja_template = f"{model_input} ||| {model_exp_output}", + template_reference = prompt_template_data['Discord username'], + ) + if len(lm_io) == 2: + print(f"Validating dataset_signature:dataset_subset:split={dataset_signature}:{dataset_subset}:{split} with prompt template... [DONE]") + else: + print(f"Validating dataset_signature:dataset_subset:split={dataset_signature}:{dataset_subset}:{split} with prompt template... [FAILED]") + raise ValueError("Templating Error.") + break + print(dataset_signature, dataset_subsets) + +def parse(prompt_file_path, validate_rows): + print(validate_rows) + _prmompt_dict, dt_structure, idx_to_header = {}, {}, {} + with open(prompt_file_path, 'r') as csvfile: + csvreader = csv.reader(csvfile) + for row_idx, row in enumerate(csvreader): + if row_idx == 0: + for idx, dt in enumerate(row): + dt_structure[dt] = {} + idx_to_header[idx] = dt + if row_idx+1 in validate_rows: + sample = copy.deepcopy(dt_structure) + for idx, dt in enumerate(row): + sample[idx_to_header[idx]] = dt + _prmompt_dict[ row_idx+1 ] = sample + return _prmompt_dict + def main(): parser = argparse.ArgumentParser() parser.add_argument( "--form_path", type=str, - default=None, + default="https://docs.google.com/spreadsheets/d/10bCwOhM8zKNkqKi54gIvdwrR44YlWQFV9fpGm7acHv8/export?format=csv&id=10bCwOhM8zKNkqKi54gIvdwrR44YlWQFV9fpGm7acHv8&gid=726399306", help="Path of the google sheet." ) parser.add_argument( @@ -62,19 +125,29 @@ def main(): "--prompt-dir", type=str, default="data/", + help="Overwrite existing prompt file prompts.csv." + ) + parser.add_argument( + "--validate-rows", + nargs='*', + default=[3], + type=int, help="Overwrite eexisting prompt file prompts.csv." ) args = parser.parse_args() prompt_file_path = f"{args.prompt_dir}/prompts.csv" if os.path.exists(prompt_file_path) and args.overwrite: # if file exists, it may be from prev. run/download. 
subprocess.check_output(f"mv {prompt_file_path} {prompt_file_path}.old", shell=True) - subprocess.check_output("curl -L https://docs.google.com/spreadsheets/d/10bCwOhM8zKNkqKi54gIvdwrR44YlWQFV9fpGm7acHv8/export?format=csv > ./data/prompts.csv", shell=True) - - with open('data/prompts.csv', 'r') as csvfile: - csvreader = csv.reader(csvfile) - next(iter(csvreader)) - for row in csvreader: - print(row) - + if not os.path.exists(prompt_file_path): + cmd = f"curl -L '{args.form_path}' -o {prompt_file_path}" + subprocess.check_output(cmd, shell=True) + + prompt_dict = parse(prompt_file_path, args.validate_rows) + for row_id, prompt_template_data in prompt_dict.items(): + print(f"Validating row {row_id} ...") + validate(prompt_template_data) + + + if __name__ == "__main__": main() \ No newline at end of file diff --git a/scripts/check_prompt.sh b/scripts/check_prompt.sh index e69de29..729dc5d 100644 --- a/scripts/check_prompt.sh +++ b/scripts/check_prompt.sh @@ -0,0 +1,3 @@ +python3 -m data.check_prompts \ +--prompt-dir data \ +--validate-rows 3 \ No newline at end of file From 0f159f90375d023d358d97bf61842745dbf0f152 Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Wed, 10 May 2023 07:11:25 +0800 Subject: [PATCH 13/34] type --- data/check_prompts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/check_prompts.py b/data/check_prompts.py index d611b29..dc33922 100644 --- a/data/check_prompts.py +++ b/data/check_prompts.py @@ -132,7 +132,7 @@ def main(): nargs='*', default=[3], type=int, - help="Overwrite eexisting prompt file prompts.csv." + help="Overwrite existing prompt file prompts.csv." ) args = parser.parse_args() prompt_file_path = f"{args.prompt_dir}/prompts.csv" From 2ef6ba132cbe53a620d86741ffc40109f6ddf71c Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Wed, 10 May 2023 07:15:18 +0800 Subject: [PATCH 14/34] code formatting & doc string added --- data/check_prompts.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/data/check_prompts.py b/data/check_prompts.py index dc33922..cf65f53 100644 --- a/data/check_prompts.py +++ b/data/check_prompts.py @@ -64,7 +64,11 @@ def check( lm_io = template.apply(json_example, highlight_variables=False) return lm_io + def validate(prompt_template_data): + """ + Validate a prompt template + """ print(json.dumps(prompt_template_data, indent=4)) dataset_info = prompt_template_data['What dataset do you pick?'] dataset_signature = dataset_mapper[dataset_info] @@ -89,10 +93,12 @@ def validate(prompt_template_data): print(f"Validating dataset_signature:dataset_subset:split={dataset_signature}:{dataset_subset}:{split} with prompt template... [FAILED]") raise ValueError("Templating Error.") break - print(dataset_signature, dataset_subsets) + def parse(prompt_file_path, validate_rows): - print(validate_rows) + """ + Parse list of rows menntioned in validate_rows. 
+ """ _prmompt_dict, dt_structure, idx_to_header = {}, {}, {} with open(prompt_file_path, 'r') as csvfile: csvreader = csv.reader(csvfile) @@ -101,13 +107,14 @@ def parse(prompt_file_path, validate_rows): for idx, dt in enumerate(row): dt_structure[dt] = {} idx_to_header[idx] = dt - if row_idx+1 in validate_rows: + if row_idx+1 in validate_rows: # 1 based indexing sample = copy.deepcopy(dt_structure) for idx, dt in enumerate(row): sample[idx_to_header[idx]] = dt _prmompt_dict[ row_idx+1 ] = sample return _prmompt_dict + def main(): parser = argparse.ArgumentParser() parser.add_argument( @@ -132,7 +139,7 @@ def main(): nargs='*', default=[3], type=int, - help="Overwrite existing prompt file prompts.csv." + help="List of row indices (1-based indexing ). The row mentioned here will indicate the row of `--form_path` spreadsheet." ) args = parser.parse_args() prompt_file_path = f"{args.prompt_dir}/prompts.csv" From a75519a79b9ed8f64b73539902cd955f26b65aa4 Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Mon, 22 May 2023 23:59:15 +0800 Subject: [PATCH 15/34] Add all dataset info --- data/check_prompts.py | 77 +++++++++++++++++++++++++++---------------- 1 file changed, 48 insertions(+), 29 deletions(-) diff --git a/data/check_prompts.py b/data/check_prompts.py index cf65f53..8b22502 100644 --- a/data/check_prompts.py +++ b/data/check_prompts.py @@ -10,7 +10,20 @@ from .data_stat import SERIES_A_DATASET_NAME_DICT dataset_mapper = { - "AfriSenti-twitter-sentiment https://huggingface.co/datasets/shmuhammad/AfriSenti-twitter-sentiment": "shmuhammad/AfriSenti-twitter-sentiment" + "AfriSenti-twitter-sentiment https://huggingface.co/datasets/shmuhammad/AfriSenti-twitter-sentiment": "shmuhammad/AfriSenti-twitter-sentiment", + "Joke-explanation https://huggingface.co/datasets/theblackcat102/joke_explaination": "theblackcat102/joke_explaination", + "Language Identification https://huggingface.co/datasets/papluca/language-identification": "papluca/language-identification", + "Mafand - a machine translation task https://huggingface.co/datasets/masakhane/mafand": "sbmaruf/forai_ml_masakhane_mafand", + "Masakhanews https://github.com/masakhane-io/masakhane-news": "masakhane/masakhanews", + "Mintaka https://huggingface.co/datasets/AmazonScience/mintaka":"AmazonScience/mintaka", + "NarrativeQA https://huggingface.co/datasets/narrativeqa": "narrativeqa", + "NusaX - sentiment classification https://huggingface.co/datasets/indonlp/NusaX-senti": "indonlp/NusaX-senti", + "qrecc https://huggingface.co/datasets/svakulenk0/qrecc": "svakulenk0/qrecc", + "SODA https://huggingface.co/datasets/allenai/soda": "allenai/soda", + "TED https://huggingface.co/datasets/ted_talks_iwslt": "sbmaruf/forai_ml-ted_talk_iwslt", + "WikiCatSum https://huggingface.co/datasets/GEM/wiki_cat_sum": "GEM/wiki_cat_sum", + "X-CSQA https://huggingface.co/datasets/xcsr": "xcsr", + "xlel_wd https://huggingface.co/datasets/adithya7/xlel_wd": "adithya7/xlel_wd" } def check( @@ -65,36 +78,42 @@ def check( return lm_io -def validate(prompt_template_data): +def validate(prompt_template_data, row_id): """ Validate a prompt template """ - print(json.dumps(prompt_template_data, indent=4)) - dataset_info = prompt_template_data['What dataset do you pick?'] - dataset_signature = dataset_mapper[dataset_info] - dataset_subsets = SERIES_A_DATASET_NAME_DICT[dataset_signature] - for dataset_subset in dataset_subsets: - dataset = datasets.load_dataset(dataset_signature, dataset_subset) - splits = dataset.keys() - for split in splits: - data = dataset[split] - 
model_input = prompt_template_data['Input to the model'] - model_exp_output = prompt_template_data['Model\'s expected output'] - for sample in data: - lm_io = check( - json_example = json.dumps(sample), - template_name = prompt_template_data['Name'], - jinja_template = f"{model_input} ||| {model_exp_output}", - template_reference = prompt_template_data['Discord username'], - ) - if len(lm_io) == 2: - print(f"Validating dataset_signature:dataset_subset:split={dataset_signature}:{dataset_subset}:{split} with prompt template... [DONE]") - else: - print(f"Validating dataset_signature:dataset_subset:split={dataset_signature}:{dataset_subset}:{split} with prompt template... [FAILED]") - raise ValueError("Templating Error.") - break - - + try: + print(json.dumps(prompt_template_data, indent=4)) + dataset_info = prompt_template_data['What dataset do you pick?'] + if dataset_info not in dataset_mapper: + dataset_signature = dataset_info.split()[0].lower() + else: + dataset_signature = dataset_mapper[dataset_info] + dataset_subsets = SERIES_A_DATASET_NAME_DICT[dataset_signature] + for dataset_subset in dataset_subsets: + dataset = datasets.load_dataset(dataset_signature, dataset_subset) + splits = dataset.keys() + for split in splits: + data = dataset[split] + model_input = prompt_template_data['Input to the model'] + model_exp_output = prompt_template_data['Model\'s expected output'] + for sample in data: + lm_io = check( + json_example = json.dumps(sample), + template_name = prompt_template_data['Name'], + jinja_template = f"{model_input} ||| {model_exp_output}", + template_reference = prompt_template_data['Discord username'], + ) + if len(lm_io) == 2: + print(f"Validating dataset_signature:dataset_subset:split={dataset_signature}:{dataset_subset}:{split} with prompt template... [DONE]") + else: + print(f"Validating dataset_signature:dataset_subset:split={dataset_signature}:{dataset_subset}:{split} with prompt template... [FAILED]") + raise ValueError("Templating Error.") + break + except: + print(f"Error in row {row_id}") + raise + def parse(prompt_file_path, validate_rows): """ Parse list of rows menntioned in validate_rows. 
@@ -152,7 +171,7 @@ def main(): prompt_dict = parse(prompt_file_path, args.validate_rows) for row_id, prompt_template_data in prompt_dict.items(): print(f"Validating row {row_id} ...") - validate(prompt_template_data) + validate(prompt_template_data, row_id) From d778277aaf0b69225deaba4c8bd2bb74d86bf970 Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Fri, 26 May 2023 00:20:05 +0800 Subject: [PATCH 16/34] update naming --- data/check_prompts.py | 179 ---------------------------------------- scripts/check_prompt.sh | 3 - 2 files changed, 182 deletions(-) delete mode 100644 data/check_prompts.py delete mode 100644 scripts/check_prompt.sh diff --git a/data/check_prompts.py b/data/check_prompts.py deleted file mode 100644 index 8b22502..0000000 --- a/data/check_prompts.py +++ /dev/null @@ -1,179 +0,0 @@ -import os -import csv -import copy -import json -import argparse -import datasets -import subprocess -from typing import Tuple, Optional, List -from promptsource.templates import Template, LANGUAGES -from .data_stat import SERIES_A_DATASET_NAME_DICT - -dataset_mapper = { - "AfriSenti-twitter-sentiment https://huggingface.co/datasets/shmuhammad/AfriSenti-twitter-sentiment": "shmuhammad/AfriSenti-twitter-sentiment", - "Joke-explanation https://huggingface.co/datasets/theblackcat102/joke_explaination": "theblackcat102/joke_explaination", - "Language Identification https://huggingface.co/datasets/papluca/language-identification": "papluca/language-identification", - "Mafand - a machine translation task https://huggingface.co/datasets/masakhane/mafand": "sbmaruf/forai_ml_masakhane_mafand", - "Masakhanews https://github.com/masakhane-io/masakhane-news": "masakhane/masakhanews", - "Mintaka https://huggingface.co/datasets/AmazonScience/mintaka":"AmazonScience/mintaka", - "NarrativeQA https://huggingface.co/datasets/narrativeqa": "narrativeqa", - "NusaX - sentiment classification https://huggingface.co/datasets/indonlp/NusaX-senti": "indonlp/NusaX-senti", - "qrecc https://huggingface.co/datasets/svakulenk0/qrecc": "svakulenk0/qrecc", - "SODA https://huggingface.co/datasets/allenai/soda": "allenai/soda", - "TED https://huggingface.co/datasets/ted_talks_iwslt": "sbmaruf/forai_ml-ted_talk_iwslt", - "WikiCatSum https://huggingface.co/datasets/GEM/wiki_cat_sum": "GEM/wiki_cat_sum", - "X-CSQA https://huggingface.co/datasets/xcsr": "xcsr", - "xlel_wd https://huggingface.co/datasets/adithya7/xlel_wd": "adithya7/xlel_wd" -} - -def check( - json_example: str, - template_name: str, - jinja_template: str, - template_reference: Optional[str] = None, - original_task: Optional[str] = None, - choices_in_prompt: Optional[bool] = None, - metrics: Optional[List[str]] = None, - languages: Optional[List[str]] = None, - answer_choices: Optional[str] = None -)-> Tuple[str, str]: - """ - Given an example (`json_example`) from a huggingface dataset and prompt template (`jinja_template`), - the objective is to check if we can project the example in language model i/o format. - Args: - json_example (str): a string contains json object. The json object is loaded - by `json.loads()`. Typically this is a sample from - huggingface dataset converted to a string by a `json.dumps()`. - template_name: unique name (per dataset) for template - jinja_template: template expressed in Jinja - template_reference: string describing author or paper reference for template - original_task: If True, this prompt asks a model to perform the original task designed for - this dataset. 
- choices_in_prompt: If True, the answer choices are included in the templates such that models - see those choices in the input. Only applicable to classification tasks. - metrics: List of strings denoting metrics to use for evaluation - languages: List of strings denoting languages used in the prompt (not the associated dataset!) - answer_choices: Jinja expression for answer choices. Should produce - a ||| delimited string of choices that enumerates - the possible completions for templates that should - be evaluated as ranked completions. If None, then - the template is open-ended. This list is accessible - from within Jinja as the variable `answer_choices`. - """ - json_example = json.loads(json_example) - metadata = Template.Metadata( - original_task, - choices_in_prompt, - metrics, - languages - ) - template = Template( - template_name, - jinja_template, - template_reference, - metadata=metadata, - answer_choices=answer_choices - ) - lm_io = template.apply(json_example, highlight_variables=False) - return lm_io - - -def validate(prompt_template_data, row_id): - """ - Validate a prompt template - """ - try: - print(json.dumps(prompt_template_data, indent=4)) - dataset_info = prompt_template_data['What dataset do you pick?'] - if dataset_info not in dataset_mapper: - dataset_signature = dataset_info.split()[0].lower() - else: - dataset_signature = dataset_mapper[dataset_info] - dataset_subsets = SERIES_A_DATASET_NAME_DICT[dataset_signature] - for dataset_subset in dataset_subsets: - dataset = datasets.load_dataset(dataset_signature, dataset_subset) - splits = dataset.keys() - for split in splits: - data = dataset[split] - model_input = prompt_template_data['Input to the model'] - model_exp_output = prompt_template_data['Model\'s expected output'] - for sample in data: - lm_io = check( - json_example = json.dumps(sample), - template_name = prompt_template_data['Name'], - jinja_template = f"{model_input} ||| {model_exp_output}", - template_reference = prompt_template_data['Discord username'], - ) - if len(lm_io) == 2: - print(f"Validating dataset_signature:dataset_subset:split={dataset_signature}:{dataset_subset}:{split} with prompt template... [DONE]") - else: - print(f"Validating dataset_signature:dataset_subset:split={dataset_signature}:{dataset_subset}:{split} with prompt template... [FAILED]") - raise ValueError("Templating Error.") - break - except: - print(f"Error in row {row_id}") - raise - -def parse(prompt_file_path, validate_rows): - """ - Parse list of rows menntioned in validate_rows. - """ - _prmompt_dict, dt_structure, idx_to_header = {}, {}, {} - with open(prompt_file_path, 'r') as csvfile: - csvreader = csv.reader(csvfile) - for row_idx, row in enumerate(csvreader): - if row_idx == 0: - for idx, dt in enumerate(row): - dt_structure[dt] = {} - idx_to_header[idx] = dt - if row_idx+1 in validate_rows: # 1 based indexing - sample = copy.deepcopy(dt_structure) - for idx, dt in enumerate(row): - sample[idx_to_header[idx]] = dt - _prmompt_dict[ row_idx+1 ] = sample - return _prmompt_dict - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--form_path", - type=str, - default="https://docs.google.com/spreadsheets/d/10bCwOhM8zKNkqKi54gIvdwrR44YlWQFV9fpGm7acHv8/export?format=csv&id=10bCwOhM8zKNkqKi54gIvdwrR44YlWQFV9fpGm7acHv8&gid=726399306", - help="Path of the google sheet." - ) - parser.add_argument( - "--overwrite", - action="store_true", - help="Overwrite eexisting prompt file prompts.csv." 
- ) - parser.add_argument( - "--prompt-dir", - type=str, - default="data/", - help="Overwrite existing prompt file prompts.csv." - ) - parser.add_argument( - "--validate-rows", - nargs='*', - default=[3], - type=int, - help="List of row indices (1-based indexing ). The row mentioned here will indicate the row of `--form_path` spreadsheet." - ) - args = parser.parse_args() - prompt_file_path = f"{args.prompt_dir}/prompts.csv" - if os.path.exists(prompt_file_path) and args.overwrite: # if file exists, it may be from prev. run/download. - subprocess.check_output(f"mv {prompt_file_path} {prompt_file_path}.old", shell=True) - if not os.path.exists(prompt_file_path): - cmd = f"curl -L '{args.form_path}' -o {prompt_file_path}" - subprocess.check_output(cmd, shell=True) - - prompt_dict = parse(prompt_file_path, args.validate_rows) - for row_id, prompt_template_data in prompt_dict.items(): - print(f"Validating row {row_id} ...") - validate(prompt_template_data, row_id) - - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/scripts/check_prompt.sh b/scripts/check_prompt.sh deleted file mode 100644 index 729dc5d..0000000 --- a/scripts/check_prompt.sh +++ /dev/null @@ -1,3 +0,0 @@ -python3 -m data.check_prompts \ ---prompt-dir data \ ---validate-rows 3 \ No newline at end of file From a9f210caa68dc105bb74d3836e1a37773d307059 Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Fri, 26 May 2023 00:20:25 +0800 Subject: [PATCH 17/34] add split language --- data/data_stat.py | 658 +++++++++++++++++++++------------------------- 1 file changed, 296 insertions(+), 362 deletions(-) diff --git a/data/data_stat.py b/data/data_stat.py index 3ea52cb..0fb7337 100644 --- a/data/data_stat.py +++ b/data/data_stat.py @@ -6,364 +6,299 @@ # huggingface dataset signature with configs SERIES_A_DATASET_NAME_DICT = { - "udhr": None, - "AmazonScience/mintaka": None, - "xcsr": [ - 'X-CSQA-en', - 'X-CSQA-zh', - 'X-CSQA-de', - 'X-CSQA-es', - 'X-CSQA-fr', - 'X-CSQA-it', - 'X-CSQA-jap', - 'X-CSQA-nl', - 'X-CSQA-pl', - 'X-CSQA-pt', - 'X-CSQA-ru', - 'X-CSQA-ar', - 'X-CSQA-vi', - 'X-CSQA-hi', - 'X-CSQA-sw', - 'X-CSQA-ur', - 'X-CODAH-en', - 'X-CODAH-zh', - 'X-CODAH-de', - 'X-CODAH-es', - 'X-CODAH-fr', - 'X-CODAH-it', - 'X-CODAH-jap', - 'X-CODAH-nl', - 'X-CODAH-pl', - 'X-CODAH-pt', - 'X-CODAH-ru', - 'X-CODAH-ar', - 'X-CODAH-vi', - 'X-CODAH-hi', - 'X-CODAH-sw', - 'X-CODAH-ur' - ], - "shmuhammad/AfriSenti-twitter-sentiment": [ - 'amh', - 'hau', - 'ibo', - 'arq', - 'ary', - 'yor', - 'por', - 'twi', - 'tso', - 'tir', - 'pcm', - 'kin', - 'swa' - ], # orm is not workin - "indonlp/NusaX-senti": [ - 'ace', - 'ban', - 'bjn', - 'bug', - 'eng', - 'ind', - 'jav', - 'mad', - 'min', - 'nij', - 'sun', - 'bbc' - ], - "masakhane/masakhanews": [ - 'amh', - 'eng', - 'fra', - 'hau', - 'ibo', - 'lin', - 'lug', - 'orm', - 'pcm', - 'run', - 'sna', - 'som', - 'swa', - 'tir', - 'xho', - 'yor' - ], - "papluca/language-identification": [ - 'wikipedia-zero-shot', - 'wikipedia-zero-shot.af', - 'wikipedia-zero-shot.ar', - 'wikipedia-zero-shot.be', - 'wikipedia-zero-shot.bg', - 'wikipedia-zero-shot.bn', - 'wikipedia-zero-shot.ca', - 'wikipedia-zero-shot.cs', - 'wikipedia-zero-shot.da', - 'wikipedia-zero-shot.de', - 'wikipedia-zero-shot.el', - 'wikipedia-zero-shot.en', - 'wikipedia-zero-shot.es', - 'wikipedia-zero-shot.fa', - 'wikipedia-zero-shot.fi', - 'wikipedia-zero-shot.fr', - 'wikipedia-zero-shot.he', - 'wikipedia-zero-shot.hi', - 'wikipedia-zero-shot.hu', - 'wikipedia-zero-shot.id', - 'wikipedia-zero-shot.it', - 
'wikipedia-zero-shot.ja', - 'wikipedia-zero-shot.ko', - 'wikipedia-zero-shot.ml', - 'wikipedia-zero-shot.mr', - 'wikipedia-zero-shot.ms', - 'wikipedia-zero-shot.nl', - 'wikipedia-zero-shot.no', - 'wikipedia-zero-shot.pl', - 'wikipedia-zero-shot.pt', - 'wikipedia-zero-shot.ro', - 'wikipedia-zero-shot.ru', - 'wikipedia-zero-shot.si', - 'wikipedia-zero-shot.sk', - 'wikipedia-zero-shot.sl', - 'wikipedia-zero-shot.sr', - 'wikipedia-zero-shot.sv', - 'wikipedia-zero-shot.sw', - 'wikipedia-zero-shot.ta', - 'wikipedia-zero-shot.te', - 'wikipedia-zero-shot.th', - 'wikipedia-zero-shot.tr', - 'wikipedia-zero-shot.uk', - 'wikipedia-zero-shot.vi', - 'wikipedia-zero-shot.zh', - 'wikinews-zero-shot', - 'wikinews-zero-shot.ar', - 'wikinews-zero-shot.cs', - 'wikinews-zero-shot.de', - 'wikinews-zero-shot.en', - 'wikinews-zero-shot.es', - 'wikinews-zero-shot.fi', - 'wikinews-zero-shot.fr', - 'wikinews-zero-shot.it', - 'wikinews-zero-shot.ja', - 'wikinews-zero-shot.ko', - 'wikinews-zero-shot.nl', - 'wikinews-zero-shot.no', - 'wikinews-zero-shot.pl', - 'wikinews-zero-shot.pt', - 'wikinews-zero-shot.ru', - 'wikinews-zero-shot.sr', - 'wikinews-zero-shot.sv', - 'wikinews-zero-shot.ta', - 'wikinews-zero-shot.tr', - 'wikinews-zero-shot.uk', - 'wikinews-zero-shot.zh', - 'wikinews-cross-domain', - 'wikinews-cross-domain.ar', - 'wikinews-cross-domain.bg', - 'wikinews-cross-domain.ca', - 'wikinews-cross-domain.cs', - 'wikinews-cross-domain.de', - 'wikinews-cross-domain.el', - 'wikinews-cross-domain.en', - 'wikinews-cross-domain.es', - 'wikinews-cross-domain.fi', - 'wikinews-cross-domain.fr', - 'wikinews-cross-domain.he', - 'wikinews-cross-domain.hu', - 'wikinews-cross-domain.it', - 'wikinews-cross-domain.ja', - 'wikinews-cross-domain.ko', - 'wikinews-cross-domain.nl', - 'wikinews-cross-domain.no', - 'wikinews-cross-domain.pl', - 'wikinews-cross-domain.pt', - 'wikinews-cross-domain.ro', - 'wikinews-cross-domain.ru', - 'wikinews-cross-domain.sr', - 'wikinews-cross-domain.sv', - 'wikinews-cross-domain.ta', - 'wikinews-cross-domain.tr', - 'wikinews-cross-domain.uk', - 'wikinews-cross-domain.zh' - ], - "adithya7/xlel_wd": [ - 'wikipedia-zero-shot', - 'wikipedia-zero-shot.af', - 'wikipedia-zero-shot.ar', - 'wikipedia-zero-shot.be', - 'wikipedia-zero-shot.bg', - 'wikipedia-zero-shot.bn', - 'wikipedia-zero-shot.ca', - 'wikipedia-zero-shot.cs', - 'wikipedia-zero-shot.da', - 'wikipedia-zero-shot.de', - 'wikipedia-zero-shot.el', - 'wikipedia-zero-shot.en', - 'wikipedia-zero-shot.es', - 'wikipedia-zero-shot.fa', - 'wikipedia-zero-shot.fi', - 'wikipedia-zero-shot.fr', - 'wikipedia-zero-shot.he', - 'wikipedia-zero-shot.hi', - 'wikipedia-zero-shot.hu', - 'wikipedia-zero-shot.id', - 'wikipedia-zero-shot.it', - 'wikipedia-zero-shot.ja', - 'wikipedia-zero-shot.ko', - 'wikipedia-zero-shot.ml', - 'wikipedia-zero-shot.mr', - 'wikipedia-zero-shot.ms', - 'wikipedia-zero-shot.nl', - 'wikipedia-zero-shot.no', - 'wikipedia-zero-shot.pl', - 'wikipedia-zero-shot.pt', - 'wikipedia-zero-shot.ro', - 'wikipedia-zero-shot.ru', - 'wikipedia-zero-shot.si', - 'wikipedia-zero-shot.sk', - 'wikipedia-zero-shot.sl', - 'wikipedia-zero-shot.sr', - 'wikipedia-zero-shot.sv', - 'wikipedia-zero-shot.sw', - 'wikipedia-zero-shot.ta', - 'wikipedia-zero-shot.te', - 'wikipedia-zero-shot.th', - 'wikipedia-zero-shot.tr', - 'wikipedia-zero-shot.uk', - 'wikipedia-zero-shot.vi', - 'wikipedia-zero-shot.zh', - 'wikinews-zero-shot', - 'wikinews-zero-shot.ar', - 'wikinews-zero-shot.cs', - 'wikinews-zero-shot.de', - 'wikinews-zero-shot.en', - 'wikinews-zero-shot.es', - 
'wikinews-zero-shot.fi', - 'wikinews-zero-shot.fr', - 'wikinews-zero-shot.it', - 'wikinews-zero-shot.ja', - 'wikinews-zero-shot.ko', - 'wikinews-zero-shot.nl', - 'wikinews-zero-shot.no', - 'wikinews-zero-shot.pl', - 'wikinews-zero-shot.pt', - 'wikinews-zero-shot.ru', - 'wikinews-zero-shot.sr', - 'wikinews-zero-shot.sv', - 'wikinews-zero-shot.ta', - 'wikinews-zero-shot.tr', - 'wikinews-zero-shot.uk', - 'wikinews-zero-shot.zh', - 'wikinews-cross-domain', - 'wikinews-cross-domain.ar', - 'wikinews-cross-domain.bg', - 'wikinews-cross-domain.ca', - 'wikinews-cross-domain.cs', - 'wikinews-cross-domain.de', - 'wikinews-cross-domain.el', - 'wikinews-cross-domain.en', - 'wikinews-cross-domain.es', - 'wikinews-cross-domain.fi', - 'wikinews-cross-domain.fr', - 'wikinews-cross-domain.he', - 'wikinews-cross-domain.hu', - 'wikinews-cross-domain.it', - 'wikinews-cross-domain.ja', - 'wikinews-cross-domain.ko', - 'wikinews-cross-domain.nl', - 'wikinews-cross-domain.no', - 'wikinews-cross-domain.pl', - 'wikinews-cross-domain.pt', - 'wikinews-cross-domain.ro', - 'wikinews-cross-domain.ru', - 'wikinews-cross-domain.sr', - 'wikinews-cross-domain.sv', - 'wikinews-cross-domain.ta', - 'wikinews-cross-domain.tr', - 'wikinews-cross-domain.uk', - 'wikinews-cross-domain.zh' - ], - "sbmaruf/forai_ml-ted_talk_iwslt": [ - 'eu_ca_2014', - 'eu_ca_2015', - 'eu_ca_2016', - 'nl_en_2014', - 'nl_en_2015', - 'nl_en_2016', - 'nl_hi_2014', - 'nl_hi_2015', - 'nl_hi_2016', - 'de_ja_2014', - 'de_ja_2015', - 'de_ja_2016', - 'fr-ca_hi_2014', - 'fr-ca_hi_2015', - 'fr-ca_hi_2016' - ], - "sbmaruf/forai_ml_masakhane_mafand":[ - 'en-amh', - 'en-hau', - 'en-ibo', - 'en-kin', - 'en-lug', - 'en-nya', - 'en-pcm', - 'en-sna', - 'en-swa', - 'en-tsn', - 'en-twi', - 'en-xho', - 'en-yor', - 'en-zul', - 'fr-bam', - 'fr-bbj', - 'fr-ewe', - 'fr-fon', - 'fr-mos', - 'fr-wol' - ], - "exams":[ - 'alignments', - 'multilingual', - 'multilingual_with_para', - 'crosslingual_test', - 'crosslingual_with_para_test', - 'crosslingual_bg', - 'crosslingual_with_para_bg', - 'crosslingual_hr', - 'crosslingual_with_para_hr', - 'crosslingual_hu', - 'crosslingual_with_para_hu', - 'crosslingual_it', - 'crosslingual_with_para_it', - 'crosslingual_mk', - 'crosslingual_with_para_mk', - 'crosslingual_pl', - 'crosslingual_with_para_pl', - 'crosslingual_pt', - 'crosslingual_with_para_pt', - 'crosslingual_sq', - 'crosslingual_with_para_sq', - 'crosslingual_sr', - 'crosslingual_with_para_sr', - 'crosslingual_tr', - 'crosslingual_with_para_tr', - 'crosslingual_vi', - 'crosslingual_with_para_vi' - ], - "allenai/soda": None, - "arabic_billion_words":[ - 'Alittihad', - 'Almasryalyoum', - 'Almustaqbal', - 'Alqabas', - 'Echoroukonline', - 'Ryiadh', - 'Sabanews', - 'SaudiYoum', - 'Techreen', - 'Youm7' - ] + "udhr": { + None: "mixed" + }, + "AmazonScience/mintaka": { + "ar": "ar", + "de": "de", + "en": "en", + "es": "es", + "fr": "fr", + "hi": "hi", + "it": "it", + "ja": "ja", + "pt": "pt", + }, + "xcsr": { + 'X-CSQA-en': "en", + 'X-CSQA-zh': "zh", + 'X-CSQA-de': "de", + 'X-CSQA-es': "es", + 'X-CSQA-fr': "fr", + 'X-CSQA-it': "it", + 'X-CSQA-jap': "ja", + 'X-CSQA-nl': "nl", + 'X-CSQA-pl': "pl", + 'X-CSQA-pt': "pt", + 'X-CSQA-ru': "ru", + 'X-CSQA-ar': "ar", + 'X-CSQA-vi': "vi", + 'X-CSQA-hi': "hi", + 'X-CSQA-sw': "sw", + 'X-CSQA-ur': "ur", + # 'X-CODAH-en': "en", + # 'X-CODAH-zh': "zh", + # 'X-CODAH-de': "de", + # 'X-CODAH-es': "es", + # 'X-CODAH-fr': "fr", + # 'X-CODAH-it': "it", + # 'X-CODAH-jap': "ja", + # 'X-CODAH-nl': "nl", + # 'X-CODAH-pl': "pl", + # 'X-CODAH-pt': "pt", + # 
'X-CODAH-ru': "ru", + # 'X-CODAH-ar': "ar", + # 'X-CODAH-vi': "vi", + # 'X-CODAH-hi': "hi", + # 'X-CODAH-sw': "sw", + # 'X-CODAH-ur': "ur", + }, + "shmuhammad/AfriSenti-twitter-sentiment": { + 'amh':'amh', + 'hau':'hau', + 'ibo':'ibo', + 'arq':'arq', + 'ary':'ary', + # 'yor':'yor', + 'por':'por', + 'twi':'twi', + 'tso':'tso', + 'tir':'tir', + 'pcm':'pcm', + 'kin':'kin', + 'swa': 'swa', + # 'orm': 'orm', + }, + "indonlp/NusaX-senti": { + 'ace':'ace', + 'ban':'ban', + 'bjn':'bjn', + # 'bug':'bug', + 'eng':'eng', + 'ind':'ind', + # 'jav':'jav', + 'mad':'mad', + 'min':'min', + 'nij':'nij', + 'sun':'sun', + 'bbc':'bbc', + }, + "masakhane/masakhanews": { + 'amh':'amh', + 'eng':'eng', + 'fra':'fra', + 'hau':'hau', + 'ibo':'ibo', + 'lin':'lin', + 'lug':'lug', + 'orm':'orm', + 'pcm':'pcm', + 'run':'run', + 'sna':'sna', + 'som':'som', + 'swa':'swa', + 'tir':'tir', + 'xho':'xho', + 'yor':'yor', + }, + "papluca/language-identification": { + None: "mixed", + }, + "adithya7/xlel_wd": { + 'wikipedia-zero-shot': "mixed", + 'wikinews-zero-shot': "mixed", + 'wikinews-cross-domain': "mixed", + 'wikipedia-zero-shot.af': 'af', + 'wikipedia-zero-shot.ar': 'ar', + 'wikipedia-zero-shot.be': 'be', + 'wikipedia-zero-shot.bg': 'bg', + 'wikipedia-zero-shot.bn': 'bn', + 'wikipedia-zero-shot.ca': 'ca', + 'wikipedia-zero-shot.cs': 'cs', + 'wikipedia-zero-shot.da': 'da', + 'wikipedia-zero-shot.de': 'de', + 'wikipedia-zero-shot.el': 'el', + 'wikipedia-zero-shot.en': 'en', + 'wikipedia-zero-shot.es': 'es', + 'wikipedia-zero-shot.fa': 'fa', + 'wikipedia-zero-shot.fi': 'fi', + 'wikipedia-zero-shot.fr': 'fr', + 'wikipedia-zero-shot.he': 'he', + 'wikipedia-zero-shot.hi': 'hi', + 'wikipedia-zero-shot.hu': 'hu', + 'wikipedia-zero-shot.id': 'id', + 'wikipedia-zero-shot.it': 'it', + 'wikipedia-zero-shot.ja': 'ja', + 'wikipedia-zero-shot.ko': 'ko', + 'wikipedia-zero-shot.ml': 'ml', + 'wikipedia-zero-shot.mr': 'mr', + 'wikipedia-zero-shot.ms': 'ms', + 'wikipedia-zero-shot.nl': 'nl', + 'wikipedia-zero-shot.no': 'no', + 'wikipedia-zero-shot.pl': 'pl', + 'wikipedia-zero-shot.pt': 'pt', + 'wikipedia-zero-shot.ro': 'ro', + 'wikipedia-zero-shot.ru': 'ru', + 'wikipedia-zero-shot.si': 'si', + 'wikipedia-zero-shot.sk': 'sk', + 'wikipedia-zero-shot.sl': 'sl', + 'wikipedia-zero-shot.sr': 'sr', + 'wikipedia-zero-shot.sv': 'sv', + 'wikipedia-zero-shot.sw': 'sw', + 'wikipedia-zero-shot.ta': 'ta', + 'wikipedia-zero-shot.te': 'te', + 'wikipedia-zero-shot.th': 'th', + 'wikipedia-zero-shot.tr': 'tr', + 'wikipedia-zero-shot.uk': 'uk', + 'wikipedia-zero-shot.vi': 'vi', + 'wikipedia-zero-shot.zh': 'zh', + 'wikinews-zero-shot.ar': 'ar', + 'wikinews-zero-shot.cs': 'cs', + 'wikinews-zero-shot.de': 'de', + 'wikinews-zero-shot.en': 'en', + 'wikinews-zero-shot.es': 'es', + 'wikinews-zero-shot.fi': 'fi', + 'wikinews-zero-shot.fr': 'fr', + 'wikinews-zero-shot.it': 'it', + 'wikinews-zero-shot.ja': 'ja', + 'wikinews-zero-shot.ko': 'ko', + 'wikinews-zero-shot.nl': 'nl', + 'wikinews-zero-shot.no': 'no', + 'wikinews-zero-shot.pl': 'pl', + 'wikinews-zero-shot.pt': 'pt', + 'wikinews-zero-shot.ru': 'ru', + 'wikinews-zero-shot.sr': 'sr', + 'wikinews-zero-shot.sv': 'sv', + 'wikinews-zero-shot.ta': 'ta', + # 'wikinews-zero-shot.tr': 'tr', + 'wikinews-zero-shot.uk': 'uk', + 'wikinews-zero-shot.zh': 'zh', + 'wikinews-cross-domain.ar': 'ar', + 'wikinews-cross-domain.bg': 'bg', + 'wikinews-cross-domain.ca': 'ca', + 'wikinews-cross-domain.cs': 'cs', + 'wikinews-cross-domain.de': 'de', + 'wikinews-cross-domain.el': 'el', + 'wikinews-cross-domain.en': 'en', + 
+        'wikinews-cross-domain.es': 'es',
+        'wikinews-cross-domain.fi': 'fi',
+        'wikinews-cross-domain.fr': 'fr',
+        'wikinews-cross-domain.he': 'he',
+        'wikinews-cross-domain.hu': 'hu',
+        'wikinews-cross-domain.it': 'it',
+        'wikinews-cross-domain.ja': 'ja',
+        'wikinews-cross-domain.ko': 'ko',
+        'wikinews-cross-domain.nl': 'nl',
+        'wikinews-cross-domain.no': 'no',
+        'wikinews-cross-domain.pl': 'pl',
+        'wikinews-cross-domain.pt': 'pt',
+        'wikinews-cross-domain.ro': 'ro',
+        'wikinews-cross-domain.ru': 'ru',
+        'wikinews-cross-domain.sr': 'sr',
+        'wikinews-cross-domain.sv': 'sv',
+        'wikinews-cross-domain.ta': 'ta',
+        'wikinews-cross-domain.tr': 'tr',
+        'wikinews-cross-domain.uk': 'uk',
+        'wikinews-cross-domain.zh': 'zh',
+    },
+    "sbmaruf/forai_ml-ted_talk_iwslt": {
+        'eu_ca_2014': 'eu_ca',
+        'eu_ca_2015': 'eu_ca',
+        'eu_ca_2016': 'eu_ca',
+        'nl_en_2014': 'nl_en',
+        'nl_en_2015': 'nl_en',
+        'nl_en_2016': 'nl_en',
+        'nl_hi_2014': 'nl_hi',
+        'nl_hi_2015': 'nl_hi',
+        'nl_hi_2016': 'nl_hi',
+        'de_ja_2014': 'de_ja',
+        'de_ja_2015': 'de_ja',
+        'de_ja_2016': 'de_ja',
+        'fr-ca_hi_2014': 'fr_hi',
+        'fr-ca_hi_2015': 'fr_hi',
+        'fr-ca_hi_2016': 'fr_hi',
+    },
+    "sbmaruf/forai_ml_masakhane_mafand":{
+        'en-amh': 'en-amh',
+        'en-hau': 'en-hau',
+        'en-ibo': 'en-ibo',
+        'en-kin': 'en-kin',
+        'en-lug': 'en-lug',
+        'en-nya': 'en-nya',
+        'en-pcm': 'en-pcm',
+        'en-sna': 'en-sna',
+        'en-swa': 'en-swa',
+        'en-tsn': 'en-tsn',
+        'en-twi': 'en-twi',
+        'en-xho': 'en-xho',
+        'en-yor': 'en-yor',
+        'en-zul': 'en-zul',
+        'fr-bam': 'fr-bam',
+        'fr-bbj': 'fr-bbj',
+        'fr-ewe': 'fr-ewe',
+        'fr-fon': 'fr-fon',
+        'fr-mos': 'fr-mos',
+        'fr-wol': 'fr-wol',
+    },
+    "exams":{
+        # 'alignments': 'mixed',
+        'multilingual': 'mixed',
+        'multilingual_with_para': 'mixed',
+        'crosslingual_test': 'mixed',
+        'crosslingual_with_para_test': 'mixed',
+        'crosslingual_bg': "bg",
+        'crosslingual_with_para_bg': "bg",
+        'crosslingual_hr': "hr",
+        'crosslingual_with_para_hr': "hr",
+        'crosslingual_hu': "hu",
+        'crosslingual_with_para_hu': "hu",
+        'crosslingual_it': "it",
+        'crosslingual_with_para_it': "it",
+        'crosslingual_mk': "mk",
+        'crosslingual_with_para_mk': "mk",
+        'crosslingual_pl': "pl",
+        'crosslingual_with_para_pl': "pl",
+        'crosslingual_pt': "pt",
+        'crosslingual_with_para_pt': "pt",
+        'crosslingual_sq': "sq",
+        'crosslingual_with_para_sq': "sq",
+        'crosslingual_sr': "sr",
+        'crosslingual_with_para_sr': "sr",
+        'crosslingual_tr': "tr",
+        'crosslingual_with_para_tr': "tr",
+        'crosslingual_vi': "vi",
+        'crosslingual_with_para_vi': "vi",
+    },
+    "allenai/soda": {
+        None: "en",
+    },
+    "arabic_billion_words": {
+        'Alittihad': "Alittihad",
+        'Almasryalyoum': "Almasryalyoum",
+        'Almustaqbal': "Almustaqbal",
+        'Alqabas': "Alqabas",
+        'Echoroukonline': "Echoroukonline",
+        'Ryiadh': "Ryiadh",
+        'Sabanews': "Sabanews",
+        'SaudiYoum': "SaudiYoum",
+        'Techreen': "Techreen",
+        'Youm7': "Youm7",
+    },
+    "theblackcat102/joke_explaination": {
+        None: "en",
+    },
+    "narrativeqa": {
+        None: "en",
+    },
+    "svakulenk0/qrecc": {
+        None: "en",
+    },
+    "GEM/wiki_cat_sum": {
+        "animal": "en",
+        "company": "en",
+        "film": "en",
+    }
 }
 
 def main():
@@ -389,16 +324,15 @@ def main():
     stat_dict = {}
     if args.dataset_names is None:
         args.dataset_names = list(SERIES_A_DATASET_NAME_DICT.keys())
-    for dataset_name, subset_names in SERIES_A_DATASET_NAME_DICT.items():
+    for dataset_name, subset_dict in SERIES_A_DATASET_NAME_DICT.items():
         if dataset_name not in args.dataset_names:
             continue
         assert dataset_name not in stat_dict
         stat_dict[dataset_name] = {}
-        subset_names = [None] if subset_names is None else subset_names
-        for subset in subset_names:
+        for subset, subset_lang in subset_dict.items():
             assert subset not in stat_dict[dataset_name]
             stat_dict[dataset_name][subset] = {}
-            dt = datasets.load_dataset(dataset_name, name=subset, ignore_verifications=True)
+            dt = datasets.load_dataset(dataset_name, name=subset, verification_mode="no_checks")
             for split in dt.keys():
                 stat_dict[dataset_name][subset][split] = {
                     "size": len(dt[split]),

From 023b257771c0ef2cc8a2dbc2d33f27c22f2f6aa6 Mon Sep 17 00:00:00 2001
From: M Saiful Bari
Date: Fri, 26 May 2023 00:21:17 +0800
Subject: [PATCH 18/34] Automatic script running

---
 data/validate_and_generate.py    | 324 +++++++++++++++++++++++++++++++
 scripts/validate_and_generate.sh |   6 +
 2 files changed, 330 insertions(+)
 create mode 100644 data/validate_and_generate.py
 create mode 100644 scripts/validate_and_generate.sh

diff --git a/data/validate_and_generate.py b/data/validate_and_generate.py
new file mode 100644
index 0000000..ffc5f71
--- /dev/null
+++ b/data/validate_and_generate.py
@@ -0,0 +1,324 @@
+import os
+import csv
+import copy
+import json
+import tqdm
+import argparse
+import datasets
+import subprocess
+from datetime import date
+import concurrent.futures
+from typing import Tuple, Optional, List
+from promptsource.templates import Template
+from .data_stat import SERIES_A_DATASET_NAME_DICT
+datasets.logging.set_verbosity_error()
+
+mt5_langs_name_pair = [
+    ("Afrikaans", "af"), ("Albanian", "sq"), ("Amharic", "am"), ("Arabic", "ar"), ("Armenian", "hy"), ("Azerbaijani", "az"),
+    ("Basque", "eu"), ("Belarusian", "be"), ("Bengali", "bn"), ("Bulgarian","bg"), ("Burmese", "my"),
+    ("Catalan", "ca"), ("Cebuano", "ceb"), ("Chichewa", "ny"), ("Chinese", "zh"), ("Corsican", "co"), ("Czech", "cs"),
+    ("Danish", "da"), ("Dutch", "nl"),
+    ("English", "en"), ("Esperanto", "eo"), ("Estonian", "et"),
+    ("Filipino", "fil"), ("Finnish", "fi"), ("French", "fr"),
+    ("Galician", "gl"), ("Georgian", "ka"), ("German", "de"), ("Greek", "el"), ("Gujarati", "gu"),
+    ("Haitian Creole", "ht"), ("Hausa", "ha"), ("Hawaiian", "haw"), ("Hebrew", "iw"), ("Hindi", "hi"), ("Hmong", "hmn"), ("Hungarian", "hu"),
+    ("Icelandic", "is"), ("Igbo", "ig"), ("Indonesian", "id"), ("Irish", "ga"), ("Italian", "it"),
+    ("Japanese", "ja"), ("Javanese", "jv"),
+    ("Kannada", "kn"), ("Kazakh", "kk"), ("Khmer", "km"), ("Korean", "ko"), ("Kurdish", "ku"), ("Kyrgyz", "ky"),
+    ("Lao", "lo"), ("Latin", "la"), ("Latvian", "lv"), ("Lithuanian", "lt"), ("Luxembourgish", "lb"),
+    ("Macedonian", "mk"), ("Malagasy", "mg"), ("Malay", "ms"), ("Malayalam", "ml"), ("Maltese", "mt"), ("Maori", "mi"), ("Marathi", "mr"), ("Mongolian", "mn"),
+    ("Nepali", "ne"), ("Norwegian", "no"),
+    ("Pashto", "ps"), ("Persian", "fa"), ("Polish", "pl"), ("Portuguese", "pt"), ("Punjabi", "pa"),
+    ("Romanian", "ro"), ("Russian", "ru"),
+    ("Samoan", "sm"), ("Scottish Gaelic", "gd"), ("Serbian", "sr"), ("Shona", "sn"), ("Sindhi", "sd"), ("Sinhala", "si"), ("Slovak","sk"), ("Slovenian", "sl"), ("Somali", "so"), ("Sotho", "st"), ("Spanish", "es"), ("Sundanese", "su"), ("Swahili", "sw"), ("Swedish", "sv"),
+    ("Tajik", "tg"), ("Tamil", "ta"), ("Telugu", "te"), ("Thai", "th"), ("Turkish", "tr"),
+    ("Ukrainian", "uk"), ("Urdu", "ur"), ("Uzbek", "uz"),
+    ("Vietnamese", "vi"),
+    ("Welsh", "cy"), ("West Frisian", "fy"),
+    ("Xhosa", "xh"),
+    ("Yiddish", "yi"), ("Yoruba", "yo"), ("Zulu", "zu")
+]
+mt5_langs_full_name_to_iso_name = { full_name: iso_name for full_name, iso_name in mt5_langs_name_pair}
+
+dataset_mapper = {
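+    # Assumed reading of this mapping (added note, not in the original patch): each key is a
+    # human-readable task label of the form "name + source URL", and each value is the
+    # huggingface dataset signature that also appears as a key in SERIES_A_DATASET_NAME_DICT.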
"AfriSenti-twitter-sentiment https://huggingface.co/datasets/shmuhammad/AfriSenti-twitter-sentiment": "shmuhammad/AfriSenti-twitter-sentiment", + "Joke-explanation https://huggingface.co/datasets/theblackcat102/joke_explaination": "theblackcat102/joke_explaination", + "Language Identification https://huggingface.co/datasets/papluca/language-identification": "papluca/language-identification", + "Mafand - a machine translation task https://huggingface.co/datasets/masakhane/mafand": "sbmaruf/forai_ml_masakhane_mafand", + "Masakhanews https://github.com/masakhane-io/masakhane-news": "masakhane/masakhanews", + "Mintaka https://huggingface.co/datasets/AmazonScience/mintaka":"AmazonScience/mintaka", + "NarrativeQA https://huggingface.co/datasets/narrativeqa": "narrativeqa", + "NusaX - sentiment classification https://huggingface.co/datasets/indonlp/NusaX-senti": "indonlp/NusaX-senti", + "qrecc https://huggingface.co/datasets/svakulenk0/qrecc": "svakulenk0/qrecc", + "SODA https://huggingface.co/datasets/allenai/soda": "allenai/soda", + "TED https://huggingface.co/datasets/ted_talks_iwslt": "sbmaruf/forai_ml-ted_talk_iwslt", + "WikiCatSum https://huggingface.co/datasets/GEM/wiki_cat_sum": "GEM/wiki_cat_sum", + "X-CSQA https://huggingface.co/datasets/xcsr": "xcsr", + "xlel_wd https://huggingface.co/datasets/adithya7/xlel_wd": "adithya7/xlel_wd" +} + +IGNORE_TASKS = [ + "arabic_billion_words", + "narrativeqa", + "svakulenk0/qrecc" +] +def check( + json_example: str, + template_name: str, + jinja_template: str, + template_reference: Optional[str] = None, + original_task: Optional[str] = None, + choices_in_prompt: Optional[bool] = None, + metrics: Optional[List[str]] = None, + languages: Optional[List[str]] = None, + answer_choices: Optional[str] = None +)-> Tuple[str, str]: + """ + Given an example (`json_example`) from a huggingface dataset and prompt template (`jinja_template`), + the objective is to check if we can project the example in language model i/o format. + Args: + json_example (str): a string contains json object. The json object is loaded + by `json.loads()`. Typically this is a sample from + huggingface dataset converted to a string by a `json.dumps()`. + template_name: unique name (per dataset) for template + jinja_template: template expressed in Jinja + template_reference: string describing author or paper reference for template + original_task: If True, this prompt asks a model to perform the original task designed for + this dataset. + choices_in_prompt: If True, the answer choices are included in the templates such that models + see those choices in the input. Only applicable to classification tasks. + metrics: List of strings denoting metrics to use for evaluation + languages: List of strings denoting languages used in the prompt (not the associated dataset!) + answer_choices: Jinja expression for answer choices. Should produce + a ||| delimited string of choices that enumerates + the possible completions for templates that should + be evaluated as ranked completions. If None, then + the template is open-ended. This list is accessible + from within Jinja as the variable `answer_choices`. 
+ """ + json_example = json.loads(json_example) + metadata = Template.Metadata( + original_task, + choices_in_prompt, + metrics, + languages + ) + template = Template( + template_name, + jinja_template, + template_reference, + metadata=metadata, + answer_choices=answer_choices + ) + lm_io = template.apply(json_example, highlight_variables=False) + return lm_io + + +def create_name_with_hierarchy(output_dir, dataset_signature, dataset_subset, split_name, template_name, template_lang): + """ + /__/template-generation/