From 383ba2ef3246e5ba4b4b10fc5b27dd70814bfe14 Mon Sep 17 00:00:00 2001 From: donghyyun Date: Fri, 6 May 2022 17:46:32 +0000 Subject: [PATCH] Update catboost - added exp description optn - renamed dataset description => feature description - update total_features_config according to fe version (=v1.2) --- catboost/dataset.py | 14 ++++---- catboost/features.py | 34 +++++++++++++++++++ catboost/main.py | 20 ++++++----- catboost/model.py | 6 ++-- catboost/total_features_for_fe_ver1-1.json | 24 ------------- ...json => total_features_for_fe_ver1-2.json} | 0 6 files changed, 58 insertions(+), 40 deletions(-) create mode 100644 catboost/features.py delete mode 100644 catboost/total_features_for_fe_ver1-1.json rename catboost/{feature_config.json => total_features_for_fe_ver1-2.json} (100%) diff --git a/catboost/dataset.py b/catboost/dataset.py index 4047994..9411c1c 100644 --- a/catboost/dataset.py +++ b/catboost/dataset.py @@ -2,12 +2,13 @@ import pandas as pd +from features import Features class GBMDataset: - def __init__(self, data_path, descript=None, feat_path="/opt/ml/input/code/catboost/feature_config.json"): - features = self._get_features_from(feat_path) - self.features = features["FEAT"] - self.cat_features = features["CAT_FEAT"] + def __init__(self, data_path, descript=None): + features = Features() + self.features = features.FEAT + self.cat_features = features.CAT_FEAT self.num_features = list(set(self.features) - set(self.cat_features)) self.df = pd.read_csv(data_path) @@ -26,8 +27,9 @@ def split_data(self, pseudo_labeling=False): train_valid_df = self.get_train_data(pseudo_labeling) - train = train_valid_df[train_valid_df.user == train_valid_df.user.shift(-1)] - valid = train_valid_df[train_valid_df.user != train_valid_df.user.shift(-1)] + valid = train_valid_df[(train_valid_df.user != train_valid_df.user.shift(-1)) & + (train_valid_df["label"] == "train")] + train = train_valid_df[~train_valid_df.index.isin(valid.index)] X_train, y_train = train[self.features], train["answer"] X_valid, y_valid = valid[self.features], valid["answer"] diff --git a/catboost/features.py b/catboost/features.py new file mode 100644 index 0000000..1a8a49e --- /dev/null +++ b/catboost/features.py @@ -0,0 +1,34 @@ +class Features: + FEAT = ["user", "assessmentItemID", "testId", "tag", "category", "test", "item", + "month", "day", "weekday", "hour", + # "elapsed", "test_elapsed", + # "prev_elapsed", + "prev_test_elapsed", + + "user_correct_answer", "user_total_answer", "user_acc", + + "user_category_correct_answer", "user_category_total_answer", "user_category_acc", + # "user_tag_correct_answer", "user_tag_total_answer", "user_tag_acc", + "user_testId_correct_answer", "user_testId_total_answer", "user_testId_acc", + + "user_category_cum_telapsed", "user_category_mean_telapsed", + # "user_tag_cum_telapsed", "user_tag_mean_telapsed", + "user_testId_cum_telapsed", "user_testId_mean_telapsed", + + "testId_answer_mean", "testId_test_elapsed_mean", + # "testId_answer_sum", + + # "tag_answer_mean", "tag_test_elapsed_mean", + # "tag_answer_sum", + + "assessmentItemID_answer_mean", "assessmentItemID_test_elapsed_mean", + # "assessmentItemID_answer_sum", + + "category_answer_mean", "category_test_elapsed_mean", + # "category_answer_sum", + + # "last_prob" + ] + CAT_FEAT = ["user", "assessmentItemID", "testId", "tag", "category", "test", "item", + "month", "day", "weekday", "hour" + ] diff --git a/catboost/main.py b/catboost/main.py index 8c24121..18a3820 100644 --- a/catboost/main.py +++ b/catboost/main.py @@ -1,6 +1,7 @@ import argparse import os import random +from sys import float_info import pandas as pd import numpy as np @@ -17,13 +18,14 @@ def seed_everythings(seed): def main(args): # dataset setting print(">>> load dataset...") - dataset = GBMDataset(args.data_dir, descript=args.dataset_descript, feat_path=args.feat_path) + dataset = GBMDataset(args.data_dir, descript=args.feature_descript) X_train, X_valid, y_train, y_valid = dataset.split_data() - print("<<< done!\n") - if dataset.descript: - args.output_dir = os.path.join(args.output_dir, dataset.descript) + print(f"# of features: {len(dataset.features)}") + args.output_dir = os.path.join(args.output_dir, dataset.descript + "_" + args.exp_descript) os.makedirs(args.output_dir, exist_ok=True) + print(f"The result will be saved in {args.output_dir}") + print("<<< done!\n") # model setting & run print(">>>load model with configurations...") @@ -36,7 +38,7 @@ def main(args): verbose=args.verbose, ) # save model feature information - model.save_features(dataset.features, dataset.cat_features) + model.save_features(dataset.features, dataset.cat_features, args.feature_descript) if args.save_model: model.save_model(args.output_dir) @@ -50,17 +52,19 @@ def main(args): parser = argparse.ArgumentParser() parser.add_argument("--data_dir", type=str, default="/opt/ml/input/data/feature_engineering/processed_data.csv") - parser.add_argument("--feat_path", type=str, default="./feature_config.json") parser.add_argument("--inference_dir", type=str, default="/opt/ml/input/data/sample_submission.csv") - parser.add_argument("--dataset_descript", type=str, default=None) + parser.add_argument("--feature_descript", type=str, default=None) parser.add_argument("--output_dir", type=str, default="/opt/ml/output/catboost") + parser.add_argument("--lr",type=float, default=None) parser.add_argument("--iteration", type=int, default=10000) - parser.add_argument("--early_stopping", type=int, default=300) + parser.add_argument("--early_stopping", type=int, default=1000) parser.add_argument("--seed", type=int, default=2022) parser.add_argument("--verbose", type=int, default=100) parser.add_argument("--save_model", type=bool, default=False) + parser.add_argument("--exp_descript", type=str, default=None) + args = parser.parse_args() seed_everythings(args.seed) diff --git a/catboost/model.py b/catboost/model.py index 8988792..5e45cd6 100644 --- a/catboost/model.py +++ b/catboost/model.py @@ -14,6 +14,7 @@ def __init__(self, args, output_dir): eval_metric="AUC", early_stopping_rounds=args.early_stopping, train_dir=self.output_dir, + learning_rate=args.lr, task_type="GPU", devices="0") @@ -24,11 +25,12 @@ def fit(self, X, y, cat_features, eval_set, verbose=100): eval_set=eval_set, verbose=verbose) - def save_features(self, features, cat_features): + def save_features(self, features, cat_features, feature_descript): # feature names with open(os.path.join(self.output_dir, "features.json"), "w") as f: feature_dict = { "num_feats": len(features), + "description": feature_descript, "FEAT": features, "CAT_FEAT": cat_features } @@ -47,4 +49,4 @@ def save_features(self, features, cat_features): plt.savefig(os.path.join(self.output_dir, "feature_importances.png")) def inference(self, X_test): - return self.model.predict_proba(X_test)[:, 1] \ No newline at end of file + return self.model.predict_proba(X_test)[:, 1] diff --git a/catboost/total_features_for_fe_ver1-1.json b/catboost/total_features_for_fe_ver1-1.json deleted file mode 100644 index 257d163..0000000 --- a/catboost/total_features_for_fe_ver1-1.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "FEAT": ["user", "assessmentItemID", "testId", "tag", "category", "test", "item", - "month", "day", "weekday", "hour", - "elapsed", "test_elapsed", - - "user_correct_answer", "user_total_answer", "user_acc", - "user_category_correct_answer", "user_category_total_answer", "user_category_acc", - "user_tag_correct_answer", "user_tag_total_answer", "user_tag_acc", - "user_testId_correct_answer", "user_testId_total_answer", "user_testId_acc", - - "user_category_cum_telapsed", "user_category_mean_telapsed", - "user_tag_cum_telapsed", "user_tag_mean_telapsed", - "user_testId_cum_telapsed", "user_testId_mean_telapsed", - - "testId_answer_mean", "testId_answer_sum", "testId_test_elapsed_mean", - "tag_answer_mean", "tag_answer_sum", "tag_test_elapsed_mean", - "assessmentItemID_answer_mean", "assessmentItemID_answer_sum", "assessmentItemID_test_elapsed_mean", - "category_answer_mean", "category_answer_sum", "category_test_elapsed_mean", - "last_prob" - ], - - "CAT_FEAT": ["user", "assessmentItemID", "testId", "tag", "category", "test", "item", - "month", "day", "weekday", "hour"] -} \ No newline at end of file diff --git a/catboost/feature_config.json b/catboost/total_features_for_fe_ver1-2.json similarity index 100% rename from catboost/feature_config.json rename to catboost/total_features_for_fe_ver1-2.json