Skip to content

Commit

Permalink
Update catboost
Browse files Browse the repository at this point in the history
- added an experiment description option (exp_descript)
- renamed dataset description => feature description
- updated total_features_config according to the feature-engineering version (v1.2)
  • Loading branch information
donghyyun committed May 7, 2022
1 parent baf6e60 commit 383ba2e
Show file tree
Hide file tree
Showing 6 changed files with 58 additions and 40 deletions.
14 changes: 8 additions & 6 deletions catboost/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,13 @@

import pandas as pd

from features import Features

class GBMDataset:
def __init__(self, data_path, descript=None, feat_path="/opt/ml/input/code/catboost/feature_config.json"):
features = self._get_features_from(feat_path)
self.features = features["FEAT"]
self.cat_features = features["CAT_FEAT"]
def __init__(self, data_path, descript=None):
features = Features()
self.features = features.FEAT
self.cat_features = features.CAT_FEAT
self.num_features = list(set(self.features) - set(self.cat_features))
self.df = pd.read_csv(data_path)

Expand All @@ -26,8 +27,9 @@ def split_data(self, pseudo_labeling=False):

train_valid_df = self.get_train_data(pseudo_labeling)

train = train_valid_df[train_valid_df.user == train_valid_df.user.shift(-1)]
valid = train_valid_df[train_valid_df.user != train_valid_df.user.shift(-1)]
valid = train_valid_df[(train_valid_df.user != train_valid_df.user.shift(-1)) &
(train_valid_df["label"] == "train")]
train = train_valid_df[~train_valid_df.index.isin(valid.index)]

X_train, y_train = train[self.features], train["answer"]
X_valid, y_valid = valid[self.features], valid["answer"]
Expand Down
34 changes: 34 additions & 0 deletions catboost/features.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
class Features:
    """Feature-name configuration: which columns are used and which are categorical.

    Attributes:
        CAT_FEAT: names of the categorical features.
        FEAT: the full, ordered list of feature names. It always starts with
            every entry of ``CAT_FEAT`` (in the same order), followed by the
            remaining non-categorical features.

    Both attributes are plain ``list`` objects defined on the class, so they
    are shared by every instance; copy them before mutating.

    Entries that are commented out are features that are currently disabled;
    they are kept so they can be switched back on by uncommenting.
    """

    # Declared once and reused below so the categorical names cannot drift
    # out of sync between the two lists.
    CAT_FEAT = [
        "user", "assessmentItemID", "testId", "tag", "category", "test", "item",
        "month", "day", "weekday", "hour",
    ]

    FEAT = [
        # Categorical features come first, in CAT_FEAT order.
        *CAT_FEAT,

        # "elapsed", "test_elapsed",
        # "prev_elapsed",
        "prev_test_elapsed",

        "user_correct_answer", "user_total_answer", "user_acc",

        "user_category_correct_answer", "user_category_total_answer", "user_category_acc",
        # "user_tag_correct_answer", "user_tag_total_answer", "user_tag_acc",
        "user_testId_correct_answer", "user_testId_total_answer", "user_testId_acc",

        "user_category_cum_telapsed", "user_category_mean_telapsed",
        # "user_tag_cum_telapsed", "user_tag_mean_telapsed",
        "user_testId_cum_telapsed", "user_testId_mean_telapsed",

        "testId_answer_mean", "testId_test_elapsed_mean",
        # "testId_answer_sum",

        # "tag_answer_mean", "tag_test_elapsed_mean",
        # "tag_answer_sum",

        "assessmentItemID_answer_mean", "assessmentItemID_test_elapsed_mean",
        # "assessmentItemID_answer_sum",

        "category_answer_mean", "category_test_elapsed_mean",
        # "category_answer_sum",

        # "last_prob"
    ]
20 changes: 12 additions & 8 deletions catboost/main.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import argparse
import os
import random
from sys import float_info

import pandas as pd
import numpy as np
Expand All @@ -17,13 +18,14 @@ def seed_everythings(seed):
def main(args):
# dataset setting
print(">>> load dataset...")
dataset = GBMDataset(args.data_dir, descript=args.dataset_descript, feat_path=args.feat_path)
dataset = GBMDataset(args.data_dir, descript=args.feature_descript)
X_train, X_valid, y_train, y_valid = dataset.split_data()
print("<<< done!\n")

if dataset.descript:
args.output_dir = os.path.join(args.output_dir, dataset.descript)
print(f"# of features: {len(dataset.features)}")
args.output_dir = os.path.join(args.output_dir, dataset.descript + "_" + args.exp_descript)
os.makedirs(args.output_dir, exist_ok=True)
print(f"The result will be saved in {args.output_dir}")
print("<<< done!\n")

# model setting & run
print(">>>load model with configurations...")
Expand All @@ -36,7 +38,7 @@ def main(args):
verbose=args.verbose,
)
# save model feature information
model.save_features(dataset.features, dataset.cat_features)
model.save_features(dataset.features, dataset.cat_features, args.feature_descript)
if args.save_model:
model.save_model(args.output_dir)

Expand All @@ -50,17 +52,19 @@ def main(args):
parser = argparse.ArgumentParser()

parser.add_argument("--data_dir", type=str, default="/opt/ml/input/data/feature_engineering/processed_data.csv")
parser.add_argument("--feat_path", type=str, default="./feature_config.json")
parser.add_argument("--inference_dir", type=str, default="/opt/ml/input/data/sample_submission.csv")
parser.add_argument("--dataset_descript", type=str, default=None)
parser.add_argument("--feature_descript", type=str, default=None)

parser.add_argument("--output_dir", type=str, default="/opt/ml/output/catboost")
parser.add_argument("--lr",type=float, default=None)
parser.add_argument("--iteration", type=int, default=10000)
parser.add_argument("--early_stopping", type=int, default=300)
parser.add_argument("--early_stopping", type=int, default=1000)
parser.add_argument("--seed", type=int, default=2022)
parser.add_argument("--verbose", type=int, default=100)
parser.add_argument("--save_model", type=bool, default=False)

parser.add_argument("--exp_descript", type=str, default=None)

args = parser.parse_args()

seed_everythings(args.seed)
Expand Down
6 changes: 4 additions & 2 deletions catboost/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ def __init__(self, args, output_dir):
eval_metric="AUC",
early_stopping_rounds=args.early_stopping,
train_dir=self.output_dir,
learning_rate=args.lr,

task_type="GPU",
devices="0")
Expand All @@ -24,11 +25,12 @@ def fit(self, X, y, cat_features, eval_set, verbose=100):
eval_set=eval_set,
verbose=verbose)

def save_features(self, features, cat_features):
def save_features(self, features, cat_features, feature_descript):
# feature names
with open(os.path.join(self.output_dir, "features.json"), "w") as f:
feature_dict = {
"num_feats": len(features),
"description": feature_descript,
"FEAT": features,
"CAT_FEAT": cat_features
}
Expand All @@ -47,4 +49,4 @@ def save_features(self, features, cat_features):
plt.savefig(os.path.join(self.output_dir, "feature_importances.png"))

def inference(self, X_test):
return self.model.predict_proba(X_test)[:, 1]
return self.model.predict_proba(X_test)[:, 1]
24 changes: 0 additions & 24 deletions catboost/total_features_for_fe_ver1-1.json

This file was deleted.

File renamed without changes.

0 comments on commit 383ba2e

Please sign in to comment.