"""Training and grid-search routines for per-store LightGBM / Ridge / SVR / KRR models.

All helpers follow the same "best so far" protocol: they receive the best
validation RMSE seen so far, train one model, and return the (possibly
improved) best RMSE, optionally persisting the model when it improves.
"""
import gc
import itertools
import joblib
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
import warnings
import lightgbm as lgb
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import rbf_kernel

# NOTE(review): acquire_data, model_predict and AuxiliarySVR are not defined in
# this file; presumably they come from this wildcard import -- confirm.
from .utils import *
from .config import model_dir, grid_dir, store_list, lgb_params_list

warnings.filterwarnings("ignore")

# Training-set sizes (number of most recent rows) swept by the sample-size experiments.
_PROPORTION_LIST = [
    100,
    300,
    500,
    700,
    900,
    1000,
    3000,
    5000,
    7000,
    9000,
    10000,
    30000,
    50000,
    70000,
    90000,
    100000,
    300000,
    500000,
    700000,
    900000,
    1000000,
    3000000,
    5000000,
]

# Starting value for "best RMSE so far"; any real score is expected to beat it.
_INITIAL_BEST = 10000000

# LightGBM hyper-parameter grid: (learning rates, num_leaves, max_depth).
_LGB_LEARNING_RATES = [0.005, 0.01, 0.015]
_LGB_NUM_LEAVES = [128, 224, 300]
_LGB_MAX_DEPTHS = [50, 66, 80]

# Ridge regularisation strengths searched by the grid routines.
_RIDGE_ALPHAS = [0.01, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10, 20, 30]


def _rmse(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``.

    Computed as ``sqrt(MSE)`` instead of ``mean_squared_error(squared=False)``
    so it does not depend on the ``squared`` keyword, which newer scikit-learn
    releases deprecate and remove. For 1-D targets both forms give the same value.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))


def train_lgb_model(train_x, train_y, val_x, val_y, store, lr, nl, md, best, save=True, n_estimators=0, train_flag=0):
    """Train one LightGBM regressor and return the updated best validation RMSE.

    Args:
        train_x, train_y: training features and labels.
        val_x, val_y: validation features and labels; always used for scoring.
        store: store identifier, used only in the saved model's file name.
        lr, nl, md: learning rate, ``num_leaves`` and ``max_depth``.
        best: best RMSE seen so far.
        save: when true, persist the model to ``model_dir`` if it improves ``best``.
        n_estimators: when non-zero, train exactly this many rounds without
            early stopping; otherwise train with early stopping (1000 rounds).
        train_flag: when truthy, early stopping monitors the last 10% of the
            training data instead of ``val_x``/``val_y``.

    Returns:
        ``min(best, rmse)`` where ``rmse`` is the score on ``val_x``/``val_y``.
    """
    lgb_params = {
        "boosting_type": "gbdt",
        "objective": "rmse",
        "metric": "rmse",
        "learning_rate": lr,
        "num_leaves": nl,
        "max_depth": md,
        "n_estimators": 100000,
        "boost_from_average": False,
        "verbose": -1,
    }

    if train_flag:
        # Clamp to >= 1: with idx == 0, ``x[:-0]`` would be an EMPTY training
        # set and ``x[-0:]`` the whole set, silently swapping the two roles.
        idx = max(1, int(len(train_y) * 0.1))
        train_data = lgb.Dataset(train_x[:-idx], label=train_y[:-idx])
        val_data = lgb.Dataset(train_x[-idx:], label=train_y[-idx:])
    else:
        train_data = lgb.Dataset(train_x, label=train_y)
        val_data = lgb.Dataset(val_x, label=val_y)

    # NOTE(review): the ``verbose_eval`` / ``early_stopping_rounds`` keyword
    # arguments require LightGBM < 4.0 (later releases use callbacks) -- confirm
    # the pinned version before upgrading.
    if n_estimators:
        lgb_params["n_estimators"] = n_estimators
        gbm = lgb.train(lgb_params, train_data, verbose_eval=100)
    else:
        gbm = lgb.train(lgb_params, train_data, valid_sets=[val_data], verbose_eval=100, early_stopping_rounds=1000)

    test_y = gbm.predict(val_x, num_iteration=gbm.best_iteration)
    res = _rmse(val_y, test_y)

    if res < best:
        best = res
        if save:
            gbm.save_model(os.path.join(model_dir, f"lgb_{store}.out"))
    return best


def train_ridge_model(train_x, train_y, val_x, val_y, store, a, best, save=True):
    """Fit ``Ridge(alpha=a)`` and return the updated best validation RMSE.

    The fitted model is dumped to ``model_dir`` (``ridge_{store}.out``) only
    when it improves on ``best`` and ``save`` is true.
    """
    model = Ridge(alpha=a)
    model.fit(train_x, train_y)
    res = _rmse(val_y, model.predict(val_x))

    if res < best:
        best = res
        if save:
            joblib.dump(model, os.path.join(model_dir, f"ridge_{store}.out"))
    return best


def train_svm_model(
    train_x,
    train_y,
    val_x,
    val_y,
    store,
    C,
    epsilon,
    best,
    save=True,
    gamma=0.1,
    adaptation_model=None,
    K1=None,
    K2=None,
):
    """Fit an SVR (or ``AuxiliarySVR`` when kernels are supplied) and return the best RMSE.

    Args:
        train_x, train_y, val_x, val_y: training and validation data.
        store: store identifier, used only in the saved model's file name.
        C, epsilon, gamma: SVR hyper-parameters.
        best: best RMSE seen so far.
        save: when true, dump the model to ``model_dir`` if it improves ``best``.
        adaptation_model: auxiliary model descriptors forwarded to
            ``AuxiliarySVR``; ``None`` (the default) means an empty list.
        K1, K2: kernel matrices forwarded to ``AuxiliarySVR``. When ``K1`` is
            ``None`` a plain ``sklearn.svm.SVR`` is used instead.

    Returns:
        ``min(best, rmse)`` on the validation data.
    """
    if adaptation_model is None:
        adaptation_model = []

    if K1 is None:
        model = SVR(C=C, epsilon=epsilon, max_iter=30000, cache_size=10240, verbose=True, gamma=gamma)
    else:
        model = AuxiliarySVR(
            C=C,
            epsilon=epsilon,
            gamma=gamma,
            adaptation_model=adaptation_model,
            max_iter=30000,
            cache_size=10240,
            verbose=True,
            K1=K1,
            K2=K2,
        )

    model.fit(train_x, train_y)
    res = _rmse(val_y, model.predict(val_x))

    if res < best:
        best = res
        if save:
            joblib.dump(model, os.path.join(model_dir, f"svm_{store}.out"))
    return best


def train_krr_model(train_x, train_y, val_x, val_y, store, a, best, save=True, gamma=0.1, K1=None, K2=None):
    """Fit kernel ridge regression and return the updated best validation RMSE.

    Without ``K1`` an RBF ``KernelRidge`` is fitted on the raw features. With
    ``K1``/``K2`` a precomputed-kernel model is fitted on the bottom-right
    ``len(train_y)`` square of ``K1`` and evaluated on the matching trailing
    rows/columns of ``K2``.
    """
    if K1 is None:
        model = KernelRidge(kernel="rbf", alpha=a, gamma=gamma)
        model.fit(train_x, train_y)
        test_y = model.predict(val_x)
    else:
        len1, len2 = len(train_y), len(val_y)
        model = KernelRidge(kernel="precomputed", alpha=a)
        # The kernels cover the most recent rows; take the trailing sub-block
        # that corresponds to this (smaller) training subset.
        model.fit(K1[-len1:, -len1:], train_y)
        test_y = model.predict(K2[-len2:, -len1:])
    res = _rmse(val_y, test_y)

    if res < best:
        best = res
        if save:
            joblib.dump(model, os.path.join(model_dir, f"krr_{store}.out"))
    return best


def grid_search(store_id, algo, search_lgb_flag=False):
    """Train (and save) the best model of type ``algo`` for one store.

    For ``"lgb"`` the full hyper-parameter grid is searched only when
    ``search_lgb_flag`` is true; otherwise the preset from ``lgb_params_list``
    is used. For ``"ridge"`` all alphas are tried. Any other ``algo`` is a no-op.
    """
    store = store_list[store_id]

    if algo == "lgb":
        train_x, train_y, val_x, val_y = acquire_data(store, True)
        best = _INITIAL_BEST
        if search_lgb_flag:
            candidates = itertools.product(_LGB_LEARNING_RATES, _LGB_NUM_LEAVES, _LGB_MAX_DEPTHS)
        else:
            candidates = [tuple(lgb_params_list[store_id])]
        for lr, nl, md in candidates:
            best = train_lgb_model(train_x, train_y, val_x, val_y, store, lr, nl, md, best)
            print(f"store: {store}, lr: {lr}, nl: {nl}, md: {md}, best: {best}")
    elif algo == "ridge":
        train_x, train_y, val_x, val_y = acquire_data(store, True)
        best = _INITIAL_BEST
        for a in _RIDGE_ALPHAS:
            best = train_ridge_model(train_x, train_y, val_x, val_y, store, a, best)
            print(f"store: {store}, alpha: {a}, best: {best}")


def grid_training_sample(algo, user_list=None):
    """Measure best validation RMSE as a function of training-set size.

    For every store in ``user_list`` (default: stores 0-9) the model type
    ``algo`` is trained on the most recent ``proportion`` rows for each size in
    the sweep, and ``[proportion, best_rmse]`` rows are written to
    ``grid_dir/grid_sample_{algo}_{store}.out`` after every size (so partial
    results survive an interruption). The sweep stops once a size exceeds the
    available data.
    """
    if user_list is None:
        user_list = list(range(10))

    for store_id in user_list:
        store = store_list[store_id]
        org_train_x, org_train_y, val_x, val_y = acquire_data(store, True)
        res = []

        for proportion in _PROPORTION_LIST:
            # Use the most recent rows (random subsampling was an alternative).
            train_x = org_train_x[-proportion:]
            train_y = org_train_y[-proportion:]
            best = _INITIAL_BEST

            if algo == "lgb":
                lr, nl, md = lgb_params_list[store_id]
                best = train_lgb_model(
                    train_x, train_y, val_x, val_y, store, lr, nl, md, best, save=False, n_estimators=3000, train_flag=0
                )
                print(f"store: {store}, lr: {lr}, nl: {nl}, md: {md}, best: {best}")
            elif algo == "ridge":
                for a in _RIDGE_ALPHAS:
                    best = train_ridge_model(train_x, train_y, val_x, val_y, store, a, best, save=False)
                    print(f"store: {store}, alpha: {a}, best: {best}")
            elif algo == "svm":
                epsilon = 0.001
                for c in [1, 10, 100]:
                    best = train_svm_model(train_x, train_y, val_x, val_y, store, c, epsilon, best, save=False)
                    print(f"store: {store}, C: {c}, epsilon: {epsilon}, best: {best}")

            res.append([proportion, best])
            np.savetxt(os.path.join(grid_dir, f"grid_sample_{algo}_{store}.out"), np.array(res))

            # Larger sizes would just re-train on the same full data set.
            if proportion > len(org_train_y):
                break


def retrain_models(algo):
    """Re-run ``grid_search`` with default settings for stores 0 through 9."""
    for store_id in range(10):
        grid_search(store_id, algo)


def train_adaptation_grid(algo, max_sample, test_sample, user_list=None, adaptation_model=None, residual=False):
    """Sweep training-set sizes for ``algo``, optionally adapting from auxiliary models.

    Args:
        algo: ``"lgb"``, ``"ridge"``, ``"svm"`` or ``"krr"``; anything else is a no-op.
        max_sample: largest training-set size to try; must be one of the
            sizes in the module's sweep list.
        test_sample: number of trailing validation rows to evaluate on.
        user_list: store indices to process (default: 0-9).
        adaptation_model: one list of ``(aux_algo, model_idx)`` pairs per
            entry of ``user_list`` (indexed by position, not store id), e.g.
            ``[[("lgb", 1), ("ridge", 2)], [("lgb", 1), ("ridge", 2)]]``.
            ``None``/empty disables adaptation.
        residual: when true, fit on the residual of the FIRST auxiliary model's
            predictions; otherwise auxiliary predictions are used as extra
            features (lgb/ridge) or extra kernel terms (svm/krr).

    Raises:
        ValueError: if ``max_sample`` is not one of the supported sizes.

    Results are written to ``grid_dir/{algo}_using_{text}_{store}.out`` after
    every evaluated size.
    """
    if user_list is None:
        user_list = list(range(10))
    if adaptation_model is None:
        adaptation_model = []
    has_adaptation = bool(adaptation_model)

    if max_sample not in _PROPORTION_LIST:
        raise ValueError(f"max_sample must be one of {_PROPORTION_LIST}, got {max_sample!r}")
    proportions = _PROPORTION_LIST[: _PROPORTION_LIST.index(max_sample) + 1]

    for i, store_id in enumerate(user_list):
        store = store_list[store_id]
        org_train_x, org_train_y, val_x, val_y = acquire_data(store, True)
        val_x = val_x[-test_sample:]
        val_y = val_y[-test_sample:]

        if algo == "lgb" or algo == "ridge":
            res = []
            if has_adaptation:
                if residual:
                    aux_algo, model_idx = adaptation_model[i][0]
                    # Rebind instead of ``-=`` so the objects returned by
                    # acquire_data are not mutated in place.
                    org_train_y = org_train_y - model_predict(aux_algo, model_idx, org_train_x)
                    val_y = val_y - model_predict(aux_algo, model_idx, val_x)
                else:
                    # Compute every auxiliary prediction BEFORE adding columns,
                    # so each auxiliary model sees the original feature set.
                    train_y_list, val_y_list = [], []
                    for aux_algo, model_idx in adaptation_model[i]:
                        train_y_list.append(model_predict(aux_algo, model_idx, org_train_x))
                        val_y_list.append(model_predict(aux_algo, model_idx, val_x))
                    # val_x is a slice; copy so new columns are written to an
                    # independent frame rather than a view of the original.
                    val_x = val_x.copy()
                    for j, (train_pred, val_pred) in enumerate(zip(train_y_list, val_y_list)):
                        org_train_x[f"model_values_{j}"] = train_pred
                        val_x[f"model_values_{j}"] = val_pred

            # NOTE(review): this branch appends "_residual_" (trailing
            # underscore) while the svm/krr branch appends "_residual"; kept
            # as-is because other code may depend on the file names.
            text = str(adaptation_model[i]) if has_adaptation else "null"
            text += "_residual_" if residual else ""
            out_path = os.path.join(grid_dir, f"{algo}_using_{text}_{store}.out")

            for proportion in proportions:
                # Use the most recent rows (random subsampling was an alternative).
                train_x = org_train_x[-proportion:]
                train_y = org_train_y[-proportion:]
                best = _INITIAL_BEST

                if algo == "lgb":
                    if max_sample < 50000:
                        # Small data: a full grid search is affordable.
                        for lr, nl, md in itertools.product(_LGB_LEARNING_RATES, _LGB_NUM_LEAVES, _LGB_MAX_DEPTHS):
                            best = train_lgb_model(
                                train_x, train_y, val_x, val_y, store, lr, nl, md, best, save=False
                            )
                            print(f"store: {store}, lr: {lr}, nl: {nl}, md: {md}, best: {best}")
                    else:
                        lr, nl, md = lgb_params_list[store_id]
                        best = train_lgb_model(
                            train_x,
                            train_y,
                            val_x,
                            val_y,
                            store,
                            lr,
                            nl,
                            md,
                            best,
                            save=False,
                            n_estimators=3000,
                            train_flag=0,
                        )
                        print(f"store: {store}, lr: {lr}, nl: {nl}, md: {md}, best: {best}")
                elif algo == "ridge":
                    for a in _RIDGE_ALPHAS:
                        best = train_ridge_model(train_x, train_y, val_x, val_y, store, a, best, save=False)
                        print(f"store: {store}, alpha: {a}, best: {best}")

                res.append([proportion, best])
                np.savetxt(out_path, np.array(res))

                if proportion > len(org_train_y):
                    break

        elif algo == "svm" or algo == "krr":
            # One row per size; the score is the best over all gammas tried.
            res = [[proportion, 10000] for proportion in proportions]

            org_train_x = org_train_x.to_numpy()
            org_train_y = org_train_y.to_numpy()
            val_x = val_x.to_numpy()
            val_y = val_y.to_numpy()

            y1_list, y2_list = [], []
            if residual:
                aux_algo, model_idx = adaptation_model[i][0]
                # Cast first so the in-place subtraction cannot fail on integer labels.
                org_train_y = org_train_y.astype(np.float64)
                val_y = val_y.astype(np.float64)
                org_train_y -= model_predict(aux_algo, model_idx, org_train_x)
                val_y -= model_predict(aux_algo, model_idx, val_x)
            elif has_adaptation:
                for aux_algo, model_idx in adaptation_model[i]:
                    y1_list.append(model_predict(aux_algo, model_idx, org_train_x[-max_sample:]).reshape(-1, 1))
                    y2_list.append(model_predict(aux_algo, model_idx, val_x).reshape(-1, 1))

            # Kernels are built on the most recent rows. Size them from the
            # actual slice: a store with fewer than ``max_sample`` rows would
            # otherwise hit a shape mismatch when the RBF kernel is added.
            recent_x = org_train_x[-max_sample:]
            n_kernel = len(recent_x)

            adapt_m = adaptation_model[i] if has_adaptation else []
            text = str(adaptation_model[i]) if has_adaptation else "null"
            text += "_residual" if residual else ""
            out_path = os.path.join(grid_dir, f"{algo}_using_{text}_{store}.out")

            for gamma in [0.01, 0.1, 0.5, 1]:
                K1 = np.zeros((n_kernel, n_kernel))
                K2 = np.zeros((len(val_x), n_kernel))
                # Linear kernel terms on auxiliary predictions (lists are empty
                # when adaptation is disabled or in residual mode).
                for y1, y2 in zip(y1_list, y2_list):
                    K1 += np.dot(y1, y1.T)
                    K2 += np.dot(y2, y1.T)
                K1 += rbf_kernel(recent_x, recent_x, gamma=gamma)
                K2 += rbf_kernel(val_x, recent_x, gamma=gamma)

                for idx, proportion in enumerate(proportions):
                    # Use the most recent rows (random subsampling was an alternative).
                    train_x = org_train_x[-proportion:]
                    train_y = org_train_y[-proportion:]
                    best = _INITIAL_BEST

                    if algo == "svm":
                        epsilon = 0.001
                        for c in [1, 10, 50, 100, 200]:
                            best = train_svm_model(
                                train_x,
                                train_y,
                                val_x,
                                val_y,
                                store,
                                c,
                                epsilon,
                                best,
                                save=False,
                                gamma=gamma,
                                adaptation_model=adapt_m,
                                K1=K1,
                                K2=K2,
                            )
                            print(f"store: {store}, gamma: {gamma}, C: {c}, epsilon: {epsilon}, best: {best}")
                    elif algo == "krr":
                        for a in [0.01, 0.1, 0.5, 1.0, 5.0, 10]:
                            best = train_krr_model(
                                train_x, train_y, val_x, val_y, store, a, best, save=False, gamma=gamma, K1=K1, K2=K2
                            )
                            print(f"store: {store}, a: {a}, gamma: {gamma}, best: {best}")

                    if best < res[idx][1]:
                        res[idx][1] = best
                    np.savetxt(out_path, np.array(res))

                    if proportion > len(org_train_y):
                        break

                # Release the (potentially very large) kernels before the next gamma.
                del train_x, train_y
                gc.collect()
                del K1, K2, 
                gc.collect()

            del org_train_x, org_train_y, recent_x
            gc.collect()