- import gc
- import os
- import pickle
- import random
- import sys
- import time
- import warnings
-
- import joblib
- import lightgbm as lgb
- import matplotlib.pyplot as plt
- import numpy as np
- import pandas as pd
- import psutil
- from mpl_toolkits.axes_grid1 import make_axes_locatable
- from sklearn.metrics import mean_squared_error
- from sklearn.metrics.pairwise import rbf_kernel
- from sklearn.svm import SVR
- from tqdm import tqdm
-
-
- # Supplies store_list, model_dir, processed_data_dir used below.
- from .config import *
-
-
- class AuxiliarySVR:
-     """SVR whose kernel augments a standard RBF with rank-one terms built
-     from the predictions of previously trained adaptation models."""
-
-     def __init__(
-         self, C, epsilon, gamma, adaptation_model=(), max_iter=30000, cache_size=10240, verbose=False, K1=None, K2=None
-     ):
-         self.gamma = gamma
-         # Sequence of (algo, idx) tuples identifying saved adaptation models.
-         self.adaptation_model = adaptation_model
-         self.model = SVR(
-             C=C,
-             epsilon=epsilon,
-             kernel=self.auxiliary_rbf_kernel,
-             max_iter=max_iter,
-             cache_size=cache_size,
-             verbose=verbose,
-         )
-         # Optional precomputed kernels: K1 for train/train, K2 for test/train.
-         self.K1 = K1
-         self.K2 = K2
-
-     def auxiliary_rbf_kernel(self, X1, X2):
-         # Prefer the precomputed kernels when given: the train/train kernel
-         # is square, so equal row counts select K1, otherwise K2.
-         if self.K1 is not None:
-             if X1.shape[0] == X2.shape[0]:
-                 return self.K1[-X1.shape[0] :, -X2.shape[0] :]
-             return self.K2[-X1.shape[0] :, -X2.shape[0] :]
-
-         # Otherwise build the kernel on the fly: one rank-one term per
-         # adaptation model's predictions, plus the standard RBF kernel.
-         K = np.zeros((len(X1), len(X2)))
-         for algo, idx in self.adaptation_model:
-             Y1 = model_predict(algo, idx, X1).reshape(-1, 1)
-             Y2 = model_predict(algo, idx, X2).reshape(-1, 1)
-             K += Y1 @ Y2.T
-
-         K += rbf_kernel(X1, X2, gamma=self.gamma)
-         return K
-
-     def fit(self, X, Y):
-         # Mirror sklearn's gamma="auto" (1 / n_features), overriding the
-         # value passed to the constructor.
-         self.gamma = 1 / X.shape[1]
-         self.model.fit(X, Y)
-
- def predict(self, X):
- return self.model.predict(X)
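-
-
- # Minimal usage sketch for AuxiliarySVR (hypothetical data; assumes any
- # ("lgb", idx) adaptation models referenced have already been trained and
- # saved under model_dir):
- #
- #   rng = np.random.default_rng(0)
- #   X, y = rng.random((100, 8)), rng.random(100)
- #   aux = AuxiliarySVR(C=1.0, epsilon=0.1, gamma=1 / X.shape[1])
- #   aux.fit(X, y)                # fit() resets gamma to 1 / n_features
- #   preds = aux.predict(X[:10])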
-
-
- def measure_aux_algo(idx, test_sample, model):
-     """Score an auxiliary model on the last `test_sample` validation rows.
-
-     `model` is an (algo, idx) tuple, e.g. ("lgb", 1).
-     """
-     store = store_list[idx]
-     _, _, val_x, val_y = acquire_data(store, True)
-     pred_y = model_predict(model[0], model[1], val_x[-test_sample:])
-     return score(val_y[-test_sample:], pred_y)
-
-
- # Simple memory profiler: resident memory of the current process, in GiB
- def get_memory_usage():
-     return np.round(psutil.Process(os.getpid()).memory_info().rss / 2.0**30, 2)
-
-
- # Format a byte count as a human-readable string
- def sizeof_fmt(num, suffix="B"):
-     for unit in ["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"]:
-         if abs(num) < 1024.0:
-             return "%3.1f%s%s" % (num, unit, suffix)
-         num /= 1024.0
-     return "%.1f%s%s" % (num, "Yi", suffix)
-
-
- # Memory reducer: downcast each numeric column to the smallest dtype that
- # can hold its observed min/max
- def reduce_mem_usage(df, float16_flag=True, verbose=True):
-     numerics = ["int16", "int32", "int64", "float16", "float32", "float64"]
-     start_mem = df.memory_usage().sum() / 1024**2
- for col in df.columns:
- col_type = df[col].dtypes
- if col_type in numerics:
- c_min = df[col].min()
- c_max = df[col].max()
- if str(col_type)[:3] == "int":
- if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
- df[col] = df[col].astype(np.int8)
- elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
- df[col] = df[col].astype(np.int16)
- elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
- df[col] = df[col].astype(np.int32)
- elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
- df[col] = df[col].astype(np.int64)
- else:
- if float16_flag and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
- df[col] = df[col].astype(np.float16)
- elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
- df[col] = df[col].astype(np.float32)
- else:
- df[col] = df[col].astype(np.float64)
- end_mem = df.memory_usage().sum() / 1024**2
- if verbose:
- print(
- "Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)".format(
- end_mem, 100 * (start_mem - end_mem) / start_mem
- )
- )
- return df
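-
-
- # Usage sketch (hypothetical frame; float16 is lossy, so pass
- # float16_flag=False when exact float values matter):
- #   df = pd.DataFrame({"a": np.arange(1000, dtype=np.int64)})
- #   df = reduce_mem_usage(df)   # "a" is downcast to int16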
-
-
- # Merge via concat so df1's existing columns keep their (downcast) dtypes
- def merge_by_concat(df1, df2, merge_on):
-     merged_df = df1[merge_on]
-     merged_df = merged_df.merge(df2, on=merge_on, how="left")
-     new_columns = [col for col in list(merged_df) if col not in merge_on]
-     df1 = pd.concat([df1, merged_df[new_columns]], axis=1)
-     return df1
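-
-
- # Why the indirection: a plain df1.merge(df2, ...) rebuilds every column of
- # df1 and can silently upcast dtypes (e.g. int8 -> float64 when unmatched
- # keys introduce NaN); concatenating only df2's new columns leaves df1's
- # columns untouched. Hypothetical example:
- #   left = pd.DataFrame({"id": np.array([1, 2], dtype=np.int8), "x": [0.5, 1.5]})
- #   right = pd.DataFrame({"id": [1, 2], "y": ["a", "b"]})
- #   merged = merge_by_concat(left, right, ["id"])   # "id" stays int8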
-
-
- def model_predict(algo, idx, test_x):
-     """Load the saved model for store_list[idx] and predict on test_x."""
-     store = store_list[idx]
-
-     if algo == "lgb":
-         model = lgb.Booster(model_file=os.path.join(model_dir, f"lgb_{store}.out"))
-         return model.predict(test_x, num_iteration=model.best_iteration)
-     elif algo == "ridge":
-         model = joblib.load(os.path.join(model_dir, f"ridge_{store}.out"))
-         return model.predict(test_x)
-     elif algo == "svm":
-         model = joblib.load(os.path.join(model_dir, f"svm_{store}.out"))
-         return model.predict(test_x)
-     raise ValueError(f"Unknown algo: {algo!r}")
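-
-
- # Example (assumes a LightGBM model for store_list[0] was saved by the
- # training step):
- #   preds = model_predict("lgb", 0, val_x)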
-
-
- def get_weights(algo):
-     """Collect per-store weights: LightGBM feature importances, or the
-     coef_ vector of the saved linear (ridge) models otherwise."""
-     weights = []
-
-     if algo == "lgb":
-         for store in store_list:
-             model = lgb.Booster(model_file=os.path.join(model_dir, f"lgb_{store}.out"))
-             weights.append(model.feature_importance())
-     else:
-         for store in store_list:
-             model = joblib.load(os.path.join(model_dir, f"ridge_{store}.out"))
-             weights.append(model.coef_)
-
-     return np.array(weights)
-
-
- def score(real_y, pred_y, sample_weight=None, multioutput="uniform_average"):
-     # RMSE; defaults let callers pass just (real_y, pred_y).
-     return mean_squared_error(real_y, pred_y, sample_weight=sample_weight, multioutput=multioutput, squared=False)
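-
-
- # Worked example: score([3.0, 5.0], [2.0, 6.0]) == 1.0,
- # i.e. sqrt(((3 - 2)**2 + (5 - 6)**2) / 2).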
-
-
- def acquire_data(store, fill_flag=False):
-     """Load a store's processed train/val pickles and return numpy arrays."""
-     TARGET = "sales"
-     suffix = "_fill" if fill_flag else ""
-     train = pd.read_pickle(os.path.join(processed_data_dir, f"train_{store}{suffix}.pkl"))
-     val = pd.read_pickle(os.path.join(processed_data_dir, f"val_{store}{suffix}.pkl"))
-
-     train_y = train[TARGET].to_numpy()
-     train_x = train.drop(columns=TARGET).to_numpy()
-     val_y = val[TARGET].to_numpy()
-     val_x = val.drop(columns=TARGET).to_numpy()
-
-     return train_x, train_y, val_x, val_y
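-
-
- # Example (assumes the preprocessing step has written the pickles to
- # processed_data_dir):
- #   train_x, train_y, val_x, val_y = acquire_data(store_list[0], fill_flag=True)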