# beimingwu/learnware

from math import gamma
from tkinter import Y
import joblib
from tqdm import tqdm
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import rbf_kernel
import os, sys, gc, time, warnings, pickle, psutil, random
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable


from .config import *


class AuxiliarySVR:
    """SVR whose kernel is an RBF kernel plus auxiliary-model prediction terms.

    For two sample sets the kernel is ``rbf_kernel(X1, X2, gamma)`` plus, for
    every ``(algo, idx)`` pair in ``adaptation_model``, the outer product of
    that model's predictions on ``X1`` and on ``X2``. If a precomputed matrix
    ``K1`` is supplied, the kernel is read from ``K1``/``K2`` instead of being
    recomputed.
    """

    def __init__(
        self,
        C,
        epsilon,
        gamma,
        adaptation_model=None,
        max_iter=30000,
        cache_size=10240,
        verbose=False,
        K1=None,
        K2=None,
    ):
        """Build the underlying ``SVR`` with the custom kernel.

        Args:
            C, epsilon, max_iter, cache_size, verbose: Forwarded to ``SVR``.
            gamma: Initial RBF gamma. Note that ``fit`` overwrites it with
                ``1 / X.shape[1]``.
            adaptation_model: Iterable of ``(algo, idx)`` pairs passed to
                ``model_predict``; defaults to no auxiliary models.
            K1: Optional precomputed kernel used when both kernel arguments
                have the same number of rows.
            K2: Optional precomputed kernel used when the row counts differ.
        """
        self.gamma = gamma
        # A fresh list per instance: a ``[]`` default would be shared by every
        # instance constructed without an explicit adaptation_model.
        self.adaptation_model = [] if adaptation_model is None else adaptation_model
        self.model = SVR(
            C=C,
            epsilon=epsilon,
            kernel=self.auxiliary_rbf_kernel,
            max_iter=max_iter,
            cache_size=cache_size,
            verbose=verbose,
        )
        self.K1 = K1
        self.K2 = K2

    def auxiliary_rbf_kernel(self, X1, X2):
        """Return the kernel matrix between the rows of ``X1`` and ``X2``.

        With a precomputed ``K1`` the result is the trailing
        ``X1.shape[0]`` x ``X2.shape[0]`` corner of ``K1`` (equal row counts)
        or of ``K2`` (different row counts).
        NOTE(review): presumably K1 is the train/train kernel and K2 the
        test/train kernel; the choice is made on row counts only - confirm
        with the caller.
        """
        if self.K1 is not None:
            if X1.shape[0] == X2.shape[0]:
                return self.K1[-X1.shape[0] :, -X2.shape[0] :]
            else:
                return self.K2[-X1.shape[0] :, -X2.shape[0] :]
        else:
            K = np.zeros((len(X1), len(X2)))

            for algo, idx in self.adaptation_model:
                # Column vectors of predictions; their outer product is a
                # linear kernel on the auxiliary model's outputs.
                Y1 = model_predict(algo, idx, X1).reshape(-1, 1)
                Y2 = model_predict(algo, idx, X2).reshape(-1, 1)
                K += Y1 @ Y2.T

            K += rbf_kernel(X1, X2, self.gamma)
            return K

    def fit(self, X, Y):
        """Fit the SVR on ``X``/``Y``, first resetting gamma to ``1 / X.shape[1]``."""
        # Overrides the gamma given to the constructor.
        self.gamma = 1 / X.shape[1]
        self.model.fit(X, Y)

    def predict(self, X):
        """Return the underlying SVR's predictions for ``X``."""
        return self.model.predict(X)


def measure_aux_algo(idx, test_sample, model):
    """Score an auxiliary model on the tail of one store's validation data.

    Args:
        idx: Index into ``store_list`` selecting the store whose (filled)
            validation split is loaded.
        test_sample: Number of trailing validation rows to evaluate on.
        model: ``(algo, model_idx)`` pair forwarded to ``model_predict``,
            e.g. ``("lgb", 1)``.

    Returns:
        The result of ``score`` for the true targets versus the predictions.
    """
    store = store_list[idx]
    _, _, val_x, val_y = acquire_data(store, True)
    pred_y = model_predict(model[0], model[1], val_x[-test_sample:])
    # Pass sample_weight/multioutput explicitly (unweighted, averaged over
    # outputs) so the call matches score's four-parameter signature.
    return score(val_y[-test_sample:], pred_y, sample_weight=None, multioutput="uniform_average")


# Simple "Memory profilers" to see memory usage
def get_memory_usage():
    """Return the first ``memory_info()`` field of this process in GiB, rounded to 2 decimals."""
    current_process = psutil.Process(os.getpid())
    used_bytes = current_process.memory_info()[0]
    gib = 2.0**30
    return np.round(used_bytes / gib, 2)


def sizeof_fmt(num, suffix="B"):
    """Format ``num`` with a binary (1024-based) unit prefix followed by ``suffix``.

    The value is divided by 1024 until its magnitude drops below 1024;
    anything still larger after the "Zi" step is reported in "Yi".
    """
    prefixes = ("", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi")
    for prefix in prefixes:
        if not abs(num) < 1024.0:
            num /= 1024.0
            continue
        return "%3.1f%s%s" % (num, prefix, suffix)
    return "%.1f%s%s" % (num, "Yi", suffix)


# Memory Reducer
def reduce_mem_usage(df, float16_flag=True, verbose=True):
    """Downcast numeric columns of ``df`` in place to narrower dtypes.

    Integer columns are cast to the first of int8/int16/int32/int64 whose
    range strictly contains the column's min and max. Other numeric columns
    go to float16 (only when ``float16_flag`` is set), else float32, else
    float64, using the same strict range test.

    Args:
        df: DataFrame to shrink; its columns are reassigned in place.
        float16_flag: Allow float16 as a target dtype.
        verbose: Print the resulting memory usage and percentage reduction.

    Returns:
        The same ``df`` object.
    """
    numerics = ["int16", "int32", "int64", "float16", "float32", "float64"]
    mib = 1024**2
    int_candidates = (np.int8, np.int16, np.int32, np.int64)

    start_mem = df.memory_usage().sum() / mib

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type).startswith("int"):
            # First integer width that strictly brackets the observed range;
            # the column is left untouched when none does.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            continue

        if float16_flag and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
            target = np.float16
        elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
            target = np.float32
        else:
            target = np.float64
        df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / mib
    if verbose:
        reduction = 100 * (start_mem - end_mem) / start_mem
        print("Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)".format(end_mem, reduction))
    return df


# Merging by concat to not lose dtypes
def merge_by_concat(df1, df2, merge_on):
    """Left-join ``df2``'s extra columns onto ``df1`` without touching ``df1``'s dtypes.

    Only the key columns of ``df1`` take part in the merge; the new columns
    are then concatenated next to the original ``df1``, so its existing
    columns are never re-created by the merge.

    Args:
        df1: Left frame; every one of its columns is kept as is.
        df2: Right frame providing the additional columns.
        merge_on: List of key column names present in both frames.

    Returns:
        A new DataFrame: ``df1`` followed by the columns of ``df2`` that are
        not in ``merge_on``.
    """
    merged_gf = df1[merge_on].merge(df2, on=merge_on, how="left")
    if len(merged_gf) == len(df1):
        # The merge result carries a fresh 0..n-1 index. Restore df1's index so
        # concat pairs rows positionally instead of outer-joining on mismatched
        # labels when df1 has a non-default index.
        merged_gf.index = df1.index
    new_columns = [col for col in merged_gf.columns if col not in merge_on]
    return pd.concat([df1, merged_gf[new_columns]], axis=1)


def model_predict(algo, idx, test_x):
    """Load the stored model for one store and return its predictions on ``test_x``.

    Args:
        algo: One of ``"lgb"``, ``"ridge"`` or ``"svm"``.
        idx: Index into ``store_list`` selecting the store whose model is loaded.
        test_x: Feature matrix passed to the model's ``predict``.

    Returns:
        Whatever the loaded model's ``predict`` returns.

    Raises:
        ValueError: If ``algo`` is not a supported algorithm name (previously
            this fell through and returned ``None``).
    """
    if algo not in ("lgb", "ridge", "svm"):
        raise ValueError(f"Unknown algo {algo!r}; expected 'lgb', 'ridge' or 'svm'")

    store = store_list[idx]

    if algo == "lgb":
        model = lgb.Booster(model_file=os.path.join(model_dir, f"lgb_{store}.out"))
        return model.predict(test_x, num_iteration=model.best_iteration)

    # ridge and svm models are both joblib dumps named "<algo>_<store>.out".
    # NOTE: joblib.load unpickles the file, so model_dir must be trusted.
    model = joblib.load(os.path.join(model_dir, f"{algo}_{store}.out"))
    return model.predict(test_x)


def get_weights(algo):
    """Collect one weight vector per store in ``store_list`` as an array.

    For ``algo == "lgb"`` the vector is the LightGBM booster's
    ``feature_importance()``; for any other value the ridge model's ``coef_``
    is used.
    """
    use_lgb = algo == "lgb"

    def load_weights(store):
        # One stored model per store, named "<algo>_<store>.out" in model_dir.
        if use_lgb:
            booster = lgb.Booster(model_file=os.path.join(model_dir, f"lgb_{store}.out"))
            return booster.feature_importance()
        ridge = joblib.load(os.path.join(model_dir, f"ridge_{store}.out"))
        return ridge.coef_

    return np.array([load_weights(store) for store in store_list])


def score(real_y, pred_y, sample_weight=None, multioutput="uniform_average"):
    """Return the root-mean-squared error between ``real_y`` and ``pred_y``.

    Args:
        real_y: Ground-truth target values.
        pred_y: Predicted target values.
        sample_weight: Optional per-sample weights forwarded to
            ``mean_squared_error``; defaults to unweighted.
        multioutput: Aggregation mode forwarded to ``mean_squared_error``;
            defaults to averaging over outputs.

    Returns:
        The value of ``mean_squared_error(..., squared=False)``.
    """
    # NOTE(review): recent scikit-learn releases deprecate the ``squared``
    # argument in favour of ``root_mean_squared_error`` - confirm against the
    # pinned scikit-learn version before upgrading.
    return mean_squared_error(real_y, pred_y, sample_weight=sample_weight, multioutput=multioutput, squared=False)


def acquire_data(store, fill_flag=False):
    """Load a store's train/validation pickles and split them into features and target.

    Args:
        store: Store identifier used in the pickle file names.
        fill_flag: When true, read the ``*_fill.pkl`` variants instead.

    Returns:
        ``(train_x, train_y, val_x, val_y)`` as NumPy arrays, where the ``y``
        arrays hold the ``"sales"`` column and the ``x`` arrays hold every
        other column.
    """
    target_col = "sales"
    tag = "_fill" if fill_flag else ""

    train_frame = pd.read_pickle(os.path.join(processed_data_dir, f"train_{store}{tag}.pkl"))
    val_frame = pd.read_pickle(os.path.join(processed_data_dir, f"val_{store}{tag}.pkl"))

    def split(frame):
        # Separate the target column from the remaining feature columns.
        target = frame[target_col]
        features = frame.drop(columns=target_col, axis=1)
        return features, target

    train_features, train_target = split(train_frame)
    val_features, val_target = split(val_frame)

    return (
        train_features.to_numpy(),
        train_target.to_numpy(),
        val_features.to_numpy(),
        val_target.to_numpy(),
    )