You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

utils.py 6.1 kB

3 years ago
3 years ago
3 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177
  1. from math import gamma
  2. from tkinter import Y
  3. import joblib
  4. from tqdm import tqdm
  5. import numpy as np
  6. import pandas as pd
  7. import lightgbm as lgb
  8. from sklearn.svm import SVR
  9. from sklearn.metrics import mean_squared_error
  10. from sklearn.metrics.pairwise import rbf_kernel
  11. import os, sys, gc, time, warnings, pickle, psutil, random
  12. import matplotlib.pyplot as plt
  13. from mpl_toolkits.axes_grid1 import make_axes_locatable
  14. from .config import *
  15. class AuxiliarySVR:
  16. def __init__(
  17. self, C, epsilon, gamma, adaptation_model=[], max_iter=30000, cache_size=10240, verbose=False, K1=None, K2=None
  18. ):
  19. self.gamma = gamma
  20. self.adaptation_model = adaptation_model
  21. self.model = SVR(
  22. C=C,
  23. epsilon=epsilon,
  24. kernel=self.auxiliary_rbf_kernel,
  25. max_iter=max_iter,
  26. cache_size=cache_size,
  27. verbose=verbose,
  28. )
  29. self.K1 = K1
  30. self.K2 = K2
  31. def auxiliary_rbf_kernel(self, X1, X2):
  32. if self.K1 is not None:
  33. if X1.shape[0] == X2.shape[0]:
  34. return self.K1[-X1.shape[0] :, -X2.shape[0] :]
  35. else:
  36. return self.K2[-X1.shape[0] :, -X2.shape[0] :]
  37. else:
  38. K = np.zeros((len(X1), len(X2)))
  39. for algo, idx in self.adaptation_model:
  40. Y1 = model_predict(algo, idx, X1).reshape(-1, 1)
  41. Y2 = model_predict(algo, idx, X2).reshape(-1, 1)
  42. K += Y1 @ Y2.T
  43. K += rbf_kernel(X1, X2, self.gamma)
  44. return K
  45. def fit(self, X, Y):
  46. self.gamma = 1 / X.shape[1]
  47. self.model.fit(X, Y)
  48. def predict(self, X):
  49. return self.model.predict(X)
  50. def measure_aux_algo(idx, test_sample, model):
  51. """
  52. model = ("lgb", 1)
  53. """
  54. store = store_list[idx]
  55. org_train_x, org_train_y, val_x, val_y = acquire_data(store, True)
  56. pred_y = model_predict(model[0], model[1], val_x[-test_sample:])
  57. return score(pred_y, val_y[-test_sample:])
  58. # Simple "Memory profilers" to see memory usage
  59. def get_memory_usage():
  60. return np.round(psutil.Process(os.getpid()).memory_info()[0] / 2.0**30, 2)
  61. def sizeof_fmt(num, suffix="B"):
  62. for unit in ["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"]:
  63. if abs(num) < 1024.0:
  64. return "%3.1f%s%s" % (num, unit, suffix)
  65. num /= 1024.0
  66. return "%.1f%s%s" % (num, "Yi", suffix)
  67. # Memory Reducer
  68. def reduce_mem_usage(df, float16_flag=True, verbose=True):
  69. numerics = ["int16", "int32", "int64", "float16", "float32", "float64"]
  70. start_mem = df.memory_usage().sum() / 1024**2
  71. for col in df.columns:
  72. col_type = df[col].dtypes
  73. if col_type in numerics:
  74. c_min = df[col].min()
  75. c_max = df[col].max()
  76. if str(col_type)[:3] == "int":
  77. if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
  78. df[col] = df[col].astype(np.int8)
  79. elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
  80. df[col] = df[col].astype(np.int16)
  81. elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
  82. df[col] = df[col].astype(np.int32)
  83. elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
  84. df[col] = df[col].astype(np.int64)
  85. else:
  86. if float16_flag and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
  87. df[col] = df[col].astype(np.float16)
  88. elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
  89. df[col] = df[col].astype(np.float32)
  90. else:
  91. df[col] = df[col].astype(np.float64)
  92. end_mem = df.memory_usage().sum() / 1024**2
  93. if verbose:
  94. print(
  95. "Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)".format(
  96. end_mem, 100 * (start_mem - end_mem) / start_mem
  97. )
  98. )
  99. return df
  100. # Merging by concat to not lose dtypes
  101. def merge_by_concat(df1, df2, merge_on):
  102. merged_gf = df1[merge_on]
  103. merged_gf = merged_gf.merge(df2, on=merge_on, how="left")
  104. new_columns = [col for col in list(merged_gf) if col not in merge_on]
  105. df1 = pd.concat([df1, merged_gf[new_columns]], axis=1)
  106. return df1
  107. def model_predict(algo, idx, test_x):
  108. store = store_list[idx]
  109. if algo == "lgb":
  110. model = lgb.Booster(model_file=os.path.join(model_dir, f"lgb_{store}.out"))
  111. return model.predict(test_x, num_iteration=model.best_iteration)
  112. elif algo == "ridge":
  113. model = joblib.load(os.path.join(model_dir, f"ridge_{store}.out"))
  114. return model.predict(test_x)
  115. elif algo == "svm":
  116. model = joblib.load(os.path.join(model_dir, f"svm_{store}.out"))
  117. return model.predict(test_x)
  118. def get_weights(algo):
  119. weights = []
  120. if algo == "lgb":
  121. for store in store_list:
  122. model = lgb.Booster(model_file=os.path.join(model_dir, f"lgb_{store}.out"))
  123. weights.append(model.feature_importance())
  124. else:
  125. for store in store_list:
  126. model = joblib.load(os.path.join(model_dir, f"ridge_{store}.out"))
  127. weights.append(model.coef_)
  128. return np.array(weights)
  129. def score(real_y, pred_y, sample_weight, multioutput):
  130. return mean_squared_error(real_y, pred_y, sample_weight=sample_weight, multioutput=multioutput, squared=False)
  131. def acquire_data(store, fill_flag=False):
  132. TARGET = "sales"
  133. suffix = f"_fill" if fill_flag else ""
  134. train = pd.read_pickle(os.path.join(processed_data_dir, f"train_{store}{suffix}.pkl"))
  135. val = pd.read_pickle(os.path.join(processed_data_dir, f"val_{store}{suffix}.pkl"))
  136. train_y = train[TARGET]
  137. train_x = train.drop(columns=TARGET, axis=1)
  138. val_y = val[TARGET]
  139. val_x = val.drop(columns=TARGET, axis=1)
  140. train_x = train_x.to_numpy()
  141. train_y = train_y.to_numpy()
  142. val_x = val_x.to_numpy()
  143. val_y = val_y.to_numpy()
  144. return train_x, train_y, val_x, val_y

基于学件范式,全流程地支持学件上传、检测、组织、查搜、部署和复用等功能。同时,该仓库作为北冥坞系统的引擎,支撑北冥坞系统的核心功能。