
split_data.py 16 kB

import os
import pickle
import pandas as pd
import numpy as np
from itertools import product
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import calendar

from .paths import pfs_data_dir
from .paths import pfs_split_dir
def feature_engineering():
    # read data
    sales = pd.read_csv(os.path.join(pfs_data_dir, "sales_train.csv"))
    shops = pd.read_csv(os.path.join(pfs_data_dir, "shops.csv"))
    items = pd.read_csv(os.path.join(pfs_data_dir, "items.csv"))
    item_cats = pd.read_csv(os.path.join(pfs_data_dir, "item_categories.csv"))
    test = pd.read_csv(os.path.join(pfs_data_dir, "test.csv"))

    # remove outliers (copy so the .loc assignments below modify a real frame)
    train = sales[(sales.item_price < 10000) & (sales.item_price > 0)]
    train = train[train.item_cnt_day < 1001].copy()
    print(train.shape, sales.shape)
    print(train.tail(5))
    print(sales.tail(5))
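    # Note: the thresholds above are empirical caps for this dataset; rows
    # with non-positive prices or extreme daily counts are treated as
    # recording errors and dropped.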
    # combine shops with different id but the same name
    train.loc[train.shop_id == 0, "shop_id"] = 57
    test.loc[test.shop_id == 0, "shop_id"] = 57
    train.loc[train.shop_id == 1, "shop_id"] = 58
    test.loc[test.shop_id == 1, "shop_id"] = 58
    train.loc[train.shop_id == 40, "shop_id"] = 39
    test.loc[test.shop_id == 40, "shop_id"] = 39
    # build the (shop_id, item_id, month) grid
    index_cols = ["shop_id", "item_id", "date_block_num"]
    df = []
    for block_num in train["date_block_num"].unique():
        cur_shops = train.loc[train["date_block_num"] == block_num, "shop_id"].unique()
        cur_items = train.loc[train["date_block_num"] == block_num, "item_id"].unique()
        df.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])), dtype="int32"))
    df = pd.DataFrame(np.vstack(df), columns=index_cols, dtype=np.int32)
    print("df.shape: ", df.shape)
    print(df.head(5))
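    # The grid deliberately contains every (shop, item) pair active in a
    # month, not just pairs with recorded sales, so months without sales
    # become explicit zero targets after the left merge below.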
    # add monthly sales
    group = train.groupby(["date_block_num", "shop_id", "item_id"]).agg({"item_cnt_day": ["sum"]})
    group.columns = ["item_cnt_month"]
    group.reset_index(inplace=True)
    print("group.shape: ", group.shape)
    print(group.head(5))
    df = pd.merge(df, group, on=index_cols, how="left")
    df["item_cnt_month"] = (
        df["item_cnt_month"]
        .fillna(0)
        .astype(np.float32)
        # df['item_cnt_month'].fillna(0).clip(0, 20).astype(np.float32)
    )
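    # item_cnt_month is the regression target: daily counts summed per
    # (month, shop, item). The commented-out clip(0, 20) variant mirrors the
    # target clipping used by the Predict Future Sales competition metric.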
    # append the test rows as month 34 (keys= was dropped: it has no effect
    # when ignore_index=True)
    test["date_block_num"] = 34
    test["date_block_num"] = test["date_block_num"].astype(np.int8)
    test["shop_id"] = test["shop_id"].astype(np.int8)
    test["item_id"] = test["item_id"].astype(np.int16)
    df = pd.concat([df, test], ignore_index=True, sort=False)
    df.fillna(0, inplace=True)
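    # date_block_num runs 0..33 over the training months; block 34 is the
    # held-out test month (November 2015), so its item_cnt_month stays at 0.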
    # shop location features
    shops["city"] = shops["shop_name"].apply(lambda x: x.split()[0].lower())
    shops.loc[shops.city == "!якутск", "city"] = "якутск"
    shops["city_code"] = LabelEncoder().fit_transform(shops["city"])
    # (latitude, longitude, country-part code); (0, 0, 0) marks non-physical
    # outlets such as the online store
    coords = {
        "якутск": (62.028098, 129.732555, 4),
        "адыгея": (44.609764, 40.100516, 3),
        "балашиха": (55.8094500, 37.9580600, 1),
        "волжский": (53.4305800, 50.1190000, 3),
        "вологда": (59.2239000, 39.8839800, 2),
        "воронеж": (51.6720400, 39.1843000, 3),
        "выездная": (0, 0, 0),
        "жуковский": (55.5952800, 38.1202800, 1),
        "интернет-магазин": (0, 0, 0),
        "казань": (55.7887400, 49.1221400, 4),
        "калуга": (54.5293000, 36.2754200, 4),
        "коломна": (55.0794400, 38.7783300, 4),
        "красноярск": (56.0183900, 92.8671700, 4),
        "курск": (51.7373300, 36.1873500, 3),
        "москва": (55.7522200, 37.6155600, 1),
        "мытищи": (55.9116300, 37.7307600, 1),
        "н.новгород": (56.3286700, 44.0020500, 4),
        "новосибирск": (55.0415000, 82.9346000, 4),
        "омск": (54.9924400, 73.3685900, 4),
        "ростовнадону": (47.2313500, 39.7232800, 3),
        "спб": (59.9386300, 30.3141300, 2),
        "самара": (53.2000700, 50.1500000, 4),
        "сергиев": (56.3000000, 38.1333300, 4),
        "сургут": (61.2500000, 73.4166700, 4),
        "томск": (56.4977100, 84.9743700, 4),
        "тюмень": (57.1522200, 65.5272200, 4),
        "уфа": (54.7430600, 55.9677900, 4),
        "химки": (55.8970400, 37.4296900, 1),
        "цифровой": (0, 0, 0),
        "чехов": (55.1477000, 37.4772800, 4),
        "ярославль": (57.6298700, 39.8736800, 2),
    }
    shops["city_coord_1"] = shops["city"].apply(lambda x: coords[x][0])
    shops["city_coord_2"] = shops["city"].apply(lambda x: coords[x][1])
    shops["country_part"] = shops["city"].apply(lambda x: coords[x][2])
    shops = shops[["shop_id", "city_code", "city_coord_1", "city_coord_2", "country_part"]]
    df = pd.merge(df, shops, on=["shop_id"], how="left")
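    # city_code is a label-encoded city id; the raw coordinates let a model
    # pick up geographic proximity between shops, while country_part appears
    # to bucket cities into coarse regions (values 1-4).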
    # normalize item category names
    map_dict = {
        "Чистые носители (штучные)": "Чистые носители",
        "Чистые носители (шпиль)": "Чистые носители",
        "PC ": "Аксессуары",
        "Служебные": "Служебные ",
    }
    items = pd.merge(items, item_cats, on="item_category_id")
    items["item_category"] = items["item_category_name"].apply(lambda x: x.split("-")[0])
    items["item_category"] = items["item_category"].apply(lambda x: map_dict[x] if x in map_dict else x)
    items["item_category_common"] = LabelEncoder().fit_transform(items["item_category"])
    items["item_category_code"] = LabelEncoder().fit_transform(items["item_category_name"])
    items = items[["item_id", "item_category_common", "item_category_code"]]
    df = pd.merge(df, items, on=["item_id"], how="left")
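    # item_category_common is the coarse category (text before the first "-",
    # with near-duplicates merged via map_dict), while item_category_code
    # encodes the full original category name.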
    # Sundays count (weekend proxy) / number of days in a month
    def count_days(date_block_num):
        year = 2013 + date_block_num // 12
        month = 1 + date_block_num % 12
        # calendar.monthcalendar rows run Mon..Sun, so index 6 is Sunday;
        # this counts the Sundays in the month
        weeknd_count = len([1 for i in calendar.monthcalendar(year, month) if i[6] != 0])
        days_in_month = calendar.monthrange(year, month)[1]
        return weeknd_count, days_in_month, month

    map_dict = {i: count_days(i) for i in range(35)}
    df["weeknd_count"] = df["date_block_num"].apply(lambda x: map_dict[x][0])
    df["days_in_month"] = df["date_block_num"].apply(lambda x: map_dict[x][1])
    # interaction features: item is new / item was bought in this shop before
    first_item_block = df.groupby(["item_id"])["date_block_num"].min().reset_index()
    first_item_block["item_first_interaction"] = 1
    first_shop_item_buy_block = (
        df[df["date_block_num"] > 0].groupby(["shop_id", "item_id"])["date_block_num"].min().reset_index()
    )
    first_shop_item_buy_block["first_date_block_num"] = first_shop_item_buy_block["date_block_num"]
    df = pd.merge(
        df,
        first_item_block[["item_id", "date_block_num", "item_first_interaction"]],
        on=["item_id", "date_block_num"],
        how="left",
    )
    df = pd.merge(
        df,
        first_shop_item_buy_block[["item_id", "shop_id", "first_date_block_num"]],
        on=["item_id", "shop_id"],
        how="left",
    )
    df["first_date_block_num"] = df["first_date_block_num"].fillna(100)
    df["shop_item_sold_before"] = (df["first_date_block_num"] < df["date_block_num"]).astype("int8")
    df.drop(["first_date_block_num"], axis=1, inplace=True)
    df["item_first_interaction"] = df["item_first_interaction"].fillna(0).astype("int8")
    def lag_feature(df, lags, col):
        # merge col shifted forward by each lag, creating <col>_lag_<i>
        tmp = df[["date_block_num", "shop_id", "item_id", col]]
        for i in lags:
            shifted = tmp.copy()
            shifted.columns = ["date_block_num", "shop_id", "item_id", col + "_lag_" + str(i)]
            shifted["date_block_num"] += i
            df = pd.merge(df, shifted, on=["date_block_num", "shop_id", "item_id"], how="left")
            lag_name = col + "_lag_" + str(i)
            df[lag_name] = df[lag_name].astype("float32")
        return df

    df = lag_feature(df, [1, 2, 3], "item_cnt_month")
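    # e.g. item_cnt_month_lag_1 at month t holds the pair's sales at t-1;
    # pairs absent from the grid in the source month come back as NaN.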
    # relative price features
    index_cols = ["shop_id", "item_id", "date_block_num"]
    group = (
        train.groupby(index_cols)["item_price"]
        .mean()
        .reset_index()
        .rename(columns={"item_price": "avg_shop_price"}, errors="raise")
    )
    df = pd.merge(df, group, on=index_cols, how="left")
    df["avg_shop_price"] = df["avg_shop_price"].fillna(0).astype(np.float32)

    index_cols = ["item_id", "date_block_num"]
    group = (
        train.groupby(["date_block_num", "item_id"])["item_price"]
        .mean()
        .reset_index()
        .rename(columns={"item_price": "avg_item_price"}, errors="raise")
    )
    df = pd.merge(df, group, on=index_cols, how="left")
    df["avg_item_price"] = df["avg_item_price"].fillna(0).astype(np.float32)

    df["item_shop_price_avg"] = (df["avg_shop_price"] - df["avg_item_price"]) / df["avg_item_price"]
    df["item_shop_price_avg"] = df["item_shop_price_avg"].fillna(0)
    df = lag_feature(df, [1, 2, 3], "item_shop_price_avg")
    df.drop(["avg_shop_price", "avg_item_price", "item_shop_price_avg"], axis=1, inplace=True)
    # target encoding by item, lagged to avoid leakage
    item_id_target_mean = (
        df.groupby(["date_block_num", "item_id"])["item_cnt_month"]
        .mean()
        .reset_index()
        .rename(columns={"item_cnt_month": "item_target_enc"}, errors="raise")
    )
    df = pd.merge(df, item_id_target_mean, on=["date_block_num", "item_id"], how="left")
    df["item_target_enc"] = df["item_target_enc"].fillna(0).astype(np.float32)
    df = lag_feature(df, [1, 2, 3], "item_target_enc")
    df.drop(["item_target_enc"], axis=1, inplace=True)
    # target encoding by item and city
    item_id_target_mean = (
        df.groupby(["date_block_num", "item_id", "city_code"])["item_cnt_month"]
        .mean()
        .reset_index()
        .rename(columns={"item_cnt_month": "item_loc_target_enc"}, errors="raise")
    )
    df = pd.merge(df, item_id_target_mean, on=["date_block_num", "item_id", "city_code"], how="left")
    df["item_loc_target_enc"] = df["item_loc_target_enc"].fillna(0).astype(np.float32)
    df = lag_feature(df, [1, 2, 3], "item_loc_target_enc")
    df.drop(["item_loc_target_enc"], axis=1, inplace=True)
    # target encoding by item and shop
    item_id_target_mean = (
        df.groupby(["date_block_num", "item_id", "shop_id"])["item_cnt_month"]
        .mean()
        .reset_index()
        .rename(columns={"item_cnt_month": "item_shop_target_enc"}, errors="raise")
    )
    df = pd.merge(df, item_id_target_mean, on=["date_block_num", "item_id", "shop_id"], how="left")
    df["item_shop_target_enc"] = df["item_shop_target_enc"].fillna(0).astype(np.float32)
    df = lag_feature(df, [1, 2, 3], "item_shop_target_enc")
    df.drop(["item_shop_target_enc"], axis=1, inplace=True)
    # for new items, add avg category sales (lagged)
    item_id_target_mean = (
        df[df["item_first_interaction"] == 1]
        .groupby(["date_block_num", "item_category_code"])["item_cnt_month"]
        .mean()
        .reset_index()
        .rename(columns={"item_cnt_month": "new_item_cat_avg"}, errors="raise")
    )
    df = pd.merge(df, item_id_target_mean, on=["date_block_num", "item_category_code"], how="left")
    df["new_item_cat_avg"] = df["new_item_cat_avg"].fillna(0).astype(np.float32)
    df = lag_feature(df, [1, 2, 3], "new_item_cat_avg")
    df.drop(["new_item_cat_avg"], axis=1, inplace=True)
    # for new items, add avg category sales in each separate shop for the last 3 months
    item_id_target_mean = (
        df[df["item_first_interaction"] == 1]
        .groupby(["date_block_num", "item_category_code", "shop_id"])["item_cnt_month"]
        .mean()
        .reset_index()
        .rename(columns={"item_cnt_month": "new_item_shop_cat_avg"}, errors="raise")
    )
    df = pd.merge(df, item_id_target_mean, on=["date_block_num", "item_category_code", "shop_id"], how="left")
    df["new_item_shop_cat_avg"] = df["new_item_shop_cat_avg"].fillna(0).astype(np.float32)
    df = lag_feature(df, [1, 2, 3], "new_item_shop_cat_avg")
    df.drop(["new_item_shop_cat_avg"], axis=1, inplace=True)
    def lag_feature_adv(df, lags, col):
        # like lag_feature, but the lag is taken from the neighbouring item
        # (item_id + 1): after shifting item_id down by one, the merge gives
        # each item the lagged value of the next item id
        tmp = df[["date_block_num", "shop_id", "item_id", col]]
        for i in lags:
            shifted = tmp.copy()
            shifted.columns = ["date_block_num", "shop_id", "item_id", col + "_lag_" + str(i) + "_adv"]
            shifted["date_block_num"] += i
            shifted["item_id"] -= 1
            df = pd.merge(df, shifted, on=["date_block_num", "shop_id", "item_id"], how="left")
            lag_name = col + "_lag_" + str(i) + "_adv"
            df[lag_name] = df[lag_name].astype("float32")
        return df

    df = lag_feature_adv(df, [1, 2, 3], "item_cnt_month")
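    # adjacent item ids in this dataset often belong to the same product
    # line, so the neighbour's recent sales can act as a weak similarity
    # signal for items with little history of their own.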
    # df.fillna(0, inplace=True)
    # drop the first three months, whose 1-3 month lag features are undefined
    df = df[(df["date_block_num"] > 2)]
    df.drop(["ID"], axis=1, inplace=True, errors="ignore")
    print(df.shape)
    print(df.columns)
    print(df.head(10))
    fill_dict = {}
    for col in df.columns:
        fill_dict[col] = df[col].mean()

    group_df = df.groupby("shop_id")
    for shop_id, shop_df in group_df:
        # remove data of date_block_num=34, i.e. 2015.11,
        # which is the test set in the competition
        shop_df = shop_df[shop_df.date_block_num <= 33].copy()
        # fill the nulls: forward/backward fill within each item, then the
        # shop-level mean, then the global mean from fill_dict
        cols = shop_df.isnull().any()
        idx = list(cols[cols.values].index)
        shop_df[idx] = shop_df.groupby("item_id", sort=False)[idx].apply(
            lambda x: x.ffill().bfill()
        )
        shop_df[idx] = shop_df[idx].fillna(shop_df[idx].mean())
        for col in idx:
            shop_df[col] = shop_df[col].fillna(fill_dict[col])
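        # fill_dict holds column means over the full frame, serving as a
        # last-resort fallback for items that are entirely NaN within a shop.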
        # min-max scale every feature column; ids, coordinates, the target
        # and the month index are left unscaled
        drop_fea_list = [
            "shop_id",
            "city_code",
            "city_coord_1",
            "city_coord_2",
            "country_part",
            "item_cnt_month",
            "date_block_num",
        ]
        fea_list = [col for col in shop_df.columns if col not in drop_fea_list]
        mms = MinMaxScaler()
        shop_df[fea_list] = mms.fit_transform(shop_df[fea_list])
        shop_df = shop_df[fea_list + ["item_cnt_month", "date_block_num"]]
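        # note that the scaler is fit per shop on all of its months at once,
        # so the train and val partitions below share the same scaling.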
        # split into train/val by month: start at block 29 and walk back
        # until both sides are non-empty
        date_split = 29
        split = False
        while split is False:
            df1 = shop_df[shop_df["date_block_num"] <= date_split]
            df2 = shop_df[shop_df["date_block_num"] > date_split]
            if df2.shape[0] > 0 and df1.shape[0] > 0:
                split = True
            else:
                date_split -= 1
                if date_split < 0:
                    break
        if split is True:
            print("ShopID:{}, split block:{}".format(shop_id, date_split))
            print(df1.shape, df2.shape)
            # save train csv
            fpath = os.path.join(pfs_split_dir, "Shop{:0>2d}-train.csv".format(shop_id))
            df1.to_csv(fpath, index=False)
            # save val csv
            fpath = os.path.join(pfs_split_dir, "Shop{:0>2d}-val.csv".format(shop_id))
            df2.to_csv(fpath, index=False)
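A minimal usage sketch, not part of the file above: because of the relative imports from .paths, the module must run inside its package, so the package name "pfs" below is a placeholder assumption.

# run_split.py -- hypothetical driver script, assuming split_data.py lives in
# a package (here called "pfs") whose paths module defines pfs_data_dir and
# pfs_split_dir
from pfs.split_data import feature_engineering

if __name__ == "__main__":
    # writes Shop{XX}-train.csv / Shop{XX}-val.csv per shop into pfs_split_dir
    feature_engineering()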

Based on the learnware paradigm, this repository supports the full pipeline of learnware upload, detection, organization, search, deployment, and reuse. It also serves as the engine of the Beimingwu system, supporting the system's core functionality.