import os import pickle import pandas as pd import numpy as np from itertools import product from sklearn.preprocessing import LabelEncoder from sklearn.preprocessing import MinMaxScaler import calendar from .paths import pfs_data_dir from .paths import pfs_split_dir def feature_engineering(): # read data sales = pd.read_csv(os.path.join(pfs_data_dir, "sales_train.csv")) shops = pd.read_csv(os.path.join(pfs_data_dir, "shops.csv")) items = pd.read_csv(os.path.join(pfs_data_dir, "items.csv")) item_cats = pd.read_csv(os.path.join(pfs_data_dir, "item_categories.csv")) test = pd.read_csv(os.path.join(pfs_data_dir, "test.csv")) # remove outliers train = sales[(sales.item_price < 10000) & (sales.item_price > 0)] train = train[sales.item_cnt_day < 1001] print(train.shape, sales.shape) print(train.tail(5)) print(sales.tail(5)) # combine shops with different id but the same name train.loc[train.shop_id == 0, "shop_id"] = 57 test.loc[test.shop_id == 0, "shop_id"] = 57 train.loc[train.shop_id == 1, "shop_id"] = 58 test.loc[test.shop_id == 1, "shop_id"] = 58 train.loc[train.shop_id == 40, "shop_id"] = 39 test.loc[test.shop_id == 40, "shop_id"] = 39 # obtain shop_id, item_id, month information index_cols = ["shop_id", "item_id", "date_block_num"] df = [] for block_num in train["date_block_num"].unique(): cur_shops = train.loc[sales["date_block_num"] == block_num, "shop_id"].unique() cur_items = train.loc[sales["date_block_num"] == block_num, "item_id"].unique() df.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])), dtype="int32")) df = pd.DataFrame(np.vstack(df), columns=index_cols, dtype=np.int32) print("df.shape: ", df.shape) print(df.head(5)) # Add month sales group = train.groupby(["date_block_num", "shop_id", "item_id"]).agg({"item_cnt_day": ["sum"]}) group.columns = ["item_cnt_month"] group.reset_index(inplace=True) print("group.shape: ", group.shape) print(group.head(5)) df = pd.merge(df, group, on=index_cols, how="left") df["item_cnt_month"] = ( df["item_cnt_month"] .fillna(0) .astype(np.float32) # df['item_cnt_month'].fillna(0).clip(0, 20).astype(np.float32) ) # fill test data test["date_block_num"] = 34 test["date_block_num"] = test["date_block_num"].astype(np.int8) test["shop_id"] = test["shop_id"].astype(np.int8) test["item_id"] = test["item_id"].astype(np.int16) df = pd.concat([df, test], ignore_index=True, sort=False, keys=index_cols) df.fillna(0, inplace=True) # shop location features shops["city"] = shops["shop_name"].apply(lambda x: x.split()[0].lower()) shops.loc[shops.city == "!якутск", "city"] = "якутск" shops["city_code"] = LabelEncoder().fit_transform(shops["city"]) coords = dict() coords["якутск"] = (62.028098, 129.732555, 4) coords["адыгея"] = (44.609764, 40.100516, 3) coords["балашиха"] = (55.8094500, 37.9580600, 1) coords["волжский"] = (53.4305800, 50.1190000, 3) coords["вологда"] = (59.2239000, 39.8839800, 2) coords["воронеж"] = (51.6720400, 39.1843000, 3) coords["выездная"] = (0, 0, 0) coords["жуковский"] = (55.5952800, 38.1202800, 1) coords["интернет-магазин"] = (0, 0, 0) coords["казань"] = (55.7887400, 49.1221400, 4) coords["калуга"] = (54.5293000, 36.2754200, 4) coords["коломна"] = (55.0794400, 38.7783300, 4) coords["красноярск"] = (56.0183900, 92.8671700, 4) coords["курск"] = (51.7373300, 36.1873500, 3) coords["москва"] = (55.7522200, 37.6155600, 1) coords["мытищи"] = (55.9116300, 37.7307600, 1) coords["н.новгород"] = (56.3286700, 44.0020500, 4) coords["новосибирск"] = (55.0415000, 82.9346000, 4) coords["омск"] = (54.9924400, 73.3685900, 4) coords["ростовнадону"] = (47.2313500, 39.7232800, 3) coords["спб"] = (59.9386300, 30.3141300, 2) coords["самара"] = (53.2000700, 50.1500000, 4) coords["сергиев"] = (56.3000000, 38.1333300, 4) coords["сургут"] = (61.2500000, 73.4166700, 4) coords["томск"] = (56.4977100, 84.9743700, 4) coords["тюмень"] = (57.1522200, 65.5272200, 4) coords["уфа"] = (54.7430600, 55.9677900, 4) coords["химки"] = (55.8970400, 37.4296900, 1) coords["цифровой"] = (0, 0, 0) coords["чехов"] = (55.1477000, 37.4772800, 4) coords["ярославль"] = (57.6298700, 39.8736800, 2) shops["city_coord_1"] = shops["city"].apply(lambda x: coords[x][0]) shops["city_coord_2"] = shops["city"].apply(lambda x: coords[x][1]) shops["country_part"] = shops["city"].apply(lambda x: coords[x][2]) shops = shops[["shop_id", "city_code", "city_coord_1", "city_coord_2", "country_part"]] df = pd.merge(df, shops, on=["shop_id"], how="left") # process items category name map_dict = { "Чистые носители (штучные)": "Чистые носители", "Чистые носители (шпиль)": "Чистые носители", "PC ": "Аксессуары", "Служебные": "Служебные ", } items = pd.merge(items, item_cats, on="item_category_id") items["item_category"] = items["item_category_name"].apply(lambda x: x.split("-")[0]) items["item_category"] = items["item_category"].apply(lambda x: map_dict[x] if x in map_dict.keys() else x) items["item_category_common"] = LabelEncoder().fit_transform(items["item_category"]) items["item_category_code"] = LabelEncoder().fit_transform(items["item_category_name"]) items = items[["item_id", "item_category_common", "item_category_code"]] df = pd.merge(df, items, on=["item_id"], how="left") # Weekends count / number of days in a month def count_days(date_block_num): year = 2013 + date_block_num // 12 month = 1 + date_block_num % 12 weeknd_count = len([1 for i in calendar.monthcalendar(year, month) if i[6] != 0]) days_in_month = calendar.monthrange(year, month)[1] return weeknd_count, days_in_month, month map_dict = {i: count_days(i) for i in range(35)} df["weeknd_count"] = df["date_block_num"].apply(lambda x: map_dict[x][0]) df["days_in_month"] = df["date_block_num"].apply(lambda x: map_dict[x][1]) # Interation features: Item is new / Item was bought in this shop before first_item_block = df.groupby(["item_id"])["date_block_num"].min().reset_index() first_item_block["item_first_interaction"] = 1 first_shop_item_buy_block = ( df[df["date_block_num"] > 0].groupby(["shop_id", "item_id"])["date_block_num"].min().reset_index() ) first_shop_item_buy_block["first_date_block_num"] = first_shop_item_buy_block["date_block_num"] df = pd.merge( df, first_item_block[["item_id", "date_block_num", "item_first_interaction"]], on=["item_id", "date_block_num"], how="left", ) df = pd.merge( df, first_shop_item_buy_block[["item_id", "shop_id", "first_date_block_num"]], on=["item_id", "shop_id"], how="left", ) df["first_date_block_num"].fillna(100, inplace=True) df["shop_item_sold_before"] = (df["first_date_block_num"] < df["date_block_num"]).astype("int8") df.drop(["first_date_block_num"], axis=1, inplace=True) df["item_first_interaction"].fillna(0, inplace=True) df["shop_item_sold_before"].fillna(0, inplace=True) df["item_first_interaction"] = df["item_first_interaction"].astype("int8") df["shop_item_sold_before"] = df["shop_item_sold_before"].astype("int8") def lag_feature(df, lags, col): tmp = df[["date_block_num", "shop_id", "item_id", col]] for i in lags: shifted = tmp.copy() shifted.columns = ["date_block_num", "shop_id", "item_id", col + "_lag_" + str(i)] shifted["date_block_num"] += i df = pd.merge(df, shifted, on=["date_block_num", "shop_id", "item_id"], how="left") lag_name = col + "_lag_" + str(i) df[lag_name] = df[lag_name].astype("float32") return df df = lag_feature(df, [1, 2, 3], "item_cnt_month") index_cols = ["shop_id", "item_id", "date_block_num"] group = ( train.groupby(index_cols)["item_price"] .mean() .reset_index() .rename(columns={"item_price": "avg_shop_price"}, errors="raise") ) df = pd.merge(df, group, on=index_cols, how="left") df["avg_shop_price"] = df["avg_shop_price"].fillna(0).astype(np.float32) index_cols = ["item_id", "date_block_num"] group = ( train.groupby(["date_block_num", "item_id"])["item_price"] .mean() .reset_index() .rename(columns={"item_price": "avg_item_price"}, errors="raise") ) df = pd.merge(df, group, on=index_cols, how="left") df["avg_item_price"] = df["avg_item_price"].fillna(0).astype(np.float32) df["item_shop_price_avg"] = (df["avg_shop_price"] - df["avg_item_price"]) / df["avg_item_price"] df["item_shop_price_avg"].fillna(0, inplace=True) df = lag_feature(df, [1, 2, 3], "item_shop_price_avg") df.drop(["avg_shop_price", "avg_item_price", "item_shop_price_avg"], axis=1, inplace=True) item_id_target_mean = ( df.groupby(["date_block_num", "item_id"])["item_cnt_month"] .mean() .reset_index() .rename(columns={"item_cnt_month": "item_target_enc"}, errors="raise") ) df = pd.merge(df, item_id_target_mean, on=["date_block_num", "item_id"], how="left") df["item_target_enc"] = df["item_target_enc"].fillna(0).astype(np.float32) df = lag_feature(df, [1, 2, 3], "item_target_enc") df.drop(["item_target_enc"], axis=1, inplace=True) item_id_target_mean = ( df.groupby(["date_block_num", "item_id", "city_code"])["item_cnt_month"] .mean() .reset_index() .rename(columns={"item_cnt_month": "item_loc_target_enc"}, errors="raise") ) df = pd.merge(df, item_id_target_mean, on=["date_block_num", "item_id", "city_code"], how="left") df["item_loc_target_enc"] = df["item_loc_target_enc"].fillna(0).astype(np.float32) df = lag_feature(df, [1, 2, 3], "item_loc_target_enc") df.drop(["item_loc_target_enc"], axis=1, inplace=True) item_id_target_mean = ( df.groupby(["date_block_num", "item_id", "shop_id"])["item_cnt_month"] .mean() .reset_index() .rename(columns={"item_cnt_month": "item_shop_target_enc"}, errors="raise") ) df = pd.merge(df, item_id_target_mean, on=["date_block_num", "item_id", "shop_id"], how="left") df["item_shop_target_enc"] = df["item_shop_target_enc"].fillna(0).astype(np.float32) df = lag_feature(df, [1, 2, 3], "item_shop_target_enc") df.drop(["item_shop_target_enc"], axis=1, inplace=True) item_id_target_mean = ( df[df["item_first_interaction"] == 1] .groupby(["date_block_num", "item_category_code"])["item_cnt_month"] .mean() .reset_index() .rename(columns={"item_cnt_month": "new_item_cat_avg"}, errors="raise") ) df = pd.merge(df, item_id_target_mean, on=["date_block_num", "item_category_code"], how="left") df["new_item_cat_avg"] = df["new_item_cat_avg"].fillna(0).astype(np.float32) df = lag_feature(df, [1, 2, 3], "new_item_cat_avg") df.drop(["new_item_cat_avg"], axis=1, inplace=True) # For new items add avg category sales in a separate store for last 3 months item_id_target_mean = ( df[df["item_first_interaction"] == 1] .groupby(["date_block_num", "item_category_code", "shop_id"])["item_cnt_month"] .mean() .reset_index() .rename(columns={"item_cnt_month": "new_item_shop_cat_avg"}, errors="raise") ) df = pd.merge(df, item_id_target_mean, on=["date_block_num", "item_category_code", "shop_id"], how="left") df["new_item_shop_cat_avg"] = df["new_item_shop_cat_avg"].fillna(0).astype(np.float32) df = lag_feature(df, [1, 2, 3], "new_item_shop_cat_avg") df.drop(["new_item_shop_cat_avg"], axis=1, inplace=True) def lag_feature_adv(df, lags, col): tmp = df[["date_block_num", "shop_id", "item_id", col]] for i in lags: shifted = tmp.copy() shifted.columns = ["date_block_num", "shop_id", "item_id", col + "_lag_" + str(i) + "_adv"] shifted["date_block_num"] += i shifted["item_id"] -= 1 df = pd.merge(df, shifted, on=["date_block_num", "shop_id", "item_id"], how="left") lag_name = col + "_lag_" + str(i) + "_adv" df[lag_name] = df[lag_name].astype("float32") return df df = lag_feature_adv(df, [1, 2, 3], "item_cnt_month") # df.fillna(0, inplace=True) df = df[(df["date_block_num"] > 2)] df.drop(["ID"], axis=1, inplace=True, errors="ignore") print(df.shape) print(df.columns) print(df.head(10)) fill_dict = {} for col in df.columns: fill_dict[col] = df[col].mean() group_df = df.groupby(["shop_id"]) for shop_id, shop_df in group_df: # remove data of data_block_num=34, i.e., 2015.11 # this is test set in competition shop_df = shop_df[shop_df.date_block_num <= 33] # fill the null cols = shop_df.isnull().any() idx = list(cols[cols.values].index) shop_df[idx] = shop_df.groupby("item_id", sort=False)[idx].apply( lambda x: x.fillna(method="ffill").fillna(method="bfill") ) shop_df[idx] = shop_df[idx].fillna(shop_df[idx].mean()) for col in idx: shop_df[col] = shop_df[col].fillna(fill_dict[col]) # min-max scale drop_fea_list = [ "shop_id", "city_code", "city_coord_1", "city_coord_2", "country_part", "item_cnt_month", "date_block_num", ] fea_list = [col for col in shop_df.columns if col not in drop_fea_list] mms = MinMaxScaler() shop_df[fea_list] = mms.fit_transform(shop_df[fea_list]) shop_df = shop_df[fea_list + ["item_cnt_month", "date_block_num"]] date_split = 29 split = False while split is False: df1 = shop_df[shop_df["date_block_num"] <= date_split] df2 = shop_df[shop_df["date_block_num"] > date_split] if df2.shape[0] > 0 and df1.shape[0] > 0: split = True else: date_split -= 1 if date_split < 0: break if split is True: print("ShopID:{}, split block:{}".format(shop_id, date_split)) print(df1.shape, df2.shape) # save train csv fpath = os.path.join(pfs_split_dir, "Shop{:0>2d}-train.csv".format(shop_id)) df1.to_csv(fpath, index=False) # save val csv fpath = os.path.join(pfs_split_dir, "Shop{:0>2d}-val.csv".format(shop_id)) df2.to_csv(fpath, index=False)