
delete redundant code

tags/v1.0.0
Yi Huaijie 5 years ago
commit 836d84e9f6
5 changed files with 14 additions and 35 deletions
  1. model_zoo/official/recommend/wide_and_deep/src/datasets.py (+0, -1)
  2. model_zoo/official/recommend/wide_and_deep/src/process_data.py (+14, -30)
  3. model_zoo/official/recommend/wide_and_deep_multitable/src/wide_and_deep.py (+0, -2)
  4. model_zoo/official/recommend/wide_and_deep_multitable/train_and_eval.py (+0, -1)
  5. model_zoo/official/recommend/wide_and_deep_multitable/train_and_eval_distribute.py (+0, -1)

+0 -1  model_zoo/official/recommend/wide_and_deep/src/datasets.py

@@ -230,7 +230,6 @@ def _get_tf_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000,
     ds = ds.map(operations=_padding_func(batch_size, manual_shape, target_column),
                 input_columns=['feat_ids', 'feat_vals', 'label'],
                 column_order=['feat_ids', 'feat_vals', 'label'], num_parallel_workers=8)
-    # if train_mode:
     ds = ds.repeat(epochs)
     return ds




+14 -30  model_zoo/official/recommend/wide_and_deep/src/process_data.py

@@ -33,17 +33,17 @@ class RecommendationDatasetStatsDict():
         self.field_size = 39 # value_1-13; cat_1-26;
         self.val_cols = ["val_{}".format(i+1) for i in range(13)]
         self.cat_cols = ["cat_{}".format(i+1) for i in range(26)]
-        #
         self.val_min_dict = {col: 0 for col in self.val_cols}
         self.val_max_dict = {col: 0 for col in self.val_cols}
         self.cat_count_dict = {col: collections.defaultdict(int) for col in self.cat_cols}
-        #
         self.oov_prefix = "OOV_"

         self.cat2id_dict = {}
         self.cat2id_dict.update({col: i for i, col in enumerate(self.val_cols)})
         self.cat2id_dict.update({self.oov_prefix + col: i + len(self.val_cols) for i, col in enumerate(self.cat_cols)})
-        #
     def stats_vals(self, val_list):
         """vals status"""
         assert len(val_list) == len(self.val_cols)
@@ -54,19 +54,19 @@ class RecommendationDatasetStatsDict():
                 self.val_max_dict[key] = float(val)
             if float(val) < self.val_min_dict[key]:
                 self.val_min_dict[key] = float(val)
-        #
         for i, val in enumerate(val_list):
             map_max_min(i, val)
-        #
     def stats_cats(self, cat_list):
         assert len(cat_list) == len(self.cat_cols)
         def map_cat_count(i, cat):
             key = self.cat_cols[i]
             self.cat_count_dict[key][cat] += 1
-        #
         for i, cat in enumerate(cat_list):
             map_cat_count(i, cat)
-        #
     def save_dict(self, output_path, prefix=""):
         with open(os.path.join(output_path, "{}val_max_dict.pkl".format(prefix)), "wb") as file_wrt:
             pickle.dump(self.val_max_dict, file_wrt)
@@ -74,7 +74,7 @@ class RecommendationDatasetStatsDict():
             pickle.dump(self.val_min_dict, file_wrt)
         with open(os.path.join(output_path, "{}cat_count_dict.pkl".format(prefix)), "wb") as file_wrt:
             pickle.dump(self.cat_count_dict, file_wrt)
-        #
     def load_dict(self, dict_path, prefix=""):
         with open(os.path.join(dict_path, "{}val_max_dict.pkl".format(prefix)), "rb") as file_wrt:
             self.val_max_dict = pickle.load(file_wrt)
@@ -84,27 +84,20 @@ class RecommendationDatasetStatsDict():
             self.cat_count_dict = pickle.load(file_wrt)
         print("val_max_dict.items()[:50]: {}".format(list(self.val_max_dict.items())))
         print("val_min_dict.items()[:50]: {}".format(list(self.val_min_dict.items())))
-        #
-        #

     def get_cat2id(self, threshold=100):
         """get cat to id"""
-        # before_all_count = 0
-        # after_all_count = 0
         for key, cat_count_d in self.cat_count_dict.items():
             new_cat_count_d = dict(filter(lambda x: x[1] > threshold, cat_count_d.items()))
             for cat_str, _ in new_cat_count_d.items():
                 self.cat2id_dict[key + "_" + cat_str] = len(self.cat2id_dict)
-        # print("before_all_count: {}".format(before_all_count)) # before_all_count: 33762577
-        # print("after_all_count: {}".format(after_all_count)) # after_all_count: 184926
         print("cat2id_dict.size: {}".format(len(self.cat2id_dict)))
         print("cat2id_dict.items()[:50]: {}".format(self.cat2id_dict.items()[:50]))
-        #
     def map_cat2id(self, values, cats):
         """map cat to id"""
         def minmax_sclae_value(i, val):
-            # min_v = float(self.val_min_dict["val_{}".format(i+1)])
             max_v = float(self.val_max_dict["val_{}".format(i + 1)])
-            # return (float(val) - min_v) * 1.0 / (max_v - min_v)
             return float(val) * 1.0 / max_v

         id_list = []
@@ -117,7 +110,7 @@ class RecommendationDatasetStatsDict():
             key = "val_{}".format(i + 1)
             id_list.append(self.cat2id_dict[key])
             weight_list.append(minmax_sclae_value(i, float(val)))
-        #
         for i, cat_str in enumerate(cats):
             key = "cat_{}".format(i + 1) + "_" + cat_str
             if key in self.cat2id_dict:
@@ -126,14 +119,12 @@ class RecommendationDatasetStatsDict():
                 id_list.append(self.cat2id_dict[self.oov_prefix + "cat_{}".format(i + 1)])
                 weight_list.append(1.0)
         return id_list, weight_list
-    #


 def mkdir_path(file_path):
     if not os.path.exists(file_path):
         os.makedirs(file_path)
-    #


 def statsdata(data_file_path, output_path, recommendation_dataset_stats):
     """data status"""
@@ -150,9 +141,7 @@ def statsdata(data_file_path, output_path, recommendation_dataset_stats):
             continue
         if count % 1000000 == 0:
             print("Have handle {}w lines.".format(count//10000))
-        # if count % 5000000 == 0:
-        #     print("Have handle {}w lines.".format(count//10000))
-        # label = items[0]

         values = items[1:14]
         cats = items[14:]
         assert len(values) == 13, "value.size: {}".format(len(values))
@@ -160,25 +149,21 @@ def statsdata(data_file_path, output_path, recommendation_dataset_stats):
         recommendation_dataset_stats.stats_vals(values)
         recommendation_dataset_stats.stats_cats(cats)
     recommendation_dataset_stats.save_dict(output_path)
-    #


 def add_write(file_path, wr_str):
     with open(file_path, "a", encoding="utf-8") as file_out:
         file_out.write(wr_str + "\n")
-    #


 def random_split_trans2h5(in_file_path, output_path, recommendation_dataset_stats,
                           part_rows=2000000, test_size=0.1, seed=2020):
     """random split trans2h5"""
     test_size = int(TRAIN_LINE_COUNT * test_size)
-    # train_size = TRAIN_LINE_COUNT - test_size
     all_indices = [i for i in range(TRAIN_LINE_COUNT)]
     np.random.seed(seed)
     np.random.shuffle(all_indices)
     print("all_indices.size: {}".format(len(all_indices)))
-    # lines_count_dict = collections.defaultdict(int)
     test_indices_set = set(all_indices[:test_size])
     print("test_indices_set.size: {}".format(len(test_indices_set)))
     print("------" * 10 + "\n" * 2)
@@ -231,7 +216,7 @@ def random_split_trans2h5(in_file_path, output_path, recommendation_dataset_stat
             test_feature_list = []
             test_label_list = []
             test_part_number += 1
-    #
     if train_label_list:
         pd.DataFrame(np.asarray(train_feature_list)).to_hdf(train_feature_file_name.format(train_part_number),
                                                             key="fixed")
@@ -242,7 +227,6 @@ def random_split_trans2h5(in_file_path, output_path, recommendation_dataset_stat
                                                             key="fixed")
         pd.DataFrame(np.asarray(test_label_list)).to_hdf(test_label_file_name.format(test_part_number),
                                                          key="fixed")
-    #
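
Note: map_cat2id still normalizes continuous values through minmax_sclae_value, which divides by the recorded per-column maximum; the commented-out full min-max variant (which also used the column minimum) is deleted here rather than revived. A minimal side-by-side sketch of the two formulas, with illustrative helper names that are not part of the commit:

    # retained behaviour: scale a raw value by the stored per-column maximum only
    def scale_by_max(val, max_v):
        return float(val) * 1.0 / max_v

    # removed (commented-out) variant: full min-max scaling over [min_v, max_v]
    def scale_min_max(val, min_v, max_v):
        return (float(val) - min_v) * 1.0 / (max_v - min_v)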








+0 -2  model_zoo/official/recommend/wide_and_deep_multitable/src/wide_and_deep.py

@@ -156,7 +156,6 @@ class WideDeepModel(nn.Cell):
         emb64_multi_size = 20900
         indicator_size = 16
         deep_dim_list = [1024, 1024, 1024, 1024, 1024]
-        # deep_dropout=0.0
         wide_reg_coef = [0.0, 0.0]
         deep_reg_coef = [0.0, 0.0]
         wide_lr = 0.2
@@ -530,7 +529,6 @@ class TrainStepWrap(nn.Cell):
                                            initial_accum=0.1,
                                            loss_scale=sens)
-        #self.optimizer_d = ProximalAdagrad(self.weights_d, learning_rate=config.adam_lr,loss_scale=sens)
         self.optimizer_d = Adam(self.weights_d,
                                 learning_rate=config.adam_lr,
                                 eps=1e-6,


+0 -1  model_zoo/official/recommend/wide_and_deep_multitable/train_and_eval.py

@@ -90,7 +90,6 @@ def train_and_eval(config):
     eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)
     callback = LossCallBack(config)
     # Only save the last checkpoint at the last epoch. For saving epochs at each epoch, please
-    # set save_checkpoint_steps=ds_train.get_dataset_size()
     ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size()*config.epochs,
                                   keep_checkpoint_max=10)
     ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
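
Note: the retained comment here (and in train_and_eval_distribute.py below) explains the intent: one epoch is ds_train.get_dataset_size() steps, so setting save_checkpoint_steps to dataset_size * epochs makes the checkpoint trigger fire only at the final training step, i.e. only the last checkpoint is written. A rough sketch of the step arithmetic, with illustrative numbers not taken from the config:

    steps_per_epoch = 1000                               # e.g. ds_train.get_dataset_size()
    epochs = 15
    save_checkpoint_steps = steps_per_epoch * epochs     # 15000: reached only at the very end
    # with save_checkpoint_steps = steps_per_epoch, a checkpoint would be written every epoch instead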


+0 -1  model_zoo/official/recommend/wide_and_deep_multitable/train_and_eval_distribute.py

@@ -95,7 +95,6 @@ def train_and_eval(config):
     eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)
     callback = LossCallBack(config)
     # Only save the last checkpoint at the last epoch. For saving epochs at each epoch, please
-    # set save_checkpoint_steps=ds_train.get_dataset_size()
     ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size()*config.epochs,
                                   keep_checkpoint_max=10)
     ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',

