diff --git a/example.py b/example.py
index 30c50fc..3751db3 100644
--- a/example.py
+++ b/example.py
@@ -11,12 +11,14 @@
 # ================================================================#
 
 from utils.plog import logger, INFO
+from utils.utils import reduce_dimension
 import torch.nn as nn
 import torch
 from models.nn import LeNet5, SymbolNet
 from models.basic_model import BasicModel, BasicDataset
 from models.wabl_models import DecisionTree, WABLBasicModel
+from sklearn.neighbors import KNeighborsClassifier
 
 from multiprocessing import Pool
 from abducer.abducer_base import AbducerBase
@@ -25,6 +27,7 @@ from datasets.mnist_add.get_mnist_add import get_mnist_add
 from datasets.hwf.get_hwf import get_hwf
 from datasets.hed.get_hed import get_hed, split_equation
 import framework_hed
+import framework_hed_knn
 
 
 def run_test():
@@ -41,29 +44,45 @@ def run_test():
     total_train_data = get_hed(train=True)
     train_data, val_data = split_equation(total_train_data, 3, 1)
     test_data = get_hed(train=False)
-
-    # cls = LeNet5(num_classes=len(kb.pseudo_label_list), image_size=(train_data[0][0][0].shape[1:]))
-    cls = SymbolNet(num_classes=len(kb.pseudo_label_list))
-    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-
-    framework_hed.hed_pretrain(kb, cls, recorder)
-
-    criterion = nn.CrossEntropyLoss()
-    optimizer = torch.optim.RMSprop(cls.parameters(), lr=0.001, weight_decay=1e-6)
-    # optimizer = torch.optim.Adam(cls.parameters(), lr=0.00001, betas=(0.9, 0.99))
-
-
-    base_model = BasicModel(cls, criterion, optimizer, device, save_interval=1, save_dir=recorder.save_dir, batch_size=32, num_epochs=10, recorder=recorder)
+
+    # ======================== non-NN model ========================== #
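+    # The scikit-learn KNN baseline works on flat feature vectors, so every
+    # symbol image is first pooled and flattened (utils.utils.reduce_dimension)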
+    reduce_dimension(train_data)
+    reduce_dimension(val_data)
+    reduce_dimension(test_data)
+    base_model = KNeighborsClassifier(n_neighbors=3)
+    pretrain_data_X, pretrain_data_Y = framework_hed_knn.hed_pretrain(base_model)
     model = WABLBasicModel(base_model, kb.pseudo_label_list)
-
-    # train_X, train_Z, train_Y = get_mnist_add(train = True, get_pseudo_label = True)
-    # test_X, test_Z, test_Y = get_mnist_add(train = False, get_pseudo_label = True)
+    model, mapping = framework_hed_knn.train_with_rule(
+        model, abducer, train_data, val_data, (pretrain_data_X, pretrain_data_Y), select_num=10, min_len=5, max_len=8
+    )
+    framework_hed_knn.hed_test(
+        model, abducer, mapping, train_data, test_data, min_len=5, max_len=8
+    )
+    # ============================ End =============================== #
+
+    # ========================== NN model ============================ #
+    # # cls = LeNet5(num_classes=len(kb.pseudo_label_list), image_size=(train_data[0][0][0].shape[1:]))
+    # cls = SymbolNet(num_classes=len(kb.pseudo_label_list))
+    # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+    # framework_hed.hed_pretrain(kb, cls, recorder)
+
+    # criterion = nn.CrossEntropyLoss()
+    # optimizer = torch.optim.RMSprop(cls.parameters(), lr=0.001, weight_decay=1e-6)
+    # # optimizer = torch.optim.Adam(cls.parameters(), lr=0.00001, betas=(0.9, 0.99))
+
+    # base_model = BasicModel(cls, criterion, optimizer, device, save_interval=1, save_dir=recorder.save_dir, batch_size=32, num_epochs=10, recorder=recorder)
+    # model = WABLBasicModel(base_model, kb.pseudo_label_list)
+
+    # # train_X, train_Z, train_Y = get_mnist_add(train = True, get_pseudo_label = True)
+    # # test_X, test_Z, test_Y = get_mnist_add(train = False, get_pseudo_label = True)
-    # train_data = get_hwf(train = True, get_pseudo_label = True)
-    # test_data = get_hwf(train = False, get_pseudo_label = True)
+    # # train_data = get_hwf(train = True, get_pseudo_label = True)
+    # # test_data = get_hwf(train = False, get_pseudo_label = True)
-    model, mapping = framework_hed.train_with_rule(model, abducer, train_data, val_data, select_num=10, min_len=5, max_len=8)
-    framework_hed.hed_test(model, abducer, mapping, train_data, test_data, min_len=5, max_len=8)
+    # model, mapping = framework_hed.train_with_rule(model, abducer, train_data, val_data, select_num=10, min_len=5, max_len=8)
+    # framework_hed.hed_test(model, abducer, mapping, train_data, test_data, min_len=5, max_len=8)
+    # ============================ End =============================== #
 
     recorder.dump()
     return True
diff --git a/framework_hed_knn.py b/framework_hed_knn.py
new file mode 100644
index 0000000..a5e5506
--- /dev/null
+++ b/framework_hed_knn.py
@@ -0,0 +1,407 @@
+# coding: utf-8
+# ================================================================#
+#   Copyright (C) 2021 Freecss All rights reserved.
+#
+#   File Name     : framework_hed_knn.py
+#   Author        : freecss
+#   Email         : karlfreecss@gmail.com
+#   Created Date  : 2021/06/07
+#   Description   :
+#
+# ================================================================#
+
+import os
+import pickle as pk
+import random
+
+import cv2
+import numpy as np
+import torch
+import torch.nn as nn
+
+from utils.plog import INFO, DEBUG, clocker
+from utils.utils import (
+    flatten,
+    reform_idx,
+    block_sample,
+    gen_mappings,
+    mapping_res,
+    remapping_res,
+    extract_feature,
+)
+
+from models.nn import MLP, SymbolNetAutoencoder
+from models.basic_model import BasicModel, BasicDataset
+from datasets.hed.get_hed import get_pretrain_data
+
+
+def result_statistics(pred_Z, Z, Y, logic_forward, char_acc_flag):
+    result = {}
+    if char_acc_flag:
+        char_acc_num = 0
+        char_num = 0
+        for pred_z, z in zip(pred_Z, Z):
+            char_num += len(z)
+            for zidx in range(len(z)):
+                if pred_z[zidx] == z[zidx]:
+                    char_acc_num += 1
+        char_acc = char_acc_num / char_num
+        result["Character level accuracy"] = char_acc
+
+    abl_acc_num = 0
+    for pred_z, y in zip(pred_Z, Y):
+        if logic_forward(pred_z) == y:
+            abl_acc_num += 1
+    abl_acc = abl_acc_num / len(Y)
+    result["ABL accuracy"] = abl_acc
+
+    return result
+
+
+def filter_data(X, abduced_Z):
+    finetune_Z = []
+    finetune_X = []
+    for abduced_x, abduced_z in zip(X, abduced_Z):
+        # Skip samples whose abduction failed (empty label list); note that
+        # this must be a value comparison, not an identity check
+        if abduced_z != []:
+            finetune_X.append(abduced_x)
+            finetune_Z.append(abduced_z)
+    return finetune_X, finetune_Z
+
+
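+# Fit the non-NN classifier on a small labelled sample of symbol images:
+# the directories "0", "1", "10" and "11" hold images of the symbols
+# 0, 1, "+" and "=" respectively.  Ten images per class are pooled into
+# feature vectors (utils.utils.extract_feature) for cls.fit; accuracy is
+# then estimated on 50 randomly drawn images per class.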
+def hed_pretrain(cls, image_size=(28, 28, 1)):
+    INFO("Pretrain Start")
+    pretrain_data_X, pretrain_data_Y = [], []
+    for i, label in enumerate(["0", "1", "10", "11"]):
+        label_path = os.path.join("./datasets/hed/dataset/mnist_images", label)
+        img_path_list = os.listdir(label_path)
+        for j in range(10):
+            img = cv2.imread(
+                os.path.join(label_path, img_path_list[j]), cv2.IMREAD_GRAYSCALE
+            )
+            img = np.array(cv2.resize(img, (image_size[1], image_size[0])), np.float32)
+            img = (img - 127) / 128.0
+            pretrain_data_X.append(
+                extract_feature(img.reshape((1, image_size[0], image_size[1])))
+            )
+            pretrain_data_Y.append(i)
+    cls.fit(pretrain_data_X, pretrain_data_Y)
+
+    for i, label in enumerate(["0", "1", "10", "11"]):
+        label_path = os.path.join("./datasets/hed/dataset/mnist_images", label)
+        img_path_list = os.listdir(label_path)
+        cnt = 0
+        for j in range(50):
+            img = cv2.imread(
+                os.path.join(label_path, random.choice(img_path_list)),
+                cv2.IMREAD_GRAYSCALE,
+            )
+            img = np.array(cv2.resize(img, (image_size[1], image_size[0])), np.float32)
+            img = (img - 127) / 128.0
+            predict_label = cls.predict(
+                [extract_feature(img.reshape((1, image_size[0], image_size[1])))]
+            )
+            # predict_label = cls.predict_proba(
+            #     [
+            #         extract_feature(
+            #             np.array(img, dtype=np.float32).reshape(
+            #                 (1, image_size[0], image_size[1])
+            #             )
+            #         )
+            #     ]
+            # ).argmax(axis=1)
+            if predict_label == i:
+                cnt += 1
+        INFO("Class %d prediction accuracy is " % i, cnt / 50)
+
+    return pretrain_data_X, pretrain_data_Y
+
+
+def _get_char_acc(model, X, consistent_pred_res, mapping):
+    original_pred_res = model.predict(X)["cls"]
+    pred_res = flatten(mapping_res(original_pred_res, mapping))
+    abduced_res = flatten(consistent_pred_res)
+    INFO("Current model's output: ", pred_res)
+    INFO("Abduced labels: ", abduced_res)
+    assert len(pred_res) == len(abduced_res)
+    return sum(
+        pred_res[idx] == abduced_res[idx] for idx in range(len(pred_res))
+    ) / len(pred_res)
+
+
+def abduce_and_train(model, abducer, mapping, train_X_true, pretrain_data, select_num):
+    select_idx = np.random.randint(len(train_X_true), size=select_num)
+    X = [train_X_true[idx] for idx in select_idx]
+
+    original_pred_res = model.predict(X)["cls"]
+
+    if mapping is None:
+        mappings = gen_mappings(["+", "=", 0, 1], ["+", "=", 0, 1])
+    else:
+        mappings = [mapping]
+
+    consistent_idx = []
+    consistent_pred_res = []
+
+    for m in mappings:
+        pred_res = mapping_res(original_pred_res, m)
+        max_abduce_num = 20
+        solution = abducer.zoopt_get_solution(
+            pred_res, [1] * len(pred_res), max_abduce_num
+        )
+        all_address_flag = reform_idx(solution, pred_res)
+
+        consistent_idx_tmp = []
+        consistent_pred_res_tmp = []
+
+        for idx in range(len(pred_res)):
+            address_idx = [
+                i for i, flag in enumerate(all_address_flag[idx]) if flag != 0
+            ]
+            candidate = abducer.kb.address_by_idx([pred_res[idx]], 1, address_idx, True)
+            if len(candidate) > 0:
+                consistent_idx_tmp.append(idx)
+                consistent_pred_res_tmp.append(candidate[0][0])
+
+        if len(consistent_idx_tmp) > len(consistent_idx):
+            consistent_idx = consistent_idx_tmp
+            consistent_pred_res = consistent_pred_res_tmp
+            if len(mappings) > 1:
+                mapping = m
+
+    if len(consistent_idx) == 0:
+        return 0, 0, None
+
+    if len(mappings) > 1:
+        INFO("Final mapping is: ", mapping)
+
+    INFO("Train pool size is:", len(flatten(consistent_pred_res)))
+    INFO("Start to use abduced pseudo labels to train the model...")
+    pretrain_data_X, pretrain_data_Y = pretrain_data
+    pretrain_mapping = {0: 0, 1: 1, 2: "+", 3: "="}
+    pretrain_data_X = [[x] for x in pretrain_data_X]
+    pretrain_data_Y = [[pretrain_mapping[y]] for y in pretrain_data_Y]
+    model.train(
+        [X[idx] for idx in consistent_idx] + pretrain_data_X,
+        remapping_res(consistent_pred_res + pretrain_data_Y, mapping),
+    )
+
+    consistent_acc = len(consistent_idx) / select_num
+    char_acc = _get_char_acc(
+        model, [X[idx] for idx in consistent_idx], consistent_pred_res, mapping
+    )
+    INFO("consistent_acc is %s, char_acc is %s" % (consistent_acc, char_acc))
+    return consistent_acc, char_acc, mapping
+
+
+def _remove_duplicate_rule(rule_dict):
+    add_nums_dict = {}
+    for r in list(rule_dict):
+        # e.g. r = 'my_op([1], [0], [1, 0])'  ->  add_nums = '10'
+        add_nums = str(r.split("]")[0].split("[")[1]) + str(
+            r.split("]")[1].split("[")[1]
+        )
+        if add_nums in add_nums_dict:
+            old_r = add_nums_dict[add_nums]
+            if rule_dict[r] >= rule_dict[old_r]:
+                rule_dict.pop(old_r)
+                add_nums_dict[add_nums] = r
+            else:
+                rule_dict.pop(r)
+        else:
+            add_nums_dict[add_nums] = r
+    return list(rule_dict)
+
+
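+# Repeatedly sample batches of samples_per_rule equations that the current
+# model labels consistently (kb.logic_forward holds) and let the abducer
+# propose rules for them.  Only rules abduced at least five times across
+# the samples_num draws survive, and _remove_duplicate_rule keeps a single
+# most frequent rule per operand pair.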
+def get_rules_from_data(
+    model, abducer, mapping, train_X_true, samples_per_rule, samples_num
+):
+    rules = []
+    for _ in range(samples_num):
+        while True:
+            select_idx = np.random.randint(len(train_X_true), size=samples_per_rule)
+            X = [train_X_true[idx] for idx in select_idx]
+            original_pred_res = model.predict(X)["cls"]
+            pred_res = mapping_res(original_pred_res, mapping)
+
+            consistent_idx = []
+            consistent_pred_res = []
+            for idx in range(len(pred_res)):
+                if abducer.kb.logic_forward([pred_res[idx]]):
+                    consistent_idx.append(idx)
+                    consistent_pred_res.append(pred_res[idx])
+
+            if len(consistent_pred_res) != 0:
+                rule = abducer.abduce_rules(consistent_pred_res)
+                if rule is not None:
+                    break
+        rules.append(rule)
+
+    all_rule_dict = {}
+    for rule in rules:
+        for r in rule:
+            all_rule_dict[r] = 1 if r not in all_rule_dict else all_rule_dict[r] + 1
+    # Keep only rules that were abduced at least five times
+    rule_dict = {rule: cnt for rule, cnt in all_rule_dict.items() if cnt >= 5}
+    rules = _remove_duplicate_rule(rule_dict)
+
+    return rules
+
+
+def _get_consist_rule_acc(model, abducer, mapping, rules, X):
+    cnt = 0
+    for x in X:
+        original_pred_res = model.predict([x])["cls"]
+        pred_res = flatten(mapping_res(original_pred_res, mapping))
+        if abducer.kb.consist_rule(pred_res, rules):
+            cnt += 1
+    return cnt / len(X)
+
+
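+# Curriculum training over equation lengths min_len..max_len-1 (each course
+# mixes lengths l and l+1).  Per course: abduce pseudo labels and refit the
+# model until consistent_acc and char_acc stay >= 0.9 for five rounds in a
+# row, then accept the course only if the abduced rules separate true from
+# false validation equations; otherwise refit from the pretraining data and
+# start the course over.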
+def train_with_rule(
+    model,
+    abducer,
+    train_data,
+    val_data,
+    pretrain_data,
+    select_num=10,
+    min_len=5,
+    max_len=8,
+):
+    train_X = train_data
+    val_X = val_data
+
+    samples_num = 50
+    samples_per_rule = 3
+
+    # Start training / for each length of equations
+    for equation_len in range(min_len, max_len):
+        INFO(
+            "============== equation_len: %d-%d ================"
+            % (equation_len, equation_len + 1)
+        )
+        train_X_true = train_X[1][equation_len]
+        train_X_false = train_X[0][equation_len]
+        val_X_true = val_X[1][equation_len]
+        val_X_false = val_X[0][equation_len]
+
+        train_X_true.extend(train_X[1][equation_len + 1])
+        train_X_false.extend(train_X[0][equation_len + 1])
+        val_X_true.extend(val_X[1][equation_len + 1])
+        val_X_false.extend(val_X[0][equation_len + 1])
+
+        condition_cnt = 0
+        while True:
+            # The char-to-symbol mapping is unknown in the first course and
+            # is searched anew in each of its rounds
+            if equation_len == min_len:
+                mapping = None
+
+            # Abduce pseudo labels and train the base model
+            consistent_acc, char_acc, new_mapping = abduce_and_train(
+                model, abducer, mapping, train_X_true, pretrain_data, select_num
+            )
+            if consistent_acc == 0:
+                # Abduction failed; keep the previous mapping and retry
+                continue
+            mapping = new_mapping
+
+            # Check whether the model is stable enough to be evaluated
+            if consistent_acc >= 0.9 and char_acc >= 0.9:
+                condition_cnt += 1
+            else:
+                condition_cnt = 0
+
+            # The condition has been satisfied five times in a row
+            if condition_cnt >= 5:
+                INFO("Now checking if we can go to next course")
+                rules = get_rules_from_data(
+                    model, abducer, mapping, train_X_true, samples_per_rule, samples_num
+                )
+                INFO("Learned rules from data:", rules)
+
+                true_consist_rule_acc = _get_consist_rule_acc(
+                    model, abducer, mapping, rules, val_X_true
+                )
+                false_consist_rule_acc = _get_consist_rule_acc(
+                    model, abducer, mapping, rules, val_X_false
+                )
+
+                INFO(
+                    "consist_rule_acc is %f (true), %f (false)\n"
+                    % (true_consist_rule_acc, false_consist_rule_acc)
+                )
+                # Decide whether to move to the next course or restart
+                if true_consist_rule_acc > 0.9 and false_consist_rule_acc < 0.1:
+                    break
+                else:
+                    # Restart this course: refit the classifier on the
+                    # pretraining data.  (The NN version instead reloaded
+                    # "./weights/pretrain_weights.pth" for the first course,
+                    # or "./weights/weights_%d.pth" % (equation_len - 1)
+                    # for later ones, via model.load_state_dict.)
+                    pretrain_data_X, pretrain_data_Y = pretrain_data
+                    model.cls_list[0].fit(pretrain_data_X, pretrain_data_Y)
+                    condition_cnt = 0
+                    INFO("Reload model and retrain")
+
+    return model, mapping
+
+
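+# Final evaluation: abduce a rule set from training equations of lengths
+# min_len..max_len (roughly samples_num draws split evenly across lengths)
+# and report, for every test length from 5 to 26, the fraction of true and
+# of false equations whose predictions are consistent with the rules.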
+def hed_test(model, abducer, mapping, train_data, test_data, min_len=5, max_len=8):
+    train_X = train_data
+    test_X = test_data
+
+    # Calculate how many equations should be selected for each length:
+    # equation_samples_num[equation_len] equations are sampled per length
+    INFO("Now begin to abduce the final rule set")
+    len_cnt = max_len - min_len + 1
+    samples_num = 50
+    equation_samples_num = [0] * min_len
+    equation_samples_num += [samples_num // len_cnt] * len_cnt
+    # Give the remainder to the last length so the counts sum to samples_num
+    equation_samples_num[-1] += samples_num % len_cnt
+    assert sum(equation_samples_num) == samples_num
+
+    # Abduce rules
+    rules = []
+    samples_per_rule = 3
+    for equation_len in range(min_len, max_len + 1):
+        equation_rules = get_rules_from_data(
+            model,
+            abducer,
+            mapping,
+            train_X[1][equation_len],
+            samples_per_rule,
+            equation_samples_num[equation_len],
+        )
+        rules.extend(equation_rules)
+    rules = list(set(rules))
+    INFO("Learned rules from data:", rules)
+
+    # Evaluate rule consistency on test equations of lengths 5-26
+    for equation_len in range(5, 27):
+        true_consist_rule_acc = _get_consist_rule_acc(
+            model, abducer, mapping, rules, test_X[1][equation_len]
+        )
+        false_consist_rule_acc = _get_consist_rule_acc(
+            model, abducer, mapping, rules, test_X[0][equation_len]
+        )
+        INFO(
+            "consist_rule_acc of length-%d test equations is %f (true), %f (false)"
+            % (equation_len, true_consist_rule_acc, false_consist_rule_acc)
+        )
+
+
+if __name__ == "__main__":
+    pass
diff --git a/utils/utils.py b/utils/utils.py
index 1138361..5cd433d 100644
--- a/utils/utils.py
+++ b/utils/utils.py
@@ -1,11 +1,18 @@
+import torch
+import torch.nn as nn
 import numpy as np
 from utils.plog import INFO
 from collections import OrderedDict
 
 # for multiple predictions, modify from `learn_add.py`
 def flatten(l):
-    return [item for sublist in l for item in flatten(sublist)] if isinstance(l, list) else [l]
-
+    return (
+        [item for sublist in l for item in flatten(sublist)]
+        if isinstance(l, list)
+        else [l]
+    )
+
+
 # for multiple predictions, modify from `learn_add.py`
 def reform_idx(flatten_pred_res, save_pred_res):
     re = []
@@ -20,10 +27,12 @@ def reform_idx(flatten_pred_res, save_pred_res):
             i = i + j
     return re
 
+
 def hamming_dist(A, B):
     B = np.array(B)
-    A = np.expand_dims(A, axis = 0).repeat(axis=0, repeats=(len(B)))
-    return np.sum(A != B, axis = 1)
+    A = np.expand_dims(A, axis=0).repeat(axis=0, repeats=(len(B)))
+    return np.sum(A != B, axis=1)
+
 
 def confidence_dist(A, B):
     B = np.array(B)
@@ -31,10 +40,10 @@ def confidence_dist(A, B):
     A = np.expand_dims(A, axis=0)
     A = A.repeat(axis=0, repeats=(len(B)))
     rows = np.array(range(len(B)))
-    rows = np.expand_dims(rows, axis = 1).repeat(axis = 1, repeats = len(B[0]))
+    rows = np.expand_dims(rows, axis=1).repeat(axis=1, repeats=len(B[0]))
     cols = np.array(range(len(B[0])))
-    cols = np.expand_dims(cols, axis = 0).repeat(axis = 0, repeats = len(B))
-    return 1 - np.prod(A[rows, cols, B], axis = 1)
+    cols = np.expand_dims(cols, axis=0).repeat(axis=0, repeats=len(B))
+    return 1 - np.prod(A[rows, cols, B], axis=1)
 
 
 def block_sample(X, Z, Y, sample_num, epoch_idx):
@@ -51,32 +60,36 @@ def block_sample(X, Z, Y, sample_num, epoch_idx):
 
 
 def gen_mappings(chars, symbs):
-    n_char = len(chars)
-    n_symbs = len(symbs)
-    if n_char != n_symbs:
-        print('Characters and symbols size dosen\'t match.')
-        return
-    from itertools import permutations
-    mappings = []
-    # returned mappings
-    perms = permutations(symbs)
-    for p in perms:
-        mappings.append(dict(zip(chars, list(p))))
-    return mappings
+    n_char = len(chars)
+    n_symbs = len(symbs)
+    if n_char != n_symbs:
+        print("Characters and symbols size doesn't match.")
+        return
+    from itertools import permutations
+
+    mappings = []
+    # returned mappings
+    perms = permutations(symbs)
+    for p in perms:
+        mappings.append(dict(zip(chars, list(p))))
+    return mappings
+
 
 def mapping_res(original_pred_res, m):
     return [[m[symbol] for symbol in formula] for formula in original_pred_res]
 
+
 def remapping_res(pred_res, m):
     remapping = {}
     for key, value in m.items():
         remapping[value] = key
     return [[remapping[symbol] for symbol in formula] for formula in pred_res]
 
+
 def check_equal(a, b):
     if isinstance(a, (int, float)) and isinstance(b, (int, float)):
         return abs(a - b) <= 1e-3
-    
+
     if isinstance(a, list) and isinstance(b, list):
         if len(a) != len(b):
             return False
@@ -84,6 +97,26 @@ def check_equal(a, b):
             if not check_equal(a[i], b[i]):
                 return False
         return True
-    
-    else:
-        return a == b
+
+    else:
+        return a == b
+
+
+def extract_feature(img):
+    # Downsample the symbol image with 2x2 average pooling, then flatten
+    # it into a 1-D feature vector
+    extractor = nn.AvgPool2d(2, stride=2)
+    feature_map = np.array(extractor(torch.Tensor(img)))
+    return feature_map.reshape((-1,))
+    # Alternative features: row and column sums of the image
+    # return np.concatenate(
+    #     (np.squeeze(np.sum(img, axis=1)), np.squeeze(np.sum(img, axis=2))), axis=0
+    # )
+
+
+def reduce_dimension(data):
+    # data[truth_value][equation_len] holds the equations of each length
+    # (5 to 26); replace every symbol image with its pooled feature vector
+    for truth_value in [0, 1]:
+        for equation_len in range(5, 27):
+            equations = data[truth_value][equation_len]
+            reduced_equations = [
+                [extract_feature(symbol_img) for symbol_img in equation]
+                for equation in equations
+            ]
+            data[truth_value][equation_len] = reduced_equations