
complete knn model for HED

pull/3/head
Gao Enhao 2 years ago
commit f65a277e4d
3 changed files with 502 additions and 43 deletions
  1. example.py (+39, -20)
  2. framework_hed_knn.py (+407, -0)
  3. utils/utils.py (+56, -23)

example.py (+39, -20)

@@ -11,12 +11,14 @@
# ================================================================#

from utils.plog import logger, INFO
from utils.utils import reduce_dimension
import torch.nn as nn
import torch

from models.nn import LeNet5, SymbolNet
from models.basic_model import BasicModel, BasicDataset
from models.wabl_models import DecisionTree, WABLBasicModel
from sklearn.neighbors import KNeighborsClassifier

from multiprocessing import Pool
from abducer.abducer_base import AbducerBase
@@ -25,6 +27,7 @@ from datasets.mnist_add.get_mnist_add import get_mnist_add
from datasets.hwf.get_hwf import get_hwf
from datasets.hed.get_hed import get_hed, split_equation
import framework_hed
import framework_hed_knn


def run_test():
@@ -41,29 +44,45 @@ def run_test():
    total_train_data = get_hed(train=True)
    train_data, val_data = split_equation(total_train_data, 3, 1)
    test_data = get_hed(train=False)
    # cls = LeNet5(num_classes=len(kb.pseudo_label_list), image_size=(train_data[0][0][0].shape[1:]))
    cls = SymbolNet(num_classes=len(kb.pseudo_label_list))
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    framework_hed.hed_pretrain(kb, cls, recorder)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.RMSprop(cls.parameters(), lr=0.001, weight_decay=1e-6)
    # optimizer = torch.optim.Adam(cls.parameters(), lr=0.00001, betas=(0.9, 0.99))

    base_model = BasicModel(cls, criterion, optimizer, device, save_interval=1, save_dir=recorder.save_dir, batch_size=32, num_epochs=10, recorder=recorder)

    # ======================== non-NN model ========================== #
    reduce_dimension(train_data)
    reduce_dimension(val_data)
    reduce_dimension(test_data)
    base_model = KNeighborsClassifier(n_neighbors=3)
    pretrain_data_X, pretrain_data_Y = framework_hed_knn.hed_pretrain(base_model)
    model = WABLBasicModel(base_model, kb.pseudo_label_list)
    # train_X, train_Z, train_Y = get_mnist_add(train = True, get_pseudo_label = True)
    # test_X, test_Z, test_Y = get_mnist_add(train = False, get_pseudo_label = True)
    model, mapping = framework_hed_knn.train_with_rule(
        model, abducer, train_data, val_data, (pretrain_data_X, pretrain_data_Y), select_num=10, min_len=5, max_len=8
    )
    framework_hed_knn.hed_test(
        model, abducer, mapping, train_data, test_data, min_len=5, max_len=8
    )
    # ============================ End =============================== #

    # ========================== NN model ============================ #
    # # cls = LeNet5(num_classes=len(kb.pseudo_label_list), image_size=(train_data[0][0][0].shape[1:]))
    # cls = SymbolNet(num_classes=len(kb.pseudo_label_list))
    # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # framework_hed.hed_pretrain(kb, cls, recorder)

    # criterion = nn.CrossEntropyLoss()
    # optimizer = torch.optim.RMSprop(cls.parameters(), lr=0.001, weight_decay=1e-6)
    # # optimizer = torch.optim.Adam(cls.parameters(), lr=0.00001, betas=(0.9, 0.99))

    # base_model = BasicModel(cls, criterion, optimizer, device, save_interval=1, save_dir=recorder.save_dir, batch_size=32, num_epochs=10, recorder=recorder)
    # model = WABLBasicModel(base_model, kb.pseudo_label_list)

    # # train_X, train_Z, train_Y = get_mnist_add(train = True, get_pseudo_label = True)
    # # test_X, test_Z, test_Y = get_mnist_add(train = False, get_pseudo_label = True)

    # train_data = get_hwf(train = True, get_pseudo_label = True)
    # test_data = get_hwf(train = False, get_pseudo_label = True)
    # # train_data = get_hwf(train = True, get_pseudo_label = True)
    # # test_data = get_hwf(train = False, get_pseudo_label = True)

    model, mapping = framework_hed.train_with_rule(model, abducer, train_data, val_data, select_num=10, min_len=5, max_len=8)
    framework_hed.hed_test(model, abducer, mapping, train_data, test_data, min_len=5, max_len=8)
    # model, mapping = framework_hed.train_with_rule(model, abducer, train_data, val_data, select_num=10, min_len=5, max_len=8)
    # framework_hed.hed_test(model, abducer, mapping, train_data, test_data, min_len=5, max_len=8)
    # ============================ End =============================== #

    recorder.dump()
    return True


framework_hed_knn.py (+407, -0)

@@ -0,0 +1,407 @@
# coding: utf-8
# ================================================================#
# Copyright (C) 2021 Freecss All rights reserved.
#
# File Name :framework_hed_knn.py
# Author :freecss
# Email :karlfreecss@gmail.com
# Created Date :2021/06/07
# Description :
#
# ================================================================#

import pickle as pk
import torch
import torch.nn as nn
import numpy as np
import os

from utils.plog import INFO, DEBUG, clocker
from utils.utils import (
    flatten,
    reform_idx,
    block_sample,
    gen_mappings,
    mapping_res,
    remapping_res,
    extract_feature,
)

from models.nn import MLP, SymbolNetAutoencoder
from models.basic_model import BasicModel, BasicDataset
from datasets.hed.get_hed import get_pretrain_data


def result_statistics(pred_Z, Z, Y, logic_forward, char_acc_flag):
    result = {}
    if char_acc_flag:
        char_acc_num = 0
        char_num = 0
        for pred_z, z in zip(pred_Z, Z):
            char_num += len(z)
            for zidx in range(len(z)):
                if pred_z[zidx] == z[zidx]:
                    char_acc_num += 1
        char_acc = char_acc_num / char_num
        result["Character level accuracy"] = char_acc

    abl_acc_num = 0
    for pred_z, y in zip(pred_Z, Y):
        if logic_forward(pred_z) == y:
            abl_acc_num += 1
    abl_acc = abl_acc_num / len(Y)
    result["ABL accuracy"] = abl_acc

    return result


def filter_data(X, abduced_Z):
    finetune_Z = []
    finetune_X = []
    for abduced_x, abduced_z in zip(X, abduced_Z):
        # `abduced_z is not []` was always True (identity, not equality); compare by value
        if abduced_z != []:
            finetune_X.append(abduced_x)
            finetune_Z.append(abduced_z)
    return finetune_X, finetune_Z


def hed_pretrain(cls, image_size=(28, 28, 1)):
    import cv2
    import random

    INFO("Pretrain Start")
    pretrain_data_X, pretrain_data_Y = [], []
    for i, label in enumerate(["0", "1", "10", "11"]):
        label_path = os.path.join("./datasets/hed/dataset/mnist_images", label)
        img_path_list = os.listdir(label_path)
        for j in range(10):
            img = cv2.imread(
                os.path.join(label_path, img_path_list[j]), cv2.IMREAD_GRAYSCALE
            )
            img = np.array(cv2.resize(img, (image_size[1], image_size[0])), np.float32)
            img = (img - 127) / 128.0
            pretrain_data_X.append(
                extract_feature(img.reshape((1, image_size[0], image_size[1])))
            )
            pretrain_data_Y.append(i)
    cls.fit(pretrain_data_X, pretrain_data_Y)

    # Sanity check: predict 50 randomly drawn images per symbol class
    for i, label in enumerate(["0", "1", "10", "11"]):
        label_path = os.path.join("./datasets/hed/dataset/mnist_images", label)
        img_path_list = os.listdir(label_path)
        cnt = 0
        for j in range(50):
            img = cv2.imread(
                os.path.join(label_path, random.choice(img_path_list)),
                cv2.IMREAD_GRAYSCALE,
            )
            img = np.array(cv2.resize(img, (image_size[1], image_size[0])), np.float32)
            img = (img - 127) / 128.0
            predict_label = cls.predict(
                [extract_feature(img.reshape((1, image_size[0], image_size[1])))]
            )
            # predict_label = cls.predict_proba(
            #     [
            #         extract_feature(
            #             np.array(img, dtype=np.float32).reshape(
            #                 (1, image_size[0], image_size[1])
            #             )
            #         )
            #     ]
            # ).argmax(axis=1)

            if predict_label == i:
                cnt += 1
        INFO("Prediction accuracy for label %d is" % i, cnt / 50)

    return pretrain_data_X, pretrain_data_Y
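
# Usage sketch (hypothetical, mirroring the call in example.py; any classifier
# with a sklearn-style fit/predict interface works here):
#     >>> from sklearn.neighbors import KNeighborsClassifier
#     >>> knn = KNeighborsClassifier(n_neighbors=3)
#     >>> pretrain_X, pretrain_Y = hed_pretrain(knn)  # fits on 10 images per symbol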


def _get_char_acc(model, X, consistent_pred_res, mapping):
    original_pred_res = model.predict(X)["cls"]
    pred_res = flatten(mapping_res(original_pred_res, mapping))
    INFO("Current model's output: ", pred_res)
    INFO("Abduced labels: ", flatten(consistent_pred_res))
    assert len(pred_res) == len(flatten(consistent_pred_res))
    return sum(
        [
            pred_res[idx] == flatten(consistent_pred_res)[idx]
            for idx in range(len(pred_res))
        ]
    ) / len(pred_res)


def abduce_and_train(model, abducer, mapping, train_X_true, pretrain_data, select_num):
    select_idx = np.random.randint(len(train_X_true), size=select_num)
    X = []
    for idx in select_idx:
        X.append(train_X_true[idx])

    original_pred_res = model.predict(X)["cls"]

    if mapping is None:
        mappings = gen_mappings(["+", "=", 0, 1], ["+", "=", 0, 1])
    else:
        mappings = [mapping]

    consistent_idx = []
    consistent_pred_res = []

    for m in mappings:
        pred_res = mapping_res(original_pred_res, m)
        max_abduce_num = 20
        solution = abducer.zoopt_get_solution(
            pred_res, [1] * len(pred_res), max_abduce_num
        )
        all_address_flag = reform_idx(solution, pred_res)

        consistent_idx_tmp = []
        consistent_pred_res_tmp = []

        for idx in range(len(pred_res)):
            address_idx = [
                i for i, flag in enumerate(all_address_flag[idx]) if flag != 0
            ]
            candidate = abducer.kb.address_by_idx([pred_res[idx]], 1, address_idx, True)
            if len(candidate) > 0:
                consistent_idx_tmp.append(idx)
                consistent_pred_res_tmp.append(candidate[0][0])

        if len(consistent_idx_tmp) > len(consistent_idx):
            consistent_idx = consistent_idx_tmp
            consistent_pred_res = consistent_pred_res_tmp
            if len(mappings) > 1:
                mapping = m

    if len(consistent_idx) == 0:
        return 0, 0, None

    if len(mappings) > 1:
        INFO("Final mapping is: ", mapping)

    INFO("Train pool size is:", len(flatten(consistent_pred_res)))
    INFO("Start to use abduced pseudo label to train model...")
    pretrain_data_X, pretrain_data_Y = pretrain_data
    # Map the integer pretrain labels back into the symbol space
    pretrain_mapping = {0: 0, 1: 1, 2: "+", 3: "="}
    pretrain_data_X = [[x] for x in pretrain_data_X]
    pretrain_data_Y = [[pretrain_mapping[y]] for y in pretrain_data_Y]
    model.train(
        [X[idx] for idx in consistent_idx] + pretrain_data_X,
        remapping_res(consistent_pred_res + pretrain_data_Y, mapping),
    )

    consistent_acc = len(consistent_idx) / select_num
    char_acc = _get_char_acc(
        model, [X[idx] for idx in consistent_idx], consistent_pred_res, mapping
    )
    INFO("consistent_acc is %s, char_acc is %s" % (consistent_acc, char_acc))
    return consistent_acc, char_acc, mapping


def _remove_duplicate_rule(rule_dict):
    add_nums_dict = {}
    for r in list(rule_dict):
        add_nums = str(r.split("]")[0].split("[")[1]) + str(
            r.split("]")[1].split("[")[1]
        )  # r = 'my_op([1], [0], [1, 0])' then add_nums = '10'
        if add_nums in add_nums_dict:
            old_r = add_nums_dict[add_nums]
            if rule_dict[r] >= rule_dict[old_r]:
                rule_dict.pop(old_r)
                add_nums_dict[add_nums] = r
            else:
                rule_dict.pop(r)
        else:
            add_nums_dict[add_nums] = r
    return list(rule_dict)
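
# Illustration with hypothetical counts: both rules below share add_nums '10',
# so only the more frequent one survives.
#     >>> _remove_duplicate_rule({"my_op([1], [0], [1, 0])": 7, "my_op([1], [0], [0, 1])": 5})
#     ['my_op([1], [0], [1, 0])']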


def get_rules_from_data(
    model, abducer, mapping, train_X_true, samples_per_rule, samples_num
):
    rules = []
    for _ in range(samples_num):
        while True:
            select_idx = np.random.randint(len(train_X_true), size=samples_per_rule)
            X = []
            for idx in select_idx:
                X.append(train_X_true[idx])
            original_pred_res = model.predict(X)["cls"]
            pred_res = mapping_res(original_pred_res, mapping)

            consistent_idx = []
            consistent_pred_res = []
            for idx in range(len(pred_res)):
                if abducer.kb.logic_forward([pred_res[idx]]):
                    consistent_idx.append(idx)
                    consistent_pred_res.append(pred_res[idx])

            if len(consistent_pred_res) != 0:
                rule = abducer.abduce_rules(consistent_pred_res)
                if rule is not None:
                    break
        rules.append(rule)

    all_rule_dict = {}
    for rule in rules:
        for r in rule:
            all_rule_dict[r] = 1 if r not in all_rule_dict else all_rule_dict[r] + 1
    # Keep only rules that were abduced at least five times
    rule_dict = {rule: cnt for rule, cnt in all_rule_dict.items() if cnt >= 5}
    rules = _remove_duplicate_rule(rule_dict)

    return rules


def _get_consist_rule_acc(model, abducer, mapping, rules, X):
    cnt = 0
    for x in X:
        original_pred_res = model.predict([x])["cls"]
        pred_res = flatten(mapping_res(original_pred_res, mapping))
        if abducer.kb.consist_rule(pred_res, rules):
            cnt += 1
    return cnt / len(X)


def train_with_rule(
    model,
    abducer,
    train_data,
    val_data,
    pretrain_data,
    select_num=10,
    min_len=5,
    max_len=8,
):
    train_X = train_data
    val_X = val_data

    samples_num = 50
    samples_per_rule = 3

    # Start training / for each length of equations
    for equation_len in range(min_len, max_len):
        INFO(
            "============== equation_len: %d-%d ================"
            % (equation_len, equation_len + 1)
        )
        train_X_true = train_X[1][equation_len]
        train_X_false = train_X[0][equation_len]
        val_X_true = val_X[1][equation_len]
        val_X_false = val_X[0][equation_len]

        train_X_true.extend(train_X[1][equation_len + 1])
        train_X_false.extend(train_X[0][equation_len + 1])
        val_X_true.extend(val_X[1][equation_len + 1])
        val_X_false.extend(val_X[0][equation_len + 1])

        condition_cnt = 0
        while True:
            # In the first course, the mapping is searched from scratch
            if equation_len == min_len:
                mapping = None

            # Abduce and train the classifier
            consistent_acc, char_acc, mapping = abduce_and_train(
                model, abducer, mapping, train_X_true, pretrain_data, select_num
            )
            if consistent_acc == 0:
                continue

            # Check whether the model is reliable enough to evaluate with rules
            if consistent_acc >= 0.9 and char_acc >= 0.9:
                condition_cnt += 1
            else:
                condition_cnt = 0

            # The condition has been satisfied five times in a row
            if condition_cnt >= 5:
                INFO("Now checking if we can go to next course")
                rules = get_rules_from_data(
                    model, abducer, mapping, train_X_true, samples_per_rule, samples_num
                )
                INFO("Learned rules from data:", rules)

                true_consist_rule_acc = _get_consist_rule_acc(
                    model, abducer, mapping, rules, val_X_true
                )
                false_consist_rule_acc = _get_consist_rule_acc(
                    model, abducer, mapping, rules, val_X_false
                )

                INFO(
                    "consist_rule_acc is %f, %f\n"
                    % (true_consist_rule_acc, false_consist_rule_acc)
                )
                # Decide whether to move on to the next course or restart this one
                if true_consist_rule_acc > 0.9 and false_consist_rule_acc < 0.1:
                    break
                else:
                    # Reset the classifier by refitting it on the pretrain data
                    # (the NN version instead reloaded pretrain_weights.pth at
                    # min_len, or weights_%d.pth of the previous course otherwise)
                    pretrain_data_X, pretrain_data_Y = pretrain_data
                    model.cls_list[0].fit(pretrain_data_X, pretrain_data_Y)
                    condition_cnt = 0
                    INFO("Reload model and retrain")

    return model, mapping


def hed_test(model, abducer, mapping, train_data, test_data, min_len=5, max_len=8):
    train_X = train_data
    test_X = test_data

    # Calculate how many equations should be sampled for each length:
    # equation_samples_num[equation_len] rules are abduced per length
    print("Now begin to abduce rules and evaluate on the test set")
    equation_samples_num = []
    len_cnt = max_len - min_len + 1
    samples_num = 50
    equation_samples_num += [0] * min_len
    # Spread samples_num evenly over the lengths, giving the remainder to the
    # last one; e.g. min_len=5, max_len=8 yields [0, 0, 0, 0, 0, 12, 12, 12, 14]
    equation_samples_num += [samples_num // len_cnt] * len_cnt
    equation_samples_num[-1] += samples_num % len_cnt
    assert sum(equation_samples_num) == samples_num

    # Abduce rules
    rules = []
    samples_per_rule = 3
    for equation_len in range(min_len, max_len + 1):
        equation_rules = get_rules_from_data(
            model,
            abducer,
            mapping,
            train_X[1][equation_len],
            samples_per_rule,
            equation_samples_num[equation_len],
        )
        rules.extend(equation_rules)
    rules = list(set(rules))
    INFO("Learned rules from data:", rules)

    for equation_len in range(5, 27):
        true_consist_rule_acc = _get_consist_rule_acc(
            model, abducer, mapping, rules, test_X[1][equation_len]
        )
        false_consist_rule_acc = _get_consist_rule_acc(
            model, abducer, mapping, rules, test_X[0][equation_len]
        )
        INFO(
            "consist_rule_acc of length-%d test equations: %f (true), %f (false)"
            % (equation_len, true_consist_rule_acc, false_consist_rule_acc)
        )

if __name__ == "__main__":
    pass

utils/utils.py (+56, -23)

@@ -1,11 +1,18 @@
import torch
import torch.nn as nn
import numpy as np
from utils.plog import INFO
from collections import OrderedDict

# for multiple predictions, modify from `learn_add.py`
def flatten(l):
    return [item for sublist in l for item in flatten(sublist)] if isinstance(l, list) else [l]
    return (
        [item for sublist in l for item in flatten(sublist)]
        if isinstance(l, list)
        else [l]
    )
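
# Example of the recursive flattening:
#     >>> flatten([[1, [0, "+"]], ["="]])
#     [1, 0, '+', '=']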


# for multiple predictions, modify from `learn_add.py`
def reform_idx(flatten_pred_res, save_pred_res):
    re = []
@@ -20,10 +27,12 @@ def reform_idx(flatten_pred_res, save_pred_res):
        i = i + j
    return re


def hamming_dist(A, B):
    B = np.array(B)
    A = np.expand_dims(A, axis = 0).repeat(axis=0, repeats=(len(B)))
    return np.sum(A != B, axis = 1)
    A = np.expand_dims(A, axis=0).repeat(axis=0, repeats=(len(B)))
    return np.sum(A != B, axis=1)
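
# Example (hypothetical values): counts position-wise mismatches between A and
# every row of B.
#     >>> hamming_dist(np.array([1, 0, 1]), [[1, 0, 0], [1, 0, 1]])
#     array([1, 0])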


def confidence_dist(A, B):
    B = np.array(B)
@@ -31,10 +40,10 @@
    A = np.expand_dims(A, axis=0)
    A = A.repeat(axis=0, repeats=(len(B)))
    rows = np.array(range(len(B)))
    rows = np.expand_dims(rows, axis = 1).repeat(axis = 1, repeats = len(B[0]))
    rows = np.expand_dims(rows, axis=1).repeat(axis=1, repeats=len(B[0]))
    cols = np.array(range(len(B[0])))
    cols = np.expand_dims(cols, axis = 0).repeat(axis = 0, repeats = len(B))
    return 1 - np.prod(A[rows, cols, B], axis = 1)
    cols = np.expand_dims(cols, axis=0).repeat(axis=0, repeats=len(B))
    return 1 - np.prod(A[rows, cols, B], axis=1)
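
# Example (hypothetical values): A holds per-symbol class probabilities, B the
# candidate label sequences; the distance is 1 minus the joint probability
# (outputs shown rounded).
#     >>> A = np.array([[0.9, 0.1], [0.2, 0.8]])
#     >>> confidence_dist(A, [[0, 1], [1, 1]])
#     array([0.28, 0.92])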


def block_sample(X, Z, Y, sample_num, epoch_idx):
@@ -51,32 +60,36 @@ def block_sample(X, Z, Y, sample_num, epoch_idx):


def gen_mappings(chars, symbs):
    n_char = len(chars)
    n_symbs = len(symbs)
    if n_char != n_symbs:
        print('Characters and symbols size dosen\'t match.')
        return
    from itertools import permutations
    mappings = []
    # returned mappings
    perms = permutations(symbs)
    for p in perms:
        mappings.append(dict(zip(chars, list(p))))
    return mappings
    n_char = len(chars)
    n_symbs = len(symbs)
    if n_char != n_symbs:
        print("Character and symbol sizes don't match.")
        return
    from itertools import permutations

    mappings = []
    # returned mappings
    perms = permutations(symbs)
    for p in perms:
        mappings.append(dict(zip(chars, list(p))))
    return mappings
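
# Example: with four symbols there are 4! = 24 candidate mappings.
#     >>> maps = gen_mappings(["+", "=", 0, 1], ["+", "=", 0, 1])
#     >>> len(maps)
#     24
#     >>> maps[0]
#     {'+': '+', '=': '=', 0: 0, 1: 1}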


def mapping_res(original_pred_res, m):
    return [[m[symbol] for symbol in formula] for formula in original_pred_res]


def remapping_res(pred_res, m):
    remapping = {}
    for key, value in m.items():
        remapping[value] = key
    return [[remapping[symbol] for symbol in formula] for formula in pred_res]
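
# Round-trip example with one of the 24 mappings produced by gen_mappings:
#     >>> m = {"+": "=", "=": "+", 0: 1, 1: 0}
#     >>> mapping_res([[0, "+", 1]], m)
#     [[1, '=', 0]]
#     >>> remapping_res([[1, '=', 0]], m)
#     [[0, '+', 1]]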


def check_equal(a, b):
    if isinstance(a, (int, float)) and isinstance(b, (int, float)):
        return abs(a - b) <= 1e-3
    if isinstance(a, list) and isinstance(b, list):
        if len(a) != len(b):
            return False
@@ -84,6 +97,26 @@ def check_equal(a, b):
            if not check_equal(a[i], b[i]):
                return False
        return True
    else:
        return a == b

    else:
        return a == b


def extract_feature(img):
    extractor = nn.AvgPool2d(2, stride=2)
    feature_map = np.array(extractor(torch.Tensor(img)))
    return feature_map.reshape((-1,))
    return np.concatenate(
        (np.squeeze(np.sum(img, axis=1)), np.squeeze(np.sum(img, axis=2))), axis=0
    )
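
# The new feature is the concatenation of column sums and row sums; for a
# 1x28x28 symbol image (shape assumed) this gives a 56-dimensional vector:
#     >>> extract_feature(np.zeros((1, 28, 28), dtype=np.float32)).shape
#     (56,)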


def reduce_dimension(data):
    for truth_value in [0, 1]:
        for equation_len in range(5, 27):
            equations = data[truth_value][equation_len]
            reduced_equations = [
                [extract_feature(symbol_img) for symbol_img in equation]
                for equation in equations
            ]
            data[truth_value][equation_len] = reduced_equations
