Browse Source

Merge branch 'dev' of git.nju.edu.cn:learnware/learnware-market into dev

tags/v0.3.2
bxdd 3 years ago
parent
commit
e9cfbae700
10 changed files with 321 additions and 80 deletions
  1. +2
    -5
      examples/example_market_db/example_db.py
  2. +1
    -1
      examples/examples2/svm/__init__.py
  3. +1
    -1
      examples/learnware_config/svm/__init__.py
  4. +4
    -16
      examples/workflow_by_code/main.py
  5. +5
    -20
      learnware/config.py
  6. +2
    -4
      learnware/learnware/__init__.py
  7. +187
    -2
      learnware/learnware/reuse.py
  8. +30
    -26
      learnware/market/easy.py
  9. +87
    -5
      learnware/specification/rkme.py
  10. +2
    -0
      setup.py

+ 2
- 5
examples/example_market_db/example_db.py View File

@@ -40,14 +40,11 @@ semantic_specs = [

user_senmantic = {
"Data": {"Values": ["Tabular"], "Type": "Class"},
"Task": {
"Values": ["Classification"],
"Type": "Class",
},
"Task": {"Values": ["Classification"], "Type": "Class",},
"Device": {"Values": ["GPU"], "Type": "Tag"},
"Scenario": {"Values": ["Business"], "Type": "Tag"},
"Description": {"Values": "", "Type": "Description"},
"Name": {"Values": "learnware_4", "Type": "Name"},
"Name": {"Values": "learnware", "Type": "Name"},
}




+ 1
- 1
examples/examples2/svm/__init__.py View File

@@ -15,5 +15,5 @@ class SVM(BaseModel):
def predict(self, X: np.ndarray) -> np.ndarray:
return self.model.predict(X)

def fintune(self, X: np.ndarray, y: np.ndarray):
def finetune(self, X: np.ndarray, y: np.ndarray):
pass

+ 1
- 1
examples/learnware_config/svm/__init__.py View File

@@ -15,5 +15,5 @@ class SVM(BaseModel):
def predict(self, X: np.ndarray) -> np.ndarray:
return self.model.predict(X)

def fintune(self, X: np.ndarray, y: np.ndarray):
def finetune(self, X: np.ndarray, y: np.ndarray):
pass

+ 4
- 16
examples/workflow_by_code/main.py View File

@@ -16,10 +16,7 @@ curr_root = os.path.dirname(os.path.abspath(__file__))
semantic_specs = [
{
"Data": {"Values": ["Tabular"], "Type": "Class"},
"Task": {
"Values": ["Classification"],
"Type": "Class",
},
"Task": {"Values": ["Classification"], "Type": "Class",},
"Device": {"Values": ["GPU"], "Type": "Tag"},
"Scenario": {"Values": ["Nature"], "Type": "Tag"},
"Description": {"Values": "", "Type": "Description"},
@@ -27,10 +24,7 @@ semantic_specs = [
},
{
"Data": {"Values": ["Tabular"], "Type": "Class"},
"Task": {
"Values": ["Classification"],
"Type": "Class",
},
"Task": {"Values": ["Classification"], "Type": "Class",},
"Device": {"Values": ["GPU"], "Type": "Tag"},
"Scenario": {"Values": ["Business", "Nature"], "Type": "Tag"},
"Description": {"Values": "", "Type": "Description"},
@@ -38,10 +32,7 @@ semantic_specs = [
},
{
"Data": {"Values": ["Tabular"], "Type": "Class"},
"Task": {
"Values": ["Classification"],
"Type": "Class",
},
"Task": {"Values": ["Classification"], "Type": "Class",},
"Device": {"Values": ["GPU"], "Type": "Tag"},
"Scenario": {"Values": ["Business"], "Type": "Tag"},
"Description": {"Values": "", "Type": "Description"},
@@ -51,10 +42,7 @@ semantic_specs = [

user_senmantic = {
"Data": {"Values": ["Tabular"], "Type": "Class"},
"Task": {
"Values": ["Classification"],
"Type": "Class",
},
"Task": {"Values": ["Classification"], "Type": "Class",},
"Device": {"Values": ["GPU"], "Type": "Tag"},
"Scenario": {"Values": ["Business"], "Type": "Tag"},
"Description": {"Values": "", "Type": "Description"},


+ 5
- 20
learnware/config.py View File

@@ -66,10 +66,7 @@ os.makedirs(LEARNWARE_FOLDER_POOL_PATH, exist_ok=True)
os.makedirs(DATABASE_PATH, exist_ok=True)

semantic_config = {
"Data": {
"Values": ["Tabular", "Image", "Video", "Text", "Audio"],
"Type": "Class",
}, # Choose only one class
"Data": {"Values": ["Tabular", "Image", "Video", "Text", "Audio"], "Type": "Class",}, # Choose only one class
"Task": {
"Values": [
"Classification",
@@ -82,10 +79,7 @@ semantic_config = {
],
"Type": "Class", # Choose only one class
},
"Device": {
"Values": ["CPU", "GPU"],
"Type": "Tag",
}, # Choose one or more tags
"Device": {"Values": ["CPU", "GPU"], "Type": "Tag",}, # Choose one or more tags
"Scenario": {
"Values": [
"Business",
@@ -105,14 +99,8 @@ semantic_config = {
],
"Type": "Tag", # Choose one or more tags
},
"Description": {
"Values": None,
"Type": "Description",
},
"Name": {
"Values": None,
"Type": "Name",
},
"Description": {"Values": None, "Type": "Description",},
"Name": {"Values": None, "Type": "Name",},
}

_DEFAULT_CONFIG = {
@@ -123,10 +111,7 @@ _DEFAULT_CONFIG = {
"learnware_pool_path": LEARNWARE_POOL_PATH,
"learnware_zip_pool_path": LEARNWARE_ZIP_POOL_PATH,
"learnware_folder_pool_path": LEARNWARE_FOLDER_POOL_PATH,
"learnware_folder_config": {
"yaml_file": "learnware.yaml",
"module_file": "__init__.py",
},
"learnware_folder_config": {"yaml_file": "learnware.yaml", "module_file": "__init__.py",},
"database_path": DATABASE_PATH,
}



+ 2
- 4
learnware/learnware/__init__.py View File

@@ -2,6 +2,7 @@ import os
import copy

from .base import Learnware
from .reuse import BaseReuse
from .utils import get_stat_spec_from_config, get_model_from_config
from ..specification import Specification
from ..utils import read_yaml_to_dict
@@ -29,10 +30,7 @@ def get_learnware_from_dirpath(id: str, semantic_spec: dict, learnware_dirpath:
The contructed learnware object, return None if build failed
"""
learnware_config = {
"model": {
"class_name": "Model",
"kwargs": {},
},
"model": {"class_name": "Model", "kwargs": {},},
"stat_specifications": [
{
"module_path": "learnware.specification",


+ 187
- 2
learnware/learnware/reuse.py View File

@@ -1,3 +1,188 @@
import numpy as np
from typing import Tuple, Any, List, Union, Dict
from cvxopt import matrix, solvers
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

from learnware.learnware import Learnware
import learnware.specification as specification
from ..specification import RKMEStatSpecification
from ..logger import get_module_logger

logger = get_module_logger("BaseReuse")

class BaseReuse:
    """Baseline multiple-learnware reuse using the job selector method.

    A job selector (a multiclass classifier) is trained on samples herded from
    each learnware's RKME specification; it then routes every user sample to
    the learnware whose index it predicts.
    """

    def __init__(self, learnware_list: List[Learnware], herding_num: int = 100):
        """Store the learnwares to be reused and the herding budget.

        Parameters
        ----------
        learnware_list : List[Learnware]
            The learnwares that are reused together.
        herding_num : int, optional
            Base number of herding samples; it is scaled by each learnware's
            mixture weight in ``job_selector``, by default 100.
        """
        self.learnware_list = learnware_list
        self.herding_num = herding_num

    def predict(self, user_data: np.ndarray) -> np.ndarray:
        """Give prediction for user data using baseline job-selector method

        Parameters
        ----------
        user_data : np.ndarray
            User's raw data, one sample per row.

        Returns
        -------
        np.ndarray
            Prediction given by job-selector method
        """
        _, select_result = self.job_selector(user_data)
        # One prediction slot per user sample (shape[0] is already an int).
        selector_pred_y = np.zeros(user_data.shape[0])

        for idx, learnware in enumerate(self.learnware_list):
            data_idx_list = np.where(select_result == idx)[0]
            if len(data_idx_list) > 0:
                # Feed the selected rows (not their indices) to the learnware.
                selector_pred_y[data_idx_list] = learnware.predict(user_data[data_idx_list])

        return selector_pred_y

    def job_selector(self, user_data: np.ndarray) -> Tuple[LGBMClassifier, np.ndarray]:
        """Train job selector based on user's data, which predicts which learnware in the pool should be selected

        Parameters
        ----------
        user_data : np.ndarray
            User's raw data, one sample per row.

        Returns
        -------
        Tuple[LGBMClassifier, np.ndarray]
            The trained job selector and, for every row of ``user_data``, the
            index (into ``self.learnware_list``) of the selected learnware.
        """
        learnware_rkme_spec_list = [
            learnware.specification.get_stat_spec_by_name("RKMEStatSpecification")
            for learnware in self.learnware_list
        ]
        learnware_num = len(learnware_rkme_spec_list)

        # Symmetric Gram matrix of pairwise RKME inner products.
        task_matrix = np.zeros((learnware_num, learnware_num))
        for i in range(learnware_num):
            task_rkme1 = learnware_rkme_spec_list[i]
            for j in range(i, learnware_num):
                task_rkme2 = learnware_rkme_spec_list[j]
                task_matrix[i][j] = task_matrix[j][i] = task_rkme1.inner_prod(task_rkme2)

        task_mixture_weight = self._calculate_rkme_spec_mixture_weight(
            user_data, learnware_rkme_spec_list, task_matrix
        )

        herding_X_parts, train_X_parts, val_X_parts = [], [], []
        herding_y, train_herding_y, val_herding_y = [], [], []
        for i, task_spec in enumerate(learnware_rkme_spec_list):
            # At least 5 samples per learnware so the 1/5 validation split is non-empty.
            task_herding_num = max(5, int(self.herding_num * task_mixture_weight[i]))
            task_val_num = task_herding_num // 5

            herding_X_i = task_spec.herding(task_herding_num).detach().cpu().numpy()
            # Disjoint split: the last task_val_num samples are held out for validation.
            train_X_i = herding_X_i[:-task_val_num]
            val_X_i = herding_X_i[-task_val_num:]

            herding_X_parts.append(herding_X_i)
            train_X_parts.append(train_X_i)
            val_X_parts.append(val_X_i)

            herding_y += [i] * task_herding_num
            train_herding_y += [i] * (task_herding_num - task_val_num)
            val_herding_y += [i] * task_val_num

        herding_X = np.concatenate(herding_X_parts, axis=0)
        train_herding_X = np.concatenate(train_X_parts, axis=0)
        val_herding_X = np.concatenate(val_X_parts, axis=0)
        herding_y = np.array(herding_y)
        train_herding_y = np.array(train_herding_y)
        val_herding_y = np.array(val_herding_y)

        # use herding samples to train a job selector
        job_selector = self._selector_grid_search(
            herding_X,
            herding_y,
            train_herding_X,
            train_herding_y,
            val_herding_X,
            val_herding_y,
            learnware_num,
        )
        job_select_result = np.array(job_selector.predict(user_data))

        return job_selector, job_select_result

    def _calculate_rkme_spec_mixture_weight(
        self, user_data: np.ndarray, task_rkme_list: List[RKMEStatSpecification], task_rkme_matrix: np.ndarray
    ) -> np.ndarray:
        """Compute the mixture weight of each learnware's RKME with respect to the user's data

        The weights solve the quadratic program
        ``min_w 1/2 * w^T K w - v^T w`` subject to ``w >= 0`` and ``sum(w) == 1``,
        where ``K`` is ``task_rkme_matrix`` and ``v[i]`` is the inner product
        between the user's RKME and the i-th learnware RKME.

        Parameters
        ----------
        user_data : np.ndarray
            User's raw data, from which an (unreduced) RKME specification is generated.
        task_rkme_list : List[RKMEStatSpecification]
            The RKME specifications of the learnwares.
        task_rkme_matrix : np.ndarray
            Pairwise inner products between the specifications in ``task_rkme_list``.

        Returns
        -------
        np.ndarray
            One mixture weight per learnware, in the order of ``task_rkme_list``.
        """
        task_num = len(task_rkme_list)
        user_rkme_spec = specification.utils.generate_rkme_spec(X=user_data, reduce=False)
        K = task_rkme_matrix
        v = np.array([user_rkme_spec.inner_prod(task_rkme) for task_rkme in task_rkme_list])

        P = matrix(K)
        q = matrix(-v)
        G = matrix(-np.eye(task_num))  # -w <= 0, i.e. non-negative weights
        h = matrix(np.zeros((task_num, 1)))
        A = matrix(np.ones((1, task_num)))  # weights sum to one
        b = matrix(np.ones((1, 1)))
        solvers.options["show_progress"] = False
        sol = solvers.qp(P, q, G, h, A, b)
        task_mixture_weight = np.array(sol["x"]).reshape(-1)

        return task_mixture_weight

    @staticmethod
    def _selector_grid_search(
        org_train_x: np.ndarray,
        org_train_y: np.ndarray,
        train_x: np.ndarray,
        train_y: np.ndarray,
        val_x: np.ndarray,
        val_y: np.ndarray,
        num_class: int,
    ) -> LGBMClassifier:
        """Train a LGBMClassifier as job selector using the herding data as training instances.

        Declared as a static method because it reads no instance state; it is
        invoked as ``self._selector_grid_search(...)`` with exactly these
        seven arguments.

        Parameters
        ----------
        org_train_x : np.ndarray
            The original herding features.
        org_train_y : np.ndarray
            The original herding labels (which are learnware indexes).
        train_x : np.ndarray
            Herding features used for training.
        train_y : np.ndarray
            Herding labels used for training.
        val_x : np.ndarray
            Herding features used for validation.
        val_y : np.ndarray
            Herding labels used for validation.
        num_class : int
            Total number of classes for the job selector (which is exactly the total number of learnwares to be reused).

        Returns
        -------
        LGBMClassifier
            The job selector model.
        """

        def build_selector(lr: float, md: int) -> LGBMClassifier:
            # "gbdt" is LightGBM's boosting type; "booster"/"gbtree" are XGBoost names.
            return LGBMClassifier(
                max_depth=md,
                learning_rate=lr,
                n_estimators=2000,
                objective="multiclass",
                num_class=num_class,
                boosting_type="gbdt",
                seed=0,
            )

        score_best = -1
        learning_rate = [0.01]
        max_depth = [66]
        params = (0, 0)

        # NOTE(review): the `verbose` / `early_stopping_rounds` fit arguments are
        # the lightgbm 3.x API; confirm before moving to a newer major version.
        for lr in learning_rate:
            for md in max_depth:
                model = build_selector(lr, md)
                model.fit(train_x, train_y, eval_set=[(val_x, val_y)], verbose=100, early_stopping_rounds=300)
                pred_y = model.predict(org_train_x)
                score = accuracy_score(org_train_y, pred_y)
                if score > score_best:
                    score_best = score
                    params = (lr, md)

        # Refit on all herding samples with the best hyper-parameters found.
        model = build_selector(params[0], params[1])
        model.fit(
            org_train_x, org_train_y, eval_set=[(org_train_x, org_train_y)], verbose=100, early_stopping_rounds=300
        )
        return model

+ 30
- 26
learnware/market/easy.py View File

@@ -119,10 +119,7 @@ class EasyMarket(BaseMarket):
self.learnware_folder_list[id] = target_folder_dir
self.count += 1
add_learnware_to_db(
id,
semantic_spec=semantic_spec,
zip_path=target_zip_dir,
folder_path=target_folder_dir,
id, semantic_spec=semantic_spec, zip_path=target_zip_dir, folder_path=target_folder_dir,
)
return id, True

@@ -241,7 +238,7 @@ class EasyMarket(BaseMarket):
return intermediate_K, intermediate_C

def _search_by_rkme_spec_mixture(
self, learnware_list: List[Learnware], user_rkme: RKMEStatSpecification, search_num: int
self, learnware_list: List[Learnware], user_rkme: RKMEStatSpecification, max_search_num: int = 5, score_cutoff: float = 0.1
) -> Tuple[List[float], List[Learnware]]:
"""Get search_num learnwares with their mixture weight from the given learnware_list

@@ -251,8 +248,10 @@ class EasyMarket(BaseMarket):
The list of learnwares whose mixture approximates the user's rkme
user_rkme : RKMEStatSpecification
User RKME statistical specification
search_num : int
The number of the returned learnwares
max_search_num : int
The maximum number of the returned learnwares
score_cutof: float
The minimum mmd dist as threshold to stop further rkme_spec matching

Returns
-------
@@ -265,14 +264,14 @@ class EasyMarket(BaseMarket):
if learnware_num == 0:
return [], []
if learnware_num < search_num:
logger.warning("Available Learnware num less than search_num")
logger.warning("Available Learnware num less than search_num!")
search_num = learnware_num

flag_list = [0 for i in range(learnware_num)]
flag_list = [0 for _ in range(learnware_num)]
mixture_list = []
intermediate_K, intermediate_C = np.zeros((1, 1)), np.zeros((1, 1))

for k in range(search_num):
for k in range(max_search_num):
idx_min, score_min = -1, -1
weight_min = None
mixture_list.append(None)
@@ -294,11 +293,14 @@ class EasyMarket(BaseMarket):
if idx_min == -1 or score < score_min:
idx_min, score_min, weight_min = idx, score, weight

flag_list[idx_min] = 1
mixture_list[-1] = learnware_list[idx_min]
intermediate_K, intermediate_C = self._calculate_intermediate_K_and_C(
mixture_list, user_rkme, intermediate_K, intermediate_C
)
if score_min >= score_cutoff:
flag_list[idx_min] = 1
mixture_list[-1] = learnware_list[idx_min]
intermediate_K, intermediate_C = self._calculate_intermediate_K_and_C(
mixture_list, user_rkme, intermediate_K, intermediate_C
)
else:
break

return weight_min, mixture_list

@@ -335,8 +337,8 @@ class EasyMarket(BaseMarket):

return sorted_dist_list, sorted_learnware_list

def _search_by_semantic_tags(self, learnware_list: List[Learnware], user_info: BaseUserInfo) -> List[Learnware]:
def match_semantic_tags(semantic_spec1, semantic_spec2):
def _search_by_semantic_spec(self, learnware_list: List[Learnware], user_info: BaseUserInfo) -> List[Learnware]:
def match_semantic_spec(semantic_spec1, semantic_spec2):
if semantic_spec1.keys() != semantic_spec2.keys():
# raise Exception("semantic_spec key error")
logger.warning("semantic_spec key error!")
@@ -346,18 +348,20 @@ class EasyMarket(BaseMarket):
continue
if len(semantic_spec2[key]["Values"]) == 0:
continue
v1 = semantic_spec1[key]["Values"]
v2 = semantic_spec2[key]["Values"]
if semantic_spec1[key]["Type"] == "Class":
if isinstance(semantic_spec1[key]["Values"], list):
semantic_spec1[key]["Values"] = semantic_spec1[key]["Values"][0]
if isinstance(semantic_spec2[key]["Values"], list):
semantic_spec2[key]["Values"] = semantic_spec2[key]["Values"][0]
if semantic_spec1[key]["Values"] != semantic_spec2[key]["Values"]:
if isinstance(v1, list):
v1 = v1[0]
if isinstance(v2, list):
v2 = v2[0]
if v1 != v2:
return False
elif semantic_spec1[key]["Type"] == "Tag":
if not (set(semantic_spec1[key]["Values"]) & set(semantic_spec2[key]["Values"])):
if not (set(v1) & set(v2)):
return False
elif semantic_spec1[key]["Type"] == "Name":
if semantic_spec2[key]["Values"] not in semantic_spec1[key]["Values"]:
if v2 not in v1 and v2 not in semantic_spec1["Description"]["Values"]:
return False
return True

@@ -365,7 +369,7 @@ class EasyMarket(BaseMarket):
for learnware in learnware_list:
learnware_semantic_spec = learnware.get_specification().get_semantic_spec()
user_semantic_spec = user_info.get_semantic_spec()
if match_semantic_tags(learnware_semantic_spec, user_semantic_spec):
if match_semantic_spec(learnware_semantic_spec, user_semantic_spec):
match_learnwares.append(learnware)
return match_learnwares

@@ -389,7 +393,7 @@ class EasyMarket(BaseMarket):
the third is the list of Learnware (mixture), the size is search_num
"""
learnware_list = [self.learnware_list[key] for key in self.learnware_list]
learnware_list = self._search_by_semantic_tags(learnware_list, user_info)
learnware_list = self._search_by_semantic_spec(learnware_list, user_info)
# learnware_list = list(set(learnware_list_tags + learnware_list_description))

if "RKMEStatSpecification" not in user_info.stat_info:


+ 87
- 5
learnware/specification/rkme.py View File

@@ -10,10 +10,13 @@ import codecs
import random
import numpy as np
from cvxopt import solvers, matrix
from collections import Counter
from typing import Tuple, Any, List, Union, Dict

from .base import BaseStatSpecification
from ..logger import get_module_logger

logger = get_module_logger("rkme")

class RKMEStatSpecification(BaseStatSpecification):
"""Reduced-set Kernel Mean Embedding (RKME) Specification"""
@@ -196,6 +199,59 @@ class RKMEStatSpecification(BaseStatSpecification):

Z = Z - step_size * grad_Z
self.z = Z
def _inner_prod_with_X(self, X: Any) -> np.ndarray:
    """Compute the inner product between RKME specification and each row of X

    Parameters
    ----------
    X : np.ndarray or torch.tensor
        Raw data in np.ndarray format or torch.tensor format.

    Returns
    -------
    np.ndarray
        One value per row of X: the beta-weighted sum of the kernel values
        between the reduced points ``self.z`` and that row.
    """
    # Column vector so that beta[i] scales the i-th row of the (len(Z), len(X)) kernel.
    beta = self.beta.reshape(-1, 1).double().to(self.device)
    Z = self.z.double().to(self.device)
    if not torch.is_tensor(X):
        X = torch.from_numpy(X)
    X = X.to(self.device).double()

    weighted_kernel = torch_rbf_kernel(Z, X, self.gamma) * beta
    v = torch.sum(weighted_kernel, dim=0)
    return v.detach().cpu().numpy()
def _sampling_candidates(self, N: int) -> Union[torch.Tensor, None]:
    """Generate a large set of candidates as preparation for herding

    Each candidate is drawn from a normal distribution (std 0.25) centred on
    one of the reduced points ``self.z``; the centre is chosen with
    probability proportional to its non-negative weight in ``self.beta``.

    Parameters
    ----------
    N : int
        The number of herding candidates.

    Returns
    -------
    torch.Tensor or None
        The herding candidates, one per row; None (after a warning) when no
        candidate was generated.
    """
    # Work on a clamped copy: negative weights cannot be used as sampling
    # probabilities, but the stored specification must stay untouched.
    beta = torch.clamp(self.beta, min=0)
    beta = beta / torch.sum(beta)
    sample_assign = torch.multinomial(beta, N, replacement=True)

    sample_list = []
    for i, n in Counter(np.array(sample_assign.cpu())).items():
        for _ in range(n):
            sample_list.append(torch.normal(mean=self.z[i], std=0.25).reshape(1, -1))

    if not sample_list:
        logger.warning("Not enough candidates for herding!")
        return None
    return torch.cat(sample_list, dim=0)


def inner_prod(self, Phi2: RKMEStatSpecification) -> float:
"""Compute the inner product between two RKME specifications
@@ -226,7 +282,7 @@ class RKMEStatSpecification(BaseStatSpecification):
Phi2 : RKMEStatSpecification
The other RKME specification.
omit_term1 : bool, optional
True if the inner product of self with itself can be omitted, by default False
True if the inner product of self with itself can be omitted, by default False.
"""
if omit_term1:
term1 = 0
@@ -236,6 +292,34 @@ class RKMEStatSpecification(BaseStatSpecification):
term3 = Phi2.inner_prod(Phi2)

return float(term1 - 2 * term2 + term3)
def herding(self, T: int) -> torch.Tensor:
    """Iteratively sample examples from an unknown distribution with the help of its RKME specification

    ``100 * T`` candidates are generated first; at iteration ``i`` the
    candidate maximising ``(i + 1) * fsX - fsS`` is selected, where ``fsX`` is
    the inner product of the specification with each candidate and ``fsS`` is
    the summed kernel value between each candidate and the samples already
    selected.

    Parameters
    ----------
    T : int
        Total iteration number for sampling.

    Returns
    -------
    torch.Tensor
        A collection of T examples (one per row, on ``self.device``) which
        approximate the unknown distribution. Callers convert it with
        ``.detach().cpu().numpy()`` when a NumPy array is needed.
    """
    Nstart = 100 * T
    Xstart = self._sampling_candidates(Nstart).to(self.device)
    D = self.z[0].shape[0]
    S = torch.zeros((T, D), device=self.device)
    fsX = torch.from_numpy(self._inner_prod_with_X(Xstart)).to(self.device)
    fsS = torch.zeros(Nstart, device=self.device)

    for i in range(T):
        if i > 0:
            # Kernel mass already covered by the i samples chosen so far.
            fsS = torch.sum(torch_rbf_kernel(S[:i, :], Xstart, self.gamma), axis=0)
        fs = (i + 1) * fsX - fsS
        idx = torch.argmax(fs)
        S[i, :] = Xstart[idx, :]

    return S

def save(self, filepath: str):
"""Save the computed RKME specification to a specified path in JSON format.
@@ -255,9 +339,7 @@ class RKMEStatSpecification(BaseStatSpecification):
rkme_to_save["beta"] = rkme_to_save["beta"].tolist()
rkme_to_save["device"] = "gpu" if rkme_to_save["cuda_idx"] != -1 else "cpu"
json.dump(
rkme_to_save,
codecs.open(save_path, "w", encoding="utf-8"),
separators=(",", ":"),
rkme_to_save, codecs.open(save_path, "w", encoding="utf-8"), separators=(",", ":"),
)

def load(self, filepath: str) -> bool:
@@ -345,7 +427,7 @@ def torch_rbf_kernel(x1, x2, gamma) -> torch.Tensor:
"""
x1 = x1.double()
x2 = x2.double()
X12norm = torch.sum(x1**2, 1, keepdim=True) - 2 * x1 @ x2.T + torch.sum(x2**2, 1, keepdim=True).T
X12norm = torch.sum(x1 ** 2, 1, keepdim=True) - 2 * x1 @ x2.T + torch.sum(x2 ** 2, 1, keepdim=True).T
return torch.exp(-X12norm * gamma)




+ 2
- 0
setup.py View File

@@ -45,6 +45,8 @@ REQUIRED = [
"joblib>=1.2.0",
"pyyaml>=6.0",
"fire>=0.5.0",
"sklearn>=1.0.2",
"lightgbm>=3.3.5"
]

here = os.path.abspath(os.path.dirname(__file__))


Loading…
Cancel
Save