Merge branch 'dev' of git.nju.edu.cn:learnware/learnware-market into dev

3 years ago · e9cfbae700
--- a/examples/example_market_db/example_db.py
+++ b/examples/example_market_db/example_db.py
@@ -40,14 +40,11 @@ semantic_specs = [

 user_senmantic = {
    "Data": {"Values": ["Tabular"], "Type": "Class"},
    "Task": {
        "Values": ["Classification"],
        "Type": "Class",
    },
    "Task": {"Values": ["Classification"], "Type": "Class",},
    "Device": {"Values": ["GPU"], "Type": "Tag"},
    "Scenario": {"Values": ["Business"], "Type": "Tag"},
    "Description": {"Values": "", "Type": "Description"},
    "Name": {"Values": "learnware_4", "Type": "Name"},
    "Name": {"Values": "learnware", "Type": "Name"},
 }


--- a/examples/examples2/svm/init.py
+++ b/examples/examples2/svm/init.py
@@ -15,5 +15,5 @@ class SVM(BaseModel):
    def predict(self, X: np.ndarray) -> np.ndarray:
        return self.model.predict(X)

    def fintune(self, X: np.ndarray, y: np.ndarray):
    def finetune(self, X: np.ndarray, y: np.ndarray):
        pass
--- a/examples/learnware_config/svm/init.py
+++ b/examples/learnware_config/svm/init.py
@@ -15,5 +15,5 @@ class SVM(BaseModel):
    def predict(self, X: np.ndarray) -> np.ndarray:
        return self.model.predict(X)

    def fintune(self, X: np.ndarray, y: np.ndarray):
    def finetune(self, X: np.ndarray, y: np.ndarray):
        pass
--- a/examples/workflow_by_code/main.py
+++ b/examples/workflow_by_code/main.py
@@ -16,10 +16,7 @@ curr_root = os.path.dirname(os.path.abspath(__file__))
 semantic_specs = [
    {
        "Data": {"Values": ["Tabular"], "Type": "Class"},
        "Task": {
            "Values": ["Classification"],
            "Type": "Class",
        },
        "Task": {"Values": ["Classification"], "Type": "Class",},
        "Device": {"Values": ["GPU"], "Type": "Tag"},
        "Scenario": {"Values": ["Nature"], "Type": "Tag"},
        "Description": {"Values": "", "Type": "Description"},
@@ -27,10 +24,7 @@ semantic_specs = [
    },
    {
        "Data": {"Values": ["Tabular"], "Type": "Class"},
        "Task": {
            "Values": ["Classification"],
            "Type": "Class",
        },
        "Task": {"Values": ["Classification"], "Type": "Class",},
        "Device": {"Values": ["GPU"], "Type": "Tag"},
        "Scenario": {"Values": ["Business", "Nature"], "Type": "Tag"},
        "Description": {"Values": "", "Type": "Description"},
@@ -38,10 +32,7 @@ semantic_specs = [
    },
    {
        "Data": {"Values": ["Tabular"], "Type": "Class"},
        "Task": {
            "Values": ["Classification"],
            "Type": "Class",
        },
        "Task": {"Values": ["Classification"], "Type": "Class",},
        "Device": {"Values": ["GPU"], "Type": "Tag"},
        "Scenario": {"Values": ["Business"], "Type": "Tag"},
        "Description": {"Values": "", "Type": "Description"},
@@ -51,10 +42,7 @@ semantic_specs = [

 user_senmantic = {
    "Data": {"Values": ["Tabular"], "Type": "Class"},
    "Task": {
        "Values": ["Classification"],
        "Type": "Class",
    },
    "Task": {"Values": ["Classification"], "Type": "Class",},
    "Device": {"Values": ["GPU"], "Type": "Tag"},
    "Scenario": {"Values": ["Business"], "Type": "Tag"},
    "Description": {"Values": "", "Type": "Description"},
--- a/learnware/config.py
+++ b/learnware/config.py
@@ -66,10 +66,7 @@ os.makedirs(LEARNWARE_FOLDER_POOL_PATH, exist_ok=True)
 os.makedirs(DATABASE_PATH, exist_ok=True)

 semantic_config = {
    "Data": {
        "Values": ["Tabular", "Image", "Video", "Text", "Audio"],
        "Type": "Class",
    },  # Choose only one class
    "Data": {"Values": ["Tabular", "Image", "Video", "Text", "Audio"], "Type": "Class",},  # Choose only one class
    "Task": {
        "Values": [
            "Classification",
@@ -82,10 +79,7 @@ semantic_config = {
        ],
        "Type": "Class",  # Choose only one class
    },
    "Device": {
        "Values": ["CPU", "GPU"],
        "Type": "Tag",
    },  # Choose one or more tags
    "Device": {"Values": ["CPU", "GPU"], "Type": "Tag",},  # Choose one or more tags
    "Scenario": {
        "Values": [
            "Business",
@@ -105,14 +99,8 @@ semantic_config = {
        ],
        "Type": "Tag",  # Choose one or more tags
    },
    "Description": {
        "Values": None,
        "Type": "Description",
    },
    "Name": {
        "Values": None,
        "Type": "Name",
    },
    "Description": {"Values": None, "Type": "Description",},
    "Name": {"Values": None, "Type": "Name",},
 }

 _DEFAULT_CONFIG = {
@@ -123,10 +111,7 @@ _DEFAULT_CONFIG = {
    "learnware_pool_path": LEARNWARE_POOL_PATH,
    "learnware_zip_pool_path": LEARNWARE_ZIP_POOL_PATH,
    "learnware_folder_pool_path": LEARNWARE_FOLDER_POOL_PATH,
    "learnware_folder_config": {
        "yaml_file": "learnware.yaml",
        "module_file": "__init__.py",
    },
    "learnware_folder_config": {"yaml_file": "learnware.yaml", "module_file": "__init__.py",},
    "database_path": DATABASE_PATH,
 }

--- a/learnware/learnware/init.py
+++ b/learnware/learnware/init.py
@@ -2,6 +2,7 @@ import os
 import copy

 from .base import Learnware
 from .reuse import BaseReuse
 from .utils import get_stat_spec_from_config, get_model_from_config
 from ..specification import Specification
 from ..utils import read_yaml_to_dict
@@ -29,10 +30,7 @@ def get_learnware_from_dirpath(id: str, semantic_spec: dict, learnware_dirpath:
        The contructed learnware object, return None if build failed
    """
    learnware_config = {
        "model": {
            "class_name": "Model",
            "kwargs": {},
        },
        "model": {"class_name": "Model", "kwargs": {},},
        "stat_specifications": [
            {
                "module_path": "learnware.specification",
--- a/learnware/learnware/reuse.py
+++ b/learnware/learnware/reuse.py
@@ -1,3 +1,188 @@
 import numpy as np
 from typing import Tuple, Any, List, Union, Dict
 from cvxopt import matrix, solvers
 from lightgbm import LGBMClassifier
 from sklearn.metrics import accuracy_score

 from learnware.learnware import Learnware
 import learnware.specification as specification
 from ..specification import RKMEStatSpecification
 from ..logger import get_module_logger

 logger = get_module_logger("BaseReuse")

 class BaseReuse:
    def __init__(self):
        pass
    """Baseline Multiple Learnware Reuse uing Job Selector Method"""
    
    def __init__(self, learnware_list: List[Learnware], herding_num: int = 100):
        self.learnware_list = learnware_list
        self.herding_num = herding_num
    
    def predict(self, user_data: np.ndarray) -> np.ndarray:
        """Give prediction for user data using baseline job-selector method

        Parameters
        ----------
        user_data : np.ndarray
            User's labeled raw data.

        Returns
        -------
        np.ndarray
            Prediction given by job-selector method
        """
        _, select_result = self.job_selector(user_data)
        selector_pred_y = np.zeros(len(user_data.shape[0]))

        for idx in range(len(self.learnware_list)):
            data_idx_list = np.where(select_result == idx)[0]
            if len(data_idx_list) > 0:
                selector_pred_y[data_idx_list] = self.learnware_list[idx].predict(data_idx_list)

        return selector_pred_y

    def job_selector(self, user_data: np.ndarray):
        """Train job selector based on user's data, which predicts which learnware in the pool should be selected

        Parameters
        ----------
        user_data : np.ndarray
            _description_
        """
        learnware_rkme_spec_list = [
            learnware.specification.get_stat_spec_by_name("RKMEStatSpecification") for learnware in self.learnware_list
        ]
        task_matrix = np.zeros((len(learnware_rkme_spec_list), len(learnware_rkme_spec_list)))

        for i in range(len(self.learnware_list)):
            task_rkme1 = learnware_rkme_spec_list[i]
            for j in range(i, len(self.learnware_list)):
                task_rkme2 = learnware_rkme_spec_list[j]
                task_matrix[i][j] = task_matrix[j][i] = task_rkme1.inner_prd(task_rkme2)
        
        task_mixture_weight = self._calculate_rkme_spec_mixture_weight(user_data, learnware_rkme_spec_list, task_matrix)

        herding_X, train_herding_X, val_herding_X = None, None, None
        herding_y, train_herding_y, val_herding_y = [], [], []
        for i in range(len(self.learnware_list)):
            task_spec = learnware_rkme_spec_list[i]
            task_herding_num = max(5, int(self.herding_num * task_mixture_weight[i]))
            task_val_num = task_herding_num // 5

            herding_X_i = task_spec.herding(task_herding_num).detach().cpu().numpy()
            train_X_i = herding_X_i[:-task_val_num]
            val_X_i = herding_X_i[task_val_num:]

            herding_X = herding_X_i if herding_X is None else np.concatenate((herding_X, herding_X_i), axis=0)
            train_herding_X = train_X_i if train_herding_X is None else np.concatenate((train_herding_X, train_X_i), axis=0)
            val_herding_X = val_X_i if val_herding_X is None else np.concatenate((val_herding_X, val_X_i), axis=0)

            herding_y += [i] * task_herding_num
            train_herding_y += [i] * (task_herding_num - task_val_num)
            val_herding_y += [i] * task_val_num
        
        herding_y = np.array(herding_y)
        train_herding_y = np.array(train_herding_y)
        val_herding_y = np.array(val_herding_y)

        # use herding samples to train a job selector
        job_selector = self._selector_grid_search(herding_X, herding_y, train_herding_X, train_herding_y, val_herding_X, val_herding_y, len(self.learnware_list))
        job_select_result = np.array(job_selector.predict(user_data))

        return job_selector, job_select_result


    def _calculate_rkme_spec_mixture_weight(
            self, user_data: np.ndarray, task_rkme_list: List[RKMEStatSpecification], task_rkme_matrix: np.ndarray
        ) -> List[float]:
        """_summary_

        Parameters
        ----------
        user_data : np.ndarray
            _description_
        task_rkme_list : List[RKMEStatSpecification]
            _description_
        task_rkme_matrix : np.ndarray
            _description_
        """
        task_num = len(task_rkme_list)
        user_rkme_spec = specification.utils.generate_rkme_spec(X=user_data, reduce=False)
        K = task_rkme_matrix
        v = np.array([user_rkme_spec.inner_prod(task_rkme) for task_rkme in task_rkme_list])

        P = matrix(K)
        q = matrix(-v)
        G = matrix(-np.eye(task_num))
        h = matrix(np.zeros((task_num, 1)))
        A = matrix(np.ones((1, task_num)))
        b = matrix(np.ones((1, 1)))
        solvers.options["show_progress"] = False
        sol = solvers.qp(P, q, G, h, A, b)
        task_mixture_weight = np.array(sol["x"]).reshape(-1)

        return task_mixture_weight
    
    def _selector_grid_search(
            org_train_x: np.ndarray, org_train_y: np.ndarray, train_x: np.ndarray, train_y: np.ndarray, val_x: np.ndarray, val_y: np.ndarray, num_class:int
        ) -> LGBMClassifier:
        """Train a LGBMClassifier as job selector using the herding data as training instances.

        Parameters
        ----------
        org_train_x : np.ndarray
            The original herding features.
        org_train_y : np.ndarray
            The original hearding labels(which are learnware indexes).
        train_x : np.ndarray
            Herding features used for training.
        train_y : np.ndarray
            Herding labels used for training.
        val_x : np.ndarray
            Herding features used for validation.
        val_y : np.ndarray
            Herding labels used for validation.
        num_class : int
            Total number of classes for the job selector(which is exactly the total number of learnwares to be reused).

        Returns
        -------
        LGBMClassifier
            The job selector model.
        """
        score_best = -1
        learning_rate = [0.01]
        max_depth = [66]
        params = (0, 0)

        for lr in learning_rate:
            for md in max_depth:
                model = LGBMClassifier(
                    max_depth=md,
                    learning_rate=lr,
                    n_estimators=2000,
                    objective="multiclass",
                    num_class=num_class,
                    booster="gbtree",
                    seed=0,
                )
                model.fit(train_x, train_y, eval_set=[(val_x, val_y)], verbose=100, early_stopping_rounds=300)
                pred_y = model.predict(org_train_x)
                score = accuracy_score(pred_y, org_train_y)
                
                if score > score_best:
                    score_best = score
                    params = (lr, md)

        model = LGBMClassifier(
            max_depth=params[1],
            learning_rate=params[0],
            n_estimators=2000,
            objective="multiclass",
            num_class=num_class,
            booster="gbtree",
            seed=0,
        )
        model.fit(org_train_x, org_train_y, eval_set=[(org_train_x, org_train_y)], verbose=100, early_stopping_rounds=300)
        
        return model
--- a/learnware/market/easy.py
+++ b/learnware/market/easy.py
@@ -119,10 +119,7 @@ class EasyMarket(BaseMarket):
            self.learnware_folder_list[id] = target_folder_dir
            self.count += 1
            add_learnware_to_db(
                id,
                semantic_spec=semantic_spec,
                zip_path=target_zip_dir,
                folder_path=target_folder_dir,
                id, semantic_spec=semantic_spec, zip_path=target_zip_dir, folder_path=target_folder_dir,
            )
            return id, True

@@ -241,7 +238,7 @@ class EasyMarket(BaseMarket):
        return intermediate_K, intermediate_C

    def _search_by_rkme_spec_mixture(
        self, learnware_list: List[Learnware], user_rkme: RKMEStatSpecification, search_num: int
        self, learnware_list: List[Learnware], user_rkme: RKMEStatSpecification, max_search_num: int = 5, score_cutoff: float = 0.1
    ) -> Tuple[List[float], List[Learnware]]:
        """Get search_num learnwares with their mixture weight from the given learnware_list

@@ -251,8 +248,10 @@ class EasyMarket(BaseMarket):
            The list of learnwares whose mixture approximates the user's rkme
        user_rkme : RKMEStatSpecification
            User RKME statistical specification
        search_num : int
            The number of the returned learnwares
        max_search_num : int
            The maximum number of the returned learnwares
        score_cutof: float
            The minimum mmd dist as threshold to stop further rkme_spec matching

        Returns
        -------
@@ -265,14 +264,14 @@ class EasyMarket(BaseMarket):
        if learnware_num == 0:
            return [], []
        if learnware_num < search_num:
            logger.warning("Available Learnware num less than search_num")
            logger.warning("Available Learnware num less than search_num!")
            search_num = learnware_num

        flag_list = [0 for i in range(learnware_num)]
        flag_list = [0 for _ in range(learnware_num)]
        mixture_list = []
        intermediate_K, intermediate_C = np.zeros((1, 1)), np.zeros((1, 1))

        for k in range(search_num):
        for k in range(max_search_num):
            idx_min, score_min = -1, -1
            weight_min = None
            mixture_list.append(None)
@@ -294,11 +293,14 @@ class EasyMarket(BaseMarket):
                    if idx_min == -1 or score < score_min:
                        idx_min, score_min, weight_min = idx, score, weight

            flag_list[idx_min] = 1
            mixture_list[-1] = learnware_list[idx_min]
            intermediate_K, intermediate_C = self._calculate_intermediate_K_and_C(
                mixture_list, user_rkme, intermediate_K, intermediate_C
            )
            if score_min >= score_cutoff:
                flag_list[idx_min] = 1
                mixture_list[-1] = learnware_list[idx_min]
                intermediate_K, intermediate_C = self._calculate_intermediate_K_and_C(
                    mixture_list, user_rkme, intermediate_K, intermediate_C
                )
            else:
                break

        return weight_min, mixture_list

@@ -335,8 +337,8 @@ class EasyMarket(BaseMarket):

        return sorted_dist_list, sorted_learnware_list

    def _search_by_semantic_tags(self, learnware_list: List[Learnware], user_info: BaseUserInfo) -> List[Learnware]:
        def match_semantic_tags(semantic_spec1, semantic_spec2):
    def _search_by_semantic_spec(self, learnware_list: List[Learnware], user_info: BaseUserInfo) -> List[Learnware]:
        def match_semantic_spec(semantic_spec1, semantic_spec2):
            if semantic_spec1.keys() != semantic_spec2.keys():
                # raise Exception("semantic_spec key error")
                logger.warning("semantic_spec key error!")
@@ -346,18 +348,20 @@ class EasyMarket(BaseMarket):
                    continue
                if len(semantic_spec2[key]["Values"]) == 0:
                    continue
                v1 = semantic_spec1[key]["Values"]
                v2 = semantic_spec2[key]["Values"]
                if semantic_spec1[key]["Type"] == "Class":
                    if isinstance(semantic_spec1[key]["Values"], list):
                        semantic_spec1[key]["Values"] = semantic_spec1[key]["Values"][0]
                    if isinstance(semantic_spec2[key]["Values"], list):
                        semantic_spec2[key]["Values"] = semantic_spec2[key]["Values"][0]
                    if semantic_spec1[key]["Values"] != semantic_spec2[key]["Values"]:
                    if isinstance(v1, list):
                        v1 = v1[0]
                    if isinstance(v2, list):
                        v2 = v2[0]
                    if v1 != v2:
                        return False
                elif semantic_spec1[key]["Type"] == "Tag":
                    if not (set(semantic_spec1[key]["Values"]) & set(semantic_spec2[key]["Values"])):
                    if not (set(v1) & set(v2)):
                        return False
                elif semantic_spec1[key]["Type"] == "Name":
                    if semantic_spec2[key]["Values"] not in semantic_spec1[key]["Values"]:
                    if v2 not in v1 and v2 not in semantic_spec1["Description"]["Values"]:
                        return False
            return True

@@ -365,7 +369,7 @@ class EasyMarket(BaseMarket):
        for learnware in learnware_list:
            learnware_semantic_spec = learnware.get_specification().get_semantic_spec()
            user_semantic_spec = user_info.get_semantic_spec()
            if match_semantic_tags(learnware_semantic_spec, user_semantic_spec):
            if match_semantic_spec(learnware_semantic_spec, user_semantic_spec):
                match_learnwares.append(learnware)
        return match_learnwares

@@ -389,7 +393,7 @@ class EasyMarket(BaseMarket):
            the third is the list of Learnware (mixture), the size is search_num
        """
        learnware_list = [self.learnware_list[key] for key in self.learnware_list]
        learnware_list = self._search_by_semantic_tags(learnware_list, user_info)
        learnware_list = self._search_by_semantic_spec(learnware_list, user_info)
        # learnware_list = list(set(learnware_list_tags + learnware_list_description))

        if "RKMEStatSpecification" not in user_info.stat_info:
--- a/learnware/specification/rkme.py
+++ b/learnware/specification/rkme.py
@@ -10,10 +10,13 @@ import codecs
 import random
 import numpy as np
 from cvxopt import solvers, matrix
 from collections import Counter
 from typing import Tuple, Any, List, Union, Dict

 from .base import BaseStatSpecification
 from ..logger import get_module_logger

 logger = get_module_logger("rkme")

 class RKMEStatSpecification(BaseStatSpecification):
    """Reduced-set Kernel Mean Embedding (RKME) Specification"""
@@ -196,6 +199,59 @@ class RKMEStatSpecification(BaseStatSpecification):

        Z = Z - step_size * grad_Z
        self.z = Z
    
    def _inner_prod_with_X(self, X: Any) -> float:
        """Compute the inner product between RKME specification and X

        Parameters
        ----------
        X : np.ndarray or torch.tensor
            Raw data in np.ndarray format or torch.tensor format.

        Returns
        -------
        float
            The inner product between RKME specification and X
        """
        beta = self.beta.reshape(1, -1).double().to(self.device)
        Z = self.z.double().to(self.device)
        if not torch.is_tensor(X):
            X = torch.from_numpy(X)
        X = X.to(self.device).double()
        
        v = torch_rbf_kernel(Z, X, self.gamma) * beta.double
        v = torch.sum(v, axis = 0)
        return v.detach().cpu().numpy()
    
    def _sampling_candidates(self, N: int) -> np.ndarray:
        """Generate a large set of candidates as preparation for herding

        Parameters
        ----------
        N : int
            The number of herding candidates.

        Returns
        -------
        np.ndarray
            The herding candidates.
        """
        beta = self.beta
        beta[beta < 0] = 0  # currently we cannot use negative weight
        beta = beta / torch.sum(beta)
        sample_assign = torch.multinomial(beta, N, replacement=True)

        sample_list = []
        for i, n in Counter(np.array(sample_assign.cpu())).items():
            for _ in range(n):
                sample_list.append(torch.normal(mean=self.z[i], std=0.25).reshape(1, -1))
        if len(sample_list) > 1:
            return torch.cat(sample_list, axis=0)
        elif len(sample_list) == 1:
            return sample_list[0]
        else:
            logger.warning("Not enough candidates for herding!")


    def inner_prod(self, Phi2: RKMEStatSpecification) -> float:
        """Compute the inner product between two RKME specifications
@@ -226,7 +282,7 @@ class RKMEStatSpecification(BaseStatSpecification):
        Phi2 : RKMEStatSpecification
            The other RKME specification.
        omit_term1 : bool, optional
            True if the inner product of self with itself can be omitted, by default False
            True if the inner product of self with itself can be omitted, by default False.
        """
        if omit_term1:
            term1 = 0
@@ -236,6 +292,34 @@ class RKMEStatSpecification(BaseStatSpecification):
        term3 = Phi2.inner_prod(Phi2)

        return float(term1 - 2 * term2 + term3)
    
    def herding(self, T: int) -> np.ndarray:
        """Iteratively sample examples from an unknown distribution with the help of its RKME specification

        Parameters
        ----------
        T : int
            Total iteration number for sampling.

        Returns
        -------
        np.ndarray
            A collection of examples which approximate the unknown distribution.
        """
        Nstart = 100 * T
        Xstart = self._sampling_candidates(Nstart).to(self.device)
        D = self.z[0].shape[0]
        S = torch.zeros((T, D)).to(self.device)
        fsX = torch.from_numpy(self._inner_prod_with_X(Xstart)).to(self.device)
        fsS = torch.zeros(Nstart).to(self.device)
        for i in range(T):
            if i > 0:
                fsS = torch.sum(torch_rbf_kernel(S[:i, :], Xstart, self.gamma), axis=0)
            fs = (i + 1) * fsX - fsS
            idx = torch.argmax(fs)
            S[i, :] = Xstart[idx, :]
        
        return S

    def save(self, filepath: str):
        """Save the computed RKME specification to a specified path in JSON format.
@@ -255,9 +339,7 @@ class RKMEStatSpecification(BaseStatSpecification):
        rkme_to_save["beta"] = rkme_to_save["beta"].tolist()
        rkme_to_save["device"] = "gpu" if rkme_to_save["cuda_idx"] != -1 else "cpu"
        json.dump(
            rkme_to_save,
            codecs.open(save_path, "w", encoding="utf-8"),
            separators=(",", ":"),
            rkme_to_save, codecs.open(save_path, "w", encoding="utf-8"), separators=(",", ":"),
        )

    def load(self, filepath: str) -> bool:
@@ -345,7 +427,7 @@ def torch_rbf_kernel(x1, x2, gamma) -> torch.Tensor:
    """
    x1 = x1.double()
    x2 = x2.double()
    X12norm = torch.sum(x1**2, 1, keepdim=True) - 2 * x1 @ x2.T + torch.sum(x2**2, 1, keepdim=True).T
    X12norm = torch.sum(x1 ** 2, 1, keepdim=True) - 2 * x1 @ x2.T + torch.sum(x2 ** 2, 1, keepdim=True).T
    return torch.exp(-X12norm * gamma)


--- a/setup.py
+++ b/setup.py
@@ -45,6 +45,8 @@ REQUIRED = [
    "joblib>=1.2.0",
    "pyyaml>=6.0",
    "fire>=0.5.0",
    "sklearn>=1.0.2",
    "lightgbm>=3.3.5"
 ]

 here = os.path.abspath(os.path.dirname(__file__))