
Merge pull request #9 from Learnware-LAMDA/bixd/dev

Fix bugs in the container when installing via pip
tags/v0.3.2
bxdd GitHub 2 years ago
commit 5d2dcc76dc
17 changed files with 96 additions and 50 deletions
  1. examples/dataset_image_workflow/get_data.py (+5, -2)
  2. examples/dataset_m5_workflow/m5/utils.py (+3, -3)
  3. examples/dataset_m5_workflow/upload.py (+9, -2)
  4. examples/dataset_pfs_workflow/pfs/pfs_cross_transfer.py (+2, -2)
  5. examples/dataset_pfs_workflow/upload.py (+9, -2)
  6. examples/workflow_by_code/main.py (+4, -1)
  7. learnware/client/container.py (+0, -1)
  8. learnware/client/learnware_client.py (+19, -5)
  9. learnware/client/package_utils.py (+2, -5)
  10. learnware/client/utils.py (+5, -8)
  11. learnware/config.py (+16, -4)
  12. learnware/learnware/__init__.py (+4, -1)
  13. learnware/learnware/reuse.py (+9, -9)
  14. learnware/specification/base.py (+1, -1)
  15. learnware/specification/rkme.py (+4, -2)
  16. tests/test_learnware_client/test_reuse.py (+0, -1)
  17. tests/test_workflow/test_workflow.py (+4, -1)

examples/dataset_image_workflow/get_data.py (+5, -2)

@@ -192,7 +192,7 @@ def get_zca_matrix(X, reg_coef=0.1):
 def layernorm_data(X):
     X_processed = X - torch.mean(X, [1, 2, 3], keepdim=True)
-    X_processed = X_processed / torch.sqrt(torch.sum(X_processed ** 2, [1, 2, 3], keepdim=True))
+    X_processed = X_processed / torch.sqrt(torch.sum(X_processed**2, [1, 2, 3], keepdim=True))
     return X_processed

@@ -240,7 +240,10 @@ def augment(images, dc_aug_param, device):
     def scalefun(i):
         h = int((np.random.uniform(1 - scale, 1 + scale)) * shape[2])
         w = int((np.random.uniform(1 - scale, 1 + scale)) * shape[2])
-        tmp = F.interpolate(images[i : i + 1], [h, w],)[0]
+        tmp = F.interpolate(
+            images[i : i + 1],
+            [h, w],
+        )[0]
         mhw = max(h, w, shape[2], shape[3])
         im_ = torch.zeros(shape[1], mhw, mhw, dtype=torch.float, device=device)
         r = int((mhw - h) / 2)


examples/dataset_m5_workflow/m5/utils.py (+3, -3)

@@ -70,7 +70,7 @@ def measure_aux_algo(idx, test_sample, model):
 # Simple "Memory profilers" to see memory usage
 def get_memory_usage():
-    return np.round(psutil.Process(os.getpid()).memory_info()[0] / 2.0 ** 30, 2)
+    return np.round(psutil.Process(os.getpid()).memory_info()[0] / 2.0**30, 2)

 def sizeof_fmt(num, suffix="B"):

@@ -84,7 +84,7 @@ def sizeof_fmt(num, suffix="B"):
 # Memory Reducer
 def reduce_mem_usage(df, float16_flag=True, verbose=True):
     numerics = ["int16", "int32", "int64", "float16", "float32", "float64"]
-    start_mem = df.memory_usage().sum() / 1024 ** 2
+    start_mem = df.memory_usage().sum() / 1024**2
     for col in df.columns:
         col_type = df[col].dtypes
         if col_type in numerics:

@@ -106,7 +106,7 @@ def reduce_mem_usage(df, float16_flag=True, verbose=True):
                 df[col] = df[col].astype(np.float32)
             else:
                 df[col] = df[col].astype(np.float64)
-    end_mem = df.memory_usage().sum() / 1024 ** 2
+    end_mem = df.memory_usage().sum() / 1024**2
     if verbose:
         print(
             "Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)".format(


examples/dataset_m5_workflow/upload.py (+9, -2)

@@ -69,8 +69,15 @@ def main():
         }
         res = session.post(
             submit_url,
-            data={"semantic_specification": json.dumps(semantic_specification),},
-            files={"learnware_file": open(os.path.join(os.path.abspath("."), "learnware_pool", learnware), "rb",)},
+            data={
+                "semantic_specification": json.dumps(semantic_specification),
+            },
+            files={
+                "learnware_file": open(
+                    os.path.join(os.path.abspath("."), "learnware_pool", learnware),
+                    "rb",
+                )
+            },
         )
         assert json.loads(res.text)["code"] == 0, "Upload error"




examples/dataset_pfs_workflow/pfs/pfs_cross_transfer.py (+2, -2)

@@ -67,7 +67,7 @@ def get_split_errs(algo):
     for tmp in range(len(proportion_list)):
         model = lgb.LGBMModel(
             boosting_type="gbdt",
-            num_leaves=2 ** 7 - 1,
+            num_leaves=2**7 - 1,
             learning_rate=0.01,
             objective="rmse",
             metric="rmse",

@@ -119,7 +119,7 @@ def get_errors(algo):
     if algo == "lgb":
         model = lgb.LGBMModel(
             boosting_type="gbdt",
-            num_leaves=2 ** 7 - 1,
+            num_leaves=2**7 - 1,
             learning_rate=0.01,
             objective="rmse",
             metric="rmse",


examples/dataset_pfs_workflow/upload.py (+9, -2)

@@ -72,8 +72,15 @@ def main():
         }
         res = session.post(
             submit_url,
-            data={"semantic_specification": json.dumps(semantic_specification),},
-            files={"learnware_file": open(os.path.join(os.path.abspath("."), "learnware_pool", learnware), "rb",)},
+            data={
+                "semantic_specification": json.dumps(semantic_specification),
+            },
+            files={
+                "learnware_file": open(
+                    os.path.join(os.path.abspath("."), "learnware_pool", learnware),
+                    "rb",
+                )
+            },
         )
         assert json.loads(res.text)["code"] == 0, "Upload error"




examples/workflow_by_code/main.py (+4, -1)

@@ -19,7 +19,10 @@ curr_root = os.path.dirname(os.path.abspath(__file__))
 user_semantic = {
     "Data": {"Values": ["Table"], "Type": "Class"},
-    "Task": {"Values": ["Classification"], "Type": "Class",},
+    "Task": {
+        "Values": ["Classification"],
+        "Type": "Class",
+    },
     "Library": {"Values": ["Scikit-learn"], "Type": "Class"},
     "Scenario": {"Values": ["Education"], "Type": "Tag"},
     "Description": {"Values": "", "Type": "String"},


learnware/client/container.py (+0, -1)

@@ -18,7 +18,6 @@ logger = get_module_logger(module_name="client_container")
 class ModelEnvContainer(BaseModel):
     def __init__(self, model_config: dict, learnware_zippath: str):
-
         self.model_script = os.path.join(C.package_path, "client", "scripts", "run_model.py")
         self.model_config = model_config
         self.conda_env = f"learnware_{shortuuid.uuid()}"


learnware/client/learnware_client.py (+19, -5)

@@ -104,8 +104,13 @@ class LearnwareClient:
         for chunk in file_chunks(learnware_file):
             response = requests.post(
                 url_upload,
-                files={"chunk_file": chunk,},
-                data={"file_hash": file_hash, "chunk_begin": begin,},
+                files={
+                    "chunk_file": chunk,
+                },
+                data={
+                    "file_hash": file_hash,
+                    "chunk_begin": begin,
+                },
                 headers=self.headers,
             )

@@ -123,7 +128,10 @@ class LearnwareClient:
         response = requests.post(
             url_add,
-            json={"file_hash": file_hash, "semantic_specification": json.dumps(semantic_specification),},
+            json={
+                "file_hash": file_hash,
+                "semantic_specification": json.dumps(semantic_specification),
+            },
             headers=self.headers,
         )

@@ -137,7 +145,14 @@ class LearnwareClient:
     def download_learnware(self, learnware_id, save_path):
         url = f"{self.host}/engine/download_learnware"
-        response = requests.get(url, params={"learnware_id": learnware_id,}, headers=self.headers, stream=True,)
+        response = requests.get(
+            url,
+            params={
+                "learnware_id": learnware_id,
+            },
+            headers=self.headers,
+            stream=True,
+        )
         if response.status_code != 200:
             raise Exception("download failed: " + json.dumps(response.json()))

@@ -269,7 +284,6 @@ class LearnwareClient:
     def create_semantic_specification(
         self, name, description, data_type, task_type, library_type, senarioes, input_description, output_description
     ):
-
         semantic_specification = dict()
         semantic_specification["Input"] = input_description
         semantic_specification["Output"] = output_description


learnware/client/package_utils.py (+2, -5)

@@ -24,8 +24,7 @@ def try_to_run(args, timeout=5, retry=5):
 def parse_pip_requirement(line: str):
-    """Parse pip requirement line to package name
-    """
+    """Parse pip requirement line to package name"""

     line = line.strip()

@@ -47,8 +46,7 @@ def parse_pip_requirement(line: str):
 def read_pip_packages_from_requirements(requirements_file: str) -> List[str]:
-    """Read requiremnts.txt and parse it to list
-    """
+    """Read requiremnts.txt and parse it to list"""

     packages = []
     lines = []

@@ -174,7 +172,6 @@ def filter_nonexist_conda_packages_file(yaml_file: str, output_yaml_file: str):
 def filter_nonexist_pip_packages_file(requirements_file: str, output_file: str):
-
     packages, lines = read_pip_packages_from_requirements(requirements_file)

     exist_packages, nonexist_packages = filter_nonexist_pip_packages(packages)


learnware/client/utils.py (+5, -8)

@@ -10,14 +10,11 @@ logger = get_module_logger(module_name="client_utils")
 def system_execute(args, timeout=None):
-
-    com_process = subprocess.run(
-        args, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, timeout=timeout
-    )
+    com_process = subprocess.run(args, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, timeout=timeout)
     try:
         com_process.check_returncode()
     except subprocess.CalledProcessError as err:
-        print(com_process.stderr)
+        print("System Execute Error:", str(com_process.stderr))
         raise err

@@ -27,14 +24,14 @@ def remove_enviroment(conda_env):
 def install_environment(zip_path, conda_env):
     """Install environment of a learnware
     Parameters
     ----------
     zip_path : str
         Path of the learnware zip file
     conda_env : str
         a new conda environment will be created with the given name;
     Raises
     ------
     Exception

@@ -59,7 +56,7 @@ def install_environment(zip_path, conda_env):
             z_file.extract(member="requirements.txt", path=tempdir)
             requirements_path: str = os.path.join(tempdir, "requirements.txt")
             requirements_path_filter: str = os.path.join(tempdir, "requirements_filter.txt")
-            logger.info(f"checking the avaliabe pip packages for {yaml_path}")
+            logger.info(f"checking the avaliabe pip packages for {conda_env}")
             filter_nonexist_pip_packages_file(
                 requirements_file=requirements_path, output_file=requirements_path_filter
             )
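The system_execute change above is the part of this commit that surfaces pip/conda errors while a learnware container environment is being installed. For convenience, here is a self-contained version of the patched helper as it appears in the diff; the demo call under __main__ is illustrative only, not from the repository.

import subprocess


def system_execute(args, timeout=None):
    # Run the command, discard stdout, capture stderr, and re-raise with a
    # readable error message if the command exits non-zero.
    com_process = subprocess.run(args, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, timeout=timeout)
    try:
        com_process.check_returncode()
    except subprocess.CalledProcessError as err:
        print("System Execute Error:", str(com_process.stderr))
        raise err


if __name__ == "__main__":
    # Illustrative call; the real callers pass conda/pip commands.
    system_execute(["python", "-c", "print('ok')"])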


learnware/config.py (+16, -4)

@@ -72,7 +72,10 @@ os.makedirs(DATABASE_PATH, exist_ok=True)
 os.makedirs(STDOUT_PATH, exist_ok=True)

 semantic_config = {
-    "Data": {"Values": ["Table", "Image", "Video", "Text", "Audio"], "Type": "Class",},  # Choose only one class
+    "Data": {
+        "Values": ["Table", "Image", "Video", "Text", "Audio"],
+        "Type": "Class",
+    },  # Choose only one class
     "Task": {
         "Values": [
             "Classification",

@@ -113,8 +116,14 @@ semantic_config = {
         ],
         "Type": "Tag",  # Choose one or more tags
     },
-    "Description": {"Values": None, "Type": "String",},
-    "Name": {"Values": None, "Type": "String",},
+    "Description": {
+        "Values": None,
+        "Type": "String",
+    },
+    "Name": {
+        "Values": None,
+        "Type": "String",
+    },
 }

@@ -128,7 +137,10 @@ _DEFAULT_CONFIG = {
     "learnware_pool_path": LEARNWARE_POOL_PATH,
     "learnware_zip_pool_path": LEARNWARE_ZIP_POOL_PATH,
     "learnware_folder_pool_path": LEARNWARE_FOLDER_POOL_PATH,
-    "learnware_folder_config": {"yaml_file": "learnware.yaml", "module_file": "__init__.py",},
+    "learnware_folder_config": {
+        "yaml_file": "learnware.yaml",
+        "module_file": "__init__.py",
+    },
     "database_url": f"sqlite:///{DATABASE_PATH}",
     "max_reduced_set_size": 1310720,
     "backend_host": "http://www.lamda.nju.edu.cn/learnware/api",


learnware/learnware/__init__.py (+4, -1)

@@ -31,7 +31,10 @@ def get_learnware_from_dirpath(id: str, semantic_spec: dict, learnware_dirpath:
         The contructed learnware object, return None if build failed
     """
     learnware_config = {
-        "model": {"class_name": "Model", "kwargs": {},},
+        "model": {
+            "class_name": "Model",
+            "kwargs": {},
+        },
         "stat_specifications": [
             {
                 "module_path": "learnware.specification",


learnware/learnware/reuse.py (+9, -9)

@@ -302,7 +302,7 @@ class AveragingReuser(BaseReuser):
             pred_y = pred_y.detach().cpu().numpy()
             if not isinstance(pred_y, np.ndarray):
                 raise TypeError(f"Model output must be np.ndarray or torch.Tensor")
             if len(pred_y.shape) == 1:
                 pred_y = pred_y.reshape(-1, 1)
             else:

@@ -312,7 +312,7 @@ class AveragingReuser(BaseReuser):
             elif self.mode == "vote_by_prob":
                 pred_y = softmax(pred_y, axis=-1)
             preds.append(pred_y)
         if self.mode == "vote_by_prob":
             return np.mean(preds, axis=0)
         else:

@@ -325,9 +325,9 @@ class AveragingReuser(BaseReuser):
 class EnsemblePruningReuser(BaseReuser):
     """
-    Baseline Multiple Learnware Reuser uing Marign Distribution guided multi-objective evolutionary Ensemble Pruning (MDEP) Method.
-    References: [1] Yu-Chang Wu, Yi-Xiao He, Chao Qian, and Zhi-Hua Zhou. Multi-objective Evolutionary Ensemble Pruning Guided by Margin Distribution. In: Proceedings of the 17th International Conference on Parallel Problem Solving from Nature (PPSN'22), Dortmund, Germany, 2022.
+    Baseline Multiple Learnware Reuser uing Marign Distribution guided multi-objective evolutionary Ensemble Pruning (MDEP) Method.
+    References: [1] Yu-Chang Wu, Yi-Xiao He, Chao Qian, and Zhi-Hua Zhou. Multi-objective Evolutionary Ensemble Pruning Guided by Margin Distribution. In: Proceedings of the 17th International Conference on Parallel Problem Solving from Nature (PPSN'22), Dortmund, Germany, 2022.
     """

     def __init__(self, learnware_list: List[Learnware], mode: str):

@@ -359,7 +359,7 @@ class EnsemblePruningReuser(BaseReuser):
             - The ground truth of validation set.
             - The dimension is (number of instances, 1).
         maxgen : int
-            - The maximum number of iteration rounds.
+            - The maximum number of iteration rounds.

         Returns
         -------

@@ -443,7 +443,7 @@ class EnsemblePruningReuser(BaseReuser):
             - The ground truth of validation set.
             - The dimension is (number of instances, 1).
         maxgen : int
-            - The maximum number of iteration rounds.
+            - The maximum number of iteration rounds.

         Returns
         -------

@@ -557,7 +557,7 @@ class EnsemblePruningReuser(BaseReuser):
             - The ground truth of validation set.
             - The dimension is (number of instances, 1).
         maxgen : int
-            - The maximum number of iteration rounds.
+            - The maximum number of iteration rounds.

         Returns
         -------

@@ -645,7 +645,7 @@ class EnsemblePruningReuser(BaseReuser):
     def _get_predict(self, X: np.ndarray, selected_idxes: List[int]):
         """Concatenate the output of learnwares corresponding to selected_idxes
         Parameters
         ----------
         X : np.ndarray


learnware/specification/base.py (+1, -1)

@@ -74,7 +74,7 @@ class Specification:
     def update_stat_spec(self, *args, **kwargs):
         """Update the statistical specification by the way of 'name'='value'
-        or use class name as default name
+        or use class name as default name
         """
         for _v in args:
             self.stat_spec[_v.__class__.__name__] = _v


learnware/specification/rkme.py (+4, -2)

@@ -428,7 +428,9 @@ class RKMEStatSpecification(BaseStatSpecification):
         rkme_to_save["beta"] = rkme_to_save["beta"].tolist()
         rkme_to_save["device"] = "gpu" if rkme_to_save["cuda_idx"] != -1 else "cpu"
         json.dump(
-            rkme_to_save, codecs.open(save_path, "w", encoding="utf-8"), separators=(",", ":"),
+            rkme_to_save,
+            codecs.open(save_path, "w", encoding="utf-8"),
+            separators=(",", ":"),
         )

     def load(self, filepath: str) -> bool:

@@ -521,7 +523,7 @@ def torch_rbf_kernel(x1, x2, gamma) -> torch.Tensor:
     """
     x1 = x1.double()
     x2 = x2.double()
-    X12norm = torch.sum(x1 ** 2, 1, keepdim=True) - 2 * x1 @ x2.T + torch.sum(x2 ** 2, 1, keepdim=True).T
+    X12norm = torch.sum(x1**2, 1, keepdim=True) - 2 * x1 @ x2.T + torch.sum(x2**2, 1, keepdim=True).T
    return torch.exp(-X12norm * gamma)
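For reference, the X12norm expression in torch_rbf_kernel expands the squared Euclidean distance ||a - b||^2 = ||a||^2 - 2 a·b + ||b||^2, so the function returns k(a, b) = exp(-gamma * ||a - b||^2) for every pair of rows. A small self-contained check of the reformatted line against torch.cdist follows; the check itself is illustrative, not part of the repository.

import torch


def torch_rbf_kernel(x1, x2, gamma):
    # Pairwise RBF kernel via the expanded squared-distance formula.
    x1, x2 = x1.double(), x2.double()
    X12norm = torch.sum(x1**2, 1, keepdim=True) - 2 * x1 @ x2.T + torch.sum(x2**2, 1, keepdim=True).T
    return torch.exp(-X12norm * gamma)


a, b = torch.randn(4, 3), torch.randn(5, 3)
reference = torch.exp(-0.5 * torch.cdist(a.double(), b.double()) ** 2)  # gamma = 0.5
assert torch.allclose(torch_rbf_kernel(a, b, gamma=0.5), reference, atol=1e-6)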






tests/test_learnware_client/test_reuse.py (+0, -1)

@@ -33,7 -33,6 @@ if __name__ == "__main__":
         learnware_list.append(learnware)

     with LearnwaresContainer(learnware_list, zip_paths) as env_container:
-
         learnware_list = env_container.get_learnware_list_with_container()
         reuser = AveragingReuser(learnware_list, mode="vote")
         input_array = np.random.randint(0, 3, size=(20, 9))


tests/test_workflow/test_workflow.py (+4, -1)

@@ -19,7 +19,10 @@ curr_root = os.path.dirname(os.path.abspath(__file__))
 user_semantic = {
     "Data": {"Values": ["Tabular"], "Type": "Class"},
-    "Task": {"Values": ["Classification"], "Type": "Class",},
+    "Task": {
+        "Values": ["Classification"],
+        "Type": "Class",
+    },
     "Library": {"Values": ["Scikit-learn"], "Type": "Class"},
     "Scenario": {"Values": ["Education"], "Type": "Tag"},
     "Description": {"Values": "", "Type": "String"},

