| @@ -0,0 +1,7 @@ | |||
| [flake8] | |||
| max-line-length = 120 | |||
| ignore = | |||
| E203,E501,F841,W503 | |||
| per-file-ignores = | |||
| __init__.py: F401 | |||
| ./learnware/utils/import_utils.py: F401 | |||
| @@ -0,0 +1,131 @@ | |||
| # Contributor Covenant Code of Conduct | |||
| ## Our Pledge | |||
| We as members, contributors, and leaders pledge to make participation in our | |||
| community a harassment-free experience for everyone, regardless of age, body | |||
| size, visible or invisible disability, ethnicity, sex characteristics, gender | |||
| identity and expression, level of experience, education, socio-economic status, | |||
| nationality, personal appearance, race, caste, color, religion, or sexual | |||
| identity and orientation. | |||
| We pledge to act and interact in ways that contribute to an open, welcoming, | |||
| diverse, inclusive, and healthy community. | |||
| ## Our Standards | |||
| Examples of behavior that contributes to a positive environment for our | |||
| community include: | |||
| * Demonstrating empathy and kindness toward other people | |||
| * Being respectful of differing opinions, viewpoints, and experiences | |||
| * Giving and gracefully accepting constructive feedback | |||
| * Accepting responsibility and apologizing to those affected by our mistakes, | |||
| and learning from the experience | |||
| * Focusing on what is best not just for us as individuals, but for the overall | |||
| community | |||
| Examples of unacceptable behavior include: | |||
| * The use of sexualized language or imagery, and sexual attention or advances of | |||
| any kind | |||
| * Trolling, insulting or derogatory comments, and personal or political attacks | |||
| * Public or private harassment | |||
| * Publishing others' private information, such as a physical or email address, | |||
| without their explicit permission | |||
| * Other conduct which could reasonably be considered inappropriate in a | |||
| professional setting | |||
| ## Enforcement Responsibilities | |||
| Community leaders are responsible for clarifying and enforcing our standards of | |||
| acceptable behavior and will take appropriate and fair corrective action in | |||
| response to any behavior that they deem inappropriate, threatening, offensive, | |||
| or harmful. | |||
| Community leaders have the right and responsibility to remove, edit, or reject | |||
| comments, commits, code, wiki edits, issues, and other contributions that are | |||
| not aligned to this Code of Conduct, and will communicate reasons for moderation | |||
| decisions when appropriate. | |||
| ## Scope | |||
| This Code of Conduct applies within all community spaces, and also applies when | |||
| an individual is officially representing the community in public spaces. | |||
| Examples of representing our community include using an official email address, | |||
| posting via an official social media account, or acting as an appointed | |||
| representative at an online or offline event. | |||
| ## Enforcement | |||
| Instances of abusive, harassing, or otherwise unacceptable behavior may be | |||
| reported to the community leaders responsible for enforcement at bmwu-support@lamda.nju.edu.cn. | |||
| All complaints will be reviewed and investigated promptly and fairly. | |||
| All community leaders are obligated to respect the privacy and security of the | |||
| reporter of any incident. | |||
| ## Enforcement Guidelines | |||
| Community leaders will follow these Community Impact Guidelines in determining | |||
| the consequences for any action they deem in violation of this Code of Conduct: | |||
| ### 1. Correction | |||
| **Community Impact**: Use of inappropriate language or other behavior deemed | |||
| unprofessional or unwelcome in the community. | |||
| **Consequence**: A private, written warning from community leaders, providing | |||
| clarity around the nature of the violation and an explanation of why the | |||
| behavior was inappropriate. A public apology may be requested. | |||
| ### 2. Warning | |||
| **Community Impact**: A violation through a single incident or series of | |||
| actions. | |||
| **Consequence**: A warning with consequences for continued behavior. No | |||
| interaction with the people involved, including unsolicited interaction with | |||
| those enforcing the Code of Conduct, for a specified period of time. This | |||
| includes avoiding interactions in community spaces as well as external channels | |||
| like social media. Violating these terms may lead to a temporary or permanent | |||
| ban. | |||
| ### 3. Temporary Ban | |||
| **Community Impact**: A serious violation of community standards, including | |||
| sustained inappropriate behavior. | |||
| **Consequence**: A temporary ban from any sort of interaction or public | |||
| communication with the community for a specified period of time. No public or | |||
| private interaction with the people involved, including unsolicited interaction | |||
| with those enforcing the Code of Conduct, is allowed during this period. | |||
| Violating these terms may lead to a permanent ban. | |||
| ### 4. Permanent Ban | |||
| **Community Impact**: Demonstrating a pattern of violation of community | |||
| standards, including sustained inappropriate behavior, harassment of an | |||
| individual, or aggression toward or disparagement of classes of individuals. | |||
| **Consequence**: A permanent ban from any sort of public interaction within the | |||
| community. | |||
| ## Attribution | |||
| This Code of Conduct is adapted from the [Contributor Covenant][homepage], | |||
| version 2.1, available at | |||
| [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. | |||
| Community Impact Guidelines were inspired by | |||
| [Mozilla's code of conduct enforcement ladder][Mozilla CoC]. | |||
| For answers to common questions about this code of conduct, see the FAQ at | |||
| [https://www.contributor-covenant.org/faq][FAQ]. Translations are available at | |||
| [https://www.contributor-covenant.org/translations][translations]. | |||
| [homepage]: https://www.contributor-covenant.org | |||
| [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html | |||
| [Mozilla CoC]: https://github.com/mozilla/diversity | |||
| [FAQ]: https://www.contributor-covenant.org/faq | |||
| [translations]: https://www.contributor-covenant.org/translations | |||
| @@ -0,0 +1,203 @@ | |||
| Copyright 2024 LAMDA Beimingwu. All rights reserved. | |||
| Apache License | |||
| Version 2.0, January 2004 | |||
| http://www.apache.org/licenses/ | |||
| TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION | |||
| 1. Definitions. | |||
| "License" shall mean the terms and conditions for use, reproduction, | |||
| and distribution as defined by Sections 1 through 9 of this document. | |||
| "Licensor" shall mean the copyright owner or entity authorized by | |||
| the copyright owner that is granting the License. | |||
| "Legal Entity" shall mean the union of the acting entity and all | |||
| other entities that control, are controlled by, or are under common | |||
| control with that entity. For the purposes of this definition, | |||
| "control" means (i) the power, direct or indirect, to cause the | |||
| direction or management of such entity, whether by contract or | |||
| otherwise, or (ii) ownership of fifty percent (50%) or more of the | |||
| outstanding shares, or (iii) beneficial ownership of such entity. | |||
| "You" (or "Your") shall mean an individual or Legal Entity | |||
| exercising permissions granted by this License. | |||
| "Source" form shall mean the preferred form for making modifications, | |||
| including but not limited to software source code, documentation | |||
| source, and configuration files. | |||
| "Object" form shall mean any form resulting from mechanical | |||
| transformation or translation of a Source form, including but | |||
| not limited to compiled object code, generated documentation, | |||
| and conversions to other media types. | |||
| "Work" shall mean the work of authorship, whether in Source or | |||
| Object form, made available under the License, as indicated by a | |||
| copyright notice that is included in or attached to the work | |||
| (an example is provided in the Appendix below). | |||
| "Derivative Works" shall mean any work, whether in Source or Object | |||
| form, that is based on (or derived from) the Work and for which the | |||
| editorial revisions, annotations, elaborations, or other modifications | |||
| represent, as a whole, an original work of authorship. For the purposes | |||
| of this License, Derivative Works shall not include works that remain | |||
| separable from, or merely link (or bind by name) to the interfaces of, | |||
| the Work and Derivative Works thereof. | |||
| "Contribution" shall mean any work of authorship, including | |||
| the original version of the Work and any modifications or additions | |||
| to that Work or Derivative Works thereof, that is intentionally | |||
| submitted to Licensor for inclusion in the Work by the copyright owner | |||
| or by an individual or Legal Entity authorized to submit on behalf of | |||
| the copyright owner. For the purposes of this definition, "submitted" | |||
| means any form of electronic, verbal, or written communication sent | |||
| to the Licensor or its representatives, including but not limited to | |||
| communication on electronic mailing lists, source code control systems, | |||
| and issue tracking systems that are managed by, or on behalf of, the | |||
| Licensor for the purpose of discussing and improving the Work, but | |||
| excluding communication that is conspicuously marked or otherwise | |||
| designated in writing by the copyright owner as "Not a Contribution." | |||
| "Contributor" shall mean Licensor and any individual or Legal Entity | |||
| on behalf of whom a Contribution has been received by Licensor and | |||
| subsequently incorporated within the Work. | |||
| 2. Grant of Copyright License. Subject to the terms and conditions of | |||
| this License, each Contributor hereby grants to You a perpetual, | |||
| worldwide, non-exclusive, no-charge, royalty-free, irrevocable | |||
| copyright license to reproduce, prepare Derivative Works of, | |||
| publicly display, publicly perform, sublicense, and distribute the | |||
| Work and such Derivative Works in Source or Object form. | |||
| 3. Grant of Patent License. Subject to the terms and conditions of | |||
| this License, each Contributor hereby grants to You a perpetual, | |||
| worldwide, non-exclusive, no-charge, royalty-free, irrevocable | |||
| (except as stated in this section) patent license to make, have made, | |||
| use, offer to sell, sell, import, and otherwise transfer the Work, | |||
| where such license applies only to those patent claims licensable | |||
| by such Contributor that are necessarily infringed by their | |||
| Contribution(s) alone or by combination of their Contribution(s) | |||
| with the Work to which such Contribution(s) was submitted. If You | |||
| institute patent litigation against any entity (including a | |||
| cross-claim or counterclaim in a lawsuit) alleging that the Work | |||
| or a Contribution incorporated within the Work constitutes direct | |||
| or contributory patent infringement, then any patent licenses | |||
| granted to You under this License for that Work shall terminate | |||
| as of the date such litigation is filed. | |||
| 4. Redistribution. You may reproduce and distribute copies of the | |||
| Work or Derivative Works thereof in any medium, with or without | |||
| modifications, and in Source or Object form, provided that You | |||
| meet the following conditions: | |||
| (a) You must give any other recipients of the Work or | |||
| Derivative Works a copy of this License; and | |||
| (b) You must cause any modified files to carry prominent notices | |||
| stating that You changed the files; and | |||
| (c) You must retain, in the Source form of any Derivative Works | |||
| that You distribute, all copyright, patent, trademark, and | |||
| attribution notices from the Source form of the Work, | |||
| excluding those notices that do not pertain to any part of | |||
| the Derivative Works; and | |||
| (d) If the Work includes a "NOTICE" text file as part of its | |||
| distribution, then any Derivative Works that You distribute must | |||
| include a readable copy of the attribution notices contained | |||
| within such NOTICE file, excluding those notices that do not | |||
| pertain to any part of the Derivative Works, in at least one | |||
| of the following places: within a NOTICE text file distributed | |||
| as part of the Derivative Works; within the Source form or | |||
| documentation, if provided along with the Derivative Works; or, | |||
| within a display generated by the Derivative Works, if and | |||
| wherever such third-party notices normally appear. The contents | |||
| of the NOTICE file are for informational purposes only and | |||
| do not modify the License. You may add Your own attribution | |||
| notices within Derivative Works that You distribute, alongside | |||
| or as an addendum to the NOTICE text from the Work, provided | |||
| that such additional attribution notices cannot be construed | |||
| as modifying the License. | |||
| You may add Your own copyright statement to Your modifications and | |||
| may provide additional or different license terms and conditions | |||
| for use, reproduction, or distribution of Your modifications, or | |||
| for any such Derivative Works as a whole, provided Your use, | |||
| reproduction, and distribution of the Work otherwise complies with | |||
| the conditions stated in this License. | |||
| 5. Submission of Contributions. Unless You explicitly state otherwise, | |||
| any Contribution intentionally submitted for inclusion in the Work | |||
| by You to the Licensor shall be under the terms and conditions of | |||
| this License, without any additional terms or conditions. | |||
| Notwithstanding the above, nothing herein shall supersede or modify | |||
| the terms of any separate license agreement you may have executed | |||
| with Licensor regarding such Contributions. | |||
| 6. Trademarks. This License does not grant permission to use the trade | |||
| names, trademarks, service marks, or product names of the Licensor, | |||
| except as required for reasonable and customary use in describing the | |||
| origin of the Work and reproducing the content of the NOTICE file. | |||
| 7. Disclaimer of Warranty. Unless required by applicable law or | |||
| agreed to in writing, Licensor provides the Work (and each | |||
| Contributor provides its Contributions) on an "AS IS" BASIS, | |||
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
| implied, including, without limitation, any warranties or conditions | |||
| of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A | |||
| PARTICULAR PURPOSE. You are solely responsible for determining the | |||
| appropriateness of using or redistributing the Work and assume any | |||
| risks associated with Your exercise of permissions under this License. | |||
| 8. Limitation of Liability. In no event and under no legal theory, | |||
| whether in tort (including negligence), contract, or otherwise, | |||
| unless required by applicable law (such as deliberate and grossly | |||
| negligent acts) or agreed to in writing, shall any Contributor be | |||
| liable to You for damages, including any direct, indirect, special, | |||
| incidental, or consequential damages of any character arising as a | |||
| result of this License or out of the use or inability to use the | |||
| Work (including but not limited to damages for loss of goodwill, | |||
| work stoppage, computer failure or malfunction, or any and all | |||
| other commercial damages or losses), even if such Contributor | |||
| has been advised of the possibility of such damages. | |||
| 9. Accepting Warranty or Additional Liability. While redistributing | |||
| the Work or Derivative Works thereof, You may choose to offer, | |||
| and charge a fee for, acceptance of support, warranty, indemnity, | |||
| or other liability obligations and/or rights consistent with this | |||
| License. However, in accepting such obligations, You may act only | |||
| on Your own behalf and on Your sole responsibility, not on behalf | |||
| of any other Contributor, and only if You agree to indemnify, | |||
| defend, and hold each Contributor harmless for any liability | |||
| incurred by, or claims asserted against, such Contributor by reason | |||
| of your accepting any such warranty or additional liability. | |||
| END OF TERMS AND CONDITIONS | |||
| APPENDIX: How to apply the Apache License to your work. | |||
| To apply the Apache License to your work, attach the following | |||
| boilerplate notice, with the fields enclosed by brackets "[]" | |||
| replaced with your own identifying information. (Don't include | |||
| the brackets!) The text should be enclosed in the appropriate | |||
| comment syntax for the file format. We also recommend that a | |||
| file or class name and description of purpose be included on the | |||
| same "printed page" as the copyright notice for easier | |||
| identification within third-party archives. | |||
| Copyright 2024 LAMDA Beimingwu. All rights reserved. | |||
| Licensed under the Apache License, Version 2.0 (the "License"); | |||
| you may not use this file except in compliance with the License. | |||
| You may obtain a copy of the License at | |||
| http://www.apache.org/licenses/LICENSE-2.0 | |||
| Unless required by applicable law or agreed to in writing, software | |||
| distributed under the License is distributed on an "AS IS" BASIS, | |||
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| See the License for the specific language governing permissions and | |||
| limitations under the License. | |||
| @@ -1,40 +1,54 @@ | |||
| [](https://pypi.org/project/learnware/#files) | |||
| [](https://pypi.org/project/learnware/#files) | |||
| [](https://pypi.org/project/learnware/#history) | |||
| [](https://learnware.readthedocs.io/en/latest/?badge=latest) | |||
| [](LICENSE) | |||
| <div align=center> | |||
| <img src="./docs/_static/img/logo/logo1.png" width="50%"/> | |||
| <br/> | |||
| <br/> | |||
| </div> | |||
``Learnware`` is a model sharing platform, which gives a basic implementation of the learnware paradigm. A learnware is a well-performed trained machine learning model with a specification that enables it to be adequately identified and reused according to the requirements of future users who may know nothing about the learnware in advance. The learnware paradigm can solve entangled problems in the current machine learning paradigm, like continual learning and catastrophic forgetting. It also reduces the resources needed for training a well-performed model.
| <p align="center"> | |||
| <a href="https://pypi.org/project/learnware/#files"> | |||
| <img alt="Python Versions" src="https://img.shields.io/pypi/pyversions/learnware.svg?logo=python&logoColor=white"> | |||
| </a> | |||
| <a href="https://pypi.org/project/learnware/#files"> | |||
| <img alt="Platform" src="https://img.shields.io/badge/platform-linux%20%7C%20windows%20%7C%20macos-lightgrey"> | |||
| </a> | |||
| <a href="https://github.com/Learnware-LAMDA/Learnware/actions"> | |||
| <img alt="Test" src="https://github.com/Learnware-LAMDA/Learnware/actions/workflows/install_learnware_with_source.yaml/badge.svg"> | |||
| </a> | |||
| <a href="https://pypi.org/project/learnware/#history"> | |||
| <img alt="PypI Versions" src="https://img.shields.io/pypi/v/learnware"> | |||
| </a> | |||
| <a href="https://learnware.readthedocs.io/en/latest/?badge=latest"> | |||
| <img alt="Documentation Status" src="https://readthedocs.org/projects/learnware/badge/?version=latest"> | |||
| </a> | |||
| <a href="https://github.com/Learnware-LAMDA/Learnware/blob/main/LICENSE"> | |||
| <img alt="License" src="https://img.shields.io/pypi/l/learnware"> | |||
| </a> | |||
| </p> | |||
| <h3 align="center"> | |||
| <p> | |||
| <b>English</b> | | |||
| <a href="https://github.com/Learnware-LAMDA/Learnware/blob/main/docs/README_zh.md">中文</a> | |||
| </p> | |||
| </h3> | |||
| # Introduction | |||
| ## Framework | |||
| The _learnware_ paradigm, proposed by Professor Zhi-Hua Zhou in 2016 [1, 2], aims to build a vast model platform system, i.e., a _learnware dock system_, which systematically accommodates and organizes models shared by machine learning developers worldwide, and can efficiently identify and assemble existing helpful model(s) to solve future tasks in a unified way. | |||
| <div align="center"> | |||
| <img src="./docs/_static/img/learnware_paradigm.jpg" width="70%"/> | |||
| </div> | |||
| The `learnware` package provides a fundamental implementation of the central concepts and procedures within the learnware paradigm. Its well-structured design ensures high scalability and facilitates the seamless integration of additional features and techniques in the future. | |||
| Machine learning, especially the prevailing big model paradigm, has achieved great success in natural language processing and computer vision applications. However, it still faces challenges such as the requirement of a large amount of labeled training data, difficulty in adapting to changing environments, and catastrophic forgetting when refining trained models incrementally. These big models, while useful in their targeted tasks, often fail to address the above issues and struggle to generalize beyond their specific purposes. | |||
| In addition, the `learnware` package serves as the engine for the [Beimingwu System](https://bmwu.cloud) and can be effectively employed for conducting experiments related to learnware. | |||
| <div align="center"> | |||
| <img src="./docs/_static/img/learnware_market.jpg" width="70%" /> | |||
| </div> | |||
| The learnware paradigm introduces the concept of a well-performed, trained machine learning model with a specification that allows future users, who have no prior knowledge of the learnware, to reuse it based on their requirements. | |||
| [1] Zhi-Hua Zhou. Learnware: on the future of machine learning. _Frontiers of Computer Science_, 2016, 10(4): 589–590 <br/> | |||
| [2] Zhi-Hua Zhou. Machine Learning: Development and Future. _Communications of CCF_, 2017, vol.13, no.1 (2016 CNCC keynote) | |||
| Developers or owners of trained machine learning models can submit their models to a learnware market. If accepted, the market assigns a specification to the model and accommodates it. The learnware market could host thousands or millions of well-performed models from different developers, for various tasks, using diverse data, and optimizing different objectives. | |||
| ## Learnware Paradigm | |||
| Instead of building a model from scratch, users can submit their requirements to the learnware market, which then identifies and deploys helpful learnware(s) based on the specifications. Users can apply the learnware directly, adapt it using their data, or exploit it in other ways to improve their model. This process is more efficient and less expensive than building a model from scratch. | |||
| A learnware consists of a high-performance machine learning model and specifications that characterize the model, i.e., "Learnware = Model + Specification". | |||
| These specifications, encompassing both semantic and statistical aspects, detail the model's functionality and statistical information, making it easier for future users to identify and reuse these models. | |||
| ## Benefits of the Learnware Paradigm | |||
| The need for Learnware arises due to challenges in machine learning, such as the need for extensive training data, advanced techniques, continuous learning, catastrophic forgetting, and data privacy issues. Although there are many efforts focusing on one of these issues separately, they are entangled, and solving one problem may exacerbate others. The learnware paradigm aims to address many of these challenges through a unified framework. Its benefits are listed as follows. | |||
| | Benefit | Description | | |||
| | ---- | ---- | | |||
| @@ -46,227 +60,372 @@ Instead of building a model from scratch, users can submit their requirements to | |||
| | Unplanned tasks | Open to all legal developers, the learnware market can accommodate helpful learnwares for various tasks. | | |||
| | Carbon emission | Assembling small models may offer good-enough performance, reducing interest in training large models and the carbon footprint. | | |||
| # Quick Start | |||
| ## Installation | |||
Learnware is currently hosted on [PyPI](https://pypi.org/). You can easily install ``Learnware`` according to the following steps:
| - For Windows and Linux users: | |||
| The learnware paradigm consists of two distinct stages: | |||
| - `Submitting Stage`: Developers voluntarily submit various learnwares to the learnware market, and the system conducts quality checks and further organization of these learnwares. | |||
| - `Deploying Stage`: When users submit task requirements, the learnware market automatically selects whether to recommend a single learnware or a combination of multiple learnwares and provides efficient deployment methods. Whether it’s a single learnware or a combination of multiple learnwares, the system offers convenient learnware reuse interfaces. | |||
| ```bash | |||
| pip install learnware | |||
| ``` | |||
| <div align="center"> | |||
| <img src="./docs/_static/img/learnware_market.svg" width="70%" /> | |||
| </div> | |||
| - For macOS users: | |||
| ## Learnware Package Design | |||
| ```bash | |||
| conda install -c pytorch faiss | |||
| pip install learnware | |||
| ``` | |||
| <div align="center"> | |||
| <img src="./docs/_static/img/learnware_framework.svg" width="70%"/> | |||
| </div> | |||
| ## Prepare Learnware | |||
| The Learnware Market consists of a wide range of learnwares. A valid learnware is a zipfile which | |||
| is composed of the following four parts. | |||
| At the workflow level, the `learnware` package consists of `Submitting Stage` and `Deploying Stage`. | |||
| At the module level, the `learnware` package is a platform that consists of above components. The components are designed as loose-coupled modules and each component could be used stand-alone. | |||
| - ``__init__.py`` | |||
| # Quick Start | |||
| A python file offering interfaces for your model's fitting, predicting and fine-tuning. | |||
| ## Installation | |||
| - ``rkme.json`` | |||
| Learnware is currently hosted on [PyPI](https://pypi.org/project/learnware/). You can easily install `learnware` by following these steps: | |||
| A json file containing the statistical specification of your data. | |||
| ```bash | |||
| pip install learnware | |||
| ``` | |||
| - ``learnware.yaml`` | |||
| A config file describing your model class name, type of statistical specification(e.g. Reduced Kernel Mean Embedding, ``RKMETableSpecification``), and | |||
| the file name of your statistical specification file. | |||
| In the `learnware` package, besides the base classes, many core functionalities such as "learnware specification generation" and "learnware deployment" rely on the `torch` library. Users have the option to manually install `torch`, or they can directly use the following command to install the `learnware` package: | |||
| - ``environment.yaml`` | |||
| ```bash | |||
| pip install learnware[full] | |||
| ``` | |||
| A Conda environment configuration file for running the model (if the model environment is incompatible, you can rely on this for manual configuration). | |||
| You can generate this file according to the following steps: | |||
| **Note:** However, it's crucial to note that due to the potential complexity of the user's local environment, installing `learnware[full]` does not guarantee that `torch` will successfully invoke `CUDA` in the user's local setting. | |||
| - Create env config for conda: | |||
| ## Prepare Learnware | |||
| ```bash | |||
| conda env export | grep -v "^prefix: " > environment.yaml | |||
| ``` | |||
| - Recover env from config: | |||
| In the `learnware` package, each learnware is encapsulated in a `zip` package, which should contain at least the following four files: | |||
| ```bash | |||
| conda env create -f environment.yaml | |||
| ``` | |||
| - `learnware.yaml`: learnware configuration file. | |||
| - `__init__.py`: methods for using the model. | |||
| - `stat.json`: the statistical specification of the learnware. Its filename can be customized and recorded in learnware.yaml. | |||
| - `environment.yaml` or `requirements.txt`: specifies the environment for the model. | |||
We also demonstrate the detailed format of the learnware zipfile in [DOC link]; please also refer to [Examples](./examples/workflow_by_code/learnware_example) for a concrete learnware zipfile example.
| To facilitate the construction of a learnware, we provide a [Learnware Template](https://www.bmwu.cloud/static/learnware-template.zip) that users can use as a basis for building their own learnware. We've also detailed the format of the learnware `zip` package in [Learnware Preparation](docs/workflows/upload:prepare-learnware). | |||
| ## Learnware Market Workflow | |||
| ## Learnware Package Workflow | |||
Users can start a ``Learnware`` workflow according to the following steps:
| Users can start a `learnware` workflow according to the following steps: | |||
| ### Initialize a Learnware Market | |||
| The ``EasyMarket`` class implements the most basic set of functions in a ``Learnware``. | |||
| You can use the following code snippet to initialize a basic ``Learnware`` named "demo": | |||
| The `EasyMarket` class provides the core functions of a `Learnware Market`. You can initialize a basic `Learnware Market` named "demo" using the code snippet below: | |||
| ```python | |||
| import learnware | |||
| from learnware.market import EasyMarket | |||
| from learnware.market import instantiate_learnware_market | |||
| learnware.init() | |||
| easy_market = EasyMarket(market_id="demo", rebuild=True) | |||
| # instantiate a demo market | |||
| demo_market = instantiate_learnware_market(market_id="demo", name="easy", rebuild=True) | |||
| ``` | |||
### Upload Learnwares
| ### Upload Learnware | |||
| Before uploading your learnware into the ``Learnware``, | |||
| create a semantic specification ``semantic_spec`` by selecting or filling in values for the predefined semantic tags | |||
| to describe the features of your task and model. | |||
| Before uploading your learnware to the `Learnware Market`, you'll need to create a semantic specification, `semantic_spec`. This involves selecting or inputting values for predefined semantic tags to describe the features of your task and model. | |||
| For example, the following code snippet demonstrates the semantic specification | |||
| of a Scikit-Learn type model, which is designed for business scenario and performs classification on tabular data: | |||
| For instance, the following code illustrates the semantic specification for a Scikit-Learn type model. This model is tailored for education scenarios and performs classification tasks on tabular data: | |||
| ```python | |||
| semantic_spec = { | |||
| "Data": {"Values": ["Tabular"], "Type": "Class"}, | |||
| "Task": {"Values": ["Classification"], "Type": "Class"}, | |||
| "Library": {"Values": ["Scikit-learn"], "Type": "Class"}, | |||
| "Scenario": {"Values": ["Business"], "Type": "Tag"}, | |||
| "Description": {"Values": "", "Type": "String"}, | |||
| "Name": {"Values": "demo_learnware", "Type": "String"}, | |||
| } | |||
| from learnware.specification import generate_semantic_spec | |||
| semantic_spec = generate_semantic_spec( | |||
| name="demo_learnware", | |||
| data_type="Table", | |||
| task_type="Classification", | |||
| library_type="Scikit-learn", | |||
| scenarios="Education", | |||
| license="MIT", | |||
| ) | |||
| ``` | |||
| Once the semantic specification is defined, | |||
| you can easily upload your learnware with a single line of code: | |||
| After defining the semantic specification, you can upload your learnware using a single line of code: | |||
| ```python | |||
| easy_market.add_learnware(zip_path, semantic_spec) | |||
| demo_market.add_learnware(zip_path, semantic_spec) | |||
| ``` | |||
| Here, ``zip_path`` is the directory of your learnware zipfile. | |||
| Here, `zip_path` is the directory of your learnware `zip` package. | |||
| ### Semantic Specification Search | |||
| To search for learnwares that fit your task purpose, | |||
| you should also provide a semantic specification ``user_semantic`` that describes the characteristics of your task. | |||
| The ``Learnware`` will perform a first-stage search based on ``user_semantic``, | |||
| identifying potentially helpful learnwares whose models solve tasks similar to your requirements. | |||
| To find learnwares that align with your task's purpose, you'll need to provide a semantic specification, `user_semantic`, that outlines your task's characteristics. The `Learnware Market` will then perform an initial search using `user_semantic`, identifying potentially useful learnwares with models that solve tasks similar to your requirements. | |||
| ```python | |||
| # construct user_info which includes semantic specification for searching learnware | |||
| # construct user_info, which includes a semantic specification | |||
| user_info = BaseUserInfo(id="user", semantic_spec=semantic_spec) | |||
| # search_learnware performs semantic specification search if user_info doesn't include a statistical specification | |||
| _, single_learnware_list, _ = easy_market.search_learnware(user_info) | |||
| # search_learnware: performs semantic specification search when user_info doesn't include a statistical specification | |||
| search_result = easy_market.search_learnware(user_info) | |||
| single_result = search_result.get_single_results() | |||
| # single_learnware_list is the learnware list by semantic specification searching | |||
| print(single_learnware_list) | |||
| # single_result: the List of Tuple[Score, Learnware] returned by semantic specification search | |||
| print(single_result) | |||
| ``` | |||
| ### Statistical Specification Search | |||
| If you choose to provide your own statistical specification file ``stat.json``, | |||
| the ``Learnware`` can perform a more accurate learnware selection from | |||
| the learnwares returned by the previous step. This second-stage search is based on statistical information | |||
| and returns one or more learnwares that are most likely to be helpful for your task. | |||
| If you decide in favor of providing your own statistical specification file, `stat.json`, the `Learnware Market` can further refine the selection of learnwares from the previous step. This second-stage search leverages statistical information to identify one or more learnwares that are most likely to be beneficial for your task. | |||
| For example, the following code is designed to work with Reduced Set Kernel Embedding as a statistical specification: | |||
| For example, the code below executes learnware search when using Reduced Set Kernel Embedding as the statistical specification: | |||
| ```python | |||
| import learnware.specification as specification | |||
| user_spec = specification.RKMETableSpecification() | |||
| # unzip_path: directory for unzipped learnware zipfile | |||
| user_spec.load(os.path.join(unzip_path, "rkme.json")) | |||
| user_info = BaseUserInfo( | |||
| semantic_spec=user_semantic, stat_info={"RKMETableSpecification": user_spec} | |||
| ) | |||
| (sorted_score_list, single_learnware_list, | |||
| mixture_score, mixture_learnware_list) = easy_market.search_learnware(user_info) | |||
| # sorted_score_list is the learnware scores based on MMD distances, sorted in descending order | |||
| print(sorted_score_list) | |||
| # single_learnware_list is the learnwares sorted in descending order based on their scores | |||
| print(single_learnware_list) | |||
| # mixture_learnware_list is the learnwares whose mixture is helpful for your task | |||
| print(mixture_learnware_list) | |||
| # mixture_score is the score of the mixture of learnwares | |||
| print(mixture_score) | |||
| search_result = easy_market.search_learnware(user_info) | |||
| single_result = search_result.get_single_results() | |||
| multiple_result = search_result.get_multiple_results() | |||
| # search_item.score: based on MMD distances, sorted in descending order | |||
| # search_item.learnware.id: id of learnwares, sorted by scores in descending order | |||
| for search_item in single_result: | |||
| print(f"score: {search_item.score}, learnware_id: {search_item.learnware.id}") | |||
| # mixture_item.learnwares: collection of learnwares whose combined use is beneficial | |||
| # mixture_item.score: score assigned to the combined set of learnwares in `mixture_item.learnwares` | |||
| for mixture_item in multiple_result: | |||
| print(f"mixture_score: {mixture_item.score}\n") | |||
| mixture_id = " ".join([learnware.id for learnware in mixture_item.learnwares]) | |||
| print(f"mixture_learnware: {mixture_id}\n") | |||
| ``` | |||
| ### Reuse Learnwares | |||
| Based on the returned list of learnwares ``mixture_learnware_list`` in the previous step, | |||
| you can easily reuse them to make predictions on your own data, instead of training a model from scratch. | |||
| We provide two baseline methods for reusing a given list of learnwares, namely ``JobSelectorReuser`` and ``AveragingReuser``. | |||
| Simply replace ``test_x`` in the code snippet below with your own testing data and start reusing learnwares! | |||
| With the list of learnwares, `mixture_learnware_list`, returned from the previous step, you can readily apply them to make predictions on your own data, bypassing the need to train a model from scratch. We provide two methods for reusing a given list of learnwares: `JobSelectorReuser` and `AveragingReuser`. Substitute `test_x` in the code snippet below with your testing data, and you're all set to reuse learnwares: | |||
| ```python | |||
| from learnware.reuse import JobSelectorReuser, AveragingReuser | |||
| # using jobselector reuser to reuse the searched learnwares to make prediction | |||
| reuse_job_selector = JobSelectorReuser(learnware_list=mixture_learnware_list) | |||
| reuse_job_selector = JobSelectorReuser(learnware_list=mixture_item.learnwares) | |||
| job_selector_predict_y = reuse_job_selector.predict(user_data=test_x) | |||
| # using averaging ensemble reuser to reuse the searched learnwares to make prediction | |||
| reuse_ensemble = AveragingReuser(learnware_list=mixture_learnware_list) | |||
| reuse_ensemble = AveragingReuser(learnware_list=mixture_item.learnwares) | |||
| ensemble_predict_y = reuse_ensemble.predict(user_data=test_x) | |||
| ``` | |||
| ## Auto Workflow Example | |||
| We also provide two methods when the user has labeled data for reusing a given list of learnwares: `EnsemblePruningReuser` and `FeatureAugmentReuser`. Substitute `test_x` in the code snippet below with your testing data, and substitute `train_X, train_y` with your training labeled data, and you're all set to reuse learnwares: | |||
| ```python | |||
| from learnware.reuse import EnsemblePruningReuser, FeatureAugmentReuser | |||
| # Use ensemble pruning reuser to reuse the searched learnwares to make prediction | |||
| reuse_ensemble = EnsemblePruningReuser(learnware_list=mixture_item.learnwares, mode="classification") | |||
| reuse_ensemble.fit(train_X, train_y) | |||
| ensemble_pruning_predict_y = reuse_ensemble.predict(user_data=data_X) | |||
| # Use feature augment reuser to reuse the searched learnwares to make prediction | |||
| reuse_feature_augment = FeatureAugmentReuser(learnware_list=mixture_item.learnwares, mode="classification") | |||
| reuse_feature_augment.fit(train_X, train_y) | |||
| feature_augment_predict_y = reuse_feature_augment.predict(user_data=data_X) | |||
| ``` | |||
| ``Learnware`` also provides an auto workflow example, which includes preparing learnwares, uploading and deleting learnwares from markets, and searching learnwares with semantic specifications and statistical specifications. The users can run ``examples/workflow_by_code.py`` to try the basic workflow of ``Learnware``. | |||
| ### Auto Workflow Example | |||
| The `learnware` package also offers automated workflow examples. This includes preparing learnwares, uploading and deleting learnwares from the market, and searching for learnwares using both semantic and statistical specifications. To experience the basic workflow of the `learnware` package, the users can run `test/test_workflow/test_workflow.py` to try the basic workflow of `learnware`. | |||
| # Experiments and Examples | |||
| ## Environment | |||
| For all experiments, we used a single linux server. Details on the specifications are listed in the table below. All processors were used for training and evaluating. | |||
| For all experiments, we used a single Linux server. Details on the specifications are listed in the table below. All processors were used for training and evaluating. | |||
| <div align=center> | |||
| | System | GPU | CPU | | |||
| |----------------------|--------------------|--------------------------| | |||
| | Ubuntu 20.04.4 LTS | Nvidia Tesla V100S | Intel(R) Xeon(R) Gold 6240R | | |||
| </div> | |||
| ## Tabular Scenario Experiments | |||
| ### Datasets | |||
| Our study involved three public datasets in the sales forecasting field: [Predict Future Sales (PFS)](https://www.kaggle.com/c/competitive-data-science-predict-future-sales/data), [M5 Forecasting (M5)](https://www.kaggle.com/competitions/m5-forecasting-accuracy/data), and [Corporacion](https://www.kaggle.com/competitions/favorita-grocery-sales-forecasting/data). | |||
| We applied various pre-processing methods to these datasets to enhance the richness of the data. After pre-processing, we first divided each dataset by store and then split the data for each store into training and test sets. Specifically: | |||
| - For PFS, the test set consisted of the last month's data from each store. | |||
| - For M5, we designated the final 28 days' data from each store as the test set. | |||
| - For Corporacion, the test set was composed of the last 16 days of data from each store. | |||
| In the submitting stage, the Corporacion dataset's 55 stores are regarded as 165 uploaders, each employing one of three different feature engineering methods. For the PFS dataset, 100 uploaders are established, each using one of two feature engineering approaches. These uploaders then utilize their respective stores' training data to develop LightGBM models. As a result, the learnware market comprises 265 learnwares, derived from five types of feature spaces and two types of label spaces. | |||
| Based on the specific design of user tasks, our experiments were primarily categorized into two types: | |||
| - **homogeneous experiments** are designed to evaluate performance when users can reuse learnwares in the learnware market that have the same feature space as their tasks (homogeneous learnwares). This contributes to showing the effectiveness of using learnwares that align closely with the user's specific requirements. | |||
| - **heterogeneous experiments** aim to evaluate the performance of identifying and reusing helpful heterogeneous learnwares in situations where no available learnwares match the feature space of the user's task. This helps to highlight the potential of learnwares for applications beyond their original purpose. | |||
| ### Homogeneous Tabular Scenario | |||
| For homogeneous experiments, the 55 stores in the Corporacion dataset act as 55 users, each applying one feature engineering method, and using the test data from their respective store as user data. These users can then search for homogeneous learnwares in the market with the same feature spaces as their tasks. | |||
| The Mean Squared Error (MSE) of search and reuse across all users is presented in the table below: | |||
| <div align=center> | |||
| | Setting | MSE | | |||
| |-----------------------------------|--------| | |||
| | Mean in Market (Single) | 0.331 | | |||
| | Best in Market (Single) | 0.151 | | |||
| | Top-1 Reuse (Single) | 0.280 | | |||
| | Job Selector Reuse (Multiple) | 0.274 | | |||
| | Average Ensemble Reuse (Multiple) | 0.267 | | |||
| </div> | |||
| | System | GPU | CPU | | |||
| | ---- | ---- | ---- | | |||
| | Ubuntu 20.04.4 LTS | Nvidia Tesla V100S | Intel(R) Xeon(R) Gold 6240R | | |||
| When users have both test data and limited training data derived from their original data, reusing single or multiple searched learnwares from the market can often yield better results than training models from scratch on limited training data. We present the change curves in MSE for the user's self-trained model, as well as for the Feature Augmentation single learnware reuse method and the Ensemble Pruning multiple learnware reuse method. These curves display their performance on the user's test data as the amount of labeled training data increases. The average results across 55 users are depicted in the figure below: | |||
| <div align=center> | |||
| <img src="./docs/_static/img/table_homo_labeled.png" width="50%"/> | |||
| </div> | |||
| From the figure, it's evident that when users have limited training data, the performance of reusing single/multiple table learnwares is superior to that of the user's own model. This emphasizes the benefit of learnware reuse in significantly reducing the need for extensive training data and achieving enhanced results when available user training data is limited. | |||
| ## Datasets | |||
| ### Heterogeneous Tabular Scenario | |||
| We designed experiments on three publicly available datasets, namely Prediction Future Sales (PFS), M5 Forecasting (M5) and CIFAR 10. For the two sales forecasting data sets of PFS and M5, we divide the user data according to different stores, and train the Ridge model and LightGBM model on the corresponding data respectively. For the CIFAR10 image classification task, we first randomly pick 6 to 10 categories, and randomly select 800 to 2000 samples from each category from the categories corresponding to the training set, constituting a total of 50 different uploaders. For test users, we first randomly pick 3 to 6 categories, and randomly select 150 to 350 samples from each category from the corresponding categories from the test set, constituting a total of 20 different users. | |||
| In heterogeneous experiments, the learnware market would recommend helpful heterogeneous learnwares with feature spaces different from those of the user tasks. Based on whether there are learnwares in the market that handle tasks similar to the user's task, the experiments can be further subdivided into the following two types: | |||
| We tested the efficiency of the specification generation and the accuracy of the search and reuse model respectively. The evaluation index on PFS and M5 data is RMSE, and the evaluation index on the CIFAR10 classification task is classification accuracy. | |||
| #### Cross Feature Space Experiments | |||
| ## Results | |||
| We designate the 41 stores in the PFS dataset as users, creating their user data with an alternative feature engineering approach that varies from the methods employed by learnwares in the market. Consequently, while the market's learnwares from the PFS dataset undertake tasks very similar to our users, the feature spaces do not match exactly. In this experimental configuration, we tested various heterogeneous learnware reuse methods (without using user's labeled data) and compared them to the user's self-trained model based on a small amount of training data. The average MSE performance across 41 users is as follows: | |||
| <div align=center> | |||
| The time-consuming specification generation is shown in the table below: | |||
| | Setting | MSE | | |||
| |-----------------------------------|--------| | |||
| | Mean in Market (Single) | 1.459 | | |||
| | Best in Market (Single) | 1.226 | | |||
| | Top-1 Reuse (Single) | 1.407 | | |||
| | Average Ensemble Reuse (Multiple) | 1.312 | | |||
| | User model with 50 labeled data | 1.267 | | |||
| | Dataset | Data Dimensions | Specification Generation Time (s) | | |||
| | ---- | ---- | ---- | | |||
| | PFS | 8714274*31 | < 1.5 | | |||
| | M5 | 46027957*82 | 9~15 | | |||
| | CIFAR 10 | 9000\*3\*32\*32 | 7~10 | | |||
| </div> | |||
| From the results, it is noticeable that the learnware market still performs quite well even when users lack labeled data, provided it includes learnwares addressing tasks that are similar but not identical to the user's. In these instances, the market's effectiveness can match or even rival scenarios where users have access to a limited quantity of labeled data. | |||
| #### Cross Task Experiments | |||
| Here we have chosen the 10 stores from the M5 dataset to act as users. Although the broad task of sales forecasting is similar to the tasks addressed by the learnwares in the market, there are no learnwares available that directly cater to the M5 sales forecasting requirements. All learnwares show variations in both feature and label spaces compared to the tasks of M5 users. We present the change curves in RMSE for the user's self-trained model and several learnware reuse methods. These curves display their performance on the user's test data as the amount of labeled training data increases. The average results across 10 users are depicted in the figure below: | |||
| <div align=center> | |||
| <img src="./docs/_static/img/table_hetero_labeled.png" width="50%"/> | |||
| </div> | |||
| We can observe that heterogeneous learnwares are beneficial when there's a limited amount of the user's labeled training data available, aiding in better alignment with the user's specific task. This underscores the potential of learnwares to be applied to tasks beyond their original purpose. | |||
| ## Image Scenario Experiment | |||
| For the CIFAR-10 dataset, we sampled the training set unevenly by category and constructed unbalanced training datasets for the 50 learnwares that contained only some of the categories. This makes it unlikely that there exists any learnware in the learnware market that can accurately handle all categories of data; only the learnware whose training data is closest to the data distribution of the target task is likely to perform well on the target task. Specifically, the probability of each category being sampled obeys a random multinomial distribution, with a non-zero probability of sampling on only 4 categories, and the sampling ratio is 0.4: 0.4: 0.1: 0.1. Ultimately, the training set for each learnware contains 12,000 samples covering the data of 4 categories in CIFAR-10. | |||
| We constructed 50 target tasks using data from the test set of CIFAR-10. Similar to constructing the training set for the learnwares, to allow for some variation between tasks, we sampled the test set unevenly. Specifically, the probability of each category being sampled obeys a random multinomial distribution, with non-zero sampling probability on 6 categories, and the sampling ratio is 0.3: 0.3: 0.1: 0.1: 0.1: 0.1. Ultimately, each target task contains 3000 samples covering the data of 6 categories in CIFAR-10. | |||
| With this experimental setup, we evaluated the performance of RKME Image using 1 - Accuracy as the loss. | |||
| <div align=center> | |||
| | Setting | Accuracy | | |||
| |-----------------------------------|----------| | |||
| | Mean in Market (Single) | 0.655 | | |||
| | Best in Market (Single) | 0.304 | | |||
| | Top-1 Reuse (Single) | 0.406 | | |||
| | Job Selector Reuse (Multiple) | 0.406 | | |||
| | Average Ensemble Reuse (Multiple) | 0.310 | | |||
| </div> | |||
| In some specific settings, the user will have a small number of labelled samples. In such settings, learning the weight of selected learnwares on a limited number of labelled samples can result in better performance than training directly on a limited number of labelled samples. | |||
| <div align=center> | |||
| <img src="./docs/_static/img/image_labeled.svg" width="50%"/> | |||
| </div> | |||
| ## Text Scenario Experiment | |||
| The accuracy of search and reuse is shown in the table below: | |||
| ### Datasets | |||
| | Dataset | Top-1 Performance | Job Selector Reuse | Average Ensemble Reuse | | |||
| | ---- | ---- | ---- | ---- | | |||
| | PFS | 1.955 +/- 2.866 | 2.175 +/- 2.847 | 1.950 +/- 2.888 | | |||
| | M5 | 2.066 +/- 0.424 | 2.116 +/- 0.472 | 2.512 +/- 0.573 | | |||
| | CIFAR 10 | 0.619 +/- 0.138 | 0.585 +/- 0.056 | 0.715 +/- 0.075 | | |||
| We conducted experiments on the widely used text benchmark dataset: [20-newsgroup](http://qwone.com/~jason/20Newsgroups/). 20-newsgroup is a renowned text classification benchmark with a hierarchical structure, featuring 5 superclasses {comp, rec, sci, talk, misc}. | |||
| In the submitting stage, we enumerated all combinations of three superclasses from the five available, randomly sampling 50% of each combination from the training set to create datasets for 50 uploaders. | |||
| In the deploying stage, we considered all combinations of two superclasses out of the five, selecting all data for each combination from the testing set as a test dataset for one user. This resulted in 10 users. The user's own training data was generated using the same sampling procedure as the user test data, despite originating from the training dataset. | |||
| Model training comprised two parts: the first part involved training a tfidf feature extractor, and the second part used the extracted text feature vectors to train a naive Bayes classifier. | |||
| Our experiments comprise two components: | |||
| - **unlabeled_text_example** is designed to evaluate performance when users possess only testing data, searching and reusing learnware available in the market. | |||
| - **labeled_text_example** aims to assess performance when users have both testing and limited training data, searching and reusing learnware directly from the market instead of training a model from scratch. This helps determine the amount of training data saved for the user. | |||
| ### Results | |||
| - **unlabeled_text_example**: | |||
| The table below presents the mean accuracy of search and reuse across all users: | |||
| <div align=center> | |||
| | Setting | Accuracy | | |||
| |-----------------------------------|----------| | |||
| | Mean in Market (Single) | 0.507 | | |||
| | Best in Market (Single) | 0.859 | | |||
| | Top-1 Reuse (Single) | 0.846 | | |||
| | Job Selector Reuse (Multiple) | 0.845 | | |||
| | Average Ensemble Reuse (Multiple) | 0.862 | | |||
| </div> | |||
| - **labeled_text_example**: | |||
| We present the change curves in classification error rates for both the user's self-trained model and the multiple learnware reuse (EnsemblePrune), showcasing their performance on the user's test data as the user's training data increases. The average results across 10 users are depicted below: | |||
| <div align=center> | |||
| <img src="./docs/_static/img/text_labeled.svg" width="50%"/> | |||
| </div> | |||
| From the figure above, it is evident that when the user's own training data is limited, the performance of multiple learnware reuse surpasses that of the user's own model. As the user's training data grows, it is expected that the user's model will eventually outperform the learnware reuse. This underscores the value of reusing learnware to significantly conserve training data and achieve superior performance when user training data is limited. | |||
| # Citation | |||
| If you use our project in your research or work, we kindly request that you cite the following papers: | |||
| ```bibtex | |||
| @article{zhou2022learnware, | |||
| author = {Zhou, Zhi-Hua and Tan, Zhi-Hao}, | |||
| title = {Learnware: Small Models Do Big}, | |||
| journal = {SCIENCE CHINA Information Sciences}, | |||
| year = {2024}, | |||
| volume = {67}, | |||
| number = {1}, | |||
| pages = {1--12}, | |||
| } | |||
| ``` | |||
| Please acknowledge the use of our project by citing these papers in your work. Thank you for your support! | |||
| # About | |||
| ## Contributor | |||
| ## Contributors | |||
| We appreciate all contributions and thank all the contributors! | |||
| TODO: Here paste the github API after publishing: | |||
| [Pic after publish]() | |||
| <div align=center> | |||
| <img src="https://github.com/Learnware-LAMDA/Learnware/graphs/contributors"/> | |||
| </div> | |||
| ## About us | |||
| ## About Us | |||
| Visit [LAMDA's official website](http://www.lamda.nju.edu.cn/MainPage.ashx). | |||
| The Learnware repository is developed and maintained by the LAMDA Beimingwu R&D Team. | |||
| To learn more about our team, please visit the [Team Overview](https://docs.bmwu.cloud/en/about-us.html). | |||
| @@ -2,7 +2,10 @@ | |||
| About Us | |||
| ================ | |||
| We thank all the contributors for the development of learnware package: | |||
| Contributors | |||
| ================ | |||
| .. image:: https://github.com/Learnware-LAMDA/Learnware/graphs/contributors | |||
| :align: center | |||
| In the LAMDA Group, many people also participate in the discussions, learnware package design, development, and so on. | |||
| For more details about us, please refer to `LAMDA Group <https://www.lamda.nju.edu.cn/>`_. | |||
| @@ -3,6 +3,39 @@ | |||
| For Developer | |||
| ================ | |||
| Install with Dev Mode | |||
| ======================= | |||
| As a developer, you often want to make changes to ``Learnware Market`` and hope they would be reflected directly in your environment without reinstalling it. You can install ``Learnware Market`` in editable mode with the following command. | |||
| .. code-block:: bash | |||
| $ git clone https://github.com/Learnware-LAMDA/Learnware.git && cd Learnware | |||
| $ pip install -e .[dev] | |||
| .. note:: | |||
| It's recommended to use anaconda/miniconda to setup the environment. Also you can run ``pip install -e .[full, dev]`` to install ``torch`` automatically. | |||
| Commit Format | |||
| ============== | |||
| Please submit in the following manner: Submit using the format ``prefix`` + ``space`` + ``suffix``. | |||
| There are four choices for the prefix, and they can be combined using commas: | |||
| - [ENH]: Represents enhancement, indicating the addition of new features. | |||
| - [DOC]: Indicates modifications to the documentation. | |||
| - [FIX]: Represents bug fixes and typo corrections. | |||
| - [MNT]: Indicates other minor modifications, such as version updates. | |||
| The suffix specifies the specific nature of the modification, with the initial letter capitalized. | |||
| Examples: The following are all valid: | |||
| - [DOC] Fix the document | |||
| - [FIX, ENH] Fix the bug and add some feature | |||
| Docstring | |||
| ============ | |||
| Please use the `Numpydoc Style <https://stackoverflow.com/a/24385103>`_. | |||
| @@ -15,7 +48,7 @@ Continuous Integration | |||
| Continuous Integration (CI) tools help you stick to the quality standards by running tests every time you push a new commit and reporting the results to a pull request. | |||
| ``Learnware Market`` will check the following tests when you pull a request: | |||
| 1. We will check your code style with pylint; you can fix your code style by the following commands: | |||
| 1. We will check your code length, you can fix your code style by the following commands: | |||
| .. code-block:: bash | |||
| @@ -30,22 +63,34 @@ Continuous Integration (CI) tools help you stick to the quality standards by run | |||
| pip install pytest | |||
| python -m pytest tests | |||
| Development Guidance | |||
| ================================= | |||
| ``pre-commit`` Config | |||
| ======================== | |||
| As a developer, you often want to make changes to ``Learnware Market`` and hope they would be reflected directly in your environment without reinstalling it. You can install ``Learnware Market`` in editable mode with the following command. | |||
| The ``learnware`` package support config ``pre-commit``. Run the following command to install ``pre-commit``: | |||
| - For Windows and Linux users: | |||
| .. code-block:: bash | |||
| pip install pre-commit | |||
| Run the following command in the root directory of ``Learnware`` Project to enable ``pre-commit``: | |||
| .. code-block:: bash | |||
| pre-commit install | |||
| .. code-block:: bash | |||
| $ git clone https://git.nju.edu.cn/learnware/learnware-market.git && cd learnware-market | |||
| $ python setup.py install | |||
| ``isort`` Config | |||
| =================== | |||
| The codes in the ``learnware`` package will be processed by ``isort`` (``examples`` and ``tests`` are excluded). Run the following command to install ``isort``: | |||
| .. code-block:: bash | |||
| pip install isort | |||
| Run the following command in the root directory of ``Learnware`` Project to run ``isort``: | |||
| .. code-block:: bash | |||
| - For macOS users: | |||
| isort learnware --reverse-relative | |||
| .. code-block:: bash | |||
| $ conda install -c pytorch faiss | |||
| $ git clone https://git.nju.edu.cn/learnware/learnware-market.git && cd learnware-market | |||
| $ python setup.py install | |||
| @@ -4,7 +4,7 @@ | |||
| Learnware & Reuser | |||
| ========================================== | |||
| ``Learnware`` is the most basic concept in the ``learnware paradigm``. In this section, we will introduce the concept and design of ``learnware`` and its extension for ``Hetero Reuse``. Then we will introduce the ``Reuse Methods``, which applies one or several ``learnware``\ s to solve the user's task. | |||
| ``Learnware`` is the most basic concept in the ``learnware paradigm``. In this section, we will introduce the concept and design of ``Learnware`` and its extension for ``Hetero Reuse``. Then we will introduce the ``Reuse Methods``, which applies one or several ``Learnware``\ s to solve the user's task. | |||
| Concepts | |||
| =================== | |||
| @@ -16,7 +16,7 @@ In our implementation, the class ``Learnware`` has 3 important member variables: | |||
| - ``model``: The model in the learnware, can be a ``BaseModel`` or a dict including model name and path. When it is a dict, the function ``Learnware.instantiate_model`` is used to transform it to a ``BaseModel``. The function ``Learnware.predict`` use the model to predict for an input ``X``. See more in `COMPONENTS: Model <./model.html>`_. | |||
| - ``specification``: The specification including the semantic specification and the statistic specification. | |||
| Learnware for Hetero Reuse (Feature Align + Hetero Map Learnware) | |||
| Learnware for Hetero Reuse | |||
| ======================================================================= | |||
| In the Hetero Market (see `COMPONENTS: Hetero Market <./market.html#hetero-market>`_ for details), ``HeteroSearcher`` identifies and recommends helpful learnwares among all learnwares in the market, | |||
| @@ -107,7 +107,7 @@ specifies the ensemble method(default is set to ``mean``). | |||
| Reuse Learnware with Labeled Data | |||
| ---------------------------------- | |||
| When users have a small amount of labeled data available, ``learnware`` package provides two methods: ``EnsemblePruningReuser`` and ``FeatureAugmentReuser`` to help reuse learnwares. | |||
| When users have a small amount of labeled data available, the ``learnware`` package provides two methods: ``EnsemblePruningReuser`` and ``FeatureAugmentReuser`` to help reuse learnwares. | |||
| They are both initialized with a list of ``Learnware`` objects ``learnware_list``, and have different implementations of ``fit`` and ``predict`` methods. | |||
| EnsemblePruningReuser | |||
| @@ -4,20 +4,20 @@ | |||
| Learnware Market | |||
| ================================ | |||
| The ``learnware market`` receives high-performance machine learning models from developers, incorporates them into the system, and provides services to users by identifying and reusing learnware to help users solve current tasks. Developers voluntarily submit various learnwares to the learnware market, and the market conducts quality checks and further organization of these learnwares. When users submit task requirements, the learnware market automatically selects whether to recommend a single learnware or a combination of multiple learnwares. | |||
| The ``Learnware Market`` receives high-performance machine learning models from developers, incorporates them into the system, and provides services to users by identifying and reusing learnware to help users solve current tasks. Developers voluntarily submit various learnwares to the learnware market, and the market conducts quality checks and further organization of these learnwares. When users submit task requirements, the learnware market automatically selects whether to recommend a single learnware or a combination of multiple learnwares. | |||
| The ``learnware market`` will receive various kinds of learnwares, and learnwares from different feature/label spaces form numerous islands of specifications. All these islands together constitute the ``specification world`` in the learnware market. The market should discover and establish connections between different islands, and then merge them into a unified specification world. This further organization of learnwares support search learnwares among all learnwares, not just among learnwares which has the same feature space and label space with the user's task requirements. | |||
| The ``Learnware Market`` will receive various kinds of learnwares, and learnwares from different feature/label spaces form numerous islands of specifications. All these islands together constitute the ``specification world`` in the learnware market. The market should discover and establish connections between different islands, and then merge them into a unified specification world. This further organization of learnwares supports searching among all learnwares, not just among learnwares which have the same feature space and label space as the user's task requirements. | |||
| Framework | |||
| ====================================== | |||
| The ``learnware market`` is combined with a ``organizer``, a ``searcher``, and a list of ``checker``\ s. | |||
| The ``Learnware Market`` is composed of an ``organizer``, a ``searcher``, and a list of ``checker``\ s. | |||
| The ``organizer`` can store and organize learnwares in the market. It supports ``add``, ``delete``, and ``update`` operations for learnwares. It also provides the interface for ``searcher`` to search learnwares based on user requirement. | |||
| The ``searcher`` can search learnwares based on user requirement. The implementation of ``searcher`` is dependent on the concrete implementation and interface for ``organizer``, where usually an ``organizer`` can be compatible with multiple different ``searcher``\ s. | |||
| The ``checker`` is used for checking the learnware in some standards. It should check the utility of a learnware and is supposed to return the status and a message related to the learnware's check result. Only the learnwares who passed the ``checker`` could be able to be stored and added into the ``learnware market``. | |||
| The ``checker`` is used for checking the learnware against some standards. It should check the utility of a learnware and is supposed to return the status and a message related to the learnware's check result. Only the learnwares that pass the ``checker`` can be stored and added into the ``Learnware Market``. | |||
| @@ -37,6 +37,9 @@ Semantic Specification | |||
| The semantic specification consists of a "dict" structure that includes keywords "Data", "Task", "Library", "Scenario", "License", "Description", and "Name". | |||
| In the case of table learnwares, users should additionally provide descriptions for each feature dimension and output dimension through the "Input" and "Output" keywords. | |||
| - If "data_type" is "Table", you need to specify the semantics of each dimension of the model's input data to make the uploaded learnware suitable for tasks with heterogeneous feature spaces. | |||
| - If "task_type" is "Classification", you need to provide the semantics of model output labels (prediction labels start from 0), making the uploaded learnware suitable for classification tasks with heterogeneous output spaces. | |||
| - If "task_type" is "Regression", you need to specify the semantics of each dimension of the model output, making the uploaded learnware suitable for regression tasks with heterogeneous output spaces. | |||
| Regular Specification | |||
| ====================================== | |||
| @@ -131,7 +134,7 @@ with particular learnware market implementations. | |||
| - Learnware searchers perform helpful learnware recommendations among all table learnwares in the market, leveraging the ``system specification``\ s generated for users. | |||
| ``learnware`` package now includes a type of ``system specification``, named ``HeteroMapTableSpecification``, made especially for the ``Hetero Market`` implementation. | |||
| The ``learnware`` package now includes a type of ``system specification``, named ``HeteroMapTableSpecification``, made especially for the ``Hetero Market`` implementation. | |||
| This specification is automatically given to all table learnwares when they are added to the ``Hetero Market``. | |||
| It is also set up to be updated periodically, ensuring it remains accurate as the learnware market evolves and builds more precise specification worlds. | |||
| Please refer to `COMPONENTS: Hetero Market <../components/market.html#hetero-market>`_ for implementation details. | |||
| @@ -100,12 +100,12 @@ html_logo = "_static/img/logo/logo1.png" | |||
| # These folders are copied to the documentation's HTML output | |||
| html_static_path = ['_static'] | |||
| html_static_path = ["_static"] | |||
| # These paths are either relative to html_static_path | |||
| # or fully qualified paths (eg. https://...) | |||
| html_css_files = [ | |||
| 'css/custom_style.css', | |||
| "css/custom_style.css", | |||
| ] | |||
| # -- Options for HTMLHelp output ------------------------------------------ | |||
| @@ -7,7 +7,9 @@ | |||
| ``Learnware`` Documentation | |||
| ============================================================ | |||
| ``Learnware`` is a model sharing platform, which give a basic implementation of the learnware paradigm. A learnware is a well-performed trained machine learning model with a specification that enables it to be adequately identified to reuse according to the requirement of future users who may know nothing about the learnware in advance. The learnware paradigm can solve entangled problems in the current machine learning paradigm, like continual learning and catastrophic forgetting. It also reduces resources for training a well-performed model. | |||
| The ``learnware`` package provides a fundamental implementation of the central concepts and procedures for the learnware paradigm. | |||
| A learnware is a well-performed trained machine learning model with a specification that enables it to be adequately identified to reuse according to the requirement of future users who may know nothing about the learnware in advance. | |||
| The learnware paradigm is a new paradigm aimed at enabling users to reuse existing well-trained models to solve their AI tasks instead of starting from scratch. | |||
| .. _user_guide: | |||
| @@ -58,7 +60,7 @@ Document Structure | |||
| :caption: REFERENCES: | |||
| API <references/api.rst> | |||
| BeimingWu System <references/beiming.rst> | |||
| Beimingwu System <references/beimingwu.rst> | |||
| FAQ <references/FAQ.rst> | |||
| .. toctree:: | |||
| @@ -1,5 +1,12 @@ | |||
| .. _faq: | |||
| ==================== | |||
| FAQ | |||
| Learnware FAQ | |||
| ==================== | |||
| Learnware Frequently Asked Questions | |||
| ===================================== | |||
| .. contents:: | |||
| :depth: 1 | |||
| :local: | |||
| :backlinks: none | |||
| @@ -3,7 +3,7 @@ | |||
| API Reference | |||
| ================================ | |||
| Here you can find all ``learnware`` interfaces. | |||
| Here you can find high-level ``Learnware`` interfaces. | |||
| Market | |||
| ==================== | |||
| @@ -13,23 +13,96 @@ Market | |||
| .. autoclass:: learnware.market.BaseUserInfo | |||
| :members: | |||
| Learnware & Reuser | |||
| Organizer | |||
| ------------------ | |||
| .. autoclass:: learnware.market.BaseOrganizer | |||
| :members: | |||
| .. autoclass:: learnware.market.EasyOrganizer | |||
| :members: | |||
| .. autoclass:: learnware.market.HeteroOrganizer | |||
| :members: | |||
| Searcher | |||
| ------------------ | |||
| .. autoclass:: learnware.market.BaseSearcher | |||
| :members: | |||
| .. autoclass:: learnware.market.EasySearcher | |||
| :members: | |||
| .. autoclass:: learnware.market.EasyExactSemanticSearcher | |||
| :members: | |||
| .. autoclass:: learnware.market.EasyFuzzSemanticSearcher | |||
| :members: | |||
| .. autoclass:: learnware.market.EasyStatSearcher | |||
| :members: | |||
| .. autoclass:: learnware.market.HeteroSearcher | |||
| :members: | |||
| Checker | |||
| ------------------ | |||
| .. autoclass:: learnware.market.BaseChecker | |||
| :members: | |||
| .. autoclass:: learnware.market.EasyChecker | |||
| :members: | |||
| .. autoclass:: learnware.market.EasySemanticChecker | |||
| :members: | |||
| .. autoclass:: learnware.market.EasyStatChecker | |||
| :members: | |||
| Learnware | |||
| ==================== | |||
| .. autoclass:: learnware.learnware.Learnware | |||
| :members: | |||
| Reuser | |||
| ==================== | |||
| .. autoclass:: learnware.reuse.BaseReuser | |||
| :members: | |||
| Data Independent Reuser | |||
| ------------------------- | |||
| .. autoclass:: learnware.reuse.JobSelectorReuser | |||
| :members: | |||
| .. autoclass:: learnware.reuse.AveragingReuser | |||
| :members: | |||
| Data Dependent Reuser | |||
| ------------------------- | |||
| .. autoclass:: learnware.reuse.EnsemblePruningReuser | |||
| :members: | |||
| .. autoclass:: learnware.reuse.FeatureAugmentReuser | |||
| :members: | |||
| Aligned Learnware | |||
| -------------------- | |||
| .. autoclass:: learnware.reuse.AlignLearnware | |||
| :members: | |||
| .. autoclass:: learnware.reuse.FeatureAlignLearnware | |||
| :members: | |||
| .. autoclass:: learnware.reuse.HeteroMapAlignLearnware | |||
| :members: | |||
| Specification | |||
| ==================== | |||
| @@ -39,6 +112,12 @@ Specification | |||
| .. autoclass:: learnware.specification.BaseStatSpecification | |||
| :members: | |||
| Regular Specification | |||
| -------------------------- | |||
| .. autoclass:: learnware.specification.RegularStatSpecification | |||
| :members: | |||
| .. autoclass:: learnware.specification.RKMETableSpecification | |||
| :members: | |||
| @@ -48,8 +127,32 @@ Specification | |||
| .. autoclass:: learnware.specification.RKMETextSpecification | |||
| :members: | |||
| System Specification | |||
| -------------------------- | |||
| .. autoclass:: learnware.specification.HeteroMapTableSpecification | |||
| :members: | |||
| Model | |||
| ==================== | |||
| Base Model | |||
| -------------- | |||
| .. autoclass:: learnware.model.BaseModel | |||
| :members: | |||
| Container | |||
| ------------- | |||
| .. autoclass:: learnware.client.ModelContainer | |||
| :members: | |||
| .. autoclass:: learnware.client.ModelCondaContainer | |||
| :members: | |||
| .. autoclass:: learnware.client.ModelDockerContainer | |||
| :members: | |||
| .. autoclass:: learnware.client.LearnwaresContainer | |||
| :members: | |||
| @@ -1,6 +0,0 @@ | |||
| .. _beiming: | |||
| ==================== | |||
| BeimingWu System | |||
| ==================== | |||
| `Clik here for beiming system <https://bmwu.cloud/>`_ | |||
| @@ -0,0 +1,30 @@ | |||
| .. _beimingwu: | |||
| ==================== | |||
| Beimingwu System | |||
| ==================== | |||
| `Beimingwu System <https://bmwu.cloud/>`_ is based on the learnware paradigm, which systematically implements the entire process of learnware from submission to deployment, helping users effectively search and reuse learnwares without the need to build machine learning models from scratch. | |||
| The ``learnware`` package is the cornerstone of the Beimingwu system, functioning as its core engine. | |||
| It offers a comprehensive suite of central APIs that encompass a wide range of functionalities, including the submission, verification, organization, search, and deployment of learnware. | |||
| This integration ensures a streamlined and efficient process, facilitating seamless interactions within the system. | |||
| Core Features in the Beimingwu System | |||
| ======================================= | |||
| Beimingwu systematically implements the core process of the learnware paradigm for the first time: | |||
| - ``Submitting Stage``: The system includes multiple detection mechanisms to ensure the quality of uploaded learnwares. Additionally, the system trains a heterogeneous engine based on existing learnware specifications in the system to merge different specification islands and assign new specifications to learnwares. As more learnwares are submitted, the heterogeneous engine will continue to update, achieving continuous iteration of learnware specifications and building a more precise specification world. | |||
| - ``Deploying Stage``: After users upload task requirements, the system automatically selects whether to recommend a single learnware or multiple learnware combinations and provides efficient deployment methods. Whether it's a single learnware or a combination of multiple learnwares, the system offers convenient learnware reuse tools. | |||
| In addition, the Beimingwu system also has the following features: | |||
| - ``Learnware Specification Generation``: The Beimingwu system provides specification generation interfaces in the learnware package, supporting various data types (tables, images, and text) for efficient local generation. | |||
| - ``Learnware Quality Inspection``: The Beimingwu system includes multiple detection mechanisms to ensure the quality of each learnware in the system. | |||
| - ``Diverse Learnware Search``: The Beimingwu system supports both semantic specifications and statistical specifications searches, covering data types such as tables, images, and text. In addition, for table-based tasks, the system also supports the search for heterogeneous table learnwares. | |||
| - ``Local Learnware Deployment``: The Beimingwu system provides interfaces for learnware deployment and learnware reuse in the learnware package, facilitating users' convenient and secure learnware deployment. | |||
| - ``Data Privacy Protection``: The Beimingwu system operations, including learnware upload, search, and deployment, do not require users to upload local data. All relevant statistical specifications are generated locally by users, ensuring data privacy. | |||
| - ``Fully Open Source``: The Beimingwu system's source code is completely open-source, including the learnware package and frontend/backend code. The learnware package is highly extensible, making it easy to integrate new specification designs, learnware system designs, and learnware reuse methods in the future. | |||
| Beimingwu is the first system-level implementation of the learnware paradigm. | |||
| This pioneering venture is just the beginning, with vast opportunities for enhancement and growth in the related technological fields still ahead. | |||
| @@ -16,8 +16,8 @@ Ubuntu 20.04.4 LTS Nvidia Tesla V100S Intel(R) Xeon(R) Gold 6240R | |||
| ==================== ==================== =============================== | |||
| Table: homo+hetero | |||
| ==================== | |||
| Tabular Data Experiments | |||
| =========================== | |||
| Datasets | |||
| ------------------ | |||
| @@ -43,8 +43,8 @@ Based on the specific design of user tasks, our experiments were primarily categ | |||
| - ``heterogeneous experiments`` aim to evaluate the performance of identifying and reusing helpful heterogeneous learnwares in situations where | |||
| no available learnwares match the feature space of the user's task. This helps to highlight the potential of learnwares for applications beyond their original purpose. | |||
| Homo Experiments | |||
| ----------------------- | |||
| Homogeneous Tabular Dataset | |||
| ----------------------------- | |||
| In homogeneous experiments, the 55 stores in the Corporacion dataset are considered as 55 users. Each store uses the same feature engineering method | |||
| and their own test set as user data. These users then search for and reuse homogeneous learnwares in the market which exactly match the feature spaces of their tasks. | |||
| @@ -52,17 +52,20 @@ and their own test set as user data. These users then search for and reuse homog | |||
| The Mean Squared Error (MSE) of search and reuse across all users is presented in the table below: | |||
| +-----------------------------------+---------------------+ | |||
| | Mean in Market (Single) | 0.331 | | |||
| | Setting | MSE | | |||
| +===================================+=====================+ | |||
| | Mean in Market (Single) | 0.331 | | |||
| +-----------------------------------+---------------------+ | |||
| | Best in Market (Single) | 0.151 | | |||
| | Best in Market (Single) | 0.151 | | |||
| +-----------------------------------+---------------------+ | |||
| | Top-1 Reuse (Single) | 0.280 | | |||
| | Top-1 Reuse (Single) | 0.280 | | |||
| +-----------------------------------+---------------------+ | |||
| | Job Selector Reuse (Multiple) | 0.274 | | |||
| | Job Selector Reuse (Multiple) | 0.274 | | |||
| +-----------------------------------+---------------------+ | |||
| | Average Ensemble Reuse (Multiple) | 0.267 | | |||
| | Average Ensemble Reuse (Multiple) | 0.267 | | |||
| +-----------------------------------+---------------------+ | |||
| When users have both test data and limited training data derived from their original data, reusing single or multiple searched learnwares from the market can often yield | |||
| better results than training models from scratch on limited training data. We present the change curves in MSE for the user's self-trained model, as well as for the Feature Augmentation single learnware reuse method and the Ensemble Pruning multiple learnware reuse method. | |||
| These curves display their performance on the user's test data as the amount of labeled training data increases. | |||
| @@ -76,8 +79,8 @@ The figure clearly shows that when users have limited training data, reusing sin | |||
| This highlights the advantage of reusing learnwares in substantially reducing the need for large training datasets and achieving better outcomes with restricted user training data. | |||
| Hetero Experiments | |||
| ------------------------- | |||
| Heterogeneous Tabular Dataset | |||
| ------------------------------ | |||
| In heterogeneous experiments, the learnware market would recommend helpful heterogeneous learnwares with different feature spaces with | |||
| the user tasks. Based on whether there are learnwares in the market that handle tasks similar to the user's task, the experiments can be further subdivided into the following two types: | |||
| @@ -91,6 +94,8 @@ we tested various heterogeneous learnware reuse methods (without using user's la | |||
| The average MSE performance across 41 users are as follows: | |||
| +-----------------------------------+---------------------+ | |||
| | Setting | MSE | | |||
| +===================================+=====================+ | |||
| | Mean in Market (Single) | 1.459 | | |||
| +-----------------------------------+---------------------+ | |||
| | Best in Market (Single) | 1.226 | | |||
| @@ -122,35 +127,36 @@ The average results across 10 users are depicted in the figure below: | |||
| We can observe that heterogeneous learnwares are beneficial when there's a limited amount of the user's labeled training data available, | |||
| aiding in better alignment with the user's specific task. This underscores the potential of learnwares to be applied to tasks beyond their original purpose. | |||
| Image Experiment | |||
| ==================== | |||
| Image Data Experiment | |||
| ========================= | |||
| For the CIFAR-10 dataset, we sampled the training set unevenly by category and constructed unbalanced training datasets for the 50 learnwares that contained only some of the categories. This makes it unlikely that there exists any learnware in the learnware market that can accurately handle all categories of data; only the learnware whose training data is closest to the data distribution of the target task is likely to perform well on the target task. Specifically, the probability of each category being sampled obeys a random multinomial distribution, with a non-zero probability of sampling on only 4 categories, and the sampling ratio is 0.4: 0.4: 0.1: 0.1. Ultimately, the training set for each learnware contains 12,000 samples covering the data of 4 categories in CIFAR-10. | |||
| We constructed 50 target tasks using data from the test set of CIFAR-10. Similar to constructing the training set for the learnwares, in order to allow for some variation between tasks, we sampled the test set unevenly. Specifically, the probability of each category being sampled obeys a random multinomial distribution, with non-zero sampling probability on 6 categories, and the sampling ratio is 0.3: 0.3: 0.1: 0.1: 0.1: 0.1. Ultimately, each target task contains 3000 samples covering the data of 6 categories in CIFAR-10. | |||
| With this experimental setup, we evaluated the performance of RKME Image by calculating the mean accuracy across all users. | |||
| With this experimental setup, we evaluated the performance of RKME Image using 1 - Accuracy as the loss. | |||
| +-----------------------------------+---------------------+ | |||
| | Mean in Market (Single) | 0.346 | | |||
| | Setting | Accuracy | | |||
| +===================================+=====================+ | |||
| | Mean in Market (Single) | 0.655 | | |||
| +-----------------------------------+---------------------+ | |||
| | Best in Market (Single) | 0.688 | | |||
| | Best in Market (Single) | 0.304 | | |||
| +-----------------------------------+---------------------+ | |||
| | Top-1 Reuse (Single) | 0.534 | | |||
| | Top-1 Reuse (Single) | 0.406 | | |||
| +-----------------------------------+---------------------+ | |||
| | Job Selector Reuse (Multiple) | 0.534 | | |||
| | Job Selector Reuse (Multiple) | 0.406 | | |||
| +-----------------------------------+---------------------+ | |||
| | Average Ensemble Reuse (Multiple) | 0.676 | | |||
| | Average Ensemble Reuse (Multiple) | 0.310 | | |||
| +-----------------------------------+---------------------+ | |||
| In some specific settings, the user will have a small number of labeled samples. In such settings, learning the weight of selected learnwares on a limited number of labeled samples can result in a better performance than training directly on a limited number of labeled samples. | |||
| In some specific settings, the user will have a small number of labeled samples. In such settings, learning the weight of selected learnwares on a limited number of labeled samples can result in better performance than training directly on a limited number of labeled samples. | |||
| .. image:: ../_static/img/image_labeled.svg | |||
| :align: center | |||
| Text Experiment | |||
| ==================== | |||
| Text Data Experiment | |||
| ========================== | |||
| Datasets | |||
| ------------------ | |||
| @@ -177,6 +183,8 @@ Results | |||
| The table below presents the mean accuracy of search and reuse across all users: | |||
| +-----------------------------------+---------------------+ | |||
| | Setting | Accuracy | | |||
| +===================================+=====================+ | |||
| | Mean in Market (Single) | 0.507 | | |||
| +-----------------------------------+---------------------+ | |||
| | Best in Market (Single) | 0.859 | | |||
| @@ -199,17 +207,23 @@ We present the change curves in classification error rates for both the user's s | |||
| From the figure above, it is evident that when the user's own training data is limited, the performance of multiple learnware reuse surpasses that of the user's own model. As the user's training data grows, it is expected that the user's model will eventually outperform the learnware reuse. This underscores the value of reusing learnware to significantly conserve training data and achieve superior performance when user training data is limited. | |||
| Get Start Examples | |||
| ========================= | |||
| We utilize the `fire` module to construct our experiments, including table, image and text scenario. | |||
| Examples for `Tabular, Text` and `Image` data sets are available at `Learnware Examples <https://github.com/Learnware-LAMDA/Learnware/tree/main/examples>`_. You can run { main.py } directly to reproduce related experiments. | |||
| We utilize the `fire` module to construct our experiments. | |||
| Examples for `Image` are available at [examples/dataset_image_workflow]. | |||
| Text Examples | |||
| ------------------ | |||
| You can execute the experiment with the following commands: | |||
| * `python workflow.py image_example`: Run both the unlabeled_image_example and labeled_image_example experiments. The results will be printed in the terminal, and the curves will be automatically saved in the `figs` directory. | |||
| * `python main.py unlabeled_text_example`: Executes the unlabeled_text_example experiment; the results will be printed in the terminal. | |||
| * `python main.py labeled_text_example`: Executes the labeled_text_example experiment; result curves will be automatically saved in the `figs` directory. | |||
| Examples for `Text` are available at [examples/dataset_text_workflow]. | |||
| Image Examples | |||
| ------------------ | |||
| You can execute the experiment with the following commands: | |||
| * `python workflow.py unlabeled_text_example`: Run the unlabeled_text_example experiment. The results will be printed in the terminal. | |||
| * `python workflow.py labeled_text_example`: Run the labeled_text_example experiment. The result curves will be automatically saved in the `figs` directory. | |||
| .. code-block:: bash | |||
| python workflow.py image_example | |||
| @@ -4,50 +4,43 @@ Installation Guide | |||
| ======================== | |||
| ``Learnware Market`` Installation | |||
| ================================= | |||
| ``learnware`` Package Installation | |||
| =================================== | |||
| .. note:: | |||
| ``Learnware Market`` supports `Windows`, `Linux` and `Macos`. It's recommended to use ``Learnware Market`` in `Linux`. ``Learnware Market`` supports Python3, which is up to Python3.8. | |||
| The ``learnware`` package supports `Windows` and `Linux`. It's recommended to use the ``learnware`` package on `Linux`. The package supports Python 3, up to Python 3.11. | |||
| Users can easily install ``Learnware Market`` by pip according to the following command: | |||
| Users can easily install ``Learnware`` by pip according to the following command: | |||
| - For Windows and Linux users: | |||
| .. code-block:: bash | |||
| .. code-block:: bash | |||
| pip install learnware | |||
| - For macOS users: | |||
| pip install learnware | |||
| .. code-block:: bash | |||
| In the ``learnware`` package, besides the base classes, many core functionalities such as "learnware specification generation" and "learnware deployment" rely on the ``torch`` library. Users have the option to manually install ``torch``, or they can directly use the following command to install the ``learnware`` package: | |||
| conda install -c pytorch faiss | |||
| pip install learnware | |||
| .. code-block:: bash | |||
| pip install learnware[full] | |||
| Also, Users can install ``Learnware Market`` by the source code according to the following steps: | |||
| .. note:: | |||
| However, it's crucial to note that due to the potential complexity of the user's local environment, installing ``learnware[full]`` does not guarantee that ``torch`` will successfully invoke ``CUDA`` in the user's local setting. | |||
| - Enter the root directory of ``Learnware Market``, in which the file ``setup.py`` exists. | |||
| - Then, please execute the following command to install the environment dependencies and install ``Learnware Market``: | |||
| - For Windows and Linux users: | |||
| Install ``learnware`` Package From Source | |||
| ========================================== | |||
| .. code-block:: bash | |||
| $ git clone https://git.nju.edu.cn/learnware/learnware-market.git && cd learnware-market | |||
| $ python setup.py install | |||
| Also, users can install ``Learnware`` from the source code according to the following steps: | |||
| - For macOS users: | |||
| - Enter the root directory of ``Learnware``, in which the file ``setup.py`` exists. | |||
| - Then, please execute the following command to install the environment dependencies and install ``Learnware``: | |||
| .. code-block:: bash | |||
| $ conda install -c pytorch faiss | |||
| $ git clone https://git.nju.edu.cn/learnware/learnware-market.git && cd learnware-market | |||
| $ python setup.py install | |||
| $ git clone https://github.com/Learnware-LAMDA/Learnware.git && cd Learnware | |||
| $ pip install -e .[dev] | |||
| .. note:: | |||
| It's recommended to use anaconda/miniconda to setup the environment. | |||
| It's recommended to use anaconda/miniconda to set up the environment. Also, you can run ``pip install -e .[full, dev]`` to install ``torch`` automatically as well. | |||
| Use the following code to make sure the installation successful: | |||
| @@ -3,61 +3,36 @@ | |||
| Introduction | |||
| ================ | |||
| ``Learnware`` is a model sharing platform, which give a basic implementation of the learnware paradigm. A learnware is a well-performed trained machine learning model with a specification that enables it to be adequately identified to reuse according to the requirement of future users who may know nothing about the learnware in advance. The learnware paradigm can solve entangled problems in the current machine learning paradigm, like continual learning and catastrophic forgetting. It also reduces resources for training a well-performed model. | |||
| The *learnware* paradigm, proposed by Professor Zhi-Hua Zhou in 2016 [1, 2], aims to build a vast model platform system, i.e., a *learnware dock system*, which systematically accommodates and organizes models shared by machine learning developers worldwide, and can efficiently identify and assemble existing helpful model(s) to solve future tasks in a unified way. | |||
| The ``learnware`` package provides a fundamental implementation of the central concepts and procedures within the learnware paradigm. Its well-structured design ensures high scalability and facilitates the seamless integration of additional features and techniques in the future. | |||
| Motivation | |||
| ================= | |||
| In addition, the ``learnware`` package serves as the engine for the `Beimingwu System <https://bmwu.cloud/#/>`_ and can be effectively employed for conducting experiments related to learnware. | |||
| .. image:: ../_static/img/learnware_paradigm.jpg | |||
| :align: center | |||
| | [1] Zhi-Hua Zhou. Learnware: on the future of machine learning. *Frontiers of Computer Science*, 2016, 10(4): 589–590 | |||
| | [2] Zhi-Hua Zhou. Machine Learning: Development and Future. *Communications of CCF*, 2017, vol.13, no.1 (2016 CNCC keynote) | |||
| Machine learning, especially the prevailing big model paradigm, has achieved great success in natural language processing and computer vision applications. However, it still faces challenges such as the requirement of a large amount of labeled training data, difficulty in adapting to changing environments, and catastrophic forgetting when refining trained models incrementally. These big models, while useful in their targeted tasks, often fail to address the above issues and struggle to generalize beyond their specific purposes. | |||
| To better address the entangled issues in machine learning, we should consider the following aspects: | |||
| +------------------------------------------------------------------------------------+ | |||
| | Aspect | | |||
| +====================================================================================+ | |||
| | 1. Investigate techniques that address multiple challenges simultaneously, | | |||
| | recognizing that these issues are often intertwined in real-world applications. | | |||
| +------------------------------------------------------------------------------------+ | |||
| | 2. Explore paradigms like learnware, which offers the possibility of | | |||
| | systematically reusing small models for tasks beyond their original purposes, | | |||
| | reducing the need for users to build models from scratch. | | |||
| +------------------------------------------------------------------------------------+ | |||
| | 3. Develop solutions that enable ordinary users to create well-performing models | | |||
| | without requiring proficient training skills. | | |||
| +------------------------------------------------------------------------------------+ | |||
| | 4. Address data privacy and proprietary concerns to facilitate experience | | |||
| | sharing among different users while respecting confidentiality. | | |||
| +------------------------------------------------------------------------------------+ | |||
| | 5. Adapt to the constraints of big data applications, where it may be | | |||
| | unaffordable or infeasible to hold all data for multiple passes of scanning. | | |||
| +------------------------------------------------------------------------------------+ | |||
| | 6. Consider the environmental impact of training large models, as their carbon | | |||
| | emissions pose a threat to our environment. | | |||
| +------------------------------------------------------------------------------------+ | |||
| By considering these factors, we can develop a more comprehensive framework for tackling the complex challenges in machine learning, moving beyond the limitations of the big model paradigm, called Learnware. | |||
| Framework | |||
| ======================= | |||
| .. image:: ../_static/img/learnware_market.jpg | |||
| :align: center | |||
| What is Learnware? | |||
| ==================== | |||
| A learnware consists of a high-performance machine learning model and specifications that characterize the model, i.e., "Learnware = Model + Specification". | |||
| The learnware paradigm introduces the concept of a well-performed, trained machine learning model with a specification that allows future users, who have no prior knowledge of the learnware, to reuse it based on their requirements. | |||
| The learnware specification consists of "semantic specification" and "statistical specification": | |||
| Developers or owners of trained machine learning models can submit their models to a learnware market. If accepted, the market assigns a specification to the model and accommodates it. The learnware market could host thousands or millions of well-performed models from different developers, for various tasks, using diverse data, and optimizing different objectives. | |||
| - ``Semantic Specification``: Describe the type and functionality of the model through text. | |||
| - ``Statistical Specification``: Characterize the statistical information contained in the model using various machine learning techniques. | |||
| Instead of building a model from scratch, users can submit their requirements to the learnware market, which then identifies and deploys helpful learnware(s) based on the specifications. Users can apply the learnware directly, adapt it using their data, or exploit it in other ways to improve their model. This process is more efficient and less expensive than building a model from scratch. | |||
| Learnware specifications describe the model's capabilities, enabling the model to be identified and reused by future users who may know nothing about the learnware in advance. | |||
| Why do we need Learnware? | |||
| ============================ | |||
| Benefits of the Learnware Paradigm | |||
| ============================================== | |||
| The Benefits of Learnware Paradigm | |||
| ------------------------------------- | |||
| Machine learning has achieved great success in many fields but still faces various challenges, such as the need for extensive training data and advanced training techniques, the difficulty of continuous learning, the risk of catastrophic forgetting, and the leakage of data privacy. | |||
| Although there are many efforts focusing on one of these issues separately, they are entangled, and solving one problem may exacerbate others. The learnware paradigm aims to address many of these challenges through a unified framework. | |||
| +-----------------------+-----------------------------------------------------------------------------------------------+ | |||
| | Benefit | Description | | |||
| @@ -83,11 +58,29 @@ Benefits of the Learnware Paradigm | |||
| | | large models and the carbon footprint. | | |||
| +-----------------------+-----------------------------------------------------------------------------------------------+ | |||
| Challenges and Future Work | |||
| ============================================== | |||
| How to Solve Future Tasks with Learnware Paradigm? | |||
| ---------------------------------------------------- | |||
| .. image:: ../_static/img/learnware_paradigm.jpg | |||
| :align: center | |||
| Although the learnware proposal shows promise, much work remains to make it a reality. The next sections will present some of the progress made so far. | |||
| Instead of building a model from scratch, users can submit their requirements to the learnware market, which then identifies and deploys helpful learnware(s) based on the specifications. Users can apply the learnware directly, adapt it using their data, or exploit it in other ways to improve their models. This process is more efficient and less expensive than building a model from scratch. | |||
| Procedure of Learnware Paradigm | |||
| ================================== | |||
| - ``Submitting Stage``: Developers voluntarily submit various learnwares to the learnware market, and the system conducts quality checks and further organization of these learnwares. | |||
| - ``Deploying Stage``: When users submit task requirements, the learnware market automatically selects whether to recommend a single learnware or a combination of multiple learnwares and provides efficient deployment methods. Whether it's a single learnware or a combination of multiple learnwares, the system offers convenient learnware reuse interfaces. | |||
| .. image:: ../_static/img/learnware_market.svg | |||
| :align: center | |||
| Learnware Package Design | |||
| ========================== | |||
| .. image:: ../_static/img/learnware_framework.svg | |||
| :align: center | |||
| At the workflow level, the ``learnware`` package consists of ``Submitting Stage`` and ``Deploying Stage``. | |||
| At the module level, the ``learnware`` package is a platform that consists of the above components. The components are designed as loosely-coupled modules, and each component can be used stand-alone. | |||
| @@ -7,90 +7,44 @@ Quick Start | |||
| Introduction | |||
| ==================== | |||
| This ``Quick Start`` guide aims to illustrate the straightforward process of establishing a full ``Learnware Market`` workflow | |||
| and utilizing ``Learnware Market`` to handle user tasks. | |||
| This ``Quick Start`` guide aims to illustrate the straightforward process of establishing a full ``Learnware`` workflow | |||
| and utilizing ``Learnware`` to handle user tasks. | |||
| Installation | |||
| ==================== | |||
| Learnware is currently hosted on `PyPI <https://pypi.org/>`__. You can easily install ``learnware`` by following these steps: | |||
| Learnware is currently hosted on `PyPI <https://pypi.org/>`_. You can easily install ``Learnware`` by following these steps: | |||
| - For Windows and Linux users: | |||
| .. code-block:: bash | |||
| .. code-block:: | |||
| pip install learnware | |||
| pip install learnware | |||
| In the ``learnware`` package, besides the base classes, many core functionalities such as "learnware specification generation" and "learnware deployment" rely on the ``torch`` library. Users have the option to manually install ``torch``, or they can directly use the following command to install the ``learnware`` package: | |||
| - For macOS users: | |||
| .. code-block:: bash | |||
| .. code-block:: | |||
| conda install -c pytorch faiss | |||
| pip install learnware | |||
| pip install learnware[full] | |||
| .. note:: | |||
| However, it's crucial to note that due to the potential complexity of the user's local environment, installing ``learnware[full]`` does not guarantee that ``torch`` will successfully invoke ``CUDA`` in the user's local setting. | |||
| Prepare Learnware | |||
| ==================== | |||
| The Learnware Market encompasses a broad variety of learnwares. A valid learnware is a zipfile that | |||
| includes the following four components: | |||
| - ``__init__.py`` | |||
| A Python file that provides interfaces for fitting, predicting, and fine-tuning your model. | |||
| - ``rkme.json`` | |||
| A JSON file that contains the statistical specification of your data. | |||
| - ``learnware.yaml`` | |||
| A configuration file that details your model's class name, the type of statistical specification (e.g. ``RKMETableSpecification`` for Reduced Kernel Mean Embedding), and | |||
| the file name of your statistical specification file. | |||
| - ``environment.yaml`` or ``requirements.txt`` | |||
| - ``environment.yaml`` for conda: | |||
| A Conda environment configuration file for running the model. If the model environment is incompatible, this file can be used for manual configuration. | |||
| Here's how you can generate this file: | |||
| - Create env config for conda: | |||
| - For Windows users: | |||
| .. code-block:: | |||
| conda env export | findstr /v "^prefix: " > environment.yaml | |||
| - For macOS and Linux users | |||
| .. code-block:: | |||
| conda env export | grep -v "^prefix: " > environment.yaml | |||
| - Recover env from config: | |||
| .. code-block:: | |||
| In the ``learnware`` package, each learnware is encapsulated in a ``zip`` package, which should contain at least the following four files: | |||
| conda env create -f environment.yaml | |||
| - ``requirements.txt`` for pip: | |||
| A plain text document that lists all packages necessary for executing the model. These dependencies can be effortlessly installed using pip with the command: | |||
| .. code-block:: | |||
| pip install -r requirements.txt | |||
| We've also detailed the format of the learnware zipfile in :ref:`Learnware Preparation<workflows/upload:Prepare Learnware>`. | |||
| - ``learnware.yaml``: learnware configuration file. | |||
| - ``__init__.py``: methods for using the model. | |||
| - ``stat.json``: the statistical specification of the learnware. Its filename can be customized and recorded in learnware.yaml. | |||
| - ``environment.yaml`` or ``requirements.txt``: specifies the environment for the model. | |||
| To facilitate the construction of a learnware, we provide a `Learnware Template <https://www.bmwu.cloud/static/learnware-template.zip>`_ that users can use as a basis for building their own learnware. We've also detailed the format of the learnware ``zip`` package in `Learnware Preparation <../workflows/upload:prepare-learnware>`_. | |||
| Learnware Market Workflow | |||
| Learnware Package Workflow | |||
| ============================ | |||
| Users can start a ``Learnware Market`` workflow according to the following steps: | |||
| Users can start a ``Learnware`` workflow according to the following steps: | |||
| Initialize a Learnware Market | |||
| ------------------------------- | |||
| @@ -100,11 +54,10 @@ You can initialize a basic ``Learnware Market`` named "demo" using the code snip | |||
| .. code-block:: python | |||
| import learnware | |||
| from learnware.market import EasyMarket | |||
| from learnware.market import instantiate_learnware_market | |||
| learnware.init() | |||
| easy_market = EasyMarket(market_id="demo", rebuild=True) | |||
| # instantiate a demo market | |||
| demo_market = instantiate_learnware_market(market_id="demo", name="easy", rebuild=True) | |||
| Upload Leanware | |||
| @@ -114,28 +67,30 @@ Before uploading your learnware to the ``Learnware Market``, | |||
| you'll need to create a semantic specification, ``semantic_spec``. This involves selecting or inputting values for predefined semantic tags | |||
| to describe the features of your task and model. | |||
| For instance, the dictionary snippet below illustrates the semantic specification for a Scikit-Learn type model. | |||
| This model is tailored for business scenarios and performs classification tasks on tabular data: | |||
| For instance, the following code illustrates the semantic specification for a Scikit-Learn type model. | |||
| This model is tailored for education scenarios and performs classification tasks on tabular data: | |||
| .. code-block:: python | |||
| semantic_spec = { | |||
| "Data": {"Values": ["Tabular"], "Type": "Class"}, | |||
| "Task": {"Values": ["Classification"], "Type": "Class"}, | |||
| "Library": {"Values": ["Scikit-learn"], "Type": "Class"}, | |||
| "Scenario": {"Values": ["Business"], "Type": "Tag"}, | |||
| "Description": {"Values": "", "Type": "String"}, | |||
| "Name": {"Values": "demo_learnware", "Type": "String"}, | |||
| } | |||
| from learnware.specification import generate_semantic_spec | |||
| semantic_spec = generate_semantic_spec( | |||
| name="demo_learnware", | |||
| data_type="Table", | |||
| task_type="Classification", | |||
| library_type="Scikit-learn", | |||
| scenarios="Education", | |||
| license="MIT", | |||
| ) | |||
| After defining the semantic specification, | |||
| you can upload your learnware using a single line of code: | |||
| .. code-block:: python | |||
| easy_market.add_learnware(zip_path, semantic_spec) | |||
| Here, ``zip_path`` is the directory of your learnware zipfile. | |||
| demo_market.add_learnware(zip_path, semantic_spec) | |||
| Here, ``zip_path`` is the directory of your learnware ``zip`` package. | |||
| Semantic Specification Search | |||
| @@ -150,10 +105,11 @@ The ``Learnware Market`` will then perform an initial search using ``user_semant | |||
| user_info = BaseUserInfo(id="user", semantic_spec=semantic_spec) | |||
| # search_learnware: performs semantic specification search when user_info doesn't include a statistical specification | |||
| _, single_learnware_list, _ = easy_market.search_learnware(user_info) | |||
| search_result = easy_market.search_learnware(user_info) | |||
| single_result = search_result.get_single_results() | |||
| # single_learnware_list: the learnware list returned by semantic specification search | |||
| print(single_learnware_list) | |||
| # single_result: the List of Tuple[Score, Learnware] returned by semantic specification search | |||
| print(single_result) | |||
| Statistical Specification Search | |||
| @@ -176,43 +132,64 @@ For example, the code below executes learnware search when using Reduced Set Ker | |||
| user_info = BaseUserInfo( | |||
| semantic_spec=user_semantic, stat_info={"RKMETableSpecification": user_spec} | |||
| ) | |||
| (sorted_score_list, single_learnware_list, | |||
| mixture_score, mixture_learnware_list) = easy_market.search_learnware(user_info) | |||
| search_result = easy_market.search_learnware(user_info) | |||
| # sorted_score_list: learnware scores(based on MMD distances), sorted in descending order | |||
| print(sorted_score_list) | |||
| single_result = search_result.get_single_results() | |||
| multiple_result = search_result.get_multiple_results() | |||
| # single_learnware_list: learnwares, sorted by scores in descending order | |||
| print(single_learnware_list) | |||
| # search_item.score: based on MMD distances, sorted in descending order | |||
| # search_item.learnware.id: id of learnwares, sorted by scores in descending order | |||
| for search_item in single_result: | |||
| print(f"score: {search_item.score}, learnware_id: {search_item.learnware.id}") | |||
| # mixture_learnware_list: collection of learnwares whose combined use is beneficial | |||
| print(mixture_learnware_list) | |||
| # mixture_score: score assigned to the combined set of learnwares in `mixture_learnware_list` | |||
| print(mixture_score) | |||
| # mixture_item.learnwares: collection of learnwares whose combined use is beneficial | |||
| # mixture_item.score: score assigned to the combined set of learnwares in `mixture_item.learnwares` | |||
| for mixture_item in multiple_result: | |||
| print(f"mixture_score: {mixture_item.score}\n") | |||
| mixture_id = " ".join([learnware.id for learnware in mixture_item.learnwares]) | |||
| print(f"mixture_learnware: {mixture_id}\n") | |||
| Reuse Learnwares | |||
| ------------------------------- | |||
| With the list of learnwares, ``mixture_learnware_list``, returned from the previous step, you can readily apply them to make predictions on your own data, bypassing the need to train a model from scratch. | |||
| We offer two baseline methods for reusing a given list of learnwares: ``JobSelectorReuser`` and ``AveragingReuser``. | |||
| Just substitute ``test_x`` in the code snippet below with your own testing data, and you're all set to reuse learnwares! | |||
| We provide two methods for reusing a given list of learnwares: ``JobSelectorReuser`` and ``AveragingReuser``. | |||
| Just substitute ``test_x`` in the code snippet below with your own testing data, and you're all set to reuse learnwares: | |||
| .. code-block:: python | |||
| from learnware.reuse import JobSelectorReuser, AveragingReuser | |||
| # using jobselector reuser to reuse the searched learnwares to make prediction | |||
| reuse_job_selector = JobSelectorReuser(learnware_list=mixture_learnware_list) | |||
| reuse_job_selector = JobSelectorReuser(learnware_list=mixture_item.learnwares) | |||
| job_selector_predict_y = reuse_job_selector.predict(user_data=test_x) | |||
| # using averaging ensemble reuser to reuse the searched learnwares to make prediction | |||
| reuse_ensemble = AveragingReuser(learnware_list=mixture_learnware_list) | |||
| reuse_ensemble = AveragingReuser(learnware_list=mixture_item.learnwares) | |||
| ensemble_predict_y = reuse_ensemble.predict(user_data=test_x) | |||
| We also provide two methods for reusing a given list of learnwares when the user has labeled data: ``EnsemblePruningReuser`` and ``FeatureAugmentReuser``. | |||
| Just substitute ``test_x`` in the code snippet below with your own testing data, and substitute ``train_X, train_y`` with your own training labeled data, and you're all set to reuse learnwares: | |||
| .. code-block:: python | |||
| from learnware.reuse import EnsemblePruningReuser, FeatureAugmentReuser | |||
| # Use ensemble pruning reuser to reuse the searched learnwares to make prediction | |||
| reuse_ensemble = EnsemblePruningReuser(learnware_list=mixture_item.learnwares, mode="classification") | |||
| reuse_ensemble.fit(train_X, train_y) | |||
| ensemble_pruning_predict_y = reuse_ensemble.predict(user_data=data_X) | |||
| # Use feature augment reuser to reuse the searched learnwares to make prediction | |||
| reuse_feature_augment = FeatureAugmentReuser(learnware_list=mixture_item.learnwares, mode="classification") | |||
| reuse_feature_augment.fit(train_X, train_y) | |||
| feature_augment_predict_y = reuse_feature_augment.predict(user_data=data_X) | |||
| Auto Workflow Example | |||
| ============================ | |||
| The ``Learnware Market`` also offers an automated workflow example. | |||
| The ``Learnware`` also offers automated workflow examples. | |||
| This includes preparing learnwares, uploading and deleting learnwares from the market, and searching for learnwares using both semantic and statistical specifications. | |||
| To experience the basic workflow of the Learnware Market, users can run [workflow code link]. | |||
| To experience the basic workflow of the Learnware Market, please refer to `Learnware Examples <https://github.com/Learnware-LAMDA/Learnware/tree/main/examples>`_. | |||
| @@ -132,5 +132,27 @@ combine ``HeteroMapAlignLearnware`` with the homogeneous reuse methods ``Averagi | |||
| reuse_ensemble.fit(val_x, val_y) | |||
| ensemble_pruning_predict_y = reuse_ensemble.predict(user_data=test_x) | |||
| Reuse with Container | |||
| ===================== | |||
| Reuse with ``Model Container`` | |||
| ================================ | |||
| The ``learnware`` package provides ``Model Container`` to build an execution environment for learnwares according to their runtime dependency files. The learnware's model will be executed in the container, and its environment will be installed and uninstalled automatically. | |||
| Run the following code to try running a learnware with ``Model Container``: | |||
| .. code-block:: python | |||
| from learnware.learnware import Learnware | |||
| with LearnwaresContainer(learnware, mode="conda") as env_container: # Let learnware be instance of Learnware Class, and its input shape is (20, 204) | |||
| learnware = env_container.get_learnwares_with_container()[0] | |||
| input_array = np.random.random(size=(20, 204)) | |||
| print(learnware.predict(input_array)) | |||
| The ``mode`` parameter has two options, each for a specific learnware environment loading method: | |||
| - ``'conda'``: Install a separate conda virtual environment for each learnware (automatically deleted after execution); run each learnware independently within its virtual environment. | |||
| - ``'docker'``: Install a conda virtual environment inside a Docker container (automatically destroyed after execution); run each learnware independently within the container (requires Docker privileges). | |||
| .. note:: | |||
| It's important to note that the "conda" mode is not secure if there are any malicious learnwares. If the user cannot guarantee the security of the learnware they want to load, it's recommended to use the "docker" mode to load the learnware. | |||
| @@ -51,7 +51,7 @@ Hetero Search | |||
| For table-based user tasks, | |||
| homogeneous searchers like ``EasySearcher`` fail to recommend learnwares when no table learnware matches the user task's feature dimension, returning empty results. | |||
| To enhance functionality, ``learnware`` package includes the heterogeneous learnware search feature, whose process is as follows: | |||
| To enhance functionality, the ``learnware`` package includes the heterogeneous learnware search feature, whose process is as follows: | |||
| - Learnware markets such as ``Hetero Market`` integrate different specification islands into a unified "specification world" by assigning system-level specifications to all learnwares. This allows heterogeneous searchers like ``HeteroSearcher`` to find helpful learnwares from all available table learnwares. | |||
| - Searchers assign system-level specifications to users based on ``UserInfo``'s statistical specification, using methods provided by corresponding organizers. In ``Hetero Market``, for example, ``HeteroOrganizer.generate_hetero_map_spec`` generates system-level specifications for users. | |||
| @@ -1,180 +1,274 @@ | |||
| .. _submit: | |||
| ========================================== | |||
| Learnware Preparation and Submission | |||
| Learnware Preparation and Uploading | |||
| ========================================== | |||
| In this section, we provide a comprehensive guide on submitting your custom learnware to the Learnware Market. | |||
| In this section, we provide a comprehensive guide on submitting your custom learnware to the ``Learnware Market``. | |||
| We will first discuss the necessary components of a valid learnware, followed by a detailed explanation on how to upload and remove learnwares within ``Learnware Market``. | |||
| Prepare Learnware | |||
| ==================== | |||
| ==================================== | |||
| A valid learnware is encapsulated in a zipfile, comprising four essential components. | |||
| Below, we illustrate the detailed structure of a learnware zipfile. | |||
| In the ``learnware`` package, each learnware is encapsulated in a ``zip`` package, which should contain at least the following four files: | |||
| ``__init__.py`` | |||
| --------------- | |||
| - ``learnware.yaml``: learnware configuration file. | |||
| - ``__init__.py``: methods for using the model. | |||
| - ``stat.json``: the statistical specification of the learnware. Its filename can be customized and recorded in learnware.yaml. | |||
| - ``environment.yaml`` or ``requirements.txt``: specifies the environment for the model. | |||
| Within ``Learnware Market``, every uploader must provide a unified set of interfaces for their model, | |||
| facilitating easy utilization for future users. | |||
| The ``__init__.py`` file serves as the Python interface for your model's fitting, prediction, and fine-tuning processes. | |||
| For example, the code snippet below is used to train and save a SVM model for a sample dataset on sklearn digits classification: | |||
| To facilitate the construction of a learnware, we provide a `Learnware Template <https://www.bmwu.cloud/static/learnware-template.zip>`_ that you can use as a basis for building your own learnware. | |||
| .. code-block:: python | |||
| import joblib | |||
| from sklearn.datasets import load_digits | |||
| from sklearn.model_selection import train_test_split | |||
| X, y = load_digits(return_X_y=True) | |||
| data_X, _, data_y, _ = train_test_split(X, y, test_size=0.3, shuffle=True) | |||
| # input dimension: (64, ), output dimension: (10, ) | |||
| clf = svm.SVC(kernel="linear", probability=True) | |||
| clf.fit(data_X, data_y) | |||
| Next, we will provide detailed explanations for the content of these four files. | |||
| joblib.dump(clf, "svm.pkl") # model is stored as file "svm.pkl" | |||
| Model Invocation File ``__init__.py`` | |||
| ------------------------------------- | |||
| To ensure that the uploaded learnware can be used by subsequent users, you need to provide interfaces for model fitting ``fit(X, y)``, prediction ``predict(X)``, and fine-tuning ``finetune(X, y)`` in ``__init__.py``. Among these interfaces, only the ``predict(X)`` interface is mandatory, while the others depend on the functionality of your model. | |||
| Then the corresponding ``__init__.py`` for this SVM model should be structured as follows: | |||
| Below is a reference template for the ``__init__.py`` file. Please make sure that the input parameter format (the number of parameters and parameter names) for each interface in your model invocation file matches the template below. | |||
| .. code-block:: python | |||
| import os | |||
| import joblib | |||
| import pickle | |||
| import numpy as np | |||
| from learnware.model import BaseModel | |||
| class SVM(BaseModel): | |||
| class MyModel(BaseModel): | |||
| def __init__(self): | |||
| super(SVM, self).__init__(input_shape=(64,), output_shape=(10,)) | |||
| super(MyModel, self).__init__(input_shape=(37,), output_shape=(1,)) | |||
| dir_path = os.path.dirname(os.path.abspath(__file__)) | |||
| self.model = joblib.load(os.path.join(dir_path, "svm.pkl")) | |||
| model_path = os.path.join(dir_path, "model.pkl") | |||
| with open(model_path, "rb") as f: | |||
| self.model = pickle.load(f) | |||
| def fit(self, X: np.ndarray, y: np.ndarray): | |||
| pass | |||
| self.model = self.model.fit(X) | |||
| def predict(self, X: np.ndarray) -> np.ndarray: | |||
| return self.model.predict_proba(X) | |||
| return self.model.predict(X) | |||
| def finetune(self, X: np.ndarray, y: np.ndarray): | |||
| pass | |||
| Please remember to specify the ``input_shape`` and ``output_shape`` corresponding to your model. | |||
| In our sklearn digits classification example, these would be (64,) and (10,) respectively. | |||
| ``stat.json`` | |||
| ------------- | |||
| Please ensure that the ``MyModel`` class inherits from ``BaseModel`` in the ``learnware.model`` module, and specify the class name (e.g., ``MyModel``) in the ``learnware.yaml`` file later. | |||
| Input and Output Dimensions | |||
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |||
| To accurately and effectively match users with appropriate learnwares for their tasks, we require information about your training dataset. | |||
| Specifically, you are required to provide a statistical specification | |||
| stored as a json file, such as ``stat.json``, which contains the statistical information of the dataset. | |||
| This json file meets all our requirements regarding your training data, so you don't need to upload the local original data. | |||
| ``input_shape`` and ``output_shape`` represent the input and output dimensions of the model, respectively. You can refer to the following guidelines when filling them out: | |||
| - ``input_shape`` specifies a single input sample's dimension, and ``output_shape`` refers to the model's output dimension for a single sample. | |||
| - When the data type being processed is text data, there are no specific requirements for the value of ``input_shape``, and it can be filled in as ``None``. | |||
| - When the ``output_shape`` corresponds to tasks with variable outputs (such as object detection, text segmentation, etc.), there are no specific requirements for the value of ``output_shape``, and it can be filled in as ``None``. | |||
| - For classification tasks, ``output_shape`` should be (1, ) if the model directly outputs predicted labels, and the sample labels need to start from 0. If the model outputs logits, ``output_shape`` should be specified as the number of classes, i.e., (class_num, ). | |||
| There are various methods to generate a statistical specification. | |||
| If you choose to use Reduced Kernel Mean Embedding (RKME) as your statistical specification, | |||
| the following code snippet offers guidance on how to construct and store the RKME of a dataset: | |||
| File Path | |||
| ^^^^^^^^^^^^^^^^^^ | |||
| If you need to load certain files within the zip package in the ``__init__.py`` file (and any other Python files that may be involved), please follow the method shown in the template above about obtaining the ``model_path``: | |||
| - First, obtain the root directory path of the entire package by getting ``dir_path``. | |||
| - Then, based on the specific file's relative location within the package, obtain the specific file's path, ``model_path``. | |||
| Module Imports | |||
| ^^^^^^^^^^^^^^^^^^ | |||
| Please note that module imports between Python files within the zip package should be done using **relative imports**. For instance: | |||
| .. code-block:: python | |||
| from learnware.specification import generate_rkme_spec | |||
| # generate rkme specification for digits dataset | |||
| spec = generate_rkme_spec(X=data_X) | |||
| from .package_name import * | |||
| from .package_name import module_name | |||
| Learnware Statistical Specification ``stat.json`` | |||
| --------------------------------------------------- | |||
| A learnware consists of a model and a specification. Therefore, after preparing the model, you need to generate a statistical specification for it. Specifically, using the previously installed ``learnware`` package, you can use the training data ``train_x`` (supported types include numpy.ndarray, pandas.DataFrame, and torch.Tensor) as input to generate the statistical specification of the model. | |||
| Here is an example of the code: | |||
| .. code-block:: python | |||
| from learnware.specification import generate_stat_spec | |||
| data_type = "table" # Data types: ["table", "image", "text"] | |||
| spec = generate_stat_spec(type=data_type, X=train_x) | |||
| spec.save("stat.json") | |||
| Significantly, the RKME generation process is entirely conducted on your local machine, without any involvement of cloud services, | |||
| guaranteeing the security and privacy of your local original data. | |||
| It's worth noting that the above code only runs on your local computer and does not interact with any cloud servers or leak any local private data. | |||
| Additionally, if the model's training data is too large, causing the above code to fail, you can consider sampling the training data to ensure it's of a suitable size before proceeding with reduction generation. | |||
| ``learnware.yaml`` | |||
| ------------------ | |||
| Additionally, you are asked to prepare a configuration file in YAML format. | |||
| The file should detail your model's class name, the type of statistical specification (e.g. Reduced Kernel Mean Embedding, ``RKMETableSpecification``), and | |||
| the file name of your statistical specification file. The following ``learnware.yaml`` provides an example of | |||
| how your learnware configuration file should be structured, based on our previous discussion: | |||
| Learnware Configuration File ``learnware.yaml`` | |||
| ------------------------------------------------- | |||
| This file is used to specify the class name (``MyModel``) in the model invocation file ``__init__.py``, the module called for generating the statistical specification (``learnware.specification``), the category of the statistical specification (``RKMETableSpecification``), and the specific filename (``stat.json``): | |||
| .. code-block:: yaml | |||
| model: | |||
| class_name: SVM | |||
| kwargs: {} | |||
| class_name: MyModel | |||
| kwargs: {} | |||
| stat_specifications: | |||
| - module_path: learnware.specification | |||
| - module_path: learnware.specification | |||
| class_name: RKMETableSpecification | |||
| file_name: stat.json | |||
| kwargs: {} | |||
| kwargs: {} | |||
| Please note that the statistical specification class name for different data types ``['table', 'image', 'text']`` is ``[RKMETableSpecification, RKMEImageSpecification, RKMETextSpecification]``, respectively. | |||
| ``environment.yaml`` or ``requirements.txt`` | |||
| Model Runtime Dependent File | |||
| -------------------------------------------- | |||
| In order to allow others to execute your learnware, it's necessary to specify your model's dependencies. | |||
| You can do this by providing either an ``environment.yaml`` file or a ``requirements.txt`` file. | |||
| To ensure that your uploaded learnware can be used by other users, the ``zip`` package of the uploaded learnware should specify the model's runtime dependencies. The Beimingwu System supports the following two ways to specify runtime dependencies: | |||
| - Provide an ``environment.yaml`` file supported by ``conda``. | |||
| - Provide a ``requirements.txt`` file supported by ``pip``. | |||
| You can choose either method, but please try to remove unnecessary dependencies to keep the dependency list as minimal as possible. | |||
| - ``environment.yaml`` for conda: | |||
| Using ``environment.yaml`` File | |||
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |||
| If you provide an ``environment.yaml``, a new conda environment will be created based on this file | |||
| when users install your learnware. You can generate this yaml file using the following command: | |||
| - For Windows users: | |||
| You can export the `environment.yaml` file directly from the `conda` virtual environment using the following command: | |||
| .. code-block:: | |||
| - For Linux and macOS systems | |||
| conda env export | findstr /v "^prefix: " > environment.yaml | |||
| .. code-block:: bash | |||
| conda env export | grep -v "^prefix: " > environment.yaml | |||
| - For macOS and Linux users: | |||
| - For Windows systems: | |||
| .. code-block:: | |||
| .. code-block:: bash | |||
| conda env export | findstr /v "^prefix: " > environment.yaml | |||
| conda env export | grep -v "^prefix: " > environment.yaml | |||
| Note that the ``environment.yaml`` file in the ``zip`` package needs to be encoded in ``UTF-8`` format. Please check the encoding format of the ``environment.yaml`` file after using the above command. Due to the ``conda`` version and system differences, you may not get a ``UTF-8`` encoded file (e.g. get a ``UTF-16LE`` encoded file). You'll need to manually convert the file to ``UTF-8``, which is supported by most text editors. The following ``Python`` code for encoding conversion is also for reference: | |||
| .. code-block:: python | |||
| import codecs | |||
| # Read the output file from the 'conda env export' command | |||
| # Assuming the file name is environment.yaml and the export format is UTF-16LE | |||
| with codecs.open('environment.yaml', 'r', encoding='utf-16le') as file: | |||
| content = file.read() | |||
| # Convert the content to UTF-8 encoding | |||
| output_content = content.encode('utf-8') | |||
| # Write to UTF-8 encoded file | |||
| with open('environment.yaml', 'wb') as file: | |||
| file.write(output_content) | |||
| - ``requirements.txt`` for pip: | |||
| If you provide a ``requirements.txt``, the dependent packages will be installed using the `-r` option of pip. | |||
| You can find more information about ``requirements.txt`` in | |||
| `pip documentation <https://pip.pypa.io/en/stable/user_guide/#requirements-files>`_. | |||
| Additionally, due to the complexity of users' local ``conda`` virtual environments, you can execute the following command before uploading to confirm that there are no dependency conflicts in the ``environment.yaml`` file: | |||
| .. code-block:: bash | |||
| We recommend using ``environment.yaml`` as it can help minimize conflicts between different packages. | |||
| conda env create --name test_env --file environment.yaml | |||
| .. note:: | |||
| Whether you choose to use ``environment.yaml`` or ``requirements.txt``, | |||
| it's important to keep your dependencies as minimal as possible. | |||
| This may involve manually opening the file and removing any unnecessary packages. | |||
| The above command will create a virtual environment based on the ``environment.yaml`` file, and if successful, it indicates that there are no dependency conflicts. You can delete the created virtual environment using the following command: | |||
| .. code-block:: bash | |||
| Check Learnware | |||
| ==================== | |||
| conda env remove --name test_env | |||
| Upload Learnware | |||
| ================== | |||
| Using `requirements.txt` File | |||
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |||
| The ``requirements.txt`` file should list the packages required for running the ``__init__.py`` file and their specific versions. You can obtain these version details by executing the ``pip show <package_name>`` or ``conda list <package_name>`` command. Here is an example file: | |||
| .. code-block:: text | |||
| numpy==1.23.5 | |||
| scikit-learn==1.2.2 | |||
| Manually listing these dependencies can be cumbersome, so you can also use the ``pipreqs`` package to automatically scan your entire project and export the packages used along with their specific versions (though some manual verification may be required): | |||
| After preparing the four required files mentioned above, | |||
| you can bundle them into your own learnware zipfile. Along with the generated semantic specification that | |||
| succinctly describes the features of your task and model (for more details, please refer to :ref:`semantic specification<components/spec:Semantic Specification>`), | |||
| you can effortlessly upload your learnware to the ``Learnware Market`` using a single line of code: | |||
| .. code-block:: bash | |||
| pip install pipreqs | |||
| pipreqs ./ # Run this command in the project's root directory | |||
| Please note that if you use the ``requirements.txt`` file to specify runtime dependencies, the system will by default install these dependencies in a ``conda`` virtual environment running ``Python 3.8`` during the learnware deployment. | |||
| Furthermore, for version-sensitive packages like ``torch``, it's essential to specify package versions in the ``requirements.txt`` file to ensure successful deployment of the uploaded learnware on other machines. | |||
| Upload Learnware | |||
| ================================== | |||
| After preparing the four required files mentioned above, you can bundle them into your own learnware ``zip`` package. | |||
| Prepare Semantic Specification | |||
| ------------------------------- | |||
| The semantic specification succinctly describes the features of your task and model. Before uploading the learnware ``zip`` package, the user needs to prepare the semantic specification. Here is an example for a "Classification Task" on "Table Data": | |||
| .. code-block:: python | |||
| import learnware | |||
| from learnware.market import EasyMarket | |||
| from learnware.specification import generate_semantic_spec | |||
| # Prepare input description when data_type="Table" | |||
| input_description = { | |||
| "Dimension": 5, | |||
| "Description": { | |||
| "0": "age", | |||
| "1": "weight", | |||
| "2": "body length", | |||
| "3": "animal type", | |||
| "4": "claw length" | |||
| }, | |||
| } | |||
| # Prepare output description when task_type in ["Classification", "Regression"] | |||
| output_description = { | |||
| "Dimension": 3, | |||
| "Description": { | |||
| "0": "cat", | |||
| "1": "dog", | |||
| "2": "bird", | |||
| }, | |||
| } | |||
| # Create semantic specification | |||
| semantic_spec = generate_semantic_spec( | |||
| name="learnware_example", | |||
| description="Just an example for uploading learnware", | |||
| data_type="Table", | |||
| task_type="Classification", | |||
| library_type="Scikit-learn", | |||
| scenarios=["Business", "Financial"], | |||
| input_description=input_description, | |||
| output_description=output_description, | |||
| ) | |||
| For more details, please refer to :ref:`semantic specification<components/spec:Semantic Specification>`. | |||
| Uploading | |||
| -------------- | |||
| you can effortlessly upload your learnware to the ``Learnware Market`` as follows. | |||
| learnware.init() | |||
| # EasyMarket: most basic set of functions in a Learnware Market | |||
| easy_market = EasyMarket(market_id="demo", rebuild=True) | |||
| .. code-block:: python | |||
| from learnware.market import BaseChecker | |||
| from learnware.market import instantiate_learnware_market | |||
| # instantiate a demo market | |||
| demo_market = instantiate_learnware_market(market_id="demo", name="hetero", rebuild=True) | |||
| # upload the learnware into the market | |||
| learnware_id, learnware_status = demo_market.add_learnware(zip_path, semantic_spec) | |||
| # single line uploading | |||
| easy_market.add_learnware(zip_path, semantic_spec) | |||
| # assert whether the learnware passed the check and was uploaded successfully. | |||
| assert learnware_status != BaseChecker.INVALID_LEARNWARE, "Insert learnware failed!" | |||
| Here, ``zip_path`` refers to the directory of your learnware zipfile. | |||
| Here, ``zip_path`` refers to the directory of your learnware ``zip`` package. ``learnware_id`` indicates the id assigned by ``Learnware Market``, and the ``learnware_status`` indicates the check status for learnware. | |||
| .. note:: | |||
| The learnware ``zip`` package uploaded into ``LearnwareMarket`` will be checked semantically and statistically, and ``add_learnware`` will return the concrete check status. The check status ``BaseChecker.INVALID_LEARNWARE`` indicates the learnware did not pass the check. For more details about learnware checker, please refer to `Learnware Market <../components/market.html#easy-checker>` | |||
| Remove Learnware | |||
| ================== | |||
| @@ -2,9 +2,18 @@ | |||
| ## Introduction | |||
| For the CIFAR-10 dataset, we sampled the training set unevenly by category and constructed unbalanced training datasets for the 50 learnwares that contained only some of the categories. This makes it unlikely that there exists any learnware in the learnware market that can accurately handle all categories of data; only the learnware whose training data is closest to the data distribution of the target task is likely to perform well on the target task. Specifically, the probability of each category being sampled obeys a random multinomial distribution, with a non-zero probability of sampling on only 4 categories, and the sampling ratio is 0.4: 0.4: 0.1: 0.1. Ultimately, the training set for each learnware contains 12,000 samples covering the data of 4 categories in CIFAR-10. | |||
| We conducted experiments on the widely used image benchmark dataset: [``CIFAR-10``](https://www.cs.toronto.edu/~kriz/cifar.html). | |||
| The ``CIFAR-10`` dataset consists of 60000 32x32 color images in 10 classes, with 6000 images per class. There are 50000 training images and 10000 test images. The 10 different classes represent airplanes, cars, birds, cats, deer, dogs, frogs, horses, ships, and trucks. | |||
| We constructed 50 target tasks using data from the test set of CIFAR-10. Similar to constructing the training set for the learnwares, in order to allow for some variation between tasks, we sampled the test set unevenly. Specifically, the probability of each category being sampled obeys a random multinomial distribution, with non-zero sampling probability on 6 categories, and the sampling ratio is 0.3: 0.3: 0.1: 0.1: 0.1: 0.1. Ultimately, each target task contains 3000 samples covering the data of 6 categories in CIFAR-10. | |||
| In the submitting stage, we sampled the training set non-uniformly by category, and constructed unbalanced training datasets for the 50 learnwares that contained only part of the categories randomly. Specifically, the probability of each category being sampled obeys a random multinomial distribution, with positive sampling probability on only 4 categories, and a sampling ratio of 0.4: 0.4: 0.1: 0.1. The training set for each learnware contains 12,500 samples covering data from the 4 categories in CIFAR-10. | |||
| In the deploying stage, we constructed 100 user tasks using the CIFAR-10 test set data. Similar to constructing the training set, the probability of each category being sampled obeys a random multinomial distribution, with positive sampling probabilities on only 6 categories, with a sampling ratio of 0.3: 0.3: 0.1: 0.1: 0.1: 0.1. Each user task contains 3,000 samples covering the data of 6 categories in CIFAR-10. | |||
| Our example ``image_example`` shows the performance in two different scenarios: | |||
| **Unlabelled Sample Scenario**: This scenario is designed to evaluate performance when users possess only testing data, searching and reusing learnware available in the market. | |||
| **Labelled Sample Scenario**: This scenario aims to assess performance when users have both testing and limited training data, searching and reusing learnware directly from the market instead of training a model from scratch. This helps determine the amount of training data saved for the user. | |||
| ## Run the code | |||
| @@ -18,6 +27,8 @@ python workflow.py image_example | |||
| With the experimental setup above, we evaluated the performance of RKME Image by calculating the mean accuracy across all users. | |||
| ### Unlabelled Sample Scenario | |||
| | Metric | Value | | |||
| |--------------------------------------|---------------------| | |||
| | Mean in Market (Single) | 0.346 | | |||
| @@ -26,8 +37,12 @@ With the experimental setup above, we evaluated the performance of RKME Image by | |||
| | Job Selector Reuse (Multiple) | 0.534 | | |||
| | Average Ensemble Reuse (Multiple) | 0.676 | | |||
| ### Labelled Sample Scenario | |||
| In some specific settings, the user will have a small number of labeled samples. In such settings, learning the weight of selected learnwares on a limited number of labeled samples can result in a better performance than training directly on a limited number of labeled samples. | |||
| <div align=center> | |||
| <img src="../../docs/_static/img/image_labeled.svg" alt="Results on Image Experimental Scenario" style="width:50%;" /> | |||
| </div> | |||
| </div> | |||
| Note that in labelled sample scenario, the labelled samples are repeatedly sampled 3 to 10 times, in order to reduce the estimation error in accuracy due to random sampling. | |||
| @@ -9,19 +9,15 @@ from learnware.utils import choose_device | |||
| @torch.no_grad() | |||
| def evaluate(model, evaluate_set: Dataset, device=None, distribution=True): | |||
| device = choose_device(0) if device is None else device | |||
| if isinstance(model, nn.Module): | |||
| model.eval() | |||
| mapping = lambda m, x: m(x) | |||
| else: | |||
| mapping = lambda m, x: m.predict(x) | |||
| criterion = nn.CrossEntropyLoss(reduction="sum") | |||
| total, correct, loss = 0, 0, torch.as_tensor(0.0, dtype=torch.float32, device=device) | |||
| dataloader = DataLoader(evaluate_set, batch_size=1024, shuffle=True) | |||
| for i, (X, y) in enumerate(dataloader): | |||
| X, y = X.to(device), y.to(device) | |||
| out = mapping(model, X) | |||
| out = model(X) if isinstance(model, nn.Module) else model.predict(X) | |||
| if not torch.is_tensor(out): | |||
| out = torch.from_numpy(out).to(device) | |||
| @@ -49,7 +49,7 @@ class ImageDatasetWorkflow: | |||
| plt.xlabel("Amout of Labeled User Data", fontsize=14) | |||
| plt.ylabel("1 - Accuracy", fontsize=14) | |||
| plt.title(f"Results on Image Experimental Scenario", fontsize=16) | |||
| plt.title("Results on Image Experimental Scenario", fontsize=16) | |||
| plt.legend(fontsize=14) | |||
| plt.tight_layout() | |||
| plt.savefig(os.path.join(self.fig_path, "image_labeled_curves.svg"), bbox_inches="tight", dpi=700) | |||
| @@ -61,7 +61,7 @@ class ImageDatasetWorkflow: | |||
| self.user_semantic = client.get_semantic_specification(self.image_benchmark.learnware_ids[0]) | |||
| self.user_semantic["Name"]["Values"] = "" | |||
| if len(self.image_market) == 0 or rebuild == True: | |||
| if len(self.image_market) == 0 or rebuild is True: | |||
| for learnware_id in self.image_benchmark.learnware_ids: | |||
| with tempfile.TemporaryDirectory(prefix="image_benchmark_") as tempdir: | |||
| zip_path = os.path.join(tempdir, f"{learnware_id}.zip") | |||
| @@ -71,16 +71,15 @@ class ImageDatasetWorkflow: | |||
| client.download_learnware(learnware_id, zip_path) | |||
| self.image_market.add_learnware(zip_path, semantic_spec) | |||
| break | |||
| except: | |||
| except Exception: | |||
| time.sleep(1) | |||
| continue | |||
| logger.info("Total Item: %d" % (len(self.image_market))) | |||
| def image_example(self, rebuild=False): | |||
| def image_example(self, rebuild=False, skip_test=False): | |||
| np.random.seed(1) | |||
| random.seed(1) | |||
| self._prepare_market(rebuild) | |||
| self.n_labeled_list = [100, 200, 500, 1000, 2000, 4000] | |||
| self.repeated_list = [10, 10, 10, 3, 3, 3] | |||
| device = choose_device(0) | |||
| @@ -99,142 +98,149 @@ class ImageDatasetWorkflow: | |||
| improve_list = [] | |||
| job_selector_score_list = [] | |||
| ensemble_score_list = [] | |||
| all_learnwares = self.image_market.get_learnwares() | |||
| for i in range(self.image_benchmark.user_num): | |||
| test_x, test_y = self.image_benchmark.get_test_data(user_ids=i) | |||
| train_x, train_y = self.image_benchmark.get_train_data(user_ids=i) | |||
| if not skip_test: | |||
| self._prepare_market(rebuild) | |||
| all_learnwares = self.image_market.get_learnwares() | |||
| test_x = torch.from_numpy(test_x) | |||
| test_y = torch.from_numpy(test_y) | |||
| test_dataset = TensorDataset(test_x, test_y) | |||
| for i in range(image_benchmark_config.user_num): | |||
| test_x, test_y = self.image_benchmark.get_test_data(user_ids=i) | |||
| train_x, train_y = self.image_benchmark.get_train_data(user_ids=i) | |||
| user_stat_spec = generate_stat_spec(type="image", X=test_x, whitening=False) | |||
| user_info = BaseUserInfo(semantic_spec=self.user_semantic, stat_info={user_stat_spec.type: user_stat_spec}) | |||
| logger.info("Searching Market for user: %d" % (i)) | |||
| test_x = torch.from_numpy(test_x) | |||
| test_y = torch.from_numpy(test_y) | |||
| test_dataset = TensorDataset(test_x, test_y) | |||
| search_result = self.image_market.search_learnware(user_info) | |||
| single_result = search_result.get_single_results() | |||
| multiple_result = search_result.get_multiple_results() | |||
| user_stat_spec = generate_stat_spec(type="image", X=test_x, whitening=False) | |||
| user_info = BaseUserInfo( | |||
| semantic_spec=self.user_semantic, stat_info={user_stat_spec.type: user_stat_spec} | |||
| ) | |||
| logger.info("Searching Market for user: %d" % (i)) | |||
| print(f"search result of user{i}:") | |||
| print( | |||
| f"single model num: {len(single_result)}, max_score: {single_result[0].score}, min_score: {single_result[-1].score}" | |||
| ) | |||
| search_result = self.image_market.search_learnware(user_info) | |||
| single_result = search_result.get_single_results() | |||
| multiple_result = search_result.get_multiple_results() | |||
| acc_list = [] | |||
| for idx in range(len(all_learnwares)): | |||
| learnware = all_learnwares[idx] | |||
| loss, acc = evaluate(learnware, test_dataset) | |||
| acc_list.append(acc) | |||
| learnware = single_result[0].learnware | |||
| best_loss, best_acc = evaluate(learnware, test_dataset) | |||
| best_list.append(np.max(acc_list)) | |||
| select_list.append(best_acc) | |||
| avg_list.append(np.mean(acc_list)) | |||
| improve_list.append((best_acc - np.mean(acc_list)) / np.mean(acc_list)) | |||
| print(f"market mean accuracy: {np.mean(acc_list)}, market best accuracy: {np.max(acc_list)}") | |||
| print( | |||
| f"Top1-score: {single_result[0].score}, learnware_id: {single_result[0].learnware.id}, acc: {best_acc}" | |||
| ) | |||
| print(f"search result of user{i}:") | |||
| print( | |||
| f"single model num: {len(single_result)}, max_score: {single_result[0].score}, min_score: {single_result[-1].score}" | |||
| ) | |||
| if len(multiple_result) > 0: | |||
| mixture_id = " ".join([learnware.id for learnware in multiple_result[0].learnwares]) | |||
| print(f"mixture_score: {multiple_result[0].score}, mixture_learnware: {mixture_id}") | |||
| mixture_learnware_list = multiple_result[0].learnwares | |||
| else: | |||
| mixture_learnware_list = [single_result[0].learnware] | |||
| # test reuse (job selector) | |||
| reuse_job_selector = JobSelectorReuser(learnware_list=mixture_learnware_list, use_herding=False) | |||
| job_loss, job_acc = evaluate(reuse_job_selector, test_dataset) | |||
| job_selector_score_list.append(job_acc) | |||
| print(f"mixture reuse accuracy (job selector): {job_acc}") | |||
| # test reuse (ensemble) | |||
| reuse_ensemble = AveragingReuser(learnware_list=mixture_learnware_list, mode="vote_by_prob") | |||
| ensemble_loss, ensemble_acc = evaluate(reuse_ensemble, test_dataset) | |||
| ensemble_score_list.append(ensemble_acc) | |||
| print(f"mixture reuse accuracy (ensemble): {ensemble_acc}\n") | |||
| user_model_score_mat = [] | |||
| pruning_score_mat = [] | |||
| single_score_mat = [] | |||
| for n_label, repeated in zip(self.n_labeled_list, self.repeated_list): | |||
| user_model_score_list, reuse_pruning_score_list = [], [] | |||
| if n_label > len(train_x): | |||
| n_label = len(train_x) | |||
| for _ in range(repeated): | |||
| x_train, y_train = zip(*random.sample(list(zip(train_x, train_y)), k=n_label)) | |||
| x_train = np.array(list(x_train)) | |||
| y_train = np.array(list(y_train)) | |||
| x_train = torch.from_numpy(x_train) | |||
| y_train = torch.from_numpy(y_train) | |||
| sampled_dataset = TensorDataset(x_train, y_train) | |||
| mode_save_path = os.path.abspath(os.path.join(self.model_path, "model.pth")) | |||
| model = ConvModel( | |||
| channel=x_train.shape[1], im_size=(x_train.shape[2], x_train.shape[3]), n_random_features=10 | |||
| ).to(device) | |||
| train_model( | |||
| model, | |||
| sampled_dataset, | |||
| sampled_dataset, | |||
| mode_save_path, | |||
| epochs=35, | |||
| batch_size=128, | |||
| device=device, | |||
| verbose=False, | |||
| ) | |||
| model.load_state_dict(torch.load(mode_save_path)) | |||
| _, user_model_acc = evaluate(model, test_dataset, distribution=True) | |||
| user_model_score_list.append(user_model_acc) | |||
| reuse_pruning = EnsemblePruningReuser(learnware_list=mixture_learnware_list, mode="classification") | |||
| reuse_pruning.fit(x_train, y_train) | |||
| _, pruning_acc = evaluate(reuse_pruning, test_dataset, distribution=False) | |||
| reuse_pruning_score_list.append(pruning_acc) | |||
| single_score_mat.append([best_acc] * repeated) | |||
| user_model_score_mat.append(user_model_score_list) | |||
| pruning_score_mat.append(reuse_pruning_score_list) | |||
| acc_list = [] | |||
| for idx in range(len(all_learnwares)): | |||
| learnware = all_learnwares[idx] | |||
| loss, acc = evaluate(learnware, test_dataset) | |||
| acc_list.append(acc) | |||
| learnware = single_result[0].learnware | |||
| best_loss, best_acc = evaluate(learnware, test_dataset) | |||
| best_list.append(np.max(acc_list)) | |||
| select_list.append(best_acc) | |||
| avg_list.append(np.mean(acc_list)) | |||
| improve_list.append((best_acc - np.mean(acc_list)) / np.mean(acc_list)) | |||
| print(f"market mean accuracy: {np.mean(acc_list)}, market best accuracy: {np.max(acc_list)}") | |||
| print( | |||
| f"user_label_num: {n_label}, user_acc: {np.mean(user_model_score_mat[-1])}, pruning_acc: {np.mean(pruning_score_mat[-1])}" | |||
| f"Top1-score: {single_result[0].score}, learnware_id: {single_result[0].learnware.id}, acc: {best_acc}" | |||
| ) | |||
| logger.info(f"Saving Curves for User_{i}") | |||
| user_curves_data = (single_score_mat, user_model_score_mat, pruning_score_mat) | |||
| with open(os.path.join(self.curve_path, f"curve{str(i)}.pkl"), "wb") as f: | |||
| pickle.dump(user_curves_data, f) | |||
| logger.info( | |||
| "Accuracy of selected learnware: %.3f +/- %.3f, Average performance: %.3f +/- %.3f, Best performance: %.3f +/- %.3f" | |||
| % ( | |||
| np.mean(select_list), | |||
| np.std(select_list), | |||
| np.mean(avg_list), | |||
| np.std(avg_list), | |||
| np.mean(best_list), | |||
| np.std(best_list), | |||
| if len(multiple_result) > 0: | |||
| mixture_id = " ".join([learnware.id for learnware in multiple_result[0].learnwares]) | |||
| print(f"mixture_score: {multiple_result[0].score}, mixture_learnware: {mixture_id}") | |||
| mixture_learnware_list = multiple_result[0].learnwares | |||
| else: | |||
| mixture_learnware_list = [single_result[0].learnware] | |||
| # test reuse (job selector) | |||
| reuse_job_selector = JobSelectorReuser(learnware_list=mixture_learnware_list, use_herding=False) | |||
| job_loss, job_acc = evaluate(reuse_job_selector, test_dataset) | |||
| job_selector_score_list.append(job_acc) | |||
| print(f"mixture reuse accuracy (job selector): {job_acc}") | |||
| # test reuse (ensemble) | |||
| reuse_ensemble = AveragingReuser(learnware_list=mixture_learnware_list, mode="vote_by_prob") | |||
| ensemble_loss, ensemble_acc = evaluate(reuse_ensemble, test_dataset) | |||
| ensemble_score_list.append(ensemble_acc) | |||
| print(f"mixture reuse accuracy (ensemble): {ensemble_acc}\n") | |||
| user_model_score_mat = [] | |||
| pruning_score_mat = [] | |||
| single_score_mat = [] | |||
| for n_label, repeated in zip(self.n_labeled_list, self.repeated_list): | |||
| user_model_score_list, reuse_pruning_score_list = [], [] | |||
| if n_label > len(train_x): | |||
| n_label = len(train_x) | |||
| for _ in range(repeated): | |||
| x_train, y_train = zip(*random.sample(list(zip(train_x, train_y)), k=n_label)) | |||
| x_train = np.array(list(x_train)) | |||
| y_train = np.array(list(y_train)) | |||
| x_train = torch.from_numpy(x_train) | |||
| y_train = torch.from_numpy(y_train) | |||
| sampled_dataset = TensorDataset(x_train, y_train) | |||
| mode_save_path = os.path.abspath(os.path.join(self.model_path, "model.pth")) | |||
| model = ConvModel( | |||
| channel=x_train.shape[1], im_size=(x_train.shape[2], x_train.shape[3]), n_random_features=10 | |||
| ).to(device) | |||
| train_model( | |||
| model, | |||
| sampled_dataset, | |||
| sampled_dataset, | |||
| mode_save_path, | |||
| epochs=35, | |||
| batch_size=128, | |||
| device=device, | |||
| verbose=False, | |||
| ) | |||
| model.load_state_dict(torch.load(mode_save_path)) | |||
| _, user_model_acc = evaluate(model, test_dataset, distribution=True) | |||
| user_model_score_list.append(user_model_acc) | |||
| reuse_pruning = EnsemblePruningReuser( | |||
| learnware_list=mixture_learnware_list, mode="classification" | |||
| ) | |||
| reuse_pruning.fit(x_train, y_train) | |||
| _, pruning_acc = evaluate(reuse_pruning, test_dataset, distribution=False) | |||
| reuse_pruning_score_list.append(pruning_acc) | |||
| single_score_mat.append([best_acc] * repeated) | |||
| user_model_score_mat.append(user_model_score_list) | |||
| pruning_score_mat.append(reuse_pruning_score_list) | |||
| print( | |||
| f"user_label_num: {n_label}, user_acc: {np.mean(user_model_score_mat[-1])}, pruning_acc: {np.mean(pruning_score_mat[-1])}" | |||
| ) | |||
| logger.info(f"Saving Curves for User_{i}") | |||
| user_curves_data = (single_score_mat, user_model_score_mat, pruning_score_mat) | |||
| with open(os.path.join(self.curve_path, f"curve{str(i)}.pkl"), "wb") as f: | |||
| pickle.dump(user_curves_data, f) | |||
| logger.info( | |||
| "Accuracy of selected learnware: %.3f +/- %.3f, Average performance: %.3f +/- %.3f, Best performance: %.3f +/- %.3f" | |||
| % ( | |||
| np.mean(select_list), | |||
| np.std(select_list), | |||
| np.mean(avg_list), | |||
| np.std(avg_list), | |||
| np.mean(best_list), | |||
| np.std(best_list), | |||
| ) | |||
| ) | |||
| logger.info("Average performance improvement: %.3f" % (np.mean(improve_list))) | |||
| logger.info( | |||
| "Average Job Selector Reuse Performance: %.3f +/- %.3f" | |||
| % (np.mean(job_selector_score_list), np.std(job_selector_score_list)) | |||
| ) | |||
| logger.info( | |||
| "Averaging Ensemble Reuse Performance: %.3f +/- %.3f" | |||
| % (np.mean(ensemble_score_list), np.std(ensemble_score_list)) | |||
| ) | |||
| ) | |||
| logger.info("Average performance improvement: %.3f" % (np.mean(improve_list))) | |||
| logger.info( | |||
| "Average Job Selector Reuse Performance: %.3f +/- %.3f" | |||
| % (np.mean(job_selector_score_list), np.std(job_selector_score_list)) | |||
| ) | |||
| logger.info( | |||
| "Averaging Ensemble Reuse Performance: %.3f +/- %.3f" | |||
| % (np.mean(ensemble_score_list), np.std(ensemble_score_list)) | |||
| ) | |||
| pruning_curves_data, user_model_curves_data = [], [] | |||
| total_user_model_score_mat = [np.zeros(self.repeated_list[i]) for i in range(len(self.n_labeled_list))] | |||
| total_pruning_score_mat = [np.zeros(self.repeated_list[i]) for i in range(len(self.n_labeled_list))] | |||
| for user_idx in range(self.image_benchmark.user_num): | |||
| for user_idx in range(image_benchmark_config.user_num): | |||
| with open(os.path.join(self.curve_path, f"curve{str(user_idx)}.pkl"), "rb") as f: | |||
| user_curves_data = pickle.load(f) | |||
| (single_score_mat, user_model_score_mat, pruning_score_mat) = user_curves_data | |||
| @@ -244,8 +250,8 @@ class ImageDatasetWorkflow: | |||
| total_pruning_score_mat[i] += 1 - np.array(pruning_score_mat[i]) / 100 | |||
| for i in range(len(self.n_labeled_list)): | |||
| total_user_model_score_mat[i] /= self.image_benchmark.user_num | |||
| total_pruning_score_mat[i] /= self.image_benchmark.user_num | |||
| total_user_model_score_mat[i] /= image_benchmark_config.user_num | |||
| total_pruning_score_mat[i] /= image_benchmark_config.user_num | |||
| user_model_curves_data.append( | |||
| (np.mean(total_user_model_score_mat[i]), np.std(total_user_model_score_mat[i])) | |||
| ) | |||
| @@ -1,8 +0,0 @@ | |||
# Learnware descriptor: which class to instantiate as the model and which
# statistical specification file accompanies it.
model:
  class_name: Model  # model class exported by this learnware package
  kwargs: {}
stat_specifications:
  # RKME specification over tabular data, serialized in rkme.json.
  - module_path: learnware.specification
    class_name: RKMETableSpecification
    file_name: rkme.json
    kwargs: {}
| @@ -1,21 +0,0 @@ | |||
| import os | |||
| import joblib | |||
| import numpy as np | |||
| import lightgbm as lgb | |||
| from learnware.model import BaseModel | |||
class Model(BaseModel):
    """Learnware wrapper around a pre-trained LightGBM booster.

    The booster is loaded from ``model.out`` next to this file.  ``fit`` and
    ``finetune`` are deliberate no-ops because the model ships pre-trained.
    """

    def __init__(self):
        # Fixed tabular interface: 82 input features, one regression output.
        # Modernized super() call (Python 3 style).
        super().__init__(input_shape=(82,), output_shape=(1,))
        dir_path = os.path.dirname(os.path.abspath(__file__))
        self.model = lgb.Booster(model_file=os.path.join(dir_path, "model.out"))

    def fit(self, X: np.ndarray, y: np.ndarray):
        """No-op: the shipped booster is already trained."""
        pass

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return booster predictions for the feature matrix ``X``."""
        return self.model.predict(X)

    def finetune(self, X: np.ndarray, y: np.ndarray):
        """No-op: fine-tuning is not supported for this learnware."""
        pass
| @@ -1,3 +0,0 @@ | |||
| # M5 Dataset | |||
The M5 dataset comes from Walmart stores and covers the unit sales of various products sold in the USA, organized as grouped time series. More specifically, it involves the unit sales of 3049 products, classified into 3 product categories (Hobbies, Foods, and Household).
| @@ -1,65 +0,0 @@ | |||
| from cgi import test | |||
| import os | |||
| import joblib | |||
| import lightgbm as lgb | |||
| from .config import store_list, model_dir | |||
| from .utils import acquire_data, get_weights, model_predict, score, measure_aux_algo | |||
| from .generate_data import regenerate_data | |||
| from .train import retrain_models, grid_training_sample, train_adaptation_grid | |||
class DataLoader:
    """Facade over the M5 per-store data, trained models and training utilities.

    Each "user" corresponds to one Walmart store in ``store_list``; the active
    algorithm ("lgb" or "ridge") selects which trained model family is used.
    """

    def __init__(self):
        # Default to the ridge models; callers may switch via set_algo().
        self.algo = "ridge"

    def set_algo(self, algo):
        """Select the model family ("lgb" or "ridge") used by later calls."""
        self.algo = algo

    def get_algo_list(self):
        """Return the supported algorithm identifiers."""
        return ["lgb", "ridge"]

    def get_idx_list(self):
        """Return the valid store indices (one per entry in store_list)."""
        return list(range(len(store_list)))

    def get_idx_data(self, idx):
        """Load the (train_x, train_y, val_x, val_y) split for store ``idx``.

        NOTE(review): missing values are always filled (fill_flag=True), even
        for lgb which could handle NaNs natively — confirm this is intended.
        """
        store = store_list[idx]
        # fill_flag = self.algo == "ridge"
        fill_flag = True
        return acquire_data(store, fill_flag)

    def get_weights(self):
        """Return the per-sample evaluation weights for the active algorithm."""
        return get_weights(self.algo)

    def get_model_path(self, idx):
        """Return the on-disk path of the trained model for store ``idx``."""
        return os.path.join(model_dir, "{}_{}.out".format(self.algo, store_list[idx]))

    def predict(self, idx, test_x):
        """Predict with store ``idx``'s model, retraining all models first if absent."""
        store = store_list[idx]
        if os.path.exists(os.path.join(model_dir, f"{self.algo}_{store}.out")):
            return model_predict(self.algo, idx, test_x)
        else:
            self.retrain_models()
            return model_predict(self.algo, idx, test_x)

    def score(self, real_y, pred_y, sample_weight=None, multioutput="raw_values"):
        """Return the (weighted) error metric between ``real_y`` and ``pred_y``."""
        return score(real_y, pred_y, sample_weight, multioutput)

    def regenerate_data(self):
        """Rebuild the processed M5 dataset from the raw CSV files."""
        regenerate_data()

    def retrain_models(self):
        """Retrain the models of the active algorithm for every store."""
        retrain_models(self.algo)

    def grid_training_sample(self, user_list=None):
        """Sweep training-set sizes for the given stores (default: all ten).

        ``None`` replaces the original mutable ``list(range(10))`` default so
        one list object is not shared across calls.
        """
        if user_list is None:
            user_list = list(range(10))
        grid_training_sample(self.algo, user_list)

    def train_adaptation_grid(
        self, max_sample, test_sample, user_list=None, adaptation_model=None, residual=False
    ):
        """Run the model-adaptation grid search for the given stores.

        ``user_list``/``adaptation_model`` default to ``None`` instead of the
        original mutable defaults; behavior is unchanged for all callers.
        """
        if user_list is None:
            user_list = list(range(10))
        if adaptation_model is None:
            adaptation_model = []
        train_adaptation_grid(self.algo, max_sample, test_sample, user_list, adaptation_model, residual)

    def measure_aux_algo(self, idx, test_sample, model):
        """Evaluate auxiliary model(s) on store ``idx``'s last ``test_sample`` points."""
        return measure_aux_algo(idx, test_sample, model)
| @@ -1,139 +0,0 @@ | |||
# ---------------------------------------------------------------------------
# Static configuration for the M5 forecasting experiments: data directories,
# train/validation split, feature lists and per-store model hyper-parameters.
# ---------------------------------------------------------------------------
import os

# All data lives under <package_dir>/data/{raw,processed,models,grid_sample}.
ROOT_PATH = os.path.abspath(os.path.join(__file__, "..", "data"))
raw_data_dir = os.path.join(ROOT_PATH, "raw")  # original Kaggle CSV files
processed_data_dir = os.path.join(ROOT_PATH, "processed")  # cached feature pickles
model_dir = os.path.join(ROOT_PATH, "models")  # trained model files
grid_dir = os.path.join(ROOT_PATH, "grid_sample")  # grid-search result dumps
# Prediction target column.
TARGET = "sales"
# Day-number range used for training; the final 28 days are held out.
START_TRAIN = 1
END_TRAIN = 1941 - 28
# Columns cast to pandas "category" dtype before training.
category_list = ["item_id", "dept_id", "cat_id", "event_name_1", "event_name_2", "event_type_1", "event_type_2"]
# Model input features: identifiers, price statistics, calendar fields,
# 28-day-shifted sales lags, rolling statistics and mean-encoding columns.
features_columns = [
    "item_id",
    "dept_id",
    "cat_id",
    "release",
    # price features
    "sell_price",
    "price_max",
    "price_min",
    "price_std",
    "price_mean",
    "price_norm",
    "price_nunique",
    "item_nunique",
    "price_momentum",
    "price_momentum_m",
    "price_momentum_y",
    # calendar / event features
    "event_name_1",
    "event_type_1",
    "event_name_2",
    "event_type_2",
    "snap",
    "tm_d",
    "tm_w",
    "tm_m",
    "tm_y",
    "tm_wm",
    "tm_dw",
    "tm_w_end",
    # lagged sales (shifted by the 28-day forecast horizon)
    "sales_lag_28",
    "sales_lag_29",
    "sales_lag_30",
    "sales_lag_31",
    "sales_lag_32",
    "sales_lag_33",
    "sales_lag_34",
    "sales_lag_35",
    "sales_lag_36",
    "sales_lag_37",
    "sales_lag_38",
    "sales_lag_39",
    "sales_lag_40",
    "sales_lag_41",
    "sales_lag_42",
    # rolling statistics over several windows
    "rolling_mean_7",
    "rolling_std_7",
    "rolling_mean_14",
    "rolling_std_14",
    "rolling_mean_30",
    "rolling_std_30",
    "rolling_mean_60",
    "rolling_std_60",
    "rolling_mean_180",
    "rolling_std_180",
    # sliding-window rolling means at extra offsets
    "rolling_mean_tmp_1_7",
    "rolling_mean_tmp_1_14",
    "rolling_mean_tmp_1_30",
    "rolling_mean_tmp_1_60",
    "rolling_mean_tmp_7_7",
    "rolling_mean_tmp_7_14",
    "rolling_mean_tmp_7_30",
    "rolling_mean_tmp_7_60",
    "rolling_mean_tmp_14_7",
    "rolling_mean_tmp_14_14",
    "rolling_mean_tmp_14_30",
    "rolling_mean_tmp_14_60",
    # mean-encoding features (state/store level encodings disabled)
    # "enc_state_id_mean",
    # "enc_state_id_std",
    # "enc_store_id_mean",
    # "enc_store_id_std",
    "enc_cat_id_mean",
    "enc_cat_id_std",
    "enc_dept_id_mean",
    "enc_dept_id_std",
    "enc_state_id_cat_id_mean",
    "enc_state_id_cat_id_std",
    "enc_state_id_dept_id_mean",
    "enc_state_id_dept_id_std",
    "enc_store_id_cat_id_mean",
    "enc_store_id_cat_id_std",
    "enc_store_id_dept_id_mean",
    "enc_store_id_dept_id_std",
    "enc_item_id_mean",
    "enc_item_id_std",
    "enc_item_id_state_id_mean",
    "enc_item_id_state_id_std",
    "enc_item_id_store_id_mean",
    "enc_item_id_store_id_std",
]
# Regression label.
label_column = ["sales"]
# Tuned LightGBM hyper-parameters per store: [learning_rate, num_leaves,
# max_depth], aligned index-by-index with store_list below.
lgb_params_list = [
    [0.015, 224, 66],
    [0.01, 224, 50],
    [0.01, 300, 80],
    [0.015, 128, 50],
    [0.015, 300, 50],
    [0.01, 300, 66],
    [0.015, 300, 80],
    [0.15, 224, 80],  # NOTE(review): 0.15 is 10x the other learning rates — confirm it is not a typo for 0.015
    [0.005, 300, 50],
    [0.015, 224, 50],
]
# The ten Walmart stores (across 3 states) that act as the benchmark "users".
store_list = ["CA_1", "CA_2", "CA_3", "CA_4", "TX_1", "TX_2", "TX_3", "WI_1", "WI_2", "WI_3"]
# Human-readable dataset metadata; "location" entries align with store_list.
dataset_info = {
    "name": "M5",
    "range of date": "2011.01.29-2016.06.19",
    "description": "Walmart store, involves the unit sales of various products sold in the USA, organized in the form of grouped time series. More specifically, the dataset involves the unit sales of 3049 products, classified in 3 product categories (Hobbies, Foods, and Household).",
    "location": [
        "California, United States",
        "California, United States",
        "California, United States",
        "California, United States",
        "Texas, United States",
        "Texas, United States",
        "Texas, United States",
        "Wisconsin, United States",
        "Wisconsin, United States",
        "Wisconsin, United States",
    ],
}
| @@ -1,338 +0,0 @@ | |||
| import numpy as np | |||
| import pandas as pd | |||
| from math import ceil | |||
| from tqdm import tqdm | |||
| from copy import deepcopy as dco | |||
| import os, sys, gc, time, warnings, pickle, psutil, random | |||
| from sklearn.preprocessing import LabelEncoder | |||
| from sklearn.preprocessing import MinMaxScaler | |||
| from .utils import * | |||
| from .config import raw_data_dir, processed_data_dir, TARGET | |||
| warnings.filterwarnings("ignore") | |||
| # ==================== preprocessing ==================== | |||
def melt_raw_data(train_df):
    """Melt the wide per-day sales frame into long (id, d, sales) rows.

    The result is cached as ``melt_raw_data.pkl``; when that pickle exists it
    is loaded and returned instead of being recomputed.
    """
    cache_file = os.path.join(processed_data_dir, "melt_raw_data.pkl")
    if os.path.exists(cache_file):
        return pd.read_pickle(cache_file)
    index_columns = ["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"]
    grid_df = pd.melt(train_df, id_vars=index_columns, var_name="d", value_name=TARGET)
    # The identifier columns are low-cardinality strings; pandas categoricals
    # cut their memory footprint substantially.
    grid_df[index_columns] = grid_df[index_columns].astype("category")
    grid_df.to_pickle(cache_file)
    return grid_df
def add_release_week(grid_df, prices_df, calendar_df):
    """Attach each item's release week, drop pre-release rows, scale release.

    Cached as ``add_release_week.pkl``; loaded directly on later calls.
    """
    cache_file = os.path.join(processed_data_dir, "add_release_week.pkl")
    if os.path.exists(cache_file):
        return pd.read_pickle(cache_file)
    # The first week a (store, item) pair appears in the price table is taken
    # as its release week.
    release_df = prices_df.groupby(["store_id", "item_id"])["wm_yr_wk"].agg(["min"]).reset_index()
    release_df.columns = ["store_id", "item_id", "release"]
    grid_df = merge_by_concat(grid_df, release_df, ["store_id", "item_id"])
    grid_df = merge_by_concat(grid_df, calendar_df[["wm_yr_wk", "d"]], ["d"])
    # Cut off meaningless rows: there are no sales before an item's release.
    grid_df = grid_df[grid_df["wm_yr_wk"] >= grid_df["release"]].reset_index(drop=True)
    # Rebase the release week to start at zero so it fits in an int16.
    grid_df["release"] = (grid_df["release"] - grid_df["release"].min()).astype(np.int16)
    grid_df.to_pickle(cache_file)
    return grid_df
def add_prices(grid_df, prices_df, calendar_df):
    """Merge per-(store, item, week) price features onto the melted grid.

    Adds aggregate price statistics, normalized price, uniqueness counts and
    price-momentum features, then left-joins them onto ``grid_df``.  The
    result is cached as ``add_prices.pkl`` and reused on later calls.
    (Also removes the unused ``original_columns`` local of the old version.)
    """
    if os.path.exists(os.path.join(processed_data_dir, "add_prices.pkl")):
        return pd.read_pickle(os.path.join(processed_data_dir, "add_prices.pkl"))
    # Aggregate price statistics per (store, item).
    prices_df["price_max"] = prices_df.groupby(["store_id", "item_id"])["sell_price"].transform("max")
    prices_df["price_min"] = prices_df.groupby(["store_id", "item_id"])["sell_price"].transform("min")
    prices_df["price_std"] = prices_df.groupby(["store_id", "item_id"])["sell_price"].transform("std")
    prices_df["price_mean"] = prices_df.groupby(["store_id", "item_id"])["sell_price"].transform("mean")
    prices_df["price_norm"] = prices_df["sell_price"] / prices_df["price_max"]
    prices_df["price_nunique"] = prices_df.groupby(["store_id", "item_id"])["sell_price"].transform("nunique")
    prices_df["item_nunique"] = prices_df.groupby(["store_id", "sell_price"])["item_id"].transform("nunique")
    # Bring month/year in from the calendar so momentum can be computed
    # against same-month / same-year average prices.
    calendar_prices = calendar_df[["wm_yr_wk", "month", "year"]]
    calendar_prices = calendar_prices.drop_duplicates(subset=["wm_yr_wk"])
    prices_df = prices_df.merge(calendar_prices[["wm_yr_wk", "month", "year"]], on=["wm_yr_wk"], how="left")
    # Week-over-week, within-month and within-year price momentum.
    prices_df["price_momentum"] = prices_df["sell_price"] / prices_df.groupby(["store_id", "item_id"])[
        "sell_price"
    ].transform(lambda x: x.shift(1))
    prices_df["price_momentum_m"] = prices_df["sell_price"] / prices_df.groupby(["store_id", "item_id", "month"])[
        "sell_price"
    ].transform("mean")
    prices_df["price_momentum_y"] = prices_df["sell_price"] / prices_df.groupby(["store_id", "item_id", "year"])[
        "sell_price"
    ].transform("mean")
    grid_df = reduce_mem_usage(grid_df)
    prices_df = reduce_mem_usage(prices_df)
    grid_df = grid_df.merge(prices_df, on=["store_id", "item_id", "wm_yr_wk"], how="left")
    grid_df = reduce_mem_usage(grid_df)
    grid_df.to_pickle(os.path.join(processed_data_dir, "add_prices.pkl"))
    return grid_df
def add_date(grid_df, calendar_df):
    """Merge calendar/event columns and derive date-based features.

    Cached as ``add_date.pkl``.  Uses ``dt.isocalendar().week`` and
    ``drop(columns=...)`` instead of ``Series.dt.week`` and the positional
    ``axis`` argument, both of which were removed in pandas 2.0; the computed
    values are identical.
    """
    if os.path.exists(os.path.join(processed_data_dir, "add_date.pkl")):
        return pd.read_pickle(os.path.join(processed_data_dir, "add_date.pkl"))
    # merge calendar partly
    icols = [
        "date",
        "d",
        "event_name_1",
        "event_type_1",
        "event_name_2",
        "event_type_2",
        "snap_CA",
        "snap_TX",
        "snap_WI",
    ]
    grid_df = grid_df.merge(calendar_df[icols], on=["d"], how="left")
    # convert event/snap columns to category
    icols = [
        "event_name_1",
        "event_type_1",
        "event_name_2",
        "event_type_2",
        "snap_CA",
        "snap_TX",
        "snap_WI",
    ]
    for col in icols:
        grid_df[col] = grid_df[col].astype("category")
    # make some features from date
    grid_df["date"] = pd.to_datetime(grid_df["date"])
    grid_df["tm_d"] = grid_df["date"].dt.day.astype(np.int8)
    # ISO week number (replacement for the removed Series.dt.week accessor).
    grid_df["tm_w"] = grid_df["date"].dt.isocalendar().week.astype(np.int8)
    grid_df["tm_m"] = grid_df["date"].dt.month.astype(np.int8)
    grid_df["tm_y"] = grid_df["date"].dt.year
    grid_df["tm_y"] = (grid_df["tm_y"] - grid_df["tm_y"].min()).astype(np.int8)
    grid_df["tm_wm"] = grid_df["tm_d"].apply(lambda x: ceil(x / 7)).astype(np.int8)  # week of month
    grid_df["tm_dw"] = grid_df["date"].dt.dayofweek.astype(np.int8)
    grid_df["tm_w_end"] = (grid_df["tm_dw"] >= 5).astype(np.int8)  # weekend flag
    # clear columns: keep "d" as a plain day number, drop the merge key
    grid_df["d"] = grid_df["d"].apply(lambda x: x[2:]).astype(np.int16)
    grid_df = grid_df.drop(columns="wm_yr_wk")
    grid_df.to_pickle(os.path.join(processed_data_dir, "add_date.pkl"))
    return grid_df
def add_lags_rollings(grid_df):
    """Add 28-day-shifted lag, rolling-statistic and sliding-window features.

    Every feature is shifted by at least SHIFT_DAY = 28 days so it only uses
    information available at forecast time (the 28-day horizon).  The result
    is cached as ``add_lags_rollings.pkl``.
    """
    if os.path.exists(os.path.join(processed_data_dir, "add_lags_rollings.pkl")):
        return pd.read_pickle(os.path.join(processed_data_dir, "add_lags_rollings.pkl"))
    # add lags: sales_lag_28 .. sales_lag_42, one column per shift, per "id"
    SHIFT_DAY = 28
    LAG_DAYS = [col for col in range(SHIFT_DAY, SHIFT_DAY + 15)]
    grid_df = grid_df.assign(
        **{
            "{}_lag_{}".format(col, l): grid_df.groupby(["id"])[col].transform(lambda x: x.shift(l))
            for l in LAG_DAYS
            for col in [TARGET]
        }
    )
    # Store lag columns as float16 to keep the frame small.
    for col in list(grid_df):
        if "lag" in col:
            grid_df[col] = grid_df[col].astype(np.float16)
    # add rollings: per-id mean/std over several windows, shifted by the horizon
    for i in [7, 14, 30, 60, 180]:
        grid_df["rolling_mean_" + str(i)] = (
            grid_df.groupby(["id"])[TARGET].transform(lambda x: x.shift(SHIFT_DAY).rolling(i).mean()).astype(np.float16)
        )
        grid_df["rolling_std_" + str(i)] = (
            grid_df.groupby(["id"])[TARGET].transform(lambda x: x.shift(SHIFT_DAY).rolling(i).std()).astype(np.float16)
        )
    # sliding window: rolling means at additional offsets past the base horizon
    for d_shift in [1, 7, 14]:
        for d_window in [7, 14, 30, 60]:
            col_name = "rolling_mean_tmp_" + str(d_shift) + "_" + str(d_window)
            grid_df[col_name] = (
                grid_df.groupby(["id"])[TARGET]
                .transform(lambda x: x.shift(SHIFT_DAY + d_shift).rolling(d_window).mean())
                .astype(np.float16)
            )
    grid_df.to_pickle(os.path.join(processed_data_dir, "add_lags_rollings.pkl"))
    return grid_df
def add_mean_enc(grid_df):
    """Add target (mean/std) encoding features over several id groupings.

    Validation-period sales are masked to NaN before encoding so the held-out
    28 days do not leak into the statistics; the original sales column is
    restored afterwards.  Cached as ``add_mean_enc.pkl``.
    """
    if os.path.exists(os.path.join(processed_data_dir, "add_mean_enc.pkl")):
        return pd.read_pickle(os.path.join(processed_data_dir, "add_mean_enc.pkl"))
    sales_df = dco(grid_df["sales"])
    # Use .loc instead of the original chained assignment
    # (grid_df["sales"][mask] = ...), which can silently write to a copy.
    grid_df.loc[grid_df["d"] > (1941 - 28), "sales"] = np.nan
    icols = [
        ["state_id"],
        ["store_id"],
        ["cat_id"],
        ["dept_id"],
        ["state_id", "cat_id"],
        ["state_id", "dept_id"],
        ["store_id", "cat_id"],
        ["store_id", "dept_id"],
        ["item_id"],
        ["item_id", "state_id"],
        ["item_id", "store_id"],
    ]
    for col in icols:
        col_name = "_" + "_".join(col) + "_"
        grid_df["enc" + col_name + "mean"] = grid_df.groupby(col)["sales"].transform("mean").astype(np.float16)
        grid_df["enc" + col_name + "std"] = grid_df.groupby(col)["sales"].transform("std").astype(np.float16)
    # Restore the untouched sales column (including the masked tail).
    grid_df["sales"] = sales_df
    grid_df.to_pickle(os.path.join(processed_data_dir, "add_mean_enc.pkl"))
    return grid_df
def add_snap(grid_df):
    """Collapse the per-state snap_CA/TX/WI flags into one "snap" column.

    Each row receives the SNAP flag of its own state.  The combined frame is
    cached as ``all_data_df.pkl``.  (Removed the unused ``mask_CA`` local:
    CA rows are covered by the default assignment below.)
    """
    if os.path.exists(os.path.join(processed_data_dir, "all_data_df.pkl")):
        return pd.read_pickle(os.path.join(processed_data_dir, "all_data_df.pkl"))
    mask_WI = grid_df["state_id"] == "WI"
    mask_TX = grid_df["state_id"] == "TX"
    # Default to the CA flag, then overwrite WI and TX rows with their own.
    grid_df["snap"] = grid_df["snap_CA"]
    grid_df.loc[mask_WI, "snap"] = grid_df["snap_WI"]
    grid_df.loc[mask_TX, "snap"] = grid_df["snap_TX"]
    grid_df.to_pickle(os.path.join(processed_data_dir, "all_data_df.pkl"))
    return grid_df
def preprocessing_m5():
    """Run the full M5 feature-engineering pipeline over the raw CSVs.

    Each stage caches its result as a pickle under ``processed_data_dir`` and
    is skipped on re-runs; the final combined frame is ``all_data_df.pkl``.
    """
    train_df = pd.read_csv(os.path.join(raw_data_dir, "sales_train_evaluation.csv"))
    prices_df = pd.read_csv(os.path.join(raw_data_dir, "sell_prices.csv"))
    calendar_df = pd.read_csv(os.path.join(raw_data_dir, "calendar.csv"))
    grid_df = melt_raw_data(train_df)
    print(f"df: ({grid_df.shape[0]}, {grid_df.shape[1]}) Melting raw data down!")
    grid_df = add_release_week(grid_df, prices_df, calendar_df)
    print(f"df: ({grid_df.shape[0]}, {grid_df.shape[1]}) Adding release week down!")
    grid_df = add_prices(grid_df, prices_df, calendar_df)
    print(f"df: ({grid_df.shape[0]}, {grid_df.shape[1]}) Adding prices down!")
    grid_df = add_date(grid_df, calendar_df)
    print(f"df: ({grid_df.shape[0]}, {grid_df.shape[1]}) Adding date down!")
    grid_df = add_lags_rollings(grid_df)
    print(f"df: ({grid_df.shape[0]}, {grid_df.shape[1]}) Adding lags and rollings down!")
    grid_df = add_mean_enc(grid_df)
    print(f"df: ({grid_df.shape[0]}, {grid_df.shape[1]}) Adding mean encoding down!")
    # NOTE(review): this reload looks redundant — add_mean_enc just returned
    # the same frame it pickled.  Possibly kept to drop intermediate
    # references and free memory; confirm before removing.
    grid_df = pd.read_pickle(os.path.join(processed_data_dir, "add_mean_enc.pkl"))
    grid_df = add_snap(grid_df)
    print("Save the data down!")
| # ==================== split dataset ==================== | |||
def label_encode(df, columns):
    """Integer-encode ``columns`` in place using one shared LabelEncoder.

    Fitting a single encoder on the union of values across all the given
    columns guarantees that identical labels appearing in different columns
    (e.g. paired event columns) map to the same integer code.
    """
    encoder = LabelEncoder()
    values = []
    for column in columns:
        values.extend(df[column].drop_duplicates().values.tolist())
    encoder.fit(values)
    for column in columns:
        df[column] = encoder.transform(df[column].values.tolist())
    return df
def reorganize_data(grid_df):
    """Integer-encode the categorical columns and shrink dtypes.

    Paired event columns share one encoder so the same event name/type gets
    the same code in both columns.
    """
    grid_df["snap"] = grid_df["snap"].astype("int8")
    for group in (
        ["item_id"],
        ["dept_id"],
        ["cat_id"],
        ["event_name_1", "event_name_2"],
        ["event_type_1", "event_type_2"],
    ):
        grid_df[group] = label_encode(grid_df[group], group)
    return reduce_mem_usage(grid_df)
def split_data(df, store, fill_flag=False):
    """Write per-store train/validation pickles for ``store``.

    When ``fill_flag`` is set, missing values are forward/backward filled
    within each item (falling back to the column mean) and the features are
    min-max scaled — needed for linear models; filenames then carry a
    ``_fill`` suffix.  The last 28 days (d > END_TRAIN) form the validation
    set.
    """
    for cat in category_list:
        df[cat] = df[cat].astype("category")
    if fill_flag:
        df = reduce_mem_usage(df, float16_flag=False)
        # Columns that still contain NaNs after feature generation.
        cols = df.isnull().any()
        idx = list(cols[cols.values].index)
        # Fill within each item first, then fall back to the column mean.
        df[idx] = df.groupby("item_id", sort=False)[idx].apply(lambda x: x.ffill().bfill())
        df[idx] = df[idx].fillna(df[idx].mean())
        mms = MinMaxScaler()
        df[features_columns] = mms.fit_transform(df[features_columns])
        df = reduce_mem_usage(df)
    train_df = df[df["d"] <= END_TRAIN]
    val_df = df[df["d"] > END_TRAIN]
    train_df = train_df[features_columns + label_column]
    val_df = val_df[features_columns + label_column]
    print(train_df.shape, val_df.shape)
    suffix = "_fill" if fill_flag else ""  # plain string: the f-prefix was pointless (no placeholders)
    train_df.to_pickle(os.path.join(processed_data_dir, f"train_{store}{suffix}.pkl"))
    val_df.to_pickle(os.path.join(processed_data_dir, f"val_{store}{suffix}.pkl"))
def split_m5():
    """Split the fully-featured grid into per-store train/val pickles."""
    grid_df = pd.read_pickle(os.path.join(processed_data_dir, "all_data_df.pkl"))
    encode_cache = os.path.join(processed_data_dir, "label_encode.pkl")
    # Reuse the encoded frame when available; otherwise encode and cache it.
    if os.path.exists(encode_cache):
        grid_df = pd.read_pickle(encode_cache)
    else:
        grid_df = reorganize_data(grid_df)
        grid_df.to_pickle(encode_cache)
    for store in store_list:
        # split_data(grid_df[grid_df["store_id"] == store], store)
        split_data(grid_df[grid_df["store_id"] == store], store, True)
def regenerate_data():
    """Full rebuild: feature engineering over the raw CSVs, then the
    per-store train/validation split."""
    preprocessing_m5()
    split_m5()
| @@ -1,452 +0,0 @@ | |||
| import gc | |||
| import joblib | |||
| import random | |||
| import numpy as np | |||
| import pandas as pd | |||
| from tqdm import tqdm | |||
| import os, warnings | |||
| import lightgbm as lgb | |||
| from sklearn.svm import SVR | |||
| from sklearn.linear_model import Ridge | |||
| from sklearn.kernel_ridge import KernelRidge | |||
| from sklearn.metrics import mean_squared_error | |||
| from sklearn.metrics.pairwise import rbf_kernel | |||
| from .utils import * | |||
| from .config import model_dir, grid_dir, store_list, lgb_params_list | |||
| warnings.filterwarnings("ignore") | |||
def train_lgb_model(train_x, train_y, val_x, val_y, store, lr, nl, md, best, save=True, n_estimators=0, train_flag=0):
    """Train one LightGBM regressor and return the best validation RMSE so far.

    ``lr``/``nl``/``md`` are learning_rate, num_leaves and max_depth.
    ``best`` is the best RMSE from previous runs; the booster is saved to
    ``lgb_<store>.out`` only when this run beats it and ``save`` is true.
    A non-zero ``n_estimators`` trains a fixed number of rounds without early
    stopping; a truthy ``train_flag`` early-stops on the last 10% of the
    training data instead of (val_x, val_y).
    NOTE(review): ``verbose_eval``/``early_stopping_rounds`` keyword args were
    removed in LightGBM 4.x — confirm the pinned lightgbm version.
    """
    lgb_params = {
        "boosting_type": "gbdt",
        "objective": "rmse",
        "metric": "rmse",
        "learning_rate": lr,
        "num_leaves": nl,
        "max_depth": md,
        "n_estimators": 100000,  # effectively unbounded; early stopping decides
        "boost_from_average": False,
        "verbose": -1,
    }
    if train_flag:
        # Hold out the last 10% of the training data for early stopping.
        idx = int(len(train_y) * 0.1)
        train_data = lgb.Dataset(train_x[:-idx], label=train_y[:-idx])
        val_data = lgb.Dataset(train_x[-idx:], label=train_y[-idx:])
    else:
        train_data = lgb.Dataset(train_x, label=train_y)
        val_data = lgb.Dataset(val_x, label=val_y)
    if n_estimators:
        # Fixed-length training; no early stopping.
        lgb_params["n_estimators"] = n_estimators
        gbm = lgb.train(lgb_params, train_data, verbose_eval=100)
    else:
        gbm = lgb.train(lgb_params, train_data, valid_sets=[val_data], verbose_eval=100, early_stopping_rounds=1000)
    # Score on (val_x, val_y) regardless of which split early stopping used.
    test_y = gbm.predict(val_x, num_iteration=gbm.best_iteration)
    res = mean_squared_error(val_y, test_y, squared=False)
    if res < best:
        best = res
        if save:
            gbm.save_model(os.path.join(model_dir, f"lgb_{store}.out"))
    return best
def train_ridge_model(train_x, train_y, val_x, val_y, store, a, best, save=True):
    """Fit ridge regression with alpha ``a``; return the best RMSE seen so far.

    When this run's validation RMSE beats ``best`` the model is (optionally)
    saved as ``ridge_<store>.out``.
    """
    model = Ridge(alpha=a)
    model.fit(train_x, train_y)
    rmse = mean_squared_error(val_y, model.predict(val_x), squared=False)
    if rmse < best:
        best = rmse
        if save:
            joblib.dump(model, os.path.join(model_dir, f"ridge_{store}.out"))
    return best
def train_svm_model(
    train_x, train_y, val_x, val_y, store, C, epsilon, best, save=True, gamma=0.1, adaptation_model=None, K1=None, K2=None
):
    """Train an (auxiliary) SVR and return the best RMSE seen so far.

    With precomputed kernels ``K1``/``K2`` an AuxiliarySVR is used and
    ``adaptation_model`` describes the auxiliary models; otherwise a plain
    RBF SVR is trained.  The model is saved as ``svm_<store>.out`` only when
    it improves on ``best`` and ``save`` is set.

    ``adaptation_model`` defaults to ``None`` (treated as an empty list) to
    avoid the shared-mutable-default pitfall of the original ``[]`` default.
    """
    if adaptation_model is None:
        adaptation_model = []
    if K1 is None:
        model = SVR(C=C, epsilon=epsilon, max_iter=30000, cache_size=10240, verbose=True, gamma=gamma)
    else:
        model = AuxiliarySVR(
            C=C,
            epsilon=epsilon,
            gamma=gamma,
            adaptation_model=adaptation_model,
            max_iter=30000,
            cache_size=10240,
            verbose=True,
            K1=K1,
            K2=K2,
        )
    model.fit(train_x, train_y)
    test_y = model.predict(val_x)
    res = mean_squared_error(val_y, test_y, squared=False)
    if res < best:
        best = res
        if save:
            joblib.dump(model, os.path.join(model_dir, f"svm_{store}.out"))
    return best
def train_krr_model(train_x, train_y, val_x, val_y, store, a, best, save=True, gamma=0.1, K1=None, K2=None):
    """Train kernel ridge regression and return the best RMSE seen so far.

    Without ``K1``/``K2`` a standard RBF kernel is used.  With precomputed
    Gram matrices, ``K1`` is (train x train) and ``K2`` is (val x train);
    only their trailing rows/columns matching the current sample counts are
    used.  Saves ``krr_<store>.out`` when the run beats ``best`` and ``save``
    is set.
    """
    if K1 is None:
        model = KernelRidge(kernel="rbf", alpha=a, gamma=gamma)
        model.fit(train_x, train_y)
        test_y = model.predict(val_x)
        res = mean_squared_error(val_y, test_y, squared=False)
    else:
        # Slice the precomputed kernels down to the most recent samples.
        len1, len2 = len(train_y), len(val_y)
        model = KernelRidge(kernel="precomputed", alpha=a)
        model.fit(K1[-len1:, -len1:], train_y)
        test_y = model.predict(K2[-len2:, -len1:])
        res = mean_squared_error(val_y, test_y, squared=False)
    if res < best:
        best = res
        if save:
            joblib.dump(model, os.path.join(model_dir, f"krr_{store}.out"))
    return best
def grid_search(store_id, algo, search_lgb_flag=False):
    """Hyper-parameter search (or retrain with known params) for one store.

    For "lgb", either sweep the full grid (``search_lgb_flag``) or reuse the
    pre-tuned parameters from ``lgb_params_list``; for "ridge", sweep alpha.
    The best model per algorithm is saved by the train_* helpers.
    """
    store = store_list[store_id]
    best = 10000000
    if algo == "lgb":
        train_x, train_y, val_x, val_y = acquire_data(store, True)
        if search_lgb_flag:
            # Exhaustive sweep over the LightGBM grid.
            for lr in [0.005, 0.01, 0.015]:
                for nl in [128, 224, 300]:
                    for md in [50, 66, 80]:
                        best = train_lgb_model(train_x, train_y, val_x, val_y, store, lr, nl, md, best)
                        print(f"store: {store}, lr: {lr}, nl: {nl}, md: {md}, best: {best}")
        else:
            # Reuse the previously tuned parameters for this store.
            lr, nl, md = lgb_params_list[store_id]
            best = train_lgb_model(train_x, train_y, val_x, val_y, store, lr, nl, md, best)
            print(f"store: {store}, lr: {lr}, nl: {nl}, md: {md}, best: {best}")
    elif algo == "ridge":
        train_x, train_y, val_x, val_y = acquire_data(store, True)
        for a in [0.01, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10, 20, 30]:
            best = train_ridge_model(train_x, train_y, val_x, val_y, store, a, best)
            print(f"store: {store}, alpha: {a}, best: {best}")
def grid_training_sample(algo, user_list=None):
    """Measure validation error as a function of training-set size.

    For each store in ``user_list`` (default: all ten stores), models of the
    given ``algo`` are trained on growing suffixes (most recent samples) of
    the training data; the best score per size is checkpointed to
    ``grid_sample_<algo>_<store>.out`` after every size.

    ``user_list`` defaults to ``None`` instead of the original mutable
    ``list(range(10))`` default; behavior is unchanged for all callers.
    """
    if user_list is None:
        user_list = list(range(10))
    for i in range(len(user_list)):
        store_id = user_list[i]
        store = store_list[store_id]
        org_train_x, org_train_y, val_x, val_y = acquire_data(store, True)
        res = []
        proportion_list = [
            100,
            300,
            500,
            700,
            900,
            1000,
            3000,
            5000,
            7000,
            9000,
            10000,
            30000,
            50000,
            70000,
            90000,
            100000,
            300000,
            500000,
            700000,
            900000,
            1000000,
            3000000,
            5000000,
        ]
        for proportion in proportion_list:
            # Alternative (random subsampling) kept for reference:
            #   org_idx_list = list(range(len(org_train_y)))
            #   idx_list = random.sample(org_idx_list, min(proportion, len(org_train_y)))
            #   train_x = org_train_x.iloc[idx_list]
            #   train_y = org_train_y.iloc[idx_list]
            # Use the most recent ``proportion`` samples.
            train_x = org_train_x[-proportion:]
            train_y = org_train_y[-proportion:]
            best = 10000000
            if algo == "lgb":
                lr, nl, md = lgb_params_list[store_id]
                best = train_lgb_model(
                    train_x, train_y, val_x, val_y, store, lr, nl, md, best, save=False, n_estimators=3000, train_flag=0
                )
                print(f"store: {store}, lr: {lr}, nl: {nl}, md: {md}, best: {best}")
            elif algo == "ridge":
                alpha = [0.01, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10, 20, 30]
                for a in alpha:
                    best = train_ridge_model(train_x, train_y, val_x, val_y, store, a, best, save=False)
                    print(f"store: {store}, alpha: {a}, best: {best}")
            elif algo == "svm":
                C = [1, 10, 100]
                epsilon = 0.001
                for c in C:
                    best = train_svm_model(train_x, train_y, val_x, val_y, store, c, epsilon, best, save=False)
                    print(f"store: {store}, C: {c}, epsilon: {epsilon}, best: {best}")
            res.append([proportion, best])
            # Checkpoint after every size so partial sweeps are not lost.
            np.savetxt(os.path.join(grid_dir, f"grid_sample_{algo}_{store}.out"), np.array(res))
            # Once the suffix covers the whole training set, larger sizes
            # are identical — stop.
            if proportion > len(org_train_y):
                break
def retrain_models(algo):
    """Retrain (via grid_search) the ``algo`` model for each of the 10 stores."""
    for store_id in range(10):
        grid_search(store_id, algo)
| def train_adaptation_grid( | |||
| algo, max_sample, test_sample, user_list=list(range(10)), adaptation_model=[], residual=False | |||
| ): | |||
| """ | |||
| adaptation_model = [ | |||
| [("lgb", 1), ("ridge", 2)], | |||
| [("lgb", 1), ("ridge", 2)] | |||
| ] | |||
| """ | |||
| proportion_list = [ | |||
| 100, | |||
| 300, | |||
| 500, | |||
| 700, | |||
| 900, | |||
| 1000, | |||
| 3000, | |||
| 5000, | |||
| 7000, | |||
| 9000, | |||
| 10000, | |||
| 30000, | |||
| 50000, | |||
| 70000, | |||
| 90000, | |||
| 100000, | |||
| 300000, | |||
| 500000, | |||
| 700000, | |||
| 900000, | |||
| 1000000, | |||
| 3000000, | |||
| 5000000, | |||
| ] | |||
| sample_idx = proportion_list.index(max_sample) + 1 | |||
| for i in range(len(user_list)): | |||
| store_id = user_list[i] | |||
| store = store_list[store_id] | |||
| org_train_x, org_train_y, val_x, val_y = acquire_data(store, True) | |||
| val_x = val_x[-test_sample:] | |||
| val_y = val_y[-test_sample:] | |||
| if algo == "lgb" or algo == "ridge": | |||
| res = [] | |||
| if adaptation_model != []: | |||
| if residual: | |||
| aux_algo, model_idx = adaptation_model[i][0] | |||
| org_train_y -= model_predict(aux_algo, model_idx, org_train_x) | |||
| val_y -= model_predict(aux_algo, model_idx, val_x) | |||
| else: | |||
| train_y_list, val_y_list = [], [] | |||
| for aux_algo, model_idx in adaptation_model[i]: | |||
| train_y_list.append(model_predict(aux_algo, model_idx, org_train_x)) | |||
| val_y_list.append(model_predict(aux_algo, model_idx, val_x)) | |||
| for j in range(len(train_y_list)): | |||
| org_train_x[f"model_values_{j}"] = train_y_list[j] | |||
| val_x[f"model_values_{j}"] = val_y_list[j] | |||
| for proportion in proportion_list[:sample_idx]: | |||
| """ | |||
| random | |||
| org_idx_list = list(range(len(org_train_y))) | |||
| idx_list = random.sample(org_idx_list, min(proportion, len(org_train_y))) | |||
| train_x = org_train_x.iloc[idx_list] | |||
| train_y = org_train_y.iloc[idx_list] | |||
| """ | |||
| train_x = org_train_x[-proportion:] | |||
| train_y = org_train_y[-proportion:] | |||
| best = 10000000 | |||
| if algo == "lgb": | |||
| if max_sample < 50000: | |||
| learning_rate = [0.005, 0.01, 0.015] | |||
| num_leaves = [128, 224, 300] | |||
| max_depth = [50, 66, 80] | |||
| for lr in learning_rate: | |||
| for nl in num_leaves: | |||
| for md in max_depth: | |||
| best = train_lgb_model( | |||
| train_x, train_y, val_x, val_y, store, lr, nl, md, best, save=False | |||
| ) | |||
| print(f"store: {store}, lr: {lr}, nl: {nl}, md: {md}, best: {best}") | |||
| else: | |||
| lr, nl, md = lgb_params_list[store_id] | |||
| best = train_lgb_model( | |||
| train_x, | |||
| train_y, | |||
| val_x, | |||
| val_y, | |||
| store, | |||
| lr, | |||
| nl, | |||
| md, | |||
| best, | |||
| save=False, | |||
| n_estimators=3000, | |||
| train_flag=0, | |||
| ) | |||
| print(f"store: {store}, lr: {lr}, nl: {nl}, md: {md}, best: {best}") | |||
| elif algo == "ridge": | |||
| alpha = [0.01, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10, 20, 30] | |||
| for a in alpha: | |||
| best = train_ridge_model(train_x, train_y, val_x, val_y, store, a, best, save=False) | |||
| print(f"store: {store}, alpha: {a}, best: {best}") | |||
| res.append([proportion, best]) | |||
| text = str(adaptation_model[i]) if adaptation_model != [] else "null" | |||
| text += "_residual_" if residual else "" | |||
| np.savetxt(os.path.join(grid_dir, f"{algo}_using_{text}_{store}.out"), np.array(res)) | |||
| if proportion > len(org_train_y): | |||
| break | |||
| elif algo == "svm" or algo == "krr": | |||
| res = [[proportion, 10000] for proportion in proportion_list[:sample_idx]] | |||
| org_train_x = org_train_x.to_numpy() | |||
| org_train_y = org_train_y.to_numpy() | |||
| val_x = val_x.to_numpy() | |||
| val_y = val_y.to_numpy() | |||
| y1_list, y2_list = [], [] | |||
| gamma_list = [0.01, 0.1, 0.5, 1] | |||
| if residual: | |||
| aux_algo, model_idx = adaptation_model[i][0] | |||
| org_train_y = org_train_y.astype(np.float64) | |||
| val_y = val_y.astype(np.float64) | |||
| org_train_y -= model_predict(aux_algo, model_idx, org_train_x) | |||
| val_y -= model_predict(aux_algo, model_idx, val_x) | |||
| elif adaptation_model != []: | |||
| for aux_algo, idx in adaptation_model[i]: | |||
| y1_list.append(model_predict(aux_algo, idx, org_train_x[-max_sample:]).reshape(-1, 1)) | |||
| y2_list.append(model_predict(aux_algo, idx, val_x).reshape(-1, 1)) | |||
| for gamma in gamma_list: | |||
| K1 = np.zeros((max_sample, max_sample)) | |||
| K2 = np.zeros((len(val_x), max_sample)) | |||
| if (not residual) and adaptation_model != []: | |||
| for j in range(len(adaptation_model[i])): | |||
| aux_algo, idx = adaptation_model[i][j] | |||
| y1 = y1_list[j] | |||
| y2 = y2_list[j] | |||
| K1 += np.dot(y1, y1.T) | |||
| K2 += np.dot(y2, y1.T) | |||
| K1 += rbf_kernel(org_train_x[-max_sample:], org_train_x[-max_sample:], gamma=gamma) | |||
| K2 += rbf_kernel(val_x, org_train_x[-max_sample:], gamma=gamma) | |||
| for idx in range(len(proportion_list[:sample_idx])): | |||
| proportion = proportion_list[idx] | |||
| """ | |||
| random | |||
| org_idx_list = list(range(len(org_train_y))) | |||
| idx_list = random.sample(org_idx_list, min(proportion, len(org_train_y))) | |||
| train_x = org_train_x.iloc[idx_list] | |||
| train_y = org_train_y.iloc[idx_list] | |||
| """ | |||
| train_x = org_train_x[-proportion:] | |||
| train_y = org_train_y[-proportion:] | |||
| best = 10000000 | |||
| if algo == "svm": | |||
| C = [1, 10, 50, 100, 200] | |||
| epsilon = 0.001 | |||
| for c in C: | |||
| adapt_m = [] if adaptation_model == [] else adaptation_model[i] | |||
| best = train_svm_model( | |||
| train_x, | |||
| train_y, | |||
| val_x, | |||
| val_y, | |||
| store, | |||
| c, | |||
| epsilon, | |||
| best, | |||
| save=False, | |||
| gamma=gamma, | |||
| adaptation_model=adapt_m, | |||
| K1=K1, | |||
| K2=K2, | |||
| ) | |||
| print(f"store: {store}, gamma: {gamma}, C: {c}, epsilon: {epsilon}, best: {best}") | |||
| elif algo == "krr": | |||
| alpha = [0.01, 0.1, 0.5, 1.0, 5.0, 10] | |||
| for a in alpha: | |||
| best = train_krr_model( | |||
| train_x, train_y, val_x, val_y, store, a, best, save=False, gamma=gamma, K1=K1, K2=K2 | |||
| ) | |||
| print(f"store: {store}, a: {a}, gamma: {gamma}, best: {best}") | |||
| if best < res[idx][1]: | |||
| res[idx][1] = best | |||
| text = str(adaptation_model[i]) if adaptation_model != [] else "null" | |||
| text += "_residual" if residual else "" | |||
| np.savetxt(os.path.join(grid_dir, f"{algo}_using_{text}_{store}.out"), np.array(res)) | |||
| if proportion > len(org_train_y): | |||
| break | |||
| del train_x, train_y | |||
| gc.collect() | |||
| del K1, K2 | |||
| gc.collect() | |||
| del org_train_x, org_train_y | |||
| gc.collect() | |||
| @@ -1,177 +0,0 @@ | |||
| from math import gamma | |||
| from tkinter import Y | |||
| import joblib | |||
| from tqdm import tqdm | |||
| import numpy as np | |||
| import pandas as pd | |||
| import lightgbm as lgb | |||
| from sklearn.svm import SVR | |||
| from sklearn.metrics import mean_squared_error | |||
| from sklearn.metrics.pairwise import rbf_kernel | |||
| import os, sys, gc, time, warnings, pickle, psutil, random | |||
| import matplotlib.pyplot as plt | |||
| from mpl_toolkits.axes_grid1 import make_axes_locatable | |||
| from .config import * | |||
class AuxiliarySVR:
    """SVR with a custom kernel that augments an RBF kernel with rank-one
    terms built from auxiliary model predictions.

    The kernel is K(X1, X2) = sum_j f_j(X1) f_j(X2)^T + rbf(X1, X2), where
    the f_j are the auxiliary models listed in ``adaptation_model``.
    Precomputed kernel matrices may be passed as ``K1`` (train-vs-train) and
    ``K2`` (test-vs-train) to avoid re-evaluating the auxiliary models.
    """

    def __init__(
        self, C, epsilon, gamma, adaptation_model=None, max_iter=30000, cache_size=10240, verbose=False, K1=None, K2=None
    ):
        # Use None as default instead of a mutable [] literal: a shared list
        # default would be aliased across all instances created without the arg.
        self.adaptation_model = [] if adaptation_model is None else adaptation_model
        self.gamma = gamma
        self.model = SVR(
            C=C,
            epsilon=epsilon,
            kernel=self.auxiliary_rbf_kernel,
            max_iter=max_iter,
            cache_size=cache_size,
            verbose=verbose,
        )
        self.K1 = K1
        self.K2 = K2

    def auxiliary_rbf_kernel(self, X1, X2):
        """Return the combined kernel matrix for X1 vs X2.

        When precomputed matrices are available, slices of K1/K2 are returned;
        otherwise the kernel is assembled from the auxiliary model predictions
        plus a plain RBF kernel.
        """
        if self.K1 is not None:
            # NOTE(review): equal row counts are taken to mean "training kernel"
            # and unequal counts "prediction kernel" -- this heuristic breaks if
            # a prediction batch happens to have exactly as many rows as the
            # training set. Confirm against the call sites.
            if X1.shape[0] == X2.shape[0]:
                return self.K1[-X1.shape[0] :, -X2.shape[0] :]
            else:
                return self.K2[-X1.shape[0] :, -X2.shape[0] :]
        else:
            K = np.zeros((len(X1), len(X2)))
            for algo, idx in self.adaptation_model:
                Y1 = model_predict(algo, idx, X1).reshape(-1, 1)
                Y2 = model_predict(algo, idx, X2).reshape(-1, 1)
                K += Y1 @ Y2.T
            K += rbf_kernel(X1, X2, self.gamma)
            return K

    def fit(self, X, Y):
        # NOTE(review): this overwrites the gamma passed to __init__ with the
        # "1 / n_features" heuristic on every fit -- confirm this is intended.
        self.gamma = 1 / X.shape[1]
        self.model.fit(X, Y)

    def predict(self, X):
        return self.model.predict(X)
def measure_aux_algo(idx, test_sample, model):
    """Score an auxiliary model, e.g. model = ("lgb", 1), on the last
    `test_sample` validation rows of the store at position `idx`."""
    algo_name, model_idx = model
    store = store_list[idx]
    _, _, val_x, val_y = acquire_data(store, True)
    pred_y = model_predict(algo_name, model_idx, val_x[-test_sample:])
    return score(pred_y, val_y[-test_sample:])
| # Simple "Memory profilers" to see memory usage | |||
def get_memory_usage():
    """Return the current process's resident set size in GiB (2 decimals)."""
    rss_bytes = psutil.Process(os.getpid()).memory_info()[0]
    return np.round(rss_bytes / 2.0**30, 2)
def sizeof_fmt(num, suffix="B"):
    """Format a byte count as a human-readable string with binary prefixes."""
    for prefix in ("", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"):
        if abs(num) < 1024.0:
            return f"{num:3.1f}{prefix}{suffix}"
        num /= 1024.0
    # Anything past zebibytes falls through to the yobibyte prefix.
    return f"{num:.1f}Yi{suffix}"
| # Memory Reducer | |||
def reduce_mem_usage(df, float16_flag=True, verbose=True):
    """Downcast each numeric column of `df` to the smallest dtype whose range
    strictly contains the column's min/max, optionally reporting savings.

    float16_flag: allow floats to be downcast to float16 (precision-lossy).
    """
    numeric_dtypes = ["int16", "int32", "int64", "float16", "float32", "float64"]
    start_mem = df.memory_usage().sum() / 1024**2
    for column in df.columns:
        dtype = df[column].dtypes
        if dtype not in numeric_dtypes:
            continue
        lo = df[column].min()
        hi = df[column].max()
        if str(dtype).startswith("int"):
            # Pick the narrowest signed integer type covering [lo, hi].
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                info = np.iinfo(candidate)
                if lo > info.min and hi < info.max:
                    df[column] = df[column].astype(candidate)
                    break
        else:
            if float16_flag and lo > np.finfo(np.float16).min and hi < np.finfo(np.float16).max:
                df[column] = df[column].astype(np.float16)
            elif lo > np.finfo(np.float32).min and hi < np.finfo(np.float32).max:
                df[column] = df[column].astype(np.float32)
            else:
                df[column] = df[column].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print(
            "Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
            )
        )
    return df
| # Merging by concat to not lose dtypes | |||
def merge_by_concat(df1, df2, merge_on):
    """Left-join df2 onto df1 via a key-only merge plus concat, so that the
    dtypes of df1's existing columns are preserved."""
    lookup = df1[merge_on].merge(df2, on=merge_on, how="left")
    extra_columns = [col for col in list(lookup) if col not in merge_on]
    return pd.concat([df1, lookup[extra_columns]], axis=1)
def model_predict(algo, idx, test_x):
    """Load the persisted model for store `store_list[idx]` and predict on `test_x`.

    Supported algorithms: "lgb", "ridge", "svm". An unsupported value now
    raises ValueError instead of silently falling through and returning None,
    which previously surfaced as a confusing downstream error.
    """
    store = store_list[idx]
    if algo == "lgb":
        model = lgb.Booster(model_file=os.path.join(model_dir, f"lgb_{store}.out"))
        return model.predict(test_x, num_iteration=model.best_iteration)
    elif algo == "ridge":
        model = joblib.load(os.path.join(model_dir, f"ridge_{store}.out"))
        return model.predict(test_x)
    elif algo == "svm":
        model = joblib.load(os.path.join(model_dir, f"svm_{store}.out"))
        return model.predict(test_x)
    raise ValueError(f"Unsupported algo: {algo!r}; expected 'lgb', 'ridge' or 'svm'")
def get_weights(algo):
    """Collect per-store model weights: LightGBM feature importances for
    "lgb", otherwise ridge coefficients. Returns a 2-D numpy array."""
    if algo == "lgb":
        weights = [
            lgb.Booster(model_file=os.path.join(model_dir, f"lgb_{store}.out")).feature_importance()
            for store in store_list
        ]
    else:
        weights = [joblib.load(os.path.join(model_dir, f"ridge_{store}.out")).coef_ for store in store_list]
    return np.array(weights)
def score(real_y, pred_y, sample_weight=None, multioutput="uniform_average"):
    """Return the RMSE between `real_y` and `pred_y`.

    `sample_weight` and `multioutput` previously had no defaults, so the
    two-argument call in measure_aux_algo raised TypeError; the defaults
    added here mirror sklearn's mean_squared_error defaults, leaving all
    existing fully-specified calls unchanged.
    """
    return mean_squared_error(real_y, pred_y, sample_weight=sample_weight, multioutput=multioutput, squared=False)
def acquire_data(store, fill_flag=False):
    """Load the pickled train/val frames for `store` and return
    (train_x, train_y, val_x, val_y) as numpy arrays.

    fill_flag selects the "_fill" preprocessed variant of the pickles.
    """
    TARGET = "sales"
    suffix = "_fill" if fill_flag else ""
    train = pd.read_pickle(os.path.join(processed_data_dir, f"train_{store}{suffix}.pkl"))
    val = pd.read_pickle(os.path.join(processed_data_dir, f"val_{store}{suffix}.pkl"))
    train_y = train[TARGET].to_numpy()
    train_x = train.drop(columns=TARGET, axis=1).to_numpy()
    val_y = val[TARGET].to_numpy()
    val_x = val.drop(columns=TARGET, axis=1).to_numpy()
    return train_x, train_y, val_x, val_y
| @@ -1,211 +0,0 @@ | |||
| import os | |||
| import fire | |||
| import time | |||
| import zipfile | |||
| import numpy as np | |||
| from tqdm import tqdm | |||
| from shutil import copyfile, rmtree | |||
| import learnware | |||
| from learnware.market import instantiate_learnware_market, BaseUserInfo | |||
| from learnware.reuse import JobSelectorReuser, AveragingReuser | |||
| from learnware.specification import generate_rkme_table_spec | |||
| from m5 import DataLoader | |||
| from learnware.logger import get_module_logger | |||
| logger = get_module_logger("m5_test", level="INFO") | |||
# Statistical I/O descriptions embedded in each learnware's semantic spec.
output_description = {
    "Dimension": 1,
    "Description": {},
}
input_description = {
    "Dimension": 82,
    "Description": {},
}
# Template semantic specification for uploaded learnwares; Name and
# Description are overwritten per learnware before each add_learnware call.
semantic_specs = [
    {
        "Data": {"Values": ["Table"], "Type": "Class"},
        "Task": {"Values": ["Regression"], "Type": "Class"},
        "Library": {"Values": ["Scikit-learn"], "Type": "Class"},
        "Scenario": {"Values": ["Business"], "Type": "Tag"},
        "Description": {"Values": "", "Type": "String"},
        "Name": {"Values": "learnware_1", "Type": "String"},
        "Input": input_description,
        "Output": output_description,
        "License": {"Values": ["MIT"], "Type": "Class"},
    }
]
# Semantic specification describing the searching user's task.
user_semantic = {
    "Data": {"Values": ["Table"], "Type": "Class"},
    "Task": {"Values": ["Regression"], "Type": "Class"},
    "Library": {"Values": ["Scikit-learn"], "Type": "Class"},
    "Scenario": {"Values": ["Business"], "Type": "Tag"},
    "Description": {"Values": "", "Type": "String"},
    "Name": {"Values": "", "Type": "String"},
    "Input": input_description,
    "Output": output_description,
    "License": {"Values": ["MIT"], "Type": "Class"},
}
class M5DatasetWorkflow:
    """End-to-end M5 benchmark workflow: regenerate per-store data and models,
    package them as learnware zips, populate an "easy" market, then evaluate
    learnware search and reuse (job selector / averaging) on each store."""

    def _init_m5_dataset(self):
        """Regenerate the M5 per-store data and retrain ridge/lgb models."""
        m5 = DataLoader()
        m5.regenerate_data()
        algo_list = ["ridge", "lgb"]
        for algo in algo_list:
            m5.set_algo(algo)
            m5.retrain_models()

    def _init_learnware_market(self):
        """Initialize learnware market and upload every zip in ./learnware_pool."""
        # database_ops.clear_learnware_table()
        learnware.init()
        easy_market = instantiate_learnware_market(name="easy", rebuild=True)
        print("Total Item:", len(easy_market))
        zip_path_list = []
        curr_root = os.path.dirname(os.path.abspath(__file__))
        curr_root = os.path.join(curr_root, "learnware_pool")
        for zip_path in os.listdir(curr_root):
            zip_path_list.append(os.path.join(curr_root, zip_path))
        for idx, zip_path in enumerate(zip_path_list):
            # NOTE(review): this aliases and mutates the shared template dict on
            # every iteration instead of copying it -- safe only if
            # add_learnware deep-copies the spec. Confirm.
            semantic_spec = semantic_specs[0]
            semantic_spec["Name"]["Values"] = "learnware_%d" % (idx)
            semantic_spec["Description"]["Values"] = "test_learnware_number_%d" % (idx)
            easy_market.add_learnware(zip_path, semantic_spec)
        print("Total Item:", len(easy_market))

    def prepare_learnware(self, regenerate_flag=False):
        """Package one learnware zip per store (RKME spec + model + loader + yaml).

        regenerate_flag: rebuild the dataset and retrain models first.
        """
        if regenerate_flag:
            self._init_m5_dataset()
        m5 = DataLoader()
        idx_list = m5.get_idx_list()
        algo_list = ["lgb"]  # algo_list = ["ridge", "lgb"]
        curr_root = os.path.dirname(os.path.abspath(__file__))
        curr_root = os.path.join(curr_root, "learnware_pool")
        os.makedirs(curr_root, exist_ok=True)
        for idx in tqdm(idx_list):
            train_x, train_y, test_x, test_y = m5.get_idx_data(idx)
            st = time.time()
            # RKME spec is computed once per store and shared by all algos.
            spec = generate_rkme_table_spec(X=train_x, gamma=0.1, cuda_idx=0)
            ed = time.time()
            logger.info("Stat spec generated in %.3f s" % (ed - st))
            for algo in algo_list:
                m5.set_algo(algo)
                dir_path = os.path.join(curr_root, f"{algo}_{idx}")
                os.makedirs(dir_path, exist_ok=True)
                spec_path = os.path.join(dir_path, "rkme.json")
                spec.save(spec_path)
                model_path = m5.get_model_path(idx)
                model_file = os.path.join(dir_path, "model.out")
                copyfile(model_path, model_file)
                init_file = os.path.join(dir_path, "__init__.py")
                copyfile("example_init.py", init_file)
                yaml_file = os.path.join(dir_path, "learnware.yaml")
                copyfile("example.yaml", yaml_file)
                zip_file = dir_path + ".zip"
                # Store entries flat (file name only) and uncompressed.
                with zipfile.ZipFile(zip_file, "w") as zip_obj:
                    for foldername, subfolders, filenames in os.walk(dir_path):
                        for filename in filenames:
                            file_path = os.path.join(foldername, filename)
                            zip_info = zipfile.ZipInfo(filename)
                            zip_info.compress_type = zipfile.ZIP_STORED
                            with open(file_path, "rb") as file:
                                zip_obj.writestr(zip_info, file.read())
                rmtree(dir_path)

    def test(self, regenerate_flag=False):
        """Build the market, then for each store search it with the store's
        test data as the user task and report single / random / job-selector /
        ensemble reuse losses."""
        self.prepare_learnware(regenerate_flag)
        self._init_learnware_market()
        easy_market = instantiate_learnware_market(name="easy")
        print("Total Item:", len(easy_market))
        m5 = DataLoader()
        idx_list = m5.get_idx_list()
        os.makedirs("./user_spec", exist_ok=True)
        single_score_list = []
        random_score_list = []
        job_selector_score_list = []
        ensemble_score_list = []
        improve_list = []
        for idx in idx_list:
            train_x, train_y, test_x, test_y = m5.get_idx_data(idx)
            user_spec = generate_rkme_table_spec(X=test_x, gamma=0.1, cuda_idx=0)
            user_spec_path = f"./user_spec/user_{idx}.json"
            user_spec.save(user_spec_path)
            user_info = BaseUserInfo(semantic_spec=user_semantic, stat_info={"RKMETableSpecification": user_spec})
            search_result = easy_market.search_learnware(user_info)
            single_result = search_result.get_single_results()
            multiple_result = search_result.get_multiple_results()
            print(f"search result of user{idx}:")
            print(
                f"single model num: {len(single_result)}, max_score: {single_result[0].score}, min_score: {single_result[-1].score}"
            )
            # Loss of every returned learnware on this user's test data.
            loss_list = []
            for single_item in single_result:
                pred_y = single_item.learnware.predict(test_x)
                loss_list.append(m5.score(test_y, pred_y))
            print(
                f"Top1-score: {single_result[0].score}, learnware_id: {single_result[0].learnware.id}, loss: {loss_list[0]}"
            )
            if len(multiple_result) > 0:
                mixture_id = " ".join([learnware.id for learnware in multiple_result[0].learnwares])
                print(f"mixture_score: {multiple_result[0].score}, mixture_learnware: {mixture_id}")
                mixture_learnware_list = multiple_result[0].learnwares
            else:
                # Fall back to reusing the single best learnware.
                mixture_learnware_list = [single_result[0].learnware]
            reuse_job_selector = JobSelectorReuser(learnware_list=mixture_learnware_list, use_herding=False)
            job_selector_predict_y = reuse_job_selector.predict(user_data=test_x)
            job_selector_score = m5.score(test_y, job_selector_predict_y)
            print(f"mixture reuse loss (job selector): {job_selector_score}")
            # NOTE(review): "vote_by_prob" averaging on a regression task --
            # confirm this mode is intended here.
            reuse_ensemble = AveragingReuser(learnware_list=mixture_learnware_list, mode="vote_by_prob")
            ensemble_predict_y = reuse_ensemble.predict(user_data=test_x)
            ensemble_score = m5.score(test_y, ensemble_predict_y)
            print(f"mixture reuse loss (ensemble): {ensemble_score}\n")
            single_score_list.append(loss_list[0])
            random_score_list.append(np.mean(loss_list))
            job_selector_score_list.append(job_selector_score)
            ensemble_score_list.append(ensemble_score)
            # Relative improvement of top-1 search over a random pick.
            improve_list.append((np.mean(loss_list) - loss_list[0]) / np.mean(loss_list))
        logger.info("Single search score %.3f +/- %.3f" % (np.mean(single_score_list), np.std(single_score_list)))
        logger.info("Random search score: %.3f +/- %.3f" % (np.mean(random_score_list), np.std(random_score_list)))
        logger.info("Average score improvement: %.3f" % (np.mean(improve_list)))
        logger.info(
            "Job selector score: %.3f +/- %.3f" % (np.mean(job_selector_score_list), np.std(job_selector_score_list))
        )
        logger.info(
            "Average ensemble score: %.3f +/- %.3f" % (np.mean(ensemble_score_list), np.std(ensemble_score_list))
        )
# Expose the workflow's public methods as a CLI (e.g. `python <script> test`).
if __name__ == "__main__":
    fire.Fire(M5DatasetWorkflow)
| @@ -1,87 +0,0 @@ | |||
| import hashlib | |||
| import requests | |||
| import os | |||
| import random | |||
| import json | |||
| import time | |||
| from tqdm import tqdm | |||
# NOTE(review): credentials are hard-coded in source and the password is
# hashed with (weak) MD5 -- move these to environment variables or a secrets
# store before sharing this script.
email = "tanzh@lamda.nju.edu.cn"
password = hashlib.md5(b"Qwerty123").hexdigest()
# Backend endpoints for authentication and learnware submission.
login_url = "http://210.28.134.201:8089/auth/login"
submit_url = "http://210.28.134.201:8089/user/add_learnware"
# Vocabularies of semantic-spec values accepted by the server.
all_data_type = ["Table", "Image", "Video", "Text", "Audio"]
all_task_type = [
    "Classification",
    "Regression",
    "Clustering",
    "Feature Extraction",
    "Generation",
    "Segmentation",
    "Object Detection",
]
all_device_type = ["CPU", "GPU"]
all_scenario = [
    "Business",
    "Financial",
    "Health",
    "Politics",
    "Computer",
    "Internet",
    "Traffic",
    "Nature",
    "Fashion",
    "Industry",
    "Agriculture",
    "Education",
    "Entertainment",
    "Architecture",
]
| # ############### | |||
| # 以上部分无需修改 # | |||
| # ############### | |||
def main():
    """Log in to the learnware backend, then upload every packaged learnware
    zip found in ./learnware_pool with a fixed semantic specification.

    Raises AssertionError if the server rejects an upload.
    """
    session = requests.Session()
    # Authenticate once; the session cookie carries the login state afterwards.
    session.post(login_url, json={"email": email, "password": password})
    pool_dir = os.path.join(os.path.abspath("."), "learnware_pool")
    learnware_pool = os.listdir(pool_dir)
    for learnware in learnware_pool:
        # Derive a unique, timestamped name from the zip file name
        # (e.g. "lgb_7.zip" -> "M5_Shop07_<timestamp>").
        name = "M5_Shop" + "%02d" % int(learnware.split(".")[0].split("_")[1])
        name = name + "_" + time.strftime("%Y%m%d%H%M%S", time.localtime())
        semantic_specification = {
            "Data": {"Values": ["Table"], "Type": "Class"},
            "Task": {"Values": ["Regression"], "Type": "Class"},
            "Device": {"Values": ["CPU"], "Type": "Tag"},
            "Scenario": {"Values": ["Business"], "Type": "Tag"},
            "Description": {"Values": "A sales-forecasting model from Walmart store", "Type": "String"},
            "Name": {"Values": name, "Type": "String"},
            "License": {"Values": ["MIT"], "Type": "Class"},
        }
        # Open the zip via a context manager so the handle is closed even on
        # request failure (the original leaked the file object).
        with open(os.path.join(pool_dir, learnware), "rb") as learnware_file:
            res = session.post(
                submit_url,
                data={
                    "semantic_specification": json.dumps(semantic_specification),
                },
                files={"learnware_file": learnware_file},
            )
        assert json.loads(res.text)["code"] == 0, "Upload error"
# Script entry point: upload all packaged learnwares when run directly.
if __name__ == "__main__":
    main()
| @@ -1,8 +0,0 @@ | |||
| model: | |||
| class_name: Model | |||
| kwargs: {} | |||
| stat_specifications: | |||
| - module_path: learnware.specification | |||
| class_name: RKMETableSpecification | |||
| file_name: rkme.json | |||
| kwargs: {} | |||
| @@ -1,20 +0,0 @@ | |||
| import os | |||
| import joblib | |||
| import numpy as np | |||
| from learnware.model import BaseModel | |||
class Model(BaseModel):
    """Learnware wrapper around a pre-trained regressor shipped next to this file."""

    def __init__(self):
        super().__init__(input_shape=(31,), output_shape=(1,))
        # The serialized estimator is packaged inside the learnware zip as "model.out".
        here = os.path.dirname(os.path.abspath(__file__))
        self.model = joblib.load(os.path.join(here, "model.out"))

    def fit(self, X: np.ndarray, y: np.ndarray):
        """No-op: the packaged model is already fitted."""
        pass

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the packaged model's predictions for X."""
        return self.model.predict(X)

    def finetune(self, X: np.ndarray, y: np.ndarray):
        """No-op: fine-tuning is not supported for this learnware."""
        pass
| @@ -1,208 +0,0 @@ | |||
| import os | |||
| import fire | |||
| import zipfile | |||
| import time | |||
| import numpy as np | |||
| from tqdm import tqdm | |||
| from shutil import copyfile, rmtree | |||
| import learnware | |||
| from learnware.market import instantiate_learnware_market, BaseUserInfo | |||
| from learnware.reuse import JobSelectorReuser, AveragingReuser | |||
| from learnware.specification import generate_rkme_table_spec | |||
| from pfs import Dataloader | |||
| from learnware.logger import get_module_logger | |||
| logger = get_module_logger("pfs_test", level="INFO") | |||
# Statistical I/O descriptions embedded in each learnware's semantic spec.
output_description = {
    "Dimension": 1,
    "Description": {},
}
input_description = {
    "Dimension": 31,
    "Description": {},
}
# Template semantic specification for uploaded learnwares; Name and
# Description are overwritten per learnware before each add_learnware call.
semantic_specs = [
    {
        "Data": {"Values": ["Table"], "Type": "Class"},
        "Task": {"Values": ["Regression"], "Type": "Class"},
        "Library": {"Values": ["Scikit-learn"], "Type": "Class"},
        "Scenario": {"Values": ["Business"], "Type": "Tag"},
        "Description": {"Values": "", "Type": "String"},
        "Name": {"Values": "learnware_1", "Type": "String"},
        "Input": input_description,
        "Output": output_description,
        "License": {"Values": ["MIT"], "Type": "Class"},
    }
]
# Semantic specification describing the searching user's task.
user_semantic = {
    "Data": {"Values": ["Table"], "Type": "Class"},
    "Task": {"Values": ["Regression"], "Type": "Class"},
    "Library": {"Values": ["Scikit-learn"], "Type": "Class"},
    "Scenario": {"Values": ["Business"], "Type": "Tag"},
    "Description": {"Values": "", "Type": "String"},
    "Name": {"Values": "", "Type": "String"},
    "Input": input_description,
    "Output": output_description,
    "License": {"Values": ["MIT"], "Type": "Class"},
}
class PFSDatasetWorkflow:
    """End-to-end PFS benchmark workflow: regenerate per-shop data and models,
    package them as learnware zips, populate an "easy" market (market_id
    "pfs"), then evaluate learnware search and reuse on each shop."""

    def _init_pfs_dataset(self):
        """Regenerate the PFS per-shop data and retrain models."""
        pfs = Dataloader()
        pfs.regenerate_data()
        algo_list = ["ridge"]  # "ridge", "lgb"
        for algo in algo_list:
            pfs.set_algo(algo)
            pfs.retrain_models()

    def _init_learnware_market(self):
        """Initialize learnware market and upload every zip in ./learnware_pool."""
        learnware.init()
        easy_market = instantiate_learnware_market(market_id="pfs", name="easy", rebuild=True)
        print("Total Item:", len(easy_market))
        zip_path_list = []
        curr_root = os.path.dirname(os.path.abspath(__file__))
        curr_root = os.path.join(curr_root, "learnware_pool")
        for zip_path in os.listdir(curr_root):
            zip_path_list.append(os.path.join(curr_root, zip_path))
        for idx, zip_path in enumerate(zip_path_list):
            # NOTE(review): this aliases and mutates the shared template dict on
            # every iteration instead of copying it -- safe only if
            # add_learnware deep-copies the spec. Confirm.
            semantic_spec = semantic_specs[0]
            semantic_spec["Name"]["Values"] = "learnware_%d" % (idx)
            semantic_spec["Description"]["Values"] = "test_learnware_number_%d" % (idx)
            easy_market.add_learnware(zip_path, semantic_spec)
        print("Total Item:", len(easy_market))

    def prepare_learnware(self, regenerate_flag=False):
        """Package one learnware zip per shop (RKME spec + model + loader + yaml).

        regenerate_flag: rebuild the dataset and retrain models first.
        """
        if regenerate_flag:
            self._init_pfs_dataset()
        pfs = Dataloader()
        idx_list = pfs.get_idx_list()
        algo_list = ["ridge"]  # ["ridge", "lgb"]
        curr_root = os.path.dirname(os.path.abspath(__file__))
        curr_root = os.path.join(curr_root, "learnware_pool")
        os.makedirs(curr_root, exist_ok=True)
        for idx in tqdm(idx_list):
            train_x, train_y, test_x, test_y = pfs.get_idx_data(idx)
            st = time.time()
            # RKME spec is computed once per shop and shared by all algos.
            spec = generate_rkme_table_spec(X=train_x, gamma=0.1, cuda_idx=0)
            ed = time.time()
            logger.info("Stat spec generated in %.3f s" % (ed - st))
            for algo in algo_list:
                pfs.set_algo(algo)
                dir_path = os.path.join(curr_root, f"{algo}_{idx}")
                os.makedirs(dir_path, exist_ok=True)
                spec_path = os.path.join(dir_path, "rkme.json")
                spec.save(spec_path)
                model_path = pfs.get_model_path(idx)
                model_file = os.path.join(dir_path, "model.out")
                copyfile(model_path, model_file)
                init_file = os.path.join(dir_path, "__init__.py")
                copyfile("example_init.py", init_file)
                yaml_file = os.path.join(dir_path, "learnware.yaml")
                copyfile("example.yaml", yaml_file)
                zip_file = dir_path + ".zip"
                # Store entries flat (file name only) and uncompressed.
                with zipfile.ZipFile(zip_file, "w") as zip_obj:
                    for foldername, subfolders, filenames in os.walk(dir_path):
                        for filename in filenames:
                            file_path = os.path.join(foldername, filename)
                            zip_info = zipfile.ZipInfo(filename)
                            zip_info.compress_type = zipfile.ZIP_STORED
                            with open(file_path, "rb") as file:
                                zip_obj.writestr(zip_info, file.read())
                rmtree(dir_path)

    def test(self, regenerate_flag=False):
        """Build the market, then for each shop search it with the shop's test
        data as the user task and report single / random / job-selector /
        ensemble reuse losses."""
        self.prepare_learnware(regenerate_flag)
        self._init_learnware_market()
        easy_market = instantiate_learnware_market(market_id="pfs", name="easy")
        print("Total Item:", len(easy_market))
        pfs = Dataloader()
        idx_list = pfs.get_idx_list()
        os.makedirs("./user_spec", exist_ok=True)
        single_score_list = []
        random_score_list = []
        job_selector_score_list = []
        ensemble_score_list = []
        improve_list = []
        for idx in idx_list:
            train_x, train_y, test_x, test_y = pfs.get_idx_data(idx)
            user_spec = generate_rkme_table_spec(X=test_x, gamma=0.1, cuda_idx=0)
            user_spec_path = f"./user_spec/user_{idx}.json"
            user_spec.save(user_spec_path)
            user_info = BaseUserInfo(semantic_spec=user_semantic, stat_info={"RKMETableSpecification": user_spec})
            search_result = easy_market.search_learnware(user_info)
            single_result = search_result.get_single_results()
            multiple_result = search_result.get_multiple_results()
            print(f"search result of user{idx}:")
            print(
                f"single model num: {len(single_result)}, max_score: {single_result[0].score}, min_score: {single_result[-1].score}"
            )
            # Loss of every returned learnware on this user's test data.
            loss_list = []
            for single_item in single_result:
                pred_y = single_item.learnware.predict(test_x)
                loss_list.append(pfs.score(test_y, pred_y))
            print(
                f"Top1-score: {single_result[0].score}, learnware_id: {single_result[0].learnware.id}, loss: {loss_list[0]}, random: {np.mean(loss_list)}"
            )
            if len(multiple_result) > 0:
                mixture_id = " ".join([learnware.id for learnware in multiple_result[0].learnwares])
                print(f"mixture_score: {multiple_result[0].score}, mixture_learnware: {mixture_id}")
                mixture_learnware_list = multiple_result[0].learnwares
            else:
                # Fall back to reusing the single best learnware.
                mixture_learnware_list = [single_result[0].learnware]
            reuse_job_selector = JobSelectorReuser(learnware_list=mixture_learnware_list, use_herding=False)
            job_selector_predict_y = reuse_job_selector.predict(user_data=test_x)
            job_selector_score = pfs.score(test_y, job_selector_predict_y)
            print(f"mixture reuse loss (job selector): {job_selector_score}")
            reuse_ensemble = AveragingReuser(learnware_list=mixture_learnware_list)
            ensemble_predict_y = reuse_ensemble.predict(user_data=test_x)
            ensemble_score = pfs.score(test_y, ensemble_predict_y)
            print(f"mixture reuse loss (ensemble): {ensemble_score}\n")
            single_score_list.append(loss_list[0])
            random_score_list.append(np.mean(loss_list))
            job_selector_score_list.append(job_selector_score)
            ensemble_score_list.append(ensemble_score)
            # Relative improvement of top-1 search over a random pick.
            improve_list.append((np.mean(loss_list) - loss_list[0]) / np.mean(loss_list))
        logger.info("Single search score %.3f +/- %.3f" % (np.mean(single_score_list), np.std(single_score_list)))
        logger.info("Random search score: %.3f +/- %.3f" % (np.mean(random_score_list), np.std(random_score_list)))
        logger.info("Average score improvement: %.3f" % (np.mean(improve_list)))
        logger.info(
            "Job selector score: %.3f +/- %.3f" % (np.mean(job_selector_score_list), np.std(job_selector_score_list))
        )
        logger.info(
            "Average ensemble score: %.3f +/- %.3f" % (np.mean(ensemble_score_list), np.std(ensemble_score_list))
        )
# Expose the workflow's public methods as a CLI (e.g. `python <script> test`).
if __name__ == "__main__":
    fire.Fire(PFSDatasetWorkflow)
| @@ -1,48 +0,0 @@ | |||
# Learnware based on Predict Future Sales (PFS) data downloaded from Kaggle
| --> Data Page Link: https://www.kaggle.com/c/competitive-data-science-predict-future-sales/data | |||
| --> Code Page Link: https://www.kaggle.com/uladzimirkapeika/feature-engineering-lightgbm-top-1 | |||
| # PFS任务描述 | |||
| --> 目标:预测每个商店每个商品在下一个月的销量(注意:粒度为月,而不是每天) | |||
| --> 特征信息:商店所在城市信息、商品类别信息、商品价格信息、商品历史价格信息(特征工程中只使用了前三个月的历史信息然后拼接在一起)等 | |||
| --> 使用的模型:XgBoost, LightGBM, LinearRegression | |||
| --> 评价指标:RMSE | |||
| * split_pfs_data.py | |||
| --> 根据Kaggle上公开的数据预处理方案处理下载的数据 | |||
| --> 直接运行即可将数据根据Shop ID划分为每个商店的信息,包括: | |||
| ----> 每个商品在每个月下的特征和目标值,存储为pandas.DataFrame格式 | |||
| ----> 字段包括: | |||
| -- 标识信息: 'shop_id', 'item_id', 'date_block_num' (标识月份), | |||
| -- 目标值(本月销量): 'item_cnt_month', | |||
| -- 城市信息: 'city_code', 'city_coord_1', 'city_coord_2', 'country_part', | |||
| -- 商品种类信息: 'item_category_common', 'item_category_code', | |||
| -- 该月的时间信息: 'weeknd_count', 'days_in_month', | |||
| -- 商品是否第一次销售: 'item_first_interaction', 'shop_item_sold_before', | |||
| -- 商品前三个月的销售量和价格信息: | |||
| 'item_cnt_month_lag_1', 'item_cnt_month_lag_2', 'item_cnt_month_lag_3', | |||
| 'item_shop_price_avg_lag_1', 'item_shop_price_avg_lag_2', 'item_shop_price_avg_lag_3', | |||
| 'item_target_enc_lag_1', 'item_target_enc_lag_2', 'item_target_enc_lag_3', | |||
| 'item_loc_target_enc_lag_1', 'item_loc_target_enc_lag_2', 'item_loc_target_enc_lag_3', 'item_shop_target_enc_lag_1', 'item_shop_target_enc_lag_2', 'item_shop_target_enc_lag_3', | |||
| 'new_item_cat_avg_lag_1', 'new_item_cat_avg_lag_2', 'new_item_cat_avg_lag_3', | |||
| 'new_item_shop_cat_avg_lag_1', 'new_item_shop_cat_avg_lag_2', 'new_item_shop_cat_avg_lag_3', | |||
| 'item_cnt_month_lag_1_adv', 'item_cnt_month_lag_2_adv', 'item_cnt_month_lag_3_adv' | |||
| ----> 特征: 除了'item_cnt_month'之外的列都当做特征列 | |||
| ----> 目标值: 'item_cnt_month' | |||
----> 时间标识: 'date_block_num'将2013.01到2015.10月的数据标识为0-33,要预测的2015.11月数据为34
| --> 存储结果分为两部分: 按照时间划分的train & val,是pandas.DataFrame格式 | |||
| * pfs_cross_transfer.py | |||
| --> 在各自商店训练集上训练一个模型,然后在所有商店的测试集上测试,保存两两预测的RMSE结果,并进行分析 | |||
| --> 分析包括两部分:(1) 对于一个目标商店,其余源域模型的性能均值,方差,最小值(最好的模型),最大值,超过均值的源域数目,选择最好模型能够提升的比例等等;(2) HeatMap | |||
| --> 需要扩展的方向:(1) LightGBM, Ridge, Xgboost,以及超参数调参;(2) 特征工程去除标识信息,例如shop_id, item_id等等 | |||
| * data_api.py | |||
| --> 后续封装的代码,需继续完善 | |||
| * packages | |||
| --> pip install lightgbm | |||
| @@ -1,77 +0,0 @@ | |||
| import joblib | |||
| import os | |||
| from sklearn.metrics import mean_squared_error | |||
| from .pfs_cross_transfer import * | |||
| from .split_data import feature_engineering | |||
| class Dataloader: | |||
| def __init__(self): | |||
| self.algo = "ridge" | |||
| def regenerate_data(self): | |||
| feature_engineering() | |||
| def set_algo(self, algo): | |||
| self.algo = algo | |||
| def get_algo_list(self): | |||
| return ["lgb", "ridge"] | |||
| def get_idx_list(self): | |||
| return [i for i in range(53)] | |||
| def get_idx_data(self, idx): | |||
| shop_ids = [i for i in range(60) if i not in [0, 1, 40]] | |||
| shop_ids = [i for i in shop_ids if i not in [8, 11, 23, 36]] | |||
| fpath = os.path.join(pfs_split_dir, "Shop{:0>2d}-train.csv".format(shop_ids[idx])) | |||
| train_xs, train_ys, _, _ = load_pfs_data(fpath) | |||
| fpath = os.path.join(pfs_split_dir, "Shop{:0>2d}-val.csv".format(shop_ids[idx])) | |||
| test_xs, test_ys, _, _ = load_pfs_data(fpath) | |||
| return train_xs, train_ys, test_xs, test_ys | |||
| def get_model_path(self, idx): | |||
| shop_ids = [i for i in range(60) if i not in [0, 1, 40]] | |||
| shop_ids = [i for i in shop_ids if i not in [8, 11, 23, 36]] | |||
| return os.path.join(model_dir, "{}_Shop{:0>2d}.out".format(self.algo, shop_ids[idx])) | |||
| def retrain_models(self): | |||
| algo = self.algo | |||
| errs = get_errors(algo=algo) | |||
| fpath = os.path.join(pfs_res_dir, "PFS_{}_errs.txt".format(algo)) | |||
| np.savetxt(fpath, errs.T) | |||
| plot_heatmap(errs.T, algo) | |||
| weights = np.loadtxt(os.path.join(pfs_res_dir, "PFS_{}_weights.txt".format(algo))) | |||
| plot_performance(errs.T, weights, algo) | |||
| def retrain_split_models(self): | |||
| fpath = os.path.join(pfs_res_dir, "PFS_{}_split_errs_user.txt".format(self.algo)) | |||
| if os.path.exists(fpath): | |||
| return np.loadtxt(fpath) | |||
| algo = self.algo | |||
| errs = get_split_errs(algo=algo) | |||
| fpath = os.path.join(pfs_res_dir, "PFS_{}_split_errs_user.txt".format(algo)) | |||
| np.savetxt(fpath, errs) | |||
| return errs | |||
| def get_errs(self): | |||
| return np.loadtxt(os.path.join(pfs_res_dir, "PFS_{}_errs.txt".format(self.algo))) | |||
| def get_weights(self): | |||
| return np.loadtxt(os.path.join(pfs_res_dir, "PFS_{}_weights.txt".format(self.algo))) | |||
| def predict(self, idx, test_x): | |||
| shop_ids = [i for i in range(60) if i not in [0, 1, 40]] | |||
| shop_ids = [i for i in shop_ids if i not in [8, 11, 23, 36]] | |||
| model = joblib.load(os.path.join(model_dir, "{}_Shop{:0>2d}.out".format(self.algo, shop_ids[idx]))) | |||
| # test_x = (test_x - test_x.min(0)) / (test_x.max(0) - test_x.min(0) + 0.0001) | |||
| return model.predict(test_x) | |||
| def score(self, real_y, pred_y, sample_weight=None): | |||
| return mean_squared_error(real_y, pred_y, sample_weight=sample_weight, squared=False) | |||
| @@ -1,272 +0,0 @@ | |||
# Shop ids available in the learnware market (43 shops).
market_store_list = [
    0, 2, 3, 4, 5, 6, 7, 8, 9, 10,
    12, 13, 14, 15, 16, 17, 18, 20, 22, 23,
    24, 25, 26, 27, 28, 30, 31, 32, 33, 34,
    35, 37, 38, 39, 40, 42, 44, 45, 46, 47,
    48, 50, 52,
]

# Shop ids held out to act as simulated users (disjoint from the market list).
user_store_list = [1, 11, 19, 21, 29, 36, 43, 49]
# Human-readable metadata for the PFS (Predict Future Sales) dataset.
# NOTE(review): the three location lists and the memory list appear to be
# parallel, one entry per shop (the description mentions 53 shops) — confirm
# each list has exactly 53 entries and that they share the same shop order.
dataset_info = {
    "name": "PFS",
    # NOTE(review): the project README elsewhere says the data starts at
    # 2013.01 — confirm which range is correct.
    "range of date": "2014.01-2015.10",
    "description": "You are provided with daily historical sales data. The task is to forecast the total amount of products sold in every shop for the test set. Note that the list of shops and products slightly changes every month. More specifically, the dataset involves 53 shops in Russia",
    # Shop locations as they appear in the raw data (Russian).
    "location_original": [
        "Адыгея, Россия",
        "Балашиха, Россия",
        "Волжский, Россия",
        "Вологда, Россия",
        "Воронеж, Россия",
        "Воронеж, Россия",
        "Воронеж, Россия",
        "выезд, Россия",
        "Жуковский, Россия",
        "интернет-магазин, Россия",
        "Казань, Россия",
        "Калуга, Россия",
        "колонна, Россия",
        "Красноярск, Россия",
        "Красноярск, Россия",
        "курск, Россия",
        "Москва, Россия",
        "Москва, Россия",
        "Москва, Россия",
        "Москва, Россия",
        "Москва, Россия",
        "Москва, Россия",
        "Москва, Россия",
        "Москва, Россия",
        "Москва, Россия",
        "Москва, Россия",
        "Москва, Россия",
        "Москва, Россия",
        "Мытищи, Россия",
        "Н.Новгород, Россия",
        "Н.Новгород, Россия",
        "Новосибирск, Россия",
        "Новосибирск, Россия",
        "Ростовнадон, Россия",
        "Ростовнадон, Россия",
        "спб, Россия",
        "спб, Россия",
        "самара, Россия",
        "самара, Россия",
        "Сергий, Россия",
        "Сургут, Россия",
        "томск, Россия",
        "тюмень, Россия",
        "тюмень, Россия",
        "тюмень, Россия",
        "Уфа, Россия",
        "Уфа, Россия",
        "Химки, Россия",
        "цифровой, Россия",
        "Чехи, Россия",
        "Якутск, Россия",
        "Якутск, Россия",
        "Ярославль, Россия",
    ],
    # English renderings of the entries above (same order).
    "location_english": [
        "adygea, Russia",
        "Balashikha, Russia",
        "Volzhsky, Russia",
        "Vologda, Russia",
        "Voronezh, Russia",
        "Voronezh, Russia",
        "Voronezh, Russia",
        "outbound, Russia",
        "zhukovsky, Russia",
        "online stor, Russia",
        "Kazan, Russia",
        "Kaluga, Russia",
        "column, Russia",
        "Krasnoyarsk, Russia",
        "Krasnoyarsk, Russia",
        "kursk, Russia",
        "Moscow, Russia",
        "Moscow, Russia",
        "Moscow, Russia",
        "Moscow, Russia",
        "Moscow, Russia",
        "Moscow, Russia",
        "Moscow, Russia",
        "Moscow, Russia",
        "Moscow, Russia",
        "Moscow, Russia",
        "Moscow, Russia",
        "Moscow, Russia",
        "mytishchi, Russia",
        "N.Novgorod, Russia",
        "N.Novgorod, Russia",
        "Novosibirsk, Russia",
        "Novosibirsk, Russia",
        "rostovnadon, Russia",
        "rostovnadon, Russia",
        "spb, Russia",
        "spb, Russia",
        "samara, Russia",
        "samara, Russia",
        "Sergius, Russia",
        "surgut, Russia",
        "tomsk, Russia",
        "tyumen, Russia",
        "tyumen, Russia",
        "tyumen, Russia",
        "Ufa, Russia",
        "Ufa, Russia",
        "Khimki, Russia",
        "numeric, Russia",
        "Czechs, Russia",
        "Yakutsk, Russia",
        "Yakutsk, Russia",
        "Yaroslavl, Russia",
    ],
    # Chinese renderings of the entries above (same order).
    "location_chinese": [
        "阿迪格亚, 俄罗斯",
        "巴拉希哈, 俄罗斯",
        "沃尔日斯基, 俄罗斯",
        "沃洛格达, 俄罗斯",
        "沃罗涅日, 俄罗斯",
        "沃罗涅日, 俄罗斯",
        "沃罗涅日, 俄罗斯",
        "对外贸易, 俄罗斯",
        "茹科夫斯基, 俄罗斯",
        "在线商店, 俄罗斯",
        "喀山, 俄罗斯",
        "卡卢加, 俄罗斯",
        "科洛姆纳, 俄罗斯",
        "克拉斯诺亚尔斯克, 俄罗斯",
        "克拉斯诺亚尔斯克, 俄罗斯",
        "库尔斯克, 俄罗斯",
        "莫斯科, 俄罗斯",
        "莫斯科, 俄罗斯",
        "莫斯科, 俄罗斯",
        "莫斯科, 俄罗斯",
        "莫斯科, 俄罗斯",
        "莫斯科, 俄罗斯",
        "莫斯科, 俄罗斯",
        "莫斯科, 俄罗斯",
        "莫斯科, 俄罗斯",
        "莫斯科, 俄罗斯",
        "莫斯科, 俄罗斯",
        "莫斯科, 俄罗斯",
        "梅季希, 俄罗斯",
        "北诺夫哥罗德, 俄罗斯",
        "北诺夫哥罗德, 俄罗斯",
        "新西伯利亚, 俄罗斯",
        "新西伯利亚, 俄罗斯",
        "罗斯托夫纳东, 俄罗斯",
        "罗斯托夫纳东, 俄罗斯",
        "圣彼得堡, 俄罗斯",
        "圣彼得堡, 俄罗斯",
        "萨马拉, 俄罗斯",
        "萨马拉, 俄罗斯",
        "谢尔盖, 俄罗斯",
        "苏尔古特, 俄罗斯",
        "托木斯克, 俄罗斯",
        "秋明, 俄罗斯",
        "秋明, 俄罗斯",
        "秋明, 俄罗斯",
        "乌法, 俄罗斯",
        "乌法, 俄罗斯",
        "希姆基, 俄罗斯",
        "在线商店, 俄罗斯",
        "契诃夫, 俄罗斯",
        "雅库茨克, 俄罗斯",
        "雅库茨克, 俄罗斯",
        "雅罗斯拉夫尔, 俄罗斯",
    ],
    # Per-shop data size in kilobytes (presumably of each shop's split CSVs;
    # same shop order as the location lists — TODO confirm).
    "memory(KB)": [
        246,
        302,
        3631,
        379,
        862,
        1020,
        471,
        867,
        588,
        233,
        657,
        1272,
        801,
        469,
        146,
        1309,
        98,
        1003,
        932,
        257,
        1959,
        1361,
        35,
        3265,
        217,
        283,
        4311,
        1155,
        43,
        1388,
        1971,
        971,
        7272,
        2782,
        304,
        6801,
        4942,
        181,
        190,
        3664,
        2061,
        170,
        807,
        593,
        1584,
        257,
        1819,
        50,
        1063,
        692,
        336,
        277,
        743,
    ],
}
| @@ -1,21 +0,0 @@ | |||
| import os | |||
# Root data directory: "<this file's directory>/data".
ROOT_PATH = os.path.abspath(os.path.join(__file__, "..", "data"))

raw_data_dir = os.path.join(ROOT_PATH, "raw_data")      # downloaded Kaggle CSVs
split_data_dir = os.path.join(ROOT_PATH, "split_data")  # per-shop train/val splits
res_dir = os.path.join(ROOT_PATH, "results")            # error matrices / figures
model_dir = os.path.join(ROOT_PATH, "models")           # trained per-shop models
model_dir2 = os.path.join(ROOT_PATH, "models2")

# PFS-specific subdirectories.
pfs_data_dir = os.path.join(raw_data_dir, "PFS")
pfs_split_dir = os.path.join(split_data_dir, "PFS")
pfs_res_dir = os.path.join(res_dir, "PFS")

# os.makedirs(..., exist_ok=True) replaces the previous exists()+mkdir()
# pattern: it avoids the check-then-create race and also creates missing
# parents (ROOT_PATH and res_dir are created implicitly), so a single loop
# suffices instead of two.
for dir_name in [
    raw_data_dir,
    split_data_dir,
    model_dir,
    model_dir2,
    pfs_data_dir,
    pfs_split_dir,
    pfs_res_dir,
]:
    os.makedirs(dir_name, exist_ok=True)
| @@ -1,384 +0,0 @@ | |||
| import os | |||
| import pickle | |||
| import joblib | |||
| import numpy as np | |||
| import pandas as pd | |||
| import lightgbm as lgb | |||
| from sklearn.linear_model import Ridge | |||
| from sklearn.model_selection import GridSearchCV | |||
| from matplotlib import pyplot as plt | |||
| import matplotlib.ticker as ticker | |||
| from mpl_toolkits.axes_grid1 import make_axes_locatable | |||
# Silence numpy divide-by-zero / invalid-value warnings: the ratio
# computations below can legitimately divide by zero.
np.seterr(divide="ignore", invalid="ignore")
from .paths import pfs_split_dir, pfs_res_dir, model_dir

# Fix the global RNG seed so repeated runs are reproducible.
np.random.seed(0)
def load_pfs_data(fpath):
    """Load one shop's split CSV and return ``(xs, ys, features, types)``.

    Parameters
    ----------
    fpath : str
        Path to a ``ShopXX-train.csv`` / ``ShopXX-val.csv`` file.

    Returns
    -------
    xs : ndarray
        Feature matrix: every column except the target and the month index.
    ys : ndarray
        Target values (``item_cnt_month``, the monthly sales count).
    features : list[str]
        Names of the feature columns, in CSV order.
    types : None
        Always ``None``; kept so existing callers unpacking four values
        keep working.
    """
    df = pd.read_csv(fpath)
    features = list(df.columns)
    # Drop the target and the month index; .remove() raises ValueError if a
    # required column is missing, which surfaces malformed files early.
    features.remove("item_cnt_month")
    features.remove("date_block_num")
    xs = df[features].values
    ys = df["item_cnt_month"].values
    return xs, ys, features, None
def get_split_errs(algo):
    """
    according to proportion_list, generate errs whose shape is [shop, split_data]

    For every shop, retrain a model on only the most recent
    ``proportion_list[k]`` training rows and record its RMSE on that shop's
    validation split.  Only ``algo == "lgb"`` is implemented; for any other
    algo the returned matrix stays all-zero.
    """
    # Shops 0/1/40 are merged and 8/11/23/36 dropped during preprocessing,
    # leaving 53 usable shop ids.
    shop_ids = [i for i in range(60) if i not in [0, 1, 40]]
    shop_ids = [i for i in shop_ids if i not in [8, 11, 23, 36]]
    user_list = [i for i in range(53)]
    # Training-set sizes (number of most-recent rows) to evaluate.
    proportion_list = [100, 300, 500, 700, 900, 1000, 3000, 5000, 7000, 9000, 10000, 30000, 50000, 70000]
    # train
    errs = np.zeros((len(user_list), len(proportion_list)))
    for s, sid in enumerate(user_list):
        # load train data
        fpath = os.path.join(pfs_split_dir, "Shop{:0>2d}-train.csv".format(shop_ids[sid]))
        fpath_val = os.path.join(pfs_split_dir, "Shop{:0>2d}-val.csv".format(shop_ids[sid]))
        train_xs, train_ys, _, _ = load_pfs_data(fpath)
        val_xs, val_ys, _, _ = load_pfs_data(fpath_val)
        print(shop_ids[sid], train_xs.shape, train_ys.shape)
        # data regu
        # train_xs = (train_xs - train_xs.min(0)) / (train_xs.max(0) - train_xs.min(0) + 0.0001)
        # val_xs = (val_xs - val_xs.min(0)) / (val_xs.max(0) - val_xs.min(0) + 0.0001)
        if algo == "lgb":
            for tmp in range(len(proportion_list)):
                model = lgb.LGBMModel(
                    boosting_type="gbdt",
                    num_leaves=2**7 - 1,
                    learning_rate=0.01,
                    objective="rmse",
                    metric="rmse",
                    feature_fraction=0.75,
                    bagging_fraction=0.75,
                    bagging_freq=5,
                    seed=1,
                    verbose=1,
                    n_estimators=100000,
                )
                # Reuse the hyper-parameters of the previously trained
                # full-data model for this shop, but cap the boosting rounds.
                model_ori = joblib.load(os.path.join(model_dir, "{}_Shop{:0>2d}.out".format("lgb", shop_ids[sid])))
                para = model_ori.get_params()
                para["n_estimators"] = 1000
                model.set_params(**para)
                # Keep only the most recent proportion_list[tmp] rows.
                # NOTE(review): if proportion_list[tmp] exceeds the number of
                # training rows, `split` goes negative and the slice silently
                # keeps the whole set — confirm that is intended.
                split = train_xs.shape[0] - proportion_list[tmp]
                model.fit(
                    train_xs[
                        split:,
                    ],
                    train_ys[split:],
                    eval_set=[(val_xs, val_ys)],
                    early_stopping_rounds=50,
                    verbose=100,
                )
                pred_ys = model.predict(val_xs)
                rmse = np.sqrt(((val_ys - pred_ys) ** 2).mean())
                errs[s][tmp] = rmse
    return errs
def get_errors(algo):
    """Cross-shop transfer experiment: train one model per shop, test on all.

    For each usable shop, trains a model ("lgb" or "ridge") on that shop's
    training split, saves it to ``model_dir``, then evaluates it on every
    shop's validation split.

    Returns ``errs`` where ``errs[s][t]`` is the RMSE of shop ``s``'s model
    on shop ``t``'s validation data.  Side effect: saves the per-shop
    feature-importance matrix to ``pfs_res_dir``.
    """
    # Shops 0/1/40 are merged and 8/11/23/36 dropped during preprocessing.
    shop_ids = [i for i in range(60) if i not in [0, 1, 40]]
    shop_ids = [i for i in shop_ids if i not in [8, 11, 23, 36]]
    # train
    K = len(shop_ids)
    # Placeholder; reallocated with the real (K, n_features) shape once the
    # first shop's feature list is known.
    feature_weight = np.zeros(())
    errs = np.zeros((K, K))
    for s, sid in enumerate(shop_ids):
        # load train data
        fpath = os.path.join(pfs_split_dir, "Shop{:0>2d}-train.csv".format(sid))
        fpath_val = os.path.join(pfs_split_dir, "Shop{:0>2d}-val.csv".format(sid))
        train_xs, train_ys, features, _ = load_pfs_data(fpath)
        val_xs, val_ys, _, _ = load_pfs_data(fpath_val)
        print(sid, train_xs.shape, train_ys.shape)
        if s == 0:
            feature_weight = np.zeros((K, len(features)))
        if algo == "lgb":
            model = lgb.LGBMModel(
                boosting_type="gbdt",
                num_leaves=2**7 - 1,
                learning_rate=0.01,
                objective="rmse",
                metric="rmse",
                feature_fraction=0.75,
                bagging_fraction=0.75,
                bagging_freq=5,
                seed=1,
                verbose=1,
                n_estimators=1000,
            )
            # train regu data
            # train_xs = (train_xs - train_xs.min(0)) / (train_xs.max(0) - train_xs.min(0) + 0.0001)
            # val_xs = (val_xs - val_xs.min(0)) / (val_xs.max(0) - val_xs.min(0) + 0.0001)
            model.fit(train_xs, train_ys, eval_set=[(val_xs, val_ys)], early_stopping_rounds=100, verbose=100)
            # grid search
            # para = {'learning_rate': [0.005, 0.01, 0.015], 'num_leaves' : [128, 224, 300], 'max_depth' : [50, 66, 80]}
            # grid_search = GridSearchCV(model, para, scoring='neg_mean_squared_error')
            # grid_result = grid_search.fit(train_xs, train_ys, eval_set=[(val_xs, val_ys)], verbose = 1000, early_stopping_rounds=1000)
            # model = grid_result.best_estimator_
            joblib.dump(model, os.path.join(model_dir, "{}_Shop{:0>2d}.out".format(algo, sid)))
            importances = model.feature_importances_
        elif algo == "ridge":
            # train_xs = (train_xs - train_xs.min(0)) / (train_xs.max(0) - train_xs.min(0) + 0.0001)
            model = Ridge()
            # Small grid search over the regularization strength.
            para = {"alpha": [0.01, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10, 20, 30]}
            grid_search = GridSearchCV(model, para)
            grid_result = grid_search.fit(train_xs, train_ys)
            model = grid_result.best_estimator_
            importances = model.coef_
            joblib.dump(model, os.path.join(model_dir, "{}_Shop{:0>2d}.out".format(algo, sid)))
        feature_weight[s] = importances
        # leave one out test: evaluate shop s's model on every shop's val split
        for t, tid in enumerate(shop_ids):
            # load test data
            fpath = os.path.join(pfs_split_dir, "Shop{:0>2d}-val.csv".format(tid))
            test_xs, test_ys, _, _ = load_pfs_data(fpath)
            # data regu
            # test_xs = (test_xs - test_xs.min(0)) / (test_xs.max(0) - test_xs.min(0) + 0.0001)
            pred_ys = model.predict(test_xs)
            rmse = np.sqrt(((test_ys - pred_ys) ** 2).mean())
            print("Shop{} --> Shop{}: {}".format(s, t, rmse))
            errs[s][t] = rmse
    np.savetxt(os.path.join(pfs_res_dir, "PFS_{}_weights.txt".format(algo)), feature_weight)
    return errs
def plot_heatmap(mat, algo):
    """Render ``mat`` (task-by-model RMSE matrix) as a heatmap.

    The figure is saved as ``PFS_<algo>_heatmap.jpg`` under ``pfs_res_dir``.
    """
    n_rows, n_cols = mat.shape
    col_names = [f"Model{j}" for j in range(n_cols)]
    row_names = [f"Task{i}" for i in range(n_rows)]
    plt.figure(figsize=(10, 9))
    plt.subplot(1, 1, 1)
    axes = plt.gca()
    image = plt.imshow(mat)
    # Attach a colorbar axis of matching height to the right of the map.
    cbar_ax = make_axes_locatable(axes).append_axes("right", size="4%", pad=0.3)
    plt.colorbar(image, cax=cbar_ax)
    axes.set_xticks(range(n_cols))
    axes.set_xticklabels(col_names)
    axes.set_yticks(range(n_rows))
    axes.set_yticklabels(row_names)
    # Only every 5th tick is shown so the axis stays readable.
    axes.xaxis.set_major_locator(ticker.MultipleLocator(base=5))
    axes.yaxis.set_major_locator(ticker.MultipleLocator(base=5))
    axes.set_title(f"RMSE on Test set ({algo})")
    plt.tight_layout()
    plt.savefig(os.path.join(pfs_res_dir, "PFS_{}_heatmap.jpg".format(algo)), dpi=700)
def plot_var(errs, algo):
    """Plot per-target statistics of the source models' errors.

    For each target shop ``j``, summarizes column ``errs[:, j]`` excluding
    the diagonal entry (the shop's own model), sorts targets by mean error,
    and draws three stacked panels: error statistics, count of sources above
    the mean, and the best-possible relative improvement.  Saves the figure
    to ``pfs_res_dir`` and shows it.
    """
    avg_err = []
    min_err = []
    med_err = []
    max_err = []
    std_err = []
    cnts = []
    improves = []
    for j in range(len(errs)):
        # Exclude index j itself — assumes errs[j][j] is a shop evaluated on
        # its own data (TODO confirm orientation matches the caller).
        inds = [i for i in range(len(errs)) if i != j]
        ys = errs[:, j][inds]
        avg_err.append(np.mean(ys))
        min_err.append(np.min(ys))
        med_err.append(np.median(ys))
        max_err.append(np.max(ys))
        std_err.append(np.std(ys))
        # How many source models are at or above the average error.
        cnts.append(np.sum(ys >= np.mean(ys)))
        # Relative gain of picking the best source over the average one.
        improves.append((np.mean(ys) - np.min(ys)) / np.mean(ys))
    avg_err = np.array(avg_err)
    min_err = np.array(min_err)
    med_err = np.array(med_err)
    max_err = np.array(max_err)
    std_err = np.array(std_err)
    cnts = np.array(cnts)
    improves = np.array(improves)
    # Sort every statistic by ascending average error.
    inds = np.argsort(avg_err)
    avg_err = avg_err[inds]
    min_err = min_err[inds]
    med_err = med_err[inds]
    max_err = max_err[inds]
    std_err = std_err[inds]
    cnts = cnts[inds]
    improves = improves[inds]
    xs = list(range(len(inds)))
    fig = plt.figure(figsize=(8, 8))
    ax = plt.subplot(3, 1, 1)
    ax.plot(xs, avg_err, color="red", linestyle="solid", linewidth=2.5)
    ax.plot(xs, min_err, color="blue", linestyle="dotted", linewidth=1.5)
    ax.plot(xs, med_err, color="purple", linestyle="solid", linewidth=1.0)
    ax.plot(xs, max_err, color="green", linestyle="dashed", linewidth=1.5)
    ax.legend(["Avg", "Min", "Median", "Max"], fontsize=14)
    # Shade one standard deviation around the mean.
    ax.fill_between(xs, avg_err - std_err, avg_err + std_err, alpha=0.2)
    gap = np.mean(avg_err - min_err)
    ax.set_ylabel("RMSE", fontsize=14)
    ax.set_title("RMSE of Source Models ({}) [Avg-Min:{:.3f}]".format(algo, gap), fontsize=18)
    ax = plt.subplot(3, 1, 2)
    ax.bar(xs, cnts)
    ax.set_ylabel("Number", fontsize=14)
    ax.set_title("Number of sources above average", fontsize=18)
    ax = plt.subplot(3, 1, 3)
    ax.plot(xs, improves)
    ax.set_xlabel("Sorted Shop ID by Avg.Err", fontsize=14)
    ax.set_ylabel("Ratio", fontsize=14)
    ax.set_title("Best Improve Ratio: (Avg - Min) / Avg", fontsize=18)
    fig.tight_layout()
    fig.savefig(os.path.join(pfs_res_dir, "{}-var.jpg".format(algo)))
    plt.show()
def plot_performance(errs, weights, algo):
    """Plot per-row error statistics of ``errs`` plus a feature-weight panel.

    For each row ``i``, summarizes ``errs[i]`` excluding the diagonal entry,
    sorts rows by mean error, and draws a 2x2 grid: error statistics, count
    of sources above average, best-improvement ratio, and the sorted average
    feature importances from ``weights``.  Saves the figure to
    ``pfs_res_dir`` and shows it.
    """
    avg_err = []
    min_err = []
    med_err = []
    max_err = []
    std_err = []
    cnts = []
    improves = []
    for i in range(errs.shape[0]):
        # Exclude the diagonal entry from row i's statistics.
        inds = [j for j in range(errs.shape[1]) if j != i]
        arr = errs[i][inds]
        avg_err.append(np.mean(arr))
        min_err.append(np.min(arr))
        med_err.append(np.median(arr))
        max_err.append(np.max(arr))
        std_err.append(np.std(arr))
        cnts.append(np.sum(arr >= np.mean(arr)))
        improves.append((np.mean(arr) - np.min(arr)) / np.mean(arr))
    avg_err = np.array(avg_err)
    min_err = np.array(min_err)
    med_err = np.array(med_err)
    max_err = np.array(max_err)
    std_err = np.array(std_err)
    cnts = np.array(cnts)
    improves = np.array(improves)
    # Sort every statistic by ascending average error.
    inds = np.argsort(avg_err)
    avg_err = avg_err[inds]
    min_err = min_err[inds]
    med_err = med_err[inds]
    max_err = max_err[inds]
    std_err = std_err[inds]
    cnts = cnts[inds]
    improves = improves[inds]
    xs = list(range(len(inds)))
    fig = plt.figure(figsize=(12, 9))
    ax = plt.subplot(2, 2, 1)
    ax.plot(xs, avg_err, color="red", linestyle="solid", linewidth=2.5)
    ax.plot(xs, min_err, color="blue", linestyle="dotted", linewidth=1.5)
    ax.plot(xs, med_err, color="purple", linestyle="solid", linewidth=1.0)
    ax.plot(xs, max_err, color="green", linestyle="dashed", linewidth=1.5)
    ax.legend(["Avg", "Min", "Median", "Max"], fontsize=14)
    ax.fill_between(xs, avg_err - std_err, avg_err + std_err, alpha=0.2)
    gap = np.mean(avg_err - min_err)
    ax.set_ylabel("RMSE", fontsize=14)
    ax.set_title("RMSE of Source Models ({}) [Avg-Min:{:.3f}]".format(algo, gap), fontsize=18)
    ax = plt.subplot(2, 2, 2)
    ax.bar(xs, cnts)
    ax.set_ylabel("Number", fontsize=14)
    ax.set_title("Number of sources above average", fontsize=18)
    ax = plt.subplot(2, 2, 3)
    ax.plot(xs, improves)
    ax.set_xlabel("Sorted Shop ID by Avg.Err", fontsize=14)
    ax.set_ylabel("Ratio", fontsize=14)
    ax.set_title("Best Improve Ratio: (Avg - Min) / Avg", fontsize=18)
    ax = plt.subplot(2, 2, 4)
    # NOTE(review): averages over axis 0 but divides by the grand total of
    # the un-averaged matrix — unusual normalization; confirm it is intended
    # rather than dividing by the mean vector's own sum.
    weights = np.mean(weights, axis=0) / weights.sum()
    weights = np.sort(weights)
    xs = list(range(len(weights)))
    ax.plot(xs, weights)
    # ax.set_xlabel("Sorted Feature ID by Avg.Feature_Importance", fontsize=14)
    ax.set_ylabel("Proportion", fontsize=14)
    ax.set_title("Avg.Feature_Importances", fontsize=18)
    fig.tight_layout()
    fig.savefig(os.path.join(pfs_res_dir, "PFS_{}_performance.png".format(algo)), dpi=700)
    # fig.savefig(f"{algo}_performance.png", dpi=700)
    plt.show()
if __name__ == "__main__":
    # Run the full cross-shop experiment, caching the error matrix on disk.
    # for algo in ["ridge", "lgb", "xgboost_125"]:
    for algo in ["ridge"]:
        # Reuse a cached pickle of the error matrix if one exists;
        # otherwise retrain everything and cache the result.
        fpath = os.path.join(pfs_res_dir, "{}_errs.pkl".format(algo))
        if os.path.exists(fpath):
            with open(fpath, "rb") as fr:
                errs = pickle.load(fr)
        else:
            errs = get_errors(algo=algo)
            with open(fpath, "wb") as fw:
                pickle.dump(errs, fw)
        # Labeled DataFrame view of the error matrix; only written out if the
        # commented-out to_csv below is re-enabled.
        index = ["Source{}".format(k) for k in range(len(errs))]
        columns = ["Target{}".format(k) for k in range(len(errs[0]))]
        df = pd.DataFrame(errs, index=index, columns=columns)
        fpath = os.path.join(pfs_res_dir, "PFS_{}_errs.txt".format(algo))
        # df.to_csv(fpath, index=True)
        # The matrix is stored transposed: rows become targets.
        np.savetxt(fpath, errs.T)
        # plot_var(errs, algo)
        plot_heatmap(errs.T, algo)
        weights = np.loadtxt(os.path.join(pfs_res_dir, "PFS_{}_weights.txt".format(algo)))
        plot_performance(errs.T, weights, algo)
| @@ -1,384 +0,0 @@ | |||
| import os | |||
| import pickle | |||
| import pandas as pd | |||
| import numpy as np | |||
| from itertools import product | |||
| from sklearn.preprocessing import LabelEncoder | |||
| from sklearn.preprocessing import MinMaxScaler | |||
| import calendar | |||
| from .paths import pfs_data_dir | |||
| from .paths import pfs_split_dir | |||
| def feature_engineering(): | |||
| # read data | |||
| sales = pd.read_csv(os.path.join(pfs_data_dir, "sales_train.csv")) | |||
| shops = pd.read_csv(os.path.join(pfs_data_dir, "shops.csv")) | |||
| items = pd.read_csv(os.path.join(pfs_data_dir, "items.csv")) | |||
| item_cats = pd.read_csv(os.path.join(pfs_data_dir, "item_categories.csv")) | |||
| test = pd.read_csv(os.path.join(pfs_data_dir, "test.csv")) | |||
| # remove outliers | |||
| train = sales[(sales.item_price < 10000) & (sales.item_price > 0)] | |||
| train = train[sales.item_cnt_day < 1001] | |||
| print(train.shape, sales.shape) | |||
| print(train.tail(5)) | |||
| print(sales.tail(5)) | |||
| # combine shops with different id but the same name | |||
| train.loc[train.shop_id == 0, "shop_id"] = 57 | |||
| test.loc[test.shop_id == 0, "shop_id"] = 57 | |||
| train.loc[train.shop_id == 1, "shop_id"] = 58 | |||
| test.loc[test.shop_id == 1, "shop_id"] = 58 | |||
| train.loc[train.shop_id == 40, "shop_id"] = 39 | |||
| test.loc[test.shop_id == 40, "shop_id"] = 39 | |||
| # obtain shop_id, item_id, month information | |||
| index_cols = ["shop_id", "item_id", "date_block_num"] | |||
| df = [] | |||
| for block_num in train["date_block_num"].unique(): | |||
| cur_shops = train.loc[sales["date_block_num"] == block_num, "shop_id"].unique() | |||
| cur_items = train.loc[sales["date_block_num"] == block_num, "item_id"].unique() | |||
| df.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])), dtype="int32")) | |||
| df = pd.DataFrame(np.vstack(df), columns=index_cols, dtype=np.int32) | |||
| print("df.shape: ", df.shape) | |||
| print(df.head(5)) | |||
| # Add month sales | |||
| group = train.groupby(["date_block_num", "shop_id", "item_id"]).agg({"item_cnt_day": ["sum"]}) | |||
| group.columns = ["item_cnt_month"] | |||
| group.reset_index(inplace=True) | |||
| print("group.shape: ", group.shape) | |||
| print(group.head(5)) | |||
| df = pd.merge(df, group, on=index_cols, how="left") | |||
| df["item_cnt_month"] = ( | |||
| df["item_cnt_month"] | |||
| .fillna(0) | |||
| .astype(np.float32) | |||
| # df['item_cnt_month'].fillna(0).clip(0, 20).astype(np.float32) | |||
| ) | |||
| # fill test data | |||
| test["date_block_num"] = 34 | |||
| test["date_block_num"] = test["date_block_num"].astype(np.int8) | |||
| test["shop_id"] = test["shop_id"].astype(np.int8) | |||
| test["item_id"] = test["item_id"].astype(np.int16) | |||
| df = pd.concat([df, test], ignore_index=True, sort=False, keys=index_cols) | |||
| df.fillna(0, inplace=True) | |||
| # shop location features | |||
| shops["city"] = shops["shop_name"].apply(lambda x: x.split()[0].lower()) | |||
| shops.loc[shops.city == "!якутск", "city"] = "якутск" | |||
| shops["city_code"] = LabelEncoder().fit_transform(shops["city"]) | |||
| coords = dict() | |||
| coords["якутск"] = (62.028098, 129.732555, 4) | |||
| coords["адыгея"] = (44.609764, 40.100516, 3) | |||
| coords["балашиха"] = (55.8094500, 37.9580600, 1) | |||
| coords["волжский"] = (53.4305800, 50.1190000, 3) | |||
| coords["вологда"] = (59.2239000, 39.8839800, 2) | |||
| coords["воронеж"] = (51.6720400, 39.1843000, 3) | |||
| coords["выездная"] = (0, 0, 0) | |||
| coords["жуковский"] = (55.5952800, 38.1202800, 1) | |||
| coords["интернет-магазин"] = (0, 0, 0) | |||
| coords["казань"] = (55.7887400, 49.1221400, 4) | |||
| coords["калуга"] = (54.5293000, 36.2754200, 4) | |||
| coords["коломна"] = (55.0794400, 38.7783300, 4) | |||
| coords["красноярск"] = (56.0183900, 92.8671700, 4) | |||
| coords["курск"] = (51.7373300, 36.1873500, 3) | |||
| coords["москва"] = (55.7522200, 37.6155600, 1) | |||
| coords["мытищи"] = (55.9116300, 37.7307600, 1) | |||
| coords["н.новгород"] = (56.3286700, 44.0020500, 4) | |||
| coords["новосибирск"] = (55.0415000, 82.9346000, 4) | |||
| coords["омск"] = (54.9924400, 73.3685900, 4) | |||
| coords["ростовнадону"] = (47.2313500, 39.7232800, 3) | |||
| coords["спб"] = (59.9386300, 30.3141300, 2) | |||
| coords["самара"] = (53.2000700, 50.1500000, 4) | |||
| coords["сергиев"] = (56.3000000, 38.1333300, 4) | |||
| coords["сургут"] = (61.2500000, 73.4166700, 4) | |||
| coords["томск"] = (56.4977100, 84.9743700, 4) | |||
| coords["тюмень"] = (57.1522200, 65.5272200, 4) | |||
| coords["уфа"] = (54.7430600, 55.9677900, 4) | |||
| coords["химки"] = (55.8970400, 37.4296900, 1) | |||
| coords["цифровой"] = (0, 0, 0) | |||
| coords["чехов"] = (55.1477000, 37.4772800, 4) | |||
| coords["ярославль"] = (57.6298700, 39.8736800, 2) | |||
| shops["city_coord_1"] = shops["city"].apply(lambda x: coords[x][0]) | |||
| shops["city_coord_2"] = shops["city"].apply(lambda x: coords[x][1]) | |||
| shops["country_part"] = shops["city"].apply(lambda x: coords[x][2]) | |||
| shops = shops[["shop_id", "city_code", "city_coord_1", "city_coord_2", "country_part"]] | |||
| df = pd.merge(df, shops, on=["shop_id"], how="left") | |||
| # process items category name | |||
| map_dict = { | |||
| "Чистые носители (штучные)": "Чистые носители", | |||
| "Чистые носители (шпиль)": "Чистые носители", | |||
| "PC ": "Аксессуары", | |||
| "Служебные": "Служебные ", | |||
| } | |||
| items = pd.merge(items, item_cats, on="item_category_id") | |||
| items["item_category"] = items["item_category_name"].apply(lambda x: x.split("-")[0]) | |||
| items["item_category"] = items["item_category"].apply(lambda x: map_dict[x] if x in map_dict.keys() else x) | |||
| items["item_category_common"] = LabelEncoder().fit_transform(items["item_category"]) | |||
| items["item_category_code"] = LabelEncoder().fit_transform(items["item_category_name"]) | |||
| items = items[["item_id", "item_category_common", "item_category_code"]] | |||
| df = pd.merge(df, items, on=["item_id"], how="left") | |||
| # Weekends count / number of days in a month | |||
| def count_days(date_block_num): | |||
| year = 2013 + date_block_num // 12 | |||
| month = 1 + date_block_num % 12 | |||
| weeknd_count = len([1 for i in calendar.monthcalendar(year, month) if i[6] != 0]) | |||
| days_in_month = calendar.monthrange(year, month)[1] | |||
| return weeknd_count, days_in_month, month | |||
| map_dict = {i: count_days(i) for i in range(35)} | |||
| df["weeknd_count"] = df["date_block_num"].apply(lambda x: map_dict[x][0]) | |||
| df["days_in_month"] = df["date_block_num"].apply(lambda x: map_dict[x][1]) | |||
| # Interation features: Item is new / Item was bought in this shop before | |||
| first_item_block = df.groupby(["item_id"])["date_block_num"].min().reset_index() | |||
| first_item_block["item_first_interaction"] = 1 | |||
| first_shop_item_buy_block = ( | |||
| df[df["date_block_num"] > 0].groupby(["shop_id", "item_id"])["date_block_num"].min().reset_index() | |||
| ) | |||
| first_shop_item_buy_block["first_date_block_num"] = first_shop_item_buy_block["date_block_num"] | |||
| df = pd.merge( | |||
| df, | |||
| first_item_block[["item_id", "date_block_num", "item_first_interaction"]], | |||
| on=["item_id", "date_block_num"], | |||
| how="left", | |||
| ) | |||
| df = pd.merge( | |||
| df, | |||
| first_shop_item_buy_block[["item_id", "shop_id", "first_date_block_num"]], | |||
| on=["item_id", "shop_id"], | |||
| how="left", | |||
| ) | |||
| df["first_date_block_num"].fillna(100, inplace=True) | |||
| df["shop_item_sold_before"] = (df["first_date_block_num"] < df["date_block_num"]).astype("int8") | |||
| df.drop(["first_date_block_num"], axis=1, inplace=True) | |||
| df["item_first_interaction"].fillna(0, inplace=True) | |||
| df["shop_item_sold_before"].fillna(0, inplace=True) | |||
| df["item_first_interaction"] = df["item_first_interaction"].astype("int8") | |||
| df["shop_item_sold_before"] = df["shop_item_sold_before"].astype("int8") | |||
def lag_feature(df, lags, col):
    """Add lagged copies of ``col`` to ``df``.

    For every lag ``i`` in ``lags`` a float32 column ``"<col>_lag_<i>"`` is
    merged in, holding the value of ``col`` observed ``i`` month-blocks
    earlier for the same (shop_id, item_id) pair; rows without history
    receive NaN.
    """
    keys = ["date_block_num", "shop_id", "item_id"]
    base = df[keys + [col]]
    for lag in lags:
        lag_col = f"{col}_lag_{lag}"
        shifted = base.copy()
        shifted.columns = keys + [lag_col]
        # Moving the block index forward aligns past values with current rows.
        shifted["date_block_num"] += lag
        df = pd.merge(df, shifted, on=keys, how="left")
        df[lag_col] = df[lag_col].astype("float32")
    return df
# 1-3 month lags of the target itself.
df = lag_feature(df, [1, 2, 3], "item_cnt_month")

# --- Price features: per-shop average price vs. global item average price ---
index_cols = ["shop_id", "item_id", "date_block_num"]
group = (
    train.groupby(index_cols)["item_price"]
    .mean()
    .reset_index()
    .rename(columns={"item_price": "avg_shop_price"}, errors="raise")
)
df = pd.merge(df, group, on=index_cols, how="left")
df["avg_shop_price"] = df["avg_shop_price"].fillna(0).astype(np.float32)

index_cols = ["item_id", "date_block_num"]
group = (
    train.groupby(["date_block_num", "item_id"])["item_price"]
    .mean()
    .reset_index()
    .rename(columns={"item_price": "avg_item_price"}, errors="raise")
)
df = pd.merge(df, group, on=index_cols, how="left")
df["avg_item_price"] = df["avg_item_price"].fillna(0).astype(np.float32)

# Relative deviation of the shop's price from the item's global average.
# NOTE(review): when avg_item_price == 0 this produces +/-inf (0/0 -> NaN is
# filled below, x/0 is not) -- confirm downstream models tolerate inf.
df["item_shop_price_avg"] = (df["avg_shop_price"] - df["avg_item_price"]) / df["avg_item_price"]
# Column-level `fillna(..., inplace=True)` is deprecated chained assignment
# under pandas copy-on-write; assign the result back instead.
df["item_shop_price_avg"] = df["item_shop_price_avg"].fillna(0)
df = lag_feature(df, [1, 2, 3], "item_shop_price_avg")
df.drop(["avg_shop_price", "avg_item_price", "item_shop_price_avg"], axis=1, inplace=True)

# --- Target encodings (mean item_cnt_month), kept only as 1-3 month lags so
# the current month's target never leaks into its own features ---

# Per (month, item).
item_id_target_mean = (
    df.groupby(["date_block_num", "item_id"])["item_cnt_month"]
    .mean()
    .reset_index()
    .rename(columns={"item_cnt_month": "item_target_enc"}, errors="raise")
)
df = pd.merge(df, item_id_target_mean, on=["date_block_num", "item_id"], how="left")
df["item_target_enc"] = df["item_target_enc"].fillna(0).astype(np.float32)
df = lag_feature(df, [1, 2, 3], "item_target_enc")
df.drop(["item_target_enc"], axis=1, inplace=True)

# Per (month, item, city).
item_id_target_mean = (
    df.groupby(["date_block_num", "item_id", "city_code"])["item_cnt_month"]
    .mean()
    .reset_index()
    .rename(columns={"item_cnt_month": "item_loc_target_enc"}, errors="raise")
)
df = pd.merge(df, item_id_target_mean, on=["date_block_num", "item_id", "city_code"], how="left")
df["item_loc_target_enc"] = df["item_loc_target_enc"].fillna(0).astype(np.float32)
df = lag_feature(df, [1, 2, 3], "item_loc_target_enc")
df.drop(["item_loc_target_enc"], axis=1, inplace=True)

# Per (month, item, shop).
item_id_target_mean = (
    df.groupby(["date_block_num", "item_id", "shop_id"])["item_cnt_month"]
    .mean()
    .reset_index()
    .rename(columns={"item_cnt_month": "item_shop_target_enc"}, errors="raise")
)
df = pd.merge(df, item_id_target_mean, on=["date_block_num", "item_id", "shop_id"], how="left")
df["item_shop_target_enc"] = df["item_shop_target_enc"].fillna(0).astype(np.float32)
df = lag_feature(df, [1, 2, 3], "item_shop_target_enc")
df.drop(["item_shop_target_enc"], axis=1, inplace=True)

# For new items: average category sales (first-interaction rows only).
item_id_target_mean = (
    df[df["item_first_interaction"] == 1]
    .groupby(["date_block_num", "item_category_code"])["item_cnt_month"]
    .mean()
    .reset_index()
    .rename(columns={"item_cnt_month": "new_item_cat_avg"}, errors="raise")
)
df = pd.merge(df, item_id_target_mean, on=["date_block_num", "item_category_code"], how="left")
df["new_item_cat_avg"] = df["new_item_cat_avg"].fillna(0).astype(np.float32)
df = lag_feature(df, [1, 2, 3], "new_item_cat_avg")
df.drop(["new_item_cat_avg"], axis=1, inplace=True)

# For new items: average category sales within the individual shop.
item_id_target_mean = (
    df[df["item_first_interaction"] == 1]
    .groupby(["date_block_num", "item_category_code", "shop_id"])["item_cnt_month"]
    .mean()
    .reset_index()
    .rename(columns={"item_cnt_month": "new_item_shop_cat_avg"}, errors="raise")
)
df = pd.merge(df, item_id_target_mean, on=["date_block_num", "item_category_code", "shop_id"], how="left")
df["new_item_shop_cat_avg"] = df["new_item_shop_cat_avg"].fillna(0).astype(np.float32)
df = lag_feature(df, [1, 2, 3], "new_item_shop_cat_avg")
df.drop(["new_item_shop_cat_avg"], axis=1, inplace=True)
def lag_feature_adv(df, lags, col):
    """Like ``lag_feature`` but joins each row to a *neighbouring* item.

    For every lag ``i`` in ``lags`` a float32 column ``"<col>_lag_<i>_adv"``
    is added: because the shifted frame decrements ``item_id``, each row
    receives the value of ``col`` observed ``i`` blocks earlier for item
    ``item_id + 1`` in the same shop; unmatched rows receive NaN.
    """
    keys = ["date_block_num", "shop_id", "item_id"]
    base = df[keys + [col]]
    for lag in lags:
        adv_col = f"{col}_lag_{lag}_adv"
        shifted = base.copy()
        shifted.columns = keys + [adv_col]
        shifted["date_block_num"] += lag
        # Decrementing item_id makes item k's history land on item k-1's rows.
        shifted["item_id"] -= 1
        df = pd.merge(df, shifted, on=keys, how="left")
        df[adv_col] = df[adv_col].astype("float32")
    return df
# Neighbouring-item lags of the target.
df = lag_feature_adv(df, [1, 2, 3], "item_cnt_month")
# df.fillna(0, inplace=True)

# Drop the first months (their 3-month lags are necessarily empty) and the
# Kaggle submission ID column if present.
df = df[(df["date_block_num"] > 2)]
df.drop(["ID"], axis=1, inplace=True, errors="ignore")
print(df.shape)
print(df.columns)
print(df.head(10))

# Global per-column means, used as the last-resort fill value below.
# NOTE(review): assumes every remaining column is numeric -- confirm, since
# .mean() on a non-numeric column raises in recent pandas.
fill_dict = {}
for col in df.columns:
    fill_dict[col] = df[col].mean()

# NOTE(review): grouping by the list ["shop_id"] makes newer pandas yield
# 1-tuples as group keys, which would break the "Shop{:0>2d}" formatting
# below -- confirm the pinned pandas version, or group by the bare string.
group_df = df.groupby(["shop_id"])
for shop_id, shop_df in group_df:
    # Remove data of date_block_num == 34, i.e. 2015.11: it is the test set
    # of the competition. `.copy()` detaches the slice so the column
    # assignments below do not hit SettingWithCopy on the groupby view.
    shop_df = shop_df[shop_df.date_block_num <= 33].copy()

    # Fill nulls in three passes: per-item forward/backward fill, then the
    # per-shop column mean, then the global column mean.
    cols = shop_df.isnull().any()
    idx = list(cols[cols.values].index)
    # `fillna(method=...)` is deprecated; .ffill()/.bfill() behave identically.
    shop_df[idx] = shop_df.groupby("item_id", sort=False)[idx].apply(
        lambda x: x.ffill().bfill()
    )
    shop_df[idx] = shop_df[idx].fillna(shop_df[idx].mean())
    for col in idx:
        shop_df[col] = shop_df[col].fillna(fill_dict[col])

    # Min-max scale everything except identifiers, coordinates and the target.
    drop_fea_list = [
        "shop_id",
        "city_code",
        "city_coord_1",
        "city_coord_2",
        "country_part",
        "item_cnt_month",
        "date_block_num",
    ]
    fea_list = [col for col in shop_df.columns if col not in drop_fea_list]
    mms = MinMaxScaler()
    shop_df[fea_list] = mms.fit_transform(shop_df[fea_list])
    shop_df = shop_df[fea_list + ["item_cnt_month", "date_block_num"]]

    # Split into train/val by month block, backing the boundary off until
    # both sides are non-empty (some shops have very short histories).
    date_split = 29
    split = False
    while split is False:
        df1 = shop_df[shop_df["date_block_num"] <= date_split]
        df2 = shop_df[shop_df["date_block_num"] > date_split]
        if df2.shape[0] > 0 and df1.shape[0] > 0:
            split = True
        else:
            date_split -= 1
            if date_split < 0:
                break
    if split is True:
        print("ShopID:{}, split block:{}".format(shop_id, date_split))
        print(df1.shape, df2.shape)
        # save train csv
        fpath = os.path.join(pfs_split_dir, "Shop{:0>2d}-train.csv".format(shop_id))
        df1.to_csv(fpath, index=False)
        # save val csv
        fpath = os.path.join(pfs_split_dir, "Shop{:0>2d}-val.csv".format(shop_id))
        df2.to_csv(fpath, index=False)
| @@ -1,90 +0,0 @@ | |||
| import hashlib | |||
| import requests | |||
| import os | |||
| import random | |||
| import json | |||
| import time | |||
| from tqdm import tqdm | |||
# Account credentials for the learnware server.
# SECURITY NOTE: a real e-mail and a hard-coded password were committed here;
# prefer supplying them through environment variables. The original values are
# kept as fallbacks so existing behavior is unchanged when the variables are
# unset.
email = os.environ.get("LEARNWARE_EMAIL", "liujd@lamda.nju.edu.cn")
password = hashlib.md5(os.environ.get("LEARNWARE_PASSWORD", "liujdlamda").encode()).hexdigest()

# Server endpoints.
login_url = "http://210.28.134.201:8089/auth/login"
submit_url = "http://210.28.134.201:8089/user/add_learnware"

# Vocabularies for the semantic-specification fields (only a subset is
# actually used by main() below).
all_data_type = ["Table", "Image", "Video", "Text", "Audio"]
all_task_type = [
    "Classification",
    "Regression",
    "Clustering",
    "Feature Extraction",
    "Generation",
    "Segmentation",
    "Object Detection",
]
all_device_type = ["CPU", "GPU"]
all_scenario = [
    "Business",
    "Financial",
    "Health",
    "Politics",
    "Computer",
    "Internet",
    "Traffic",
    "Nature",
    "Fashion",
    "Industry",
    "Agriculture",
    "Education",
    "Entertainment",
    "Architecture",
]
# #################################
# The part above needs no changes. #
# #################################
def main():
    """Log in to the learnware server and upload every file found in
    ./learnware_pool as a learnware with a fixed semantic specification.

    Raises
    ------
    AssertionError
        If the server rejects any upload (response ``code`` != 0).
    """
    session = requests.Session()
    # NOTE(review): the login response is not checked; a failed login only
    # surfaces later as an upload error -- consider validating it here.
    res = session.post(login_url, json={"email": email, "password": password})

    # Directory holding the learnware archives to upload.
    pool_dir = os.path.join(os.path.abspath("."), "learnware_pool")
    for learnware in os.listdir(pool_dir):
        # Unique name: PFS_Shop<NN>_<timestamp>, with NN parsed from the
        # file name (e.g. "xxx_07.zip" -> 07).
        name = "PFS_Shop" + "%02d" % int(learnware.split(".")[0].split("_")[1])
        name = name + "_" + time.strftime("%Y%m%d%H%M%S", time.localtime())
        semantic_specification = {
            "Data": {"Values": ["Table"], "Type": "Class"},
            "Library": {"Values": ["Scikit-learn"], "Type": "Class"},
            "Task": {"Values": ["Regression"], "Type": "Class"},
            "Scenario": {"Values": ["Business"], "Type": "Tag"},
            "Description": {
                "Values": "A sales-forecasting model from Predict Future Sales Competition on Kaggle",
                "Type": "String",
            },
            "Name": {"Values": name, "Type": "String"},
            "License": {"Values": ["MIT"], "Type": "Class"},
        }
        # `with` guarantees the archive handle is closed even if the POST
        # raises (the original opened it inline and leaked one descriptor
        # per upload). The unused random choices of data/task/device/scenario
        # from the original were dead code and have been dropped.
        with open(os.path.join(pool_dir, learnware), "rb") as learnware_file:
            res = session.post(
                submit_url,
                data={
                    "semantic_specification": json.dumps(semantic_specification),
                },
                files={"learnware_file": learnware_file},
            )
        assert json.loads(res.text)["code"] == 0, "Upload error"


if __name__ == "__main__":
    main()
| @@ -39,13 +39,14 @@ python workflow.py labeled_text_example | |||
| The table below presents the mean accuracy of search and reuse across all users: | |||
| | Metric | Value | | |||
| |--------------------------------------|---------------------| | |||
| | Mean in Market (Single) | 0.507 | | |||
| | Best in Market (Single) | 0.859 | | |||
| | Top-1 Reuse (Single) | 0.846 | | |||
| | Job Selector Reuse (Multiple) | 0.845 | | |||
| | Average Ensemble Reuse (Multiple) | 0.862 | | |||
| | Setting | Accuracy | | |||
| |---------------------------------------|---------------------| | |||
| | Mean in Market (Single) | 0.507 | | |||
| | Best in Market (Single) | 0.859 | | |||
| | Top-1 Reuse (Single) | 0.846 | | |||
| | Job Selector Reuse (Multiple) | 0.845 | | |||
| | Average Ensemble Reuse (Multiple) | 0.862 | | |||
| ### ``labeled_text_example``: | |||
| @@ -64,7 +64,7 @@ class TextDatasetWorkflow: | |||
| plt.xlabel("Amout of Labeled User Data", fontsize=14) | |||
| plt.ylabel("1 - Accuracy", fontsize=14) | |||
| plt.title(f"Results on Text Experimental Scenario", fontsize=16) | |||
| plt.title("Results on Text Experimental Scenario", fontsize=16) | |||
| plt.legend(fontsize=14) | |||
| plt.tight_layout() | |||
| plt.savefig(os.path.join(self.fig_path, "text_labeled_curves.svg"), bbox_inches="tight", dpi=700) | |||
| @@ -76,7 +76,7 @@ class TextDatasetWorkflow: | |||
| self.user_semantic = client.get_semantic_specification(self.text_benchmark.learnware_ids[0]) | |||
| self.user_semantic["Name"]["Values"] = "" | |||
| if len(self.text_market) == 0 or rebuild == True: | |||
| if len(self.text_market) == 0 or rebuild is True: | |||
| for learnware_id in self.text_benchmark.learnware_ids: | |||
| with tempfile.TemporaryDirectory(prefix="text_benchmark_") as tempdir: | |||
| zip_path = os.path.join(tempdir, f"{learnware_id}.zip") | |||
| @@ -86,7 +86,7 @@ class TextDatasetWorkflow: | |||
| client.download_learnware(learnware_id, zip_path) | |||
| self.text_market.add_learnware(zip_path, semantic_spec) | |||
| break | |||
| except: | |||
| except Exception: | |||
| time.sleep(1) | |||
| continue | |||
| @@ -103,7 +103,7 @@ class TextDatasetWorkflow: | |||
| ensemble_score_list = [] | |||
| all_learnwares = self.text_market.get_learnwares() | |||
| for i in range(self.text_benchmark.user_num): | |||
| for i in range(text_benchmark_config.user_num): | |||
| user_data, user_label = self.text_benchmark.get_test_data(user_ids=i) | |||
| user_stat_spec = RKMETextSpecification() | |||
| @@ -183,19 +183,19 @@ class TextDatasetWorkflow: | |||
| % (np.mean(ensemble_score_list), np.std(ensemble_score_list)) | |||
| ) | |||
| def labeled_text_example(self, rebuild=False, train_flag=True): | |||
| def labeled_text_example(self, rebuild=False, skip_test=False): | |||
| self.n_labeled_list = [100, 200, 500, 1000, 2000, 4000] | |||
| self.repeated_list = [10, 10, 10, 3, 3, 3] | |||
| self.root_path = os.path.dirname(os.path.abspath(__file__)) | |||
| self.fig_path = os.path.join(self.root_path, "figs") | |||
| self.curve_path = os.path.join(self.root_path, "curves") | |||
| self._prepare_market(rebuild) | |||
| if train_flag: | |||
| if not skip_test: | |||
| self._prepare_market(rebuild) | |||
| os.makedirs(self.fig_path, exist_ok=True) | |||
| os.makedirs(self.curve_path, exist_ok=True) | |||
| for i in range(self.text_benchmark.user_num): | |||
| for i in range(text_benchmark_config.user_num): | |||
| user_model_score_mat = [] | |||
| pruning_score_mat = [] | |||
| single_score_mat = [] | |||
| @@ -268,7 +268,7 @@ class TextDatasetWorkflow: | |||
| pruning_curves_data, user_model_curves_data = [], [] | |||
| total_user_model_score_mat = [np.zeros(self.repeated_list[i]) for i in range(len(self.n_labeled_list))] | |||
| total_pruning_score_mat = [np.zeros(self.repeated_list[i]) for i in range(len(self.n_labeled_list))] | |||
| for user_idx in range(self.text_benchmark.user_num): | |||
| for user_idx in range(text_benchmark_config.user_num): | |||
| with open(os.path.join(self.curve_path, f"curve{str(user_idx)}.pkl"), "rb") as f: | |||
| user_curves_data = pickle.load(f) | |||
| (single_score_mat, user_model_score_mat, pruning_score_mat) = user_curves_data | |||
| @@ -278,8 +278,8 @@ class TextDatasetWorkflow: | |||
| total_pruning_score_mat[i] += 1 - np.array(pruning_score_mat[i]) | |||
| for i in range(len(self.n_labeled_list)): | |||
| total_user_model_score_mat[i] /= self.text_benchmark.user_num | |||
| total_pruning_score_mat[i] /= self.text_benchmark.user_num | |||
| total_user_model_score_mat[i] /= text_benchmark_config.user_num | |||
| total_pruning_score_mat[i] /= text_benchmark_config.user_num | |||
| user_model_curves_data.append( | |||
| (np.mean(total_user_model_score_mat[i]), np.std(total_user_model_score_mat[i])) | |||
| ) | |||
| @@ -1,7 +1,8 @@ | |||
| __version__ = "0.2.0.9" | |||
| import os | |||
| import json | |||
| import os | |||
| from .logger import get_module_logger | |||
| from .utils import is_torch_available, setup_seed | |||
| @@ -35,12 +36,12 @@ def init(verbose=True, **kwargs): | |||
| with open(config_file, "r") as fin_config: | |||
| C.update(**dict(json.load(fin_config))) | |||
| ## random seed | |||
| # random seed | |||
| deterministic = kwargs.get("deterministic", True) | |||
| if deterministic: | |||
| setup_seed(C.random_seed) | |||
| ## make dirs | |||
| # make dirs | |||
| mkdir = kwargs.get("mkdir", True) | |||
| if mkdir: | |||
| os.makedirs(C.root_path, exist_ok=True) | |||
| @@ -48,7 +49,7 @@ def init(verbose=True, **kwargs): | |||
| os.makedirs(C.stdout_path, exist_ok=True) | |||
| os.makedirs(C.cache_path, exist_ok=True) | |||
| ## ignore tensorflow warning | |||
| # ignore tensorflow warning | |||
| tf_loglevel = kwargs.get("tf_loglevel", "2") | |||
| os.environ["TF_CPP_MIN_LOG_LEVEL"] = tf_loglevel | |||
| @@ -1,21 +1,19 @@ | |||
| import atexit | |||
| import os | |||
| import docker | |||
| import pickle | |||
| import atexit | |||
| import tarfile | |||
| import tempfile | |||
| import shortuuid | |||
| from concurrent.futures import ThreadPoolExecutor | |||
| from typing import List, Optional, Union | |||
| import docker | |||
| import shortuuid | |||
| from typing import List, Union, Optional | |||
| from .utils import system_execute, install_environment, remove_enviroment | |||
| from .utils import install_environment, remove_enviroment, system_execute | |||
| from ..config import C | |||
| from ..learnware import Learnware | |||
| from ..model.base import BaseModel | |||
| from .package_utils import filter_nonexist_conda_packages_file, filter_nonexist_pip_packages_file | |||
| from ..logger import get_module_logger | |||
| from ..model.base import BaseModel | |||
| logger = get_module_logger(module_name="client_container") | |||
| @@ -224,7 +222,7 @@ class ModelDockerContainer(ModelContainer): | |||
| } | |||
| container = client.containers.run(**container_config) | |||
| logger.info(f"Docker container {container.id[:12]} is generated.") | |||
| try: | |||
| environment_cmd = [ | |||
| "pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple", | |||
| @@ -265,7 +263,7 @@ class ModelDockerContainer(ModelContainer): | |||
| if isinstance(docker_container, docker.models.containers.Container): | |||
| client = docker.from_env() | |||
| container_ids = [container.id for container in client.containers.list()] | |||
| if docker_container.id in container_ids: | |||
| docker_container.stop() | |||
| docker_container.remove() | |||
| @@ -521,7 +519,7 @@ class LearnwaresContainer: | |||
| except KeyboardInterrupt: | |||
| logger.warning("The KeyboardInterrupt is ignored when removing the container env!") | |||
| self._destroy_docker_container() | |||
| def __enter__(self): | |||
| if self.mode == "conda": | |||
| self.learnware_containers = [ | |||
| @@ -1,24 +1,23 @@ | |||
| import os | |||
| import uuid | |||
| import yaml | |||
| import json | |||
| import atexit | |||
| import zipfile | |||
| import hashlib | |||
| import requests | |||
| import json | |||
| import os | |||
| import tempfile | |||
| import uuid | |||
| import zipfile | |||
| from enum import Enum | |||
| from typing import List, Optional, Union | |||
| import requests | |||
| import yaml | |||
| from tqdm import tqdm | |||
| from typing import Union, List, Optional | |||
| from ..config import C | |||
| from .container import LearnwaresContainer | |||
| from ..market import BaseChecker | |||
| from ..specification import generate_semantic_spec | |||
| from ..logger import get_module_logger | |||
| from ..config import C | |||
| from ..learnware import get_learnware_from_dirpath | |||
| from ..market import BaseUserInfo | |||
| from ..logger import get_module_logger | |||
| from ..market import BaseChecker, BaseUserInfo | |||
| from ..specification import generate_semantic_spec | |||
| CHUNK_SIZE = 1024 * 1024 | |||
| logger = get_module_logger(module_name="LearnwareClient") | |||
| @@ -413,7 +412,7 @@ class LearnwareClient: | |||
| @staticmethod | |||
| def _check_stat_specification(learnware): | |||
| from ..market import EasyStatChecker, CondaChecker | |||
| from ..market import CondaChecker, EasyStatChecker | |||
| stat_checker = CondaChecker(inner_checker=EasyStatChecker()) | |||
| check_status, message = stat_checker(learnware) | |||
| @@ -1,14 +1,14 @@ | |||
| import json | |||
| import os | |||
| import re | |||
| import json | |||
| import yaml | |||
| import tempfile | |||
| import subprocess | |||
| from typing import List, Tuple | |||
| from . import utils | |||
| import tempfile | |||
| from concurrent.futures import ThreadPoolExecutor | |||
| from typing import List, Tuple | |||
| import yaml | |||
| from . import utils | |||
| from ..logger import get_module_logger | |||
| logger = get_module_logger("package_utils") | |||
| @@ -86,7 +86,7 @@ def filter_nonexist_pip_packages(packages: list) -> Tuple[List[str], List[str]]: | |||
| pass | |||
| except Exception as err: | |||
| logger.error(err) | |||
| return None | |||
| exist_packages = [] | |||
| @@ -101,7 +101,7 @@ def filter_nonexist_pip_packages(packages: list) -> Tuple[List[str], List[str]]: | |||
| exist_packages.append(result) | |||
| else: | |||
| nonexist_packages.append(package) | |||
| if len(nonexist_packages) > 0: | |||
| logger.info(f"Filtered out {len(nonexist_packages)} non-exist pip packages.") | |||
| return exist_packages, nonexist_packages | |||
| @@ -1,6 +1,6 @@ | |||
| import argparse | |||
| from learnware.client.utils import install_environment | |||
| from learnware.client.utils import install_environment | |||
| if __name__ == "__main__": | |||
| parser = argparse.ArgumentParser() | |||
| @@ -1,6 +1,7 @@ | |||
| import sys | |||
| import pickle | |||
| import argparse | |||
| import pickle | |||
| import sys | |||
| from learnware.utils import get_module_by_module_path | |||
| @@ -1,10 +1,9 @@ | |||
| import os | |||
| import zipfile | |||
| import tempfile | |||
| import subprocess | |||
| import tempfile | |||
| from ..logger import get_module_logger | |||
| from .package_utils import filter_nonexist_conda_packages_file, filter_nonexist_pip_packages_file | |||
| from ..logger import get_module_logger | |||
| logger = get_module_logger(module_name="client_utils") | |||
| @@ -22,14 +21,15 @@ def system_execute(args, timeout=None, env=None, stdout=subprocess.DEVNULL, stde | |||
| errmsg = err.stderr.decode() | |||
| logger.warning(f"System Execute Error: {errmsg}") | |||
| raise err | |||
| return com_process | |||
| def remove_enviroment(conda_env): | |||
| system_execute(args=["conda", "env", "remove", "-n", f"{conda_env}"]) | |||
| def install_environment(learnware_dirpath, conda_env): | |||
| def install_environment(learnware_dirpath, conda_env, conda_prefix=None): | |||
| """Install environment of a learnware | |||
| Parameters | |||
| @@ -38,12 +38,21 @@ def install_environment(learnware_dirpath, conda_env): | |||
| Path of the learnware folder | |||
| conda_env : str | |||
| a new conda environment will be created with the given name; | |||
| conda_prefix: str | |||
| install env in a specific location, not default env path; | |||
| Raises | |||
| ------ | |||
| Exception | |||
| Lack of the environment configuration file. | |||
| """ | |||
| if conda_prefix is not None: | |||
| args_location = ["--prefix", conda_prefix] | |||
| conda_env = conda_prefix | |||
| else: | |||
| args_location = ["--name", conda_env] | |||
| pass | |||
| with tempfile.TemporaryDirectory(prefix="learnware_") as tempdir: | |||
| logger.info(f"learnware_dir namelist: {os.listdir(learnware_dirpath)}") | |||
| if "environment.yaml" in os.listdir(learnware_dirpath): | |||
| @@ -53,7 +62,7 @@ def install_environment(learnware_dirpath, conda_env): | |||
| filter_nonexist_conda_packages_file(yaml_file=yaml_path, output_yaml_file=yaml_path_filter) | |||
| # create environment | |||
| logger.info(f"create conda env [{conda_env}] according to .yaml file") | |||
| system_execute(args=["conda", "env", "create", "--name", f"{conda_env}", "--file", f"{yaml_path_filter}"]) | |||
| system_execute(args=["conda", "env", "create"] + args_location + ["--file", f"{yaml_path_filter}"]) | |||
| elif "requirements.txt" in os.listdir(learnware_dirpath): | |||
| requirements_path: str = os.path.join(learnware_dirpath, "requirements.txt") | |||
| @@ -61,14 +70,15 @@ def install_environment(learnware_dirpath, conda_env): | |||
| logger.info(f"checking the available pip packages for {conda_env}") | |||
| filter_nonexist_pip_packages_file(requirements_file=requirements_path, output_file=requirements_path_filter) | |||
| logger.info(f"create empty conda env [{conda_env}]") | |||
| system_execute(args=["conda", "create", "-y", "--name", f"{conda_env}", "python=3.8"]) | |||
| system_execute(args=["conda", "create", "-y"] + args_location + ["python=3.8"]) | |||
| logger.info(f"install pip requirements for conda env [{conda_env}]") | |||
| system_execute( | |||
| args=[ | |||
| "conda", | |||
| "run", | |||
| "-n", | |||
| f"{conda_env}", | |||
| ] | |||
| + args_location | |||
| + [ | |||
| "--no-capture-output", | |||
| "python", | |||
| "-m", | |||
| @@ -86,8 +96,9 @@ def install_environment(learnware_dirpath, conda_env): | |||
| args=[ | |||
| "conda", | |||
| "run", | |||
| "-n", | |||
| f"{conda_env}", | |||
| ] | |||
| + args_location | |||
| + [ | |||
| "--no-capture-output", | |||
| "python", | |||
| "-m", | |||
| @@ -1,6 +1,6 @@ | |||
| import os | |||
| import copy | |||
| import logging | |||
| import os | |||
| from enum import Enum | |||
| @@ -1,19 +1,21 @@ | |||
| import os | |||
| import copy | |||
| from typing import Optional | |||
| import os | |||
| import traceback | |||
| from typing import Optional | |||
| from .base import Learnware | |||
| from .utils import get_stat_spec_from_config | |||
| from ..config import C | |||
| from ..logger import get_module_logger | |||
| from ..specification import Specification | |||
| from ..utils import read_yaml_to_dict | |||
| from ..logger import get_module_logger | |||
| from ..config import C | |||
| logger = get_module_logger("learnware.learnware") | |||
| def get_learnware_from_dirpath(id: str, semantic_spec: dict, learnware_dirpath, ignore_error=True) -> Optional[Learnware]: | |||
| def get_learnware_from_dirpath( | |||
| id: str, semantic_spec: dict, learnware_dirpath, ignore_error=True | |||
| ) -> Optional[Learnware]: | |||
| """Get the learnware object from dirpath, and provide the manage interface tor Learnware class | |||
| Parameters | |||
| @@ -46,11 +48,11 @@ def get_learnware_from_dirpath(id: str, semantic_spec: dict, learnware_dirpath, | |||
| } | |||
| try: | |||
| learnware_yaml_path = os.path.join(learnware_dirpath, C.learnware_folder_config["yaml_file"]) | |||
| assert os.path.exists(learnware_yaml_path), f"learnware.yaml is not found for learnware_{id}, please check the learnware folder or zipfile." | |||
| assert os.path.exists( | |||
| learnware_yaml_path | |||
| ), f"learnware.yaml is not found for learnware_{id}, please check the learnware folder or zipfile." | |||
| yaml_config = read_yaml_to_dict(learnware_yaml_path) | |||
| if "name" in yaml_config: | |||
| @@ -67,8 +69,10 @@ def get_learnware_from_dirpath(id: str, semantic_spec: dict, learnware_dirpath, | |||
| for _stat_spec in learnware_config["stat_specifications"]: | |||
| stat_spec = _stat_spec.copy() | |||
| stat_spec_path = os.path.join(learnware_dirpath, stat_spec["file_name"]) | |||
| assert os.path.exists(stat_spec_path), f"statistical specification file {stat_spec['file_name']} is not found for learnware_{id}, please check the learnware folder or zipfile." | |||
| assert os.path.exists( | |||
| stat_spec_path | |||
| ), f"statistical specification file {stat_spec['file_name']} is not found for learnware_{id}, please check the learnware folder or zipfile." | |||
| stat_spec["file_name"] = stat_spec_path | |||
| stat_spec_inst = get_stat_spec_from_config(stat_spec) | |||
| learnware_spec.update_stat_spec(**{stat_spec_inst.type: stat_spec_inst}) | |||
| @@ -1,19 +1,19 @@ | |||
| import os | |||
| import numpy as np | |||
| from typing import Union, List | |||
| import sys | |||
| from typing import Union | |||
| import numpy as np | |||
| from ..specification import Specification, BaseStatSpecification | |||
| from ..logger import get_module_logger | |||
| from ..model import BaseModel | |||
| from ..specification import BaseStatSpecification, Specification | |||
| from ..utils import get_module_by_module_path | |||
| from ..logger import get_module_logger | |||
| logger = get_module_logger("Learnware") | |||
| class Learnware: | |||
| """The learnware class, which is the basic components in learnware market | |||
| """ | |||
| """The learnware class, which is the basic components in learnware market""" | |||
| def __init__(self, id: str, model: Union[BaseModel, dict], specification: Specification, learnware_dirpath: str): | |||
| """The initialization method for learnware. | |||
| @@ -40,7 +40,7 @@ class Learnware: | |||
| dirpath: str | |||
| The path of the learnware directory | |||
| """ | |||
| self.id = id | |||
| self.model = model | |||
| self.specification = specification | |||
| @@ -1,4 +1,3 @@ | |||
| import copy | |||
| from typing import Union | |||
| from ..model import BaseModel | |||
| @@ -45,5 +44,5 @@ def get_stat_spec_from_config(stat_spec: dict) -> BaseStatSpecification: | |||
| f"Statistic specification must be type of BaseStatSpecification, not {BaseStatSpecification.__class__.__name__}" | |||
| ) | |||
| stat_spec_inst.load(stat_spec["file_name"]) | |||
| return stat_spec_inst | |||
| @@ -1,5 +1,5 @@ | |||
| import logging | |||
| from logging import Logger, handlers | |||
| from logging import Logger | |||
| from .config import C | |||
| @@ -1,9 +1,8 @@ | |||
| from .anchor import AnchoredUserInfo, AnchoredSearcher, AnchoredOrganizer | |||
| from .base import BaseUserInfo, LearnwareMarket, BaseChecker, BaseOrganizer, BaseSearcher | |||
| from .evolve_anchor import EvolvedAnchoredOrganizer | |||
| from .evolve import EvolvedOrganizer | |||
| from .anchor import AnchoredOrganizer, AnchoredSearcher, AnchoredUserInfo | |||
| from .base import BaseChecker, BaseOrganizer, BaseSearcher, BaseUserInfo, LearnwareMarket | |||
| from .classes import CondaChecker | |||
| from .easy import EasyOrganizer, EasySearcher, EasySemanticChecker, EasyStatChecker | |||
| from .evolve import EvolvedOrganizer | |||
| from .evolve_anchor import EvolvedAnchoredOrganizer | |||
| from .heterogeneous import HeteroMapTableOrganizer, HeteroSearcher | |||
| from .classes import CondaChecker | |||
| from .module import instantiate_learnware_market | |||
| @@ -1,8 +1,7 @@ | |||
| from .organizer import AnchoredOrganizer | |||
| from .user_info import AnchoredUserInfo | |||
| from ...utils import is_torch_available | |||
| from ...logger import get_module_logger | |||
| from ...utils import is_torch_available | |||
| logger = get_module_logger("market_anchor") | |||
| @@ -1,8 +1,8 @@ | |||
| from typing import Dict | |||
| from ..easy.organizer import EasyOrganizer | |||
| from ...logger import get_module_logger | |||
| from ...learnware import Learnware | |||
| from ...logger import get_module_logger | |||
| logger = get_module_logger("anchor_organizer") | |||
| @@ -44,7 +44,7 @@ class AnchoredOrganizer(EasyOrganizer): | |||
| Exception | |||
| Raise an excpetion when given anchor_id is NOT found in anchor_learnware_list | |||
| """ | |||
| if not anchor_id in self.anchor_learnware_list: | |||
| if anchor_id not in self.anchor_learnware_list: | |||
| raise Exception("Anchor learnware id:{} NOT Found!".format(anchor_id)) | |||
| self.anchor_learnware_list.pop(anchor_id) | |||
| @@ -1,9 +1,9 @@ | |||
| from typing import List, Tuple, Any | |||
| from typing import Any, List, Tuple | |||
| from .user_info import AnchoredUserInfo | |||
| from ..easy.searcher import EasySearcher | |||
| from ...logger import get_module_logger | |||
| from ...learnware import Learnware | |||
| from ...logger import get_module_logger | |||
| logger = get_module_logger("anchor_searcher") | |||
| @@ -1,4 +1,5 @@ | |||
| from typing import List, Any, Union | |||
| from typing import Any, List, Union | |||
| from ..base import BaseUserInfo | |||
| @@ -1,10 +1,11 @@ | |||
| from __future__ import annotations | |||
| import tempfile | |||
| import traceback | |||
| import zipfile | |||
| import tempfile | |||
| from typing import Tuple, Any, List, Union, Optional | |||
| from dataclasses import dataclass | |||
| from typing import Any, List, Optional, Tuple, Union | |||
| from ..learnware import Learnware, get_learnware_from_dirpath | |||
| from ..logger import get_module_logger | |||
| @@ -1,8 +1,9 @@ | |||
| import traceback | |||
| from typing import Tuple | |||
| from .base import BaseChecker | |||
| from ..learnware import Learnware | |||
| from ..client.container import LearnwaresContainer | |||
| from ..learnware import Learnware | |||
| from ..logger import get_module_logger | |||
| logger = get_module_logger("market_classes") | |||
| @@ -1,7 +1,6 @@ | |||
| from .organizer import EasyOrganizer | |||
| from ...utils import is_torch_available | |||
| from ...logger import get_module_logger | |||
| from ...utils import is_torch_available | |||
| logger = get_module_logger("market_easy") | |||
| @@ -11,5 +10,5 @@ if not is_torch_available(verbose=False): | |||
| EasyStatChecker = None | |||
| logger.error("EasySeacher and EasyChecker are not available because 'torch' is not installed!") | |||
| else: | |||
| from .searcher import EasySearcher, EasyStatSearcher, EasyFuzzSemanticSearcher, EasyExactSemanticSearcher | |||
| from .checker import EasySemanticChecker, EasyStatChecker | |||
| from .searcher import EasyExactSemanticSearcher, EasyFuzzSemanticSearcher, EasySearcher, EasyStatSearcher | |||
| @@ -1,10 +1,10 @@ | |||
| import traceback | |||
| import numpy as np | |||
| import torch | |||
| import random | |||
| import string | |||
| import traceback | |||
| import numpy as np | |||
| import torch | |||
| from ..base import BaseChecker | |||
| from ..utils import parse_specification_type | |||
| from ...config import C | |||
| @@ -1,9 +1,10 @@ | |||
| from sqlalchemy.ext.declarative import declarative_base | |||
| from sqlalchemy import create_engine, text | |||
| from sqlalchemy import Column, Text, String | |||
| import os | |||
| import json | |||
| import os | |||
| import traceback | |||
| from sqlalchemy import Column, String, Text, create_engine, text | |||
| from sqlalchemy.ext.declarative import declarative_base | |||
| from ...learnware import get_learnware_from_dirpath | |||
| from ...logger import get_module_logger | |||
| @@ -1,14 +1,13 @@ | |||
| import os | |||
| import copy | |||
| import zipfile | |||
| import os | |||
| import tempfile | |||
| import zipfile | |||
| from shutil import copyfile, rmtree | |||
| from typing import Tuple, List, Union, Dict | |||
| from typing import Dict, List, Tuple, Union | |||
| from .database_ops import DatabaseOperations | |||
| from ..base import BaseOrganizer, BaseChecker | |||
| from ..base import BaseChecker, BaseOrganizer | |||
| from ...config import C as conf | |||
| from ...logger import get_module_logger | |||
| from ...learnware import Learnware, get_learnware_from_dirpath | |||
| from ...logger import get_module_logger | |||
| @@ -95,34 +94,34 @@ class EasyOrganizer(BaseOrganizer): | |||
| new_learnware = get_learnware_from_dirpath( | |||
| id=learnware_id, semantic_spec=semantic_spec, learnware_dirpath=target_folder_dir | |||
| ) | |||
| except: | |||
| except Exception: | |||
| logger.warning("New learnware is not properly added!") | |||
| try: | |||
| os.remove(target_zip_dir) | |||
| rmtree(target_folder_dir) | |||
| except: | |||
| except Exception: | |||
| pass | |||
| return None, BaseChecker.INVALID_LEARNWARE | |||
| if new_learnware is None: | |||
| return None, BaseChecker.INVALID_LEARNWARE | |||
| learnwere_status = check_status if check_status is not None else BaseChecker.NONUSABLE_LEARNWARE | |||
| learnware_status = check_status if check_status is not None else BaseChecker.NONUSABLE_LEARNWARE | |||
| self.dbops.add_learnware( | |||
| id=learnware_id, | |||
| semantic_spec=semantic_spec, | |||
| zip_path=target_zip_dir, | |||
| folder_path=target_folder_dir, | |||
| use_flag=learnwere_status, | |||
| use_flag=learnware_status, | |||
| ) | |||
| self.learnware_list[learnware_id] = new_learnware | |||
| self.learnware_zip_list[learnware_id] = target_zip_dir | |||
| self.learnware_folder_list[learnware_id] = target_folder_dir | |||
| self.use_flags[learnware_id] = learnwere_status | |||
| self.use_flags[learnware_id] = learnware_status | |||
| self.count += 1 | |||
| return learnware_id, learnwere_status | |||
| return learnware_id, learnware_status | |||
| def delete_learnware(self, id: str) -> bool: | |||
| """Delete Learnware from market | |||
| @@ -138,7 +137,7 @@ class EasyOrganizer(BaseOrganizer): | |||
| True for successful operation. | |||
| False for id not found. | |||
| """ | |||
| if not id in self.learnware_list: | |||
| if id not in self.learnware_list: | |||
| logger.warning("Learnware id:'{}' NOT Found!".format(id)) | |||
| return False | |||
| @@ -254,7 +253,7 @@ class EasyOrganizer(BaseOrganizer): | |||
| else: | |||
| try: | |||
| return self.learnware_list[ids] | |||
| except: | |||
| except Exception: | |||
| logger.warning("Learnware ID '%s' NOT Found!" % (ids)) | |||
| return None | |||
| @@ -286,7 +285,7 @@ class EasyOrganizer(BaseOrganizer): | |||
| else: | |||
| try: | |||
| return self.learnware_zip_list[ids] | |||
| except: | |||
| except Exception: | |||
| logger.warning("Learnware ID '%s' NOT Found!" % (ids)) | |||
| return None | |||
| @@ -318,7 +317,7 @@ class EasyOrganizer(BaseOrganizer): | |||
| else: | |||
| try: | |||
| return self.learnware_folder_list[ids] | |||
| except: | |||
| except Exception: | |||
| logger.warning("Learnware ID '%s' NOT Found!" % (ids)) | |||
| return None | |||
| @@ -1,15 +1,16 @@ | |||
| import math | |||
| import torch | |||
| from typing import List, Optional, Tuple, Union | |||
| import numpy as np | |||
| import torch | |||
| from rapidfuzz import fuzz | |||
| from typing import Tuple, List, Union, Optional | |||
| from .organizer import EasyOrganizer | |||
| from ..base import BaseSearcher, BaseUserInfo, MultipleSearchItem, SearchResults, SingleSearchItem | |||
| from ..utils import parse_specification_type | |||
| from ..base import BaseUserInfo, BaseSearcher, SearchResults, SingleSearchItem, MultipleSearchItem | |||
| from ...learnware import Learnware | |||
| from ...specification import RKMETableSpecification, RKMEImageSpecification, RKMETextSpecification, rkme_solve_qp | |||
| from ...logger import get_module_logger | |||
| from ...specification import RKMEImageSpecification, RKMETableSpecification, RKMETextSpecification, rkme_solve_qp | |||
| logger = get_module_logger("easy_seacher") | |||
| @@ -278,7 +279,7 @@ class EasyStatSearcher(BaseSearcher): | |||
| learnware_num = len(learnware_list) | |||
| RKME_list = [learnware.specification.get_stat_spec_by_name(self.stat_spec_type) for learnware in learnware_list] | |||
| if type(intermediate_K) == np.ndarray: | |||
| if isinstance(intermediate_K, np.ndarray): | |||
| K = intermediate_K | |||
| else: | |||
| K = np.zeros((learnware_num, learnware_num)) | |||
| @@ -287,7 +288,7 @@ class EasyStatSearcher(BaseSearcher): | |||
| for j in range(i + 1, K.shape[0]): | |||
| K[i, j] = K[j, i] = RKME_list[i].inner_prod(RKME_list[j]) | |||
| if type(intermediate_C) == np.ndarray: | |||
| if isinstance(intermediate_C, np.ndarray): | |||
| C = intermediate_C | |||
| else: | |||
| C = np.zeros((learnware_num, 1)) | |||
| @@ -2,8 +2,8 @@ from typing import List | |||
| from ..easy.organizer import EasyOrganizer | |||
| from ...learnware import Learnware | |||
| from ...specification import BaseStatSpecification | |||
| from ...logger import get_module_logger | |||
| from ...specification import BaseStatSpecification | |||
| logger = get_module_logger("evolve_organizer") | |||
| @@ -1,7 +1,7 @@ | |||
| from typing import List | |||
| from ..evolve import EvolvedOrganizer | |||
| from ..anchor import AnchoredOrganizer, AnchoredUserInfo | |||
| from ..evolve import EvolvedOrganizer | |||
| from ...logger import get_module_logger | |||
| logger = get_module_logger("evolve_anchor_organizer") | |||
| @@ -1,5 +1,5 @@ | |||
| from ...utils import is_torch_available | |||
| from ...logger import get_module_logger | |||
| from ...utils import is_torch_available | |||
| logger = get_module_logger("market_hetero") | |||
| @@ -1,6 +1,5 @@ | |||
| import os | |||
| import traceback | |||
| import pandas as pd | |||
| from collections import defaultdict | |||
| from typing import List, Tuple, Union | |||
| @@ -14,7 +13,6 @@ from ....learnware import Learnware | |||
| from ....logger import get_module_logger | |||
| from ....specification import HeteroMapTableSpecification | |||
| logger = get_module_logger("hetero_map_table_organizer") | |||
| @@ -44,7 +42,7 @@ class HeteroMapTableOrganizer(EasyOrganizer): | |||
| for hetero_id in hetero_ids: | |||
| self._reload_learnware_hetero_spec(hetero_id) | |||
| else: | |||
| logger.warning(f"No market mapping to reload!") | |||
| logger.warning("No market mapping to reload!") | |||
| self.market_mapping = HeteroMap() | |||
| def reset(self, market_id, rebuild=False, auto_update=False, auto_update_limit=100, **training_args): | |||
| @@ -6,10 +6,10 @@ import torch | |||
| import torch.nn.functional as F | |||
| from torch import nn | |||
| from .....utils import allocate_cuda_idx, choose_device | |||
| from .....specification import HeteroMapTableSpecification, RKMETableSpecification | |||
| from .feature_extractor import CLSToken, FeatureProcessor, FeatureTokenizer | |||
| from .trainer import TransTabCollatorForCL, Trainer | |||
| from .trainer import Trainer, TransTabCollatorForCL | |||
| from .....specification import HeteroMapTableSpecification, RKMETableSpecification | |||
| from .....utils import allocate_cuda_idx, choose_device | |||
| class HeteroMap(nn.Module): | |||
| @@ -287,7 +287,7 @@ class HeteroMap(nn.Module): | |||
| # go through transformers, get the first cls embedding | |||
| encoder_output = self.encoder(**outputs) # bs, seqlen+1, hidden_dim | |||
| output_features = encoder_output[:, 0, :] | |||
| del inputs, outputs, encoder_output | |||
| torch.cuda.empty_cache() | |||
| @@ -1,5 +1,4 @@ | |||
| import math | |||
| import os | |||
| import time | |||
| from typing import Any, Callable, Dict, List | |||
| @@ -10,8 +9,8 @@ from torch import nn | |||
| from torch.utils.data import DataLoader, Dataset | |||
| from tqdm.autonotebook import trange | |||
| from .....logger import get_module_logger | |||
| from .feature_extractor import FeatureTokenizer | |||
| from .....logger import get_module_logger | |||
| logger = get_module_logger("hetero_mapping_trainer") | |||
| @@ -6,13 +6,16 @@ from ..easy import EasySearcher | |||
| from ..utils import parse_specification_type | |||
| from ...logger import get_module_logger | |||
| logger = get_module_logger("hetero_searcher") | |||
| class HeteroSearcher(EasySearcher): | |||
| def __call__( | |||
| self, user_info: BaseUserInfo, check_status: Optional[int] = None, max_search_num: int = 5, search_method: str = "greedy" | |||
| self, | |||
| user_info: BaseUserInfo, | |||
| check_status: Optional[int] = None, | |||
| max_search_num: int = 5, | |||
| search_method: str = "greedy", | |||
| ) -> SearchResults: | |||
| """Search learnwares based on user_info from learnwares with check_status. | |||
| Employs heterogeneous learnware search if specific requirements are met, otherwise resorts to homogeneous search methods. | |||
| @@ -1,4 +1,3 @@ | |||
| import traceback | |||
| from ...logger import get_module_logger | |||
| logger = get_module_logger("hetero_utils") | |||
| @@ -48,5 +47,5 @@ def is_hetero(stat_specs: dict, semantic_spec: dict, verbose=True) -> bool: | |||
| return True | |||
| except Exception as err: | |||
| if verbose: | |||
| logger.warning(f"Invalid heterogeneous search information provided.") | |||
| logger.warning("Invalid heterogeneous search information provided.") | |||
| return False | |||
| @@ -4,7 +4,9 @@ from .easy import EasyOrganizer, EasySearcher, EasySemanticChecker, EasyStatChec | |||
| from .heterogeneous import HeteroMapTableOrganizer, HeteroSearcher | |||
| def get_market_component(name, market_id, rebuild, organizer_kwargs=None, searcher_kwargs=None, checker_kwargs=None, conda_checker=False): | |||
| def get_market_component( | |||
| name, market_id, rebuild, organizer_kwargs=None, searcher_kwargs=None, checker_kwargs=None, conda_checker=False | |||
| ): | |||
| organizer_kwargs = {} if organizer_kwargs is None else organizer_kwargs | |||
| searcher_kwargs = {} if searcher_kwargs is None else searcher_kwargs | |||
| checker_kwargs = {} if checker_kwargs is None else checker_kwargs | |||
| @@ -12,7 +14,10 @@ def get_market_component(name, market_id, rebuild, organizer_kwargs=None, search | |||
| if name == "easy": | |||
| easy_organizer = EasyOrganizer(market_id=market_id, rebuild=rebuild) | |||
| easy_searcher = EasySearcher(organizer=easy_organizer) | |||
| easy_checker_list = [EasySemanticChecker(), EasyStatChecker() if conda_checker is False else CondaChecker(EasyStatChecker())] | |||
| easy_checker_list = [ | |||
| EasySemanticChecker(), | |||
| EasyStatChecker() if conda_checker is False else CondaChecker(EasyStatChecker()), | |||
| ] | |||
| market_component = { | |||
| "organizer": easy_organizer, | |||
| "searcher": easy_searcher, | |||
| @@ -21,7 +26,10 @@ def get_market_component(name, market_id, rebuild, organizer_kwargs=None, search | |||
| elif name == "hetero": | |||
| hetero_organizer = HeteroMapTableOrganizer(market_id=market_id, rebuild=rebuild, **organizer_kwargs) | |||
| hetero_searcher = HeteroSearcher(organizer=hetero_organizer) | |||
| hetero_checker_list = [EasySemanticChecker(), EasyStatChecker() if conda_checker is False else CondaChecker(EasyStatChecker())] | |||
| hetero_checker_list = [ | |||
| EasySemanticChecker(), | |||
| EasyStatChecker() if conda_checker is False else CondaChecker(EasyStatChecker()), | |||
| ] | |||
| market_component = { | |||
| "organizer": hetero_organizer, | |||
| @@ -44,7 +52,9 @@ def instantiate_learnware_market( | |||
| conda_checker: bool = False, | |||
| **kwargs, | |||
| ): | |||
| market_componets = get_market_component(name, market_id, rebuild, organizer_kwargs, searcher_kwargs, checker_kwargs, conda_checker) | |||
| market_componets = get_market_component( | |||
| name, market_id, rebuild, organizer_kwargs, searcher_kwargs, checker_kwargs, conda_checker | |||
| ) | |||
| return LearnwareMarket( | |||
| organizer=market_componets["organizer"], | |||
| searcher=market_componets["searcher"], | |||
| @@ -1,6 +1,3 @@ | |||
| from ..specification import Specification | |||
| def parse_specification_type( | |||
| stat_specs: dict, | |||
| spec_list=[ | |||
| @@ -1,5 +1,4 @@ | |||
| import numpy as np | |||
| from typing import Union | |||
| class BaseModel: | |||
| @@ -1,6 +1,5 @@ | |||
| from .base import BaseReuser | |||
| from .align import AlignLearnware | |||
| from .base import BaseReuser | |||
| from ..logger import get_module_logger | |||
| from ..utils import is_torch_available | |||
| @@ -18,7 +17,7 @@ if not is_torch_available(verbose=False): | |||
| ) | |||
| else: | |||
| from .averaging import AveragingReuser | |||
| from .ensemble_pruning import EnsemblePruningReuser | |||
| from .feature_augment import FeatureAugmentReuser | |||
| from .hetero import HeteroMapAlignLearnware, FeatureAlignLearnware | |||
| from .hetero import FeatureAlignLearnware, HeteroMapAlignLearnware | |||
| from .job_selector import JobSelectorReuser | |||
| from .ensemble_pruning import EnsemblePruningReuser | |||
| @@ -1,11 +1,11 @@ | |||
| import torch | |||
| from typing import List | |||
| import numpy as np | |||
| from typing import List, Union | |||
| import torch | |||
| from scipy.special import softmax | |||
| from ..learnware import Learnware | |||
| from .base import BaseReuser | |||
| from ..learnware import Learnware | |||
| from ..logger import get_module_logger | |||
| logger = get_module_logger("avaraging_reuser") | |||
| @@ -50,7 +50,7 @@ class AveragingReuser(BaseReuser): | |||
| if isinstance(pred_y, torch.Tensor): | |||
| pred_y = pred_y.detach().cpu().numpy() | |||
| if not isinstance(pred_y, np.ndarray): | |||
| raise TypeError(f"Model output must be np.ndarray or torch.Tensor") | |||
| raise TypeError("Model output must be np.ndarray or torch.Tensor") | |||
| if len(pred_y.shape) == 1: | |||
| pred_y = pred_y.reshape(-1, 1) | |||
| @@ -1,6 +1,7 @@ | |||
| import numpy as np | |||
| from typing import List | |||
| import numpy as np | |||
| from ..learnware import Learnware | |||
| from ..logger import get_module_logger | |||
| @@ -1,10 +1,11 @@ | |||
| import torch | |||
| import random | |||
| import numpy as np | |||
| from typing import List | |||
| from ..learnware import Learnware | |||
| import numpy as np | |||
| import torch | |||
| from .base import BaseReuser | |||
| from ..learnware import Learnware | |||
| from ..logger import get_module_logger | |||
| logger = get_module_logger("ensemble_pruning") | |||
| @@ -53,13 +54,14 @@ class EnsemblePruningReuser(BaseReuser): | |||
| np.ndarray | |||
| Binary one-dimensional vector, 1 indicates that the corresponding model is selected. | |||
| """ | |||
| try: | |||
| import geatpy as ea | |||
| except ModuleNotFoundError: | |||
| raise ModuleNotFoundError(f"EnsemblePruningReuser is not available because 'geatpy' is not installed! Please install it manually (only support python_version<3.11).") | |||
| raise ModuleNotFoundError( | |||
| "EnsemblePruningReuser is not available because 'geatpy' is not installed! Please install it manually (only support python_version<3.11)." | |||
| ) | |||
| model_num = v_predict.shape[1] | |||
| @ea.Problem.single | |||
| @@ -147,7 +149,9 @@ class EnsemblePruningReuser(BaseReuser): | |||
| try: | |||
| import geatpy as ea | |||
| except ModuleNotFoundError: | |||
| raise ModuleNotFoundError(f"EnsemblePruningReuser is not available because 'geatpy' is not installed! Please install it manually (only support python_version<3.11).") | |||
| raise ModuleNotFoundError( | |||
| "EnsemblePruningReuser is not available because 'geatpy' is not installed! Please install it manually (only support python_version<3.11)." | |||
| ) | |||
| if torch.is_tensor(v_true): | |||
| v_true = v_true.detach().cpu().numpy() | |||
| @@ -269,8 +273,10 @@ class EnsemblePruningReuser(BaseReuser): | |||
| try: | |||
| import geatpy as ea | |||
| except ModuleNotFoundError: | |||
| raise ModuleNotFoundError(f"EnsemblePruningReuser is not available because 'geatpy' is not installed! Please install it manually (only support python_version<3.11).") | |||
| raise ModuleNotFoundError( | |||
| "EnsemblePruningReuser is not available because 'geatpy' is not installed! Please install it manually (only support python_version<3.11)." | |||
| ) | |||
| model_num = v_predict.shape[1] | |||
| v_predict[v_predict == 0.0] = -1 | |||
| v_true[v_true == 0.0] = -1 | |||
| @@ -371,7 +377,7 @@ class EnsemblePruningReuser(BaseReuser): | |||
| if isinstance(pred_y, torch.Tensor): | |||
| pred_y = pred_y.detach().cpu().numpy() | |||
| if not isinstance(pred_y, np.ndarray): | |||
| raise TypeError(f"Model output must be np.ndarray or torch.Tensor") | |||
| raise TypeError("Model output must be np.ndarray or torch.Tensor") | |||
| if len(pred_y.shape) == 1: | |||
| pred_y = pred_y.reshape(-1, 1) | |||
| @@ -1,7 +1,8 @@ | |||
| import torch | |||
| import numpy as np | |||
| from typing import List | |||
| from sklearn.linear_model import RidgeCV, LogisticRegressionCV | |||
| import numpy as np | |||
| import torch | |||
| from sklearn.linear_model import LogisticRegressionCV, RidgeCV | |||
| from .base import BaseReuser | |||
| from .utils import fill_data_with_mean | |||
| @@ -102,7 +103,7 @@ class FeatureAugmentReuser(BaseReuser): | |||
| if isinstance(y_pred, torch.Tensor): | |||
| y_pred = y_pred.detach().cpu().numpy() | |||
| if not isinstance(y_pred, np.ndarray): | |||
| raise TypeError(f"Model output must be np.ndarray or torch.Tensor") | |||
| raise TypeError("Model output must be np.ndarray or torch.Tensor") | |||
| if len(y_pred.shape) == 1: | |||
| y_pred = y_pred.reshape(-1, 1) | |||
| y_preds.append(y_pred) | |||
| @@ -1,17 +1,18 @@ | |||
| import time | |||
| import torch | |||
| from typing import List | |||
| import numpy as np | |||
| import torch | |||
| import torch.nn as nn | |||
| from typing import List | |||
| from tqdm import trange | |||
| import torch.nn.functional as F | |||
| from tqdm import trange | |||
| from ..align import AlignLearnware | |||
| from ..utils import fill_data_with_mean | |||
| from ...utils import choose_device, allocate_cuda_idx | |||
| from ...logger import get_module_logger | |||
| from ...learnware import Learnware | |||
| from ...logger import get_module_logger | |||
| from ...specification import RKMETableSpecification | |||
| from ...utils import allocate_cuda_idx, choose_device | |||
| logger = get_module_logger("feature_align") | |||
| @@ -1,10 +1,10 @@ | |||
| import numpy as np | |||
| from .feature_align import FeatureAlignLearnware | |||
| from ..align import AlignLearnware | |||
| from ..feature_augment import FeatureAugmentReuser | |||
| from ...learnware import Learnware | |||
| from ...logger import get_module_logger | |||
| from .feature_align import FeatureAlignLearnware | |||
| from ..feature_augment import FeatureAugmentReuser | |||
| from ...specification import RKMETableSpecification | |||
| logger = get_module_logger("hetero_map_align") | |||
| @@ -1,15 +1,14 @@ | |||
| import torch | |||
| import numpy as np | |||
| from typing import List, Union | |||
| import numpy as np | |||
| import torch | |||
| from sklearn.metrics import accuracy_score | |||
| from .base import BaseReuser | |||
| from ..market.utils import parse_specification_type | |||
| from ..learnware import Learnware | |||
| from ..specification import RKMETableSpecification, RKMETextSpecification | |||
| from ..specification import generate_rkme_table_spec, rkme_solve_qp | |||
| from ..logger import get_module_logger | |||
| from ..market.utils import parse_specification_type | |||
| from ..specification import RKMETableSpecification, RKMETextSpecification, generate_rkme_table_spec, rkme_solve_qp | |||
| logger = get_module_logger("job_selector_reuse") | |||
| @@ -70,7 +69,7 @@ class JobSelectorReuser(BaseReuser): | |||
| # pred_y = pred_y.numpy() | |||
| if not isinstance(pred_y, np.ndarray): | |||
| raise TypeError(f"Model output must be np.ndarray or torch.Tensor") | |||
| raise TypeError("Model output must be np.ndarray or torch.Tensor") | |||
| pred_y_list.append(pred_y) | |||
| data_idxs_list.append(data_idx_list) | |||
| @@ -230,7 +229,7 @@ class JobSelectorReuser(BaseReuser): | |||
| from lightgbm import LGBMClassifier, early_stopping | |||
| except ModuleNotFoundError: | |||
| raise ModuleNotFoundError( | |||
| f"JobSelectorReuser is not available because 'lightgbm' is not installed! Please install it manually." | |||
| "JobSelectorReuser is not available because 'lightgbm' is not installed! Please install it manually." | |||
| ) | |||
| score_best = -1 | |||
| @@ -1,8 +1,10 @@ | |||
| import numpy as np | |||
| from ..logger import get_module_logger | |||
| logger = get_module_logger("reuse_utils") | |||
| def fill_data_with_mean(X: np.ndarray) -> np.ndarray: | |||
| """ | |||
| Fill missing data (NaN, Inf) in the input array with the mean of the column. | |||
| @@ -1,15 +1,13 @@ | |||
| from .base import Specification, BaseStatSpecification | |||
| from .base import BaseStatSpecification, Specification | |||
| from .regular import ( | |||
| RegularStatSpecification, | |||
| RKMEImageSpecification, | |||
| RKMEStatSpecification, | |||
| RKMETableSpecification, | |||
| RKMEImageSpecification, | |||
| RKMETextSpecification, | |||
| rkme_solve_qp, | |||
| ) | |||
| from .system import HeteroMapTableSpecification | |||
| from ..utils import is_torch_available | |||
| if not is_torch_available(verbose=False): | |||
| @@ -20,9 +18,9 @@ if not is_torch_available(verbose=False): | |||
| generate_semantic_spec = None | |||
| else: | |||
| from .module import ( | |||
| generate_stat_spec, | |||
| generate_rkme_table_spec, | |||
| generate_rkme_image_spec, | |||
| generate_rkme_table_spec, | |||
| generate_rkme_text_spec, | |||
| generate_semantic_spec, | |||
| generate_stat_spec, | |||
| ) | |||
| @@ -1,7 +1,5 @@ | |||
| from __future__ import annotations | |||
| import copy | |||
| import numpy as np | |||
| from typing import Dict | |||
| @@ -26,7 +24,7 @@ class BaseStatSpecification: | |||
| def dist(self, stat_spec: BaseStatSpecification): | |||
| raise NotImplementedError("dist is not implemented") | |||
| def save(self, filepath: str): | |||
| """Save the statistical specification into file in filepath | |||
| @@ -1,11 +1,11 @@ | |||
| import torch | |||
| from typing import List, Optional, Union | |||
| import numpy as np | |||
| import pandas as pd | |||
| from typing import Union, List, Optional | |||
| import torch | |||
| from .regular import RKMEImageSpecification, RKMETableSpecification, RKMETextSpecification | |||
| from .utils import convert_to_numpy | |||
| from .base import BaseStatSpecification | |||
| from .regular import RKMETableSpecification, RKMEImageSpecification, RKMETextSpecification | |||
| from ..config import C | |||
| @@ -1,4 +1,4 @@ | |||
| from .base import RegularStatSpecification | |||
| from .text import RKMETextSpecification | |||
| from .table import RKMETableSpecification, RKMEStatSpecification, rkme_solve_qp | |||
| from .image import RKMEImageSpecification | |||
| from .table import RKMEStatSpecification, RKMETableSpecification, rkme_solve_qp | |||
| from .text import RKMETextSpecification | |||
| @@ -1,11 +1,10 @@ | |||
| from ....utils import is_torch_available | |||
| from ....logger import get_module_logger | |||
| from ....utils import is_torch_available | |||
| logger = get_module_logger("regular_image_spec") | |||
| if not is_torch_available(verbose=False): | |||
| RKMEImageSpecification = None | |||
| logger.error(f"RKMEImageSpecification is not available because 'torch' is not installed!") | |||
| logger.error("RKMEImageSpecification is not available because 'torch' is not installed!") | |||
| else: | |||
| from .rkme import RKMEImageSpecification | |||