
Merge branch 'main' into table_benchmark

tags/v0.3.2
liuht 2 years ago
parent commit 9b40246fcc
100 changed files with 1683 additions and 3870 deletions
1. +7 -0 .flake8
2. +131 -0 CODE_OF_CONDUCT.md
3. +203 -0 LICENSE
4. +311 -152 README.md
5. +0 -0 docs/README_zh.md
6. +4 -0 docs/_static/img/learnware_framework.svg
7. BIN docs/_static/img/learnware_market.jpg
8. +1 -0 docs/_static/img/learnware_market.svg
9. +5 -2 docs/about/about.rst
10. +60 -15 docs/about/dev.rst
11. +3 -3 docs/components/learnware.rst
12. +4 -4 docs/components/market.rst
13. +4 -1 docs/components/spec.rst
14. +2 -2 docs/conf.py
15. +4 -2 docs/index.rst
16. +8 -1 docs/references/FAQ.rst
17. +106 -3 docs/references/api.rst
18. +0 -6 docs/references/beiming.rst
19. +30 -0 docs/references/beimingwu.rst
20. +43 -29 docs/start/exp.rst
21. +19 -26 docs/start/install.rst
22. +42 -49 docs/start/intro.rst
23. +78 -101 docs/start/quick.rst
24. +24 -2 docs/workflows/reuse.rst
25. +1 -1 docs/workflows/search.rst
26. +196 -102 docs/workflows/upload.rst
27. +18 -3 examples/dataset_image_workflow/README.md
28. +1 -5 examples/dataset_image_workflow/utils.py
29. +134 -128 examples/dataset_image_workflow/workflow.py
30. +0 -8 examples/dataset_m5_workflow/example.yaml
31. +0 -21 examples/dataset_m5_workflow/example_init.py
32. +0 -3 examples/dataset_m5_workflow/m5/README.md
33. +0 -65 examples/dataset_m5_workflow/m5/__init__.py
34. +0 -139 examples/dataset_m5_workflow/m5/config.py
35. +0 -338 examples/dataset_m5_workflow/m5/generate_data.py
36. +0 -452 examples/dataset_m5_workflow/m5/train.py
37. +0 -177 examples/dataset_m5_workflow/m5/utils.py
38. +0 -211 examples/dataset_m5_workflow/main.py
39. +0 -87 examples/dataset_m5_workflow/upload.py
40. +0 -8 examples/dataset_pfs_workflow/example.yaml
41. +0 -20 examples/dataset_pfs_workflow/example_init.py
42. +0 -208 examples/dataset_pfs_workflow/main.py
43. +0 -48 examples/dataset_pfs_workflow/pfs/README.md
44. +0 -77 examples/dataset_pfs_workflow/pfs/__init__.py
45. +0 -272 examples/dataset_pfs_workflow/pfs/config.py
46. +0 -21 examples/dataset_pfs_workflow/pfs/paths.py
47. +0 -384 examples/dataset_pfs_workflow/pfs/pfs_cross_transfer.py
48. +0 -384 examples/dataset_pfs_workflow/pfs/split_data.py
49. +0 -90 examples/dataset_pfs_workflow/upload.py
50. +8 -7 examples/dataset_text_workflow/README.md
51. +11 -11 examples/dataset_text_workflow/workflow.py
52. +5 -4 learnware/__init__.py
53. +10 -12 learnware/client/container.py
54. +13 -14 learnware/client/learnware_client.py
55. +7 -7 learnware/client/package_utils.py
56. +1 -1 learnware/client/scripts/install_env.py
57. +3 -2 learnware/client/scripts/run_model.py
58. +22 -11 learnware/client/utils.py
59. +1 -1 learnware/config.py
60. +15 -11 learnware/learnware/__init__.py
61. +7 -7 learnware/learnware/base.py
62. +1 -2 learnware/learnware/utils.py
63. +1 -1 learnware/logger.py
64. +5 -6 learnware/market/__init__.py
65. +1 -2 learnware/market/anchor/__init__.py
66. +2 -2 learnware/market/anchor/organizer.py
67. +2 -2 learnware/market/anchor/searcher.py
68. +2 -1 learnware/market/anchor/user_info.py
69. +3 -2 learnware/market/base.py
70. +2 -1 learnware/market/classes.py
71. +2 -3 learnware/market/easy/__init__.py
72. +3 -3 learnware/market/easy/checker.py
73. +5 -4 learnware/market/easy/database_ops.py
74. +14 -15 learnware/market/easy/organizer.py
75. +7 -6 learnware/market/easy/searcher.py
76. +1 -1 learnware/market/evolve/organizer.py
77. +1 -1 learnware/market/evolve_anchor/organizer.py
78. +1 -1 learnware/market/heterogeneous/__init__.py
79. +1 -3 learnware/market/heterogeneous/organizer/__init__.py
80. +4 -4 learnware/market/heterogeneous/organizer/hetero_map/__init__.py
81. +1 -2 learnware/market/heterogeneous/organizer/hetero_map/trainer.py
82. +5 -2 learnware/market/heterogeneous/searcher.py
83. +1 -2 learnware/market/heterogeneous/utils.py
84. +14 -4 learnware/market/module.py
85. +0 -3 learnware/market/utils.py
86. +0 -1 learnware/model/base.py
87. +3 -4 learnware/reuse/__init__.py
88. +5 -5 learnware/reuse/averaging.py
89. +2 -1 learnware/reuse/base.py
90. +17 -11 learnware/reuse/ensemble_pruning.py
91. +5 -4 learnware/reuse/feature_augment.py
92. +6 -5 learnware/reuse/hetero/feature_align.py
93. +2 -2 learnware/reuse/hetero/hetero_map.py
94. +7 -8 learnware/reuse/job_selector.py
95. +2 -0 learnware/reuse/utils.py
96. +4 -6 learnware/specification/__init__.py
97. +1 -3 learnware/specification/base.py
98. +4 -4 learnware/specification/module.py
99. +2 -2 learnware/specification/regular/__init__.py
100. +2 -3 learnware/specification/regular/image/__init__.py

+7 -0 .flake8

@@ -0,0 +1,7 @@
[flake8]
max-line-length = 120
ignore =
    E203,E501,F841,W503
per-file-ignores =
    __init__.py: F401
    ./learnware/utils/import_utils.py: F401

+131 -0 CODE_OF_CONDUCT.md

@@ -0,0 +1,131 @@
# Contributor Covenant Code of Conduct

## Our Pledge

We as members, contributors, and leaders pledge to make participation in our
community a harassment-free experience for everyone, regardless of age, body
size, visible or invisible disability, ethnicity, sex characteristics, gender
identity and expression, level of experience, education, socio-economic status,
nationality, personal appearance, race, caste, color, religion, or sexual
identity and orientation.

We pledge to act and interact in ways that contribute to an open, welcoming,
diverse, inclusive, and healthy community.

## Our Standards

Examples of behavior that contributes to a positive environment for our
community include:

* Demonstrating empathy and kindness toward other people
* Being respectful of differing opinions, viewpoints, and experiences
* Giving and gracefully accepting constructive feedback
* Accepting responsibility and apologizing to those affected by our mistakes,
and learning from the experience
* Focusing on what is best not just for us as individuals, but for the overall
community

Examples of unacceptable behavior include:

* The use of sexualized language or imagery, and sexual attention or advances of
any kind
* Trolling, insulting or derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or email address,
without their explicit permission
* Other conduct which could reasonably be considered inappropriate in a
professional setting

## Enforcement Responsibilities

Community leaders are responsible for clarifying and enforcing our standards of
acceptable behavior and will take appropriate and fair corrective action in
response to any behavior that they deem inappropriate, threatening, offensive,
or harmful.

Community leaders have the right and responsibility to remove, edit, or reject
comments, commits, code, wiki edits, issues, and other contributions that are
not aligned to this Code of Conduct, and will communicate reasons for moderation
decisions when appropriate.

## Scope

This Code of Conduct applies within all community spaces, and also applies when
an individual is officially representing the community in public spaces.
Examples of representing our community include using an official email address,
posting via an official social media account, or acting as an appointed
representative at an online or offline event.

## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement at bmwu-support@lamda.nju.edu.cn.
All complaints will be reviewed and investigated promptly and fairly.

All community leaders are obligated to respect the privacy and security of the
reporter of any incident.

## Enforcement Guidelines

Community leaders will follow these Community Impact Guidelines in determining
the consequences for any action they deem in violation of this Code of Conduct:

### 1. Correction

**Community Impact**: Use of inappropriate language or other behavior deemed
unprofessional or unwelcome in the community.

**Consequence**: A private, written warning from community leaders, providing
clarity around the nature of the violation and an explanation of why the
behavior was inappropriate. A public apology may be requested.

### 2. Warning

**Community Impact**: A violation through a single incident or series of
actions.

**Consequence**: A warning with consequences for continued behavior. No
interaction with the people involved, including unsolicited interaction with
those enforcing the Code of Conduct, for a specified period of time. This
includes avoiding interactions in community spaces as well as external channels
like social media. Violating these terms may lead to a temporary or permanent
ban.

### 3. Temporary Ban

**Community Impact**: A serious violation of community standards, including
sustained inappropriate behavior.

**Consequence**: A temporary ban from any sort of interaction or public
communication with the community for a specified period of time. No public or
private interaction with the people involved, including unsolicited interaction
with those enforcing the Code of Conduct, is allowed during this period.
Violating these terms may lead to a permanent ban.

### 4. Permanent Ban

**Community Impact**: Demonstrating a pattern of violation of community
standards, including sustained inappropriate behavior, harassment of an
individual, or aggression toward or disparagement of classes of individuals.

**Consequence**: A permanent ban from any sort of public interaction within the
community.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant][homepage],
version 2.1, available at
[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].

Community Impact Guidelines were inspired by
[Mozilla's code of conduct enforcement ladder][Mozilla CoC].

For answers to common questions about this code of conduct, see the FAQ at
[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
[https://www.contributor-covenant.org/translations][translations].

[homepage]: https://www.contributor-covenant.org
[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
[Mozilla CoC]: https://github.com/mozilla/diversity
[FAQ]: https://www.contributor-covenant.org/faq
[translations]: https://www.contributor-covenant.org/translations

+203 -0 LICENSE

@@ -0,0 +1,203 @@
Copyright 2024 LAMDA Beimingwu. All rights reserved.
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2024 LAMDA Beimingwu. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

+311 -152 README.md

@@ -1,40 +1,54 @@
[![Python Versions](https://img.shields.io/pypi/pyversions/learnware.svg?logo=python&logoColor=white)](https://pypi.org/project/learnware/#files)
[![Platform](https://img.shields.io/badge/platform-linux%20%7C%20windows%20%7C%20macos-lightgrey)](https://pypi.org/project/learnware/#files)
[![PypI Versions](https://img.shields.io/pypi/v/learnware)](https://pypi.org/project/learnware/#history)
[![Documentation Status](https://readthedocs.org/projects/learnware/badge/?version=latest)](https://learnware.readthedocs.io/en/latest/?badge=latest)
[![License](https://img.shields.io/pypi/l/learnware)](LICENSE)



<div align=center>
<img src="./docs/_static/img/logo/logo1.png" width="50%"/>
<br/>
<br/>
</div>


``Learnware`` is a model sharing platform that gives a basic implementation of the learnware paradigm. A learnware is a well-performed trained machine learning model with a specification that enables it to be adequately identified and reused according to the requirements of future users who may know nothing about the learnware in advance. The learnware paradigm can solve entangled problems in the current machine learning paradigm, like continual learning and catastrophic forgetting, and it also reduces the resources needed to train a well-performed model.

<p align="center">
<a href="https://pypi.org/project/learnware/#files">
<img alt="Python Versions" src="https://img.shields.io/pypi/pyversions/learnware.svg?logo=python&logoColor=white">
</a>
<a href="https://pypi.org/project/learnware/#files">
<img alt="Platform" src="https://img.shields.io/badge/platform-linux%20%7C%20windows%20%7C%20macos-lightgrey">
</a>
<a href="https://github.com/Learnware-LAMDA/Learnware/actions">
<img alt="Test" src="https://github.com/Learnware-LAMDA/Learnware/actions/workflows/install_learnware_with_source.yaml/badge.svg">
</a>
<a href="https://pypi.org/project/learnware/#history">
<img alt="PypI Versions" src="https://img.shields.io/pypi/v/learnware">
</a>
<a href="https://learnware.readthedocs.io/en/latest/?badge=latest">
<img alt="Documentation Status" src="https://readthedocs.org/projects/learnware/badge/?version=latest">
</a>
<a href="https://github.com/Learnware-LAMDA/Learnware/blob/main/LICENSE">
<img alt="License" src="https://img.shields.io/pypi/l/learnware">
</a>
</p>

<h3 align="center">
<p>
<b>English</b> |
<a href="https://github.com/Learnware-LAMDA/Learnware/blob/main/docs/README_zh.md">中文</a>
</p>
</h3>

# Introduction

## Framework
The _learnware_ paradigm, proposed by Professor Zhi-Hua Zhou in 2016 [1, 2], aims to build a vast model platform system, i.e., a _learnware dock system_, which systematically accommodates and organizes models shared by machine learning developers worldwide, and can efficiently identify and assemble existing helpful model(s) to solve future tasks in a unified way.

<div align="center">
<img src="./docs/_static/img/learnware_paradigm.jpg" width="70%"/>
</div>
The `learnware` package provides a fundamental implementation of the central concepts and procedures within the learnware paradigm. Its well-structured design ensures high scalability and facilitates the seamless integration of additional features and techniques in the future.

Machine learning, especially the prevailing big model paradigm, has achieved great success in natural language processing and computer vision applications. However, it still faces challenges such as the requirement of a large amount of labeled training data, difficulty in adapting to changing environments, and catastrophic forgetting when refining trained models incrementally. These big models, while useful in their targeted tasks, often fail to address the above issues and struggle to generalize beyond their specific purposes.
In addition, the `learnware` package serves as the engine for the [Beimingwu System](https://bmwu.cloud) and can be effectively employed for conducting experiments related to learnware.

<div align="center">
<img src="./docs/_static/img/learnware_market.jpg" width="70%" />
</div>

The learnware paradigm introduces the concept of a well-performed, trained machine learning model with a specification that allows future users, who have no prior knowledge of the learnware, to reuse it based on their requirements.
[1] Zhi-Hua Zhou. Learnware: on the future of machine learning. _Frontiers of Computer Science_, 2016, 10(4): 589–590 <br/>
[2] Zhi-Hua Zhou. Machine Learning: Development and Future. _Communications of CCF_, 2017, vol.13, no.1 (2016 CNCC keynote)

Developers or owners of trained machine learning models can submit their models to a learnware market. If accepted, the market assigns a specification to the model and accommodates it. The learnware market could host thousands or millions of well-performed models from different developers, for various tasks, using diverse data, and optimizing different objectives.
## Learnware Paradigm

Instead of building a model from scratch, users can submit their requirements to the learnware market, which then identifies and deploys helpful learnware(s) based on the specifications. Users can apply the learnware directly, adapt it using their data, or exploit it in other ways to improve their model. This process is more efficient and less expensive than building a model from scratch.
A learnware consists of a high-performance machine learning model and specifications that characterize the model, i.e., "Learnware = Model + Specification".
These specifications, encompassing both semantic and statistical aspects, detail the model's functionality and statistical information, making it easier for future users to identify and reuse these models.

## Benefits of the Learnware Paradigm
The need for Learnware arises due to challenges in machine learning, such as the need for extensive training data, advanced techniques, continuous learning, catastrophic forgetting, and data privacy issues. Although there are many efforts focusing on one of these issues separately, they are entangled, and solving one problem may exacerbate others. The learnware paradigm aims to address many of these challenges through a unified framework. Its benefits are listed as follows.

| Benefit | Description |
| ---- | ---- |
@@ -46,227 +60,372 @@ Instead of building a model from scratch, users can submit their requirements to
| Unplanned tasks | Open to all legal developers, the learnware market can accommodate helpful learnwares for various tasks. |
| Carbon emission | Assembling small models may offer good-enough performance, reducing interest in training large models and the carbon footprint. |

# Quick Start

## Installation

Learnware is currently hosted on [PyPI](https://pypi.org/). You can easily install ``Learnware`` according to the following steps:

- For Windows and Linux users:
The learnware paradigm consists of two distinct stages:
- `Submitting Stage`: Developers voluntarily submit various learnwares to the learnware market, and the system conducts quality checks and further organization of these learnwares.
- `Deploying Stage`: When users submit task requirements, the learnware market automatically selects whether to recommend a single learnware or a combination of multiple learnwares and provides efficient deployment methods. Whether it’s a single learnware or a combination of multiple learnwares, the system offers convenient learnware reuse interfaces.

```bash
pip install learnware
```
<div align="center">
<img src="./docs/_static/img/learnware_market.svg" width="70%" />
</div>

- For macOS users:
## Learnware Package Design

```bash
conda install -c pytorch faiss
pip install learnware
```
<div align="center">
<img src="./docs/_static/img/learnware_framework.svg" width="70%"/>
</div>

## Prepare Learnware

The Learnware Market consists of a wide range of learnwares. A valid learnware is a zipfile
composed of the following four parts.
At the workflow level, the `learnware` package consists of the `Submitting Stage` and the `Deploying Stage`.
At the module level, the `learnware` package is a platform composed of the above components. The components are designed as loosely coupled modules, and each component can be used stand-alone.

- ``__init__.py``
# Quick Start

A Python file offering interfaces for your model's fitting, predicting, and fine-tuning.
## Installation

- ``rkme.json``
Learnware is currently hosted on [PyPI](https://pypi.org/project/learnware/). You can easily install `learnware` by following these steps:

A json file containing the statistical specification of your data.
```bash
pip install learnware
```

- ``learnware.yaml``
A config file describing your model class name, the type of statistical specification (e.g., Reduced Kernel Mean Embedding, ``RKMETableSpecification``), and
the file name of your statistical specification file.
In the `learnware` package, besides the base classes, many core functionalities such as "learnware specification generation" and "learnware deployment" rely on the `torch` library. Users have the option to manually install `torch`, or they can directly use the following command to install the `learnware` package:

- ``environment.yaml``
```bash
pip install learnware[full]
```

A Conda environment configuration file for running the model (if the model environment is incompatible, you can rely on this for manual configuration).
You can generate this file according to the following steps:
**Note:** However, it's crucial to note that due to the potential complexity of the user's local environment, installing `learnware[full]` does not guarantee that `torch` will successfully invoke `CUDA` in the user's local setting.
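
Since whether `torch` can reach `CUDA` depends on your local setup, a quick sanity check (plain `torch` usage, not part of the `learnware` API) is:

```python
import torch

# True only if torch was built with CUDA support and a compatible GPU is visible
print(torch.cuda.is_available())
```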

- Create env config for conda:
## Prepare Learnware

```bash
conda env export | grep -v "^prefix: " > environment.yaml
```
- Recover env from config:
In the `learnware` package, each learnware is encapsulated in a `zip` package, which should contain at least the following four files:

```bash
conda env create -f environment.yaml
```
- `learnware.yaml`: learnware configuration file.
- `__init__.py`: methods for using the model.
- `stat.json`: the statistical specification of the learnware. Its filename can be customized and recorded in learnware.yaml.
- `environment.yaml` or `requirements.txt`: specifies the environment for the model.

We also demonstrate the detailed format of the learnware zipfile in [DOC link]; please refer to [Examples](./examples/workflow_by_code/learnware_example) for a concrete learnware zipfile example.
To facilitate the construction of a learnware, we provide a [Learnware Template](https://www.bmwu.cloud/static/learnware-template.zip) that users can use as a basis for building their own learnware. We've also detailed the format of the learnware `zip` package in [Learnware Preparation](docs/workflows/upload:prepare-learnware).
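
For illustration, the sketch below shows what a minimal `__init__.py` might look like. It assumes the `BaseModel` interface from `learnware.model`; the shapes and the `model.pkl` filename are placeholders for your own artifacts:

```python
import joblib
import numpy as np

from learnware.model import BaseModel


class MyModel(BaseModel):
    def __init__(self):
        # Shapes are illustrative; set them to match your task
        super().__init__(input_shape=(31,), output_shape=(1,))
        # model.pkl is a placeholder for the trained model shipped inside the zip package
        self.model = joblib.load("model.pkl")

    def fit(self, X: np.ndarray, y: np.ndarray):
        self.model.fit(X, y)

    def predict(self, X: np.ndarray) -> np.ndarray:
        return self.model.predict(X)

    def finetune(self, X: np.ndarray, y: np.ndarray):
        # Optionally refine the shipped model on the user's labeled data
        self.model.fit(X, y)
```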

## Learnware Market Workflow
## Learnware Package Workflow

Users can start an ``Learnware`` workflow according to the following steps:
Users can start a `learnware` workflow according to the following steps:

### Initialize a Learnware Market

The ``EasyMarket`` class implements the most basic set of functions of a ``Learnware Market``.
You can use the following code snippet to initialize a basic ``Learnware Market`` named "demo":
The `EasyMarket` class provides the core functions of a `Learnware Market`. You can initialize a basic `Learnware Market` named "demo" using the code snippet below:

```python
import learnware
from learnware.market import EasyMarket
from learnware.market import instantiate_learnware_market

learnware.init()
easy_market = EasyMarket(market_id="demo", rebuild=True)
# instantiate a demo market
demo_market = instantiate_learnware_market(market_id="demo", name="easy", rebuild=True)
```

### Upload Learnwares
### Upload Learnware

Before uploading your learnware into the ``Learnware Market``,
create a semantic specification ``semantic_spec`` by selecting or filling in values for the predefined semantic tags
to describe the features of your task and model.
Before uploading your learnware to the `Learnware Market`, you'll need to create a semantic specification, `semantic_spec`. This involves selecting or inputting values for predefined semantic tags to describe the features of your task and model.

For example, the following code snippet demonstrates the semantic specification
of a Scikit-Learn type model, which is designed for a business scenario and performs classification on tabular data:
For instance, the following code illustrates the semantic specification for a Scikit-Learn type model. This model is tailored for education scenarios and performs classification tasks on tabular data:

```python
semantic_spec = {
"Data": {"Values": ["Tabular"], "Type": "Class"},
"Task": {"Values": ["Classification"], "Type": "Class"},
"Library": {"Values": ["Scikit-learn"], "Type": "Class"},
"Scenario": {"Values": ["Business"], "Type": "Tag"},
"Description": {"Values": "", "Type": "String"},
"Name": {"Values": "demo_learnware", "Type": "String"},
}
from learnware.specification import generate_semantic_spec

semantic_spec = generate_semantic_spec(
name="demo_learnware",
data_type="Table",
task_type="Classification",
library_type="Scikit-learn",
scenarios="Education",
license="MIT",
)
```

Once the semantic specification is defined,
you can easily upload your learnware with a single line of code:
After defining the semantic specification, you can upload your learnware using a single line of code:

```python
easy_market.add_learnware(zip_path, semantic_spec)
demo_market.add_learnware(zip_path, semantic_spec)
```

Here, ``zip_path`` is the directory of your learnware zipfile.
Here, `zip_path` is the directory of your learnware `zip` package.
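
If your learnware files currently sit in a directory, one way to produce `zip_path` is a plain standard-library archive (a sketch; `learnware_dir` is whatever folder holds `__init__.py`, `learnware.yaml`, and the specification files):

```python
import os
import zipfile


def pack_learnware(learnware_dir: str, zip_path: str) -> None:
    # Archive every file under learnware_dir, keeping paths relative to the directory root
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for root, _, files in os.walk(learnware_dir):
            for name in files:
                full_path = os.path.join(root, name)
                zf.write(full_path, arcname=os.path.relpath(full_path, learnware_dir))
```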

### Semantic Specification Search

To search for learnwares that fit your task purpose,
you should also provide a semantic specification ``user_semantic`` that describes the characteristics of your task.
The ``Learnware Market`` will perform a first-stage search based on ``user_semantic``,
identifying potentially helpful learnwares whose models solve tasks similar to your requirements.
To find learnwares that align with your task's purpose, you'll need to provide a semantic specification, `user_semantic`, that outlines your task's characteristics. The `Learnware Market` will then perform an initial search using `user_semantic`, identifying potentially useful learnwares with models that solve tasks similar to your requirements.

```python
from learnware.market import BaseUserInfo

# construct user_info which includes semantic specification for searching learnware
# construct user_info, which includes a semantic specification
user_info = BaseUserInfo(id="user", semantic_spec=semantic_spec)

# search_learnware performs semantic specification search if user_info doesn't include a statistical specification
_, single_learnware_list, _ = easy_market.search_learnware(user_info)
# search_learnware: performs semantic specification search when user_info doesn't include a statistical specification
search_result = easy_market.search_learnware(user_info)
single_result = search_result.get_single_results()

# single_learnware_list is the learnware list by semantic specification searching
print(single_learnware_list)
# single_result: the List of Tuple[Score, Learnware] returned by semantic specification search
print(single_result)
```

### Statistical Specification Search

If you choose to provide your own statistical specification file ``stat.json``,
the ``Learnware Market`` can perform a more accurate learnware selection from
the learnwares returned by the previous step. This second-stage search is based on statistical information
and returns one or more learnwares that are most likely to be helpful for your task.
If you decide in favor of providing your own statistical specification file, `stat.json`, the `Learnware Market` can further refine the selection of learnwares from the previous step. This second-stage search leverages statistical information to identify one or more learnwares that are most likely to be beneficial for your task.

For example, the following code is designed to work with Reduced Set Kernel Embedding as a statistical specification:
For example, the code below executes learnware search when using Reduced Set Kernel Embedding as the statistical specification:

```python
import os

import learnware.specification as specification
from learnware.market import BaseUserInfo

user_spec = specification.RKMETableSpecification()

# unzip_path: directory for unzipped learnware zipfile
user_spec.load(os.path.join(unzip_path, "rkme.json"))
user_info = BaseUserInfo(
semantic_spec=user_semantic, stat_info={"RKMETableSpecification": user_spec}
)
(sorted_score_list, single_learnware_list,
mixture_score, mixture_learnware_list) = easy_market.search_learnware(user_info)

# sorted_score_list is the learnware scores based on MMD distances, sorted in descending order
print(sorted_score_list)

# single_learnware_list is the learnwares sorted in descending order based on their scores
print(single_learnware_list)

# mixture_learnware_list is the learnwares whose mixture is helpful for your task
print(mixture_learnware_list)

# mixture_score is the score of the mixture of learnwares
print(mixture_score)
search_result = easy_market.search_learnware(user_info)

single_result = search_result.get_single_results()
multiple_result = search_result.get_multiple_results()

# search_item.score: based on MMD distances, sorted in descending order
# search_item.learnware.id: id of learnwares, sorted by scores in descending order
for search_item in single_result:
print(f"score: {search_item.score}, learnware_id: {search_item.learnware.id}")

# mixture_item.learnwares: collection of learnwares whose combined use is beneficial
# mixture_item.score: score assigned to the combined set of learnwares in `mixture_item.learnwares`
for mixture_item in multiple_result:
print(f"mixture_score: {mixture_item.score}\n")
mixture_id = " ".join([learnware.id for learnware in mixture_item.learnwares])
print(f"mixture_learnware: {mixture_id}\n")
```
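
If you do not yet have a `stat.json`, it can be generated from your own raw data. The sketch below assumes the `generate_stat_spec` helper described in the package documentation; check your installed version for the exact name and signature:

```python
import learnware.specification as specification

# train_x: a numpy array holding your raw, unlabeled tabular data
rkme = specification.generate_stat_spec(type="table", X=train_x)
rkme.save("stat.json")  # writes the statistical specification file
```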

### Reuse Learnwares

Based on the list of learnwares ``mixture_learnware_list`` returned in the previous step,
you can easily reuse them to make predictions on your own data, instead of training a model from scratch.
We provide two baseline methods for reusing a given list of learnwares, namely ``JobSelectorReuser`` and ``AveragingReuser``.
Simply replace ``test_x`` in the code snippet below with your own testing data and start reusing learnwares!
With the list of learnwares, `mixture_learnware_list`, returned from the previous step, you can readily apply them to make predictions on your own data, bypassing the need to train a model from scratch. We provide two methods for reusing a given list of learnwares: `JobSelectorReuser` and `AveragingReuser`. Substitute `test_x` in the code snippet below with your testing data, and you're all set to reuse learnwares:

```python
from learnware.reuse import JobSelectorReuser, AveragingReuser

# using jobselector reuser to reuse the searched learnwares to make prediction
reuse_job_selector = JobSelectorReuser(learnware_list=mixture_learnware_list)
reuse_job_selector = JobSelectorReuser(learnware_list=mixture_item.learnwares)
job_selector_predict_y = reuse_job_selector.predict(user_data=test_x)

# using averaging ensemble reuser to reuse the searched learnwares to make prediction
reuse_ensemble = AveragingReuser(learnware_list=mixture_learnware_list)
reuse_ensemble = AveragingReuser(learnware_list=mixture_item.learnwares)
ensemble_predict_y = reuse_ensemble.predict(user_data=test_x)
```

## Auto Workflow Example
We also provide two methods for reusing a given list of learnwares when the user has labeled data: `EnsemblePruningReuser` and `FeatureAugmentReuser`. Substitute `test_x` in the code snippet below with your testing data and `train_X, train_y` with your labeled training data, and you're all set to reuse learnwares:

```python
from learnware.reuse import EnsemblePruningReuser, FeatureAugmentReuser

# Use ensemble pruning reuser to reuse the searched learnwares to make prediction
reuse_ensemble = EnsemblePruningReuser(learnware_list=mixture_item.learnwares, mode="classification")
reuse_ensemble.fit(train_X, train_y)
ensemble_pruning_predict_y = reuse_ensemble.predict(user_data=test_x)

# Use feature augment reuser to reuse the searched learnwares to make prediction
reuse_feature_augment = FeatureAugmentReuser(learnware_list=mixture_item.learnwares, mode="classification")
reuse_feature_augment.fit(train_X, train_y)
feature_augment_predict_y = reuse_feature_augment.predict(user_data=test_x)
```

``Learnware`` also provides an auto workflow example, which includes preparing learnwares, uploading and deleting learnwares from the market, and searching for learnwares with semantic and statistical specifications. Users can run ``examples/workflow_by_code.py`` to try the basic workflow of ``Learnware``.
### Auto Workflow Example

The `learnware` package also offers automated workflow examples, including preparing learnwares, uploading and deleting learnwares from the market, and searching for learnwares using both semantic and statistical specifications. To experience the basic workflow of the `learnware` package, users can run `test/test_workflow/test_workflow.py`.

# Experiments and Examples

## Environment

For all experiments, we used a single linux server. Details on the specifications are listed in the table below. All processors were used for training and evaluating.
For all experiments, we used a single Linux server. Details on the specifications are listed in the table below. All processors were used for training and evaluating.

<div align=center>

| System | GPU | CPU |
|----------------------|--------------------|--------------------------|
| Ubuntu 20.04.4 LTS | Nvidia Tesla V100S | Intel(R) Xeon(R) Gold 6240R |

</div>

## Tabular Scenario Experiments

### Datasets

Our study involved three public datasets in the sales forecasting field: [Predict Future Sales (PFS)](https://www.kaggle.com/c/competitive-data-science-predict-future-sales/data), [M5 Forecasting (M5)](https://www.kaggle.com/competitions/m5-forecasting-accuracy/data), and [Corporacion](https://www.kaggle.com/competitions/favorita-grocery-sales-forecasting/data).

We applied various pre-processing methods to these datasets to enhance the richness of the data. After pre-processing, we first divided each dataset by store and then split the data for each store into training and test sets. Specifically:

- For PFS, the test set consisted of the last month's data from each store.
- For M5, we designated the final 28 days' data from each store as the test set.
- For Corporacion, the test set was composed of the last 16 days of data from each store.
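
As a rough sketch of this per-store split (the `date` column name is an assumption; adapt it to your own preprocessing):

```python
import pandas as pd


def split_store(store_df: pd.DataFrame, test_days: int):
    # Hold out the final `test_days` days of one store's records as its test set;
    # assumes `date` is a datetime column
    store_df = store_df.sort_values("date")
    cutoff = store_df["date"].max() - pd.Timedelta(days=test_days)
    return store_df[store_df["date"] <= cutoff], store_df[store_df["date"] > cutoff]


# e.g., M5 holds out the final 28 days, Corporacion the last 16 days
# train_df, test_df = split_store(store_df, test_days=28)
```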

In the submitting stage, the Corporacion dataset's 55 stores are regarded as 165 uploaders, each employing one of three different feature engineering methods. For the PFS dataset, 100 uploaders are established, each using one of two feature engineering approaches. These uploaders then utilize their respective stores' training data to develop LightGBM models. As a result, the learnware market comprises 265 learnwares, derived from five types of feature spaces and two types of label spaces.

Based on the specific design of user tasks, our experiments were primarily categorized into two types:

- **homogeneous experiments** are designed to evaluate performance when users can reuse learnwares in the learnware market that have the same feature space as their tasks (homogeneous learnwares). This contributes to showing the effectiveness of using learnwares that align closely with the user's specific requirements.

- **heterogeneous experiments** aim to evaluate the performance of identifying and reusing helpful heterogeneous learnwares in situations where no available learnwares match the feature space of the user's task. This helps to highlight the potential of learnwares for applications beyond their original purpose.

### Homogeneous Tabular Scenario

For homogeneous experiments, the 55 stores in the Corporacion dataset act as 55 users, each applying one feature engineering method, and using the test data from their respective store as user data. These users can then search for homogeneous learnwares in the market with the same feature spaces as their tasks.

The Mean Squared Error (MSE) of search and reuse across all users is presented in the table below:

<div align=center>

| Setting | MSE |
|-----------------------------------|--------|
| Mean in Market (Single) | 0.331 |
| Best in Market (Single) | 0.151 |
| Top-1 Reuse (Single) | 0.280 |
| Job Selector Reuse (Multiple) | 0.274 |
| Average Ensemble Reuse (Multiple) | 0.267 |

</div>

| System | GPU | CPU |
| ---- | ---- | ---- |
| Ubuntu 20.04.4 LTS | Nvidia Tesla V100S | Intel(R) Xeon(R) Gold 6240R |
When users have both test data and limited training data derived from their original data, reusing single or multiple searched learnwares from the market can often yield better results than training models from scratch on limited training data. We present the change curves in MSE for the user's self-trained model, as well as for the Feature Augmentation single learnware reuse method and the Ensemble Pruning multiple learnware reuse method. These curves display their performance on the user's test data as the amount of labeled training data increases. The average results across 55 users are depicted in the figure below:

<div align=center>
<img src="./docs/_static/img/table_homo_labeled.png" width="50%"/>
</div>

From the figure, it's evident that when users have limited training data, the performance of reusing single/multiple table learnwares is superior to that of the user's own model. This emphasizes the benefit of learnware reuse in significantly reducing the need for extensive training data and achieving enhanced results when available user training data is limited.

## Datasets
### Heterogeneous Tabular Scenario

We designed experiments on three publicly available datasets, namely Predict Future Sales (PFS), M5 Forecasting (M5), and CIFAR-10. For the two sales forecasting datasets, PFS and M5, we divided the user data according to different stores and trained a Ridge model and a LightGBM model on the corresponding data, respectively. For the CIFAR-10 image classification task, we first randomly picked 6 to 10 categories and randomly selected 800 to 2000 samples per category from the training set, constituting a total of 50 different uploaders. For test users, we first randomly picked 3 to 6 categories and randomly selected 150 to 350 samples per category from the test set, constituting a total of 20 different users.
In heterogeneous experiments, the learnware market recommends helpful heterogeneous learnwares whose feature spaces differ from the user tasks. Based on whether there are learnwares in the market that handle tasks similar to the user's task, the experiments can be further subdivided into the following two types:

We tested the efficiency of specification generation and the accuracy of search and reuse, respectively. The evaluation metric on the PFS and M5 datasets is RMSE, and the evaluation metric on the CIFAR-10 classification task is classification accuracy.
#### Cross Feature Space Experiments

## Results
We designate the 41 stores in the PFS dataset as users, creating their user data with an alternative feature engineering approach that varies from the methods employed by learnwares in the market. Consequently, while the market's learnwares from the PFS dataset undertake tasks very similar to our users, the feature spaces do not match exactly. In this experimental configuration, we tested various heterogeneous learnware reuse methods (without using user's labeled data) and compared them to the user's self-trained model based on a small amount of training data. The average MSE performance across 41 users is as follows:

<div align=center>

The time consumed by specification generation is shown in the table below:
| Setting | MSE |
|-----------------------------------|--------|
| Mean in Market (Single) | 1.459 |
| Best in Market (Single) | 1.226 |
| Top-1 Reuse (Single) | 1.407 |
| Average Ensemble Reuse (Multiple) | 1.312 |
| User model with 50 labeled data | 1.267 |

| Dataset | Data Dimensions | Specification Generation Time (s) |
| ---- | ---- | ---- |
| PFS | 8714274*31 | < 1.5 |
| M5 | 46027957*82 | 9~15 |
| CIFAR 10 | 9000\*3\*32\*32 | 7~10 |
</div>

From the results, it is noticeable that the learnware market still performs quite well even when users lack labeled data, provided it includes learnwares addressing tasks that are similar but not identical to the user's. In these instances, the market's effectiveness can match or even rival scenarios where users have access to a limited quantity of labeled data.

#### Cross Task Experiments

Here we have chosen the 10 stores from the M5 dataset to act as users. Although the broad task of sales forecasting is similar to the tasks addressed by the learnwares in the market, there are no learnwares available that directly cater to the M5 sales forecasting requirements. All learnwares show variations in both feature and label spaces compared to the tasks of M5 users. We present the change curves in RMSE for the user's self-trained model and several learnware reuse methods. These curves display their performance on the user's test data as the amount of labeled training data increases. The average results across 10 users are depicted in the figure below:

<div align=center>
<img src="./docs/_static/img/table_hetero_labeled.png" width="50%"/>
</div>

We can observe that heterogeneous learnwares are beneficial when there's a limited amount of the user's labeled training data available, aiding in better alignment with the user's specific task. This underscores the potential of learnwares to be applied to tasks beyond their original purpose.

## Image Scenario Experiment

For the CIFAR-10 dataset, we sampled the training set unevenly by category and constructed unbalanced training datasets for the 50 learnwares that contained only some of the categories. This makes it unlikely that there exists any learnware in the learnware market that can accurately handle all categories of data; only the learnware whose training data is closest to the data distribution of the target task is likely to perform well on the target task. Specifically, the probability of each category being sampled obeys a random multinomial distribution, with a non-zero probability of sampling on only 4 categories, and the sampling ratio is 0.4:0.4:0.1:0.1. Ultimately, the training set for each learnware contains 12,000 samples covering the data of 4 categories in CIFAR-10.

We constructed 50 target tasks using data from the test set of CIFAR-10. Similar to constructing the training set for the learnwares, to allow for some variation between tasks, we sampled the test set unevenly. Specifically, the probability of each category being sampled obeys a random multinomial distribution, with non-zero sampling probability on 6 categories, and the sampling ratio is 0.3:0.3:0.1:0.1:0.1:0.1. Ultimately, each target task contains 3,000 samples covering the data of 6 categories in CIFAR-10.
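
A sketch of this sampling scheme (illustrative only; category counts and ratios follow the description above):

```python
import numpy as np

rng = np.random.default_rng(0)


def category_probs(n_classes=10, ratios=(0.4, 0.4, 0.1, 0.1)):
    # Give non-zero sampling probability to len(ratios) randomly chosen classes
    probs = np.zeros(n_classes)
    chosen = rng.choice(n_classes, size=len(ratios), replace=False)
    probs[chosen] = ratios
    return probs


# Per-category sample counts for one uploader's 12,000-sample training set
uploader_counts = rng.multinomial(12000, category_probs())
# One user task: 6 non-zero categories, 3,000 samples
user_counts = rng.multinomial(3000, category_probs(ratios=(0.3, 0.3, 0.1, 0.1, 0.1, 0.1)))
```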

With this experimental setup, we evaluated the performance of RKME Image using 1 - Accuracy as the loss.

<div align=center>

| Setting | 1 - Accuracy |
|-----------------------------------|----------|
| Mean in Market (Single) | 0.655 |
| Best in Market (Single) | 0.304 |
| Top-1 Reuse (Single) | 0.406 |
| Job Selector Reuse (Multiple) | 0.406 |
| Average Ensemble Reuse (Multiple) | 0.310 |

</div>

In some specific settings, the user has a small number of labeled samples. In such settings, learning the weights of the selected learnwares on the limited labeled samples can result in better performance than training directly on those limited labeled samples.

<div align=center>
<img src="./docs/_static/img/image_labeled.svg" width="50%"/>
</div>

## Text Scenario Experiment

The accuracy of search and reuse is shown in the table below:
### Datasets

| Dataset | Top-1 Performance | Job Selector Reuse | Average Ensemble Reuse |
| ---- | ---- | ---- | ---- |
| PFS | 1.955 +/- 2.866 | 2.175 +/- 2.847 | 1.950 +/- 2.888 |
| M5 | 2.066 +/- 0.424 | 2.116 +/- 0.472 | 2.512 +/- 0.573 |
| CIFAR-10 | 0.619 +/- 0.138 | 0.585 +/- 0.056 | 0.715 +/- 0.075 |
We conducted experiments on the widely used text benchmark dataset: [20-newsgroup](http://qwone.com/~jason/20Newsgroups/). 20-newsgroup is a renowned text classification benchmark with a hierarchical structure, featuring 5 superclasses {comp, rec, sci, talk, misc}.

In the submitting stage, we enumerated all combinations of three superclasses from the five available, randomly sampling 50% of each combination from the training set to create datasets for 50 uploaders.

In the deploying stage, we considered all combinations of two superclasses out of the five, selecting all data for each combination from the testing set as a test dataset for one user. This resulted in 10 users. The user's own training data was generated using the same sampling procedure as the user test data, though it originated from the training set.

Model training comprised two parts: the first part involved training a TF-IDF feature extractor, and the second part used the extracted text feature vectors to train a naive Bayes classifier.
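
A minimal scikit-learn sketch of this two-part model (variable names are illustrative):

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# texts, labels: one uploader's sampled 20-newsgroup training subset
model = make_pipeline(TfidfVectorizer(), MultinomialNB())
model.fit(texts, labels)
predict_y = model.predict(test_texts)
```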

Our experiments comprise two components:

- **unlabeled_text_example** is designed to evaluate performance when users possess only testing data, searching and reusing learnware available in the market.
- **labeled_text_example** aims to assess performance when users have both testing and limited training data, searching and reusing learnware directly from the market instead of training a model from scratch. This helps determine the amount of training data saved for the user.

### Results

- **unlabeled_text_example**:

The table below presents the mean accuracy of search and reuse across all users:

<div align=center>

| Setting | Accuracy |
|-----------------------------------|----------|
| Mean in Market (Single) | 0.507 |
| Best in Market (Single) | 0.859 |
| Top-1 Reuse (Single) | 0.846 |
| Job Selector Reuse (Multiple) | 0.845 |
| Average Ensemble Reuse (Multiple) | 0.862 |

</div>

- **labeled_text_example**:

We present the change curves in classification error rates for both the user's self-trained model and the multiple learnware reuse (EnsemblePrune), showcasing their performance on the user's test data as the user's training data increases. The average results across 10 users are depicted below:

<div align=center>
<img src="./docs/_static/img/text_labeled.svg" width="50%"/>
</div>

From the figure above, it is evident that when the user's own training data is limited, the performance of multiple learnware reuse surpasses that of the user's own model. As the user's training data grows, it is expected that the user's model will eventually outperform the learnware reuse. This underscores the value of reusing learnware to significantly conserve training data and achieve superior performance when user training data is limited.

# Citation

If you use our project in your research or work, we kindly request that you cite the following paper:

```bibtex
@article{zhou2022learnware,
    author  = {Zhou, Zhi-Hua and Tan, Zhi-Hao},
    title   = {Learnware: Small Models Do Big},
    journal = {SCIENCE CHINA Information Sciences},
    year    = {2024},
    volume  = {67},
    number  = {1},
    pages   = {1--12},
}
```

Thank you for your support!

# About

## Contributors

We appreciate all contributions and thank all the contributors!
You can find the full list on the [contributors page](https://github.com/Learnware-LAMDA/Learnware/graphs/contributors).

## About Us

The Learnware repository is developed and maintained by the LAMDA Beimingwu R&D Team.
To learn more about our team, please visit the [Team Overview](https://docs.bmwu.cloud/en/about-us.html).
You can also visit [LAMDA's official website](http://www.lamda.nju.edu.cn/MainPage.ashx).

+ 0
- 0
docs/README_zh.md


+ 4
- 0
docs/_static/img/learnware_framework.svg
File diff suppressed because it is too large


BIN
docs/_static/img/learnware_market.jpg

Width: 5776  |  Height: 3307  |  Size: 1.8 MB

+ 1
- 0
docs/_static/img/learnware_market.svg
File diff suppressed because it is too large


+ 5
- 2
docs/about/about.rst

@@ -2,7 +2,10 @@
About Us
================

We thank all the contributors for the development of learnware package:

Contributors
================
.. image:: https://github.com/Learnware-LAMDA/Learnware/graphs/contributors
:align: center

In the LAMDA Group, many people have also participated in the discussions, design, and development of the learnware package.
For more details about us, please refer to `LAMDA Group <https://www.lamda.nju.edu.cn/>`_.

+ 60
- 15
docs/about/dev.rst

@@ -3,6 +3,39 @@
For Developer
================

Install with Dev Mode
=======================

As a developer, you often want to make changes to ``Learnware Market`` and hope they are reflected directly in your environment without reinstalling it. You can install ``Learnware Market`` in editable mode with the following command.

.. code-block:: bash

    $ git clone https://github.com/Learnware-LAMDA/Learnware.git && cd Learnware
    $ pip install -e .[dev]

.. note::
    It's recommended to use anaconda/miniconda to setup the environment. You can also run ``pip install -e .[full, dev]`` to install ``torch`` automatically.


Commit Format
==============

Please format your commit messages as ``prefix`` + ``space`` + ``suffix``.
There are four choices for the prefix, and they can be combined using commas:

- [ENH]: Represents enhancement, indicating the addition of new features.
- [DOC]: Indicates modifications to the documentation.
- [FIX]: Represents bug fixes and typo corrections.
- [MNT]: Indicates other minor modifications, such as version updates.

The suffix specifies the specific nature of the modification, with the initial letter capitalized.

Examples: the following commit messages are all valid:

- [DOC] Fix the document
- [FIX, ENH] Fix the bug and add some feature"


Docstring
============
Please use the `Numpydoc Style <https://stackoverflow.com/a/24385103>`_.
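
For example, a function documented in Numpydoc style might look like this (an illustrative sketch, not an excerpt from the package):

.. code-block:: python

    import numpy as np

    def predict(X: np.ndarray) -> np.ndarray:
        """Predict labels for the given samples.

        Parameters
        ----------
        X : np.ndarray
            Input data of shape (n_samples, n_features).

        Returns
        -------
        np.ndarray
            Predicted labels of shape (n_samples,).
        """
        return np.zeros(len(X), dtype=int)
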
@@ -15,7 +48,7 @@ Continuous Integration
Continuous Integration (CI) tools help you stick to the quality standards by running tests every time you push a new commit and reporting the results to a pull request.

``Learnware Market`` will check the following tests when you open a pull request:
1. We will check your code style pylint, you can fix your code style by the following commands:
1. We will check your code length; you can fix your code style with the following commands:

.. code-block:: bash

@@ -30,22 +63,34 @@ Continuous Integration (CI) tools help you stick to the quality standards by run
pip install pytest
python -m pytest tests

Development Guidance
=================================
``pre-commit`` Config
========================

As a developer, you often want make changes to ``Learnware Market`` and hope it would reflect directly in your environment without reinstalling it. You can install ``Learnware Market`` in editable mode with following command.
The ``learnware`` package supports ``pre-commit`` configuration. Run the following command to install ``pre-commit``:

.. code-block:: bash

    pip install pre-commit

Run the following command in the root directory of the ``Learnware`` project to enable ``pre-commit``:

.. code-block:: bash

    pre-commit install

.. code-block:: bash
$ git clone https://git.nju.edu.cn/learnware/learnware-market.git && cd learnware-market
$ python setup.py install
``isort`` Config
===================

The code in the ``learnware`` package will be processed by ``isort`` (``examples`` and ``tests`` are excluded). Run the following command to install ``isort``:

.. code-block:: bash

    pip install isort

Run the following command in the root directory of the ``Learnware`` project to run ``isort``:

.. code-block:: bash

    isort learnware --reverse-relative


+ 3
- 3
docs/components/learnware.rst

@@ -4,7 +4,7 @@
Learnware & Reuser
==========================================

``Learnware`` is the most basic concept in the ``learnware paradigm``. In this section, we will introduce the concept and design of ``learnware`` and its extension for ``Hetero Reuse``. Then we will introduce the ``Reuse Methods``, which applies one or several ``learnware``\ s to solve the user's task.
``Learnware`` is the most basic concept in the ``learnware paradigm``. In this section, we will introduce the concept and design of ``Learnware`` and its extension for ``Hetero Reuse``. Then we will introduce the ``Reuse Methods``, which applies one or several ``Learnware``\ s to solve the user's task.

Concepts
===================
@@ -16,7 +16,7 @@ In our implementation, the class ``Learnware`` has 3 important member variables:
- ``model``: The model in the learnware, can be a ``BaseModel`` or a dict including model name and path. When it is a dict, the function ``Learnware.instantiate_model`` is used to transform it to a ``BaseModel``. The function ``Learnware.predict`` use the model to predict for an input ``X``. See more in `COMPONENTS: Model <./model.html>`_.
- ``specification``: The specification including the semantic specification and the statistic specification.

Learnware for Hetero Reuse (Feature Align + Hetero Map Learnware)
Learnware for Hetero Reuse
=======================================================================

In the Hetero Market (see `COMPONENTS: Hetero Market <./market.html#hetero-market>`_ for details), ``HeteroSearcher`` identifies and recommends helpful learnwares among all learnwares in the market,
@@ -107,7 +107,7 @@ specifies the ensemble method(default is set to ``mean``).
Reuse Learnware with Labeled Data
----------------------------------

When users have a small amount of labeled data available, ``learnware`` package provides two methods: ``EnsemblePruningReuser`` and ``FeatureAugmentReuser`` to help reuse learnwares.
When users have a small amount of labeled data available, the ``learnware`` package provides two methods: ``EnsemblePruningReuser`` and ``FeatureAugmentReuser`` to help reuse learnwares.
They are both initialized with a list of ``Learnware`` objects ``learnware_list``, and have different implementations of ``fit`` and ``predict`` methods.

EnsemblePruningReuser


+ 4
- 4
docs/components/market.rst

@@ -4,20 +4,20 @@
Learnware Market
================================

The ``learnware market`` receives high-performance machine learning models from developers, incorporates them into the system, and provides services to users by identifying and reusing learnware to help users solve current tasks. Developers voluntarily submit various learnwares to the learnware market, and the market conducts quality checks and further organization of these learnwares. When users submit task requirements, the learnware market automatically selects whether to recommend a single learnware or a combination of multiple learnwares.
The ``Learnware Market`` receives high-performance machine learning models from developers, incorporates them into the system, and provides services to users by identifying and reusing learnware to help users solve current tasks. Developers voluntarily submit various learnwares to the learnware market, and the market conducts quality checks and further organization of these learnwares. When users submit task requirements, the learnware market automatically selects whether to recommend a single learnware or a combination of multiple learnwares.

The ``learnware market`` will receive various kinds of learnwares, and learnwares from different feature/label spaces form numerous islands of specifications. All these islands together constitute the ``specification world`` in the learnware market. The market should discover and establish connections between different islands, and then merge them into a unified specification world. This further organization of learnwares support search learnwares among all learnwares, not just among learnwares which has the same feature space and label space with the user's task requirements.
The ``Learnware Market`` will receive various kinds of learnwares, and learnwares from different feature/label spaces form numerous islands of specifications. All these islands together constitute the ``specification world`` in the learnware market. The market should discover and establish connections between different islands, and then merge them into a unified specification world. This further organization of learnwares supports searching among all learnwares, not just among learnwares that share the same feature space and label space as the user's task requirements.

Framework
======================================

The ``learnware market`` is combined with a ``organizer``, a ``searcher``, and a list of ``checker``\ s.
The ``Learnware Market`` consists of an ``organizer``, a ``searcher``, and a list of ``checker``\ s.

The ``organizer`` can store and organize learnwares in the market. It supports ``add``, ``delete``, and ``update`` operations for learnwares. It also provides the interface for ``searcher`` to search learnwares based on user requirement.

The ``searcher`` can search learnwares based on user requirement. The implementation of ``searcher`` is dependent on the concrete implementation and interface for ``organizer``, where usually an ``organizer`` can be compatible with multiple different ``searcher``\ s.

The ``checker`` is used for checking the learnware in some standards. It should check the utility of a learnware and is supposed to return the status and a message related to the learnware's check result. Only the learnwares who passed the ``checker`` could be able to be stored and added into the ``learnware market``.
The ``checker`` is used for checking the learnware against some standards. It should check the utility of a learnware and is supposed to return the status and a message related to the learnware's check result. Only learnwares that pass the ``checker`` can be stored and added into the ``Learnware Market``.
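
A minimal sketch of how these components are exercised through the market facade (API names follow the package's quick start; ``zip_path`` and ``semantic_spec`` are assumed to be prepared beforehand):

.. code-block:: python

    from learnware.market import BaseUserInfo, instantiate_learnware_market

    # the organizer stores learnwares; the checkers validate each learnware on add
    market = instantiate_learnware_market(market_id="demo", name="easy", rebuild=True)
    market.add_learnware(zip_path, semantic_spec)

    # the searcher recommends learnwares based on the user's requirement
    user_info = BaseUserInfo(semantic_spec=semantic_spec)
    search_result = market.search_learnware(user_info)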





+ 4
- 1
docs/components/spec.rst

@@ -37,6 +37,9 @@ Semantic Specification
The semantic specification consists of a "dict" structure that includes keywords "Data", "Task", "Library", "Scenario", "License", "Description", and "Name".
In the case of table learnwares, users should additionally provide descriptions for each feature dimension and output dimension through the "Input" and "Output" keywords.

- If "data_type" is "Table", you need to specify the semantics of each dimension of the model's input data to make the uploaded learnware suitable for tasks with heterogeneous feature spaces.
- If "task_type" is "Classification", you need to provide the semantics of model output labels (prediction labels start from 0), making the uploaded learnware suitable for classification tasks with heterogeneous output spaces.
- If "task_type" is "Regression", you need to specify the semantics of each dimension of the model output, making the uploaded learnware suitable for regression tasks with heterogeneous output spaces.

Regular Specification
======================================
@@ -131,7 +134,7 @@ with particular learnware market implementations.
- Learnware searchers perform helpful learnware recommendations among all table learnwares in the market, leveraging the ``system specification``\ s generated for users.


``learnware`` package now includes a type of ``system specification``, named ``HeteroMapTableSpecification``, made especially for the ``Hetero Market`` implementation.
The ``learnware`` package now includes a type of ``system specification``, named ``HeteroMapTableSpecification``, made especially for the ``Hetero Market`` implementation.
This specification is automatically given to all table learnwares when they are added to the ``Hetero Market``.
It is also set up to be updated periodically, ensuring it remains accurate as the learnware market evolves and builds more precise specification worlds.
Please refer to `COMPONENTS: Hetero Market <../components/market.html#hetero-market>`_ for implementation details.

+ 2
- 2
docs/conf.py

@@ -100,12 +100,12 @@ html_logo = "_static/img/logo/logo1.png"


# These folders are copied to the documentation's HTML output
html_static_path = ['_static']
html_static_path = ["_static"]

# These paths are either relative to html_static_path
# or fully qualified paths (eg. https://...)
html_css_files = [
'css/custom_style.css',
"css/custom_style.css",
]

# -- Options for HTMLHelp output ------------------------------------------


+ 4
- 2
docs/index.rst

@@ -7,7 +7,9 @@
``Learnware`` Documentation
============================================================

``Learnware`` is a model sharing platform, which give a basic implementation of the learnware paradigm. A learnware is a well-performed trained machine learning model with a specification that enables it to be adequately identified to reuse according to the requirement of future users who may know nothing about the learnware in advance. The learnware paradigm can solve entangled problems in the current machine learning paradigm, like continual learning and catastrophic forgetting. It also reduces resources for training a well-performed model.
The ``learnware`` package provides a fundamental implementation of the central concepts and procedures for the learnware paradigm.
A learnware is a well-performed trained machine learning model with a specification that enables it to be adequately identified to reuse according to the requirement of future users who may know nothing about the learnware in advance.
The learnware paradigm is a new paradigm aimed at enabling users to reuse existing well-trained models to solve their AI tasks instead of starting from scratch.

.. _user_guide:

@@ -58,7 +60,7 @@ Document Structure
:caption: REFERENCES:

API <references/api.rst>
BeimingWu System <references/beiming.rst>
Beimingwu System <references/beimingwu.rst>
FAQ <references/FAQ.rst>

.. toctree::


+ 8
- 1
docs/references/FAQ.rst

@@ -1,5 +1,12 @@
.. _faq:
====================
FAQ
Learnware FAQ
====================

Learnware Frequently Asked Questions
=====================================
.. contents::
:depth: 1
:local:
:backlinks: none


+ 106
- 3
docs/references/api.rst

@@ -3,7 +3,7 @@
API Reference
================================

Here you can find all ``learnware`` interfaces.
Here you can find high-level ``Learnware`` interfaces.

Market
====================
@@ -13,23 +13,96 @@ Market

.. autoclass:: learnware.market.BaseUserInfo
:members:
Learnware & Reuser

Organizer
------------------
.. autoclass:: learnware.market.BaseOrganizer
:members:

.. autoclass:: learnware.market.EasyOrganizer
:members:

.. autoclass:: learnware.market.HeteroOrganizer
:members:

Searcher
------------------
.. autoclass:: learnware.market.BaseSearcher
:members:

.. autoclass:: learnware.market.EasySearcher
:members:

.. autoclass:: learnware.market.EasyExactSemanticSearcher
:members:

.. autoclass:: learnware.market.EasyFuzzSemanticSearcher
:members:

.. autoclass:: learnware.market.EasyStatSearcher
:members:

.. autoclass:: learnware.market.HeteroSearcher
:members:

Checker
------------------

.. autoclass:: learnware.market.BaseChecker
:members:

.. autoclass:: learnware.market.EasyChecker
:members:

.. autoclass:: learnware.market.EasySemanticChecker
:members:

.. autoclass:: learnware.market.EasyStatChecker
:members:

Learnware
====================

.. autoclass:: learnware.learnware.Learnware
:members:

Reuser
====================

.. autoclass:: learnware.reuse.BaseReuser
:members:

Data Independent Reuser
-------------------------

.. autoclass:: learnware.reuse.JobSelectorReuser
:members:

.. autoclass:: learnware.reuse.AveragingReuser
:members:

Data Dependent Reuser
-------------------------

.. autoclass:: learnware.reuse.EnsemblePruningReuser
:members:

.. autoclass:: learnware.reuse.FeatureAugmentReuser
:members:


Aligned Learnware
--------------------
.. autoclass:: learnware.reuse.AlignLearnware
:members:

.. autoclass:: learnware.reuse.FeatureAlignLearnware
:members:

.. autoclass:: learnware.reuse.HeteroMapAlignLearnware
:members:

Specification
====================

@@ -39,6 +112,12 @@ Specification
.. autoclass:: learnware.specification.BaseStatSpecification
:members:

Regular Specification
--------------------------

.. autoclass:: learnware.specification.RegularStatSpecification
:members:

.. autoclass:: learnware.specification.RKMETableSpecification
:members:

@@ -48,8 +127,32 @@ Specification
.. autoclass:: learnware.specification.RKMETextSpecification
:members:

System Specification
--------------------------

.. autoclass:: learnware.specification.HeteroMapTableSpecification
:members:

Model
====================


Base Model
--------------
.. autoclass:: learnware.model.BaseModel
:members:

Container
-------------

.. autoclass:: learnware.client.ModelContainer
:members:

.. autoclass:: learnware.client.ModelCondaContainer
:members:

.. autoclass:: learnware.client.ModelDockerContainer
:members:

.. autoclass:: learnware.client.LearnwaresContainer
:members:

+ 0
- 6
docs/references/beiming.rst

@@ -1,6 +0,0 @@
.. _beiming:
====================
BeimingWu System
====================

`Clik here for beiming system <https://bmwu.cloud/>`_

+ 30
- 0
docs/references/beimingwu.rst

@@ -0,0 +1,30 @@
.. _beimingwu:
====================
Beimingwu System
====================

`Beimingwu System <https://bmwu.cloud/>`_ is based on the learnware paradigm, which systematically implements the entire process of learnware from submission to deployment, helping users effectively search and reuse learnwares without the need to build machine learning models from scratch.

The ``learnware`` package is the cornerstone of the Beimingwu system, functioning as its core engine.
It offers a comprehensive suite of central APIs that encompass a wide range of functionalities, including the submission, verification, organization, search, and deployment of learnware.
This integration ensures a streamlined and efficient process, facilitating seamless interactions within the system.

Core Features in the Beimingwu System
=======================================

Beimingwu systematically implements the core process of the learnware paradigm for the first time:

- ``Submitting Stage``: The system includes multiple detection mechanisms to ensure the quality of uploaded learnwares. Additionally, the system trains a heterogeneous engine based on existing learnware specifications in the system to merge different specification islands and assign new specifications to learnwares. As more learnwares are submitted, the heterogeneous engine will continue to update, achieving continuous iteration of learnware specifications and building a more precise specification world.
- ``Deploying Stage``: After users upload task requirements, the system automatically selects whether to recommend a single learnware or multiple learnware combinations and provides efficient deployment methods. Whether it's a single learnware or a combination of multiple learnwares, the system offers convenient learnware reuse tools.

In addition, the Beimingwu system also has the following features:

- ``Learnware Specification Generation``: The Beimingwu system provides specification generation interfaces in the learnware package, supporting various data types (tables, images, and text) for efficient local generation.
- ``Learnware Quality Inspection``: The Beimingwu system includes multiple detection mechanisms to ensure the quality of each learnware in the system.
- ``Diverse Learnware Search``: The Beimingwu system supports both semantic specifications and statistical specifications searches, covering data types such as tables, images, and text. In addition, for table-based tasks, the system also supports the search for heterogeneous table learnwares.
- ``Local Learnware Deployment``: The Beimingwu system provides interfaces for learnware deployment and learnware reuse in the learnware package, facilitating users' convenient and secure learnware deployment.
- ``Data Privacy Protection``: The Beimingwu system operations, including learnware upload, search, and deployment, do not require users to upload local data. All relevant statistical specifications are generated locally by users, ensuring data privacy.
- ``Fully Open Source``: The Beimingwu system's source code is completely open-source, including the learnware package and frontend/backend code. The learnware package is highly extensible, making it easy to integrate new specification designs, learnware system designs, and learnware reuse methods in the future.

Beimingwu is the first system-level implementation of the learnware paradigm.
This pioneering venture is just the beginning, with vast opportunities for enhancement and growth in the related technological fields still ahead.

+ 43
- 29
docs/start/exp.rst

@@ -16,8 +16,8 @@ Ubuntu 20.04.4 LTS Nvidia Tesla V100S Intel(R) Xeon(R) Gold 6240R
==================== ==================== ===============================


Table: homo+hetero
====================
Tabular Data Experiments
===========================

Datasets
------------------
@@ -43,8 +43,8 @@ Based on the specific design of user tasks, our experiments were primarily categ
- ``heterogeneous experiments`` aim to evaluate the performance of identifying and reusing helpful heterogeneous learnwares in situations where
no available learnwares match the feature space of the user's task. This helps to highlight the potential of learnwares for applications beyond their original purpose.

Homo Experiments
-----------------------
Homogeneous Tabular Dataset
-----------------------------

In homogeneous experiments, the 55 stores in the Corporacion dataset are considered as 55 users. Each store uses the same feature engineering method
and their own test set as user data. These users then search for and reuse homogeneous learnwares in the market which exactly match the feature spaces of their tasks.
@@ -52,17 +52,20 @@ and their own test set as user data. These users then search for and reuse homog
The Mean Squared Error (MSE) of search and reuse across all users is presented in the table below:

+-----------------------------------+---------------------+
| Setting                           | MSE                 |
+===================================+=====================+
| Mean in Market (Single)           | 0.331               |
+-----------------------------------+---------------------+
| Best in Market (Single)           | 0.151               |
+-----------------------------------+---------------------+
| Top-1 Reuse (Single)              | 0.280               |
+-----------------------------------+---------------------+
| Job Selector Reuse (Multiple)     | 0.274               |
+-----------------------------------+---------------------+
| Average Ensemble Reuse (Multiple) | 0.267               |
+-----------------------------------+---------------------+


When users have both test data and limited training data derived from their original data, reusing single or multiple searched learnwares from the market can often yield
better results than training models from scratch on limited training data. We present the change curves in MSE for the user's self-trained model, as well as for the Feature Augmentation single learnware reuse method and the Ensemble Pruning multiple learnware reuse method.
These curves display their performance on the user's test data as the amount of labeled training data increases.
@@ -76,8 +79,8 @@ The figure clearly shows that when users have limited training data, reusing sin
This highlights the advantage of reusing learnwares in substantially reducing the need for large training datasets and achieving better outcomes with restricted user training data.


Hetero Experiments
-------------------------
Heterogeneous Tabular Dataset
------------------------------

In heterogeneous experiments, the learnware market would recommend helpful heterogeneous learnwares with different feature spaces with
the user tasks. Based on whether there are learnwares in the market that handle tasks similar to the user's task, the experiments can be further subdivided into the following two types:
@@ -91,6 +94,8 @@ we tested various heterogeneous learnware reuse methods (without using user's la
The average MSE performance across 41 users are as follows:

+-----------------------------------+---------------------+
| Setting                           | MSE                 |
+===================================+=====================+
| Mean in Market (Single)           | 1.459               |
+-----------------------------------+---------------------+
| Best in Market (Single)           | 1.226               |
@@ -122,35 +127,36 @@ The average results across 10 users are depicted in the figure below:
We can observe that heterogeneous learnwares are beneficial when there's a limited amount of the user's labeled training data available,
aiding in better alignment with the user's specific task. This underscores the potential of learnwares to be applied to tasks beyond their original purpose.

Image Experiment
====================
Image Data Experiment
=========================

For the CIFAR-10 dataset, we sampled the training set unevenly by category and constructed unbalanced training datasets for the 50 learnwares that contained only some of the categories. This makes it unlikely that there exists any learnware in the learnware market that can accurately handle all categories of data; only the learnware whose training data is closest to the data distribution of the target task is likely to perform well on the target task. Specifically, the probability of each category being sampled obeys a random multinomial distribution, with a non-zero probability of sampling on only 4 categories, and the sampling ratio is 0.4: 0.4: 0.1: 0.1. Ultimately, the training set for each learnware contains 12,000 samples covering the data of 4 categories in CIFAR-10.

We constructed 50 target tasks using data from the test set of CIFAR-10. Similar to constructing the training set for the learnwares, in order to allow for some variation between tasks, we sampled the test set unevenly. Specifically, the probability of each category being sampled obeys a random multinomial distribution, with non-zero sampling probability on 6 categories, and the sampling ratio is 0.3: 0.3: 0.1: 0.1: 0.1: 0.1. Ultimately, each target task contains 3000 samples covering the data of 6 categories in CIFAR-10.

With this experimental setup, we evaluated the performance of RKME Image by calculating the mean accuracy across all users.
With this experimental setup, we evaluated the performance of RKME Image using 1 - Accuracy as the loss.

+-----------------------------------+---------------------+
| Setting                           | 1 - Accuracy        |
+===================================+=====================+
| Mean in Market (Single)           | 0.655               |
+-----------------------------------+---------------------+
| Best in Market (Single)           | 0.304               |
+-----------------------------------+---------------------+
| Top-1 Reuse (Single)              | 0.406               |
+-----------------------------------+---------------------+
| Job Selector Reuse (Multiple)     | 0.406               |
+-----------------------------------+---------------------+
| Average Ensemble Reuse (Multiple) | 0.310               |
+-----------------------------------+---------------------+

In some specific settings, the user will have a small number of labeled samples. In such settings, learning the weight of selected learnwares on a limited number of labeled samples can result in a better performance than training directly on a limited number of labeled samples.
In some specific settings, the user will have a small number of labelled samples. In such settings, learning the weight of selected learnwares on a limited number of labelled samples can result in a better performance than training directly on a limited number of labelled samples.

.. image:: ../_static/img/image_labeled.svg
:align: center


Text Experiment
====================
Text Data Experiment
==========================

Datasets
------------------
@@ -177,6 +183,8 @@ Results
The table below presents the mean accuracy of search and reuse across all users:

+-----------------------------------+---------------------+
| Setting                           | Accuracy            |
+===================================+=====================+
| Mean in Market (Single)           | 0.507               |
+-----------------------------------+---------------------+
| Best in Market (Single)           | 0.859               |
@@ -199,17 +207,23 @@ We present the change curves in classification error rates for both the user's s

From the figure above, it is evident that when the user's own training data is limited, the performance of multiple learnware reuse surpasses that of the user's own model. As the user's training data grows, it is expected that the user's model will eventually outperform the learnware reuse. This underscores the value of reusing learnware to significantly conserve training data and achieve superior performance when user training data is limited.


Get Started Examples
=========================
We utilize the `fire` module to construct our experiments.

Text Examples
------------------
Examples for `Text` are available at [examples/dataset_text_workflow]. You can execute the experiment with the following commands:

* `python workflow.py unlabeled_text_example`: Run the unlabeled_text_example experiment. The results will be printed in the terminal.
* `python workflow.py labeled_text_example`: Run the labeled_text_example experiment. The result curves will be automatically saved in the `figs` directory.

Image Examples
------------------
Examples for `Image` are available at [examples/dataset_image_workflow]. You can execute the experiment with the following command, which runs both the unlabeled_image_example and labeled_image_example experiments (results are printed in the terminal and curves saved in the `figs` directory):

.. code-block:: bash

    python workflow.py image_example

+ 19
- 26
docs/start/install.rst

@@ -4,50 +4,43 @@ Installation Guide
========================


``Learnware Market`` Installation
=================================
``learnware`` Package Installation
===================================
.. note::

``Learnware Market`` supports `Windows`, `Linux` and `Macos`. It's recommended to use ``Learnware Market`` in `Linux`. ``Learnware Market`` supports Python3, which is up to Python3.8.
The ``learnware`` package supports `Windows` and `Linux`. It's recommended to use ``Learnware`` in `Linux`. ``Learnware`` supports Python 3, up to Python 3.11.

Users can easily install ``Learnware Market`` by pip according to the following command:
Users can easily install ``Learnware`` by pip according to the following command:

.. code-block:: bash

    pip install learnware

In the ``learnware`` package, besides the base classes, many core functionalities such as "learnware specification generation" and "learnware deployment" rely on the ``torch`` library. Users have the option to manually install ``torch``, or they can directly use the following command to install the ``learnware`` package:

.. code-block:: bash

    pip install learnware[full]

.. note::
    However, it's crucial to note that due to the potential complexity of the user's local environment, installing ``learnware[full]`` does not guarantee that ``torch`` will successfully invoke ``CUDA`` in the user's local setting.

Install ``learnware`` Package From Source
==========================================

Also, users can install ``Learnware`` from the source code according to the following steps:

- Enter the root directory of ``Learnware``, in which the file ``setup.py`` exists.
- Then, please execute the following command to install the environment dependencies and install ``Learnware``:

.. code-block:: bash

    $ git clone https://github.com/Learnware-LAMDA/Learnware.git && cd Learnware
    $ pip install -e .[dev]

.. note::
    It's recommended to use anaconda/miniconda to setup the environment. You can also run ``pip install -e .[full, dev]`` to install ``torch`` automatically.

Use the following code to make sure the installation is successful:
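
The verification snippet itself lies outside this hunk; a minimal check might look like the following (assuming the package exposes ``__version__``):

.. code-block:: python

    import learnware

    print(learnware.__version__)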



+ 42
- 49
docs/start/intro.rst

@@ -3,61 +3,36 @@
Introduction
================

``Learnware`` is a model sharing platform, which give a basic implementation of the learnware paradigm. A learnware is a well-performed trained machine learning model with a specification that enables it to be adequately identified to reuse according to the requirement of future users who may know nothing about the learnware in advance. The learnware paradigm can solve entangled problems in the current machine learning paradigm, like continual learning and catastrophic forgetting. It also reduces resources for training a well-performed model.
The *learnware* paradigm, proposed by Professor Zhi-Hua Zhou in 2016 [1, 2], aims to build a vast model platform system, i.e., a *learnware dock system*, which systematically accommodates and organizes models shared by machine learning developers worldwide, and can efficiently identify and assemble existing helpful model(s) to solve future tasks in a unified way.

The ``learnware`` package provides a fundamental implementation of the central concepts and procedures within the learnware paradigm. Its well-structured design ensures high scalability and facilitates the seamless integration of additional features and techniques in the future.

Motivation
=================
In addition, the ``learnware`` package serves as the engine for the `Beimingwu System <https://bmwu.cloud/#/>`_ and can be effectively employed for conducting experiments related to learnware.

.. image:: ../_static/img/learnware_paradigm.jpg
:align: center
| [1] Zhi-Hua Zhou. Learnware: on the future of machine learning. *Frontiers of Computer Science*, 2016, 10(4): 589–590
| [2] Zhi-Hua Zhou. Machine Learning: Development and Future. *Communications of CCF*, 2017, vol.13, no.1 (2016 CNCC keynote)

Machine learning, especially the prevailing big model paradigm, has achieved great success in natural language processing and computer vision applications. However, it still faces challenges such as the requirement of a large amount of labeled training data, difficulty in adapting to changing environments, and catastrophic forgetting when refining trained models incrementally. These big models, while useful in their targeted tasks, often fail to address the above issues and struggle to generalize beyond their specific purposes.

To better address the entangled issues in machine learning, we should consider the following aspects:

+------------------------------------------------------------------------------------+
| Aspect |
+====================================================================================+
| 1. Investigate techniques that address multiple challenges simultaneously, |
| recognizing that these issues are often intertwined in real-world applications. |
+------------------------------------------------------------------------------------+
| 2. Explore paradigms like learnware, which offers the possibility of |
| systematically reusing small models for tasks beyond their original purposes, |
| reducing the need for users to build models from scratch. |
+------------------------------------------------------------------------------------+
| 3. Develop solutions that enable ordinary users to create well-performing models |
| without requiring proficient training skills. |
+------------------------------------------------------------------------------------+
| 4. Address data privacy and proprietary concerns to facilitate experience |
| sharing among different users while respecting confidentiality. |
+------------------------------------------------------------------------------------+
| 5. Adapt to the constraints of big data applications, where it may be |
| unaffordable or infeasible to hold all data for multiple passes of scanning. |
+------------------------------------------------------------------------------------+
| 6. Consider the environmental impact of training large models, as their carbon |
| emissions pose a threat to our environment. |
+------------------------------------------------------------------------------------+

By considering these factors, we can develop a more comprehensive framework for tackling the complex challenges in machine learning, moving beyond the limitations of the big model paradigm, called Learnware.



Framework
=======================

.. image:: ../_static/img/learnware_market.jpg
:align: center
What is Learnware?
====================

A learnware consists of a high-performance machine learning model and specifications that characterize the model, i.e., "Learnware = Model + Specification".

The learnware paradigm introduces the concept of a well-performed, trained machine learning model with a specification that allows future users, who have no prior knowledge of the learnware, to reuse it based on their requirements.
The learnware specification consists of "semantic specification" and "statistical specification":

Developers or owners of trained machine learning models can submit their models to a learnware market. If accepted, the market assigns a specification to the model and accommodates it. The learnware market could host thousands or millions of well-performed models from different developers, for various tasks, using diverse data, and optimizing different objectives.
- ``Semantic Specification``: Describe the type and functionality of the model through text.
- ``Statistical Specification``: Characterize the statistical information contained in the model using various machine learning techniques.

Instead of building a model from scratch, users can submit their requirements to the learnware market, which then identifies and deploys helpful learnware(s) based on the specifications. Users can apply the learnware directly, adapt it using their data, or exploit it in other ways to improve their model. This process is more efficient and less expensive than building a model from scratch.
Learnware specifications describe the model's capabilities, enabling the model to be identified and reused by future users who may know nothing about the learnware in advance.

Why do we need Learnware?
============================

Benefits of the Learnware Paradigm
==============================================
The Benefits of Learnware Paradigm
-------------------------------------

Machine learning has achieved great success in many fields but still faces various challenges, such as the need for extensive training data and advanced training techniques, the difficulty of continuous learning, the risk of catastrophic forgetting, and the leakage of data privacy.

Although there are many efforts focusing on one of these issues separately, they are entangled, and solving one problem may exacerbate others. The learnware paradigm aims to address many of these challenges through a unified framework.

+-----------------------+-----------------------------------------------------------------------------------------------+
| Benefit | Description |
@@ -83,11 +58,29 @@ Benefits of the Learnware Paradigm
| | large models and the carbon footprint. |
+-----------------------+-----------------------------------------------------------------------------------------------+

Challenges and Future Work
==============================================
How to Solve Future Tasks with Learnware Paradigm?
----------------------------------------------------

.. image:: ../_static/img/learnware_paradigm.jpg
:align: center

Although the learnware proposal shows promise, much work remains to make it a reality. The next sections will present some of the progress made so far.
Instead of building a model from scratch, users can submit their requirements to the learnware market, which then identifies and deploys helpful learnware(s) based on the specifications. Users can apply the learnware directly, adapt it using their data, or exploit it in other ways to improve their models. This process is more efficient and less expensive than building a model from scratch.


Procedure of Learnware Paradigm
==================================
- ``Submitting Stage``: Developers voluntarily submit various learnwares to the learnware market, and the system conducts quality checks and further organization of these learnwares.
- ``Deploying Stage``: When users submit task requirements, the learnware market automatically selects whether to recommend a single learnware or a combination of multiple learnwares and provides efficient deployment methods. Whether it's a single learnware or a combination of multiple learnwares, the system offers convenient learnware reuse interfaces.

.. image:: ../_static/img/learnware_market.svg
:align: center


Learnware Package Design
==========================

.. image:: ../_static/img/learnware_framework.svg
:align: center

At the workflow level, the ``learnware`` package consists of the ``Submitting Stage`` and the ``Deploying Stage``.
At the module level, the ``learnware`` package is a platform that consists of the above components. The components are designed as loosely coupled modules, and each component can be used stand-alone.

+ 78
- 101
docs/start/quick.rst

@@ -7,90 +7,44 @@ Quick Start
Introduction
====================

This ``Quick Start`` guide aims to illustrate the straightforward process of establishing a full ``Learnware Market`` workflow
and utilizing ``Learnware Market`` to handle user tasks.
This ``Quick Start`` guide aims to illustrate the straightforward process of establishing a full ``Learnware`` workflow
and utilizing ``Learnware`` to handle user tasks.


Installation
====================

Learnware is currently hosted on `PyPI <https://pypi.org/>`__. You can easily intsall ``learnware`` by following these steps:
Learnware is currently hosted on `PyPI <https://pypi.org/>`_. You can easily install ``Learnware`` by following these steps:

.. code-block:: bash

    pip install learnware

In the ``learnware`` package, besides the base classes, many core functionalities such as "learnware specification generation" and "learnware deployment" rely on the ``torch`` library. Users have the option to manually install ``torch``, or they can directly use the following command to install the ``learnware`` package:

.. code-block:: bash

    pip install learnware[full]

.. note::
    However, it's crucial to note that due to the potential complexity of the user's local environment, installing ``learnware[full]`` does not guarantee that ``torch`` will successfully invoke ``CUDA`` in the user's local setting.

Prepare Learnware
====================

In the ``learnware`` package, each learnware is encapsulated in a ``zip`` package, which should contain at least the following four files:

- ``learnware.yaml``: learnware configuration file.
- ``__init__.py``: methods for using the model.
- ``stat.json``: the statistical specification of the learnware. Its filename can be customized and recorded in learnware.yaml.
- ``environment.yaml`` or ``requirements.txt``: specifies the environment for the model.

To facilitate the construction of a learnware, we provide a `Learnware Template <https://www.bmwu.cloud/static/learnware-template.zip>`_ that users can use as a basis for building their own learnware. We've also detailed the format of the learnware ``zip`` package in `Learnware Preparation <../workflows/upload.html#prepare-learnware>`_.

Learnware Market Workflow
Learnware Package Workflow
============================

Users can start a ``Learnware Market`` workflow according to the following steps:
Users can start a ``Learnware`` workflow according to the following steps:

Initialize a Learnware Market
-------------------------------
@@ -100,11 +54,10 @@ You can initialize a basic ``Learnware Market`` named "demo" using the code snip

.. code-block:: python

    import learnware
    from learnware.market import instantiate_learnware_market

    learnware.init()

    # instantiate a demo market
    demo_market = instantiate_learnware_market(market_id="demo", name="easy", rebuild=True)


Upload Learnware
@@ -114,28 +67,30 @@ Before uploading your learnware to the ``Learnware Market``,
you'll need to create a semantic specification, ``semantic_spec``. This involves selecting or inputting values for predefined semantic tags
to describe the features of your task and model.

For instance, the dictionary snippet below illustrates the semantic specification for a Scikit-Learn type model.
This model is tailored for business scenarios and performs classification tasks on tabular data:
For instance, the following code illustrates the semantic specification for a Scikit-learn type model.
This model is tailored for education scenarios and performs classification tasks on tabular data:

.. code-block:: python

    from learnware.specification import generate_semantic_spec

    semantic_spec = generate_semantic_spec(
        name="demo_learnware",
        data_type="Table",
        task_type="Classification",
        library_type="Scikit-learn",
        scenarios="Education",
        license="MIT",
    )

After defining the semantic specification,
you can upload your learnware using a single line of code:

.. code-block:: python

    demo_market.add_learnware(zip_path, semantic_spec)

Here, ``zip_path`` is the path of your learnware ``zip`` package.


Semantic Specification Search
@@ -150,10 +105,11 @@ The ``Learnware Market`` will then perform an initial search using ``user_semant
user_info = BaseUserInfo(id="user", semantic_spec=semantic_spec)

# search_learnware: performs semantic specification search when user_info doesn't include a statistical specification
search_result = demo_market.search_learnware(user_info)
single_result = search_result.get_single_results()

# single_result: the List of Tuple[Score, Learnware] returned by semantic specification search
print(single_result)

Statistical Specification Search
@@ -176,43 +132,64 @@ For example, the code below executes learnware search when using Reduced Set Ker
user_info = BaseUserInfo(
    semantic_spec=user_semantic, stat_info={"RKMETableSpecification": user_spec}
)
search_result = demo_market.search_learnware(user_info)

single_result = search_result.get_single_results()
multiple_result = search_result.get_multiple_results()

# search_item.score: based on MMD distances, sorted in descending order
# search_item.learnware.id: id of learnwares, sorted by scores in descending order
for search_item in single_result:
    print(f"score: {search_item.score}, learnware_id: {search_item.learnware.id}")

# mixture_item.learnwares: collection of learnwares whose combined use is beneficial
# mixture_item.score: score assigned to the combined set of learnwares in `mixture_item.learnwares`
for mixture_item in multiple_result:
    print(f"mixture_score: {mixture_item.score}\n")
    mixture_id = " ".join([learnware.id for learnware in mixture_item.learnwares])
    print(f"mixture_learnware: {mixture_id}\n")


Reuse Learnwares
-------------------------------

With the list of learnwares ``mixture_item.learnwares`` returned from the previous step, you can readily apply them to make predictions on your own data, bypassing the need to train a model from scratch.
We provide two methods for reusing a given list of learnwares: ``JobSelectorReuser`` and ``AveragingReuser``.
Just substitute ``test_x`` in the code snippet below with your own testing data, and you're all set to reuse learnwares:

.. code-block:: python

    from learnware.reuse import JobSelectorReuser, AveragingReuser

    # use the job selector reuser to reuse the searched learnwares to make predictions
    reuse_job_selector = JobSelectorReuser(learnware_list=mixture_item.learnwares)
    job_selector_predict_y = reuse_job_selector.predict(user_data=test_x)

    # use the averaging ensemble reuser to reuse the searched learnwares to make predictions
    reuse_ensemble = AveragingReuser(learnware_list=mixture_item.learnwares)
    ensemble_predict_y = reuse_ensemble.predict(user_data=test_x)


We also provide two methods for when the user has labeled data for reusing a given list of learnwares: ``EnsemblePruningReuser`` and ``FeatureAugmentReuser``.
Just substitute ``test_x`` in the code snippet below with your own testing data, substitute ``train_X, train_y`` with your own labeled training data, and you're all set to reuse learnwares:

.. code-block:: python

    from learnware.reuse import EnsemblePruningReuser, FeatureAugmentReuser

    # use the ensemble pruning reuser to reuse the searched learnwares to make predictions
    reuse_ensemble = EnsemblePruningReuser(learnware_list=mixture_item.learnwares, mode="classification")
    reuse_ensemble.fit(train_X, train_y)
    ensemble_pruning_predict_y = reuse_ensemble.predict(user_data=test_x)

    # use the feature augment reuser to reuse the searched learnwares to make predictions
    reuse_feature_augment = FeatureAugmentReuser(learnware_list=mixture_item.learnwares, mode="classification")
    reuse_feature_augment.fit(train_X, train_y)
    feature_augment_predict_y = reuse_feature_augment.predict(user_data=test_x)

Auto Workflow Example
============================

The ``Learnware Market`` also offers an automated workflow example.
The ``learnware`` package also offers automated workflow examples.
This includes preparing learnwares, uploading and deleting learnwares from the market, and searching for learnwares using both semantic and statistical specifications.
To experience the basic workflow of the Learnware Market, users can run [workflow code link].
To experience the basic workflow of the Learnware Market, please refer to `Learnware Examples <https://github.com/Learnware-LAMDA/Learnware/tree/main/examples>`_.

+ 24
- 2
docs/workflows/reuse.rst

@@ -132,5 +132,27 @@ combine ``HeteroMapAlignLearnware`` with the homogeneous reuse methods ``Averagi
reuse_ensemble.fit(val_x, val_y)
ensemble_pruning_predict_y = reuse_ensemble.predict(user_data=test_x)
Reuse with Container
=====================
Reuse with ``Model Container``
================================
The ``learnware`` package provides a ``Model Container`` that builds an execution environment for each learnware according to its runtime dependency files. The learnware's model is executed inside the container, and its environment is installed and uninstalled automatically.
Run the following code to try running a learnware with a ``Model Container``:
.. code-block:: python
    import numpy as np

    from learnware.learnware import Learnware
    from learnware.client.container import LearnwaresContainer

    # let `learnware` be an instance of the Learnware class whose model takes 204-dimensional inputs
    with LearnwaresContainer(learnware, mode="conda") as env_container:
        learnware = env_container.get_learnwares_with_container()[0]
        input_array = np.random.random(size=(20, 204))
        print(learnware.predict(input_array))
The ``mode`` parameter has two options, each corresponding to a specific way of loading the learnware environment:
- ``'conda'``: Install a separate conda virtual environment for each learnware (automatically deleted after execution); run each learnware independently within its virtual environment.
- ``'docker'``: Install a conda virtual environment inside a Docker container (automatically destroyed after execution); run each learnware independently within the container (requires Docker privileges).
.. note::
    The ``conda`` mode is not secure if there are malicious learnwares. If you cannot guarantee the security of the learnwares you want to load, it's recommended to use the ``docker`` mode.
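For example, switching to the more isolated ``docker`` mode only requires changing the ``mode`` argument. Below is a minimal sketch, assuming ``learnware`` is the same ``Learnware`` instance as above and Docker privileges are available:

.. code-block:: python

    import numpy as np
    from learnware.client.container import LearnwaresContainer

    with LearnwaresContainer(learnware, mode="docker") as env_container:
        docker_learnware = env_container.get_learnwares_with_container()[0]
        print(docker_learnware.predict(np.random.random(size=(20, 204))))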

+1 -1 docs/workflows/search.rst

@@ -51,7 +51,7 @@ Hetero Search

For table-based user tasks,
homogeneous searchers like ``EasySearcher`` fail to recommend learnwares when no table learnware matches the user task's feature dimension, returning empty results.
To enhance functionality, the ``learnware`` package includes a heterogeneous learnware search feature, which proceeds as follows:

- Learnware markets such as ``Hetero Market`` integrate different specification islands into a unified "specification world" by assigning system-level specifications to all learnwares. This allows heterogeneous searchers like ``HeteroSearcher`` to find helpful learnwares from all available table learnwares.
- Searchers assign system-level specifications to users based on ``UserInfo``'s statistical specification, using methods provided by corresponding organizers. In ``Hetero Market``, for example, ``HeteroOrganizer.generate_hetero_map_spec`` generates system-level specifications for users.
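As a minimal sketch of how a user triggers this heterogeneous search (assuming ``train_x`` holds the user's tabular data; the market instantiation follows the upload examples later in this document):

.. code-block:: python

    from learnware.market import BaseUserInfo, instantiate_learnware_market
    from learnware.specification import generate_stat_spec

    # a "hetero" market assigns system-level specifications to all table learnwares
    hetero_market = instantiate_learnware_market(market_id="demo", name="hetero", rebuild=False)

    # describe the user task with a statistical specification and search
    user_spec = generate_stat_spec(type="table", X=train_x)
    user_info = BaseUserInfo(stat_info={user_spec.type: user_spec})
    search_result = hetero_market.search_learnware(user_info)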


+196 -102 docs/workflows/upload.rst

@@ -1,180 +1,274 @@
.. _submit:
==========================================
Learnware Preparation and Uploading
==========================================

In this section, we provide a comprehensive guide on submitting your custom learnware to the ``Learnware Market``.
We will first discuss the necessary components of a valid learnware, followed by a detailed explanation of how to upload and remove learnwares within the ``Learnware Market``.


Prepare Learnware
====================================

In the ``learnware`` package, each learnware is encapsulated in a ``zip`` package, which should contain at least the following four files:

- ``learnware.yaml``: learnware configuration file.
- ``__init__.py``: methods for using the model.
- ``stat.json``: the statistical specification of the learnware. Its filename can be customized and recorded in ``learnware.yaml``.
- ``environment.yaml`` or ``requirements.txt``: specifies the environment for the model.

To facilitate the construction of a learnware, we provide a `Learnware Template <https://www.bmwu.cloud/static/learnware-template.zip>`_ that you can use as a basis for building your own learnware.

Next, we will provide detailed explanations for the content of these four files.

Model Invocation File ``__init__.py``
-------------------------------------

To ensure that the uploaded learnware can be used by subsequent users, you need to provide interfaces for model fitting ``fit(X, y)``, prediction ``predict(X)``, and fine-tuning ``finetune(X, y)`` in ``__init__.py``. Among these interfaces, only the ``predict(X)`` interface is mandatory; the others depend on the functionality of your model.

Below is a reference template for the ``__init__.py`` file. Please make sure that the input parameter format (the number of parameters and parameter names) for each interface in your model invocation file matches the template below.

.. code-block:: python

    import os
    import pickle
    import numpy as np
    from learnware.model import BaseModel


    class MyModel(BaseModel):
        def __init__(self):
            super(MyModel, self).__init__(input_shape=(37,), output_shape=(1,))
            dir_path = os.path.dirname(os.path.abspath(__file__))
            model_path = os.path.join(dir_path, "model.pkl")
            with open(model_path, "rb") as f:
                self.model = pickle.load(f)

        def fit(self, X: np.ndarray, y: np.ndarray):
            self.model = self.model.fit(X, y)

        def predict(self, X: np.ndarray) -> np.ndarray:
            return self.model.predict(X)

        def finetune(self, X: np.ndarray, y: np.ndarray):
            pass
Please ensure that the ``MyModel`` class inherits from ``BaseModel`` in the ``learnware.model`` module, and specify the class name (e.g., ``MyModel``) in the ``learnware.yaml`` file later.

Input and Output Dimensions
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

``input_shape`` and ``output_shape`` represent the input and output dimensions of the model, respectively. You can refer to the following guidelines when filling them out:
- ``input_shape`` specifies a single input sample's dimension, and ``output_shape`` refers to the model's output dimension for a single sample.
- When the data type being processed is text data, there are no specific requirements for the value of ``input_shape``, and it can be filled in as ``None``.
- When the ``output_shape`` corresponds to tasks with variable outputs (such as object detection, text segmentation, etc.), there are no specific requirements for the value of ``output_shape``, and it can be filled in as ``None``.
- For classification tasks, ``output_shape`` should be (1, ) if the model directly outputs predicted labels, and the sample labels need to start from 0. If the model outputs logits, ``output_shape`` should be specified as the number of classes, i.e., (class_num, ).
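For illustration, here is a hypothetical sketch of these settings for a 10-class tabular classifier that outputs logits (``LogitModel`` and all concrete numbers are assumptions, not part of the template above):

.. code-block:: python

    import numpy as np
    from learnware.model import BaseModel

    class LogitModel(BaseModel):
        def __init__(self):
            # one 37-dimensional sample in, a vector of 10 class logits out
            super(LogitModel, self).__init__(input_shape=(37,), output_shape=(10,))

        def fit(self, X: np.ndarray, y: np.ndarray):
            pass

        def predict(self, X: np.ndarray) -> np.ndarray:
            return np.zeros((len(X), 10))  # placeholder logits for illustration

        def finetune(self, X: np.ndarray, y: np.ndarray):
            pass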

File Path
^^^^^^^^^^^^^^^^^^
If you need to load certain files within the zip package in the ``__init__.py`` file (and any other Python files that may be involved), please follow the method shown in the template above about obtaining the ``model_path``:
- First, obtain the root directory path of the entire package by getting ``dir_path``.
- Then, based on the specific file's relative location within the package, obtain the specific file's path, ``model_path``.
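For example, loading a hypothetical ``config.json`` bundled in the ``zip`` package would look like this (``config.json`` is an assumed file name for illustration):

.. code-block:: python

    import os

    # resolve a bundled file relative to this __init__.py
    dir_path = os.path.dirname(os.path.abspath(__file__))
    config_path = os.path.join(dir_path, "config.json")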

Module Imports
^^^^^^^^^^^^^^^^^^
Please note that module imports between Python files within the zip package should be done using **relative imports**. For instance:

.. code-block:: python

    from .package_name import *
    from .package_name import module_name


Learnware Statistical Specification ``stat.json``
---------------------------------------------------

A learnware consists of a model and a specification. Therefore, after preparing the model, you need to generate a statistical specification for it. Specifically, using the previously installed ``learnware`` package, you can use the training data ``train_x`` (supported types include numpy.ndarray, pandas.DataFrame, and torch.Tensor) as input to generate the statistical specification of the model.

Here is an example of the code:

.. code-block:: python

    from learnware.specification import generate_stat_spec

    data_type = "table"  # Data types: ["table", "image", "text"]
    spec = generate_stat_spec(type=data_type, X=train_x)
    spec.save("stat.json")

It's worth noting that the above code only runs on your local computer and does not interact with any cloud servers or leak any local private data.

Additionally, if the model's training data is too large, causing the above code to fail, you can consider sampling the training data to ensure it's of a suitable size before generating the specification.
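A minimal sketch of such subsampling, assuming ``train_x`` is a ``numpy.ndarray`` of table data and using an arbitrary cap of 50,000 samples:

.. code-block:: python

    import numpy as np
    from learnware.specification import generate_stat_spec

    # subsample the training data before generating the specification
    n_samples = min(len(train_x), 50000)
    idx = np.random.choice(len(train_x), size=n_samples, replace=False)
    spec = generate_stat_spec(type="table", X=train_x[idx])
    spec.save("stat.json")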

Learnware Configuration File ``learnware.yaml``
-------------------------------------------------
This file is used to specify the class name (``MyModel``) in the model invocation file ``__init__.py``, the module used to generate the statistical specification (``learnware.specification``), the category of the statistical specification (``RKMETableSpecification``), and its specific filename (``stat.json``):

.. code-block:: yaml

    model:
      class_name: MyModel
      kwargs: {}
    stat_specifications:
      - module_path: learnware.specification
        class_name: RKMETableSpecification
        file_name: stat.json
        kwargs: {}

Please note that the statistical specification class names for the data types ``['table', 'image', 'text']`` are ``RKMETableSpecification``, ``RKMEImageSpecification``, and ``RKMETextSpecification``, respectively.

Model Runtime Dependency File
--------------------------------------------

To ensure that your uploaded learnware can be used by other users, the ``zip`` package of the uploaded learnware should specify the model's runtime dependencies. The Beimingwu System supports the following two ways to specify runtime dependencies:
- Provide an ``environment.yaml`` file supported by ``conda``.
- Provide a ``requirements.txt`` file supported by ``pip``.

You can choose either method, but please try to remove unnecessary dependencies to keep the dependency list as minimal as possible.

Using ``environment.yaml`` File
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

You can export the ``environment.yaml`` file directly from the ``conda`` virtual environment using the following commands:

- For Linux and macOS systems:

.. code-block:: bash

    conda env export | grep -v "^prefix: " > environment.yaml

- For Windows systems:

.. code-block:: bash

    conda env export | findstr /v "^prefix: " > environment.yaml
Note that the ``environment.yaml`` file in the ``zip`` package needs to be encoded in ``UTF-8`` format. Please check its encoding after running the command above: due to differences in ``conda`` versions and systems, you may not get a ``UTF-8`` encoded file (e.g., you may get a ``UTF-16LE`` encoded file). In that case, you'll need to manually convert the file to ``UTF-8``, which most text editors support. The following ``Python`` code for encoding conversion is also provided for reference:

.. code-block:: python

    import codecs

    # Read the output file from the 'conda env export' command
    # Assuming the file name is environment.yaml and the export format is UTF-16LE
    with codecs.open('environment.yaml', 'r', encoding='utf-16le') as file:
        content = file.read()

    # Convert the content to UTF-8 encoding
    output_content = content.encode('utf-8')

    # Write to UTF-8 encoded file
    with open('environment.yaml', 'wb') as file:
        file.write(output_content)

Additionally, due to the complexity of users' local ``conda`` virtual environments, you can execute the following command before uploading to confirm that there are no dependency conflicts in the ``environment.yaml`` file:

.. code-block:: bash

    conda env create --name test_env --file environment.yaml

The above command creates a virtual environment based on the ``environment.yaml`` file; if it succeeds, there are no dependency conflicts. You can delete the created virtual environment with the following command:

.. code-block:: bash

    conda env remove --name test_env

Using ``requirements.txt`` File
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The ``requirements.txt`` file should list the packages required for running the ``__init__.py`` file and their specific versions. You can obtain these version details by executing the ``pip show <package_name>`` or ``conda list <package_name>`` command. Here is an example file:

.. code-block:: text

    numpy==1.23.5
    scikit-learn==1.2.2

Manually listing these dependencies can be cumbersome, so you can also use the ``pipreqs`` package to automatically scan your entire project and export the packages used along with their specific versions (though some manual verification may be required):

.. code-block:: bash

    pip install pipreqs
    pipreqs ./  # run this command in the project's root directory

Please note that if you use the ``requirements.txt`` file to specify runtime dependencies, the system will by default install these dependencies in a ``conda`` virtual environment running ``Python 3.8`` during the learnware deployment.

Furthermore, for version-sensitive packages like ``torch``, it's essential to specify package versions in the ``requirements.txt`` file to ensure successful deployment of the uploaded learnware on other machines.
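For instance, a version-pinned ``requirements.txt`` might look like this (the version numbers below are purely illustrative):

.. code-block:: text

    torch==1.13.1
    numpy==1.23.5
    scikit-learn==1.2.2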

Upload Learnware
==================================

After preparing the four required files mentioned above, you can bundle them into your own learnware ``zip`` package.
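As a minimal sketch (assuming the four files sit in the current directory and the learnware uses a ``requirements.txt``), the ``zip`` package can be produced with Python's standard ``zipfile`` module:

.. code-block:: python

    import zipfile

    # bundle the four required files into a learnware zip package
    with zipfile.ZipFile("learnware.zip", "w", compression=zipfile.ZIP_DEFLATED) as zf:
        for fname in ["__init__.py", "learnware.yaml", "stat.json", "requirements.txt"]:
            zf.write(fname, arcname=fname)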

Prepare Semantic Specification
-------------------------------

The semantic specification succinctly describes the features of your task and model. To upload the learnware ``zip`` package, you need to prepare a semantic specification. Here is an example for "Table Data" on a "Classification Task":

.. code-block:: python

    from learnware.specification import generate_semantic_spec

    # Prepare input description when data_type="Table"
    input_description = {
        "Dimension": 5,
        "Description": {
            "0": "age",
            "1": "weight",
            "2": "body length",
            "3": "animal type",
            "4": "claw length",
        },
    }

    # Prepare output description when task_type in ["Classification", "Regression"]
    output_description = {
        "Dimension": 3,
        "Description": {
            "0": "cat",
            "1": "dog",
            "2": "bird",
        },
    }

    # Create semantic specification
    semantic_spec = generate_semantic_spec(
        name="learnware_example",
        description="Just an example for uploading learnware",
        data_type="Table",
        task_type="Classification",
        library_type="Scikit-learn",
        scenarios=["Business", "Financial"],
        input_description=input_description,
        output_description=output_description,
    )

For more details, please refer to :ref:`semantic specification<components/spec:Semantic Specification>`.

Uploading
--------------

You can effortlessly upload your learnware to the ``Learnware Market`` as follows:
.. code-block:: python

    from learnware.market import BaseChecker, instantiate_learnware_market

    # instantiate a demo market
    demo_market = instantiate_learnware_market(market_id="demo", name="hetero", rebuild=True)

    # upload the learnware into the market
    learnware_id, learnware_status = demo_market.add_learnware(zip_path, semantic_spec)

    # check whether the learnware passed the check and was uploaded successfully
    assert learnware_status != BaseChecker.INVALID_LEARNWARE, "Insert learnware failed!"

Here, ``zip_path`` refers to the path of your learnware ``zip`` package, ``learnware_id`` is the id assigned by the ``Learnware Market``, and ``learnware_status`` indicates the check status of the learnware.

.. note::
    The learnware ``zip`` package uploaded into the ``LearnwareMarket`` will be checked semantically and statistically, and ``add_learnware`` will return the concrete check status. The check status ``BaseChecker.INVALID_LEARNWARE`` indicates that the learnware did not pass the check. For more details about the learnware checker, please refer to `Learnware Market <../components/market.html#easy-checker>`_.

Remove Learnware
==================


+18 -3 examples/dataset_image_workflow/README.md

@@ -2,9 +2,18 @@

## Introduction

We conducted experiments on the widely used image benchmark dataset: [``CIFAR-10``](https://www.cs.toronto.edu/~kriz/cifar.html).
The ``CIFAR-10`` dataset consists of 60000 32x32 color images in 10 classes, with 6000 images per class. There are 50000 training images and 10000 test images. The 10 different classes represent airplanes, cars, birds, cats, deer, dogs, frogs, horses, ships, and trucks.

In the submitting stage, we sampled the training set non-uniformly by category and constructed unbalanced training datasets for the 50 learnwares, each covering only a random subset of the categories. Specifically, the probability of each category being sampled obeys a random multinomial distribution, with positive sampling probability on only 4 categories and a sampling ratio of 0.4:0.4:0.1:0.1. The training set for each learnware contains 12,500 samples covering data from the 4 categories in CIFAR-10.

In the deploying stage, we constructed 100 user tasks using the CIFAR-10 test set data. Similar to constructing the training sets, the probability of each category being sampled obeys a random multinomial distribution, with positive sampling probability on only 6 categories and a sampling ratio of 0.3:0.3:0.1:0.1:0.1:0.1. Each user task contains 3,000 samples covering the data of 6 categories in CIFAR-10.
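A minimal sketch of this sampling scheme (illustrative only; the actual benchmark code may differ):

```python
import numpy as np

rng = np.random.default_rng(0)
n_classes = 10
probs = np.zeros(n_classes)
# 6 categories get positive probability, with ratio 0.3:0.3:0.1:0.1:0.1:0.1
chosen = rng.choice(n_classes, size=6, replace=False)
probs[chosen] = [0.3, 0.3, 0.1, 0.1, 0.1, 0.1]
# draw the category of each of the 3,000 samples in one user task
labels = rng.choice(n_classes, size=3000, p=probs)
```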

Our example ``image_example`` shows the performance in two different scenarios:

**Unlabelled Sample Scenario**: This scenario is designed to evaluate performance when users possess only testing data, searching and reusing learnware available in the market.

**Labelled Sample Scenario**: This scenario aims to assess performance when users have both testing and limited training data, searching and reusing learnware directly from the market instead of training a model from scratch. This helps determine the amount of training data saved for the user.

## Run the code

@@ -18,6 +27,8 @@ python workflow.py image_example

With the experimental setup above, we evaluated the performance of RKME Image by calculating the mean accuracy across all users.

### Unlabelled Sample Scenario

| Metric | Value |
|--------------------------------------|---------------------|
| Mean in Market (Single) | 0.346 |
@@ -26,8 +37,12 @@ With the experimental setup above, we evaluated the performance of RKME Image by
| Job Selector Reuse (Multiple) | 0.534 |
| Average Ensemble Reuse (Multiple) | 0.676 |

### Labelled Sample Scenario

In some settings, the user has a small number of labeled samples. In such settings, learning a weighting of the selected learnwares on the limited labeled samples can yield better performance than training a model directly on them.

<div align=center>
<img src="../../docs/_static/img/image_labeled.svg" alt="Results on Image Experimental Scenario" style="width:50%;" />
</div>

Note that in the labelled sample scenario, the labelled samples are repeatedly drawn 3 to 10 times to reduce the estimation error in accuracy caused by random sampling.

+1 -5 examples/dataset_image_workflow/utils.py

@@ -9,19 +9,15 @@ from learnware.utils import choose_device
@torch.no_grad()
def evaluate(model, evaluate_set: Dataset, device=None, distribution=True):
    device = choose_device(0) if device is None else device

    if isinstance(model, nn.Module):
        model.eval()

    criterion = nn.CrossEntropyLoss(reduction="sum")
    total, correct, loss = 0, 0, torch.as_tensor(0.0, dtype=torch.float32, device=device)
    dataloader = DataLoader(evaluate_set, batch_size=1024, shuffle=True)
    for i, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
        out = model(X) if isinstance(model, nn.Module) else model.predict(X)
        if not torch.is_tensor(out):
            out = torch.from_numpy(out).to(device)



+134 -128 examples/dataset_image_workflow/workflow.py

@@ -49,7 +49,7 @@ class ImageDatasetWorkflow:

plt.xlabel("Amout of Labeled User Data", fontsize=14)
plt.ylabel("1 - Accuracy", fontsize=14)
plt.title(f"Results on Image Experimental Scenario", fontsize=16)
plt.title("Results on Image Experimental Scenario", fontsize=16)
plt.legend(fontsize=14)
plt.tight_layout()
plt.savefig(os.path.join(self.fig_path, "image_labeled_curves.svg"), bbox_inches="tight", dpi=700)
@@ -61,7 +61,7 @@ class ImageDatasetWorkflow:
self.user_semantic = client.get_semantic_specification(self.image_benchmark.learnware_ids[0])
self.user_semantic["Name"]["Values"] = ""

if len(self.image_market) == 0 or rebuild is True:
for learnware_id in self.image_benchmark.learnware_ids:
with tempfile.TemporaryDirectory(prefix="image_benchmark_") as tempdir:
zip_path = os.path.join(tempdir, f"{learnware_id}.zip")
@@ -71,16 +71,15 @@ class ImageDatasetWorkflow:
client.download_learnware(learnware_id, zip_path)
self.image_market.add_learnware(zip_path, semantic_spec)
break
except Exception:
time.sleep(1)
continue

logger.info("Total Item: %d" % (len(self.image_market)))

def image_example(self, rebuild=False, skip_test=False):
np.random.seed(1)
random.seed(1)
self.n_labeled_list = [100, 200, 500, 1000, 2000, 4000]
self.repeated_list = [10, 10, 10, 3, 3, 3]
device = choose_device(0)
@@ -99,142 +98,149 @@ class ImageDatasetWorkflow:
improve_list = []
job_selector_score_list = []
ensemble_score_list = []
if not skip_test:
    self._prepare_market(rebuild)
    all_learnwares = self.image_market.get_learnwares()

    for i in range(image_benchmark_config.user_num):
        test_x, test_y = self.image_benchmark.get_test_data(user_ids=i)
        train_x, train_y = self.image_benchmark.get_train_data(user_ids=i)

        test_x = torch.from_numpy(test_x)
        test_y = torch.from_numpy(test_y)
        test_dataset = TensorDataset(test_x, test_y)

        user_stat_spec = generate_stat_spec(type="image", X=test_x, whitening=False)
        user_info = BaseUserInfo(
            semantic_spec=self.user_semantic, stat_info={user_stat_spec.type: user_stat_spec}
        )
        logger.info("Searching Market for user: %d" % (i))

        search_result = self.image_market.search_learnware(user_info)
        single_result = search_result.get_single_results()
        multiple_result = search_result.get_multiple_results()

        print(f"search result of user{i}:")
        print(
            f"single model num: {len(single_result)}, max_score: {single_result[0].score}, min_score: {single_result[-1].score}"
        )

        acc_list = []
        for idx in range(len(all_learnwares)):
            learnware = all_learnwares[idx]
            loss, acc = evaluate(learnware, test_dataset)
            acc_list.append(acc)

        learnware = single_result[0].learnware
        best_loss, best_acc = evaluate(learnware, test_dataset)
        best_list.append(np.max(acc_list))
        select_list.append(best_acc)
        avg_list.append(np.mean(acc_list))
        improve_list.append((best_acc - np.mean(acc_list)) / np.mean(acc_list))
        print(f"market mean accuracy: {np.mean(acc_list)}, market best accuracy: {np.max(acc_list)}")
        print(
            f"Top1-score: {single_result[0].score}, learnware_id: {single_result[0].learnware.id}, acc: {best_acc}"
        )

        if len(multiple_result) > 0:
            mixture_id = " ".join([learnware.id for learnware in multiple_result[0].learnwares])
            print(f"mixture_score: {multiple_result[0].score}, mixture_learnware: {mixture_id}")
            mixture_learnware_list = multiple_result[0].learnwares
        else:
            mixture_learnware_list = [single_result[0].learnware]

        # test reuse (job selector)
        reuse_job_selector = JobSelectorReuser(learnware_list=mixture_learnware_list, use_herding=False)
        job_loss, job_acc = evaluate(reuse_job_selector, test_dataset)
        job_selector_score_list.append(job_acc)
        print(f"mixture reuse accuracy (job selector): {job_acc}")

        # test reuse (ensemble)
        reuse_ensemble = AveragingReuser(learnware_list=mixture_learnware_list, mode="vote_by_prob")
        ensemble_loss, ensemble_acc = evaluate(reuse_ensemble, test_dataset)
        ensemble_score_list.append(ensemble_acc)
        print(f"mixture reuse accuracy (ensemble): {ensemble_acc}\n")

        user_model_score_mat = []
        pruning_score_mat = []
        single_score_mat = []

        for n_label, repeated in zip(self.n_labeled_list, self.repeated_list):
            user_model_score_list, reuse_pruning_score_list = [], []
            if n_label > len(train_x):
                n_label = len(train_x)
            for _ in range(repeated):
                x_train, y_train = zip(*random.sample(list(zip(train_x, train_y)), k=n_label))
                x_train = np.array(list(x_train))
                y_train = np.array(list(y_train))

                x_train = torch.from_numpy(x_train)
                y_train = torch.from_numpy(y_train)
                sampled_dataset = TensorDataset(x_train, y_train)

                mode_save_path = os.path.abspath(os.path.join(self.model_path, "model.pth"))
                model = ConvModel(
                    channel=x_train.shape[1], im_size=(x_train.shape[2], x_train.shape[3]), n_random_features=10
                ).to(device)
                train_model(
                    model,
                    sampled_dataset,
                    sampled_dataset,
                    mode_save_path,
                    epochs=35,
                    batch_size=128,
                    device=device,
                    verbose=False,
                )
                model.load_state_dict(torch.load(mode_save_path))
                _, user_model_acc = evaluate(model, test_dataset, distribution=True)
                user_model_score_list.append(user_model_acc)

                reuse_pruning = EnsemblePruningReuser(learnware_list=mixture_learnware_list, mode="classification")
                reuse_pruning.fit(x_train, y_train)
                _, pruning_acc = evaluate(reuse_pruning, test_dataset, distribution=False)
                reuse_pruning_score_list.append(pruning_acc)

            single_score_mat.append([best_acc] * repeated)
            user_model_score_mat.append(user_model_score_list)
            pruning_score_mat.append(reuse_pruning_score_list)
            print(
                f"user_label_num: {n_label}, user_acc: {np.mean(user_model_score_mat[-1])}, pruning_acc: {np.mean(pruning_score_mat[-1])}"
            )

        logger.info(f"Saving Curves for User_{i}")
        user_curves_data = (single_score_mat, user_model_score_mat, pruning_score_mat)
        with open(os.path.join(self.curve_path, f"curve{str(i)}.pkl"), "wb") as f:
            pickle.dump(user_curves_data, f)

    logger.info(
        "Accuracy of selected learnware: %.3f +/- %.3f, Average performance: %.3f +/- %.3f, Best performance: %.3f +/- %.3f"
        % (
            np.mean(select_list),
            np.std(select_list),
            np.mean(avg_list),
            np.std(avg_list),
            np.mean(best_list),
            np.std(best_list),
        )
    )
    logger.info("Average performance improvement: %.3f" % (np.mean(improve_list)))
    logger.info(
        "Average Job Selector Reuse Performance: %.3f +/- %.3f"
        % (np.mean(job_selector_score_list), np.std(job_selector_score_list))
    )
    logger.info(
        "Averaging Ensemble Reuse Performance: %.3f +/- %.3f"
        % (np.mean(ensemble_score_list), np.std(ensemble_score_list))
    )

pruning_curves_data, user_model_curves_data = [], []
total_user_model_score_mat = [np.zeros(self.repeated_list[i]) for i in range(len(self.n_labeled_list))]
total_pruning_score_mat = [np.zeros(self.repeated_list[i]) for i in range(len(self.n_labeled_list))]
for user_idx in range(image_benchmark_config.user_num):
    with open(os.path.join(self.curve_path, f"curve{str(user_idx)}.pkl"), "rb") as f:
        user_curves_data = pickle.load(f)
    (single_score_mat, user_model_score_mat, pruning_score_mat) = user_curves_data
@@ -244,8 +250,8 @@ class ImageDatasetWorkflow:
total_pruning_score_mat[i] += 1 - np.array(pruning_score_mat[i]) / 100

for i in range(len(self.n_labeled_list)):
    total_user_model_score_mat[i] /= image_benchmark_config.user_num
    total_pruning_score_mat[i] /= image_benchmark_config.user_num
    user_model_curves_data.append(
        (np.mean(total_user_model_score_mat[i]), np.std(total_user_model_score_mat[i]))
    )


+0 -8 examples/dataset_m5_workflow/example.yaml

@@ -1,8 +0,0 @@
model:
  class_name: Model
  kwargs: {}
stat_specifications:
  - module_path: learnware.specification
    class_name: RKMETableSpecification
    file_name: rkme.json
    kwargs: {}

+0 -21 examples/dataset_m5_workflow/example_init.py

@@ -1,21 +0,0 @@
import os
import joblib
import numpy as np
import lightgbm as lgb
from learnware.model import BaseModel


class Model(BaseModel):
    def __init__(self):
        super(Model, self).__init__(input_shape=(82,), output_shape=(1,))
        dir_path = os.path.dirname(os.path.abspath(__file__))
        self.model = lgb.Booster(model_file=os.path.join(dir_path, "model.out"))

    def fit(self, X: np.ndarray, y: np.ndarray):
        pass

    def predict(self, X: np.ndarray) -> np.ndarray:
        return self.model.predict(X)

    def finetune(self, X: np.ndarray, y: np.ndarray):
        pass

+0 -3 examples/dataset_m5_workflow/m5/README.md

@@ -1,3 +0,0 @@
# M5 Dataset

Data from Walmart stores, involving the unit sales of various products sold in the USA, organized in the form of grouped time series. More specifically, the dataset involves the unit sales of 3,049 products, classified into 3 product categories (Hobbies, Foods, and Household).

+0 -65 examples/dataset_m5_workflow/m5/__init__.py

@@ -1,65 +0,0 @@
import os
import joblib
import lightgbm as lgb


from .config import store_list, model_dir
from .utils import acquire_data, get_weights, model_predict, score, measure_aux_algo
from .generate_data import regenerate_data
from .train import retrain_models, grid_training_sample, train_adaptation_grid


class DataLoader:
    def __init__(self):
        self.algo = "ridge"

    def set_algo(self, algo):
        self.algo = algo

    def get_algo_list(self):
        return ["lgb", "ridge"]

    def get_idx_list(self):
        return list(range(len(store_list)))

    def get_idx_data(self, idx):
        store = store_list[idx]
        # fill_flag = self.algo == "ridge"
        fill_flag = True
        return acquire_data(store, fill_flag)

    def get_weights(self):
        return get_weights(self.algo)

    def get_model_path(self, idx):
        return os.path.join(model_dir, "{}_{}.out".format(self.algo, store_list[idx]))

    def predict(self, idx, test_x):
        store = store_list[idx]

        if os.path.exists(os.path.join(model_dir, f"{self.algo}_{store}.out")):
            return model_predict(self.algo, idx, test_x)
        else:
            self.retrain_models()
            return model_predict(self.algo, idx, test_x)

    def score(self, real_y, pred_y, sample_weight=None, multioutput="raw_values"):
        return score(real_y, pred_y, sample_weight, multioutput)

    def regenerate_data(self):
        regenerate_data()

    def retrain_models(self):
        retrain_models(self.algo)

    def grid_training_sample(self, user_list=list(range(10))):
        grid_training_sample(self.algo, user_list)

    def train_adaptation_grid(
        self, max_sample, test_sample, user_list=list(range(10)), adaptation_model=[], residual=False
    ):
        train_adaptation_grid(self.algo, max_sample, test_sample, user_list, adaptation_model, residual)

    def measure_aux_algo(self, idx, test_sample, model):
        return measure_aux_algo(idx, test_sample, model)

+0 -139 examples/dataset_m5_workflow/m5/config.py

@@ -1,139 +0,0 @@
import os


ROOT_PATH = os.path.abspath(os.path.join(__file__, "..", "data"))
raw_data_dir = os.path.join(ROOT_PATH, "raw")
processed_data_dir = os.path.join(ROOT_PATH, "processed")
model_dir = os.path.join(ROOT_PATH, "models")
grid_dir = os.path.join(ROOT_PATH, "grid_sample")


TARGET = "sales"
START_TRAIN = 1
END_TRAIN = 1941 - 28


category_list = ["item_id", "dept_id", "cat_id", "event_name_1", "event_name_2", "event_type_1", "event_type_2"]
features_columns = [
"item_id",
"dept_id",
"cat_id",
"release",
"sell_price",
"price_max",
"price_min",
"price_std",
"price_mean",
"price_norm",
"price_nunique",
"item_nunique",
"price_momentum",
"price_momentum_m",
"price_momentum_y",
"event_name_1",
"event_type_1",
"event_name_2",
"event_type_2",
"snap",
"tm_d",
"tm_w",
"tm_m",
"tm_y",
"tm_wm",
"tm_dw",
"tm_w_end",
"sales_lag_28",
"sales_lag_29",
"sales_lag_30",
"sales_lag_31",
"sales_lag_32",
"sales_lag_33",
"sales_lag_34",
"sales_lag_35",
"sales_lag_36",
"sales_lag_37",
"sales_lag_38",
"sales_lag_39",
"sales_lag_40",
"sales_lag_41",
"sales_lag_42",
"rolling_mean_7",
"rolling_std_7",
"rolling_mean_14",
"rolling_std_14",
"rolling_mean_30",
"rolling_std_30",
"rolling_mean_60",
"rolling_std_60",
"rolling_mean_180",
"rolling_std_180",
"rolling_mean_tmp_1_7",
"rolling_mean_tmp_1_14",
"rolling_mean_tmp_1_30",
"rolling_mean_tmp_1_60",
"rolling_mean_tmp_7_7",
"rolling_mean_tmp_7_14",
"rolling_mean_tmp_7_30",
"rolling_mean_tmp_7_60",
"rolling_mean_tmp_14_7",
"rolling_mean_tmp_14_14",
"rolling_mean_tmp_14_30",
"rolling_mean_tmp_14_60",
# "enc_state_id_mean",
# "enc_state_id_std",
# "enc_store_id_mean",
# "enc_store_id_std",
"enc_cat_id_mean",
"enc_cat_id_std",
"enc_dept_id_mean",
"enc_dept_id_std",
"enc_state_id_cat_id_mean",
"enc_state_id_cat_id_std",
"enc_state_id_dept_id_mean",
"enc_state_id_dept_id_std",
"enc_store_id_cat_id_mean",
"enc_store_id_cat_id_std",
"enc_store_id_dept_id_mean",
"enc_store_id_dept_id_std",
"enc_item_id_mean",
"enc_item_id_std",
"enc_item_id_state_id_mean",
"enc_item_id_state_id_std",
"enc_item_id_store_id_mean",
"enc_item_id_store_id_std",
]
label_column = ["sales"]


lgb_params_list = [
[0.015, 224, 66],
[0.01, 224, 50],
[0.01, 300, 80],
[0.015, 128, 50],
[0.015, 300, 50],
[0.01, 300, 66],
[0.015, 300, 80],
[0.15, 224, 80],
[0.005, 300, 50],
[0.015, 224, 50],
]


store_list = ["CA_1", "CA_2", "CA_3", "CA_4", "TX_1", "TX_2", "TX_3", "WI_1", "WI_2", "WI_3"]
dataset_info = {
"name": "M5",
"range of date": "2011.01.29-2016.06.19",
"description": "Walmart store, involves the unit sales of various products sold in the USA, organized in the form of grouped time series. More specifically, the dataset involves the unit sales of 3049 products, classified in 3 product categories (Hobbies, Foods, and Household).",
"location": [
"California, United States",
"California, United States",
"California, United States",
"California, United States",
"Texas, United States",
"Texas, United States",
"Texas, United States",
"Wisconsin, United States",
"Wisconsin, United States",
"Wisconsin, United States",
],
}

+0 -338 examples/dataset_m5_workflow/m5/generate_data.py

@@ -1,338 +0,0 @@
import numpy as np
import pandas as pd
from math import ceil
from tqdm import tqdm
from copy import deepcopy as dco
import os, sys, gc, time, warnings, pickle, psutil, random
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler


from .utils import *
from .config import raw_data_dir, processed_data_dir, TARGET

warnings.filterwarnings("ignore")


# ==================== preprocessing ====================
def melt_raw_data(train_df):
if os.path.exists(os.path.join(processed_data_dir, "melt_raw_data.pkl")):
return pd.read_pickle(os.path.join(processed_data_dir, "melt_raw_data.pkl"))

index_columns = ["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"]
grid_df = pd.melt(train_df, id_vars=index_columns, var_name="d", value_name=TARGET)

for col in index_columns:
grid_df[col] = grid_df[col].astype("category")

grid_df.to_pickle(os.path.join(processed_data_dir, "melt_raw_data.pkl"))
return grid_df


def add_release_week(grid_df, prices_df, calendar_df):
if os.path.exists(os.path.join(processed_data_dir, "add_release_week.pkl")):
return pd.read_pickle(os.path.join(processed_data_dir, "add_release_week.pkl"))

release_df = prices_df.groupby(["store_id", "item_id"])["wm_yr_wk"].agg(["min"]).reset_index()
release_df.columns = ["store_id", "item_id", "release"]
grid_df = merge_by_concat(grid_df, release_df, ["store_id", "item_id"])
grid_df = merge_by_concat(grid_df, calendar_df[["wm_yr_wk", "d"]], ["d"])

# cutoff meaningless rows
grid_df = grid_df[grid_df["wm_yr_wk"] >= grid_df["release"]]
grid_df = grid_df.reset_index(drop=True)

# scale the release
grid_df["release"] = grid_df["release"] - grid_df["release"].min()
grid_df["release"] = grid_df["release"].astype(np.int16)

grid_df.to_pickle(os.path.join(processed_data_dir, "add_release_week.pkl"))
return grid_df


def add_prices(grid_df, prices_df, calendar_df):
if os.path.exists(os.path.join(processed_data_dir, "add_prices.pkl")):
return pd.read_pickle(os.path.join(processed_data_dir, "add_prices.pkl"))

prices_df["price_max"] = prices_df.groupby(["store_id", "item_id"])["sell_price"].transform("max")
prices_df["price_min"] = prices_df.groupby(["store_id", "item_id"])["sell_price"].transform("min")
prices_df["price_std"] = prices_df.groupby(["store_id", "item_id"])["sell_price"].transform("std")
prices_df["price_mean"] = prices_df.groupby(["store_id", "item_id"])["sell_price"].transform("mean")
prices_df["price_norm"] = prices_df["sell_price"] / prices_df["price_max"]

prices_df["price_nunique"] = prices_df.groupby(["store_id", "item_id"])["sell_price"].transform("nunique")
prices_df["item_nunique"] = prices_df.groupby(["store_id", "sell_price"])["item_id"].transform("nunique")

calendar_prices = calendar_df[["wm_yr_wk", "month", "year"]]
calendar_prices = calendar_prices.drop_duplicates(subset=["wm_yr_wk"])
prices_df = prices_df.merge(calendar_prices[["wm_yr_wk", "month", "year"]], on=["wm_yr_wk"], how="left")

prices_df["price_momentum"] = prices_df["sell_price"] / prices_df.groupby(["store_id", "item_id"])[
"sell_price"
].transform(lambda x: x.shift(1))
prices_df["price_momentum_m"] = prices_df["sell_price"] / prices_df.groupby(["store_id", "item_id", "month"])[
"sell_price"
].transform("mean")
prices_df["price_momentum_y"] = prices_df["sell_price"] / prices_df.groupby(["store_id", "item_id", "year"])[
"sell_price"
].transform("mean")

grid_df = reduce_mem_usage(grid_df)
prices_df = reduce_mem_usage(prices_df)

original_columns = list(grid_df)
grid_df = grid_df.merge(prices_df, on=["store_id", "item_id", "wm_yr_wk"], how="left")
grid_df = reduce_mem_usage(grid_df)

grid_df.to_pickle(os.path.join(processed_data_dir, "add_prices.pkl"))
return grid_df


def add_date(grid_df, calendar_df):
if os.path.exists(os.path.join(processed_data_dir, "add_date.pkl")):
return pd.read_pickle(os.path.join(processed_data_dir, "add_date.pkl"))

# merge calendar partly
icols = [
"date",
"d",
"event_name_1",
"event_type_1",
"event_name_2",
"event_type_2",
"snap_CA",
"snap_TX",
"snap_WI",
]
grid_df = grid_df.merge(calendar_df[icols], on=["d"], how="left")

# convert to category
icols = [
"event_name_1",
"event_type_1",
"event_name_2",
"event_type_2",
"snap_CA",
"snap_TX",
"snap_WI",
]
for col in icols:
grid_df[col] = grid_df[col].astype("category")

# make some features from date
grid_df["date"] = pd.to_datetime(grid_df["date"])
grid_df["tm_d"] = grid_df["date"].dt.day.astype(np.int8)
grid_df["tm_w"] = grid_df["date"].dt.week.astype(np.int8)
grid_df["tm_m"] = grid_df["date"].dt.month.astype(np.int8)
grid_df["tm_y"] = grid_df["date"].dt.year
grid_df["tm_y"] = (grid_df["tm_y"] - grid_df["tm_y"].min()).astype(np.int8)
grid_df["tm_wm"] = grid_df["tm_d"].apply(lambda x: ceil(x / 7)).astype(np.int8)

grid_df["tm_dw"] = grid_df["date"].dt.dayofweek.astype(np.int8)
grid_df["tm_w_end"] = (grid_df["tm_dw"] >= 5).astype(np.int8)

# clear columns
grid_df["d"] = grid_df["d"].apply(lambda x: x[2:]).astype(np.int16)
grid_df = grid_df.drop("wm_yr_wk", 1)

grid_df.to_pickle(os.path.join(processed_data_dir, "add_date.pkl"))
return grid_df


def add_lags_rollings(grid_df):
if os.path.exists(os.path.join(processed_data_dir, "add_lags_rollings.pkl")):
return pd.read_pickle(os.path.join(processed_data_dir, "add_lags_rollings.pkl"))

# add lags
SHIFT_DAY = 28
LAG_DAYS = [col for col in range(SHIFT_DAY, SHIFT_DAY + 15)]

grid_df = grid_df.assign(
**{
"{}_lag_{}".format(col, l): grid_df.groupby(["id"])[col].transform(lambda x: x.shift(l))
for l in LAG_DAYS
for col in [TARGET]
}
)

for col in list(grid_df):
if "lag" in col:
grid_df[col] = grid_df[col].astype(np.float16)

# add rollings
for i in [7, 14, 30, 60, 180]:
grid_df["rolling_mean_" + str(i)] = (
grid_df.groupby(["id"])[TARGET].transform(lambda x: x.shift(SHIFT_DAY).rolling(i).mean()).astype(np.float16)
)
grid_df["rolling_std_" + str(i)] = (
grid_df.groupby(["id"])[TARGET].transform(lambda x: x.shift(SHIFT_DAY).rolling(i).std()).astype(np.float16)
)

# sliding window
for d_shift in [1, 7, 14]:
for d_window in [7, 14, 30, 60]:
col_name = "rolling_mean_tmp_" + str(d_shift) + "_" + str(d_window)
grid_df[col_name] = (
grid_df.groupby(["id"])[TARGET]
.transform(lambda x: x.shift(SHIFT_DAY + d_shift).rolling(d_window).mean())
.astype(np.float16)
)

grid_df.to_pickle(os.path.join(processed_data_dir, "add_lags_rollings.pkl"))
return grid_df


def add_mean_enc(grid_df):
if os.path.exists(os.path.join(processed_data_dir, "add_mean_enc.pkl")):
return pd.read_pickle(os.path.join(processed_data_dir, "add_mean_enc.pkl"))

sales_df = dco(grid_df["sales"])
grid_df["sales"][grid_df["d"] > (1941 - 28)] = np.nan

icols = [
["state_id"],
["store_id"],
["cat_id"],
["dept_id"],
["state_id", "cat_id"],
["state_id", "dept_id"],
["store_id", "cat_id"],
["store_id", "dept_id"],
["item_id"],
["item_id", "state_id"],
["item_id", "store_id"],
]

for col in icols:
col_name = "_" + "_".join(col) + "_"
grid_df["enc" + col_name + "mean"] = grid_df.groupby(col)["sales"].transform("mean").astype(np.float16)
grid_df["enc" + col_name + "std"] = grid_df.groupby(col)["sales"].transform("std").astype(np.float16)

grid_df["sales"] = sales_df

grid_df.to_pickle(os.path.join(processed_data_dir, "add_mean_enc.pkl"))
return grid_df


def add_snap(grid_df):
if os.path.exists(os.path.join(processed_data_dir, "all_data_df.pkl")):
return pd.read_pickle(os.path.join(processed_data_dir, "all_data_df.pkl"))

mask_CA = grid_df["state_id"] == "CA"
mask_WI = grid_df["state_id"] == "WI"
mask_TX = grid_df["state_id"] == "TX"

grid_df["snap"] = grid_df["snap_CA"]
grid_df.loc[mask_WI, "snap"] = grid_df["snap_WI"]
grid_df.loc[mask_TX, "snap"] = grid_df["snap_TX"]

grid_df.to_pickle(os.path.join(processed_data_dir, "all_data_df.pkl"))
return grid_df


def preprocessing_m5():
train_df = pd.read_csv(os.path.join(raw_data_dir, "sales_train_evaluation.csv"))
prices_df = pd.read_csv(os.path.join(raw_data_dir, "sell_prices.csv"))
calendar_df = pd.read_csv(os.path.join(raw_data_dir, "calendar.csv"))

grid_df = melt_raw_data(train_df)
print(f"df: ({grid_df.shape[0]}, {grid_df.shape[1]}) Melting raw data down!")

grid_df = add_release_week(grid_df, prices_df, calendar_df)
print(f"df: ({grid_df.shape[0]}, {grid_df.shape[1]}) Adding release week down!")

grid_df = add_prices(grid_df, prices_df, calendar_df)
print(f"df: ({grid_df.shape[0]}, {grid_df.shape[1]}) Adding prices down!")

grid_df = add_date(grid_df, calendar_df)
print(f"df: ({grid_df.shape[0]}, {grid_df.shape[1]}) Adding date down!")

grid_df = add_lags_rollings(grid_df)
print(f"df: ({grid_df.shape[0]}, {grid_df.shape[1]}) Adding lags and rollings down!")

grid_df = add_mean_enc(grid_df)
print(f"df: ({grid_df.shape[0]}, {grid_df.shape[1]}) Adding mean encoding down!")

grid_df = pd.read_pickle(os.path.join(processed_data_dir, "add_mean_enc.pkl"))

grid_df = add_snap(grid_df)
print("Save the data down!")


# ==================== split dataset ====================
def label_encode(df, columns):
le = LabelEncoder()
data_list = []

for column in columns:
data_list += df[column].drop_duplicates().values.tolist()
le.fit(data_list)

for column in columns:
df[column] = le.transform(df[column].values.tolist())

return df


def reorganize_data(grid_df):
grid_df["snap"] = grid_df["snap"].astype("int8")
columns_list = [
["item_id"],
["dept_id"],
["cat_id"],
["event_name_1", "event_name_2"],
["event_type_1", "event_type_2"],
]

for columns in columns_list:
grid_df[columns] = label_encode(grid_df[columns], columns)

return reduce_mem_usage(grid_df)


def split_data(df, store, fill_flag=False):
for cat in category_list:
df[cat] = df[cat].astype("category")

if fill_flag:
df = reduce_mem_usage(df, float16_flag=False)
cols = df.isnull().any()
idx = list(cols[cols.values].index)

df[idx] = df.groupby("item_id", sort=False)[idx].apply(lambda x: x.ffill().bfill())
df[idx] = df[idx].fillna(df[idx].mean())

mms = MinMaxScaler()
df[features_columns] = mms.fit_transform(df[features_columns])

df = reduce_mem_usage(df)

train_df = df[df["d"] <= END_TRAIN]
val_df = df[df["d"] > END_TRAIN]

train_df = train_df[features_columns + label_column]
val_df = val_df[features_columns + label_column]
print(train_df.shape, val_df.shape)

suffix = f"_fill" if fill_flag else ""
train_df.to_pickle(os.path.join(processed_data_dir, f"train_{store}{suffix}.pkl"))
val_df.to_pickle(os.path.join(processed_data_dir, f"val_{store}{suffix}.pkl"))


def split_m5():
grid_df = pd.read_pickle(os.path.join(processed_data_dir, "all_data_df.pkl"))

if os.path.exists(os.path.join(processed_data_dir, "label_encode.pkl")):
grid_df = pd.read_pickle(os.path.join(processed_data_dir, "label_encode.pkl"))
else:
grid_df = reorganize_data(grid_df)
grid_df.to_pickle(os.path.join(processed_data_dir, "label_encode.pkl"))

for store in store_list:
# split_data(grid_df[grid_df["store_id"] == store], store)
split_data(grid_df[grid_df["store_id"] == store], store, True)


def regenerate_data():
preprocessing_m5()
split_m5()

+0 -452 examples/dataset_m5_workflow/m5/train.py

@@ -1,452 +0,0 @@
import gc
import joblib
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
import os, warnings
import lightgbm as lgb
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import rbf_kernel


from .utils import *
from .config import model_dir, grid_dir, store_list, lgb_params_list

warnings.filterwarnings("ignore")


def train_lgb_model(train_x, train_y, val_x, val_y, store, lr, nl, md, best, save=True, n_estimators=0, train_flag=0):
lgb_params = {
"boosting_type": "gbdt",
"objective": "rmse",
"metric": "rmse",
"learning_rate": lr,
"num_leaves": nl,
"max_depth": md,
"n_estimators": 100000,
"boost_from_average": False,
"verbose": -1,
}

if train_flag:
idx = int(len(train_y) * 0.1)
train_data = lgb.Dataset(train_x[:-idx], label=train_y[:-idx])
val_data = lgb.Dataset(train_x[-idx:], label=train_y[-idx:])
else:
train_data = lgb.Dataset(train_x, label=train_y)
val_data = lgb.Dataset(val_x, label=val_y)

if n_estimators:
lgb_params["n_estimators"] = n_estimators
gbm = lgb.train(lgb_params, train_data, verbose_eval=100)
else:
gbm = lgb.train(lgb_params, train_data, valid_sets=[val_data], verbose_eval=100, early_stopping_rounds=1000)

test_y = gbm.predict(val_x, num_iteration=gbm.best_iteration)
res = mean_squared_error(val_y, test_y, squared=False)

if res < best:
best = res
if save:
gbm.save_model(os.path.join(model_dir, f"lgb_{store}.out"))

return best


def train_ridge_model(train_x, train_y, val_x, val_y, store, a, best, save=True):
model = Ridge(alpha=a)
model.fit(train_x, train_y)

test_y = model.predict(val_x)
res = mean_squared_error(val_y, test_y, squared=False)

if res < best:
best = res
if save:
joblib.dump(model, os.path.join(model_dir, f"ridge_{store}.out"))

return best


def train_svm_model(
train_x, train_y, val_x, val_y, store, C, epsilon, best, save=True, gamma=0.1, adaptation_model=[], K1=None, K2=None
):
if K1 is None:
model = SVR(C=C, epsilon=epsilon, max_iter=30000, cache_size=10240, verbose=True, gamma=gamma)
else:
model = AuxiliarySVR(
C=C,
epsilon=epsilon,
gamma=gamma,
adaptation_model=adaptation_model,
max_iter=30000,
cache_size=10240,
verbose=True,
K1=K1,
K2=K2,
)

model.fit(train_x, train_y)
test_y = model.predict(val_x)
res = mean_squared_error(val_y, test_y, squared=False)

if res < best:
best = res
if save:
joblib.dump(model, os.path.join(model_dir, f"svm_{store}.out"))

return best


def train_krr_model(train_x, train_y, val_x, val_y, store, a, best, save=True, gamma=0.1, K1=None, K2=None):
if K1 is None:
model = KernelRidge(kernel="rbf", alpha=a, gamma=gamma)
model.fit(train_x, train_y)
test_y = model.predict(val_x)
res = mean_squared_error(val_y, test_y, squared=False)
else:
len1, len2 = len(train_y), len(val_y)
model = KernelRidge(kernel="precomputed", alpha=a)
model.fit(K1[-len1:, -len1:], train_y)
test_y = model.predict(K2[-len2:, -len1:])
res = mean_squared_error(val_y, test_y, squared=False)

if res < best:
best = res
if save:
joblib.dump(model, os.path.join(model_dir, f"krr_{store}.out"))

return best


def grid_search(store_id, algo, search_lgb_flag=False):
store = store_list[store_id]

if algo == "lgb":
train_x, train_y, val_x, val_y = acquire_data(store, True)
learning_rate = [0.005, 0.01, 0.015]
num_leaves = [128, 224, 300]
max_depth = [50, 66, 80]
best = 10000000

if search_lgb_flag:
for lr in learning_rate:
for nl in num_leaves:
for md in max_depth:
best = train_lgb_model(train_x, train_y, val_x, val_y, store, lr, nl, md, best)
print(f"store: {store}, lr: {lr}, nl: {nl}, md: {md}, best: {best}")
else:
lr, nl, md = lgb_params_list[store_id]
best = train_lgb_model(train_x, train_y, val_x, val_y, store, lr, nl, md, best)
print(f"store: {store}, lr: {lr}, nl: {nl}, md: {md}, best: {best}")
elif algo == "ridge":
train_x, train_y, val_x, val_y = acquire_data(store, True)
alpha = [0.01, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10, 20, 30]
best = 10000000

for a in alpha:
best = train_ridge_model(train_x, train_y, val_x, val_y, store, a, best)
print(f"store: {store}, alpha: {a}, best: {best}")


def grid_training_sample(algo, user_list=list(range(10))):
for i in range(len(user_list)):
store_id = user_list[i]
store = store_list[store_id]
org_train_x, org_train_y, val_x, val_y = acquire_data(store, True)
res = []

proportion_list = [
100,
300,
500,
700,
900,
1000,
3000,
5000,
7000,
9000,
10000,
30000,
50000,
70000,
90000,
100000,
300000,
500000,
700000,
900000,
1000000,
3000000,
5000000,
]

for proportion in proportion_list:
"""
random
org_idx_list = list(range(len(org_train_y)))
idx_list = random.sample(org_idx_list, min(proportion, len(org_train_y)))
train_x = org_train_x.iloc[idx_list]
train_y = org_train_y.iloc[idx_list]
"""
train_x = org_train_x[-proportion:]
train_y = org_train_y[-proportion:]
best = 10000000

if algo == "lgb":
lr, nl, md = lgb_params_list[store_id]
best = train_lgb_model(
train_x, train_y, val_x, val_y, store, lr, nl, md, best, save=False, n_estimators=3000, train_flag=0
)
print(f"store: {store}, lr: {lr}, nl: {nl}, md: {md}, best: {best}")

elif algo == "ridge":
alpha = [0.01, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10, 20, 30]
for a in alpha:
best = train_ridge_model(train_x, train_y, val_x, val_y, store, a, best, save=False)
print(f"store: {store}, alpha: {a}, best: {best}")

elif algo == "svm":
C = [1, 10, 100]
epsilon = 0.001
for c in C:
best = train_svm_model(train_x, train_y, val_x, val_y, store, c, epsilon, best, save=False)
print(f"store: {store}, C: {c}, epsilon: {epsilon}, best: {best}")

res.append([proportion, best])
np.savetxt(os.path.join(grid_dir, f"grid_sample_{algo}_{store}.out"), np.array(res))

if proportion > len(org_train_y):
break


def retrain_models(algo):
for store_id in range(10):
grid_search(store_id, algo)


def train_adaptation_grid(
algo, max_sample, test_sample, user_list=list(range(10)), adaptation_model=[], residual=False
):
"""
adaptation_model = [
[("lgb", 1), ("ridge", 2)],
[("lgb", 1), ("ridge", 2)]
]
"""

proportion_list = [
100,
300,
500,
700,
900,
1000,
3000,
5000,
7000,
9000,
10000,
30000,
50000,
70000,
90000,
100000,
300000,
500000,
700000,
900000,
1000000,
3000000,
5000000,
]
sample_idx = proportion_list.index(max_sample) + 1

for i in range(len(user_list)):
store_id = user_list[i]
store = store_list[store_id]
org_train_x, org_train_y, val_x, val_y = acquire_data(store, True)
val_x = val_x[-test_sample:]
val_y = val_y[-test_sample:]

if algo == "lgb" or algo == "ridge":
res = []

if adaptation_model != []:
if residual:
aux_algo, model_idx = adaptation_model[i][0]
org_train_y -= model_predict(aux_algo, model_idx, org_train_x)
val_y -= model_predict(aux_algo, model_idx, val_x)

else:
train_y_list, val_y_list = [], []

for aux_algo, model_idx in adaptation_model[i]:
train_y_list.append(model_predict(aux_algo, model_idx, org_train_x))
val_y_list.append(model_predict(aux_algo, model_idx, val_x))

for j in range(len(train_y_list)):
org_train_x[f"model_values_{j}"] = train_y_list[j]
val_x[f"model_values_{j}"] = val_y_list[j]

for proportion in proportion_list[:sample_idx]:
"""
random
org_idx_list = list(range(len(org_train_y)))
idx_list = random.sample(org_idx_list, min(proportion, len(org_train_y)))
train_x = org_train_x.iloc[idx_list]
train_y = org_train_y.iloc[idx_list]
"""
train_x = org_train_x[-proportion:]
train_y = org_train_y[-proportion:]
best = 10000000

if algo == "lgb":
if max_sample < 50000:
learning_rate = [0.005, 0.01, 0.015]
num_leaves = [128, 224, 300]
max_depth = [50, 66, 80]

for lr in learning_rate:
for nl in num_leaves:
for md in max_depth:
best = train_lgb_model(
train_x, train_y, val_x, val_y, store, lr, nl, md, best, save=False
)
print(f"store: {store}, lr: {lr}, nl: {nl}, md: {md}, best: {best}")
else:
lr, nl, md = lgb_params_list[store_id]
best = train_lgb_model(
train_x,
train_y,
val_x,
val_y,
store,
lr,
nl,
md,
best,
save=False,
n_estimators=3000,
train_flag=0,
)
print(f"store: {store}, lr: {lr}, nl: {nl}, md: {md}, best: {best}")

elif algo == "ridge":
alpha = [0.01, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10, 20, 30]
for a in alpha:
best = train_ridge_model(train_x, train_y, val_x, val_y, store, a, best, save=False)
print(f"store: {store}, alpha: {a}, best: {best}")

res.append([proportion, best])
text = str(adaptation_model[i]) if adaptation_model != [] else "null"
text += "_residual_" if residual else ""
np.savetxt(os.path.join(grid_dir, f"{algo}_using_{text}_{store}.out"), np.array(res))

if proportion > len(org_train_y):
break

elif algo == "svm" or algo == "krr":
res = [[proportion, 10000] for proportion in proportion_list[:sample_idx]]
org_train_x = org_train_x.to_numpy()
org_train_y = org_train_y.to_numpy()
val_x = val_x.to_numpy()
val_y = val_y.to_numpy()

y1_list, y2_list = [], []
gamma_list = [0.01, 0.1, 0.5, 1]

if residual:
aux_algo, model_idx = adaptation_model[i][0]
org_train_y = org_train_y.astype(np.float64)
val_y = val_y.astype(np.float64)
org_train_y -= model_predict(aux_algo, model_idx, org_train_x)
val_y -= model_predict(aux_algo, model_idx, val_x)

elif adaptation_model != []:
for aux_algo, idx in adaptation_model[i]:
y1_list.append(model_predict(aux_algo, idx, org_train_x[-max_sample:]).reshape(-1, 1))
y2_list.append(model_predict(aux_algo, idx, val_x).reshape(-1, 1))

for gamma in gamma_list:
K1 = np.zeros((max_sample, max_sample))
K2 = np.zeros((len(val_x), max_sample))

if (not residual) and adaptation_model != []:
for j in range(len(adaptation_model[i])):
aux_algo, idx = adaptation_model[i][j]
y1 = y1_list[j]
y2 = y2_list[j]
K1 += np.dot(y1, y1.T)
K2 += np.dot(y2, y1.T)

K1 += rbf_kernel(org_train_x[-max_sample:], org_train_x[-max_sample:], gamma=gamma)
K2 += rbf_kernel(val_x, org_train_x[-max_sample:], gamma=gamma)

for idx in range(len(proportion_list[:sample_idx])):
proportion = proportion_list[idx]
"""
random
org_idx_list = list(range(len(org_train_y)))
idx_list = random.sample(org_idx_list, min(proportion, len(org_train_y)))
train_x = org_train_x.iloc[idx_list]
train_y = org_train_y.iloc[idx_list]
"""
train_x = org_train_x[-proportion:]
train_y = org_train_y[-proportion:]
best = 10000000

if algo == "svm":
C = [1, 10, 50, 100, 200]
epsilon = 0.001

for c in C:
adapt_m = [] if adaptation_model == [] else adaptation_model[i]
best = train_svm_model(
train_x,
train_y,
val_x,
val_y,
store,
c,
epsilon,
best,
save=False,
gamma=gamma,
adaptation_model=adapt_m,
K1=K1,
K2=K2,
)
print(f"store: {store}, gamma: {gamma}, C: {c}, epsilon: {epsilon}, best: {best}")

elif algo == "krr":
alpha = [0.01, 0.1, 0.5, 1.0, 5.0, 10]

for a in alpha:
best = train_krr_model(
train_x, train_y, val_x, val_y, store, a, best, save=False, gamma=gamma, K1=K1, K2=K2
)
print(f"store: {store}, a: {a}, gamma: {gamma}, best: {best}")

if best < res[idx][1]:
res[idx][1] = best
text = str(adaptation_model[i]) if adaptation_model != [] else "null"
text += "_residual" if residual else ""
np.savetxt(os.path.join(grid_dir, f"{algo}_using_{text}_{store}.out"), np.array(res))

if proportion > len(org_train_y):
break

del train_x, train_y
gc.collect()

del K1, K2
gc.collect()

del org_train_x, org_train_y
gc.collect()

+ 0
- 177
examples/dataset_m5_workflow/m5/utils.py View File

@@ -1,177 +0,0 @@
import joblib
from tqdm import tqdm
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import rbf_kernel
import os, sys, gc, time, warnings, pickle, psutil, random
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable


from .config import *


class AuxiliarySVR:
def __init__(
self, C, epsilon, gamma, adaptation_model=[], max_iter=30000, cache_size=10240, verbose=False, K1=None, K2=None
):
self.gamma = gamma
self.adaptation_model = adaptation_model
self.model = SVR(
C=C,
epsilon=epsilon,
kernel=self.auxiliary_rbf_kernel,
max_iter=max_iter,
cache_size=cache_size,
verbose=verbose,
)
self.K1 = K1
self.K2 = K2

def auxiliary_rbf_kernel(self, X1, X2):
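        # Kernel K(X1, X2) = sum_j f_j(X1) f_j(X2)^T + rbf(X1, X2): a standard RBF
        # kernel augmented with rank-one terms built from the auxiliary models'
        # predictions. When K1/K2 are precomputed, the matching trailing sub-block
        # (train x train for K1, test x train for K2) is sliced out instead.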
if self.K1 is not None:
if X1.shape[0] == X2.shape[0]:
return self.K1[-X1.shape[0] :, -X2.shape[0] :]
else:
return self.K2[-X1.shape[0] :, -X2.shape[0] :]
else:
K = np.zeros((len(X1), len(X2)))

for algo, idx in self.adaptation_model:
Y1 = model_predict(algo, idx, X1).reshape(-1, 1)
Y2 = model_predict(algo, idx, X2).reshape(-1, 1)
K += Y1 @ Y2.T

K += rbf_kernel(X1, X2, self.gamma)
return K

def fit(self, X, Y):
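        # note: overrides the user-supplied gamma with 1 / n_features,
        # mirroring scikit-learn's gamma="auto" heuristic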
self.gamma = 1 / X.shape[1]
self.model.fit(X, Y)

def predict(self, X):
return self.model.predict(X)


def measure_aux_algo(idx, test_sample, model):
"""
model = ("lgb", 1)
"""
store = store_list[idx]
org_train_x, org_train_y, val_x, val_y = acquire_data(store, True)
pred_y = model_predict(model[0], model[1], val_x[-test_sample:])
return score(pred_y, val_y[-test_sample:])


# Simple "Memory profilers" to see memory usage
def get_memory_usage():
return np.round(psutil.Process(os.getpid()).memory_info()[0] / 2.0**30, 2)


def sizeof_fmt(num, suffix="B"):
for unit in ["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"]:
if abs(num) < 1024.0:
return "%3.1f%s%s" % (num, unit, suffix)
num /= 1024.0
return "%.1f%s%s" % (num, "Yi", suffix)


# Memory Reducer
def reduce_mem_usage(df, float16_flag=True, verbose=True):
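    # Downcast each numeric column to the narrowest dtype that still holds its
    # min/max; pass float16_flag=False to keep at least float32 precision.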
numerics = ["int16", "int32", "int64", "float16", "float32", "float64"]
start_mem = df.memory_usage().sum() / 1024**2
for col in df.columns:
col_type = df[col].dtypes
if col_type in numerics:
c_min = df[col].min()
c_max = df[col].max()
if str(col_type)[:3] == "int":
if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
df[col] = df[col].astype(np.int8)
elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
df[col] = df[col].astype(np.int16)
elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
df[col] = df[col].astype(np.int32)
elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
df[col] = df[col].astype(np.int64)
else:
if float16_flag and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
df[col] = df[col].astype(np.float16)
elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
df[col] = df[col].astype(np.float32)
else:
df[col] = df[col].astype(np.float64)
end_mem = df.memory_usage().sum() / 1024**2
if verbose:
print(
"Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)".format(
end_mem, 100 * (start_mem - end_mem) / start_mem
)
)
return df


# Merge via concat to avoid losing dtypes
def merge_by_concat(df1, df2, merge_on):
merged_gf = df1[merge_on]
merged_gf = merged_gf.merge(df2, on=merge_on, how="left")
new_columns = [col for col in list(merged_gf) if col not in merge_on]
df1 = pd.concat([df1, merged_gf[new_columns]], axis=1)
return df1


def model_predict(algo, idx, test_x):
store = store_list[idx]

if algo == "lgb":
model = lgb.Booster(model_file=os.path.join(model_dir, f"lgb_{store}.out"))
return model.predict(test_x, num_iteration=model.best_iteration)
elif algo == "ridge":
model = joblib.load(os.path.join(model_dir, f"ridge_{store}.out"))
return model.predict(test_x)
elif algo == "svm":
model = joblib.load(os.path.join(model_dir, f"svm_{store}.out"))
return model.predict(test_x)


def get_weights(algo):
weights = []

if algo == "lgb":
for store in store_list:
model = lgb.Booster(model_file=os.path.join(model_dir, f"lgb_{store}.out"))
weights.append(model.feature_importance())
else:
for store in store_list:
model = joblib.load(os.path.join(model_dir, f"ridge_{store}.out"))
weights.append(model.coef_)

return np.array(weights)


def score(real_y, pred_y, sample_weight=None, multioutput="uniform_average"):
    return mean_squared_error(real_y, pred_y, sample_weight=sample_weight, multioutput=multioutput, squared=False)


def acquire_data(store, fill_flag=False):
TARGET = "sales"
suffix = f"_fill" if fill_flag else ""
train = pd.read_pickle(os.path.join(processed_data_dir, f"train_{store}{suffix}.pkl"))
val = pd.read_pickle(os.path.join(processed_data_dir, f"val_{store}{suffix}.pkl"))

train_y = train[TARGET]
train_x = train.drop(columns=TARGET, axis=1)
val_y = val[TARGET]
val_x = val.drop(columns=TARGET, axis=1)

train_x = train_x.to_numpy()
train_y = train_y.to_numpy()
val_x = val_x.to_numpy()
val_y = val_y.to_numpy()

return train_x, train_y, val_x, val_y

+ 0
- 211
examples/dataset_m5_workflow/main.py View File

@@ -1,211 +0,0 @@
import os
import fire
import time
import zipfile
import numpy as np
from tqdm import tqdm
from shutil import copyfile, rmtree

import learnware
from learnware.market import instantiate_learnware_market, BaseUserInfo
from learnware.reuse import JobSelectorReuser, AveragingReuser
from learnware.specification import generate_rkme_table_spec
from m5 import DataLoader
from learnware.logger import get_module_logger

logger = get_module_logger("m5_test", level="INFO")


output_description = {
"Dimension": 1,
"Description": {},
}

input_description = {
"Dimension": 82,
"Description": {},
}

semantic_specs = [
{
"Data": {"Values": ["Table"], "Type": "Class"},
"Task": {"Values": ["Regression"], "Type": "Class"},
"Library": {"Values": ["Scikit-learn"], "Type": "Class"},
"Scenario": {"Values": ["Business"], "Type": "Tag"},
"Description": {"Values": "", "Type": "String"},
"Name": {"Values": "learnware_1", "Type": "String"},
"Input": input_description,
"Output": output_description,
"License": {"Values": ["MIT"], "Type": "Class"},
}
]

user_semantic = {
"Data": {"Values": ["Table"], "Type": "Class"},
"Task": {"Values": ["Regression"], "Type": "Class"},
"Library": {"Values": ["Scikit-learn"], "Type": "Class"},
"Scenario": {"Values": ["Business"], "Type": "Tag"},
"Description": {"Values": "", "Type": "String"},
"Name": {"Values": "", "Type": "String"},
"Input": input_description,
"Output": output_description,
"License": {"Values": ["MIT"], "Type": "Class"},
}


class M5DatasetWorkflow:
def _init_m5_dataset(self):
m5 = DataLoader()
m5.regenerate_data()

algo_list = ["ridge", "lgb"]
for algo in algo_list:
m5.set_algo(algo)
m5.retrain_models()

def _init_learnware_market(self):
"""initialize learnware market"""
# database_ops.clear_learnware_table()
learnware.init()

easy_market = instantiate_learnware_market(name="easy", rebuild=True)
print("Total Item:", len(easy_market))

zip_path_list = []
curr_root = os.path.dirname(os.path.abspath(__file__))
curr_root = os.path.join(curr_root, "learnware_pool")
for zip_path in os.listdir(curr_root):
zip_path_list.append(os.path.join(curr_root, zip_path))

for idx, zip_path in enumerate(zip_path_list):
semantic_spec = semantic_specs[0]
semantic_spec["Name"]["Values"] = "learnware_%d" % (idx)
semantic_spec["Description"]["Values"] = "test_learnware_number_%d" % (idx)
easy_market.add_learnware(zip_path, semantic_spec)

print("Total Item:", len(easy_market))

def prepare_learnware(self, regenerate_flag=False):
if regenerate_flag:
self._init_m5_dataset()

m5 = DataLoader()
idx_list = m5.get_idx_list()
algo_list = ["lgb"] # algo_list = ["ridge", "lgb"]

curr_root = os.path.dirname(os.path.abspath(__file__))
curr_root = os.path.join(curr_root, "learnware_pool")
os.makedirs(curr_root, exist_ok=True)

for idx in tqdm(idx_list):
train_x, train_y, test_x, test_y = m5.get_idx_data(idx)
st = time.time()
spec = generate_rkme_table_spec(X=train_x, gamma=0.1, cuda_idx=0)
ed = time.time()
logger.info("Stat spec generated in %.3f s" % (ed - st))

for algo in algo_list:
m5.set_algo(algo)
dir_path = os.path.join(curr_root, f"{algo}_{idx}")
os.makedirs(dir_path, exist_ok=True)

spec_path = os.path.join(dir_path, "rkme.json")
spec.save(spec_path)

model_path = m5.get_model_path(idx)
model_file = os.path.join(dir_path, "model.out")
copyfile(model_path, model_file)

init_file = os.path.join(dir_path, "__init__.py")
copyfile("example_init.py", init_file)

yaml_file = os.path.join(dir_path, "learnware.yaml")
copyfile("example.yaml", yaml_file)

zip_file = dir_path + ".zip"
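                # package the learnware directory into a zip archive;
                # entries are stored uncompressed (ZIP_STORED)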
with zipfile.ZipFile(zip_file, "w") as zip_obj:
for foldername, subfolders, filenames in os.walk(dir_path):
for filename in filenames:
file_path = os.path.join(foldername, filename)
zip_info = zipfile.ZipInfo(filename)
zip_info.compress_type = zipfile.ZIP_STORED
with open(file_path, "rb") as file:
zip_obj.writestr(zip_info, file.read())

rmtree(dir_path)

def test(self, regenerate_flag=False):
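        # Evaluation protocol: build the market, then for each user task generate
        # an RKME spec from its test data, search the market, and compare top-1
        # search, random selection, job-selector reuse, and averaging-ensemble
        # reuse in terms of RMSE.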
self.prepare_learnware(regenerate_flag)
self._init_learnware_market()

easy_market = instantiate_learnware_market(name="easy")
print("Total Item:", len(easy_market))

m5 = DataLoader()
idx_list = m5.get_idx_list()
os.makedirs("./user_spec", exist_ok=True)
single_score_list = []
random_score_list = []
job_selector_score_list = []
ensemble_score_list = []
improve_list = []

for idx in idx_list:
train_x, train_y, test_x, test_y = m5.get_idx_data(idx)
user_spec = generate_rkme_table_spec(X=test_x, gamma=0.1, cuda_idx=0)
user_spec_path = f"./user_spec/user_{idx}.json"
user_spec.save(user_spec_path)

user_info = BaseUserInfo(semantic_spec=user_semantic, stat_info={"RKMETableSpecification": user_spec})
search_result = easy_market.search_learnware(user_info)
single_result = search_result.get_single_results()
multiple_result = search_result.get_multiple_results()

print(f"search result of user{idx}:")
print(
f"single model num: {len(single_result)}, max_score: {single_result[0].score}, min_score: {single_result[-1].score}"
)
loss_list = []
for single_item in single_result:
pred_y = single_item.learnware.predict(test_x)
loss_list.append(m5.score(test_y, pred_y))
print(
f"Top1-score: {single_result[0].score}, learnware_id: {single_result[0].learnware.id}, loss: {loss_list[0]}"
)

if len(multiple_result) > 0:
mixture_id = " ".join([learnware.id for learnware in multiple_result[0].learnwares])
print(f"mixture_score: {multiple_result[0].score}, mixture_learnware: {mixture_id}")
mixture_learnware_list = multiple_result[0].learnwares
else:
mixture_learnware_list = [single_result[0].learnware]

reuse_job_selector = JobSelectorReuser(learnware_list=mixture_learnware_list, use_herding=False)
job_selector_predict_y = reuse_job_selector.predict(user_data=test_x)
job_selector_score = m5.score(test_y, job_selector_predict_y)
print(f"mixture reuse loss (job selector): {job_selector_score}")

reuse_ensemble = AveragingReuser(learnware_list=mixture_learnware_list, mode="vote_by_prob")
ensemble_predict_y = reuse_ensemble.predict(user_data=test_x)
ensemble_score = m5.score(test_y, ensemble_predict_y)
print(f"mixture reuse loss (ensemble): {ensemble_score}\n")

single_score_list.append(loss_list[0])
random_score_list.append(np.mean(loss_list))
job_selector_score_list.append(job_selector_score)
ensemble_score_list.append(ensemble_score)
improve_list.append((np.mean(loss_list) - loss_list[0]) / np.mean(loss_list))

logger.info("Single search score %.3f +/- %.3f" % (np.mean(single_score_list), np.std(single_score_list)))
logger.info("Random search score: %.3f +/- %.3f" % (np.mean(random_score_list), np.std(random_score_list)))
logger.info("Average score improvement: %.3f" % (np.mean(improve_list)))
logger.info(
"Job selector score: %.3f +/- %.3f" % (np.mean(job_selector_score_list), np.std(job_selector_score_list))
)
logger.info(
"Average ensemble score: %.3f +/- %.3f" % (np.mean(ensemble_score_list), np.std(ensemble_score_list))
)


if __name__ == "__main__":
fire.Fire(M5DatasetWorkflow)

+ 0
- 87
examples/dataset_m5_workflow/upload.py View File

@@ -1,87 +0,0 @@
import hashlib
import requests
import os
import random
import json
import time
from tqdm import tqdm

email = "tanzh@lamda.nju.edu.cn"
password = hashlib.md5(b"Qwerty123").hexdigest()
login_url = "http://210.28.134.201:8089/auth/login"
submit_url = "http://210.28.134.201:8089/user/add_learnware"
all_data_type = ["Table", "Image", "Video", "Text", "Audio"]
all_task_type = [
"Classification",
"Regression",
"Clustering",
"Feature Extraction",
"Generation",
"Segmentation",
"Object Detection",
]
all_device_type = ["CPU", "GPU"]
all_scenario = [
"Business",
"Financial",
"Health",
"Politics",
"Computer",
"Internet",
"Traffic",
"Nature",
"Fashion",
"Industry",
"Agriculture",
"Education",
"Entertainment",
"Architecture",
]

# #################################
# Do not modify the section above #
# #################################


def main():
session = requests.Session()
res = session.post(login_url, json={"email": email, "password": password})

    # replace /path/to/learnware/folder with the path of your learnware folder
learnware_pool = os.listdir(os.path.join(os.path.abspath("."), "learnware_pool"))

for learnware in learnware_pool:
        # edit the corresponding semantic specification below
name = "M5_Shop" + "%02d" % int(learnware.split(".")[0].split("_")[1])
name = name + "_" + time.strftime("%Y%m%d%H%M%S", time.localtime())
description = f"This is a description of learnware {name}"
data = random.choice(all_data_type)
task = random.choice(all_task_type)
device = list(set(random.choices(all_device_type, k=2)))
scenario = list(set(random.choices(all_scenario, k=5)))
semantic_specification = {
"Data": {"Values": ["Table"], "Type": "Class"},
"Task": {"Values": ["Regression"], "Type": "Class"},
"Device": {"Values": ["CPU"], "Type": "Tag"},
"Scenario": {"Values": ["Business"], "Type": "Tag"},
"Description": {"Values": "A sales-forecasting model from Walmart store", "Type": "String"},
"Name": {"Values": name, "Type": "String"},
"License": {"Values": ["MIT"], "Type": "Class"},
}
res = session.post(
submit_url,
data={
"semantic_specification": json.dumps(semantic_specification),
},
files={
"learnware_file": open(
os.path.join(os.path.abspath("."), "learnware_pool", learnware),
"rb",
)
},
)
assert json.loads(res.text)["code"] == 0, "Upload error"


if __name__ == "__main__":
main()

+ 0
- 8
examples/dataset_pfs_workflow/example.yaml View File

@@ -1,8 +0,0 @@
model:
class_name: Model
kwargs: {}
stat_specifications:
- module_path: learnware.specification
class_name: RKMETableSpecification
file_name: rkme.json
kwargs: {}

+ 0
- 20
examples/dataset_pfs_workflow/example_init.py View File

@@ -1,20 +0,0 @@
import os
import joblib
import numpy as np
from learnware.model import BaseModel


class Model(BaseModel):
def __init__(self):
super(Model, self).__init__(input_shape=(31,), output_shape=(1,))
dir_path = os.path.dirname(os.path.abspath(__file__))
self.model = joblib.load(os.path.join(dir_path, "model.out"))

def fit(self, X: np.ndarray, y: np.ndarray):
pass

def predict(self, X: np.ndarray) -> np.ndarray:
return self.model.predict(X)

def finetune(self, X: np.ndarray, y: np.ndarray):
pass

+ 0
- 208
examples/dataset_pfs_workflow/main.py View File

@@ -1,208 +0,0 @@
import os
import fire
import zipfile
import time
import numpy as np
from tqdm import tqdm
from shutil import copyfile, rmtree

import learnware
from learnware.market import instantiate_learnware_market, BaseUserInfo
from learnware.reuse import JobSelectorReuser, AveragingReuser
from learnware.specification import generate_rkme_table_spec
from pfs import Dataloader
from learnware.logger import get_module_logger

logger = get_module_logger("pfs_test", level="INFO")

output_description = {
"Dimension": 1,
"Description": {},
}

input_description = {
"Dimension": 31,
"Description": {},
}

semantic_specs = [
{
"Data": {"Values": ["Table"], "Type": "Class"},
"Task": {"Values": ["Regression"], "Type": "Class"},
"Library": {"Values": ["Scikit-learn"], "Type": "Class"},
"Scenario": {"Values": ["Business"], "Type": "Tag"},
"Description": {"Values": "", "Type": "String"},
"Name": {"Values": "learnware_1", "Type": "String"},
"Input": input_description,
"Output": output_description,
"License": {"Values": ["MIT"], "Type": "Class"},
}
]

user_semantic = {
"Data": {"Values": ["Table"], "Type": "Class"},
"Task": {"Values": ["Regression"], "Type": "Class"},
"Library": {"Values": ["Scikit-learn"], "Type": "Class"},
"Scenario": {"Values": ["Business"], "Type": "Tag"},
"Description": {"Values": "", "Type": "String"},
"Name": {"Values": "", "Type": "String"},
"Input": input_description,
"Output": output_description,
"License": {"Values": ["MIT"], "Type": "Class"},
}


class PFSDatasetWorkflow:
def _init_pfs_dataset(self):
pfs = Dataloader()
pfs.regenerate_data()

algo_list = ["ridge"] # "ridge", "lgb"
for algo in algo_list:
pfs.set_algo(algo)
pfs.retrain_models()

def _init_learnware_market(self):
"""initialize learnware market"""
learnware.init()
easy_market = instantiate_learnware_market(market_id="pfs", name="easy", rebuild=True)
print("Total Item:", len(easy_market))

zip_path_list = []
curr_root = os.path.dirname(os.path.abspath(__file__))
curr_root = os.path.join(curr_root, "learnware_pool")
for zip_path in os.listdir(curr_root):
zip_path_list.append(os.path.join(curr_root, zip_path))

for idx, zip_path in enumerate(zip_path_list):
semantic_spec = semantic_specs[0]
semantic_spec["Name"]["Values"] = "learnware_%d" % (idx)
semantic_spec["Description"]["Values"] = "test_learnware_number_%d" % (idx)
easy_market.add_learnware(zip_path, semantic_spec)

print("Total Item:", len(easy_market))

def prepare_learnware(self, regenerate_flag=False):
if regenerate_flag:
self._init_pfs_dataset()

pfs = Dataloader()
idx_list = pfs.get_idx_list()
algo_list = ["ridge"] # ["ridge", "lgb"]

curr_root = os.path.dirname(os.path.abspath(__file__))
curr_root = os.path.join(curr_root, "learnware_pool")
os.makedirs(curr_root, exist_ok=True)

for idx in tqdm(idx_list):
train_x, train_y, test_x, test_y = pfs.get_idx_data(idx)
st = time.time()
spec = generate_rkme_table_spec(X=train_x, gamma=0.1, cuda_idx=0)
ed = time.time()
logger.info("Stat spec generated in %.3f s" % (ed - st))

for algo in algo_list:
pfs.set_algo(algo)
dir_path = os.path.join(curr_root, f"{algo}_{idx}")
os.makedirs(dir_path, exist_ok=True)

spec_path = os.path.join(dir_path, "rkme.json")
spec.save(spec_path)

model_path = pfs.get_model_path(idx)
model_file = os.path.join(dir_path, "model.out")
copyfile(model_path, model_file)

init_file = os.path.join(dir_path, "__init__.py")
copyfile("example_init.py", init_file)

yaml_file = os.path.join(dir_path, "learnware.yaml")
copyfile("example.yaml", yaml_file)

zip_file = dir_path + ".zip"
with zipfile.ZipFile(zip_file, "w") as zip_obj:
for foldername, subfolders, filenames in os.walk(dir_path):
for filename in filenames:
file_path = os.path.join(foldername, filename)
zip_info = zipfile.ZipInfo(filename)
zip_info.compress_type = zipfile.ZIP_STORED
with open(file_path, "rb") as file:
zip_obj.writestr(zip_info, file.read())

rmtree(dir_path)

def test(self, regenerate_flag=False):
self.prepare_learnware(regenerate_flag)
self._init_learnware_market()

easy_market = instantiate_learnware_market(market_id="pfs", name="easy")
print("Total Item:", len(easy_market))

pfs = Dataloader()
idx_list = pfs.get_idx_list()
os.makedirs("./user_spec", exist_ok=True)
single_score_list = []
random_score_list = []
job_selector_score_list = []
ensemble_score_list = []
improve_list = []

for idx in idx_list:
train_x, train_y, test_x, test_y = pfs.get_idx_data(idx)
user_spec = generate_rkme_table_spec(X=test_x, gamma=0.1, cuda_idx=0)
user_spec_path = f"./user_spec/user_{idx}.json"
user_spec.save(user_spec_path)

user_info = BaseUserInfo(semantic_spec=user_semantic, stat_info={"RKMETableSpecification": user_spec})
search_result = easy_market.search_learnware(user_info)
single_result = search_result.get_single_results()
multiple_result = search_result.get_multiple_results()

print(f"search result of user{idx}:")
print(
f"single model num: {len(single_result)}, max_score: {single_result[0].score}, min_score: {single_result[-1].score}"
)
loss_list = []
for single_item in single_result:
pred_y = single_item.learnware.predict(test_x)
loss_list.append(pfs.score(test_y, pred_y))
print(
f"Top1-score: {single_result[0].score}, learnware_id: {single_result[0].learnware.id}, loss: {loss_list[0]}, random: {np.mean(loss_list)}"
)

if len(multiple_result) > 0:
mixture_id = " ".join([learnware.id for learnware in multiple_result[0].learnwares])
print(f"mixture_score: {multiple_result[0].score}, mixture_learnware: {mixture_id}")
mixture_learnware_list = multiple_result[0].learnwares
else:
mixture_learnware_list = [single_result[0].learnware]

reuse_job_selector = JobSelectorReuser(learnware_list=mixture_learnware_list, use_herding=False)
job_selector_predict_y = reuse_job_selector.predict(user_data=test_x)
job_selector_score = pfs.score(test_y, job_selector_predict_y)
print(f"mixture reuse loss (job selector): {job_selector_score}")

reuse_ensemble = AveragingReuser(learnware_list=mixture_learnware_list)
ensemble_predict_y = reuse_ensemble.predict(user_data=test_x)
ensemble_score = pfs.score(test_y, ensemble_predict_y)
print(f"mixture reuse loss (ensemble): {ensemble_score}\n")

single_score_list.append(loss_list[0])
random_score_list.append(np.mean(loss_list))
job_selector_score_list.append(job_selector_score)
ensemble_score_list.append(ensemble_score)
improve_list.append((np.mean(loss_list) - loss_list[0]) / np.mean(loss_list))

logger.info("Single search score %.3f +/- %.3f" % (np.mean(single_score_list), np.std(single_score_list)))
logger.info("Random search score: %.3f +/- %.3f" % (np.mean(random_score_list), np.std(random_score_list)))
logger.info("Average score improvement: %.3f" % (np.mean(improve_list)))
logger.info(
"Job selector score: %.3f +/- %.3f" % (np.mean(job_selector_score_list), np.std(job_selector_score_list))
)
logger.info(
"Average ensemble score: %.3f +/- %.3f" % (np.mean(ensemble_score_list), np.std(ensemble_score_list))
)


if __name__ == "__main__":
fire.Fire(PFSDatasetWorkflow)

+ 0
- 48
examples/dataset_pfs_workflow/pfs/README.md View File

@@ -1,48 +0,0 @@
# Learnware based on Predict Future Sales (PFS) data downloaded from Kaggle
--> Data Page Link: https://www.kaggle.com/c/competitive-data-science-predict-future-sales/data
--> Code Page Link: https://www.kaggle.com/uladzimirkapeika/feature-engineering-lightgbm-top-1
# PFS task description
--> Goal: predict next month's sales volume of every item in every shop (note: the granularity is monthly, not daily)
--> Feature information: the shop's city, the item category, the item price, historical item prices (feature engineering only uses the previous three months of history, concatenated together), etc.
--> Models used: XGBoost, LightGBM, LinearRegression
--> Evaluation metric: RMSE
* split_pfs_data.py
--> Processes the downloaded data following the preprocessing scheme published on Kaggle
--> Running it directly splits the data by Shop ID into per-shop information, including:
----> the features and target value of each item in each month, stored as a pandas.DataFrame
----> the fields include:
-- identifiers: 'shop_id', 'item_id', 'date_block_num' (month index),
-- target value (sales this month): 'item_cnt_month',
-- city information: 'city_code', 'city_coord_1', 'city_coord_2', 'country_part',
-- item category information: 'item_category_common', 'item_category_code',
-- calendar information of the month: 'weeknd_count', 'days_in_month',
-- whether the item is sold for the first time: 'item_first_interaction', 'shop_item_sold_before',
-- sales volume and price information of the item over the previous three months:
'item_cnt_month_lag_1', 'item_cnt_month_lag_2', 'item_cnt_month_lag_3',
'item_shop_price_avg_lag_1', 'item_shop_price_avg_lag_2', 'item_shop_price_avg_lag_3',
'item_target_enc_lag_1', 'item_target_enc_lag_2', 'item_target_enc_lag_3',
'item_loc_target_enc_lag_1', 'item_loc_target_enc_lag_2', 'item_loc_target_enc_lag_3', 'item_shop_target_enc_lag_1', 'item_shop_target_enc_lag_2', 'item_shop_target_enc_lag_3',
'new_item_cat_avg_lag_1', 'new_item_cat_avg_lag_2', 'new_item_cat_avg_lag_3',
'new_item_shop_cat_avg_lag_1', 'new_item_shop_cat_avg_lag_2', 'new_item_shop_cat_avg_lag_3',
'item_cnt_month_lag_1_adv', 'item_cnt_month_lag_2_adv', 'item_cnt_month_lag_3_adv'
----> features: every column except 'item_cnt_month' is treated as a feature column
----> target value: 'item_cnt_month'
----> time index: 'date_block_num' labels the months from 2013.01 to 2015.10 as 0-33; the 2015.11 data to be predicted is labeled 34
--> The stored result has two parts, a train and a val split by time, both in pandas.DataFrame format
* pfs_cross_transfer.py
--> Trains one model on each shop's training set, tests it on the test sets of all shops, saves the pairwise RMSE results, and analyzes them
--> The analysis has two parts: (1) for each target shop, the mean, variance, minimum (the best model), and maximum of the remaining source models' performance, the number of sources above the mean, the improvement ratio achievable by picking the best model, etc.; (2) a heatmap
--> Directions to extend: (1) LightGBM, Ridge, and XGBoost with hyper-parameter tuning; (2) remove identifier features (e.g., shop_id, item_id) in feature engineering
* data_api.py
--> wrapper code to be completed later
* packages
--> pip install lightgbm
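* quick usage sketch
--> a minimal, hypothetical example of training and scoring on one shop's split (assumes split_pfs_data.py has already produced the per-shop CSVs described above; the file names are illustrative):
    import pandas as pd
    from sklearn.linear_model import Ridge
    from sklearn.metrics import mean_squared_error

    train = pd.read_csv("Shop02-train.csv")
    val = pd.read_csv("Shop02-val.csv")
    features = [c for c in train.columns if c not in ("item_cnt_month", "date_block_num")]

    model = Ridge(alpha=1.0).fit(train[features], train["item_cnt_month"])
    pred = model.predict(val[features])
    print("RMSE:", mean_squared_error(val["item_cnt_month"], pred, squared=False))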

+ 0
- 77
examples/dataset_pfs_workflow/pfs/__init__.py View File

@@ -1,77 +0,0 @@
import joblib
import os
from sklearn.metrics import mean_squared_error


from .pfs_cross_transfer import *
from .split_data import feature_engineering


class Dataloader:
def __init__(self):
self.algo = "ridge"

def regenerate_data(self):
feature_engineering()

def set_algo(self, algo):
self.algo = algo

def get_algo_list(self):
return ["lgb", "ridge"]

def get_idx_list(self):
return [i for i in range(53)]

def get_idx_data(self, idx):
shop_ids = [i for i in range(60) if i not in [0, 1, 40]]
shop_ids = [i for i in shop_ids if i not in [8, 11, 23, 36]]

fpath = os.path.join(pfs_split_dir, "Shop{:0>2d}-train.csv".format(shop_ids[idx]))
train_xs, train_ys, _, _ = load_pfs_data(fpath)
fpath = os.path.join(pfs_split_dir, "Shop{:0>2d}-val.csv".format(shop_ids[idx]))
test_xs, test_ys, _, _ = load_pfs_data(fpath)
return train_xs, train_ys, test_xs, test_ys

def get_model_path(self, idx):
shop_ids = [i for i in range(60) if i not in [0, 1, 40]]
shop_ids = [i for i in shop_ids if i not in [8, 11, 23, 36]]
return os.path.join(model_dir, "{}_Shop{:0>2d}.out".format(self.algo, shop_ids[idx]))

def retrain_models(self):
algo = self.algo
errs = get_errors(algo=algo)

fpath = os.path.join(pfs_res_dir, "PFS_{}_errs.txt".format(algo))
np.savetxt(fpath, errs.T)

plot_heatmap(errs.T, algo)
weights = np.loadtxt(os.path.join(pfs_res_dir, "PFS_{}_weights.txt".format(algo)))
plot_performance(errs.T, weights, algo)

def retrain_split_models(self):
fpath = os.path.join(pfs_res_dir, "PFS_{}_split_errs_user.txt".format(self.algo))
if os.path.exists(fpath):
return np.loadtxt(fpath)
algo = self.algo
errs = get_split_errs(algo=algo)
fpath = os.path.join(pfs_res_dir, "PFS_{}_split_errs_user.txt".format(algo))
np.savetxt(fpath, errs)
return errs

def get_errs(self):
return np.loadtxt(os.path.join(pfs_res_dir, "PFS_{}_errs.txt".format(self.algo)))

def get_weights(self):
return np.loadtxt(os.path.join(pfs_res_dir, "PFS_{}_weights.txt".format(self.algo)))

def predict(self, idx, test_x):
shop_ids = [i for i in range(60) if i not in [0, 1, 40]]
shop_ids = [i for i in shop_ids if i not in [8, 11, 23, 36]]

model = joblib.load(os.path.join(model_dir, "{}_Shop{:0>2d}.out".format(self.algo, shop_ids[idx])))
# test_x = (test_x - test_x.min(0)) / (test_x.max(0) - test_x.min(0) + 0.0001)
return model.predict(test_x)

def score(self, real_y, pred_y, sample_weight=None):
return mean_squared_error(real_y, pred_y, sample_weight=sample_weight, squared=False)

+ 0
- 272
examples/dataset_pfs_workflow/pfs/config.py View File

@@ -1,272 +0,0 @@
market_store_list = [
0,
2,
3,
4,
5,
6,
7,
8,
9,
10,
12,
13,
14,
15,
16,
17,
18,
20,
22,
23,
24,
25,
26,
27,
28,
30,
31,
32,
33,
34,
35,
37,
38,
39,
40,
42,
44,
45,
46,
47,
48,
50,
52,
]
user_store_list = [1, 11, 19, 21, 29, 36, 43, 49]

dataset_info = {
"name": "PFS",
"range of date": "2014.01-2015.10",
"description": "You are provided with daily historical sales data. The task is to forecast the total amount of products sold in every shop for the test set. Note that the list of shops and products slightly changes every month. More specifically, the dataset involves 53 shops in Russia",
"location_original": [
"Адыгея, Россия",
"Балашиха, Россия",
"Волжский, Россия",
"Вологда, Россия",
"Воронеж, Россия",
"Воронеж, Россия",
"Воронеж, Россия",
"выезд, Россия",
"Жуковский, Россия",
"интернет-магазин, Россия",
"Казань, Россия",
"Калуга, Россия",
"колонна, Россия",
"Красноярск, Россия",
"Красноярск, Россия",
"курск, Россия",
"Москва, Россия",
"Москва, Россия",
"Москва, Россия",
"Москва, Россия",
"Москва, Россия",
"Москва, Россия",
"Москва, Россия",
"Москва, Россия",
"Москва, Россия",
"Москва, Россия",
"Москва, Россия",
"Москва, Россия",
"Мытищи, Россия",
"Н.Новгород, Россия",
"Н.Новгород, Россия",
"Новосибирск, Россия",
"Новосибирск, Россия",
"Ростовнадон, Россия",
"Ростовнадон, Россия",
"спб, Россия",
"спб, Россия",
"самара, Россия",
"самара, Россия",
"Сергий, Россия",
"Сургут, Россия",
"томск, Россия",
"тюмень, Россия",
"тюмень, Россия",
"тюмень, Россия",
"Уфа, Россия",
"Уфа, Россия",
"Химки, Россия",
"цифровой, Россия",
"Чехи, Россия",
"Якутск, Россия",
"Якутск, Россия",
"Ярославль, Россия",
],
"location_english": [
"adygea, Russia",
"Balashikha, Russia",
"Volzhsky, Russia",
"Vologda, Russia",
"Voronezh, Russia",
"Voronezh, Russia",
"Voronezh, Russia",
"outbound, Russia",
"zhukovsky, Russia",
"online stor, Russia",
"Kazan, Russia",
"Kaluga, Russia",
"column, Russia",
"Krasnoyarsk, Russia",
"Krasnoyarsk, Russia",
"kursk, Russia",
"Moscow, Russia",
"Moscow, Russia",
"Moscow, Russia",
"Moscow, Russia",
"Moscow, Russia",
"Moscow, Russia",
"Moscow, Russia",
"Moscow, Russia",
"Moscow, Russia",
"Moscow, Russia",
"Moscow, Russia",
"Moscow, Russia",
"mytishchi, Russia",
"N.Novgorod, Russia",
"N.Novgorod, Russia",
"Novosibirsk, Russia",
"Novosibirsk, Russia",
"rostovnadon, Russia",
"rostovnadon, Russia",
"spb, Russia",
"spb, Russia",
"samara, Russia",
"samara, Russia",
"Sergius, Russia",
"surgut, Russia",
"tomsk, Russia",
"tyumen, Russia",
"tyumen, Russia",
"tyumen, Russia",
"Ufa, Russia",
"Ufa, Russia",
"Khimki, Russia",
"numeric, Russia",
"Czechs, Russia",
"Yakutsk, Russia",
"Yakutsk, Russia",
"Yaroslavl, Russia",
],
"location_chinese": [
"阿迪格亚, 俄罗斯",
"巴拉希哈, 俄罗斯",
"沃尔日斯基, 俄罗斯",
"沃洛格达, 俄罗斯",
"沃罗涅日, 俄罗斯",
"沃罗涅日, 俄罗斯",
"沃罗涅日, 俄罗斯",
"对外贸易, 俄罗斯",
"茹科夫斯基, 俄罗斯",
"在线商店, 俄罗斯",
"喀山, 俄罗斯",
"卡卢加, 俄罗斯",
"科洛姆纳, 俄罗斯",
"克拉斯诺亚尔斯克, 俄罗斯",
"克拉斯诺亚尔斯克, 俄罗斯",
"库尔斯克, 俄罗斯",
"莫斯科, 俄罗斯",
"莫斯科, 俄罗斯",
"莫斯科, 俄罗斯",
"莫斯科, 俄罗斯",
"莫斯科, 俄罗斯",
"莫斯科, 俄罗斯",
"莫斯科, 俄罗斯",
"莫斯科, 俄罗斯",
"莫斯科, 俄罗斯",
"莫斯科, 俄罗斯",
"莫斯科, 俄罗斯",
"莫斯科, 俄罗斯",
"梅季希, 俄罗斯",
"北诺夫哥罗德, 俄罗斯",
"北诺夫哥罗德, 俄罗斯",
"新西伯利亚, 俄罗斯",
"新西伯利亚, 俄罗斯",
"罗斯托夫纳东, 俄罗斯",
"罗斯托夫纳东, 俄罗斯",
"圣彼得堡, 俄罗斯",
"圣彼得堡, 俄罗斯",
"萨马拉, 俄罗斯",
"萨马拉, 俄罗斯",
"谢尔盖, 俄罗斯",
"苏尔古特, 俄罗斯",
"托木斯克, 俄罗斯",
"秋明, 俄罗斯",
"秋明, 俄罗斯",
"秋明, 俄罗斯",
"乌法, 俄罗斯",
"乌法, 俄罗斯",
"希姆基, 俄罗斯",
"在线商店, 俄罗斯",
"契诃夫, 俄罗斯",
"雅库茨克, 俄罗斯",
"雅库茨克, 俄罗斯",
"雅罗斯拉夫尔, 俄罗斯",
],
"memory(KB)": [
246,
302,
3631,
379,
862,
1020,
471,
867,
588,
233,
657,
1272,
801,
469,
146,
1309,
98,
1003,
932,
257,
1959,
1361,
35,
3265,
217,
283,
4311,
1155,
43,
1388,
1971,
971,
7272,
2782,
304,
6801,
4942,
181,
190,
3664,
2061,
170,
807,
593,
1584,
257,
1819,
50,
1063,
692,
336,
277,
743,
],
}

+ 0
- 21
examples/dataset_pfs_workflow/pfs/paths.py View File

@@ -1,21 +0,0 @@
import os

ROOT_PATH = os.path.abspath(os.path.join(__file__, "..", "data"))
raw_data_dir = os.path.join(ROOT_PATH, "raw_data")
split_data_dir = os.path.join(ROOT_PATH, "split_data")
res_dir = os.path.join(ROOT_PATH, "results")
model_dir = os.path.join(ROOT_PATH, "models")
model_dir2 = os.path.join(ROOT_PATH, "models2")

for dir_name in [ROOT_PATH, raw_data_dir, split_data_dir, res_dir, model_dir, model_dir2]:
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)

pfs_data_dir = os.path.join(raw_data_dir, "PFS")
pfs_split_dir = os.path.join(split_data_dir, "PFS")
pfs_res_dir = os.path.join(res_dir, "PFS")

for dir_name in [pfs_data_dir, pfs_split_dir, pfs_res_dir]:
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)

+ 0
- 384
examples/dataset_pfs_workflow/pfs/pfs_cross_transfer.py View File

@@ -1,384 +0,0 @@
import os
import pickle
import joblib
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from matplotlib import pyplot as plt
import matplotlib.ticker as ticker
from mpl_toolkits.axes_grid1 import make_axes_locatable

from .paths import pfs_split_dir, pfs_res_dir, model_dir

np.seterr(divide="ignore", invalid="ignore")
np.random.seed(0)


def load_pfs_data(fpath):
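    # Load one shop's split CSV; every column except the target ('item_cnt_month')
    # and the month index ('date_block_num') is used as a feature. The returned
    # `types` is currently an unused placeholder.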
df = pd.read_csv(fpath)
features = list(df.columns)
features.remove("item_cnt_month")
features.remove("date_block_num")
# remove id info
# features.remove('shop_id')
# features.remove('item_id')
# remove discrete info
# features.remove('city_code')
# features.remove('item_category_code')
# features.remove('item_category_common')
xs = df[features].values
ys = df["item_cnt_month"].values
categorical_feature_names = ["country_part", "item_category_common", "item_category_code", "city_code"]
types = None
return xs, ys, features, types


def get_split_errs(algo):
"""
according to proportion_list, generate errs whose shape is [shop, split_data]
"""
shop_ids = [i for i in range(60) if i not in [0, 1, 40]]
shop_ids = [i for i in shop_ids if i not in [8, 11, 23, 36]]
user_list = [i for i in range(53)]
proportion_list = [100, 300, 500, 700, 900, 1000, 3000, 5000, 7000, 9000, 10000, 30000, 50000, 70000]
# train
errs = np.zeros((len(user_list), len(proportion_list)))
for s, sid in enumerate(user_list):
# load train data
fpath = os.path.join(pfs_split_dir, "Shop{:0>2d}-train.csv".format(shop_ids[sid]))
fpath_val = os.path.join(pfs_split_dir, "Shop{:0>2d}-val.csv".format(shop_ids[sid]))
train_xs, train_ys, _, _ = load_pfs_data(fpath)
val_xs, val_ys, _, _ = load_pfs_data(fpath_val)
print(shop_ids[sid], train_xs.shape, train_ys.shape)
# data regu
# train_xs = (train_xs - train_xs.min(0)) / (train_xs.max(0) - train_xs.min(0) + 0.0001)
# val_xs = (val_xs - val_xs.min(0)) / (val_xs.max(0) - val_xs.min(0) + 0.0001)
if algo == "lgb":
for tmp in range(len(proportion_list)):
model = lgb.LGBMModel(
boosting_type="gbdt",
num_leaves=2**7 - 1,
learning_rate=0.01,
objective="rmse",
metric="rmse",
feature_fraction=0.75,
bagging_fraction=0.75,
bagging_freq=5,
seed=1,
verbose=1,
n_estimators=100000,
)
model_ori = joblib.load(os.path.join(model_dir, "{}_Shop{:0>2d}.out".format("lgb", shop_ids[sid])))
para = model_ori.get_params()
para["n_estimators"] = 1000
model.set_params(**para)
split = train_xs.shape[0] - proportion_list[tmp]
model.fit(
train_xs[
split:,
],
train_ys[split:],
eval_set=[(val_xs, val_ys)],
early_stopping_rounds=50,
verbose=100,
)
pred_ys = model.predict(val_xs)
rmse = np.sqrt(((val_ys - pred_ys) ** 2).mean())
errs[s][tmp] = rmse
return errs


def get_errors(algo):
shop_ids = [i for i in range(60) if i not in [0, 1, 40]]
shop_ids = [i for i in shop_ids if i not in [8, 11, 23, 36]]
# train
K = len(shop_ids)
feature_weight = np.zeros(())
errs = np.zeros((K, K))
for s, sid in enumerate(shop_ids):
# load train data
fpath = os.path.join(pfs_split_dir, "Shop{:0>2d}-train.csv".format(sid))
fpath_val = os.path.join(pfs_split_dir, "Shop{:0>2d}-val.csv".format(sid))
train_xs, train_ys, features, _ = load_pfs_data(fpath)
val_xs, val_ys, _, _ = load_pfs_data(fpath_val)
print(sid, train_xs.shape, train_ys.shape)
if s == 0:
feature_weight = np.zeros((K, len(features)))
if algo == "lgb":
model = lgb.LGBMModel(
boosting_type="gbdt",
num_leaves=2**7 - 1,
learning_rate=0.01,
objective="rmse",
metric="rmse",
feature_fraction=0.75,
bagging_fraction=0.75,
bagging_freq=5,
seed=1,
verbose=1,
n_estimators=1000,
)
# train regu data
# train_xs = (train_xs - train_xs.min(0)) / (train_xs.max(0) - train_xs.min(0) + 0.0001)
# val_xs = (val_xs - val_xs.min(0)) / (val_xs.max(0) - val_xs.min(0) + 0.0001)
model.fit(train_xs, train_ys, eval_set=[(val_xs, val_ys)], early_stopping_rounds=100, verbose=100)
# grid search
# para = {'learning_rate': [0.005, 0.01, 0.015], 'num_leaves' : [128, 224, 300], 'max_depth' : [50, 66, 80]}
# grid_search = GridSearchCV(model, para, scoring='neg_mean_squared_error')
# grid_result = grid_search.fit(train_xs, train_ys, eval_set=[(val_xs, val_ys)], verbose = 1000, early_stopping_rounds=1000)
# model = grid_result.best_estimator_
joblib.dump(model, os.path.join(model_dir, "{}_Shop{:0>2d}.out".format(algo, sid)))
importances = model.feature_importances_
elif algo == "ridge":
# train_xs = (train_xs - train_xs.min(0)) / (train_xs.max(0) - train_xs.min(0) + 0.0001)
model = Ridge()
para = {"alpha": [0.01, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10, 20, 30]}
grid_search = GridSearchCV(model, para)
grid_result = grid_search.fit(train_xs, train_ys)
model = grid_result.best_estimator_
importances = model.coef_
joblib.dump(model, os.path.join(model_dir, "{}_Shop{:0>2d}.out".format(algo, sid)))
feature_weight[s] = importances
# leave one out test
for t, tid in enumerate(shop_ids):
# load test data
fpath = os.path.join(pfs_split_dir, "Shop{:0>2d}-val.csv".format(tid))
test_xs, test_ys, _, _ = load_pfs_data(fpath)
# data regu
# test_xs = (test_xs - test_xs.min(0)) / (test_xs.max(0) - test_xs.min(0) + 0.0001)
pred_ys = model.predict(test_xs)
rmse = np.sqrt(((test_ys - pred_ys) ** 2).mean())
print("Shop{} --> Shop{}: {}".format(s, t, rmse))
errs[s][t] = rmse
np.savetxt(os.path.join(pfs_res_dir, "PFS_{}_weights.txt".format(algo)), feature_weight)
return errs


def plot_heatmap(mat, algo):
x_labels = [f"Model{i}" for i in range(mat.shape[1])]
y_labels = [f"Task{i}" for i in range(mat.shape[0])]
fig = plt.figure(figsize=(10, 9))
plt.subplot(1, 1, 1)
ax = plt.gca()
im = plt.imshow(mat)
divider = make_axes_locatable(ax)
cax = divider.append_axes("right", size="4%", pad=0.3)
plt.colorbar(im, cax=cax)
ax.set_xticks(range(len(x_labels)))
ax.set_xticklabels(x_labels)
ax.set_yticks(range(len(y_labels)))
ax.set_yticklabels(y_labels)
ax.xaxis.set_major_locator(ticker.MultipleLocator(base=5))
ax.yaxis.set_major_locator(ticker.MultipleLocator(base=5))
ax.set_title(f"RMSE on Test set ({algo})")
plt.tight_layout()
plt.savefig(os.path.join(pfs_res_dir, "PFS_{}_heatmap.jpg".format(algo)), dpi=700)


def plot_var(errs, algo):
avg_err = []
min_err = []
med_err = []
max_err = []
std_err = []
cnts = []
improves = []
for j in range(len(errs)):
inds = [i for i in range(len(errs)) if i != j]
ys = errs[:, j][inds]
avg_err.append(np.mean(ys))
min_err.append(np.min(ys))
med_err.append(np.median(ys))
max_err.append(np.max(ys))
std_err.append(np.std(ys))
cnts.append(np.sum(ys >= np.mean(ys)))
improves.append((np.mean(ys) - np.min(ys)) / np.mean(ys))
avg_err = np.array(avg_err)
min_err = np.array(min_err)
med_err = np.array(med_err)
max_err = np.array(max_err)
std_err = np.array(std_err)
cnts = np.array(cnts)
improves = np.array(improves)
inds = np.argsort(avg_err)
avg_err = avg_err[inds]
min_err = min_err[inds]
med_err = med_err[inds]
max_err = max_err[inds]
std_err = std_err[inds]
cnts = cnts[inds]
improves = improves[inds]
xs = list(range(len(inds)))
fig = plt.figure(figsize=(8, 8))
ax = plt.subplot(3, 1, 1)
ax.plot(xs, avg_err, color="red", linestyle="solid", linewidth=2.5)
ax.plot(xs, min_err, color="blue", linestyle="dotted", linewidth=1.5)
ax.plot(xs, med_err, color="purple", linestyle="solid", linewidth=1.0)
ax.plot(xs, max_err, color="green", linestyle="dashed", linewidth=1.5)
ax.legend(["Avg", "Min", "Median", "Max"], fontsize=14)
ax.fill_between(xs, avg_err - std_err, avg_err + std_err, alpha=0.2)
gap = np.mean(avg_err - min_err)
ax.set_ylabel("RMSE", fontsize=14)
ax.set_title("RMSE of Source Models ({}) [Avg-Min:{:.3f}]".format(algo, gap), fontsize=18)
ax = plt.subplot(3, 1, 2)
ax.bar(xs, cnts)
ax.set_ylabel("Number", fontsize=14)
ax.set_title("Number of sources above average", fontsize=18)
ax = plt.subplot(3, 1, 3)
ax.plot(xs, improves)
ax.set_xlabel("Sorted Shop ID by Avg.Err", fontsize=14)
ax.set_ylabel("Ratio", fontsize=14)
ax.set_title("Best Improve Ratio: (Avg - Min) / Avg", fontsize=18)
fig.tight_layout()
fig.savefig(os.path.join(pfs_res_dir, "{}-var.jpg".format(algo)))
plt.show()


def plot_performance(errs, weights, algo):
avg_err = []
min_err = []
med_err = []
max_err = []
std_err = []
cnts = []
improves = []
for i in range(errs.shape[0]):
inds = [j for j in range(errs.shape[1]) if j != i]
arr = errs[i][inds]
avg_err.append(np.mean(arr))
min_err.append(np.min(arr))
med_err.append(np.median(arr))
max_err.append(np.max(arr))
std_err.append(np.std(arr))
cnts.append(np.sum(arr >= np.mean(arr)))
improves.append((np.mean(arr) - np.min(arr)) / np.mean(arr))
avg_err = np.array(avg_err)
min_err = np.array(min_err)
med_err = np.array(med_err)
max_err = np.array(max_err)
std_err = np.array(std_err)
cnts = np.array(cnts)
improves = np.array(improves)
inds = np.argsort(avg_err)
avg_err = avg_err[inds]
min_err = min_err[inds]
med_err = med_err[inds]
max_err = max_err[inds]
std_err = std_err[inds]
cnts = cnts[inds]
improves = improves[inds]
xs = list(range(len(inds)))
fig = plt.figure(figsize=(12, 9))
ax = plt.subplot(2, 2, 1)
ax.plot(xs, avg_err, color="red", linestyle="solid", linewidth=2.5)
ax.plot(xs, min_err, color="blue", linestyle="dotted", linewidth=1.5)
ax.plot(xs, med_err, color="purple", linestyle="solid", linewidth=1.0)
ax.plot(xs, max_err, color="green", linestyle="dashed", linewidth=1.5)
ax.legend(["Avg", "Min", "Median", "Max"], fontsize=14)
ax.fill_between(xs, avg_err - std_err, avg_err + std_err, alpha=0.2)
gap = np.mean(avg_err - min_err)
ax.set_ylabel("RMSE", fontsize=14)
ax.set_title("RMSE of Source Models ({}) [Avg-Min:{:.3f}]".format(algo, gap), fontsize=18)
ax = plt.subplot(2, 2, 2)
ax.bar(xs, cnts)
ax.set_ylabel("Number", fontsize=14)
ax.set_title("Number of sources above average", fontsize=18)
ax = plt.subplot(2, 2, 3)
ax.plot(xs, improves)
ax.set_xlabel("Sorted Shop ID by Avg.Err", fontsize=14)
ax.set_ylabel("Ratio", fontsize=14)
ax.set_title("Best Improve Ratio: (Avg - Min) / Avg", fontsize=18)
ax = plt.subplot(2, 2, 4)
weights = np.mean(weights, axis=0) / weights.sum()
weights = np.sort(weights)
xs = list(range(len(weights)))
ax.plot(xs, weights)
# ax.set_xlabel("Sorted Feature ID by Avg.Feature_Importance", fontsize=14)
ax.set_ylabel("Proportion", fontsize=14)
ax.set_title("Avg.Feature_Importances", fontsize=18)
fig.tight_layout()
fig.savefig(os.path.join(pfs_res_dir, "PFS_{}_performance.png".format(algo)), dpi=700)
# fig.savefig(f"{algo}_performance.png", dpi=700)
plt.show()


if __name__ == "__main__":
# for algo in ["ridge", "lgb", "xgboost_125"]:
for algo in ["ridge"]:
fpath = os.path.join(pfs_res_dir, "{}_errs.pkl".format(algo))
if os.path.exists(fpath):
with open(fpath, "rb") as fr:
errs = pickle.load(fr)
else:
errs = get_errors(algo=algo)
with open(fpath, "wb") as fw:
pickle.dump(errs, fw)
index = ["Source{}".format(k) for k in range(len(errs))]
columns = ["Target{}".format(k) for k in range(len(errs[0]))]
df = pd.DataFrame(errs, index=index, columns=columns)
fpath = os.path.join(pfs_res_dir, "PFS_{}_errs.txt".format(algo))
# df.to_csv(fpath, index=True)
np.savetxt(fpath, errs.T)
# plot_var(errs, algo)
plot_heatmap(errs.T, algo)
weights = np.loadtxt(os.path.join(pfs_res_dir, "PFS_{}_weights.txt".format(algo)))
plot_performance(errs.T, weights, algo)

+ 0
- 384
examples/dataset_pfs_workflow/pfs/split_data.py View File

@@ -1,384 +0,0 @@
import os
import pickle
import calendar
import numpy as np
import pandas as pd
from itertools import product
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

from .paths import pfs_data_dir
from .paths import pfs_split_dir


def feature_engineering():
# read data
sales = pd.read_csv(os.path.join(pfs_data_dir, "sales_train.csv"))
shops = pd.read_csv(os.path.join(pfs_data_dir, "shops.csv"))
items = pd.read_csv(os.path.join(pfs_data_dir, "items.csv"))
item_cats = pd.read_csv(os.path.join(pfs_data_dir, "item_categories.csv"))
test = pd.read_csv(os.path.join(pfs_data_dir, "test.csv"))
# remove outliers
train = sales[(sales.item_price < 10000) & (sales.item_price > 0)]
    train = train[train.item_cnt_day < 1001]
print(train.shape, sales.shape)
print(train.tail(5))
print(sales.tail(5))
# combine shops with different id but the same name
train.loc[train.shop_id == 0, "shop_id"] = 57
test.loc[test.shop_id == 0, "shop_id"] = 57
train.loc[train.shop_id == 1, "shop_id"] = 58
test.loc[test.shop_id == 1, "shop_id"] = 58
train.loc[train.shop_id == 40, "shop_id"] = 39
test.loc[test.shop_id == 40, "shop_id"] = 39
# obtain shop_id, item_id, month information
index_cols = ["shop_id", "item_id", "date_block_num"]
df = []
for block_num in train["date_block_num"].unique():
        cur_shops = train.loc[train["date_block_num"] == block_num, "shop_id"].unique()
        cur_items = train.loc[train["date_block_num"] == block_num, "item_id"].unique()
df.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])), dtype="int32"))
df = pd.DataFrame(np.vstack(df), columns=index_cols, dtype=np.int32)
print("df.shape: ", df.shape)
print(df.head(5))
# Add month sales
group = train.groupby(["date_block_num", "shop_id", "item_id"]).agg({"item_cnt_day": ["sum"]})
group.columns = ["item_cnt_month"]
group.reset_index(inplace=True)
print("group.shape: ", group.shape)
print(group.head(5))
df = pd.merge(df, group, on=index_cols, how="left")
df["item_cnt_month"] = (
df["item_cnt_month"]
.fillna(0)
.astype(np.float32)
# df['item_cnt_month'].fillna(0).clip(0, 20).astype(np.float32)
)
# fill test data
test["date_block_num"] = 34
test["date_block_num"] = test["date_block_num"].astype(np.int8)
test["shop_id"] = test["shop_id"].astype(np.int8)
test["item_id"] = test["item_id"].astype(np.int16)
df = pd.concat([df, test], ignore_index=True, sort=False, keys=index_cols)
df.fillna(0, inplace=True)
# shop location features
shops["city"] = shops["shop_name"].apply(lambda x: x.split()[0].lower())
shops.loc[shops.city == "!якутск", "city"] = "якутск"
shops["city_code"] = LabelEncoder().fit_transform(shops["city"])
coords = dict()
coords["якутск"] = (62.028098, 129.732555, 4)
coords["адыгея"] = (44.609764, 40.100516, 3)
coords["балашиха"] = (55.8094500, 37.9580600, 1)
coords["волжский"] = (53.4305800, 50.1190000, 3)
coords["вологда"] = (59.2239000, 39.8839800, 2)
coords["воронеж"] = (51.6720400, 39.1843000, 3)
coords["выездная"] = (0, 0, 0)
coords["жуковский"] = (55.5952800, 38.1202800, 1)
coords["интернет-магазин"] = (0, 0, 0)
coords["казань"] = (55.7887400, 49.1221400, 4)
coords["калуга"] = (54.5293000, 36.2754200, 4)
coords["коломна"] = (55.0794400, 38.7783300, 4)
coords["красноярск"] = (56.0183900, 92.8671700, 4)
coords["курск"] = (51.7373300, 36.1873500, 3)
coords["москва"] = (55.7522200, 37.6155600, 1)
coords["мытищи"] = (55.9116300, 37.7307600, 1)
coords["н.новгород"] = (56.3286700, 44.0020500, 4)
coords["новосибирск"] = (55.0415000, 82.9346000, 4)
coords["омск"] = (54.9924400, 73.3685900, 4)
coords["ростовнадону"] = (47.2313500, 39.7232800, 3)
coords["спб"] = (59.9386300, 30.3141300, 2)
coords["самара"] = (53.2000700, 50.1500000, 4)
coords["сергиев"] = (56.3000000, 38.1333300, 4)
coords["сургут"] = (61.2500000, 73.4166700, 4)
coords["томск"] = (56.4977100, 84.9743700, 4)
coords["тюмень"] = (57.1522200, 65.5272200, 4)
coords["уфа"] = (54.7430600, 55.9677900, 4)
coords["химки"] = (55.8970400, 37.4296900, 1)
coords["цифровой"] = (0, 0, 0)
coords["чехов"] = (55.1477000, 37.4772800, 4)
coords["ярославль"] = (57.6298700, 39.8736800, 2)
shops["city_coord_1"] = shops["city"].apply(lambda x: coords[x][0])
shops["city_coord_2"] = shops["city"].apply(lambda x: coords[x][1])
shops["country_part"] = shops["city"].apply(lambda x: coords[x][2])
shops = shops[["shop_id", "city_code", "city_coord_1", "city_coord_2", "country_part"]]
df = pd.merge(df, shops, on=["shop_id"], how="left")
# process items category name
map_dict = {
"Чистые носители (штучные)": "Чистые носители",
"Чистые носители (шпиль)": "Чистые носители",
"PC ": "Аксессуары",
"Служебные": "Служебные ",
}
items = pd.merge(items, item_cats, on="item_category_id")
items["item_category"] = items["item_category_name"].apply(lambda x: x.split("-")[0])
items["item_category"] = items["item_category"].apply(lambda x: map_dict[x] if x in map_dict.keys() else x)
items["item_category_common"] = LabelEncoder().fit_transform(items["item_category"])
items["item_category_code"] = LabelEncoder().fit_transform(items["item_category_name"])
items = items[["item_id", "item_category_common", "item_category_code"]]
df = pd.merge(df, items, on=["item_id"], how="left")
# Weekends count / number of days in a month
def count_days(date_block_num):
year = 2013 + date_block_num // 12
month = 1 + date_block_num % 12
weeknd_count = len([1 for i in calendar.monthcalendar(year, month) if i[6] != 0])
days_in_month = calendar.monthrange(year, month)[1]
return weeknd_count, days_in_month, month
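# Note: calendar.monthcalendar returns Monday-first weeks (0 for days outside
# the month), so i[6] != 0 counts Sundays; "weeknd_count" is therefore the
# number of Sundays in the month, a rough proxy for weekend days.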
map_dict = {i: count_days(i) for i in range(35)}
df["weeknd_count"] = df["date_block_num"].apply(lambda x: map_dict[x][0])
df["days_in_month"] = df["date_block_num"].apply(lambda x: map_dict[x][1])
# Interaction features: item is new / item was bought in this shop before
first_item_block = df.groupby(["item_id"])["date_block_num"].min().reset_index()
first_item_block["item_first_interaction"] = 1
first_shop_item_buy_block = (
df[df["date_block_num"] > 0].groupby(["shop_id", "item_id"])["date_block_num"].min().reset_index()
)
first_shop_item_buy_block["first_date_block_num"] = first_shop_item_buy_block["date_block_num"]
df = pd.merge(
df,
first_item_block[["item_id", "date_block_num", "item_first_interaction"]],
on=["item_id", "date_block_num"],
how="left",
)
df = pd.merge(
df,
first_shop_item_buy_block[["item_id", "shop_id", "first_date_block_num"]],
on=["item_id", "shop_id"],
how="left",
)
df["first_date_block_num"].fillna(100, inplace=True)
df["shop_item_sold_before"] = (df["first_date_block_num"] < df["date_block_num"]).astype("int8")
df.drop(["first_date_block_num"], axis=1, inplace=True)
df["item_first_interaction"].fillna(0, inplace=True)
df["shop_item_sold_before"].fillna(0, inplace=True)
df["item_first_interaction"] = df["item_first_interaction"].astype("int8")
df["shop_item_sold_before"] = df["shop_item_sold_before"].astype("int8")
def lag_feature(df, lags, col):
tmp = df[["date_block_num", "shop_id", "item_id", col]]
for i in lags:
shifted = tmp.copy()
shifted.columns = ["date_block_num", "shop_id", "item_id", col + "_lag_" + str(i)]
shifted["date_block_num"] += i
df = pd.merge(df, shifted, on=["date_block_num", "shop_id", "item_id"], how="left")
lag_name = col + "_lag_" + str(i)
df[lag_name] = df[lag_name].astype("float32")
return df
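# lag_feature works by self-merging: a copy of (month, shop, item, col) has
# its month shifted forward by i, so after the left merge each row carries
# the value this (shop, item) pair had i months earlier (NaN if none).
# E.g. with lags=[1], a hypothetical row (month=5, shop=2, item=7) gains
# "item_cnt_month_lag_1" holding that pair's month-4 sales.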
df = lag_feature(df, [1, 2, 3], "item_cnt_month")
index_cols = ["shop_id", "item_id", "date_block_num"]
group = (
train.groupby(index_cols)["item_price"]
.mean()
.reset_index()
.rename(columns={"item_price": "avg_shop_price"}, errors="raise")
)
df = pd.merge(df, group, on=index_cols, how="left")
df["avg_shop_price"] = df["avg_shop_price"].fillna(0).astype(np.float32)
index_cols = ["item_id", "date_block_num"]
group = (
train.groupby(["date_block_num", "item_id"])["item_price"]
.mean()
.reset_index()
.rename(columns={"item_price": "avg_item_price"}, errors="raise")
)
df = pd.merge(df, group, on=index_cols, how="left")
df["avg_item_price"] = df["avg_item_price"].fillna(0).astype(np.float32)
df["item_shop_price_avg"] = (df["avg_shop_price"] - df["avg_item_price"]) / df["avg_item_price"]
df["item_shop_price_avg"].fillna(0, inplace=True)
df = lag_feature(df, [1, 2, 3], "item_shop_price_avg")
df.drop(["avg_shop_price", "avg_item_price", "item_shop_price_avg"], axis=1, inplace=True)
item_id_target_mean = (
df.groupby(["date_block_num", "item_id"])["item_cnt_month"]
.mean()
.reset_index()
.rename(columns={"item_cnt_month": "item_target_enc"}, errors="raise")
)
df = pd.merge(df, item_id_target_mean, on=["date_block_num", "item_id"], how="left")
df["item_target_enc"] = df["item_target_enc"].fillna(0).astype(np.float32)
df = lag_feature(df, [1, 2, 3], "item_target_enc")
df.drop(["item_target_enc"], axis=1, inplace=True)
item_id_target_mean = (
df.groupby(["date_block_num", "item_id", "city_code"])["item_cnt_month"]
.mean()
.reset_index()
.rename(columns={"item_cnt_month": "item_loc_target_enc"}, errors="raise")
)
df = pd.merge(df, item_id_target_mean, on=["date_block_num", "item_id", "city_code"], how="left")
df["item_loc_target_enc"] = df["item_loc_target_enc"].fillna(0).astype(np.float32)
df = lag_feature(df, [1, 2, 3], "item_loc_target_enc")
df.drop(["item_loc_target_enc"], axis=1, inplace=True)
item_id_target_mean = (
df.groupby(["date_block_num", "item_id", "shop_id"])["item_cnt_month"]
.mean()
.reset_index()
.rename(columns={"item_cnt_month": "item_shop_target_enc"}, errors="raise")
)
df = pd.merge(df, item_id_target_mean, on=["date_block_num", "item_id", "shop_id"], how="left")
df["item_shop_target_enc"] = df["item_shop_target_enc"].fillna(0).astype(np.float32)
df = lag_feature(df, [1, 2, 3], "item_shop_target_enc")
df.drop(["item_shop_target_enc"], axis=1, inplace=True)
item_id_target_mean = (
df[df["item_first_interaction"] == 1]
.groupby(["date_block_num", "item_category_code"])["item_cnt_month"]
.mean()
.reset_index()
.rename(columns={"item_cnt_month": "new_item_cat_avg"}, errors="raise")
)
df = pd.merge(df, item_id_target_mean, on=["date_block_num", "item_category_code"], how="left")
df["new_item_cat_avg"] = df["new_item_cat_avg"].fillna(0).astype(np.float32)
df = lag_feature(df, [1, 2, 3], "new_item_cat_avg")
df.drop(["new_item_cat_avg"], axis=1, inplace=True)
# For new items, add average category sales per shop over the last 3 months
item_id_target_mean = (
df[df["item_first_interaction"] == 1]
.groupby(["date_block_num", "item_category_code", "shop_id"])["item_cnt_month"]
.mean()
.reset_index()
.rename(columns={"item_cnt_month": "new_item_shop_cat_avg"}, errors="raise")
)
df = pd.merge(df, item_id_target_mean, on=["date_block_num", "item_category_code", "shop_id"], how="left")
df["new_item_shop_cat_avg"] = df["new_item_shop_cat_avg"].fillna(0).astype(np.float32)
df = lag_feature(df, [1, 2, 3], "new_item_shop_cat_avg")
df.drop(["new_item_shop_cat_avg"], axis=1, inplace=True)
def lag_feature_adv(df, lags, col):
tmp = df[["date_block_num", "shop_id", "item_id", col]]
for i in lags:
shifted = tmp.copy()
shifted.columns = ["date_block_num", "shop_id", "item_id", col + "_lag_" + str(i) + "_adv"]
shifted["date_block_num"] += i
shifted["item_id"] -= 1
df = pd.merge(df, shifted, on=["date_block_num", "shop_id", "item_id"], how="left")
lag_name = col + "_lag_" + str(i) + "_adv"
df[lag_name] = df[lag_name].astype("float32")
return df
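# lag_feature_adv differs from lag_feature only in "shifted['item_id'] -= 1":
# after the merge, each row receives the lagged sales of item_id + 1, i.e. of
# the neighbouring catalogue id, presumably intended as a cheap proxy for
# related items.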
df = lag_feature_adv(df, [1, 2, 3], "item_cnt_month")
# df.fillna(0, inplace=True)
df = df[(df["date_block_num"] > 2)]
df.drop(["ID"], axis=1, inplace=True, errors="ignore")
print(df.shape)
print(df.columns)
print(df.head(10))
fill_dict = {}
for col in df.columns:
fill_dict[col] = df[col].mean()
group_df = df.groupby(["shop_id"])
for shop_id, shop_df in group_df:
# drop date_block_num == 34 (November 2015),
# which is the test set in the competition
shop_df = shop_df[shop_df.date_block_num <= 33]
# fill the null
cols = shop_df.isnull().any()
idx = list(cols[cols.values].index)
shop_df[idx] = shop_df.groupby("item_id", sort=False)[idx].apply(
lambda x: x.fillna(method="ffill").fillna(method="bfill")
)
shop_df[idx] = shop_df[idx].fillna(shop_df[idx].mean())
for col in idx:
shop_df[col] = shop_df[col].fillna(fill_dict[col])
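# Three-stage imputation: forward/backward fill within each item's history,
# then the shop-level column mean, then the global column means collected in
# fill_dict (computed over the full frame, including the test month) as a
# last resort.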
# min-max scale
drop_fea_list = [
"shop_id",
"city_code",
"city_coord_1",
"city_coord_2",
"country_part",
"item_cnt_month",
"date_block_num",
]
fea_list = [col for col in shop_df.columns if col not in drop_fea_list]
mms = MinMaxScaler()
shop_df[fea_list] = mms.fit_transform(shop_df[fea_list])
shop_df = shop_df[fea_list + ["item_cnt_month", "date_block_num"]]
date_split = 29
split = False
while split is False:
df1 = shop_df[shop_df["date_block_num"] <= date_split]
df2 = shop_df[shop_df["date_block_num"] > date_split]
if df2.shape[0] > 0 and df1.shape[0] > 0:
split = True
else:
date_split -= 1
if date_split < 0:
break
if split is True:
print("ShopID:{}, split block:{}".format(shop_id, date_split))
print(df1.shape, df2.shape)
# save train csv
fpath = os.path.join(pfs_split_dir, "Shop{:0>2d}-train.csv".format(shop_id))
df1.to_csv(fpath, index=False)
# save val csv
fpath = os.path.join(pfs_split_dir, "Shop{:0>2d}-val.csv".format(shop_id))
df2.to_csv(fpath, index=False)
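# Each shop thus produces a Shop<id>-train.csv / Shop<id>-val.csv pair, split
# at the latest date_block_num that leaves both partitions non-empty.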

+ 0
- 90
examples/dataset_pfs_workflow/upload.py View File

@@ -1,90 +0,0 @@
import hashlib
import requests
import os
import random
import json
import time
from tqdm import tqdm

email = "liujd@lamda.nju.edu.cn"
password = hashlib.md5(b"liujdlamda").hexdigest()
login_url = "http://210.28.134.201:8089/auth/login"
submit_url = "http://210.28.134.201:8089/user/add_learnware"
all_data_type = ["Table", "Image", "Video", "Text", "Audio"]
all_task_type = [
"Classification",
"Regression",
"Clustering",
"Feature Extraction",
"Generation",
"Segmentation",
"Object Detection",
]
all_device_type = ["CPU", "GPU"]
all_scenario = [
"Business",
"Financial",
"Health",
"Politics",
"Computer",
"Internet",
"Traffic",
"Nature",
"Fashion",
"Industry",
"Agriculture",
"Education",
"Entertainment",
"Architecture",
]

# ##################################
# Do not modify the section above. #
# ##################################


def main():
session = requests.Session()
res = session.post(login_url, json={"email": email, "password": password})

# replace /path/to/learnware/folder with the path to the learnware folder
learnware_pool = os.listdir(os.path.join(os.path.abspath("."), "learnware_pool"))

for learnware in learnware_pool:
# edit the corresponding semantic specification
name = "PFS_Shop" + "%02d" % int(learnware.split(".")[0].split("_")[1])
name = name + "_" + time.strftime("%Y%m%d%H%M%S", time.localtime())
description = f"This is a description of learnware {name}"
data = random.choice(all_data_type)
task = random.choice(all_task_type)
device = list(set(random.choices(all_device_type, k=2)))
scenario = list(set(random.choices(all_scenario, k=5)))
semantic_specification = {
"Data": {"Values": ["Table"], "Type": "Class"},
"Library": {"Values": ["Scikit-learn"], "Type": "Class"},
"Task": {"Values": ["Regression"], "Type": "Class"},
"Scenario": {"Values": ["Business"], "Type": "Tag"},
"Description": {
"Values": "A sales-forecasting model from Predict Future Sales Competition on Kaggle",
"Type": "String",
},
"Name": {"Values": name, "Type": "String"},
"License": {"Values": ["MIT"], "Type": "Class"},
}
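# Note: `description` and the randomly drawn data/task/device/scenario values
# above are never used; the submitted specification hardcodes
# Table/Regression/Business instead.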
res = session.post(
submit_url,
data={
"semantic_specification": json.dumps(semantic_specification),
},
files={
"learnware_file": open(
os.path.join(os.path.abspath("."), "learnware_pool", learnware),
"rb",
)
},
)
assert json.loads(res.text)["code"] == 0, "Upload error"


if __name__ == "__main__":
main()

+ 8
- 7
examples/dataset_text_workflow/README.md View File

@@ -39,13 +39,14 @@ python workflow.py labeled_text_example

The table below presents the mean accuracy of search and reuse across all users:

| Metric | Value |
|--------------------------------------|---------------------|
| Mean in Market (Single) | 0.507 |
| Best in Market (Single) | 0.859 |
| Top-1 Reuse (Single) | 0.846 |
| Job Selector Reuse (Multiple) | 0.845 |
| Average Ensemble Reuse (Multiple) | 0.862 |
| Setting | Accuracy |
|---------------------------------------|---------------------|
| Mean in Market (Single) | 0.507 |
| Best in Market (Single) | 0.859 |
| Top-1 Reuse (Single) | 0.846 |
| Job Selector Reuse (Multiple) | 0.845 |
| Average Ensemble Reuse (Multiple) | 0.862 |


### ``labeled_text_example``:



+ 11
- 11
examples/dataset_text_workflow/workflow.py View File

@@ -64,7 +64,7 @@ class TextDatasetWorkflow:

plt.xlabel("Amout of Labeled User Data", fontsize=14)
plt.ylabel("1 - Accuracy", fontsize=14)
plt.title(f"Results on Text Experimental Scenario", fontsize=16)
plt.title("Results on Text Experimental Scenario", fontsize=16)
plt.legend(fontsize=14)
plt.tight_layout()
plt.savefig(os.path.join(self.fig_path, "text_labeled_curves.svg"), bbox_inches="tight", dpi=700)
@@ -76,7 +76,7 @@ class TextDatasetWorkflow:
self.user_semantic = client.get_semantic_specification(self.text_benchmark.learnware_ids[0])
self.user_semantic["Name"]["Values"] = ""

if len(self.text_market) == 0 or rebuild == True:
if len(self.text_market) == 0 or rebuild is True:
for learnware_id in self.text_benchmark.learnware_ids:
with tempfile.TemporaryDirectory(prefix="text_benchmark_") as tempdir:
zip_path = os.path.join(tempdir, f"{learnware_id}.zip")
@@ -86,7 +86,7 @@ class TextDatasetWorkflow:
client.download_learnware(learnware_id, zip_path)
self.text_market.add_learnware(zip_path, semantic_spec)
break
except:
except Exception:
time.sleep(1)
continue

@@ -103,7 +103,7 @@ class TextDatasetWorkflow:
ensemble_score_list = []
all_learnwares = self.text_market.get_learnwares()

for i in range(self.text_benchmark.user_num):
for i in range(text_benchmark_config.user_num):
user_data, user_label = self.text_benchmark.get_test_data(user_ids=i)

user_stat_spec = RKMETextSpecification()
@@ -183,19 +183,19 @@ class TextDatasetWorkflow:
% (np.mean(ensemble_score_list), np.std(ensemble_score_list))
)

def labeled_text_example(self, rebuild=False, train_flag=True):
def labeled_text_example(self, rebuild=False, skip_test=False):
self.n_labeled_list = [100, 200, 500, 1000, 2000, 4000]
self.repeated_list = [10, 10, 10, 3, 3, 3]
self.root_path = os.path.dirname(os.path.abspath(__file__))
self.fig_path = os.path.join(self.root_path, "figs")
self.curve_path = os.path.join(self.root_path, "curves")
self._prepare_market(rebuild)

if train_flag:
if not skip_test:
self._prepare_market(rebuild)
os.makedirs(self.fig_path, exist_ok=True)
os.makedirs(self.curve_path, exist_ok=True)

for i in range(self.text_benchmark.user_num):
for i in range(text_benchmark_config.user_num):
user_model_score_mat = []
pruning_score_mat = []
single_score_mat = []
@@ -268,7 +268,7 @@ class TextDatasetWorkflow:
pruning_curves_data, user_model_curves_data = [], []
total_user_model_score_mat = [np.zeros(self.repeated_list[i]) for i in range(len(self.n_labeled_list))]
total_pruning_score_mat = [np.zeros(self.repeated_list[i]) for i in range(len(self.n_labeled_list))]
for user_idx in range(self.text_benchmark.user_num):
for user_idx in range(text_benchmark_config.user_num):
with open(os.path.join(self.curve_path, f"curve{str(user_idx)}.pkl"), "rb") as f:
user_curves_data = pickle.load(f)
(single_score_mat, user_model_score_mat, pruning_score_mat) = user_curves_data
@@ -278,8 +278,8 @@ class TextDatasetWorkflow:
total_pruning_score_mat[i] += 1 - np.array(pruning_score_mat[i])

for i in range(len(self.n_labeled_list)):
total_user_model_score_mat[i] /= self.text_benchmark.user_num
total_pruning_score_mat[i] /= self.text_benchmark.user_num
total_user_model_score_mat[i] /= text_benchmark_config.user_num
total_pruning_score_mat[i] /= text_benchmark_config.user_num
user_model_curves_data.append(
(np.mean(total_user_model_score_mat[i]), np.std(total_user_model_score_mat[i]))
)


+ 5
- 4
learnware/__init__.py View File

@@ -1,7 +1,8 @@
__version__ = "0.2.0.9"

import os
import json
import os

from .logger import get_module_logger
from .utils import is_torch_available, setup_seed

@@ -35,12 +36,12 @@ def init(verbose=True, **kwargs):
with open(config_file, "r") as fin_config:
C.update(**dict(json.load(fin_config)))

## random seed
# random seed
deterministic = kwargs.get("deterministic", True)
if deterministic:
setup_seed(C.random_seed)

## make dirs
# make dirs
mkdir = kwargs.get("mkdir", True)
if mkdir:
os.makedirs(C.root_path, exist_ok=True)
@@ -48,7 +49,7 @@ def init(verbose=True, **kwargs):
os.makedirs(C.stdout_path, exist_ok=True)
os.makedirs(C.cache_path, exist_ok=True)

## ignore tensorflow warning
# ignore tensorflow warning
tf_loglevel = kwargs.get("tf_loglevel", "2")
os.environ["TF_CPP_MIN_LOG_LEVEL"] = tf_loglevel



+ 10
- 12
learnware/client/container.py View File

@@ -1,21 +1,19 @@
import atexit
import os
import docker
import pickle
import atexit
import tarfile
import tempfile
import shortuuid
from concurrent.futures import ThreadPoolExecutor
from typing import List, Optional, Union

import docker
import shortuuid

from typing import List, Union, Optional
from .utils import system_execute, install_environment, remove_enviroment
from .utils import install_environment, remove_enviroment, system_execute
from ..config import C
from ..learnware import Learnware
from ..model.base import BaseModel
from .package_utils import filter_nonexist_conda_packages_file, filter_nonexist_pip_packages_file

from ..logger import get_module_logger
from ..model.base import BaseModel

logger = get_module_logger(module_name="client_container")

@@ -224,7 +222,7 @@ class ModelDockerContainer(ModelContainer):
}
container = client.containers.run(**container_config)
logger.info(f"Docker container {container.id[:12]} is generated.")
try:
environment_cmd = [
"pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple",
@@ -265,7 +263,7 @@ class ModelDockerContainer(ModelContainer):
if isinstance(docker_container, docker.models.containers.Container):
client = docker.from_env()
container_ids = [container.id for container in client.containers.list()]
if docker_container.id in container_ids:
docker_container.stop()
docker_container.remove()
@@ -521,7 +519,7 @@ class LearnwaresContainer:
except KeyboardInterrupt:
logger.warning("The KeyboardInterrupt is ignored when removing the container env!")
self._destroy_docker_container()
def __enter__(self):
if self.mode == "conda":
self.learnware_containers = [


+ 13
- 14
learnware/client/learnware_client.py View File

@@ -1,24 +1,23 @@
import os
import uuid
import yaml
import json
import atexit
import zipfile
import hashlib
import requests
import json
import os
import tempfile
import uuid
import zipfile
from enum import Enum
from typing import List, Optional, Union

import requests
import yaml
from tqdm import tqdm
from typing import Union, List, Optional

from ..config import C
from .container import LearnwaresContainer
from ..market import BaseChecker
from ..specification import generate_semantic_spec
from ..logger import get_module_logger
from ..config import C
from ..learnware import get_learnware_from_dirpath
from ..market import BaseUserInfo

from ..logger import get_module_logger
from ..market import BaseChecker, BaseUserInfo
from ..specification import generate_semantic_spec

CHUNK_SIZE = 1024 * 1024
logger = get_module_logger(module_name="LearnwareClient")
@@ -413,7 +412,7 @@ class LearnwareClient:

@staticmethod
def _check_stat_specification(learnware):
from ..market import EasyStatChecker, CondaChecker
from ..market import CondaChecker, EasyStatChecker

stat_checker = CondaChecker(inner_checker=EasyStatChecker())
check_status, message = stat_checker(learnware)


+ 7
- 7
learnware/client/package_utils.py View File

@@ -1,14 +1,14 @@
import json
import os
import re
import json
import yaml
import tempfile
import subprocess
from typing import List, Tuple
from . import utils
import tempfile
from concurrent.futures import ThreadPoolExecutor
from typing import List, Tuple

import yaml

from . import utils
from ..logger import get_module_logger

logger = get_module_logger("package_utils")
@@ -86,7 +86,7 @@ def filter_nonexist_pip_packages(packages: list) -> Tuple[List[str], List[str]]:
pass
except Exception as err:
logger.error(err)
return None

exist_packages = []
@@ -101,7 +101,7 @@ def filter_nonexist_pip_packages(packages: list) -> Tuple[List[str], List[str]]:
exist_packages.append(result)
else:
nonexist_packages.append(package)
if len(nonexist_packages) > 0:
logger.info(f"Filtered out {len(nonexist_packages)} non-exist pip packages.")
return exist_packages, nonexist_packages


+ 1
- 1
learnware/client/scripts/install_env.py View File

@@ -1,6 +1,6 @@
import argparse
from learnware.client.utils import install_environment

from learnware.client.utils import install_environment

if __name__ == "__main__":
parser = argparse.ArgumentParser()


+ 3
- 2
learnware/client/scripts/run_model.py View File

@@ -1,6 +1,7 @@
import sys
import pickle
import argparse
import pickle
import sys

from learnware.utils import get_module_by_module_path




+ 22
- 11
learnware/client/utils.py View File

@@ -1,10 +1,9 @@
import os
import zipfile
import tempfile
import subprocess
import tempfile

from ..logger import get_module_logger
from .package_utils import filter_nonexist_conda_packages_file, filter_nonexist_pip_packages_file
from ..logger import get_module_logger

logger = get_module_logger(module_name="client_utils")

@@ -22,14 +21,15 @@ def system_execute(args, timeout=None, env=None, stdout=subprocess.DEVNULL, stde
errmsg = err.stderr.decode()
logger.warning(f"System Execute Error: {errmsg}")
raise err
return com_process


def remove_enviroment(conda_env):
system_execute(args=["conda", "env", "remove", "-n", f"{conda_env}"])

def install_environment(learnware_dirpath, conda_env):

def install_environment(learnware_dirpath, conda_env, conda_prefix=None):
"""Install environment of a learnware

Parameters
@@ -38,12 +38,21 @@ def install_environment(learnware_dirpath, conda_env):
Path of the learnware folder
conda_env : str
a new conda environment will be created with the given name;
conda_prefix: str
install env in a specific location, not default env path;

Raises
------
Exception
Lack of the environment configuration file.
"""
if conda_prefix is not None:
args_location = ["--prefix", conda_prefix]
conda_env = conda_prefix
else:
args_location = ["--name", conda_env]
pass

with tempfile.TemporaryDirectory(prefix="learnware_") as tempdir:
logger.info(f"learnware_dir namelist: {os.listdir(learnware_dirpath)}")
if "environment.yaml" in os.listdir(learnware_dirpath):
@@ -53,7 +62,7 @@ def install_environment(learnware_dirpath, conda_env):
filter_nonexist_conda_packages_file(yaml_file=yaml_path, output_yaml_file=yaml_path_filter)
# create environment
logger.info(f"create conda env [{conda_env}] according to .yaml file")
system_execute(args=["conda", "env", "create", "--name", f"{conda_env}", "--file", f"{yaml_path_filter}"])
system_execute(args=["conda", "env", "create"] + args_location + ["--file", f"{yaml_path_filter}"])

elif "requirements.txt" in os.listdir(learnware_dirpath):
requirements_path: str = os.path.join(learnware_dirpath, "requirements.txt")
@@ -61,14 +70,15 @@ def install_environment(learnware_dirpath, conda_env):
logger.info(f"checking the available pip packages for {conda_env}")
filter_nonexist_pip_packages_file(requirements_file=requirements_path, output_file=requirements_path_filter)
logger.info(f"create empty conda env [{conda_env}]")
system_execute(args=["conda", "create", "-y", "--name", f"{conda_env}", "python=3.8"])
system_execute(args=["conda", "create", "-y"] + args_location + ["python=3.8"])
logger.info(f"install pip requirements for conda env [{conda_env}]")
system_execute(
args=[
"conda",
"run",
"-n",
f"{conda_env}",
]
+ args_location
+ [
"--no-capture-output",
"python",
"-m",
@@ -86,8 +96,9 @@ def install_environment(learnware_dirpath, conda_env):
args=[
"conda",
"run",
"-n",
f"{conda_env}",
]
+ args_location
+ [
"--no-capture-output",
"python",
"-m",


+ 1
- 1
learnware/config.py View File

@@ -1,6 +1,6 @@
import os
import copy
import logging
import os
from enum import Enum




+ 15
- 11
learnware/learnware/__init__.py View File

@@ -1,19 +1,21 @@
import os
import copy
from typing import Optional
import os
import traceback
from typing import Optional

from .base import Learnware
from .utils import get_stat_spec_from_config
from ..config import C
from ..logger import get_module_logger
from ..specification import Specification
from ..utils import read_yaml_to_dict
from ..logger import get_module_logger
from ..config import C

logger = get_module_logger("learnware.learnware")


def get_learnware_from_dirpath(id: str, semantic_spec: dict, learnware_dirpath, ignore_error=True) -> Optional[Learnware]:
def get_learnware_from_dirpath(
id: str, semantic_spec: dict, learnware_dirpath, ignore_error=True
) -> Optional[Learnware]:
"""Get the learnware object from dirpath, and provide the manage interface tor Learnware class

Parameters
@@ -46,11 +48,11 @@ def get_learnware_from_dirpath(id: str, semantic_spec: dict, learnware_dirpath,
}

try:
learnware_yaml_path = os.path.join(learnware_dirpath, C.learnware_folder_config["yaml_file"])
assert os.path.exists(learnware_yaml_path), f"learnware.yaml is not found for learnware_{id}, please check the learnware folder or zipfile."
assert os.path.exists(
learnware_yaml_path
), f"learnware.yaml is not found for learnware_{id}, please check the learnware folder or zipfile."

yaml_config = read_yaml_to_dict(learnware_yaml_path)

if "name" in yaml_config:
@@ -67,8 +69,10 @@ def get_learnware_from_dirpath(id: str, semantic_spec: dict, learnware_dirpath,
for _stat_spec in learnware_config["stat_specifications"]:
stat_spec = _stat_spec.copy()
stat_spec_path = os.path.join(learnware_dirpath, stat_spec["file_name"])
assert os.path.exists(stat_spec_path), f"statistical specification file {stat_spec['file_name']} is not found for learnware_{id}, please check the learnware folder or zipfile."
assert os.path.exists(
stat_spec_path
), f"statistical specification file {stat_spec['file_name']} is not found for learnware_{id}, please check the learnware folder or zipfile."

stat_spec["file_name"] = stat_spec_path
stat_spec_inst = get_stat_spec_from_config(stat_spec)
learnware_spec.update_stat_spec(**{stat_spec_inst.type: stat_spec_inst})


+ 7
- 7
learnware/learnware/base.py View File

@@ -1,19 +1,19 @@
import os
import numpy as np
from typing import Union, List
import sys
from typing import Union

import numpy as np

from ..specification import Specification, BaseStatSpecification
from ..logger import get_module_logger
from ..model import BaseModel
from ..specification import BaseStatSpecification, Specification
from ..utils import get_module_by_module_path
from ..logger import get_module_logger

logger = get_module_logger("Learnware")


class Learnware:
"""The learnware class, which is the basic components in learnware market
"""
"""The learnware class, which is the basic components in learnware market"""

def __init__(self, id: str, model: Union[BaseModel, dict], specification: Specification, learnware_dirpath: str):
"""The initialization method for learnware.
@@ -40,7 +40,7 @@ class Learnware:
dirpath: str
The path of the learnware directory
"""
self.id = id
self.model = model
self.specification = specification


+ 1
- 2
learnware/learnware/utils.py View File

@@ -1,4 +1,3 @@
import copy
from typing import Union

from ..model import BaseModel
@@ -45,5 +44,5 @@ def get_stat_spec_from_config(stat_spec: dict) -> BaseStatSpecification:
f"Statistic specification must be type of BaseStatSpecification, not {BaseStatSpecification.__class__.__name__}"
)
stat_spec_inst.load(stat_spec["file_name"])
return stat_spec_inst

+ 1
- 1
learnware/logger.py View File

@@ -1,5 +1,5 @@
import logging
from logging import Logger, handlers
from logging import Logger

from .config import C



+ 5
- 6
learnware/market/__init__.py View File

@@ -1,9 +1,8 @@
from .anchor import AnchoredUserInfo, AnchoredSearcher, AnchoredOrganizer
from .base import BaseUserInfo, LearnwareMarket, BaseChecker, BaseOrganizer, BaseSearcher
from .evolve_anchor import EvolvedAnchoredOrganizer
from .evolve import EvolvedOrganizer
from .anchor import AnchoredOrganizer, AnchoredSearcher, AnchoredUserInfo
from .base import BaseChecker, BaseOrganizer, BaseSearcher, BaseUserInfo, LearnwareMarket
from .classes import CondaChecker
from .easy import EasyOrganizer, EasySearcher, EasySemanticChecker, EasyStatChecker
from .evolve import EvolvedOrganizer
from .evolve_anchor import EvolvedAnchoredOrganizer
from .heterogeneous import HeteroMapTableOrganizer, HeteroSearcher

from .classes import CondaChecker
from .module import instantiate_learnware_market

+ 1
- 2
learnware/market/anchor/__init__.py View File

@@ -1,8 +1,7 @@
from .organizer import AnchoredOrganizer
from .user_info import AnchoredUserInfo

from ...utils import is_torch_available
from ...logger import get_module_logger
from ...utils import is_torch_available

logger = get_module_logger("market_anchor")



+ 2
- 2
learnware/market/anchor/organizer.py View File

@@ -1,8 +1,8 @@
from typing import Dict

from ..easy.organizer import EasyOrganizer
from ...logger import get_module_logger
from ...learnware import Learnware
from ...logger import get_module_logger

logger = get_module_logger("anchor_organizer")

@@ -44,7 +44,7 @@ class AnchoredOrganizer(EasyOrganizer):
Exception
Raise an exception when given anchor_id is NOT found in anchor_learnware_list
"""
if not anchor_id in self.anchor_learnware_list:
if anchor_id not in self.anchor_learnware_list:
raise Exception("Anchor learnware id:{} NOT Found!".format(anchor_id))

self.anchor_learnware_list.pop(anchor_id)


+ 2
- 2
learnware/market/anchor/searcher.py View File

@@ -1,9 +1,9 @@
from typing import List, Tuple, Any
from typing import Any, List, Tuple

from .user_info import AnchoredUserInfo
from ..easy.searcher import EasySearcher
from ...logger import get_module_logger
from ...learnware import Learnware
from ...logger import get_module_logger

logger = get_module_logger("anchor_searcher")



+ 2
- 1
learnware/market/anchor/user_info.py View File

@@ -1,4 +1,5 @@
from typing import List, Any, Union
from typing import Any, List, Union

from ..base import BaseUserInfo




+ 3
- 2
learnware/market/base.py View File

@@ -1,10 +1,11 @@
from __future__ import annotations

import tempfile
import traceback
import zipfile
import tempfile
from typing import Tuple, Any, List, Union, Optional
from dataclasses import dataclass
from typing import Any, List, Optional, Tuple, Union

from ..learnware import Learnware, get_learnware_from_dirpath
from ..logger import get_module_logger



+ 2
- 1
learnware/market/classes.py View File

@@ -1,8 +1,9 @@
import traceback
from typing import Tuple

from .base import BaseChecker
from ..learnware import Learnware
from ..client.container import LearnwaresContainer
from ..learnware import Learnware
from ..logger import get_module_logger

logger = get_module_logger("market_classes")


+ 2
- 3
learnware/market/easy/__init__.py View File

@@ -1,7 +1,6 @@
from .organizer import EasyOrganizer

from ...utils import is_torch_available
from ...logger import get_module_logger
from ...utils import is_torch_available

logger = get_module_logger("market_easy")

@@ -11,5 +10,5 @@ if not is_torch_available(verbose=False):
EasyStatChecker = None
logger.error("EasySeacher and EasyChecker are not available because 'torch' is not installed!")
else:
from .searcher import EasySearcher, EasyStatSearcher, EasyFuzzSemanticSearcher, EasyExactSemanticSearcher
from .checker import EasySemanticChecker, EasyStatChecker
from .searcher import EasyExactSemanticSearcher, EasyFuzzSemanticSearcher, EasySearcher, EasyStatSearcher

+ 3
- 3
learnware/market/easy/checker.py View File

@@ -1,10 +1,10 @@
import traceback
import numpy as np
import torch
import random
import string
import traceback

import numpy as np
import torch

from ..base import BaseChecker
from ..utils import parse_specification_type
from ...config import C


+ 5
- 4
learnware/market/easy/database_ops.py View File

@@ -1,9 +1,10 @@
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import create_engine, text
from sqlalchemy import Column, Text, String
import os
import json
import os
import traceback

from sqlalchemy import Column, String, Text, create_engine, text
from sqlalchemy.ext.declarative import declarative_base

from ...learnware import get_learnware_from_dirpath
from ...logger import get_module_logger



+ 14
- 15
learnware/market/easy/organizer.py View File

@@ -1,14 +1,13 @@
import os
import copy
import zipfile
import os
import tempfile
import zipfile
from shutil import copyfile, rmtree
from typing import Tuple, List, Union, Dict
from typing import Dict, List, Tuple, Union

from .database_ops import DatabaseOperations
from ..base import BaseOrganizer, BaseChecker
from ..base import BaseChecker, BaseOrganizer
from ...config import C as conf
from ...logger import get_module_logger
from ...learnware import Learnware, get_learnware_from_dirpath
from ...logger import get_module_logger

@@ -95,34 +94,34 @@ class EasyOrganizer(BaseOrganizer):
new_learnware = get_learnware_from_dirpath(
id=learnware_id, semantic_spec=semantic_spec, learnware_dirpath=target_folder_dir
)
except:
except Exception:
logger.warning("New learnware is not properly added!")
try:
os.remove(target_zip_dir)
rmtree(target_folder_dir)
except:
except Exception:
pass
return None, BaseChecker.INVALID_LEARNWARE

if new_learnware is None:
return None, BaseChecker.INVALID_LEARNWARE

learnwere_status = check_status if check_status is not None else BaseChecker.NONUSABLE_LEARNWARE
learnware_status = check_status if check_status is not None else BaseChecker.NONUSABLE_LEARNWARE

self.dbops.add_learnware(
id=learnware_id,
semantic_spec=semantic_spec,
zip_path=target_zip_dir,
folder_path=target_folder_dir,
use_flag=learnwere_status,
use_flag=learnware_status,
)

self.learnware_list[learnware_id] = new_learnware
self.learnware_zip_list[learnware_id] = target_zip_dir
self.learnware_folder_list[learnware_id] = target_folder_dir
self.use_flags[learnware_id] = learnwere_status
self.use_flags[learnware_id] = learnware_status
self.count += 1
return learnware_id, learnwere_status
return learnware_id, learnware_status

def delete_learnware(self, id: str) -> bool:
"""Delete Learnware from market
@@ -138,7 +137,7 @@ class EasyOrganizer(BaseOrganizer):
True for successful operation.
False for id not found.
"""
if not id in self.learnware_list:
if id not in self.learnware_list:
logger.warning("Learnware id:'{}' NOT Found!".format(id))
return False

@@ -254,7 +253,7 @@ class EasyOrganizer(BaseOrganizer):
else:
try:
return self.learnware_list[ids]
except:
except Exception:
logger.warning("Learnware ID '%s' NOT Found!" % (ids))
return None

@@ -286,7 +285,7 @@ class EasyOrganizer(BaseOrganizer):
else:
try:
return self.learnware_zip_list[ids]
except:
except Exception:
logger.warning("Learnware ID '%s' NOT Found!" % (ids))
return None

@@ -318,7 +317,7 @@ class EasyOrganizer(BaseOrganizer):
else:
try:
return self.learnware_folder_list[ids]
except:
except Exception:
logger.warning("Learnware ID '%s' NOT Found!" % (ids))
return None



+ 7
- 6
learnware/market/easy/searcher.py View File

@@ -1,15 +1,16 @@
import math
import torch
from typing import List, Optional, Tuple, Union

import numpy as np
import torch
from rapidfuzz import fuzz
from typing import Tuple, List, Union, Optional

from .organizer import EasyOrganizer
from ..base import BaseSearcher, BaseUserInfo, MultipleSearchItem, SearchResults, SingleSearchItem
from ..utils import parse_specification_type
from ..base import BaseUserInfo, BaseSearcher, SearchResults, SingleSearchItem, MultipleSearchItem
from ...learnware import Learnware
from ...specification import RKMETableSpecification, RKMEImageSpecification, RKMETextSpecification, rkme_solve_qp
from ...logger import get_module_logger
from ...specification import RKMEImageSpecification, RKMETableSpecification, RKMETextSpecification, rkme_solve_qp

logger = get_module_logger("easy_seacher")

@@ -278,7 +279,7 @@ class EasyStatSearcher(BaseSearcher):
learnware_num = len(learnware_list)
RKME_list = [learnware.specification.get_stat_spec_by_name(self.stat_spec_type) for learnware in learnware_list]

if type(intermediate_K) == np.ndarray:
if isinstance(intermediate_K, np.ndarray):
K = intermediate_K
else:
K = np.zeros((learnware_num, learnware_num))
@@ -287,7 +288,7 @@ class EasyStatSearcher(BaseSearcher):
for j in range(i + 1, K.shape[0]):
K[i, j] = K[j, i] = RKME_list[i].inner_prod(RKME_list[j])

if type(intermediate_C) == np.ndarray:
if isinstance(intermediate_C, np.ndarray):
C = intermediate_C
else:
C = np.zeros((learnware_num, 1))


+ 1
- 1
learnware/market/evolve/organizer.py View File

@@ -2,8 +2,8 @@ from typing import List

from ..easy.organizer import EasyOrganizer
from ...learnware import Learnware
from ...specification import BaseStatSpecification
from ...logger import get_module_logger
from ...specification import BaseStatSpecification

logger = get_module_logger("evolve_organizer")



+ 1
- 1
learnware/market/evolve_anchor/organizer.py View File

@@ -1,7 +1,7 @@
from typing import List

from ..evolve import EvolvedOrganizer
from ..anchor import AnchoredOrganizer, AnchoredUserInfo
from ..evolve import EvolvedOrganizer
from ...logger import get_module_logger

logger = get_module_logger("evolve_anchor_organizer")


+ 1
- 1
learnware/market/heterogeneous/__init__.py View File

@@ -1,5 +1,5 @@
from ...utils import is_torch_available
from ...logger import get_module_logger
from ...utils import is_torch_available

logger = get_module_logger("market_hetero")



+ 1
- 3
learnware/market/heterogeneous/organizer/__init__.py View File

@@ -1,6 +1,5 @@
import os
import traceback
import pandas as pd
from collections import defaultdict
from typing import List, Tuple, Union

@@ -14,7 +13,6 @@ from ....learnware import Learnware
from ....logger import get_module_logger
from ....specification import HeteroMapTableSpecification


logger = get_module_logger("hetero_map_table_organizer")


@@ -44,7 +42,7 @@ class HeteroMapTableOrganizer(EasyOrganizer):
for hetero_id in hetero_ids:
self._reload_learnware_hetero_spec(hetero_id)
else:
logger.warning(f"No market mapping to reload!")
logger.warning("No market mapping to reload!")
self.market_mapping = HeteroMap()

def reset(self, market_id, rebuild=False, auto_update=False, auto_update_limit=100, **training_args):


+ 4
- 4
learnware/market/heterogeneous/organizer/hetero_map/__init__.py View File

@@ -6,10 +6,10 @@ import torch
import torch.nn.functional as F
from torch import nn

from .....utils import allocate_cuda_idx, choose_device
from .....specification import HeteroMapTableSpecification, RKMETableSpecification
from .feature_extractor import CLSToken, FeatureProcessor, FeatureTokenizer
from .trainer import TransTabCollatorForCL, Trainer
from .trainer import Trainer, TransTabCollatorForCL
from .....specification import HeteroMapTableSpecification, RKMETableSpecification
from .....utils import allocate_cuda_idx, choose_device


class HeteroMap(nn.Module):
@@ -287,7 +287,7 @@ class HeteroMap(nn.Module):
# go through transformers, get the first cls embedding
encoder_output = self.encoder(**outputs) # bs, seqlen+1, hidden_dim
output_features = encoder_output[:, 0, :]
del inputs, outputs, encoder_output
torch.cuda.empty_cache()



+ 1
- 2
learnware/market/heterogeneous/organizer/hetero_map/trainer.py View File

@@ -1,5 +1,4 @@
import math
import os
import time
from typing import Any, Callable, Dict, List

@@ -10,8 +9,8 @@ from torch import nn
from torch.utils.data import DataLoader, Dataset
from tqdm.autonotebook import trange

from .....logger import get_module_logger
from .feature_extractor import FeatureTokenizer
from .....logger import get_module_logger

logger = get_module_logger("hetero_mapping_trainer")



+ 5
- 2
learnware/market/heterogeneous/searcher.py View File

@@ -6,13 +6,16 @@ from ..easy import EasySearcher
from ..utils import parse_specification_type
from ...logger import get_module_logger


logger = get_module_logger("hetero_searcher")


class HeteroSearcher(EasySearcher):
def __call__(
self, user_info: BaseUserInfo, check_status: Optional[int] = None, max_search_num: int = 5, search_method: str = "greedy"
self,
user_info: BaseUserInfo,
check_status: Optional[int] = None,
max_search_num: int = 5,
search_method: str = "greedy",
) -> SearchResults:
"""Search learnwares based on user_info from learnwares with check_status.
Employs heterogeneous learnware search if specific requirements are met, otherwise resorts to homogeneous search methods.


+ 1
- 2
learnware/market/heterogeneous/utils.py View File

@@ -1,4 +1,3 @@
import traceback
from ...logger import get_module_logger

logger = get_module_logger("hetero_utils")
@@ -48,5 +47,5 @@ def is_hetero(stat_specs: dict, semantic_spec: dict, verbose=True) -> bool:
return True
except Exception as err:
if verbose:
logger.warning(f"Invalid heterogeneous search information provided.")
logger.warning("Invalid heterogeneous search information provided.")
return False

+ 14
- 4
learnware/market/module.py View File

@@ -4,7 +4,9 @@ from .easy import EasyOrganizer, EasySearcher, EasySemanticChecker, EasyStatChec
from .heterogeneous import HeteroMapTableOrganizer, HeteroSearcher


def get_market_component(name, market_id, rebuild, organizer_kwargs=None, searcher_kwargs=None, checker_kwargs=None, conda_checker=False):
def get_market_component(
name, market_id, rebuild, organizer_kwargs=None, searcher_kwargs=None, checker_kwargs=None, conda_checker=False
):
organizer_kwargs = {} if organizer_kwargs is None else organizer_kwargs
searcher_kwargs = {} if searcher_kwargs is None else searcher_kwargs
checker_kwargs = {} if checker_kwargs is None else checker_kwargs
@@ -12,7 +14,10 @@ def get_market_component(name, market_id, rebuild, organizer_kwargs=None, search
if name == "easy":
easy_organizer = EasyOrganizer(market_id=market_id, rebuild=rebuild)
easy_searcher = EasySearcher(organizer=easy_organizer)
easy_checker_list = [EasySemanticChecker(), EasyStatChecker() if conda_checker is False else CondaChecker(EasyStatChecker())]
easy_checker_list = [
EasySemanticChecker(),
EasyStatChecker() if conda_checker is False else CondaChecker(EasyStatChecker()),
]
market_component = {
"organizer": easy_organizer,
"searcher": easy_searcher,
@@ -21,7 +26,10 @@ def get_market_component(name, market_id, rebuild, organizer_kwargs=None, search
elif name == "hetero":
hetero_organizer = HeteroMapTableOrganizer(market_id=market_id, rebuild=rebuild, **organizer_kwargs)
hetero_searcher = HeteroSearcher(organizer=hetero_organizer)
hetero_checker_list = [EasySemanticChecker(), EasyStatChecker() if conda_checker is False else CondaChecker(EasyStatChecker())]
hetero_checker_list = [
EasySemanticChecker(),
EasyStatChecker() if conda_checker is False else CondaChecker(EasyStatChecker()),
]

market_component = {
"organizer": hetero_organizer,
@@ -44,7 +52,9 @@ def instantiate_learnware_market(
conda_checker: bool = False,
**kwargs,
):
market_componets = get_market_component(name, market_id, rebuild, organizer_kwargs, searcher_kwargs, checker_kwargs, conda_checker)
market_componets = get_market_component(
name, market_id, rebuild, organizer_kwargs, searcher_kwargs, checker_kwargs, conda_checker
)
return LearnwareMarket(
organizer=market_componets["organizer"],
searcher=market_componets["searcher"],


+ 0
- 3
learnware/market/utils.py View File

@@ -1,6 +1,3 @@
from ..specification import Specification


def parse_specification_type(
stat_specs: dict,
spec_list=[


+ 0
- 1
learnware/model/base.py View File

@@ -1,5 +1,4 @@
import numpy as np
from typing import Union


class BaseModel:


+ 3
- 4
learnware/reuse/__init__.py View File

@@ -1,6 +1,5 @@
from .base import BaseReuser
from .align import AlignLearnware
from .base import BaseReuser
from ..logger import get_module_logger
from ..utils import is_torch_available

@@ -18,7 +17,7 @@ if not is_torch_available(verbose=False):
)
else:
from .averaging import AveragingReuser
from .ensemble_pruning import EnsemblePruningReuser
from .feature_augment import FeatureAugmentReuser
from .hetero import HeteroMapAlignLearnware, FeatureAlignLearnware
from .hetero import FeatureAlignLearnware, HeteroMapAlignLearnware
from .job_selector import JobSelectorReuser
from .ensemble_pruning import EnsemblePruningReuser

+ 5
- 5
learnware/reuse/averaging.py View File

@@ -1,11 +1,11 @@
import torch
from typing import List

import numpy as np
from typing import List, Union
import torch
from scipy.special import softmax


from ..learnware import Learnware
from .base import BaseReuser
from ..learnware import Learnware
from ..logger import get_module_logger

logger = get_module_logger("avaraging_reuser")
@@ -50,7 +50,7 @@ class AveragingReuser(BaseReuser):
if isinstance(pred_y, torch.Tensor):
pred_y = pred_y.detach().cpu().numpy()
if not isinstance(pred_y, np.ndarray):
raise TypeError(f"Model output must be np.ndarray or torch.Tensor")
raise TypeError("Model output must be np.ndarray or torch.Tensor")

if len(pred_y.shape) == 1:
pred_y = pred_y.reshape(-1, 1)


+ 2
- 1
learnware/reuse/base.py View File

@@ -1,6 +1,7 @@
import numpy as np
from typing import List

import numpy as np

from ..learnware import Learnware
from ..logger import get_module_logger



+ 17
- 11
learnware/reuse/ensemble_pruning.py View File

@@ -1,10 +1,11 @@
import torch
import random
import numpy as np
from typing import List

from ..learnware import Learnware
import numpy as np
import torch

from .base import BaseReuser
from ..learnware import Learnware
from ..logger import get_module_logger

logger = get_module_logger("ensemble_pruning")
@@ -53,13 +54,14 @@ class EnsemblePruningReuser(BaseReuser):
np.ndarray
Binary one-dimensional vector, 1 indicates that the corresponding model is selected.
"""

try:
import geatpy as ea
except ModuleNotFoundError:
raise ModuleNotFoundError(f"EnsemblePruningReuser is not available because 'geatpy' is not installed! Please install it manually (only support python_version<3.11).")
raise ModuleNotFoundError(
"EnsemblePruningReuser is not available because 'geatpy' is not installed! Please install it manually (only support python_version<3.11)."
)

model_num = v_predict.shape[1]

@ea.Problem.single
@@ -147,7 +149,9 @@ class EnsemblePruningReuser(BaseReuser):
try:
import geatpy as ea
except ModuleNotFoundError:
raise ModuleNotFoundError(f"EnsemblePruningReuser is not available because 'geatpy' is not installed! Please install it manually (only support python_version<3.11).")
raise ModuleNotFoundError(
"EnsemblePruningReuser is not available because 'geatpy' is not installed! Please install it manually (only support python_version<3.11)."
)

if torch.is_tensor(v_true):
v_true = v_true.detach().cpu().numpy()
@@ -269,8 +273,10 @@ class EnsemblePruningReuser(BaseReuser):
try:
import geatpy as ea
except ModuleNotFoundError:
raise ModuleNotFoundError(f"EnsemblePruningReuser is not available because 'geatpy' is not installed! Please install it manually (only support python_version<3.11).")
raise ModuleNotFoundError(
"EnsemblePruningReuser is not available because 'geatpy' is not installed! Please install it manually (only support python_version<3.11)."
)

model_num = v_predict.shape[1]
v_predict[v_predict == 0.0] = -1
v_true[v_true == 0.0] = -1
@@ -371,7 +377,7 @@ class EnsemblePruningReuser(BaseReuser):
if isinstance(pred_y, torch.Tensor):
pred_y = pred_y.detach().cpu().numpy()
if not isinstance(pred_y, np.ndarray):
raise TypeError(f"Model output must be np.ndarray or torch.Tensor")
raise TypeError("Model output must be np.ndarray or torch.Tensor")

if len(pred_y.shape) == 1:
pred_y = pred_y.reshape(-1, 1)


+ 5
- 4
learnware/reuse/feature_augment.py View File

@@ -1,7 +1,8 @@
import torch
import numpy as np
from typing import List
from sklearn.linear_model import RidgeCV, LogisticRegressionCV

import numpy as np
import torch
from sklearn.linear_model import LogisticRegressionCV, RidgeCV

from .base import BaseReuser
from .utils import fill_data_with_mean
@@ -102,7 +103,7 @@ class FeatureAugmentReuser(BaseReuser):
if isinstance(y_pred, torch.Tensor):
y_pred = y_pred.detach().cpu().numpy()
if not isinstance(y_pred, np.ndarray):
raise TypeError(f"Model output must be np.ndarray or torch.Tensor")
raise TypeError("Model output must be np.ndarray or torch.Tensor")
if len(y_pred.shape) == 1:
y_pred = y_pred.reshape(-1, 1)
y_preds.append(y_pred)


+ 6
- 5
learnware/reuse/hetero/feature_align.py View File

@@ -1,17 +1,18 @@
import time
import torch
from typing import List

import numpy as np
import torch
import torch.nn as nn
from typing import List
from tqdm import trange
import torch.nn.functional as F
from tqdm import trange

from ..align import AlignLearnware
from ..utils import fill_data_with_mean
from ...utils import choose_device, allocate_cuda_idx
from ...logger import get_module_logger
from ...learnware import Learnware
from ...logger import get_module_logger
from ...specification import RKMETableSpecification
from ...utils import allocate_cuda_idx, choose_device

logger = get_module_logger("feature_align")



+ 2
- 2
learnware/reuse/hetero/hetero_map.py View File

@@ -1,10 +1,10 @@
import numpy as np

from .feature_align import FeatureAlignLearnware
from ..align import AlignLearnware
from ..feature_augment import FeatureAugmentReuser
from ...learnware import Learnware
from ...logger import get_module_logger
from .feature_align import FeatureAlignLearnware
from ..feature_augment import FeatureAugmentReuser
from ...specification import RKMETableSpecification

logger = get_module_logger("hetero_map_align")


+ 7
- 8
learnware/reuse/job_selector.py View File

@@ -1,15 +1,14 @@
import torch
import numpy as np

from typing import List, Union

import numpy as np
import torch
from sklearn.metrics import accuracy_score

from .base import BaseReuser
from ..market.utils import parse_specification_type
from ..learnware import Learnware
from ..specification import RKMETableSpecification, RKMETextSpecification
from ..specification import generate_rkme_table_spec, rkme_solve_qp
from ..logger import get_module_logger
from ..market.utils import parse_specification_type
from ..specification import RKMETableSpecification, RKMETextSpecification, generate_rkme_table_spec, rkme_solve_qp

logger = get_module_logger("job_selector_reuse")

@@ -70,7 +69,7 @@ class JobSelectorReuser(BaseReuser):
# pred_y = pred_y.numpy()

if not isinstance(pred_y, np.ndarray):
raise TypeError(f"Model output must be np.ndarray or torch.Tensor")
raise TypeError("Model output must be np.ndarray or torch.Tensor")

pred_y_list.append(pred_y)
data_idxs_list.append(data_idx_list)
@@ -230,7 +229,7 @@ class JobSelectorReuser(BaseReuser):
from lightgbm import LGBMClassifier, early_stopping
except ModuleNotFoundError:
raise ModuleNotFoundError(
f"JobSelectorReuser is not available because 'lightgbm' is not installed! Please install it manually."
"JobSelectorReuser is not available because 'lightgbm' is not installed! Please install it manually."
)

score_best = -1


+ 2
- 0
learnware/reuse/utils.py View File

@@ -1,8 +1,10 @@
import numpy as np

from ..logger import get_module_logger

logger = get_module_logger("reuse_utils")


def fill_data_with_mean(X: np.ndarray) -> np.ndarray:
"""
Fill missing data (NaN, Inf) in the input array with the mean of the column.


+ 4
- 6
learnware/specification/__init__.py View File

@@ -1,15 +1,13 @@
from .base import Specification, BaseStatSpecification
from .base import BaseStatSpecification, Specification
from .regular import (
RegularStatSpecification,
RKMEImageSpecification,
RKMEStatSpecification,
RKMETableSpecification,
RKMEImageSpecification,
RKMETextSpecification,
rkme_solve_qp,
)

from .system import HeteroMapTableSpecification

from ..utils import is_torch_available

if not is_torch_available(verbose=False):
@@ -20,9 +18,9 @@ if not is_torch_available(verbose=False):
generate_semantic_spec = None
else:
from .module import (
generate_stat_spec,
generate_rkme_table_spec,
generate_rkme_image_spec,
generate_rkme_table_spec,
generate_rkme_text_spec,
generate_semantic_spec,
generate_stat_spec,
)

+ 1
- 3
learnware/specification/base.py View File

@@ -1,7 +1,5 @@
from __future__ import annotations

import copy
import numpy as np
from typing import Dict


@@ -26,7 +24,7 @@ class BaseStatSpecification:

def dist(self, stat_spec: BaseStatSpecification):
raise NotImplementedError("dist is not implemented")
def save(self, filepath: str):
"""Save the statistical specification into file in filepath



+ 4
- 4
learnware/specification/module.py View File

@@ -1,11 +1,11 @@
import torch
from typing import List, Optional, Union

import numpy as np
import pandas as pd
from typing import Union, List, Optional
import torch

from .regular import RKMEImageSpecification, RKMETableSpecification, RKMETextSpecification
from .utils import convert_to_numpy
from .base import BaseStatSpecification
from .regular import RKMETableSpecification, RKMEImageSpecification, RKMETextSpecification
from ..config import C




+ 2
- 2
learnware/specification/regular/__init__.py View File

@@ -1,4 +1,4 @@
from .base import RegularStatSpecification
from .text import RKMETextSpecification
from .table import RKMETableSpecification, RKMEStatSpecification, rkme_solve_qp
from .image import RKMEImageSpecification
from .table import RKMEStatSpecification, RKMETableSpecification, rkme_solve_qp
from .text import RKMETextSpecification

+ 2
- 3
learnware/specification/regular/image/__init__.py View File

@@ -1,11 +1,10 @@
from ....utils import is_torch_available
from ....logger import get_module_logger
from ....utils import is_torch_available

logger = get_module_logger("regular_image_spec")

if not is_torch_available(verbose=False):
RKMEImageSpecification = None
logger.error(f"RKMEImageSpecification is not available because 'torch' is not installed!")
logger.error("RKMEImageSpecification is not available because 'torch' is not installed!")
else:
from .rkme import RKMEImageSpecification

Some files were not shown because too many files changed in this diff
