diff --git a/.dev_scripts/build_docs.sh b/.dev_scripts/build_docs.sh
index 9c8acdf1..dc76e6f4 100644
--- a/.dev_scripts/build_docs.sh
+++ b/.dev_scripts/build_docs.sh
@@ -4,5 +4,5 @@ rm -rf build
 # update api rst
 #rm -rf source/api/
-#sphinx-apidoc --module-first -o source/api/ ../maas_lib/
+#sphinx-apidoc --module-first -o source/api/ ../modelscope/
 make html
diff --git a/.dev_scripts/linter.sh b/.dev_scripts/linter.sh
index fb8ab19d..6468e42b 100644
--- a/.dev_scripts/linter.sh
+++ b/.dev_scripts/linter.sh
@@ -1,3 +1,3 @@
-yapf -r -i maas_lib/ configs/ tests/ setup.py
-isort -rc maas_lib/ configs/ tests/ setup.py
-flake8 maas_lib/ configs/ tests/ setup.py
+yapf -r -i modelscope/ configs/ tests/ setup.py
+isort -rc modelscope/ configs/ tests/ setup.py
+flake8 modelscope/ configs/ tests/ setup.py
diff --git a/LICENSE b/LICENSE
index 85ed3d3a..14cec7de 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,4 +1,4 @@
-Copyright 2022-2023 Alibaba MaaS. All rights reserved.
+Copyright 2022-2023 Alibaba ModelScope. All rights reserved.

 Apache License
 Version 2.0, January 2004
@@ -188,7 +188,7 @@ Copyright 2022-2023 Alibaba MaaS. All rights reserved.
 same "printed page" as the copyright notice for easier identification within
 third-party archives.

-   Copyright 2020-2022 Alibaba MaaS.
+   Copyright 2020-2022 Alibaba ModelScope.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/MANIFEST.in b/MANIFEST.in
index 0a153dba..665d7e90 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1 +1 @@
-recursive-include maas_lib/configs *.py
+recursive-include modelscope/configs *.py
diff --git a/README.md b/README.md
index dabe8726..944c1f07 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # Introduction

-MaaS library is targeted to support training, evaluation and inference for the state of the art models provided by Mind and further support third-party models provided by users outside alibaba.
+The ModelScope library supports training, evaluation and inference for state-of-the-art models provided by Mind, and further supports third-party models provided by users outside Alibaba.

 # Design doc
diff --git a/configs/README.md b/configs/README.md
index 94499da7..3c3b6963 100644
--- a/configs/README.md
+++ b/configs/README.md
@@ -1 +1 @@
-This folder will host example configs for each model supported by maas_lib.
+This folder will host example configs for each model supported by modelscope.
diff --git a/docs/source/api/maas_lib.fileio.format.rst b/docs/source/api/maas_lib.fileio.format.rst
deleted file mode 100644
index 7c2c649d..00000000
--- a/docs/source/api/maas_lib.fileio.format.rst
+++ /dev/null
@@ -1,34 +0,0 @@
-maas\_lib.fileio.format package
-===============================
-
-.. automodule:: maas_lib.fileio.format
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-Submodules
-----------
-
-maas\_lib.fileio.format.base module
------------------------------------
-
-.. automodule:: maas_lib.fileio.format.base
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-maas\_lib.fileio.format.json module
------------------------------------
-
-.. automodule:: maas_lib.fileio.format.json
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-maas\_lib.fileio.format.yaml module
------------------------------------
-
-..
automodule:: maas_lib.fileio.format.yaml - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/maas_lib.fileio.rst b/docs/source/api/maas_lib.fileio.rst deleted file mode 100644 index e9540208..00000000 --- a/docs/source/api/maas_lib.fileio.rst +++ /dev/null @@ -1,34 +0,0 @@ -maas\_lib.fileio package -======================== - -.. automodule:: maas_lib.fileio - :members: - :undoc-members: - :show-inheritance: - -Subpackages ------------ - -.. toctree:: - :maxdepth: 4 - - maas_lib.fileio.format - -Submodules ----------- - -maas\_lib.fileio.file module ----------------------------- - -.. automodule:: maas_lib.fileio.file - :members: - :undoc-members: - :show-inheritance: - -maas\_lib.fileio.io module --------------------------- - -.. automodule:: maas_lib.fileio.io - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/maas_lib.models.nlp.rst b/docs/source/api/maas_lib.models.nlp.rst deleted file mode 100644 index bd782ea8..00000000 --- a/docs/source/api/maas_lib.models.nlp.rst +++ /dev/null @@ -1,18 +0,0 @@ -maas\_lib.models.nlp package -============================ - -.. automodule:: maas_lib.models.nlp - :members: - :undoc-members: - :show-inheritance: - -Submodules ----------- - -maas\_lib.models.nlp.sequence\_classification\_model module ------------------------------------------------------------ - -.. automodule:: maas_lib.models.nlp.sequence_classification_model - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/maas_lib.models.rst b/docs/source/api/maas_lib.models.rst deleted file mode 100644 index 9e1874a3..00000000 --- a/docs/source/api/maas_lib.models.rst +++ /dev/null @@ -1,34 +0,0 @@ -maas\_lib.models package -======================== - -.. automodule:: maas_lib.models - :members: - :undoc-members: - :show-inheritance: - -Subpackages ------------ - -.. toctree:: - :maxdepth: 4 - - maas_lib.models.nlp - -Submodules ----------- - -maas\_lib.models.base module ----------------------------- - -.. automodule:: maas_lib.models.base - :members: - :undoc-members: - :show-inheritance: - -maas\_lib.models.builder module -------------------------------- - -.. automodule:: maas_lib.models.builder - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/maas_lib.pipelines.audio.rst b/docs/source/api/maas_lib.pipelines.audio.rst deleted file mode 100644 index 71e29b42..00000000 --- a/docs/source/api/maas_lib.pipelines.audio.rst +++ /dev/null @@ -1,7 +0,0 @@ -maas\_lib.pipelines.audio package -================================= - -.. automodule:: maas_lib.pipelines.audio - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/maas_lib.pipelines.cv.rst b/docs/source/api/maas_lib.pipelines.cv.rst deleted file mode 100644 index 938ebb5a..00000000 --- a/docs/source/api/maas_lib.pipelines.cv.rst +++ /dev/null @@ -1,18 +0,0 @@ -maas\_lib.pipelines.cv package -============================== - -.. automodule:: maas_lib.pipelines.cv - :members: - :undoc-members: - :show-inheritance: - -Submodules ----------- - -maas\_lib.pipelines.cv.image\_matting module --------------------------------------------- - -.. 
automodule:: maas_lib.pipelines.cv.image_matting - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/maas_lib.pipelines.multi_modal.rst b/docs/source/api/maas_lib.pipelines.multi_modal.rst deleted file mode 100644 index 74a7bf43..00000000 --- a/docs/source/api/maas_lib.pipelines.multi_modal.rst +++ /dev/null @@ -1,7 +0,0 @@ -maas\_lib.pipelines.multi\_modal package -======================================== - -.. automodule:: maas_lib.pipelines.multi_modal - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/maas_lib.preprocessors.rst b/docs/source/api/maas_lib.preprocessors.rst deleted file mode 100644 index 5f70e808..00000000 --- a/docs/source/api/maas_lib.preprocessors.rst +++ /dev/null @@ -1,50 +0,0 @@ -maas\_lib.preprocessors package -=============================== - -.. automodule:: maas_lib.preprocessors - :members: - :undoc-members: - :show-inheritance: - -Submodules ----------- - -maas\_lib.preprocessors.base module ------------------------------------ - -.. automodule:: maas_lib.preprocessors.base - :members: - :undoc-members: - :show-inheritance: - -maas\_lib.preprocessors.builder module --------------------------------------- - -.. automodule:: maas_lib.preprocessors.builder - :members: - :undoc-members: - :show-inheritance: - -maas\_lib.preprocessors.common module -------------------------------------- - -.. automodule:: maas_lib.preprocessors.common - :members: - :undoc-members: - :show-inheritance: - -maas\_lib.preprocessors.image module ------------------------------------- - -.. automodule:: maas_lib.preprocessors.image - :members: - :undoc-members: - :show-inheritance: - -maas\_lib.preprocessors.nlp module ----------------------------------- - -.. automodule:: maas_lib.preprocessors.nlp - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/maas_lib.rst b/docs/source/api/maas_lib.rst deleted file mode 100644 index 727b7986..00000000 --- a/docs/source/api/maas_lib.rst +++ /dev/null @@ -1,30 +0,0 @@ -maas\_lib package -================= - -.. automodule:: maas_lib - :members: - :undoc-members: - :show-inheritance: - -Subpackages ------------ - -.. toctree:: - :maxdepth: 4 - - maas_lib.fileio - maas_lib.models - maas_lib.pipelines - maas_lib.preprocessors - maas_lib.utils - -Submodules ----------- - -maas\_lib.version module ------------------------- - -.. automodule:: maas_lib.version - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/maas_lib.trainers.nlp.rst b/docs/source/api/maas_lib.trainers.nlp.rst deleted file mode 100644 index 71f484ca..00000000 --- a/docs/source/api/maas_lib.trainers.nlp.rst +++ /dev/null @@ -1,18 +0,0 @@ -maas\_lib.trainers.nlp package -============================== - -.. automodule:: maas_lib.trainers.nlp - :members: - :undoc-members: - :show-inheritance: - -Submodules ----------- - -maas\_lib.trainers.nlp.sequence\_classification\_trainer module ---------------------------------------------------------------- - -.. automodule:: maas_lib.trainers.nlp.sequence_classification_trainer - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/maas_lib.trainers.rst b/docs/source/api/maas_lib.trainers.rst deleted file mode 100644 index eb90ee4f..00000000 --- a/docs/source/api/maas_lib.trainers.rst +++ /dev/null @@ -1,34 +0,0 @@ -maas\_lib.trainers package -========================== - -.. automodule:: maas_lib.trainers - :members: - :undoc-members: - :show-inheritance: - -Subpackages ------------ - -.. 
toctree:: - :maxdepth: 4 - - maas_lib.trainers.nlp - -Submodules ----------- - -maas\_lib.trainers.base module ------------------------------- - -.. automodule:: maas_lib.trainers.base - :members: - :undoc-members: - :show-inheritance: - -maas\_lib.trainers.builder module ---------------------------------- - -.. automodule:: maas_lib.trainers.builder - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/maas_lib.utils.rst b/docs/source/api/maas_lib.utils.rst deleted file mode 100644 index 17ead3eb..00000000 --- a/docs/source/api/maas_lib.utils.rst +++ /dev/null @@ -1,58 +0,0 @@ -maas\_lib.utils package -======================= - -.. automodule:: maas_lib.utils - :members: - :undoc-members: - :show-inheritance: - -Submodules ----------- - -maas\_lib.utils.config module ------------------------------ - -.. automodule:: maas_lib.utils.config - :members: - :undoc-members: - :show-inheritance: - -maas\_lib.utils.constant module -------------------------------- - -.. automodule:: maas_lib.utils.constant - :members: - :undoc-members: - :show-inheritance: - -maas\_lib.utils.logger module ------------------------------ - -.. automodule:: maas_lib.utils.logger - :members: - :undoc-members: - :show-inheritance: - -maas\_lib.utils.pymod module ----------------------------- - -.. automodule:: maas_lib.utils.pymod - :members: - :undoc-members: - :show-inheritance: - -maas\_lib.utils.registry module -------------------------------- - -.. automodule:: maas_lib.utils.registry - :members: - :undoc-members: - :show-inheritance: - -maas\_lib.utils.type\_assert module ------------------------------------ - -.. automodule:: maas_lib.utils.type_assert - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/modelscope.fileio.format.rst b/docs/source/api/modelscope.fileio.format.rst new file mode 100644 index 00000000..2c7b11de --- /dev/null +++ b/docs/source/api/modelscope.fileio.format.rst @@ -0,0 +1,34 @@ +modelscope.fileio.format package +================================ + +.. automodule:: modelscope.fileio.format + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +modelscope.fileio.format.base module +------------------------------------ + +.. automodule:: modelscope.fileio.format.base + :members: + :undoc-members: + :show-inheritance: + +modelscope.fileio.format.json module +------------------------------------ + +.. automodule:: modelscope.fileio.format.json + :members: + :undoc-members: + :show-inheritance: + +modelscope.fileio.format.yaml module +------------------------------------ + +.. automodule:: modelscope.fileio.format.yaml + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/modelscope.fileio.rst b/docs/source/api/modelscope.fileio.rst new file mode 100644 index 00000000..3f4ae1ca --- /dev/null +++ b/docs/source/api/modelscope.fileio.rst @@ -0,0 +1,34 @@ +modelscope.fileio package +========================= + +.. automodule:: modelscope.fileio + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + modelscope.fileio.format + +Submodules +---------- + +modelscope.fileio.file module +----------------------------- + +.. automodule:: modelscope.fileio.file + :members: + :undoc-members: + :show-inheritance: + +modelscope.fileio.io module +--------------------------- + +.. 
automodule:: modelscope.fileio.io + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/modelscope.models.cv.cartoon.facelib.LK.rst b/docs/source/api/modelscope.models.cv.cartoon.facelib.LK.rst new file mode 100644 index 00000000..848c7d67 --- /dev/null +++ b/docs/source/api/modelscope.models.cv.cartoon.facelib.LK.rst @@ -0,0 +1,18 @@ +modelscope.models.cv.cartoon.facelib.LK package +=============================================== + +.. automodule:: modelscope.models.cv.cartoon.facelib.LK + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +modelscope.models.cv.cartoon.facelib.LK.lk module +------------------------------------------------- + +.. automodule:: modelscope.models.cv.cartoon.facelib.LK.lk + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/modelscope.models.cv.cartoon.facelib.rst b/docs/source/api/modelscope.models.cv.cartoon.facelib.rst new file mode 100644 index 00000000..a81536b0 --- /dev/null +++ b/docs/source/api/modelscope.models.cv.cartoon.facelib.rst @@ -0,0 +1,50 @@ +modelscope.models.cv.cartoon.facelib package +============================================ + +.. automodule:: modelscope.models.cv.cartoon.facelib + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + modelscope.models.cv.cartoon.facelib.LK + +Submodules +---------- + +modelscope.models.cv.cartoon.facelib.config module +-------------------------------------------------- + +.. automodule:: modelscope.models.cv.cartoon.facelib.config + :members: + :undoc-members: + :show-inheritance: + +modelscope.models.cv.cartoon.facelib.face\_detector module +---------------------------------------------------------- + +.. automodule:: modelscope.models.cv.cartoon.facelib.face_detector + :members: + :undoc-members: + :show-inheritance: + +modelscope.models.cv.cartoon.facelib.face\_landmark module +---------------------------------------------------------- + +.. automodule:: modelscope.models.cv.cartoon.facelib.face_landmark + :members: + :undoc-members: + :show-inheritance: + +modelscope.models.cv.cartoon.facelib.facer module +------------------------------------------------- + +.. automodule:: modelscope.models.cv.cartoon.facelib.facer + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/modelscope.models.cv.cartoon.mtcnn_pytorch.rst b/docs/source/api/modelscope.models.cv.cartoon.mtcnn_pytorch.rst new file mode 100644 index 00000000..b5845af7 --- /dev/null +++ b/docs/source/api/modelscope.models.cv.cartoon.mtcnn_pytorch.rst @@ -0,0 +1,15 @@ +modelscope.models.cv.cartoon.mtcnn\_pytorch package +=================================================== + +.. automodule:: modelscope.models.cv.cartoon.mtcnn_pytorch + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + modelscope.models.cv.cartoon.mtcnn_pytorch.src diff --git a/docs/source/api/modelscope.models.cv.cartoon.mtcnn_pytorch.src.rst b/docs/source/api/modelscope.models.cv.cartoon.mtcnn_pytorch.src.rst new file mode 100644 index 00000000..715cc292 --- /dev/null +++ b/docs/source/api/modelscope.models.cv.cartoon.mtcnn_pytorch.src.rst @@ -0,0 +1,26 @@ +modelscope.models.cv.cartoon.mtcnn\_pytorch.src package +======================================================= + +.. 
automodule:: modelscope.models.cv.cartoon.mtcnn_pytorch.src + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +modelscope.models.cv.cartoon.mtcnn\_pytorch.src.align\_trans module +------------------------------------------------------------------- + +.. automodule:: modelscope.models.cv.cartoon.mtcnn_pytorch.src.align_trans + :members: + :undoc-members: + :show-inheritance: + +modelscope.models.cv.cartoon.mtcnn\_pytorch.src.matlab\_cp2tform module +----------------------------------------------------------------------- + +.. automodule:: modelscope.models.cv.cartoon.mtcnn_pytorch.src.matlab_cp2tform + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/modelscope.models.cv.cartoon.rst b/docs/source/api/modelscope.models.cv.cartoon.rst new file mode 100644 index 00000000..5a262e03 --- /dev/null +++ b/docs/source/api/modelscope.models.cv.cartoon.rst @@ -0,0 +1,27 @@ +modelscope.models.cv.cartoon package +==================================== + +.. automodule:: modelscope.models.cv.cartoon + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + modelscope.models.cv.cartoon.facelib + modelscope.models.cv.cartoon.mtcnn_pytorch + +Submodules +---------- + +modelscope.models.cv.cartoon.utils module +----------------------------------------- + +.. automodule:: modelscope.models.cv.cartoon.utils + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/modelscope.models.cv.rst b/docs/source/api/modelscope.models.cv.rst new file mode 100644 index 00000000..47ce3916 --- /dev/null +++ b/docs/source/api/modelscope.models.cv.rst @@ -0,0 +1,15 @@ +modelscope.models.cv package +============================ + +.. automodule:: modelscope.models.cv + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + modelscope.models.cv.cartoon diff --git a/docs/source/api/modelscope.models.nlp.rst b/docs/source/api/modelscope.models.nlp.rst new file mode 100644 index 00000000..f332aca8 --- /dev/null +++ b/docs/source/api/modelscope.models.nlp.rst @@ -0,0 +1,26 @@ +modelscope.models.nlp package +============================= + +.. automodule:: modelscope.models.nlp + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +modelscope.models.nlp.sequence\_classification\_model module +------------------------------------------------------------ + +.. automodule:: modelscope.models.nlp.sequence_classification_model + :members: + :undoc-members: + :show-inheritance: + +modelscope.models.nlp.text\_generation\_model module +---------------------------------------------------- + +.. automodule:: modelscope.models.nlp.text_generation_model + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/modelscope.models.rst b/docs/source/api/modelscope.models.rst new file mode 100644 index 00000000..8f2870b3 --- /dev/null +++ b/docs/source/api/modelscope.models.rst @@ -0,0 +1,35 @@ +modelscope.models package +========================= + +.. automodule:: modelscope.models + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + modelscope.models.cv + modelscope.models.nlp + +Submodules +---------- + +modelscope.models.base module +----------------------------- + +.. automodule:: modelscope.models.base + :members: + :undoc-members: + :show-inheritance: + +modelscope.models.builder module +-------------------------------- + +.. 
automodule:: modelscope.models.builder + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/modelscope.pipelines.audio.rst b/docs/source/api/modelscope.pipelines.audio.rst new file mode 100644 index 00000000..f162893f --- /dev/null +++ b/docs/source/api/modelscope.pipelines.audio.rst @@ -0,0 +1,7 @@ +modelscope.pipelines.audio package +================================== + +.. automodule:: modelscope.pipelines.audio + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/modelscope.pipelines.cv.rst b/docs/source/api/modelscope.pipelines.cv.rst new file mode 100644 index 00000000..3f2da3f4 --- /dev/null +++ b/docs/source/api/modelscope.pipelines.cv.rst @@ -0,0 +1,26 @@ +modelscope.pipelines.cv package +=============================== + +.. automodule:: modelscope.pipelines.cv + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +modelscope.pipelines.cv.image\_cartoon\_pipeline module +------------------------------------------------------- + +.. automodule:: modelscope.pipelines.cv.image_cartoon_pipeline + :members: + :undoc-members: + :show-inheritance: + +modelscope.pipelines.cv.image\_matting\_pipeline module +------------------------------------------------------- + +.. automodule:: modelscope.pipelines.cv.image_matting_pipeline + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/modelscope.pipelines.multi_modal.rst b/docs/source/api/modelscope.pipelines.multi_modal.rst new file mode 100644 index 00000000..36df1c7c --- /dev/null +++ b/docs/source/api/modelscope.pipelines.multi_modal.rst @@ -0,0 +1,18 @@ +modelscope.pipelines.multi\_modal package +========================================= + +.. automodule:: modelscope.pipelines.multi_modal + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +modelscope.pipelines.multi\_modal.image\_captioning module +---------------------------------------------------------- + +.. automodule:: modelscope.pipelines.multi_modal.image_captioning + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/modelscope.pipelines.nlp.rst b/docs/source/api/modelscope.pipelines.nlp.rst new file mode 100644 index 00000000..836d914f --- /dev/null +++ b/docs/source/api/modelscope.pipelines.nlp.rst @@ -0,0 +1,26 @@ +modelscope.pipelines.nlp package +================================ + +.. automodule:: modelscope.pipelines.nlp + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +modelscope.pipelines.nlp.sequence\_classification\_pipeline module +------------------------------------------------------------------ + +.. automodule:: modelscope.pipelines.nlp.sequence_classification_pipeline + :members: + :undoc-members: + :show-inheritance: + +modelscope.pipelines.nlp.text\_generation\_pipeline module +---------------------------------------------------------- + +.. automodule:: modelscope.pipelines.nlp.text_generation_pipeline + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/modelscope.pipelines.rst b/docs/source/api/modelscope.pipelines.rst new file mode 100644 index 00000000..167b5cd3 --- /dev/null +++ b/docs/source/api/modelscope.pipelines.rst @@ -0,0 +1,53 @@ +modelscope.pipelines package +============================ + +.. automodule:: modelscope.pipelines + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. 
toctree:: + :maxdepth: 4 + + modelscope.pipelines.audio + modelscope.pipelines.cv + modelscope.pipelines.multi_modal + modelscope.pipelines.nlp + +Submodules +---------- + +modelscope.pipelines.base module +-------------------------------- + +.. automodule:: modelscope.pipelines.base + :members: + :undoc-members: + :show-inheritance: + +modelscope.pipelines.builder module +----------------------------------- + +.. automodule:: modelscope.pipelines.builder + :members: + :undoc-members: + :show-inheritance: + +modelscope.pipelines.default module +----------------------------------- + +.. automodule:: modelscope.pipelines.default + :members: + :undoc-members: + :show-inheritance: + +modelscope.pipelines.util module +-------------------------------- + +.. automodule:: modelscope.pipelines.util + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/modelscope.preprocessors.rst b/docs/source/api/modelscope.preprocessors.rst new file mode 100644 index 00000000..b555198d --- /dev/null +++ b/docs/source/api/modelscope.preprocessors.rst @@ -0,0 +1,50 @@ +modelscope.preprocessors package +================================ + +.. automodule:: modelscope.preprocessors + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +modelscope.preprocessors.base module +------------------------------------ + +.. automodule:: modelscope.preprocessors.base + :members: + :undoc-members: + :show-inheritance: + +modelscope.preprocessors.builder module +--------------------------------------- + +.. automodule:: modelscope.preprocessors.builder + :members: + :undoc-members: + :show-inheritance: + +modelscope.preprocessors.common module +-------------------------------------- + +.. automodule:: modelscope.preprocessors.common + :members: + :undoc-members: + :show-inheritance: + +modelscope.preprocessors.image module +------------------------------------- + +.. automodule:: modelscope.preprocessors.image + :members: + :undoc-members: + :show-inheritance: + +modelscope.preprocessors.nlp module +----------------------------------- + +.. automodule:: modelscope.preprocessors.nlp + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/modelscope.pydatasets.rst b/docs/source/api/modelscope.pydatasets.rst new file mode 100644 index 00000000..2508a91f --- /dev/null +++ b/docs/source/api/modelscope.pydatasets.rst @@ -0,0 +1,18 @@ +modelscope.pydatasets package +============================= + +.. automodule:: modelscope.pydatasets + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +modelscope.pydatasets.py\_dataset module +---------------------------------------- + +.. automodule:: modelscope.pydatasets.py_dataset + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/modelscope.rst b/docs/source/api/modelscope.rst new file mode 100644 index 00000000..efab568b --- /dev/null +++ b/docs/source/api/modelscope.rst @@ -0,0 +1,32 @@ +modelscope package +================== + +.. automodule:: modelscope + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + modelscope.fileio + modelscope.models + modelscope.pipelines + modelscope.preprocessors + modelscope.pydatasets + modelscope.trainers + modelscope.utils + +Submodules +---------- + +modelscope.version module +------------------------- + +.. 
automodule:: modelscope.version + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/maas_lib.pipelines.nlp.rst b/docs/source/api/modelscope.trainers.nlp.rst similarity index 52% rename from docs/source/api/maas_lib.pipelines.nlp.rst rename to docs/source/api/modelscope.trainers.nlp.rst index d41c09ad..4bc2f875 100644 --- a/docs/source/api/maas_lib.pipelines.nlp.rst +++ b/docs/source/api/modelscope.trainers.nlp.rst @@ -1,7 +1,7 @@ -maas\_lib.pipelines.nlp package +modelscope.trainers.nlp package =============================== -.. automodule:: maas_lib.pipelines.nlp +.. automodule:: modelscope.trainers.nlp :members: :undoc-members: :show-inheritance: @@ -9,10 +9,10 @@ maas\_lib.pipelines.nlp package Submodules ---------- -maas\_lib.pipelines.nlp.sequence\_classification\_pipeline module ------------------------------------------------------------------ +modelscope.trainers.nlp.sequence\_classification\_trainer module +---------------------------------------------------------------- -.. automodule:: maas_lib.pipelines.nlp.sequence_classification_pipeline +.. automodule:: modelscope.trainers.nlp.sequence_classification_trainer :members: :undoc-members: :show-inheritance: diff --git a/docs/source/api/maas_lib.pipelines.rst b/docs/source/api/modelscope.trainers.rst similarity index 53% rename from docs/source/api/maas_lib.pipelines.rst rename to docs/source/api/modelscope.trainers.rst index 40b82adc..aac4fb99 100644 --- a/docs/source/api/maas_lib.pipelines.rst +++ b/docs/source/api/modelscope.trainers.rst @@ -1,7 +1,7 @@ -maas\_lib.pipelines package +modelscope.trainers package =========================== -.. automodule:: maas_lib.pipelines +.. automodule:: modelscope.trainers :members: :undoc-members: :show-inheritance: @@ -12,25 +12,23 @@ Subpackages .. toctree:: :maxdepth: 4 - maas_lib.pipelines.cv - maas_lib.pipelines.multi_modal - maas_lib.pipelines.nlp + modelscope.trainers.nlp Submodules ---------- -maas\_lib.pipelines.base module +modelscope.trainers.base module ------------------------------- -.. automodule:: maas_lib.pipelines.base +.. automodule:: modelscope.trainers.base :members: :undoc-members: :show-inheritance: -maas\_lib.pipelines.builder module +modelscope.trainers.builder module ---------------------------------- -.. automodule:: maas_lib.pipelines.builder +.. automodule:: modelscope.trainers.builder :members: :undoc-members: :show-inheritance: diff --git a/docs/source/api/modelscope.utils.rst b/docs/source/api/modelscope.utils.rst new file mode 100644 index 00000000..0a78d4f4 --- /dev/null +++ b/docs/source/api/modelscope.utils.rst @@ -0,0 +1,66 @@ +modelscope.utils package +======================== + +.. automodule:: modelscope.utils + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +modelscope.utils.config module +------------------------------ + +.. automodule:: modelscope.utils.config + :members: + :undoc-members: + :show-inheritance: + +modelscope.utils.constant module +-------------------------------- + +.. automodule:: modelscope.utils.constant + :members: + :undoc-members: + :show-inheritance: + +modelscope.utils.hub module +--------------------------- + +.. automodule:: modelscope.utils.hub + :members: + :undoc-members: + :show-inheritance: + +modelscope.utils.logger module +------------------------------ + +.. automodule:: modelscope.utils.logger + :members: + :undoc-members: + :show-inheritance: + +modelscope.utils.pymod module +----------------------------- + +.. 
automodule:: modelscope.utils.pymod
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+modelscope.utils.registry module
+--------------------------------
+
+.. automodule:: modelscope.utils.registry
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+modelscope.utils.type\_assert module
+------------------------------------
+
+.. automodule:: modelscope.utils.type_assert
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/source/api/modules.rst b/docs/source/api/modules.rst
index 84eecc70..0f83e90c 100644
--- a/docs/source/api/modules.rst
+++ b/docs/source/api/modules.rst
@@ -1,7 +1,7 @@
-maas_lib
-========
+modelscope
+==========

 .. toctree::
    :maxdepth: 4

-   maas_lib
+   modelscope
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 4cdcd956..2c2a0017 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -18,10 +18,10 @@ import sphinx_rtd_theme
 sys.path.insert(0, os.path.abspath('../../'))

 # -- Project information -----------------------------------------------------
-project = 'maas_lib'
-copyright = '2022-2023, Alibaba MaaS'
-author = 'maas_lib Authors'
-version_file = '../../maas_lib/version.py'
+project = 'modelscope'
+copyright = '2022-2023, Alibaba ModelScope'
+author = 'modelscope Authors'
+version_file = '../../modelscope/version.py'

 def get_version():
@@ -88,7 +88,7 @@ html_static_path = ['_static']
 # -- Options for HTMLHelp output ---------------------------------------------
 # Output file base name for HTML help builder.
-htmlhelp_basename = 'maas_lib_doc'
+htmlhelp_basename = 'modelscope_doc'

 # -- Extension configuration -------------------------------------------------
 # Ignore >>> when copying code
diff --git a/docs/source/develop.md b/docs/source/develop.md
index 4d0812ae..c048bef7 100644
--- a/docs/source/develop.md
+++ b/docs/source/develop.md
@@ -10,39 +10,86 @@ We use the following tools for linting and formatting:
 Style configurations of yapf and isort can be found in [setup.cfg](../../setup.cfg).
 We use [pre-commit hook](https://pre-commit.com/) that checks and formats for `flake8`, `yapf`, `seed-isort-config`, `isort`, `trailing whitespaces`,
- fixes `end-of-files`, sorts `requirments.txt` automatically on every commit.
- The config for a pre-commit hook is stored in [.pre-commit-config](../../.pre-commit-config.yaml).
- After you clone the repository, you will need to install initialize pre-commit hook.
- ```bash
- pip install -r requirements/tests.txt
- ```
- From the repository folder
- ```bash
- pre-commit install
- ```
-
- After this on every commit check code linters and formatter will be enforced.
-
- If you want to use pre-commit to check all the files, you can run
- ```bash
- pre-commit run --all-files
- ```
-
- If you only want to format and lint your code, you can run
- ```bash
- make linter
- ```
-
- ## 2. Test
- ### 2.1 Unit test
- ```bash
- make test
- ```
-
- ### 2.2 Test data
- TODO
-
- ## 3. Build pip package
- ```bash
- make whl
- ```
+fixes `end-of-files`, sorts `requirements.txt` automatically on every commit.
+The config for a pre-commit hook is stored in [.pre-commit-config](../../.pre-commit-config.yaml).
+After you clone the repository, you will need to install and initialize the pre-commit hook.
+```bash
+pip install -r requirements/tests.txt
+```
+From the repository folder
+```bash
+pre-commit install
+```
+
+After this, code linters and formatters will be enforced on every commit.
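+
+For example, once the hook is installed, the same checks can also be triggered by hand against whatever is currently staged (a quick sanity check; `modelscope/some_file.py` below is just a placeholder):
+```bash
+git add modelscope/some_file.py
+pre-commit run
+```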
+
+If you want to use pre-commit to check all the files, you can run
+```bash
+pre-commit run --all-files
+```
+
+If you only want to format and lint your code, you can run
+```bash
+make linter
+```
+
+## 2. Test
+### 2.1 Unit test
+```bash
+make test
+```
+
+### 2.2 Test data
+TODO
+
+## 3. Code Review
+
+1. Run the following command to create an Aone CR, replacing `TARGET_BRANCH` and `CR_NAME` with the ones you want.
+   ```shell
+   git push origin HEAD:refs/for/TARGET_BRANCH/CR_NAME
+   ```
+
+   Please refer to [https://yuque.antfin.com/aone/platform/lcg8yr](https://yuque.antfin.com/aone/platform/lcg8yr) for more details.
+
+   The following output is expected.
+   ```shell
+   Counting objects: 5, done.
+   Delta compression using up to 96 threads.
+   Compressing objects: 100% (5/5), done.
+   Writing objects: 100% (5/5), 543 bytes | 0 bytes/s, done.
+   Total 5 (delta 4), reused 0 (delta 0)
+   remote: +------------------------------------------------------------------------+
+   remote: | Merge Request #8949062 was created or updated.                         |
+   remote: | View merge request at URL:                                             |
+   remote: | https://code.aone.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/8949062 |
+   remote: +------------------------------------------------------------------------+
+   To git@gitlab.alibaba-inc.com:Ali-MaaS/MaaS-lib.git
+    * [new branch]          HEAD -> refs/for/master/support_kwargs_pipeline
+   ```
+
+2. Open the remote URL `https://code.aone.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/ID` and edit the title of the CR in the following format before merging your code:
+   * Feature
+     ```shell
+     [to #AONE_ID] feat: commit title
+
+     Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/8949062
+
+     * commit msg1
+     * commit msg2
+     ```
+   * Bugfix
+     ```shell
+     [to #AONE_ID] fix: commit title
+
+     Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/8949062
+
+     * commit msg1
+     * commit msg2
+     ```
+
+## 4. Build pip package
+```bash
+make whl
+```
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 0ca63b41..3b223531 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -1,11 +1,11 @@
-.. maas_lib documentation file,
+.. modelscope documentation file,
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.

-MaasLib DOCUMENTATION
+ModelScope DOCUMENTATION
=======================================

-MaasLib doc
+ModelScope doc

..
toctree::
   :maxdepth: 2
@@ -30,11 +30,11 @@ MaasLib doc
   :maxdepth: 10
   :caption: API Doc

-   api/maas_lib.preprocessors
-   api/maas_lib.models
-   api/maas_lib.pipelines
-   api/maas_lib.fileio
-   api/maas_lib.utils
+   api/modelscope.preprocessors
+   api/modelscope.models
+   api/modelscope.pipelines
+   api/modelscope.fileio
+   api/modelscope.utils

 Indices and tables
diff --git a/docs/source/quick_start.md b/docs/source/quick_start.md
index 3c961097..0f4cbbc3 100644
--- a/docs/source/quick_start.md
+++ b/docs/source/quick_start.md
@@ -5,39 +5,39 @@
 After installation, run the following commands to create a dedicated Python environment for the maas library.
 ```shell
-conda create -n maas python=3.6
-conda activate maas
+conda create -n modelscope python=3.6
+conda activate modelscope
 ```
 Check that the python and pip commands have switched to the conda environment.
 ```shell
 which python
-# ~/workspace/anaconda3/envs/maas/bin/python
+# ~/workspace/anaconda3/envs/modelscope/bin/python
 which pip
-# ~/workspace/anaconda3/envs/maas/bin/pip
+# ~/workspace/anaconda3/envs/modelscope/bin/pip
 ```
 Note: this project only supports `python3`; do not use a python2 environment.

 ## Installing third-party dependencies
-The MaaS Library currently supports the two major deep learning frameworks, TensorFlow and PyTorch, for model training and inference. It has been tested on Python 3.6+, PyTorch 1.8+ and TensorFlow 2.6; users can install the framework required by their chosen model via the links below:
+The ModelScope Library currently supports the two major deep learning frameworks, TensorFlow and PyTorch, for model training and inference. It has been tested on Python 3.6+, PyTorch 1.8+ and TensorFlow 2.6; users can install the framework required by their chosen model via the links below:

 * [PyTorch installation guide](https://pytorch.org/get-started/locally/)
 * [TensorFlow installation guide](https://www.tensorflow.org/install/pip)

-## Installing the MaaS library
+## Installing the ModelScope library
 Note: if you hit an error during installation, check the [FAQ](faq.md) for a solution.

 ### Install with pip
 ```shell
-pip install -r http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/release/maas/maas.txt
+pip install -r http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/release/maas/modelscope.txt
 ```

 After installation succeeds, run the following command to verify the installation:
 ```shell
-python -c "from maas_lib.pipelines import pipeline;print(pipeline('image-matting',model='damo/image-matting-person')('http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png'))"
+python -c "from modelscope.pipelines import pipeline;print(pipeline('image-matting',model='damo/image-matting-person')('http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png'))"
 ```

@@ -45,11 +45,11 @@
 Suitable for local development and debugging; changes to the source take effect directly.
 ```shell
-git clone git@gitlab.alibaba-inc.com:Ali-MaaS/MaaS-lib.git maaslib
+git clone git@gitlab.alibaba-inc.com:Ali-MaaS/MaaS-lib.git modelscope
 git fetch origin master
 git checkout master

-cd maaslib
+cd modelscope

 # install dependencies
 pip install -r requirements.txt
@@ -60,7 +60,7 @@ export PYTHONPATH=`pwd`

 After installation succeeds, run the following command to verify the installation:
 ```shell
-python -c "from maas_lib.pipelines import pipeline;print(pipeline('image-matting',model='damo/image-matting-person')('http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png'))"
+python -c "from modelscope.pipelines import pipeline;print(pipeline('image-matting',model='damo/image-matting-person')('http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png'))"
 ```

@@ -79,8 +79,8 @@ The pipeline function provides a concise inference interface; an example follows, and the pipeline tutorial covers it in more detail.
 ```python
 import cv2
 import os.path as osp
-from maas_lib.pipelines import pipeline
-from maas_lib.utils.constant import Tasks
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks

 # create a pipeline from the task name
 img_matting = pipeline(Tasks.image_matting, model='damo/image-matting-person')
@@ -95,12 +95,13 @@
 print(f'Output written to {osp.abspath("result.png")}')
 ```

 In addition, the pipeline interface can also take a Dataset as input; the code above can equally be written as
+
 ```python
 import cv2
 import os.path as osp
-from maas_lib.pipelines import pipeline
-from maas_lib.utils.constant import Tasks
-from ali_maas_datasets import PyDataset
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.pydatasets import PyDataset

 # build a PyDataset from image URLs; a local folder can also be used via input_location = '/dir/to/images'
 input_location = [
diff --git a/docs/source/tutorials/pipeline.md b/docs/source/tutorials/pipeline.md
index 512e64ee..cc851278 100644
--- a/docs/source/tutorials/pipeline.md
+++ b/docs/source/tutorials/pipeline.md
@@ -19,7 +19,7 @@
 1. The pipeline function supports specifying a task name, loading the task's default model, and creating the corresponding Pipeline object.
    Run the following Python code:
    ```python
-   >>> from maas_lib.pipelines import pipeline
+   >>> from modelscope.pipelines import pipeline
    >>> img_matting = pipeline(task='image-matting', model='damo/image-matting-person')
    ```
@@ -65,8 +65,8 @@ wget https://atp-modelzoo-sh.oss-cn-shanghai.aliyuncs.com/release/easynlp_modelz
 Create the tokenizer and the model
 ```python
->>> from maas_lib.models import Model
->>> from maas_lib.preprocessors import SequenceClassificationPreprocessor
+>>> from modelscope.models import Model
+>>> from modelscope.preprocessors import SequenceClassificationPreprocessor
 >>> model = Model.from_pretrained('damo/bert-base-sst2')
 >>> tokenizer = SequenceClassificationPreprocessor(
     model.model_dir, first_sequence='sentence', second_sequence=None)
 ```
 Create a pipeline from the tokenizer and model objects
 ```python
->>> from maas_lib.pipelines import pipeline
+>>> from modelscope.pipelines import pipeline
 >>> semantic_cls = pipeline('text-classification', model=model, preprocessor=tokenizer)
 >>> semantic_cls("Hello world!")
 ```
diff --git a/maas_lib/pipelines/builder.py b/maas_lib/pipelines/builder.py
deleted file mode 100644
index 703dd33f..00000000
--- a/maas_lib/pipelines/builder.py
+++ /dev/null
@@ -1,87 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-
-import os.path as osp
-from typing import Union
-
-import json
-from maas_hub.file_download import model_file_download
-
-from maas_lib.models.base import Model
-from maas_lib.utils.config import Config, ConfigDict
-from maas_lib.utils.constant import CONFIGFILE, Tasks
-from maas_lib.utils.registry import Registry, build_from_cfg
-from .base import Pipeline
-from .util import is_model_name
-
-PIPELINES = Registry('pipelines')
-
-
-def build_pipeline(cfg: ConfigDict,
-                   task_name: str = None,
-                   default_args: dict = None):
-    """ build pipeline given model config dict.
-
-    Args:
-        cfg (:obj:`ConfigDict`): config dict for model object.
-        task_name (str, optional): task name, refer to
-            :obj:`Tasks` for more details.
-        default_args (dict, optional): Default initialization arguments.
-    """
-    return build_from_cfg(
-        cfg, PIPELINES, group_key=task_name, default_args=default_args)
-
-
-def pipeline(task: str = None,
-             model: Union[str, Model] = None,
-             preprocessor=None,
-             config_file: str = None,
-             pipeline_name: str = None,
-             framework: str = None,
-             device: int = -1,
-             **kwargs) -> Pipeline:
-    """ Factory method to build a obj:`Pipeline`.
-
-
-    Args:
-        task (str): Task name defining which pipeline will be returned.
-        model (str or obj:`Model`): model name or model object.
-        preprocessor: preprocessor object.
-        config_file (str, optional): path to config file.
-        pipeline_name (str, optional): pipeline class name or alias name.
-        framework (str, optional): framework type.
- device (int, optional): which device is used to do inference. - - Return: - pipeline (obj:`Pipeline`): pipeline object for certain task. - - Examples: - ```python - >>> p = pipeline('image-classification') - >>> p = pipeline('text-classification', model='distilbert-base-uncased') - >>> # Using model object - >>> resnet = Model.from_pretrained('Resnet') - >>> p = pipeline('image-classification', model=resnet) - """ - if task is None and pipeline_name is None: - raise ValueError('task or pipeline_name is required') - - if pipeline_name is None: - # get default pipeline for this task - assert task in PIPELINES.modules, f'No pipeline is registerd for Task {task}' - pipeline_name = get_default_pipeline(task) - - cfg = ConfigDict(type=pipeline_name) - - if model: - assert isinstance(model, (str, Model)), \ - f'model should be either str or Model, but got {type(model)}' - cfg.model = model - - if preprocessor is not None: - cfg.preprocessor = preprocessor - - return build_pipeline(cfg, task_name=task) - - -def get_default_pipeline(task): - return list(PIPELINES.modules[task].keys())[0] diff --git a/maas_lib/pipelines/cv/__init__.py b/maas_lib/pipelines/cv/__init__.py deleted file mode 100644 index 79548682..00000000 --- a/maas_lib/pipelines/cv/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .image_matting import ImageMatting diff --git a/maas_lib/pipelines/util.py b/maas_lib/pipelines/util.py deleted file mode 100644 index 3e907359..00000000 --- a/maas_lib/pipelines/util.py +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -import os.path as osp - -import json -from maas_hub.file_download import model_file_download - -from maas_lib.utils.constant import CONFIGFILE - - -def is_model_name(model): - if osp.exists(model): - if osp.exists(osp.join(model, CONFIGFILE)): - return True - else: - return False - else: - # try: - # cfg_file = model_file_download(model, CONFIGFILE) - # except Exception: - # cfg_file = None - # TODO @wenmeng.zwm use exception instead of - # following tricky logic - cfg_file = model_file_download(model, CONFIGFILE) - with open(cfg_file, 'r') as infile: - cfg = json.load(infile) - if 'Code' in cfg: - return False - else: - return True diff --git a/maas_lib/__init__.py b/modelscope/__init__.py similarity index 100% rename from maas_lib/__init__.py rename to modelscope/__init__.py diff --git a/maas_lib/fileio/__init__.py b/modelscope/fileio/__init__.py similarity index 100% rename from maas_lib/fileio/__init__.py rename to modelscope/fileio/__init__.py diff --git a/maas_lib/fileio/file.py b/modelscope/fileio/file.py similarity index 100% rename from maas_lib/fileio/file.py rename to modelscope/fileio/file.py diff --git a/maas_lib/fileio/format/__init__.py b/modelscope/fileio/format/__init__.py similarity index 100% rename from maas_lib/fileio/format/__init__.py rename to modelscope/fileio/format/__init__.py diff --git a/maas_lib/fileio/format/base.py b/modelscope/fileio/format/base.py similarity index 100% rename from maas_lib/fileio/format/base.py rename to modelscope/fileio/format/base.py diff --git a/maas_lib/fileio/format/json.py b/modelscope/fileio/format/json.py similarity index 100% rename from maas_lib/fileio/format/json.py rename to modelscope/fileio/format/json.py diff --git a/maas_lib/fileio/format/yaml.py b/modelscope/fileio/format/yaml.py similarity index 100% rename from maas_lib/fileio/format/yaml.py rename to modelscope/fileio/format/yaml.py diff --git a/maas_lib/fileio/io.py b/modelscope/fileio/io.py similarity index 100% rename 
from maas_lib/fileio/io.py rename to modelscope/fileio/io.py diff --git a/maas_lib/models/__init__.py b/modelscope/models/__init__.py similarity index 71% rename from maas_lib/models/__init__.py rename to modelscope/models/__init__.py index aa1b3f14..170e525e 100644 --- a/maas_lib/models/__init__.py +++ b/modelscope/models/__init__.py @@ -2,4 +2,4 @@ from .base import Model from .builder import MODELS, build_model -from .nlp import SequenceClassificationModel +from .nlp import BertForSequenceClassification diff --git a/maas_lib/models/base.py b/modelscope/models/base.py similarity index 78% rename from maas_lib/models/base.py rename to modelscope/models/base.py index cc6c4ec8..e641236d 100644 --- a/maas_lib/models/base.py +++ b/modelscope/models/base.py @@ -7,9 +7,10 @@ from typing import Dict, List, Tuple, Union from maas_hub.file_download import model_file_download from maas_hub.snapshot_download import snapshot_download -from maas_lib.models.builder import build_model -from maas_lib.utils.config import Config -from maas_lib.utils.constant import CONFIGFILE +from modelscope.models.builder import build_model +from modelscope.utils.config import Config +from modelscope.utils.constant import CONFIGFILE +from modelscope.utils.hub import get_model_cache_dir Tensor = Union['torch.Tensor', 'tf.Tensor'] @@ -39,8 +40,9 @@ class Model(ABC): if osp.exists(model_name_or_path): local_model_dir = model_name_or_path else: - - local_model_dir = snapshot_download(model_name_or_path) + cache_path = get_model_cache_dir(model_name_or_path) + local_model_dir = cache_path if osp.exists( + cache_path) else snapshot_download(model_name_or_path) # else: # raise ValueError( # 'Remote model repo {model_name_or_path} does not exists') @@ -48,7 +50,7 @@ class Model(ABC): cfg = Config.from_file(osp.join(local_model_dir, CONFIGFILE)) task_name = cfg.task model_cfg = cfg.model - # TODO @wenmeng.zwm may should mannually initialize model after model building + # TODO @wenmeng.zwm may should manually initialize model after model building if hasattr(model_cfg, 'model_type') and not hasattr(model_cfg, 'type'): model_cfg.type = model_cfg.model_type model_cfg.model_dir = local_model_dir diff --git a/maas_lib/models/builder.py b/modelscope/models/builder.py similarity index 84% rename from maas_lib/models/builder.py rename to modelscope/models/builder.py index 1e52d271..b6df8c90 100644 --- a/maas_lib/models/builder.py +++ b/modelscope/models/builder.py @@ -1,7 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
-from maas_lib.utils.config import ConfigDict -from maas_lib.utils.registry import Registry, build_from_cfg +from modelscope.utils.config import ConfigDict +from modelscope.utils.registry import Registry, build_from_cfg MODELS = Registry('models') diff --git a/maas_lib/models/nlp/space/__init__.py b/modelscope/models/cv/__init__.py similarity index 100% rename from maas_lib/models/nlp/space/__init__.py rename to modelscope/models/cv/__init__.py diff --git a/maas_lib/models/nlp/space/modules/__init__.py b/modelscope/models/cv/cartoon/__init__.py similarity index 100% rename from maas_lib/models/nlp/space/modules/__init__.py rename to modelscope/models/cv/cartoon/__init__.py diff --git a/modelscope/models/cv/cartoon/facelib/LICENSE b/modelscope/models/cv/cartoon/facelib/LICENSE new file mode 100644 index 00000000..8e497ab8 --- /dev/null +++ b/modelscope/models/cv/cartoon/facelib/LICENSE @@ -0,0 +1,4 @@ + +Copyright (c) Peppa_Pig_Face_Engine + +https://github.com/610265158/Peppa_Pig_Face_Engine diff --git a/maas_lib/pipelines/audio/__init__.py b/modelscope/models/cv/cartoon/facelib/LK/__init__.py similarity index 100% rename from maas_lib/pipelines/audio/__init__.py rename to modelscope/models/cv/cartoon/facelib/LK/__init__.py diff --git a/modelscope/models/cv/cartoon/facelib/LK/lk.py b/modelscope/models/cv/cartoon/facelib/LK/lk.py new file mode 100644 index 00000000..de7c6ced --- /dev/null +++ b/modelscope/models/cv/cartoon/facelib/LK/lk.py @@ -0,0 +1,97 @@ +import numpy as np + +from ..config import config as cfg + + +class GroupTrack(): + + def __init__(self): + self.old_frame = None + self.previous_landmarks_set = None + self.with_landmark = True + self.thres = cfg.TRACE.pixel_thres + self.alpha = cfg.TRACE.smooth_landmark + self.iou_thres = cfg.TRACE.iou_thres + + def calculate(self, img, current_landmarks_set): + if self.previous_landmarks_set is None: + self.previous_landmarks_set = current_landmarks_set + result = current_landmarks_set + else: + previous_lm_num = self.previous_landmarks_set.shape[0] + if previous_lm_num == 0: + self.previous_landmarks_set = current_landmarks_set + result = current_landmarks_set + return result + else: + result = [] + for i in range(current_landmarks_set.shape[0]): + not_in_flag = True + for j in range(previous_lm_num): + if self.iou(current_landmarks_set[i], + self.previous_landmarks_set[j] + ) > self.iou_thres: + result.append( + self.smooth(current_landmarks_set[i], + self.previous_landmarks_set[j])) + not_in_flag = False + break + if not_in_flag: + result.append(current_landmarks_set[i]) + + result = np.array(result) + self.previous_landmarks_set = result + + return result + + def iou(self, p_set0, p_set1): + rec1 = [ + np.min(p_set0[:, 0]), + np.min(p_set0[:, 1]), + np.max(p_set0[:, 0]), + np.max(p_set0[:, 1]) + ] + rec2 = [ + np.min(p_set1[:, 0]), + np.min(p_set1[:, 1]), + np.max(p_set1[:, 0]), + np.max(p_set1[:, 1]) + ] + + # computing area of each rectangles + S_rec1 = (rec1[2] - rec1[0]) * (rec1[3] - rec1[1]) + S_rec2 = (rec2[2] - rec2[0]) * (rec2[3] - rec2[1]) + + # computing the sum_area + sum_area = S_rec1 + S_rec2 + + # find the each edge of intersect rectangle + x1 = max(rec1[0], rec2[0]) + y1 = max(rec1[1], rec2[1]) + x2 = min(rec1[2], rec2[2]) + y2 = min(rec1[3], rec2[3]) + + # judge if there is an intersect + intersect = max(0, x2 - x1) * max(0, y2 - y1) + + iou = intersect / (sum_area - intersect) + return iou + + def smooth(self, now_landmarks, previous_landmarks): + result = [] + for i in range(now_landmarks.shape[0]): + x = 
now_landmarks[i][0] - previous_landmarks[i][0] + y = now_landmarks[i][1] - previous_landmarks[i][1] + dis = np.sqrt(np.square(x) + np.square(y)) + if dis < self.thres: + result.append(previous_landmarks[i]) + else: + result.append( + self.do_moving_average(now_landmarks[i], + previous_landmarks[i])) + + return np.array(result) + + def do_moving_average(self, p_now, p_previous): + p = self.alpha * p_now + (1 - self.alpha) * p_previous + return p diff --git a/maas_lib/pipelines/multi_modal/__init__.py b/modelscope/models/cv/cartoon/facelib/__init__.py similarity index 100% rename from maas_lib/pipelines/multi_modal/__init__.py rename to modelscope/models/cv/cartoon/facelib/__init__.py diff --git a/modelscope/models/cv/cartoon/facelib/config.py b/modelscope/models/cv/cartoon/facelib/config.py new file mode 100644 index 00000000..d795fdde --- /dev/null +++ b/modelscope/models/cv/cartoon/facelib/config.py @@ -0,0 +1,23 @@ +import os + +import numpy as np +from easydict import EasyDict as edict + +config = edict() +os.environ['CUDA_VISIBLE_DEVICES'] = '0' + +config.DETECT = edict() +config.DETECT.topk = 10 +config.DETECT.thres = 0.8 +config.DETECT.input_shape = (512, 512, 3) +config.KEYPOINTS = edict() +config.KEYPOINTS.p_num = 68 +config.KEYPOINTS.base_extend_range = [0.2, 0.3] +config.KEYPOINTS.input_shape = (160, 160, 3) +config.TRACE = edict() +config.TRACE.pixel_thres = 1 +config.TRACE.smooth_box = 0.3 +config.TRACE.smooth_landmark = 0.95 +config.TRACE.iou_thres = 0.5 +config.DATA = edict() +config.DATA.pixel_means = np.array([123., 116., 103.]) # RGB diff --git a/modelscope/models/cv/cartoon/facelib/face_detector.py b/modelscope/models/cv/cartoon/facelib/face_detector.py new file mode 100644 index 00000000..e5589719 --- /dev/null +++ b/modelscope/models/cv/cartoon/facelib/face_detector.py @@ -0,0 +1,116 @@ +import time + +import cv2 +import numpy as np +import tensorflow as tf + +from .config import config as cfg + +if tf.__version__ >= '2.0': + tf = tf.compat.v1 + + +class FaceDetector: + + def __init__(self, dir): + + self.model_path = dir + '/detector.pb' + self.thres = cfg.DETECT.thres + self.input_shape = cfg.DETECT.input_shape + + self._graph = tf.Graph() + + with self._graph.as_default(): + self._graph, self._sess = self.init_model(self.model_path) + + self.input_image = tf.get_default_graph().get_tensor_by_name( + 'tower_0/images:0') + self.training = tf.get_default_graph().get_tensor_by_name( + 'training_flag:0') + self.output_ops = [ + tf.get_default_graph().get_tensor_by_name('tower_0/boxes:0'), + tf.get_default_graph().get_tensor_by_name('tower_0/scores:0'), + tf.get_default_graph().get_tensor_by_name( + 'tower_0/num_detections:0'), + ] + + def __call__(self, image): + + image, scale_x, scale_y = self.preprocess( + image, + target_width=self.input_shape[1], + target_height=self.input_shape[0]) + + image = np.expand_dims(image, 0) + + boxes, scores, num_boxes = self._sess.run( + self.output_ops, + feed_dict={ + self.input_image: image, + self.training: False + }) + + num_boxes = num_boxes[0] + boxes = boxes[0][:num_boxes] + + scores = scores[0][:num_boxes] + + to_keep = scores > self.thres + boxes = boxes[to_keep] + scores = scores[to_keep] + + y1 = self.input_shape[0] / scale_y + x1 = self.input_shape[1] / scale_x + y2 = self.input_shape[0] / scale_y + x2 = self.input_shape[1] / scale_x + scaler = np.array([y1, x1, y2, x2], dtype='float32') + boxes = boxes * scaler + + scores = np.expand_dims(scores, 0).reshape([-1, 1]) + + for i in range(boxes.shape[0]): + boxes[i] = np.array( 
+ [boxes[i][1], boxes[i][0], boxes[i][3], boxes[i][2]]) + return np.concatenate([boxes, scores], axis=1) + + def preprocess(self, image, target_height, target_width, label=None): + + h, w, c = image.shape + + bimage = np.zeros( + shape=[target_height, target_width, c], + dtype=image.dtype) + np.array( + cfg.DATA.pixel_means, dtype=image.dtype) + long_side = max(h, w) + + scale_x = scale_y = target_height / long_side + + image = cv2.resize(image, None, fx=scale_x, fy=scale_y) + + h_, w_, _ = image.shape + bimage[:h_, :w_, :] = image + + return bimage, scale_x, scale_y + + def init_model(self, *args): + pb_path = args[0] + + def init_pb(model_path): + config = tf.ConfigProto() + config.gpu_options.per_process_gpu_memory_fraction = 0.2 + compute_graph = tf.Graph() + compute_graph.as_default() + sess = tf.Session(config=config) + with tf.gfile.GFile(model_path, 'rb') as fid: + graph_def = tf.GraphDef() + graph_def.ParseFromString(fid.read()) + tf.import_graph_def(graph_def, name='') + + return (compute_graph, sess) + + model = init_pb(pb_path) + + graph = model[0] + sess = model[1] + + return graph, sess diff --git a/modelscope/models/cv/cartoon/facelib/face_landmark.py b/modelscope/models/cv/cartoon/facelib/face_landmark.py new file mode 100644 index 00000000..063d40c3 --- /dev/null +++ b/modelscope/models/cv/cartoon/facelib/face_landmark.py @@ -0,0 +1,154 @@ +import cv2 +import numpy as np +import tensorflow as tf + +from .config import config as cfg + +if tf.__version__ >= '2.0': + tf = tf.compat.v1 + + +class FaceLandmark: + + def __init__(self, dir): + self.model_path = dir + '/keypoints.pb' + self.min_face = 60 + self.keypoint_num = cfg.KEYPOINTS.p_num * 2 + + self._graph = tf.Graph() + + with self._graph.as_default(): + + self._graph, self._sess = self.init_model(self.model_path) + self.img_input = tf.get_default_graph().get_tensor_by_name( + 'tower_0/images:0') + self.embeddings = tf.get_default_graph().get_tensor_by_name( + 'tower_0/prediction:0') + self.training = tf.get_default_graph().get_tensor_by_name( + 'training_flag:0') + + self.landmark = self.embeddings[:, :self.keypoint_num] + self.headpose = self.embeddings[:, -7:-4] * 90. 
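+        # Layout of the flattened prediction vector, as read off the slices
+        # used here (inferred from this code, not from model documentation):
+        #   [:keypoint_num]  landmark x/y coordinates,
+        #   [-7:-4]          head pose, rescaled by 90 (presumably to degrees),
+        #   [-4:]            state logits, squashed to [0, 1] below.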
+            self.state = tf.nn.sigmoid(self.embeddings[:, -4:])
+
+    def __call__(self, img, bboxes):
+        landmark_result = []
+        state_result = []
+        for i, bbox in enumerate(bboxes):
+            landmark, state = self._one_shot_run(img, bbox, i)
+            if landmark is not None:
+                landmark_result.append(landmark)
+                state_result.append(state)
+        return np.array(landmark_result), np.array(state_result)
+
+    def simple_run(self, cropped_img):
+        with self._graph.as_default():
+
+            cropped_img = np.expand_dims(cropped_img, axis=0)
+            landmark, p, states = self._sess.run(
+                [self.landmark, self.headpose, self.state],
+                feed_dict={
+                    self.img_input: cropped_img,
+                    self.training: False
+                })
+
+        return landmark, states
+
+    def _one_shot_run(self, image, bbox, i):
+
+        bbox_width = bbox[2] - bbox[0]
+        bbox_height = bbox[3] - bbox[1]
+        if (bbox_width <= self.min_face and bbox_height <= self.min_face):
+            return None, None
+        add = int(max(bbox_width, bbox_height))
+        bimg = cv2.copyMakeBorder(
+            image,
+            add,
+            add,
+            add,
+            add,
+            borderType=cv2.BORDER_CONSTANT,
+            value=cfg.DATA.pixel_means)
+        bbox += add
+
+        one_edge = (1 + 2 * cfg.KEYPOINTS.base_extend_range[0]) * bbox_width
+        center = [(bbox[0] + bbox[2]) // 2, (bbox[1] + bbox[3]) // 2]
+
+        bbox[0] = center[0] - one_edge // 2
+        bbox[1] = center[1] - one_edge // 2
+        bbox[2] = center[0] + one_edge // 2
+        bbox[3] = center[1] + one_edge // 2
+
+        bbox = bbox.astype(np.int32)
+        crop_image = bimg[bbox[1]:bbox[3], bbox[0]:bbox[2], :]
+        h, w, _ = crop_image.shape
+        crop_image = cv2.resize(
+            crop_image,
+            (cfg.KEYPOINTS.input_shape[1], cfg.KEYPOINTS.input_shape[0]))
+        crop_image = crop_image.astype(np.float32)
+
+        keypoints, state = self.simple_run(crop_image)
+
+        res = keypoints[0][:self.keypoint_num].reshape((-1, 2))
+        res[:, 0] = res[:, 0] * w / cfg.KEYPOINTS.input_shape[1]
+        res[:, 1] = res[:, 1] * h / cfg.KEYPOINTS.input_shape[0]
+
+        landmark = []
+        for _index in range(res.shape[0]):
+            x_y = res[_index]
+            landmark.append([
+                int(x_y[0] * cfg.KEYPOINTS.input_shape[0] + bbox[0] - add),
+                int(x_y[1] * cfg.KEYPOINTS.input_shape[1] + bbox[1] - add)
+            ])
+
+        landmark = np.array(landmark, np.float32)
+
+        return landmark, state
+
+    def init_model(self, *args):
+
+        if len(args) == 1:
+            use_pb = True
+            pb_path = args[0]
+        else:
+            use_pb = False
+            meta_path = args[0]
+            restore_model_path = args[1]
+
+        def ini_ckpt():
+            graph = tf.Graph()
+            graph.as_default()
+            configProto = tf.ConfigProto()
+            configProto.gpu_options.allow_growth = True
+            sess = tf.Session(config=configProto)
+            saver = tf.train.import_meta_graph(meta_path)
+            saver.restore(sess, restore_model_path)
+
+            print('Model restored!')
+            return (graph, sess)
+
+        def init_pb(model_path):
+            config = tf.ConfigProto()
+            config.gpu_options.per_process_gpu_memory_fraction = 0.2
+            compute_graph = tf.Graph()
+            compute_graph.as_default()
+            sess = tf.Session(config=config)
+            with tf.gfile.GFile(model_path, 'rb') as fid:
+                graph_def = tf.GraphDef()
+                graph_def.ParseFromString(fid.read())
+                tf.import_graph_def(graph_def, name='')
+
+            return (compute_graph, sess)
+
+        if use_pb:
+            model = init_pb(pb_path)
+        else:
+            model = ini_ckpt()
+
+        graph = model[0]
+        sess = model[1]
+
+        return graph, sess
diff --git a/modelscope/models/cv/cartoon/facelib/facer.py b/modelscope/models/cv/cartoon/facelib/facer.py
new file mode 100644
index 00000000..62388ab9
--- /dev/null
+++ b/modelscope/models/cv/cartoon/facelib/facer.py
@@ -0,0 +1,150 @@
+import time
+
+import cv2
+import numpy as np
+
+from .config import config as cfg
+from .face_detector import FaceDetector
+from .face_landmark import FaceLandmark
+from .LK.lk import GroupTrack
+
+
+class FaceAna:
+    '''
+    For speed, only the top cfg.DETECT.topk faces (sorted by area)
+    are detected and tracked.
+    '''
+
+    def __init__(self, model_dir):
+        self.face_detector = FaceDetector(model_dir)
+        self.face_landmark = FaceLandmark(model_dir)
+        self.trace = GroupTrack()
+
+        self.track_box = None
+        self.previous_image = None
+        self.previous_box = None
+
+        self.diff_thres = 5
+        self.top_k = cfg.DETECT.topk
+        self.iou_thres = cfg.TRACE.iou_thres
+        self.alpha = cfg.TRACE.smooth_box
+
+    def run(self, image):
+
+        boxes = self.face_detector(image)
+
+        if boxes.shape[0] > self.top_k:
+            boxes = self.sort(boxes)
+
+        boxes_return = np.array(boxes)
+        landmarks, states = self.face_landmark(image, boxes)
+
+        track = []
+        for i in range(landmarks.shape[0]):
+            track.append([
+                np.min(landmarks[i][:, 0]),
+                np.min(landmarks[i][:, 1]),
+                np.max(landmarks[i][:, 0]),
+                np.max(landmarks[i][:, 1])
+            ])
+        tmp_box = np.array(track)
+
+        self.track_box = self.judge_boxs(boxes_return, tmp_box)
+
+        self.track_box, landmarks = self.sort_res(self.track_box, landmarks)
+        return self.track_box, landmarks, states
+
+    def sort_res(self, bboxes, points):
+        area = []
+        for bbox in bboxes:
+            bbox_width = bbox[2] - bbox[0]
+            bbox_height = bbox[3] - bbox[1]
+            area.append(bbox_height * bbox_width)
+
+        area = np.array(area)
+        picked = area.argsort()[::-1]
+        sorted_bboxes = [bboxes[x] for x in picked]
+        sorted_points = [points[x] for x in picked]
+        return np.array(sorted_bboxes), np.array(sorted_points)
+
+    def diff_frames(self, previous_frame, image):
+        if previous_frame is None:
+            return True
+        else:
+            _diff = cv2.absdiff(previous_frame, image)
+            diff = np.sum(
+                _diff) / previous_frame.shape[0] / previous_frame.shape[1] / 3.
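+            # mean absolute per-pixel difference; frames closer than
+            # diff_thres are treated as near-duplicates for tracking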
+            return diff > self.diff_thres
+
+    def sort(self, bboxes):
+        if self.top_k > 100:
+            return bboxes
+        area = []
+        for bbox in bboxes:
+
+            bbox_width = bbox[2] - bbox[0]
+            bbox_height = bbox[3] - bbox[1]
+            area.append(bbox_height * bbox_width)
+
+        area = np.array(area)
+
+        picked = area.argsort()[-self.top_k:][::-1]
+        sorted_bboxes = [bboxes[x] for x in picked]
+        return np.array(sorted_bboxes)
+
+    def judge_boxs(self, previous_bboxs, now_bboxs):
+
+        def iou(rec1, rec2):
+
+            # area of each rectangle
+            S_rec1 = (rec1[2] - rec1[0]) * (rec1[3] - rec1[1])
+            S_rec2 = (rec2[2] - rec2[0]) * (rec2[3] - rec2[1])
+
+            # sum of the two areas
+            sum_area = S_rec1 + S_rec2
+
+            # edges of the intersection rectangle
+            x1 = max(rec1[0], rec2[0])
+            y1 = max(rec1[1], rec2[1])
+            x2 = min(rec1[2], rec2[2])
+            y2 = min(rec1[3], rec2[3])
+
+            # intersection area (zero when the rectangles do not overlap)
+            intersect = max(0, x2 - x1) * max(0, y2 - y1)
+
+            return intersect / (sum_area - intersect)
+
+        if previous_bboxs is None:
+            return now_bboxs
+
+        result = []
+
+        for i in range(now_bboxs.shape[0]):
+            contain = False
+            for j in range(previous_bboxs.shape[0]):
+                if iou(now_bboxs[i], previous_bboxs[j]) > self.iou_thres:
+                    result.append(
+                        self.smooth(now_bboxs[i], previous_bboxs[j]))
+                    contain = True
+                    break
+            if not contain:
+                result.append(now_bboxs[i])
+
+        return np.array(result)
+
+    def smooth(self, now_box, previous_box):
+
+        return self.do_moving_average(now_box[:4], previous_box[:4])
+
+    def do_moving_average(self, p_now, p_previous):
+        p = self.alpha * p_now + (1 - self.alpha) * p_previous
+        return p
+
+    def reset(self):
+        '''
+        Reset the previous info used for tracking.
+        :return:
+        '''
+        self.track_box = None
+        self.previous_image = None
+        self.previous_box = None
diff --git a/modelscope/models/cv/cartoon/mtcnn_pytorch/LICENSE b/modelscope/models/cv/cartoon/mtcnn_pytorch/LICENSE
new file mode 100644
index 00000000..9210f5b8
--- /dev/null
+++ b/modelscope/models/cv/cartoon/mtcnn_pytorch/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2017 Dan Antoshchenko
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
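The tracker in `facer.py` above is an IoU-gated exponential moving average: a fresh detection that overlaps its predecessor strongly enough is blended with it, otherwise it replaces it outright. A minimal, self-contained sketch of that update (the `alpha` and `iou_thres` defaults mirror `cfg.TRACE.smooth_box` and `cfg.TRACE.iou_thres`; the helper names are illustrative, not part of the library):

```python
import numpy as np


def iou(a, b):
    # intersection-over-union of two [x1, y1, x2, y2] boxes
    x1, y1 = max(a[0], b[0]), max(a[1], b[1])
    x2, y2 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0, x2 - x1) * max(0, y2 - y1)
    union = ((a[2] - a[0]) * (a[3] - a[1])
             + (b[2] - b[0]) * (b[3] - b[1]) - inter)
    return inter / union


def smooth_boxes(prev_boxes, new_boxes, alpha=0.3, iou_thres=0.5):
    # keep a fresh detection as-is unless it overlaps a previous box,
    # in which case blend the two with an exponential moving average
    if prev_boxes is None:
        return new_boxes
    out = []
    for box in new_boxes:
        match = next((p for p in prev_boxes if iou(box, p) > iou_thres), None)
        out.append(box if match is None
                   else alpha * box + (1 - alpha) * match)
    return np.array(out)


prev = np.array([[10., 10., 110., 110.]])
new = np.array([[14., 12., 114., 112.]])
print(smooth_boxes(prev, new))  # ~[[11.2, 10.6, 111.2, 110.6]]
```

The landmark smoother in `facelib/LK/lk.py` applies the same moving average per keypoint with `smooth_landmark = 0.95`, keeping the previous point outright when it moved less than the pixel threshold.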
diff --git a/modelscope/models/cv/cartoon/mtcnn_pytorch/README.md b/modelscope/models/cv/cartoon/mtcnn_pytorch/README.md new file mode 100644 index 00000000..b748cf58 --- /dev/null +++ b/modelscope/models/cv/cartoon/mtcnn_pytorch/README.md @@ -0,0 +1,26 @@ +# MTCNN + +`pytorch` implementation of **inference stage** of face detection algorithm described in +[Joint Face Detection and Alignment using Multi-task Cascaded Convolutional Networks](https://arxiv.org/abs/1604.02878). + +## Example +![example of a face detection](images/example.png) + +## How to use it +Just download the repository and then do this +```python +from src import detect_faces +from PIL import Image + +image = Image.open('image.jpg') +bounding_boxes, landmarks = detect_faces(image) +``` +For examples see `test_on_images.ipynb`. + +## Requirements +* pytorch 0.2 +* Pillow, numpy + +## Credit +This implementation is heavily inspired by: +* [pangyupo/mxnet_mtcnn_face_detection](https://github.com/pangyupo/mxnet_mtcnn_face_detection) diff --git a/maas_lib/pipelines/nlp/space/__init__.py b/modelscope/models/cv/cartoon/mtcnn_pytorch/__init__.py similarity index 100% rename from maas_lib/pipelines/nlp/space/__init__.py rename to modelscope/models/cv/cartoon/mtcnn_pytorch/__init__.py diff --git a/maas_lib/preprocessors/space/__init__.py b/modelscope/models/cv/cartoon/mtcnn_pytorch/src/__init__.py similarity index 100% rename from maas_lib/preprocessors/space/__init__.py rename to modelscope/models/cv/cartoon/mtcnn_pytorch/src/__init__.py diff --git a/modelscope/models/cv/cartoon/mtcnn_pytorch/src/align_trans.py b/modelscope/models/cv/cartoon/mtcnn_pytorch/src/align_trans.py new file mode 100644 index 00000000..baa3ba73 --- /dev/null +++ b/modelscope/models/cv/cartoon/mtcnn_pytorch/src/align_trans.py @@ -0,0 +1,187 @@ +""" +Created on Mon Apr 24 15:43:29 2017 +@author: zhaoy +""" +import cv2 +import numpy as np + +from .matlab_cp2tform import get_similarity_transform_for_cv2 + +# reference facial points, a list of coordinates (x,y) +dx = 1 +dy = 1 +REFERENCE_FACIAL_POINTS = [ + [30.29459953 + dx, 51.69630051 + dy], # left eye + [65.53179932 + dx, 51.50139999 + dy], # right eye + [48.02519989 + dx, 71.73660278 + dy], # nose + [33.54930115 + dx, 92.3655014 + dy], # left mouth + [62.72990036 + dx, 92.20410156 + dy] # right mouth +] + +DEFAULT_CROP_SIZE = (96, 112) + +global FACIAL_POINTS + + +class FaceWarpException(Exception): + + def __str__(self): + return 'In File {}:{}'.format(__file__, super.__str__(self)) + + +def get_reference_facial_points(output_size=None, + inner_padding_factor=0.0, + outer_padding=(0, 0), + default_square=False): + + tmp_5pts = np.array(REFERENCE_FACIAL_POINTS) + tmp_crop_size = np.array(DEFAULT_CROP_SIZE) + + # 0) make the inner region a square + if default_square: + size_diff = max(tmp_crop_size) - tmp_crop_size + tmp_5pts += size_diff / 2 + tmp_crop_size += size_diff + + h_crop = tmp_crop_size[0] + w_crop = tmp_crop_size[1] + if (output_size): + if (output_size[0] == h_crop and output_size[1] == w_crop): + return tmp_5pts + + if (inner_padding_factor == 0 and outer_padding == (0, 0)): + if output_size is None: + return tmp_5pts + else: + raise FaceWarpException( + 'No paddings to do, output_size must be None or {}'.format( + tmp_crop_size)) + + # check output size + if not (0 <= inner_padding_factor <= 1.0): + raise FaceWarpException('Not (0 <= inner_padding_factor <= 1.0)') + + factor = inner_padding_factor > 0 or outer_padding[0] > 0 + factor = factor or outer_padding[1] > 0 + if (factor and 
output_size is None): + output_size = tmp_crop_size * \ + (1 + inner_padding_factor * 2).astype(np.int32) + output_size += np.array(outer_padding) + + cond1 = outer_padding[0] < output_size[0] + cond2 = outer_padding[1] < output_size[1] + if not (cond1 and cond2): + raise FaceWarpException('Not (outer_padding[0] < output_size[0]' + 'and outer_padding[1] < output_size[1])') + + # 1) pad the inner region according inner_padding_factor + if inner_padding_factor > 0: + size_diff = tmp_crop_size * inner_padding_factor * 2 + tmp_5pts += size_diff / 2 + tmp_crop_size += np.round(size_diff).astype(np.int32) + + # 2) resize the padded inner region + size_bf_outer_pad = np.array(output_size) - np.array(outer_padding) * 2 + + if size_bf_outer_pad[0] * tmp_crop_size[1] != size_bf_outer_pad[ + 1] * tmp_crop_size[0]: + raise FaceWarpException( + 'Must have (output_size - outer_padding)' + '= some_scale * (crop_size * (1.0 + inner_padding_factor)') + + scale_factor = size_bf_outer_pad[0].astype(np.float32) / tmp_crop_size[0] + tmp_5pts = tmp_5pts * scale_factor + + # 3) add outer_padding to make output_size + reference_5point = tmp_5pts + np.array(outer_padding) + + return reference_5point + + +def get_affine_transform_matrix(src_pts, dst_pts): + + tfm = np.float32([[1, 0, 0], [0, 1, 0]]) + n_pts = src_pts.shape[0] + ones = np.ones((n_pts, 1), src_pts.dtype) + src_pts_ = np.hstack([src_pts, ones]) + dst_pts_ = np.hstack([dst_pts, ones]) + + A, res, rank, s = np.linalg.lstsq(src_pts_, dst_pts_) + + if rank == 3: + tfm = np.float32([[A[0, 0], A[1, 0], A[2, 0]], + [A[0, 1], A[1, 1], A[2, 1]]]) + elif rank == 2: + tfm = np.float32([[A[0, 0], A[1, 0], 0], [A[0, 1], A[1, 1], 0]]) + + return tfm + + +def warp_and_crop_face(src_img, + facial_pts, + ratio=0.84, + reference_pts=None, + crop_size=(96, 112), + align_type='similarity' + '', + return_trans_inv=False): + + if reference_pts is None: + if crop_size[0] == 96 and crop_size[1] == 112: + reference_pts = REFERENCE_FACIAL_POINTS + else: + default_square = False + inner_padding_factor = 0 + outer_padding = (0, 0) + output_size = crop_size + + reference_pts = get_reference_facial_points( + output_size, inner_padding_factor, outer_padding, + default_square) + + ref_pts = np.float32(reference_pts) + + factor = ratio + ref_pts = (ref_pts - 112 / 2) * factor + 112 / 2 + ref_pts *= crop_size[0] / 112. 
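+    # the reference points live on a 96x112 template: pull them towards the
+    # template centre by `ratio`, then rescale to the requested crop width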
+ + ref_pts_shp = ref_pts.shape + if max(ref_pts_shp) < 3 or min(ref_pts_shp) != 2: + raise FaceWarpException( + 'reference_pts.shape must be (K,2) or (2,K) and K>2') + + if ref_pts_shp[0] == 2: + ref_pts = ref_pts.T + + src_pts = np.float32(facial_pts) + src_pts_shp = src_pts.shape + if max(src_pts_shp) < 3 or min(src_pts_shp) != 2: + raise FaceWarpException( + 'facial_pts.shape must be (K,2) or (2,K) and K>2') + + if src_pts_shp[0] == 2: + src_pts = src_pts.T + + if src_pts.shape != ref_pts.shape: + raise FaceWarpException( + 'facial_pts and reference_pts must have the same shape') + + if align_type == 'cv2_affine': + tfm = cv2.getAffineTransform(src_pts, ref_pts) + tfm_inv = cv2.getAffineTransform(ref_pts, src_pts) + + elif align_type == 'affine': + tfm = get_affine_transform_matrix(src_pts, ref_pts) + tfm_inv = get_affine_transform_matrix(ref_pts, src_pts) + else: + tfm, tfm_inv = get_similarity_transform_for_cv2(src_pts, ref_pts) + + face_img = cv2.warpAffine( + src_img, + tfm, (crop_size[0], crop_size[1]), + borderValue=(255, 255, 255)) + + if return_trans_inv: + return face_img, tfm_inv + else: + return face_img diff --git a/modelscope/models/cv/cartoon/mtcnn_pytorch/src/matlab_cp2tform.py b/modelscope/models/cv/cartoon/mtcnn_pytorch/src/matlab_cp2tform.py new file mode 100644 index 00000000..96a5f965 --- /dev/null +++ b/modelscope/models/cv/cartoon/mtcnn_pytorch/src/matlab_cp2tform.py @@ -0,0 +1,339 @@ +""" +Created on Tue Jul 11 06:54:28 2017 + +@author: zhaoyafei +""" + +import numpy as np +from numpy.linalg import inv, lstsq +from numpy.linalg import matrix_rank as rank +from numpy.linalg import norm + + +class MatlabCp2tormException(Exception): + + def __str__(self): + return 'In File {}:{}'.format(__file__, super.__str__(self)) + + +def tformfwd(trans, uv): + """ + Function: + ---------- + apply affine transform 'trans' to uv + + Parameters: + ---------- + @trans: 3x3 np.array + transform matrix + @uv: Kx2 np.array + each row is a pair of coordinates (x, y) + + Returns: + ---------- + @xy: Kx2 np.array + each row is a pair of transformed coordinates (x, y) + """ + uv = np.hstack((uv, np.ones((uv.shape[0], 1)))) + xy = np.dot(uv, trans) + xy = xy[:, 0:-1] + return xy + + +def tforminv(trans, uv): + """ + Function: + ---------- + apply the inverse of affine transform 'trans' to uv + + Parameters: + ---------- + @trans: 3x3 np.array + transform matrix + @uv: Kx2 np.array + each row is a pair of coordinates (x, y) + + Returns: + ---------- + @xy: Kx2 np.array + each row is a pair of inverse-transformed coordinates (x, y) + """ + Tinv = inv(trans) + xy = tformfwd(Tinv, uv) + return xy + + +def findNonreflectiveSimilarity(uv, xy, options=None): + + options = {'K': 2} + + K = options['K'] + M = xy.shape[0] + x = xy[:, 0].reshape((-1, 1)) # use reshape to keep a column vector + y = xy[:, 1].reshape((-1, 1)) # use reshape to keep a column vector + # print('--->x, y:\n', x, y + + tmp1 = np.hstack((x, y, np.ones((M, 1)), np.zeros((M, 1)))) + tmp2 = np.hstack((y, -x, np.zeros((M, 1)), np.ones((M, 1)))) + X = np.vstack((tmp1, tmp2)) + # print('--->X.shape: ', X.shape + # print('X:\n', X + + u = uv[:, 0].reshape((-1, 1)) # use reshape to keep a column vector + v = uv[:, 1].reshape((-1, 1)) # use reshape to keep a column vector + U = np.vstack((u, v)) + # print('--->U.shape: ', U.shape + # print('U:\n', U + + # We know that X * r = U + if rank(X) >= 2 * K: + r, _, _, _ = lstsq(X, U) + r = np.squeeze(r) + else: + raise Exception('cp2tform:twoUniquePointsReq') + + # print('--->r:\n', r + + sc 
= r[0] + ss = r[1] + tx = r[2] + ty = r[3] + + Tinv = np.array([[sc, -ss, 0], [ss, sc, 0], [tx, ty, 1]]) + + # print('--->Tinv:\n', Tinv + + T = inv(Tinv) + # print('--->T:\n', T + + T[:, 2] = np.array([0, 0, 1]) + + return T, Tinv + + +def findSimilarity(uv, xy, options=None): + + options = {'K': 2} + + # uv = np.array(uv) + # xy = np.array(xy) + + # Solve for trans1 + trans1, trans1_inv = findNonreflectiveSimilarity(uv, xy, options) + + # Solve for trans2 + + # manually reflect the xy data across the Y-axis + xyR = xy + xyR[:, 0] = -1 * xyR[:, 0] + + trans2r, trans2r_inv = findNonreflectiveSimilarity(uv, xyR, options) + + # manually reflect the tform to undo the reflection done on xyR + TreflectY = np.array([[-1, 0, 0], [0, 1, 0], [0, 0, 1]]) + + trans2 = np.dot(trans2r, TreflectY) + + # Figure out if trans1 or trans2 is better + xy1 = tformfwd(trans1, uv) + norm1 = norm(xy1 - xy) + + xy2 = tformfwd(trans2, uv) + norm2 = norm(xy2 - xy) + + if norm1 <= norm2: + return trans1, trans1_inv + else: + trans2_inv = inv(trans2) + return trans2, trans2_inv + + +def get_similarity_transform(src_pts, dst_pts, reflective=True): + """ + Function: + ---------- + Find Similarity Transform Matrix 'trans': + u = src_pts[:, 0] + v = src_pts[:, 1] + x = dst_pts[:, 0] + y = dst_pts[:, 1] + [x, y, 1] = [u, v, 1] * trans + + Parameters: + ---------- + @src_pts: Kx2 np.array + source points, each row is a pair of coordinates (x, y) + @dst_pts: Kx2 np.array + destination points, each row is a pair of transformed + coordinates (x, y) + @reflective: True or False + if True: + use reflective similarity transform + else: + use non-reflective similarity transform + + Returns: + ---------- + @trans: 3x3 np.array + transform matrix from uv to xy + trans_inv: 3x3 np.array + inverse of trans, transform matrix from xy to uv + """ + + if reflective: + trans, trans_inv = findSimilarity(src_pts, dst_pts) + else: + trans, trans_inv = findNonreflectiveSimilarity(src_pts, dst_pts) + + return trans, trans_inv + + +def cvt_tform_mat_for_cv2(trans): + """ + Function: + ---------- + Convert Transform Matrix 'trans' into 'cv2_trans' which could be + directly used by cv2.warpAffine(): + u = src_pts[:, 0] + v = src_pts[:, 1] + x = dst_pts[:, 0] + y = dst_pts[:, 1] + [x, y].T = cv_trans * [u, v, 1].T + + Parameters: + ---------- + @trans: 3x3 np.array + transform matrix from uv to xy + + Returns: + ---------- + @cv2_trans: 2x3 np.array + transform matrix from src_pts to dst_pts, could be directly used + for cv2.warpAffine() + """ + cv2_trans = trans[:, 0:2].T + + return cv2_trans + + +def get_similarity_transform_for_cv2(src_pts, dst_pts, reflective=True): + """ + Function: + ---------- + Find Similarity Transform Matrix 'cv2_trans' which could be + directly used by cv2.warpAffine(): + u = src_pts[:, 0] + v = src_pts[:, 1] + x = dst_pts[:, 0] + y = dst_pts[:, 1] + [x, y].T = cv_trans * [u, v, 1].T + + Parameters: + ---------- + @src_pts: Kx2 np.array + source points, each row is a pair of coordinates (x, y) + @dst_pts: Kx2 np.array + destination points, each row is a pair of transformed + coordinates (x, y) + reflective: True or False + if True: + use reflective similarity transform + else: + use non-reflective similarity transform + + Returns: + ---------- + @cv2_trans: 2x3 np.array + transform matrix from src_pts to dst_pts, could be directly used + for cv2.warpAffine() + """ + trans, trans_inv = get_similarity_transform(src_pts, dst_pts, reflective) + cv2_trans = cvt_tform_mat_for_cv2(trans) + cv2_trans_inv = 
cvt_tform_mat_for_cv2(trans_inv)
+
+    return cv2_trans, cv2_trans_inv
+
+
+if __name__ == '__main__':
+    """
+    u = [0, 6, -2]
+    v = [0, 3, 5]
+    x = [-1, 0, 4]
+    y = [-1, -10, 4]
+
+    # In Matlab, run:
+    #
+    #   uv = [u'; v'];
+    #   xy = [x'; y'];
+    #   tform_sim = cp2tform(uv, xy, 'similarity');
+    #
+    #   trans = tform_sim.tdata.T
+    #   ans =
+    #       -0.0764   -1.6190         0
+    #        1.6190   -0.0764         0
+    #       -3.2156    0.0290    1.0000
+    #   trans_inv = tform_sim.tdata.Tinv
+    #   ans =
+    #       -0.0291    0.6163         0
+    #       -0.6163   -0.0291         0
+    #       -0.0756    1.9826    1.0000
+    #   xy_m = tformfwd(tform_sim, u, v)
+    #   xy_m =
+    #       -3.2156    0.0290
+    #        1.1833   -9.9143
+    #        5.0323    2.8853
+    #   uv_m = tforminv(tform_sim, x, y)
+    #   uv_m =
+    #        0.5698    1.3953
+    #        6.0872    2.2733
+    #       -2.6570    4.3314
+    """
+    u = [0, 6, -2]
+    v = [0, 3, 5]
+    x = [-1, 0, 4]
+    y = [-1, -10, 4]
+
+    uv = np.array((u, v)).T
+    xy = np.array((x, y)).T
+
+    print('\n--->uv:')
+    print(uv)
+    print('\n--->xy:')
+    print(xy)
+
+    trans, trans_inv = get_similarity_transform(uv, xy)
+
+    print('\n--->trans matrix:')
+    print(trans)
+
+    print('\n--->trans_inv matrix:')
+    print(trans_inv)
+
+    print('\n---> apply transform to uv')
+    print('\nxy_m = uv_augmented * trans')
+    uv_aug = np.hstack((uv, np.ones((uv.shape[0], 1))))
+    xy_m = np.dot(uv_aug, trans)
+    print(xy_m)
+
+    print('\nxy_m = tformfwd(trans, uv)')
+    xy_m = tformfwd(trans, uv)
+    print(xy_m)
+
+    print('\n---> apply inverse transform to xy')
+    print('\nuv_m = xy_augmented * trans_inv')
+    xy_aug = np.hstack((xy, np.ones((xy.shape[0], 1))))
+    uv_m = np.dot(xy_aug, trans_inv)
+    print(uv_m)
+
+    print('\nuv_m = tformfwd(trans_inv, xy)')
+    uv_m = tformfwd(trans_inv, xy)
+    print(uv_m)
+
+    uv_m = tforminv(trans, xy)
+    print('\nuv_m = tforminv(trans, xy)')
+    print(uv_m)
diff --git a/modelscope/models/cv/cartoon/utils.py b/modelscope/models/cv/cartoon/utils.py
new file mode 100644
index 00000000..39712653
--- /dev/null
+++ b/modelscope/models/cv/cartoon/utils.py
@@ -0,0 +1,91 @@
+import os
+
+import cv2
+import numpy as np
+
+
+def resize_size(image, size=720):
+    h, w, c = np.shape(image)
+    if min(h, w) > size:
+        if h > w:
+            h, w = int(size * h / w), size
+        else:
+            h, w = size, int(size * w / h)
+    image = cv2.resize(image, (w, h), interpolation=cv2.INTER_AREA)
+    return image
+
+
+def padTo16x(image):
+    h, w, c = np.shape(image)
+    if h % 16 == 0 and w % 16 == 0:
+        return image, h, w
+    nh, nw = (h // 16 + 1) * 16, (w // 16 + 1) * 16
+    img_new = np.ones((nh, nw, 3), np.uint8) * 255
+    img_new[:h, :w, :] = image
+
+    return img_new, h, w
+
+
+def get_f5p(landmarks, np_img):
+    eye_left = find_pupil(landmarks[36:41], np_img)
+    eye_right = find_pupil(landmarks[42:47], np_img)
+    if eye_left is None or eye_right is None:
+        print('cannot find 5 points with find_pupil, using mean instead!')
+        eye_left = landmarks[36:41].mean(axis=0)
+        eye_right = landmarks[42:47].mean(axis=0)
+    nose = landmarks[30]
+    mouth_left = landmarks[48]
+    mouth_right = landmarks[54]
+    f5p = [[eye_left[0], eye_left[1]], [eye_right[0], eye_right[1]],
+           [nose[0], nose[1]], [mouth_left[0], mouth_left[1]],
+           [mouth_right[0], mouth_right[1]]]
+    return f5p
+
+
+def find_pupil(landmarks, np_img):
+    h, w, _ = np_img.shape
+    xmax = int(landmarks[:, 0].max())
+    xmin = int(landmarks[:, 0].min())
+    ymax = int(landmarks[:, 1].max())
+    ymin = int(landmarks[:, 1].min())
+
+    if ymin >= ymax or xmin >= xmax or ymin < 0 or xmin < 0 or ymax > h or xmax > w:
+        return None
+    eye_img_bgr = np_img[ymin:ymax, xmin:xmax, :]
+    eye_img = cv2.cvtColor(eye_img_bgr, cv2.COLOR_BGR2GRAY)
+    eye_img = cv2.equalizeHist(eye_img)
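+    # mask the equalised patch to the eye contour so the Otsu threshold
+    # below only sees pixels inside the eye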
n_marks = landmarks - np.array([xmin, ymin]).reshape([1, 2]) + eye_mask = cv2.fillConvexPoly( + np.zeros_like(eye_img), n_marks.astype(np.int32), 1) + ret, thresh = cv2.threshold(eye_img, 100, 255, + cv2.THRESH_BINARY | cv2.THRESH_OTSU) + thresh = (1 - thresh / 255.) * eye_mask + cnt = 0 + xm = [] + ym = [] + for i in range(thresh.shape[0]): + for j in range(thresh.shape[1]): + if thresh[i, j] > 0.5: + xm.append(j) + ym.append(i) + cnt += 1 + if cnt != 0: + xm.sort() + ym.sort() + xm = xm[cnt // 2] + ym = ym[cnt // 2] + else: + xm = thresh.shape[1] / 2 + ym = thresh.shape[0] / 2 + + return xm + xmin, ym + ymin + + +def all_file(file_dir): + L = [] + for root, dirs, files in os.walk(file_dir): + for file in files: + extend = os.path.splitext(file)[1] + if extend == '.png' or extend == '.jpg' or extend == '.jpeg': + L.append(os.path.join(root, file)) + return L diff --git a/maas_lib/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py similarity index 52% rename from maas_lib/models/nlp/__init__.py rename to modelscope/models/nlp/__init__.py index 99b56c17..c3baab15 100644 --- a/maas_lib/models/nlp/__init__.py +++ b/modelscope/models/nlp/__init__.py @@ -1,3 +1,4 @@ from .sequence_classification_model import * # noqa F403 from .space.dialog_generation_model import * # noqa F403 -from .space.dialog_intent_model import * +from .space.dialog_intent_model import * # noqa F403 +from .text_generation_model import * # noqa F403 diff --git a/maas_lib/models/nlp/sequence_classification_model.py b/modelscope/models/nlp/sequence_classification_model.py similarity index 90% rename from maas_lib/models/nlp/sequence_classification_model.py rename to modelscope/models/nlp/sequence_classification_model.py index d29587a0..6ced7a4e 100644 --- a/maas_lib/models/nlp/sequence_classification_model.py +++ b/modelscope/models/nlp/sequence_classification_model.py @@ -1,17 +1,17 @@ -from typing import Any, Dict, Optional, Union +from typing import Any, Dict import numpy as np -from maas_lib.utils.constant import Tasks +from modelscope.utils.constant import Tasks from ..base import Model from ..builder import MODELS -__all__ = ['SequenceClassificationModel'] +__all__ = ['BertForSequenceClassification'] @MODELS.register_module( Tasks.text_classification, module_name=r'bert-sentiment-analysis') -class SequenceClassificationModel(Model): +class BertForSequenceClassification(Model): def __init__(self, model_dir: str, *args, **kwargs): # Model.__init__(self, model_dir, model_cls, first_sequence, *args, **kwargs) diff --git a/maas_lib/trainers/nlp/space/__init__.py b/modelscope/models/nlp/space/__init__.py similarity index 100% rename from maas_lib/trainers/nlp/space/__init__.py rename to modelscope/models/nlp/space/__init__.py diff --git a/maas_lib/models/nlp/space/dialog_generation_model.py b/modelscope/models/nlp/space/dialog_generation_model.py similarity index 89% rename from maas_lib/models/nlp/space/dialog_generation_model.py rename to modelscope/models/nlp/space/dialog_generation_model.py index be3d7261..db8c40e0 100644 --- a/maas_lib/models/nlp/space/dialog_generation_model.py +++ b/modelscope/models/nlp/space/dialog_generation_model.py @@ -1,7 +1,7 @@ from typing import Any, Dict, Optional -from maas_lib.trainers.nlp.space.trainers.gen_trainer import MultiWOZTrainer -from maas_lib.utils.constant import Tasks +from modelscope.trainers.nlp.space.trainers.gen_trainer import MultiWOZTrainer +from modelscope.utils.constant import Tasks from ...base import Model, Tensor from ...builder import MODELS from 
.model.generator import Generator @@ -68,13 +68,13 @@ class DialogGenerationModel(Model): from numpy import array, float32 import torch - turn_1 = { - 'user': [ - 13, 1045, 2052, 2066, 1037, 10095, 2013, 3002, 2198, 1005, - 1055, 2267, 2000, 10733, 12570, 21713, 4487, 15474, 1012, 7 - ] - } - old_pv_turn_1 = {} + # turn_1 = { + # 'user': [ + # 13, 1045, 2052, 2066, 1037, 10095, 2013, 3002, 2198, 1005, + # 1055, 2267, 2000, 10733, 12570, 21713, 4487, 15474, 1012, 7 + # ] + # } + # old_pv_turn_1 = {} turn_2 = { 'user': diff --git a/maas_lib/models/nlp/space/dialog_intent_model.py b/modelscope/models/nlp/space/dialog_intent_model.py similarity index 94% rename from maas_lib/models/nlp/space/dialog_intent_model.py rename to modelscope/models/nlp/space/dialog_intent_model.py index 747f6a20..eb8b3918 100644 --- a/maas_lib/models/nlp/space/dialog_intent_model.py +++ b/modelscope/models/nlp/space/dialog_intent_model.py @@ -1,7 +1,7 @@ from typing import Any, Dict, Optional -from maas_lib.trainers.nlp.space.trainers.intent_trainer import IntentTrainer -from maas_lib.utils.constant import Tasks +from modelscope.trainers.nlp.space.trainers.intent_trainer import IntentTrainer +from modelscope.utils.constant import Tasks from ...base import Model, Tensor from ...builder import MODELS from .model.generator import Generator diff --git a/maas_lib/models/nlp/space/model/__init__.py b/modelscope/models/nlp/space/model/__init__.py similarity index 100% rename from maas_lib/models/nlp/space/model/__init__.py rename to modelscope/models/nlp/space/model/__init__.py diff --git a/maas_lib/models/nlp/space/model/gen_unified_transformer.py b/modelscope/models/nlp/space/model/gen_unified_transformer.py similarity index 99% rename from maas_lib/models/nlp/space/model/gen_unified_transformer.py rename to modelscope/models/nlp/space/model/gen_unified_transformer.py index 2ea68bd1..611d627f 100644 --- a/maas_lib/models/nlp/space/model/gen_unified_transformer.py +++ b/modelscope/models/nlp/space/model/gen_unified_transformer.py @@ -3,7 +3,7 @@ IntentUnifiedTransformer """ import torch -from maas_lib.models.nlp.space.model.unified_transformer import \ +from modelscope.models.nlp.space.model.unified_transformer import \ UnifiedTransformer diff --git a/maas_lib/models/nlp/space/model/generator.py b/modelscope/models/nlp/space/model/generator.py similarity index 100% rename from maas_lib/models/nlp/space/model/generator.py rename to modelscope/models/nlp/space/model/generator.py diff --git a/maas_lib/models/nlp/space/model/intent_unified_transformer.py b/modelscope/models/nlp/space/model/intent_unified_transformer.py similarity index 99% rename from maas_lib/models/nlp/space/model/intent_unified_transformer.py rename to modelscope/models/nlp/space/model/intent_unified_transformer.py index dd63df39..e1302c6f 100644 --- a/maas_lib/models/nlp/space/model/intent_unified_transformer.py +++ b/modelscope/models/nlp/space/model/intent_unified_transformer.py @@ -5,7 +5,7 @@ import torch import torch.nn as nn import torch.nn.functional as F -from maas_lib.utils.nlp.space.criterions import compute_kl_loss +from modelscope.utils.nlp.space.criterions import compute_kl_loss from .unified_transformer import UnifiedTransformer diff --git a/maas_lib/models/nlp/space/model/model_base.py b/modelscope/models/nlp/space/model/model_base.py similarity index 100% rename from maas_lib/models/nlp/space/model/model_base.py rename to modelscope/models/nlp/space/model/model_base.py diff --git a/maas_lib/models/nlp/space/model/unified_transformer.py 
b/modelscope/models/nlp/space/model/unified_transformer.py similarity index 98% rename from maas_lib/models/nlp/space/model/unified_transformer.py rename to modelscope/models/nlp/space/model/unified_transformer.py index 53e03c69..53a18979 100644 --- a/maas_lib/models/nlp/space/model/unified_transformer.py +++ b/modelscope/models/nlp/space/model/unified_transformer.py @@ -7,9 +7,9 @@ import torch import torch.nn as nn import torch.nn.functional as F -from maas_lib.models.nlp.space.model.model_base import ModelBase -from maas_lib.models.nlp.space.modules.embedder import Embedder -from maas_lib.models.nlp.space.modules.transformer_block import \ +from modelscope.models.nlp.space.model.model_base import ModelBase +from modelscope.models.nlp.space.modules.embedder import Embedder +from modelscope.models.nlp.space.modules.transformer_block import \ TransformerBlock @@ -171,7 +171,7 @@ class UnifiedTransformer(ModelBase): batch_size = mask1.shape[0] seq_len1 = mask1.shape[1] seq_len2 = mask2.shape[1] - seq_len = seq_len1 + seq_len2 + # seq_len = seq_len1 + seq_len2 mask_lu = mask1 mask_ru = torch.ones(batch_size, seq_len1, seq_len2) diff --git a/maas_lib/trainers/nlp/space/metrics/__init__.py b/modelscope/models/nlp/space/modules/__init__.py similarity index 100% rename from maas_lib/trainers/nlp/space/metrics/__init__.py rename to modelscope/models/nlp/space/modules/__init__.py diff --git a/maas_lib/models/nlp/space/modules/embedder.py b/modelscope/models/nlp/space/modules/embedder.py similarity index 100% rename from maas_lib/models/nlp/space/modules/embedder.py rename to modelscope/models/nlp/space/modules/embedder.py diff --git a/maas_lib/models/nlp/space/modules/feedforward.py b/modelscope/models/nlp/space/modules/feedforward.py similarity index 100% rename from maas_lib/models/nlp/space/modules/feedforward.py rename to modelscope/models/nlp/space/modules/feedforward.py diff --git a/maas_lib/models/nlp/space/modules/functions.py b/modelscope/models/nlp/space/modules/functions.py similarity index 100% rename from maas_lib/models/nlp/space/modules/functions.py rename to modelscope/models/nlp/space/modules/functions.py diff --git a/maas_lib/models/nlp/space/modules/multihead_attention.py b/modelscope/models/nlp/space/modules/multihead_attention.py similarity index 100% rename from maas_lib/models/nlp/space/modules/multihead_attention.py rename to modelscope/models/nlp/space/modules/multihead_attention.py diff --git a/maas_lib/models/nlp/space/modules/transformer_block.py b/modelscope/models/nlp/space/modules/transformer_block.py similarity index 92% rename from maas_lib/models/nlp/space/modules/transformer_block.py rename to modelscope/models/nlp/space/modules/transformer_block.py index daa7d723..1a0565d6 100644 --- a/maas_lib/models/nlp/space/modules/transformer_block.py +++ b/modelscope/models/nlp/space/modules/transformer_block.py @@ -5,8 +5,8 @@ TransformerBlock class. 
 import torch
 import torch.nn as nn

-from maas_lib.models.nlp.space.modules.feedforward import FeedForward
-from maas_lib.models.nlp.space.modules.multihead_attention import \
+from modelscope.models.nlp.space.modules.feedforward import FeedForward
+from modelscope.models.nlp.space.modules.multihead_attention import \
     MultiheadAttention
diff --git a/modelscope/models/nlp/text_generation_model.py b/modelscope/models/nlp/text_generation_model.py
new file mode 100644
index 00000000..ebefc8d1
--- /dev/null
+++ b/modelscope/models/nlp/text_generation_model.py
@@ -0,0 +1,52 @@
+from typing import Any, Dict
+
+from modelscope.utils.constant import Tasks
+from ..base import Model, Tensor
+from ..builder import MODELS
+
+__all__ = ['PalmForTextGenerationModel']
+
+
+@MODELS.register_module(Tasks.text_generation, module_name=r'palm')
+class PalmForTextGenerationModel(Model):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """Initialize the text generation model from the `model_dir` path.
+
+        Args:
+            model_dir (str): the model path.
+            tokenizer (optional): may be passed via kwargs to override the
+                tokenizer loaded from `model_dir`.
+        """
+        from sofa import PalmTokenizer
+
+        super().__init__(model_dir, *args, **kwargs)
+        self.model_dir = model_dir
+
+        from sofa.models.palm import PalmForConditionalGeneration, TextGenerator
+        tokenizer = kwargs.pop('tokenizer',
+                               PalmTokenizer.from_pretrained(model_dir))
+        model = PalmForConditionalGeneration.from_pretrained(model_dir)
+        self.generator = TextGenerator(model, tokenizer)
+
+    def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
+        """return the generation result produced by the model
+
+        Args:
+            input (Dict[str, Tensor]): the preprocessed data
+
+        Returns:
+            Dict[str, Tensor]: the output of the wrapped `TextGenerator`,
+            i.e. the sequences generated from the encoder inputs.
+        """
+
+        encoder_inputs = [
+            input['input_ids'], input['token_type_ids'],
+            input['attention_mask']
+        ]
+        return self.generator(encoder_inputs)
diff --git a/maas_lib/pipelines/__init__.py b/modelscope/pipelines/__init__.py
similarity index 100%
rename from maas_lib/pipelines/__init__.py
rename to modelscope/pipelines/__init__.py
diff --git a/maas_lib/trainers/nlp/space/trainers/__init__.py b/modelscope/pipelines/audio/__init__.py
similarity index 100%
rename from maas_lib/trainers/nlp/space/trainers/__init__.py
rename to modelscope/pipelines/audio/__init__.py
diff --git a/maas_lib/pipelines/base.py b/modelscope/pipelines/base.py
similarity index 56%
rename from maas_lib/pipelines/base.py
rename to modelscope/pipelines/base.py
index c27bc58f..41a80896 100644
--- a/maas_lib/pipelines/base.py
+++ b/modelscope/pipelines/base.py
@@ -2,67 +2,86 @@ import os.path as osp
 from abc import ABC, abstractmethod
-from multiprocessing.sharedctypes import Value
-from typing import Any, Dict, Generator, List, Tuple, Union
+from typing import Any, Dict, Generator, List, Union

-from ali_maas_datasets import PyDataset
 from maas_hub.snapshot_download import snapshot_download

-from maas_lib.models import Model
-from maas_lib.preprocessors import Preprocessor
-from maas_lib.utils.config import Config
-from maas_lib.utils.constant import CONFIGFILE
+from modelscope.models.base import Model
+from modelscope.preprocessors import Preprocessor
+from modelscope.pydatasets import PyDataset
+from modelscope.utils.config import Config
+from
modelscope.utils.hub import get_model_cache_dir
+from modelscope.utils.logger import get_logger
 from .util import is_model_name

 Tensor = Union['torch.Tensor', 'tf.Tensor']
 Input = Union[str, PyDataset, 'PIL.Image.Image', 'numpy.ndarray']
+InputModel = Union[str, Model]

 output_keys = [
 ]  # for each task's pipeline, define standardized output keys that feed postprocess and that also normalize the keys postprocess returns

+logger = get_logger()
+

 class Pipeline(ABC):

+    def initiate_single_model(self, model):
+        logger.info(f'initiate model from {model}')
+        # TODO @wenmeng.zwm replace model.startswith('damo/') with get_model
+        if isinstance(model, str) and model.startswith('damo/'):
+            if not osp.exists(model):
+                cache_path = get_model_cache_dir(model)
+                model = cache_path if osp.exists(
+                    cache_path) else snapshot_download(model)
+            return Model.from_pretrained(model) if is_model_name(
+                model) else model
+        elif isinstance(model, Model):
+            return model
+        else:
+            if model and not isinstance(model, str):
+                raise ValueError(
+                    f'model type for single model is either str or Model, but got type {type(model)}'
+                )
+            return model
+
+    def initiate_multiple_models(self, input_models: List[InputModel]):
+        models = []
+        for model in input_models:
+            models.append(self.initiate_single_model(model))
+        return models
+
     def __init__(self,
                  config_file: str = None,
-                 model: Union[Model, str] = None,
-                 preprocessor: Preprocessor = None,
+                 model: Union[InputModel, List[InputModel]] = None,
+                 preprocessor: Union[Preprocessor, List[Preprocessor]] = None,
                  **kwargs):
         """ Base class for pipeline.

         If config_file is provided, model and preprocessor will be
-        instantiated from corresponding config. Otherwise model
+        instantiated from corresponding config. Otherwise, model
         and preprocessor will be constructed separately.

         Args:
             config_file(str, optional): Filepath to configuration file.
-            model: Model name or model object
-            preprocessor: Preprocessor object
+            model: (list of) Model name or model object
+            preprocessor: (list of) Preprocessor object
         """
         if config_file is not None:
             self.cfg = Config.from_file(config_file)
-
-        if isinstance(model, str):
-            if not osp.exists(model):
-                model = snapshot_download(model)
-
-            if is_model_name(model):
-                self.model = Model.from_pretrained(model)
-            else:
-                self.model = model
-        elif isinstance(model, Model):
-            self.model = model
+        if not isinstance(model, List):
+            self.model = self.initiate_single_model(model)
+            self.models = [self.model]
         else:
-            if model:
-                raise ValueError(
-                    f'model type is either str or Model, but got type {type(model)}'
-                )
+            self.models = self.initiate_multiple_models(model)
+
+        self.has_multiple_models = len(self.models) > 1
         self.preprocessor = preprocessor

     def __call__(self, input: Union[Input, List[Input]], *args,
                  **post_kwargs) -> Union[Dict[str, Any], Generator]:
-        # moodel provider should leave it as it is
-        # maas library developer will handle this function
+        # model provider should leave it as it is
+        # modelscope library developer will handle this function

         # simple showcase, need to support iterator type for both tensorflow and pytorch
         # input_dict = self._handle_input(input)
@@ -91,15 +110,17 @@ class Pipeline(ABC):
     def preprocess(self, inputs: Input) -> Dict[str, Any]:
         """ Provide default implementation based on preprocess_cfg and user can reimplement it
-
         """
         assert self.preprocessor is not None, 'preprocess method should be implemented'
+        assert not isinstance(self.preprocessor, List),\
+            'default implementation does not support using multiple preprocessors.'
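+        # single-preprocessor fast path; pipelines that wrap several models
+        # or preprocessors are expected to override preprocess()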
return self.preprocessor(inputs) def forward(self, inputs: Dict[str, Any]) -> Dict[str, Any]: """ Provide default implementation using self.model and user can reimplement it """ assert self.model is not None, 'forward method should be implemented' + assert not self.has_multiple_models, 'default implementation does not support multiple models in a pipeline.' return self.model(inputs) @abstractmethod diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py new file mode 100644 index 00000000..6495a5db --- /dev/null +++ b/modelscope/pipelines/builder.py @@ -0,0 +1,171 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import os.path as osp +from typing import List, Union + +import json +from maas_hub.file_download import model_file_download + +from modelscope.models.base import Model +from modelscope.utils.config import Config, ConfigDict +from modelscope.utils.constant import CONFIGFILE, Tasks +from modelscope.utils.registry import Registry, build_from_cfg +from .base import Pipeline +from .util import is_model_name + +PIPELINES = Registry('pipelines') + +DEFAULT_MODEL_FOR_PIPELINE = { + # TaskName: (pipeline_module_name, model_repo) + Tasks.image_matting: ('image-matting', 'damo/image-matting-person'), + Tasks.text_classification: + ('bert-sentiment-analysis', 'damo/bert-base-sst2'), + Tasks.text_generation: ('palm', 'damo/nlp_palm_text-generation_chinese'), + Tasks.image_captioning: ('ofa', None), + Tasks.image_generation: + ('person-image-cartoon', + 'damo/cv_unet_person-image-cartoon_compound-models'), +} + + +def build_pipeline(cfg: ConfigDict, + task_name: str = None, + default_args: dict = None): + """ build pipeline given model config dict. + + Args: + cfg (:obj:`ConfigDict`): config dict for model object. + task_name (str, optional): task name, refer to + :obj:`Tasks` for more details. + default_args (dict, optional): Default initialization arguments. + """ + return build_from_cfg( + cfg, PIPELINES, group_key=task_name, default_args=default_args) + + +def pipeline(task: str = None, + model: Union[str, List[str], Model, List[Model]] = None, + preprocessor=None, + config_file: str = None, + pipeline_name: str = None, + framework: str = None, + device: int = -1, + **kwargs) -> Pipeline: + """ Factory method to build a obj:`Pipeline`. + + + Args: + task (str): Task name defining which pipeline will be returned. + model (str or List[str] or obj:`Model` or obj:list[`Model`]): (list of) model name or model object. + preprocessor: preprocessor object. + config_file (str, optional): path to config file. + pipeline_name (str, optional): pipeline class name or alias name. + framework (str, optional): framework type. + device (int, optional): which device is used to do inference. + + Return: + pipeline (obj:`Pipeline`): pipeline object for certain task. 
+
+    Examples:
+    ```python
+    >>> # Using default model for a task
+    >>> p = pipeline('image-classification')
+    >>> # Using pipeline with a model name
+    >>> p = pipeline('text-classification', model='damo/distilbert-base-uncased')
+    >>> # Using pipeline with a model object
+    >>> resnet = Model.from_pretrained('Resnet')
+    >>> p = pipeline('image-classification', model=resnet)
+    >>> # Using pipeline with a list of model names
+    >>> p = pipeline('audio-kws', model=['damo/audio-tts', 'damo/auto-tts2'])
+    ```
+    """
+    if task is None and pipeline_name is None:
+        raise ValueError('task or pipeline_name is required')
+
+    if pipeline_name is None:
+        # get default pipeline for this task
+        if isinstance(model, str) \
+                or (isinstance(model, list) and isinstance(model[0], str)):
+
+            # if is_model_name(model):
+            if (isinstance(model, str) and model.startswith('damo/')) \
+                    or (isinstance(model, list) and model[0].startswith('damo/')) \
+                    or (isinstance(model, str) and osp.exists(model)):
+                # TODO @wenmeng.zwm add support when model is a str of modelhub address
+                # read pipeline info from modelhub configuration file.
+                pipeline_name, default_model_repo = get_default_pipeline_info(
+                    task)
+            else:
+                pipeline_name = get_pipeline_by_model_name(task, model)
+        else:
+            pipeline_name, default_model_repo = get_default_pipeline_info(task)
+
+    if model is None:
+        model = default_model_repo
+
+    assert isinstance(model, (type(None), str, Model, list)), \
+        f'model should be either None, str, List[str], Model, or List[Model], but got {type(model)}'
+
+    cfg = ConfigDict(type=pipeline_name, model=model)
+
+    if kwargs:
+        cfg.update(kwargs)
+
+    if preprocessor is not None:
+        cfg.preprocessor = preprocessor
+
+    return build_pipeline(cfg, task_name=task)
+
+
+def add_default_pipeline_info(task: str,
+                              model_name: str,
+                              modelhub_name: str = None,
+                              overwrite: bool = False):
+    """ Add a default model for a task.
+
+    Args:
+        task (str): task name.
+        model_name (str): model_name.
+        modelhub_name (str): name for default modelhub.
+        overwrite (bool): overwrite default info.
+    """
+    if not overwrite:
+        assert task not in DEFAULT_MODEL_FOR_PIPELINE, \
+            f'task {task} already has default model.'
+
+    DEFAULT_MODEL_FOR_PIPELINE[task] = (model_name, modelhub_name)
+
+
+def get_default_pipeline_info(task):
+    """ Get default info for a certain task.
+
+    Args:
+        task (str): task name.
+
+    Return:
+        A tuple: the first element is the pipeline name (model name), the
+        second element is the default modelhub repo name.
+    """
+
+    if task not in DEFAULT_MODEL_FOR_PIPELINE:
+        # support pipelines which do not register a default model
+        pipeline_name = list(PIPELINES.modules[task].keys())[0]
+        default_model = None
+    else:
+        pipeline_name, default_model = DEFAULT_MODEL_FOR_PIPELINE[task]
+    return pipeline_name, default_model
+
+
+def get_pipeline_by_model_name(task: str, model: Union[str, List[str]]):
+    """ Get the pipeline name by task name and model name.
+
+    Args:
+        task (str): task name.
+        model (str | list[str]): model names
+    """
+    if isinstance(model, str):
+        model_key = model
+    else:
+        model_key = '_'.join(model)
+    assert model_key in PIPELINES.modules[task], \
+        f'pipeline for task {task} model {model_key} not found.'
+ return model_key diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py new file mode 100644 index 00000000..79c85c19 --- /dev/null +++ b/modelscope/pipelines/cv/__init__.py @@ -0,0 +1,2 @@ +from .image_cartoon_pipeline import ImageCartoonPipeline +from .image_matting_pipeline import ImageMattingPipeline diff --git a/modelscope/pipelines/cv/image_cartoon_pipeline.py b/modelscope/pipelines/cv/image_cartoon_pipeline.py new file mode 100644 index 00000000..d253eaf5 --- /dev/null +++ b/modelscope/pipelines/cv/image_cartoon_pipeline.py @@ -0,0 +1,148 @@ +import os +from typing import Any, Dict + +import cv2 +import numpy as np +import PIL +import tensorflow as tf + +from modelscope.models.cv.cartoon.facelib.facer import FaceAna +from modelscope.models.cv.cartoon.mtcnn_pytorch.src.align_trans import ( + get_reference_facial_points, warp_and_crop_face) +from modelscope.models.cv.cartoon.utils import get_f5p, padTo16x, resize_size +from modelscope.pipelines.base import Input +from modelscope.preprocessors import load_image +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger +from ..base import Pipeline +from ..builder import PIPELINES + +if tf.__version__ >= '2.0': + tf = tf.compat.v1 + tf.disable_eager_execution() + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.image_generation, module_name='person-image-cartoon') +class ImageCartoonPipeline(Pipeline): + + def __init__(self, model: str): + super().__init__(model=model) + self.facer = FaceAna(self.model) + self.sess_anime_head = self.load_sess( + os.path.join(self.model, 'cartoon_anime_h.pb'), 'model_anime_head') + self.sess_anime_bg = self.load_sess( + os.path.join(self.model, 'cartoon_anime_bg.pb'), 'model_anime_bg') + + self.box_width = 288 + global_mask = cv2.imread(os.path.join(self.model, 'alpha.jpg')) + global_mask = cv2.resize( + global_mask, (self.box_width, self.box_width), + interpolation=cv2.INTER_AREA) + self.global_mask = cv2.cvtColor( + global_mask, cv2.COLOR_BGR2GRAY).astype(np.float32) / 255.0 + + def load_sess(self, model_path, name): + config = tf.ConfigProto(allow_soft_placement=True) + config.gpu_options.allow_growth = True + sess = tf.Session(config=config) + logger.info(f'loading model from {model_path}') + with tf.gfile.FastGFile(model_path, 'rb') as f: + graph_def = tf.GraphDef() + graph_def.ParseFromString(f.read()) + sess.graph.as_default() + tf.import_graph_def(graph_def, name=name) + sess.run(tf.global_variables_initializer()) + logger.info(f'load model {model_path} done.') + return sess + + def preprocess(self, input: Input) -> Dict[str, Any]: + if isinstance(input, str): + img = np.array(load_image(input)) + elif isinstance(input, PIL.Image.Image): + img = np.array(input.convert('RGB')) + elif isinstance(input, np.ndarray): + if len(input.shape) == 2: + input = cv2.cvtColor(input, cv2.COLOR_GRAY2BGR) + img = input[:, :, ::-1] + else: + raise TypeError(f'input should be either str, PIL.Image,' + f' np.array, but got {type(input)}') + img = img.astype(np.float) + result = {'img': img} + return result + + def detect_face(self, img): + src_h, src_w, _ = img.shape + boxes, landmarks, _ = self.facer.run(img) + if boxes.shape[0] == 0: + return None + else: + return landmarks + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + + img = input['img'].astype(np.uint8) + ori_h, ori_w, _ = img.shape + img = resize_size(img, size=720) + + img_brg = img[:, :, ::-1] + + landmarks = self.detect_face(img) + if landmarks is None: + 
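+            # no face found: return an empty result and let the caller
+            # decide how to handle it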
print('No face detected!') + return {'output_png': None} + + # background process + pad_bg, pad_h, pad_w = padTo16x(img_brg) + + bg_res = self.sess_anime_bg.run( + self.sess_anime_bg.graph.get_tensor_by_name( + 'model_anime_bg/output_image:0'), + feed_dict={'model_anime_bg/input_image:0': pad_bg}) + res = bg_res[:pad_h, :pad_w, :] + + for landmark in landmarks: + # get facial 5 points + f5p = get_f5p(landmark, img_brg) + + # face alignment + head_img, trans_inv = warp_and_crop_face( + img, + f5p, + ratio=0.75, + reference_pts=get_reference_facial_points(default_square=True), + crop_size=(self.box_width, self.box_width), + return_trans_inv=True) + + # head process + head_res = self.sess_anime_head.run( + self.sess_anime_head.graph.get_tensor_by_name( + 'model_anime_head/output_image:0'), + feed_dict={ + 'model_anime_head/input_image:0': head_img[:, :, ::-1] + }) + + # merge head and background + head_trans_inv = cv2.warpAffine( + head_res, + trans_inv, (np.size(img, 1), np.size(img, 0)), + borderValue=(0, 0, 0)) + + mask = self.global_mask + mask_trans_inv = cv2.warpAffine( + mask, + trans_inv, (np.size(img, 1), np.size(img, 0)), + borderValue=(0, 0, 0)) + mask_trans_inv = np.expand_dims(mask_trans_inv, 2) + + res = mask_trans_inv * head_trans_inv + (1 - mask_trans_inv) * res + + res = cv2.resize(res, (ori_w, ori_h), interpolation=cv2.INTER_AREA) + + return {'output_png': res} + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs diff --git a/maas_lib/pipelines/cv/image_matting.py b/modelscope/pipelines/cv/image_matting_pipeline.py similarity index 90% rename from maas_lib/pipelines/cv/image_matting.py rename to modelscope/pipelines/cv/image_matting_pipeline.py index fdb443f9..6f3ff5f5 100644 --- a/maas_lib/pipelines/cv/image_matting.py +++ b/modelscope/pipelines/cv/image_matting_pipeline.py @@ -4,12 +4,11 @@ from typing import Any, Dict, List, Tuple, Union import cv2 import numpy as np import PIL -from cv2 import COLOR_GRAY2RGB -from maas_lib.pipelines.base import Input -from maas_lib.preprocessors import load_image -from maas_lib.utils.constant import Tasks -from maas_lib.utils.logger import get_logger +from modelscope.pipelines.base import Input +from modelscope.preprocessors import load_image +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger from ..base import Pipeline from ..builder import PIPELINES @@ -18,7 +17,7 @@ logger = get_logger() @PIPELINES.register_module( Tasks.image_matting, module_name=Tasks.image_matting) -class ImageMatting(Pipeline): +class ImageMattingPipeline(Pipeline): def __init__(self, model: str): super().__init__(model=model) diff --git a/modelscope/pipelines/multi_modal/__init__.py b/modelscope/pipelines/multi_modal/__init__.py new file mode 100644 index 00000000..7d9a2c59 --- /dev/null +++ b/modelscope/pipelines/multi_modal/__init__.py @@ -0,0 +1 @@ +from .image_captioning import ImageCaptionPipeline diff --git a/modelscope/pipelines/multi_modal/image_captioning.py b/modelscope/pipelines/multi_modal/image_captioning.py new file mode 100644 index 00000000..91180e23 --- /dev/null +++ b/modelscope/pipelines/multi_modal/image_captioning.py @@ -0,0 +1,118 @@ +from typing import Any, Dict + +import numpy as np +import torch +from PIL import Image + +from modelscope.pipelines.base import Input +from modelscope.preprocessors import load_image +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger +from ..base import Pipeline +from ..builder import PIPELINES + 
+logger = get_logger() + + +@PIPELINES.register_module(Tasks.image_captioning, module_name='ofa') +class ImageCaptionPipeline(Pipeline): + # TODO: refine using modelhub + def __init__(self, model: str, bpe_dir: str): + super().__init__() + # turn on cuda if GPU is available + from fairseq import checkpoint_utils, tasks, utils + from ofa.tasks.mm_tasks import CaptionTask + + tasks.register_task('caption', CaptionTask) + use_cuda = False + # use fp16 only when GPU is available + use_fp16 = False + overrides = { + 'bpe_dir': bpe_dir, + 'eval_cider': False, + 'beam': 5, + 'max_len_b': 16, + 'no_repeat_ngram_size': 3, + 'seed': 7 + } + models, cfg, task = checkpoint_utils.load_model_ensemble_and_task( + utils.split_paths(model), arg_overrides=overrides) + + # Move models to GPU + for model in models: + model.eval() + if use_cuda: + model.cuda() + if use_fp16: + model.half() + model.prepare_for_inference_(cfg) + self.models = models + # Initialize generator + self.generator = task.build_generator(models, cfg.generation) + + # Initialize transform + from torchvision import transforms + mean = [0.5, 0.5, 0.5] + std = [0.5, 0.5, 0.5] + + self.patch_resize_transform = transforms.Compose([ + lambda image: image.convert('RGB'), + transforms.Resize( + (cfg.task.patch_image_size, cfg.task.patch_image_size), + interpolation=Image.BICUBIC), + transforms.ToTensor(), + transforms.Normalize(mean=mean, std=std), + ]) + + self.task = task + self.bos_item = torch.LongTensor([task.src_dict.bos()]) + self.eos_item = torch.LongTensor([task.src_dict.eos()]) + self.pad_idx = task.src_dict.pad() + + def preprocess(self, input: Input) -> Dict[str, Any]: + + def encode_text(text, length=None, append_bos=False, append_eos=False): + s = self.task.tgt_dict.encode_line( + line=self.task.bpe.encode(text), + add_if_not_exist=False, + append_eos=False).long() + if length is not None: + s = s[:length] + if append_bos: + s = torch.cat([self.bos_item, s]) + if append_eos: + s = torch.cat([s, self.eos_item]) + return s + + patch_image = self.patch_resize_transform( + load_image(input)).unsqueeze(0) + patch_mask = torch.tensor([True]) + text = 'what does the image describe?' + src_text = encode_text( + text, append_bos=True, append_eos=True).unsqueeze(0) + src_length = torch.LongTensor( + [s.ne(self.pad_idx).long().sum() for s in src_text]) + sample = { + 'id': np.array(['42']), + 'net_input': { + 'src_tokens': src_text, + 'src_lengths': src_length, + 'patch_images': patch_image, + 'patch_masks': patch_mask, + } + } + return sample + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + from ofa.utils.eval_utils import eval_caption + + results, _ = eval_caption(self.task, self.generator, self.models, + input) + return { + 'image_id': results[0]['image_id'], + 'caption': results[0]['caption'] + } + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + # What should we do here ? 
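+        # nothing left to do: forward() already returns decoded caption
+        # strings via eval_caption (assumption based on OFA's eval utilities)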
+ return inputs diff --git a/maas_lib/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py similarity index 77% rename from maas_lib/pipelines/nlp/__init__.py rename to modelscope/pipelines/nlp/__init__.py index 8a97070b..fe11e9a3 100644 --- a/maas_lib/pipelines/nlp/__init__.py +++ b/modelscope/pipelines/nlp/__init__.py @@ -1,3 +1,4 @@ from .sequence_classification_pipeline import * # noqa F403 from .space.dialog_generation_pipeline import * # noqa F403 from .space.dialog_intent_pipeline import * # noqa F403 +from .text_generation_pipeline import * # noqa F403 diff --git a/maas_lib/pipelines/nlp/sequence_classification_pipeline.py b/modelscope/pipelines/nlp/sequence_classification_pipeline.py similarity index 63% rename from maas_lib/pipelines/nlp/sequence_classification_pipeline.py rename to modelscope/pipelines/nlp/sequence_classification_pipeline.py index f3b20f95..5a14f136 100644 --- a/maas_lib/pipelines/nlp/sequence_classification_pipeline.py +++ b/modelscope/pipelines/nlp/sequence_classification_pipeline.py @@ -1,13 +1,14 @@ import os import uuid -from typing import Any, Dict +from typing import Any, Dict, Union import json import numpy as np -from maas_lib.models.nlp import SequenceClassificationModel -from maas_lib.preprocessors import SequenceClassificationPreprocessor -from maas_lib.utils.constant import Tasks +from modelscope.models.nlp import BertForSequenceClassification +from modelscope.preprocessors import SequenceClassificationPreprocessor +from modelscope.utils.constant import Tasks +from ...models import Model from ..base import Input, Pipeline from ..builder import PIPELINES @@ -18,19 +19,31 @@ __all__ = ['SequenceClassificationPipeline'] Tasks.text_classification, module_name=r'bert-sentiment-analysis') class SequenceClassificationPipeline(Pipeline): - def __init__(self, model: SequenceClassificationModel, - preprocessor: SequenceClassificationPreprocessor, **kwargs): + def __init__(self, + model: Union[BertForSequenceClassification, str], + preprocessor: SequenceClassificationPreprocessor = None, + **kwargs): """use `model` and `preprocessor` to create a nlp text classification pipeline for prediction Args: - model (SequenceClassificationModel): a model instance + model (BertForSequenceClassification): a model instance preprocessor (SequenceClassificationPreprocessor): a preprocessor instance """ - - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + assert isinstance(model, str) or isinstance(model, BertForSequenceClassification), \ + 'model must be a single str or BertForSequenceClassification' + sc_model = model if isinstance( + model, + BertForSequenceClassification) else Model.from_pretrained(model) + if preprocessor is None: + preprocessor = SequenceClassificationPreprocessor( + sc_model.model_dir, + first_sequence='sentence', + second_sequence=None) + super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs) from easynlp.utils import io - self.label_path = os.path.join(model.model_dir, 'label_mapping.json') + self.label_path = os.path.join(sc_model.model_dir, + 'label_mapping.json') with io.open(self.label_path) as f: self.label_mapping = json.load(f) self.label_id_to_name = { diff --git a/maas_lib/utils/__init__.py b/modelscope/pipelines/nlp/space/__init__.py similarity index 100% rename from maas_lib/utils/__init__.py rename to modelscope/pipelines/nlp/space/__init__.py diff --git a/maas_lib/pipelines/nlp/space/dialog_generation_pipeline.py b/modelscope/pipelines/nlp/space/dialog_generation_pipeline.py similarity index 
91%
rename from maas_lib/pipelines/nlp/space/dialog_generation_pipeline.py
rename to modelscope/pipelines/nlp/space/dialog_generation_pipeline.py
index a7b2d057..4107c35e 100644
--- a/maas_lib/pipelines/nlp/space/dialog_generation_pipeline.py
+++ b/modelscope/pipelines/nlp/space/dialog_generation_pipeline.py
@@ -1,8 +1,8 @@
 from typing import Any, Dict, Optional
-from maas_lib.models.nlp import DialogGenerationModel
-from maas_lib.preprocessors import DialogGenerationPreprocessor
-from maas_lib.utils.constant import Tasks
+from modelscope.models.nlp import DialogGenerationModel
+from modelscope.preprocessors import DialogGenerationPreprocessor
+from modelscope.utils.constant import Tasks
 from ...base import Model, Tensor
 from ...builder import PIPELINES
diff --git a/maas_lib/pipelines/nlp/space/dialog_intent_pipeline.py b/modelscope/pipelines/nlp/space/dialog_intent_pipeline.py
similarity index 87%
rename from maas_lib/pipelines/nlp/space/dialog_intent_pipeline.py
rename to modelscope/pipelines/nlp/space/dialog_intent_pipeline.py
index 99862311..26ba5553 100644
--- a/maas_lib/pipelines/nlp/space/dialog_intent_pipeline.py
+++ b/modelscope/pipelines/nlp/space/dialog_intent_pipeline.py
@@ -1,8 +1,8 @@
 from typing import Any, Dict, Optional
-from maas_lib.models.nlp import DialogIntentModel
-from maas_lib.preprocessors import DialogIntentPreprocessor
-from maas_lib.utils.constant import Tasks
+from modelscope.models.nlp import DialogIntentModel
+from modelscope.preprocessors import DialogIntentPreprocessor
+from modelscope.utils.constant import Tasks
 from ...base import Input, Pipeline
 from ...builder import PIPELINES
diff --git a/modelscope/pipelines/nlp/text_generation_pipeline.py b/modelscope/pipelines/nlp/text_generation_pipeline.py
new file mode 100644
index 00000000..7ad2b67f
--- /dev/null
+++ b/modelscope/pipelines/nlp/text_generation_pipeline.py
@@ -0,0 +1,59 @@
+from typing import Dict, Optional, Union
+
+from modelscope.models import Model
+from modelscope.models.nlp import PalmForTextGenerationModel
+from modelscope.preprocessors import TextGenerationPreprocessor
+from modelscope.utils.constant import Tasks
+from ..base import Pipeline, Tensor
+from ..builder import PIPELINES
+
+__all__ = ['TextGenerationPipeline']
+
+
+@PIPELINES.register_module(Tasks.text_generation, module_name=r'palm')
+class TextGenerationPipeline(Pipeline):
+
+    def __init__(self,
+                 model: Union[PalmForTextGenerationModel, str],
+                 preprocessor: Optional[TextGenerationPreprocessor] = None,
+                 **kwargs):
+        """use `model` and `preprocessor` to create an nlp text generation pipeline for prediction
+
+        Args:
+            model (PalmForTextGenerationModel): a model instance
+            preprocessor (TextGenerationPreprocessor): a preprocessor instance
+        """
+        sc_model = model if isinstance(
+            model,
+            PalmForTextGenerationModel) else Model.from_pretrained(model)
+        if preprocessor is None:
+            preprocessor = TextGenerationPreprocessor(
+                sc_model.model_dir,
+                first_sequence='sentence',
+                second_sequence=None)
+        super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs)
+        self.tokenizer = preprocessor.tokenizer
+
+    def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, str]:
+        """decode the model's predicted token ids into a readable string
+
+        Args:
+            inputs (Dict[str, Tensor]): decoding results, with generated token ids under the 'predictions' key
+
+        Returns:
+            Dict[str, str]: the prediction results
+        """
+
+        vocab_size = len(self.tokenizer.vocab)
+        pred_list = inputs['predictions']
+        pred_ids = pred_list[0][0].cpu().numpy().tolist()
+        for j in range(len(pred_ids)):
+            if pred_ids[j] >= vocab_size:
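+                # generated ids at or beyond the tokenizer vocab (e.g. extra
+                # model-internal tokens) are mapped to a fixed in-vocab
+                # placeholder id; 100 is assumed to be a harmless token here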
+                pred_ids[j] = 100
+        pred = self.tokenizer.convert_ids_to_tokens(pred_ids)
+        pred_string = ''.join(pred).replace(
+            '##',
+            '').split('[SEP]')[0].replace('[CLS]',
+                                          '').replace('[SEP]',
+                                                      '').replace('[UNK]', '')
+        return {'pred_string': pred_string}
diff --git a/modelscope/pipelines/util.py b/modelscope/pipelines/util.py
new file mode 100644
index 00000000..caef6b22
--- /dev/null
+++ b/modelscope/pipelines/util.py
@@ -0,0 +1,46 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import os.path as osp
+from typing import List, Union
+
+import json
+from maas_hub.file_download import model_file_download
+
+from modelscope.utils.constant import CONFIGFILE
+
+
+def is_model_name(model: Union[str, List]):
+    """ return whether `model` is a valid modelhub path (or a list of them)
+    """
+
+    def is_model_name_impl(model):
+        if osp.exists(model):
+            if osp.exists(osp.join(model, CONFIGFILE)):
+                return True
+            else:
+                return False
+        else:
+            # try:
+            #     cfg_file = model_file_download(model, CONFIGFILE)
+            # except Exception:
+            #     cfg_file = None
+            # TODO @wenmeng.zwm use exception instead of
+            # following tricky logic
+            cfg_file = model_file_download(model, CONFIGFILE)
+            with open(cfg_file, 'r') as infile:
+                cfg = json.load(infile)
+            if 'Code' in cfg:
+                return False
+            else:
+                return True
+
+    if isinstance(model, str):
+        return is_model_name_impl(model)
+    else:
+        results = [is_model_name_impl(m) for m in model]
+        all_true = all(results)
+        any_true = any(results)
+        if any_true and not all_true:
+            raise ValueError('some models are hub addresses, some are not')
+
+        return all_true
diff --git a/maas_lib/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py
similarity index 89%
rename from maas_lib/preprocessors/__init__.py
rename to modelscope/preprocessors/__init__.py
index 4a146843..5f473753 100644
--- a/maas_lib/preprocessors/__init__.py
+++ b/modelscope/preprocessors/__init__.py
@@ -5,5 +5,6 @@ from .builder import PREPROCESSORS, build_preprocessor
 from .common import Compose
 from .image import LoadImage, load_image
 from .nlp import *  # noqa F403
+from .nlp import TextGenerationPreprocessor
 from .space.dialog_generation_preprocessor import *  # noqa F403
 from .space.dialog_intent_preprocessor import *  # noqa F403
diff --git a/maas_lib/preprocessors/base.py b/modelscope/preprocessors/base.py
similarity index 100%
rename from maas_lib/preprocessors/base.py
rename to modelscope/preprocessors/base.py
diff --git a/maas_lib/preprocessors/builder.py b/modelscope/preprocessors/builder.py
similarity index 80%
rename from maas_lib/preprocessors/builder.py
rename to modelscope/preprocessors/builder.py
index 69421b5f..918f8d17 100644
--- a/maas_lib/preprocessors/builder.py
+++ b/modelscope/preprocessors/builder.py
@@ -1,8 +1,8 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
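+# A minimal sketch of the registry pattern used throughout these modules
+# (semantics of build_from_cfg assumed: the 'type' key selects the class
+# registered under that module_name):
+#
+#   @PREPROCESSORS.register_module(Fields.nlp, module_name='my-preprocessor')
+#   class MyPreprocessor(Preprocessor):
+#       ...
+#
+#   preprocessor = build_preprocessor(dict(type='my-preprocessor'), Fields.nlp)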
-from maas_lib.utils.config import ConfigDict -from maas_lib.utils.constant import Fields -from maas_lib.utils.registry import Registry, build_from_cfg +from modelscope.utils.config import ConfigDict +from modelscope.utils.constant import Fields +from modelscope.utils.registry import Registry, build_from_cfg PREPROCESSORS = Registry('preprocessors') diff --git a/maas_lib/preprocessors/common.py b/modelscope/preprocessors/common.py similarity index 100% rename from maas_lib/preprocessors/common.py rename to modelscope/preprocessors/common.py diff --git a/maas_lib/preprocessors/image.py b/modelscope/preprocessors/image.py similarity index 96% rename from maas_lib/preprocessors/image.py rename to modelscope/preprocessors/image.py index 8db9f5bb..142f9484 100644 --- a/maas_lib/preprocessors/image.py +++ b/modelscope/preprocessors/image.py @@ -4,8 +4,8 @@ from typing import Dict, Union from PIL import Image, ImageOps -from maas_lib.fileio import File -from maas_lib.utils.constant import Fields +from modelscope.fileio import File +from modelscope.utils.constant import Fields from .builder import PREPROCESSORS diff --git a/maas_lib/preprocessors/nlp.py b/modelscope/preprocessors/nlp.py similarity index 57% rename from maas_lib/preprocessors/nlp.py rename to modelscope/preprocessors/nlp.py index 0a03328a..c85c2159 100644 --- a/maas_lib/preprocessors/nlp.py +++ b/modelscope/preprocessors/nlp.py @@ -5,8 +5,8 @@ from typing import Any, Dict, Union from transformers import AutoTokenizer -from maas_lib.utils.constant import Fields, InputFields -from maas_lib.utils.type_assert import type_assert +from modelscope.utils.constant import Fields, InputFields +from modelscope.utils.type_assert import type_assert from .base import Preprocessor from .builder import PREPROCESSORS @@ -92,3 +92,61 @@ class SequenceClassificationPreprocessor(Preprocessor): rst['token_type_ids'].append(feature['token_type_ids']) return rst + + +@PREPROCESSORS.register_module(Fields.nlp, module_name=r'palm') +class TextGenerationPreprocessor(Preprocessor): + + def __init__(self, model_dir: str, *args, **kwargs): + """preprocess the data using the vocab.txt from the `model_dir` path + + Args: + model_dir (str): model path + """ + from sofa import PalmTokenizer + + super().__init__(*args, **kwargs) + + self.model_dir: str = model_dir + self.first_sequence: str = kwargs.pop('first_sequence', + 'first_sequence') + self.second_sequence: str = kwargs.pop('second_sequence', + 'second_sequence') + self.sequence_length: int = kwargs.pop('sequence_length', 128) + self.tokenizer = PalmTokenizer.from_pretrained(model_dir) + + @type_assert(object, str) + def __call__(self, data: str) -> Dict[str, Any]: + """process the raw input data + + Args: + data (str): a sentence + Example: + 'you are so handsome.' 
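+
+            Note: each value in the returned dict is a torch tensor of shape
+            (1, sequence_length).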
+ + Returns: + Dict[str, Any]: the preprocessed data + """ + import torch + + new_data = {self.first_sequence: data} + # preprocess the data for the model input + + rst = {'input_ids': [], 'attention_mask': [], 'token_type_ids': []} + + max_seq_length = self.sequence_length + + text_a = new_data.get(self.first_sequence, None) + text_b = new_data.get(self.second_sequence, None) + feature = self.tokenizer( + text_a, + text_b, + padding='max_length', + truncation=True, + max_length=max_seq_length) + + rst['input_ids'].append(feature['input_ids']) + rst['attention_mask'].append(feature['attention_mask']) + rst['token_type_ids'].append(feature['token_type_ids']) + + return {k: torch.tensor(v) for k, v in rst.items()} diff --git a/maas_lib/utils/nlp/__init__.py b/modelscope/preprocessors/space/__init__.py similarity index 100% rename from maas_lib/utils/nlp/__init__.py rename to modelscope/preprocessors/space/__init__.py diff --git a/maas_lib/preprocessors/space/dialog_generation_preprocessor.py b/modelscope/preprocessors/space/dialog_generation_preprocessor.py similarity index 83% rename from maas_lib/preprocessors/space/dialog_generation_preprocessor.py rename to modelscope/preprocessors/space/dialog_generation_preprocessor.py index 5b127e8e..c6e2584d 100644 --- a/maas_lib/preprocessors/space/dialog_generation_preprocessor.py +++ b/modelscope/preprocessors/space/dialog_generation_preprocessor.py @@ -4,10 +4,11 @@ import os import uuid from typing import Any, Dict, Union -from maas_lib.data.nlp.space.fields.gen_field import MultiWOZBPETextField -from maas_lib.utils.config import Config -from maas_lib.utils.constant import Fields, InputFields -from maas_lib.utils.type_assert import type_assert +from modelscope.preprocessors.space.fields.gen_field import \ + MultiWOZBPETextField +from modelscope.utils.config import Config +from modelscope.utils.constant import Fields, InputFields +from modelscope.utils.type_assert import type_assert from ..base import Preprocessor from ..builder import PREPROCESSORS diff --git a/maas_lib/preprocessors/space/dialog_intent_preprocessor.py b/modelscope/preprocessors/space/dialog_intent_preprocessor.py similarity index 84% rename from maas_lib/preprocessors/space/dialog_intent_preprocessor.py rename to modelscope/preprocessors/space/dialog_intent_preprocessor.py index 8dba5075..f26fa9a5 100644 --- a/maas_lib/preprocessors/space/dialog_intent_preprocessor.py +++ b/modelscope/preprocessors/space/dialog_intent_preprocessor.py @@ -4,10 +4,11 @@ import os import uuid from typing import Any, Dict, Union -from maas_lib.data.nlp.space.fields.intent_field import IntentBPETextField -from maas_lib.utils.config import Config -from maas_lib.utils.constant import Fields, InputFields -from maas_lib.utils.type_assert import type_assert +from modelscope.preprocessors.space.fields.intent_field import \ + IntentBPETextField +from modelscope.utils.config import Config +from modelscope.utils.constant import Fields, InputFields +from modelscope.utils.type_assert import type_assert from ..base import Preprocessor from ..builder import PREPROCESSORS diff --git a/maas_lib/utils/nlp/space/__init__.py b/modelscope/preprocessors/space/fields/__init__.py similarity index 100% rename from maas_lib/utils/nlp/space/__init__.py rename to modelscope/preprocessors/space/fields/__init__.py diff --git a/modelscope/preprocessors/space/fields/gen_field.py b/modelscope/preprocessors/space/fields/gen_field.py new file mode 100644 index 00000000..91ec1cf8 --- /dev/null +++ 
b/modelscope/preprocessors/space/fields/gen_field.py
@@ -0,0 +1,688 @@
+"""
+Field class
+"""
+import os
+import random
+from collections import OrderedDict
+from itertools import chain
+
+import numpy as np
+
+from modelscope.preprocessors.space.tokenizer import Tokenizer
+from modelscope.utils.nlp.space import ontology, utils
+from modelscope.utils.nlp.space.db_ops import MultiWozDB
+from modelscope.utils.nlp.space.utils import list2np
+
+
+class BPETextField(object):
+
+    pad_token = '[PAD]'
+    bos_token = '[BOS]'
+    eos_token = '[EOS]'
+    unk_token = '[UNK]'
+    sos_u_token = '<sos_u>'
+    eos_u_token = '<eos_u>'
+    sos_b_token = '<sos_b>'
+    eos_b_token = '<eos_b>'
+    sos_d_token = '<sos_d>'
+    eos_d_token = '<eos_d>'
+    sos_a_token = '<sos_a>'
+    eos_a_token = '<eos_a>'
+    sos_db_token = '<sos_db>'
+    eos_db_token = '<eos_db>'
+    sos_r_token = '<sos_r>'
+    eos_r_token = '<eos_r>'
+
+    @property
+    def bot_id(self):
+        """
+        Distinguishes between the two roles, user and bot.
+        0 and 1 are not vocabulary indices but dedicated role indices of size
+        2, matching the hyperparameter 'num_type_embeddings'.
+        """
+        return 0
+
+    @property
+    def user_id(self):
+        """
+        Distinguishes between the two roles, user and bot.
+        0 and 1 are not vocabulary indices but dedicated role indices of size
+        2, matching the hyperparameter 'num_type_embeddings'.
+        """
+        return 1
+
+    @property
+    def vocab_size(self):
+        return self.tokenizer.vocab_size
+
+    @property
+    def num_specials(self):
+        return len(self.tokenizer.special_tokens)
+
+    @property
+    def pad_id(self):
+        return self.tokenizer.convert_tokens_to_ids([self.pad_token])[0]
+
+    @property
+    def bos_id(self):
+        return self.tokenizer.convert_tokens_to_ids([self.bos_token])[0]
+
+    @property
+    def eos_id(self):
+        return self.tokenizer.convert_tokens_to_ids([self.eos_token])[0]
+
+    @property
+    def unk_id(self):
+        return self.tokenizer.convert_tokens_to_ids([self.unk_token])[0]
+
+    @property
+    def sos_u_id(self):
+        return self.tokenizer.convert_tokens_to_ids([self.sos_u_token])[0]
+
+    @property
+    def eos_u_id(self):
+        return self.tokenizer.convert_tokens_to_ids([self.eos_u_token])[0]
+
+    @property
+    def sos_b_id(self):
+        return self.tokenizer.convert_tokens_to_ids([self.sos_b_token])[0]
+
+    @property
+    def eos_b_id(self):
+        return self.tokenizer.convert_tokens_to_ids([self.eos_b_token])[0]
+
+    @property
+    def sos_db_id(self):
+        return self.tokenizer.convert_tokens_to_ids([self.sos_db_token])[0]
+
+    @property
+    def eos_db_id(self):
+        return self.tokenizer.convert_tokens_to_ids([self.eos_db_token])[0]
+
+    @property
+    def sos_a_id(self):
+        return self.tokenizer.convert_tokens_to_ids([self.sos_a_token])[0]
+
+    @property
+    def eos_a_id(self):
+        return self.tokenizer.convert_tokens_to_ids([self.eos_a_token])[0]
+
+    @property
+    def sos_r_id(self):
+        return self.tokenizer.convert_tokens_to_ids([self.sos_r_token])[0]
+
+    @property
+    def eos_r_id(self):
+        return self.tokenizer.convert_tokens_to_ids([self.eos_r_token])[0]
+
+    @property
+    def sos_d_id(self):
+        return self.tokenizer.convert_tokens_to_ids([self.sos_d_token])[0]
+
+    @property
+    def eos_d_id(self):
+        return self.tokenizer.convert_tokens_to_ids([self.eos_d_token])[0]
+
+    def __init__(self, config):
+        self.gpu = 0
+        self.tokenizer = None
+        self.vocab = None
+        self.db = None
+        self.set_stats = {}
+
+        self.prompt_num_for_understand = config.BPETextField.prompt_num_for_understand
+        self.prompt_num_for_policy = config.BPETextField.prompt_num_for_policy
+        self.understand_tokens = ontology.get_understand_tokens(
+            self.prompt_num_for_understand)
+        self.policy_tokens = ontology.get_policy_tokens(
+            self.prompt_num_for_policy)
+
+        self.with_query_bow = config.BPETextField.with_query_bow
+        self.understand = config.BPETextField.understand
+        self.policy = config.BPETextField.policy
+
+        self.batch_size = config.Trainer.batch_size
+        self.filtered = config.BPETextField.filtered
+        self.max_len = config.BPETextField.max_len
+        self.min_utt_len = config.BPETextField.min_utt_len
+        self.max_utt_len = config.BPETextField.max_utt_len
+        self.min_ctx_turn = config.BPETextField.min_ctx_turn
+        self.max_ctx_turn = config.BPETextField.max_ctx_turn - 1  # subtract reply turn
+
+        self.use_true_prev_bspn = config.Generator.use_true_prev_bspn
+        self.use_true_prev_aspn = config.Generator.use_true_prev_aspn
+        self.use_true_db_pointer = config.Generator.use_true_db_pointer
+        self.use_true_prev_resp = config.Generator.use_true_prev_resp
+        self.use_true_curr_bspn = config.Generator.use_true_curr_bspn
+        self.use_true_curr_aspn = config.Generator.use_true_curr_aspn
+        self.use_all_previous_context = config.Generator.use_all_previous_context
+        self.use_true_bspn_for_ctr_eval = config.Generator.use_true_bspn_for_ctr_eval
+        self.use_true_domain_for_ctr_eval = config.Generator.use_true_domain_for_ctr_eval
+
+    def collate_fn_multi_turn(self, samples):
+        batch_size = len(samples)
+        batch = {}
+
+        src = [sp['src'][-self.max_ctx_turn:] for sp in samples]
+        query_token, src_token, src_pos, src_turn, src_role = [], [], [], [], []
+        for utts in src:
+            query_token.append(utts[-1])
+            utt_lens = [len(utt) for utt in utts]
+
+            # Token ids
+            src_token.append(list(chain(*utts))[-self.max_len:])
+
+            # Position ids
+            pos = [list(range(l)) for l in utt_lens]
+            src_pos.append(list(chain(*pos))[-self.max_len:])
+
+            # Turn ids
+            turn = [[len(utts) - i] * l for i, l in enumerate(utt_lens)]
+            src_turn.append(list(chain(*turn))[-self.max_len:])
+
+            # Role ids
+            role = [
+                [self.bot_id if (len(utts) - i) % 2 == 0 else self.user_id] * l
+                for i, l in enumerate(utt_lens)
+            ]
+            src_role.append(list(chain(*role))[-self.max_len:])
+
+        # src and tgt sequences are padded separately so that the first decoded
+        # token stays aligned across the batch
+        src_token = list2np(src_token, padding=self.pad_id)
+        src_pos = list2np(src_pos, padding=self.pad_id)
+        src_turn = list2np(src_turn, padding=self.pad_id)
+        src_role = list2np(src_role, padding=self.pad_id)
+        batch['src_token'] = src_token
+        batch['src_pos'] = src_pos
+        batch['src_type'] = src_role
+        batch['src_turn'] = src_turn
+        batch['src_mask'] = (src_token != self.pad_id).astype('int64')
+
+        if self.with_query_bow:
+            query_token = list2np(query_token, padding=self.pad_id)
+            batch['query_token'] = query_token
+            batch['query_mask'] = (query_token != self.pad_id).astype('int64')
+
+        if self.understand_ids and self.understand:
+            understand = [self.understand_ids for _ in samples]
+            understand_token = np.array(understand).astype('int64')
+            batch['understand_token'] = understand_token
+            batch['understand_mask'] = (understand_token !=
+                                        self.pad_id).astype('int64')
+
+        if self.policy_ids and self.policy:
+            policy = [self.policy_ids for _ in samples]
+            policy_token = np.array(policy).astype('int64')
+            batch['policy_token'] = policy_token
+            batch['policy_mask'] = (policy_token !=
+                                    self.pad_id).astype('int64')
+
+        if 'tgt' in samples[0]:
+            tgt = [sp['tgt'] for sp in samples]
+
+            # Token ids & Label ids
+            tgt_token = list2np(tgt, padding=self.pad_id)
+
+            # Position ids
+            tgt_pos = np.zeros_like(tgt_token)
+            tgt_pos[:] = np.arange(tgt_token.shape[1], dtype=tgt_token.dtype)
+
+            # Turn ids
+            tgt_turn = np.zeros_like(tgt_token)
+
+            # Role ids
+            tgt_role = np.full_like(tgt_token, self.bot_id)
+
+            batch['tgt_token'] = tgt_token
+            batch['tgt_pos'] = tgt_pos
+            batch['tgt_type'] = tgt_role
+            batch['tgt_turn'] = tgt_turn
+            batch['tgt_mask'] = (tgt_token != self.pad_id).astype('int64')
+
+        return
batch, batch_size + + def _bucket_by_turn(self, encoded_data): + turn_bucket = {} + for dial in encoded_data: + turn_len = len(dial) + if turn_len not in turn_bucket: + turn_bucket[turn_len] = [] + turn_bucket[turn_len].append(dial) + return OrderedDict(sorted(turn_bucket.items(), key=lambda i: i[0])) + + def _construct_mini_batch(self, data): + all_batches = [] + batch = [] + for dial in data: + batch.append(dial) + if len(batch) == self.batch_size: + # print('batch size: %d, batch num +1'%(len(batch))) + all_batches.append(batch) + batch = [] + # if remainder > 1/2 batch_size, just put them in the previous batch, otherwise form a new batch + # print('last batch size: %d, batch num +1'%(len(batch))) + # if (len(batch) % len(cfg.cuda_device)) != 0: + # batch = batch[:-(len(batch) % len(cfg.cuda_device))] + # TODO deal with deleted data + if self.gpu <= 1: + if len(batch) > 0.5 * self.batch_size: + all_batches.append(batch) + elif len(all_batches): + all_batches[-1].extend(batch) + else: + all_batches.append(batch) + + return all_batches + + def transpose_batch(self, batch): + dial_batch = [] + turn_num = len(batch[0]) + for turn in range(turn_num): + turn_l = {} + for dial in batch: + this_turn = dial[turn] + for k in this_turn: + if k not in turn_l: + turn_l[k] = [] + turn_l[k].append(this_turn[k]) + dial_batch.append(turn_l) + return dial_batch + + def get_eval_data(self, set_name='dev'): + name_to_set = {'train': self.train, 'test': self.test, 'dev': self.dev} + dial = name_to_set[set_name] + + if set_name not in self.set_stats: + self.set_stats[set_name] = {} + num_turns = 0 + num_dials = len(dial) + for d in dial: + num_turns += len(d) + + self.set_stats[set_name]['num_turns'] = num_turns + self.set_stats[set_name]['num_dials'] = num_dials + + return dial + + def get_nontranspose_data_iterator(self, all_batches): + for i, batch in enumerate(all_batches): + yield batch + + def get_data_iterator(self, all_batches): + for i, batch in enumerate(all_batches): + yield self.transpose_batch(batch) + + +class MultiWOZBPETextField(BPETextField): + + def __init__(self, model_dir, config): + super(MultiWOZBPETextField, self).__init__(config) + import spacy + self.nlp = spacy.load('en_core_web_sm') + + self.db = MultiWozDB( + model_dir, { + 'attraction': 'db/attraction_db_processed.json', + 'hospital': 'db/hospital_db_processed.json', + 'hotel': 'db/hotel_db_processed.json', + 'police': 'db/police_db_processed.json', + 'restaurant': 'db/restaurant_db_processed.json', + 'taxi': 'db/taxi_db_processed.json', + 'train': 'db/train_db_processed.json', + }) + self._build_vocab(model_dir) + + special_tokens = [ + self.pad_token, self.bos_token, self.eos_token, self.unk_token + ] + special_tokens.extend(self.add_sepcial_tokens()) + self.tokenizer = Tokenizer( + vocab_path=os.path.join(model_dir, 'vocab.txt'), + special_tokens=special_tokens, + tokenizer_type=config.BPETextField.tokenizer_type) + self.understand_ids = self.tokenizer.convert_tokens_to_ids( + self.understand_tokens) + self.policy_ids = self.tokenizer.convert_tokens_to_ids( + self.policy_tokens) + + return + + def get_ids(self, data: str): + result = [self.sos_u_id] + self.tokenizer.convert_tokens_to_ids( + self.tokenizer.tokenize( + self._get_convert_str(data))) + [self.eos_u_id] + return result + + def inverse_transpose_turn(self, turn_list): + """ + eval, one dialog at a time + """ + dialogs = {} + turn_num = len(turn_list) + dial_id = turn_list[0]['dial_id'] + dialogs[dial_id] = [] + for turn_idx in range(turn_num): + dial_turn = {} + turn 
= turn_list[turn_idx]
+            for key, value in turn.items():
+                if key == 'dial_id':
+                    continue
+                if key == 'pointer' and self.db is not None:
+                    turn_domain = turn['turn_domain'][-1]
+                    value = self.db.pointerBack(value, turn_domain)
+                dial_turn[key] = value
+            dialogs[dial_id].append(dial_turn)
+        return dialogs
+
+    def inverse_transpose_batch(self, turn_batch_list):
+        """
+        :param turn_batch_list: list of transposed dial batches
+        """
+        dialogs = {}
+        total_turn_num = len(turn_batch_list)
+        # initialize
+        for idx_in_batch, dial_id in enumerate(turn_batch_list[0]['dial_id']):
+            dialogs[dial_id] = []
+            for turn_n in range(total_turn_num):
+                dial_turn = {}
+                turn_batch = turn_batch_list[turn_n]
+                for key, v_list in turn_batch.items():
+                    if key == 'dial_id':
+                        continue
+                    value = v_list[idx_in_batch]
+                    if key == 'pointer' and self.db is not None:
+                        turn_domain = turn_batch['turn_domain'][idx_in_batch][
+                            -1]
+                        value = self.db.pointerBack(value, turn_domain)
+                    dial_turn[key] = value
+                dialogs[dial_id].append(dial_turn)
+        return dialogs
+
+    def get_batches(self, set_name):
+        """
+        compute dataset stats.
+        """
+        global dia_count
+        log_str = ''
+        name_to_set = {'train': self.train, 'test': self.test, 'dev': self.dev}
+        dial = name_to_set[set_name]
+        turn_bucket = self._bucket_by_turn(dial)
+        # self._shuffle_turn_bucket(turn_bucket)
+        all_batches = []
+
+        if set_name not in self.set_stats:
+            self.set_stats[set_name] = {}
+        num_training_steps = 0
+        num_turns = 0
+        num_dials = 0
+
+        for k in turn_bucket:
+            if set_name != 'test' and k == 1 or k >= 17:
+                continue
+            batches = self._construct_mini_batch(turn_bucket[k])
+            try:
+                log_str += 'turn num:%d, dial num: %d, batch num: %d last batch len: %d\n' % (
+                    k, len(turn_bucket[k]), len(batches), len(batches[-1]))
+            except IndexError:
+                log_str += 'turn num:%d, dial num: %d, batch num: %d last batch len: %d\n' % (
+                    k, len(turn_bucket[k]), len(batches), 0.0)
+            # print("turn num:%d, dial num:v%d, batch num: %d, "%(k, len(turn_bucket[k]), len(batches)))
+            num_training_steps += k * len(batches)
+            num_turns += k * len(turn_bucket[k])
+            num_dials += len(turn_bucket[k])
+            all_batches += batches
+        log_str += 'total batch num: %d\n' % len(all_batches)
+        # print('total batch num: %d'%len(all_batches))
+        # print('dialog count: %d'%dia_count)
+        # return all_batches
+
+        # log stats
+        # logging.info(log_str)
+        # cfg.num_training_steps = num_training_steps * cfg.epoch_num
+        self.set_stats[set_name][
+            'num_training_steps_per_epoch'] = num_training_steps  # turn-level steps
+        self.set_stats[set_name]['num_turns'] = num_turns
+        self.set_stats[set_name]['num_dials'] = num_dials
+
+        if set_name == 'train':
+            random.shuffle(all_batches)
+        return all_batches
+
+    def add_sepcial_tokens(self):
+        """
+        add special tokens to the GPT tokenizer;
+        serves a similar role to Vocab.construct(),
+        building a list of special tokens
+        """
+        special_tokens = []
+        prompt_tokens = self.understand_tokens + self.policy_tokens
+        special_tokens.extend(
+            ontology.get_special_tokens(other_tokens=prompt_tokens))
+
+        for word in ontology.all_domains + ['general']:
+            word = '[' + word + ']'
+            special_tokens.append(word)
+        for word in ontology.all_acts:
+            word = '[' + word + ']'
+            special_tokens.append(word)
+        for word in self.vocab._word2idx.keys():
+            if word.startswith('[value_') and word.endswith(']'):
+                special_tokens.append(word)
+
+        return special_tokens
+
+    def _build_vocab(self, model_dir: str):
+        self.vocab = utils.MultiWOZVocab(3000)
+        vp = os.path.join('{}/vocab'.format(model_dir))
+        self.vocab.load_vocab(vp)
+        return self.vocab.vocab_size
+
+    def _get_convert_str(self, sent):
+        assert isinstance(sent, str)
+        return ' '.join([
+            self.tokenizer.spec_convert_dict.get(tok, tok)
+            for tok in sent.split()
+        ])
+
+    def bspan_to_DBpointer(self, bspan, turn_domain):
+        constraint_dict = self.bspan_to_constraint_dict(bspan)
+        # print(constraint_dict)
+        matnums = self.db.get_match_num(constraint_dict)
+        match_dom = turn_domain[0] if len(turn_domain) == 1 else turn_domain[1]
+        match_dom = match_dom[1:-1] if match_dom.startswith('[') else match_dom
+        match = matnums[match_dom]
+        # vector = self.db.addDBPointer(match_dom, match)
+        vector = self.db.addDBIndicator(match_dom, match)
+        return vector
+
+    def bspan_to_constraint_dict(self, bspan, bspn_mode='bspn'):
+        """
+        ['[hotel]', 'pricerange', 'cheap', 'type', 'hotel'] -> {'hotel': {'pricerange': 'cheap', 'type': 'hotel'}}
+        """
+        bspan = bspan.split() if isinstance(bspan, str) else bspan
+        constraint_dict = {}
+        domain = None
+        conslen = len(bspan)
+        for idx, cons in enumerate(bspan):
+            cons = self.vocab.decode(cons) if type(cons) is not str else cons
+            if cons == '<eos_b>':
+                break
+            if '[' in cons:
+                if cons[1:-1] not in ontology.all_domains:
+                    continue
+                domain = cons[1:-1]
+            elif cons in ontology.get_slot:
+                if domain is None:
+                    continue
+                if cons == 'people':
+                    # handle confusion of value name "people's portraits..." and slot people
+                    try:
+                        ns = bspan[idx + 1]
+                        ns = self.vocab.decode(ns) if type(
+                            ns) is not str else ns
+                        if ns == "'s":
+                            continue
+                    except IndexError:
+                        continue
+                if not constraint_dict.get(domain):
+                    constraint_dict[domain] = {}
+                if bspn_mode == 'bsdx':
+                    constraint_dict[domain][cons] = 1
+                    continue
+                vidx = idx + 1
+                if vidx == conslen:
+                    break
+                vt_collect = []
+                vt = bspan[vidx]
+                vt = self.vocab.decode(vt) if type(vt) is not str else vt
+                while vidx < conslen and vt != '<eos_b>' and '[' not in vt and vt not in ontology.get_slot:
+                    vt_collect.append(vt)
+                    vidx += 1
+                    if vidx == conslen:
+                        break
+                    vt = bspan[vidx]
+                    vt = self.vocab.decode(vt) if type(vt) is not str else vt
+                if vt_collect:
+                    constraint_dict[domain][cons] = ' '.join(vt_collect)
+
+        return constraint_dict
+
+    def convert_batch_turn(self, turn_batch, pv_batch, first_turn=False):
+        """
+        URURU here means turn-level training (data arrangement), as opposed to
+        session-level training (convert_batch_session). This differs from its
+        meaning at eval time, where both are generated turn by turn; see the
+        comments on the related eval functions for the eval-time meaning.
+
+        convert the current and the last turn
+        concat [U_0,R_0,...,U_{t-1}, R_{t-1}, U_t, B_t, A_t, R_t]
+        first turn: [U_t, B_t, A_t, R_t]
+        try: [user, bspn, db, aspn, resp]
+        """
+        inputs = []
+        if first_turn:
+            batch_zipped = zip(turn_batch['user'], turn_batch['bspn'],
+                               turn_batch['db'], turn_batch['aspn'],
+                               turn_batch['resp'])
+            for u, b, db, a, r in batch_zipped:
+                if self.use_true_curr_bspn:
+                    src = [u + b + db]
+                    tgt = a + r
+                else:
+                    src = [u]
+                    tgt = b + db + a + r
+                inputs.append({'src': src, 'tgt': tgt})
+                pv = [src[-1], tgt]
+                pv_batch.append(pv)
+        else:
+            batch_zipped = zip(pv_batch, turn_batch['user'],
+                               turn_batch['bspn'], turn_batch['db'],
+                               turn_batch['aspn'], turn_batch['resp'])
+            for i, (pv, u, b, db, a, r) in enumerate(batch_zipped):
+                if self.use_true_curr_bspn:
+                    src = pv + [u + b + db]
+                    tgt = a + r
+                else:
+                    src = pv + [u]
+                    tgt = b + db + a + r
+                inputs.append({'src': src, 'tgt': tgt})
+                pv = [src[-1], tgt]
+                pv_batch[i].extend(pv)
+
+        return inputs, pv_batch
+
+    def wrap_result_lm(self, result_dict, eos_syntax=None):
+        results = []
+        eos_syntax = ontology.eos_tokens if not eos_syntax else eos_syntax
+        sos_syntax = ontology.sos_tokens
+        # ground truth bs, as, ds.. generate response
+        field = [
+            'dial_id', 'turn_num', 'user', 'bspn_gen', 'bsdx', 'resp_gen',
+            'resp', 'aspn_gen', 'aspn', 'dspn_gen', 'dspn', 'bspn', 'pointer',
+            'qspn_gen', 'qspn'
+        ]
+
+        for dial_id, turns in result_dict.items():
+            entry = {'dial_id': dial_id, 'turn_num': len(turns)}
+            for f in field[2:]:
+                entry[f] = ''  # TODO ???
+            results.append(entry)
+            for turn_idx, turn in enumerate(turns):
+                entry = {'dial_id': dial_id}
+                for key in field:
+                    if key in ['dial_id']:
+                        continue
+                    v = turn.get(key, '')
+                    if key == 'turn_domain':
+                        v = ' '.join(v)
+
+                    if key in eos_syntax and v != '':
+                        # remove eos tokens
+                        v = self.tokenizer.decode(v)
+                        v = v.split()
+                        # remove eos/sos in span
+                        if eos_syntax[key] in v:
+                            v.remove(eos_syntax[key])
+                        if sos_syntax[key] in v:
+                            v.remove(sos_syntax[key])
+                        v = ' '.join(v)
+                    else:
+                        pass  # v = v
+                    entry[key] = v
+
+                results.append(entry)
+
+        return results, field
+
+    def convert_turn_eval(self, turn, pv_turn, first_turn=False):
+        """
+        input: [all previous ubar, U_t, B_t, A_t] predict R_t
+        first turn: [U_t, B_t, A_t] predict R_t
+
+        regarding the context, all previous ubar is too slow, try the previous ubar
+        """
+        inputs = {}
+
+        context_list = []
+        prompt_id = None
+        if self.use_true_curr_bspn:
+            if self.use_true_curr_aspn:  # only predict resp
+                context_list = ['user', 'bspn', 'db', 'aspn']
+                prompt_id = self.sos_r_id
+            else:  # predicted aspn
+                context_list = ['user', 'bspn', 'db']
+                prompt_id = self.sos_a_id
+        else:  # predict bspn aspn resp. db are not predicted. this part tbd.
+            context_list = ['user']
+            prompt_id = self.sos_b_id
+
+        if first_turn:
+            context = []
+            for c in context_list:
+                context += turn[c]
+
+            inputs['src'] = [context]
+            inputs['labels'] = [context]
+        else:
+            context = []
+            for c in context_list:
+                context += turn[c]
+
+            if self.use_true_curr_bspn:
+                pv_context = pv_turn['labels'] + [
+                    pv_turn['aspn'] + pv_turn['resp']
+                ]
+            else:
+                pv_context = pv_turn['labels'] + [
+                    pv_turn['bspn'] + pv_turn['db'] + pv_turn['aspn']
+                    + pv_turn['resp']
+                ]
+
+            # prompt response, add sos_r
+            inputs['src'] = pv_context + [context]
+
+            if self.use_all_previous_context:
+                inputs['labels'] = pv_context + [
+                    context
+                ]  # use all previous ubar history
+            else:
+                inputs['labels'] = [context]  # use previous turn
+
+        return inputs, prompt_id
diff --git a/modelscope/preprocessors/space/fields/intent_field.py b/modelscope/preprocessors/space/fields/intent_field.py
new file mode 100644
index 00000000..0c8c909e
--- /dev/null
+++ b/modelscope/preprocessors/space/fields/intent_field.py
@@ -0,0 +1,1074 @@
+"""
+Intent Field class
+"""
+import glob
+import multiprocessing
+import os
+import random
+import re
+import time
+from collections import defaultdict
+from itertools import chain
+
+import json
+import numpy as np
+from tqdm import tqdm
+
+from modelscope.preprocessors.space.tokenizer import Tokenizer
+from modelscope.utils.nlp.space import ontology, utils
+from modelscope.utils.nlp.space.scores import hierarchical_set_score
+from modelscope.utils.nlp.space.utils import list2np
+
+
+class BPETextField(object):
+
+    pad_token = '[PAD]'
+    bos_token = '[BOS]'
+    eos_token = '[EOS]'
+    unk_token = '[UNK]'
+    mask_token = '[MASK]'
+    sos_u_token = '<sos_u>'
+    eos_u_token = '<eos_u>'
+    sos_b_token = '<sos_b>'
+    eos_b_token = '<eos_b>'
+    sos_db_token = '<sos_db>'
+    eos_db_token = '<eos_db>'
+    sos_a_token = '<sos_a>'
+    eos_a_token = '<eos_a>'
+    sos_r_token = '<sos_r>'
+    eos_r_token = '<eos_r>'
+
+    def __init__(self, model_dir, config):
+        self.score_matrixs = {}
+        self.prompt_num_for_understand =
config.BPETextField.prompt_num_for_understand + self.prompt_num_for_policy = config.BPETextField.prompt_num_for_policy + self.understand_tokens = ontology.get_understand_tokens( + self.prompt_num_for_understand) + self.policy_tokens = ontology.get_policy_tokens( + self.prompt_num_for_policy) + special_tokens = [ + self.pad_token, self.bos_token, self.eos_token, self.unk_token + ] + special_tokens.extend(self.add_sepcial_tokens()) + self.tokenizer = Tokenizer( + vocab_path=os.path.join(model_dir, 'vocab.txt'), + special_tokens=special_tokens, + tokenizer_type=config.BPETextField.tokenizer_type) + self.understand_ids = self.numericalize(self.understand_tokens) + self.policy_ids = self.numericalize(self.policy_tokens) + + self.tokenizer_type = config.BPETextField.tokenizer_type + self.filtered = config.BPETextField.filtered + self.max_len = config.BPETextField.max_len + self.min_utt_len = config.BPETextField.min_utt_len + self.max_utt_len = config.BPETextField.max_utt_len + self.min_ctx_turn = config.BPETextField.min_ctx_turn + self.max_ctx_turn = config.BPETextField.max_ctx_turn + self.policy = config.BPETextField.policy + self.generation = config.BPETextField.generation + self.with_mlm = config.Dataset.with_mlm + self.with_query_bow = config.BPETextField.with_query_bow + self.with_contrastive = config.Dataset.with_contrastive + self.num_process = config.Dataset.num_process + self.dynamic_score = config.Dataset.dynamic_score + self.abandon_label = config.Dataset.abandon_label + self.trigger_role = config.Dataset.trigger_role + self.trigger_data = config.Dataset.trigger_data.split( + ',') if config.Dataset.trigger_data else [] + + # data_paths = list(os.path.dirname(c) for c in sorted( + # glob.glob(hparams.data_dir + '/**/' + f'train.{hparams.tokenizer_type}.jsonl', recursive=True))) + # self.data_paths = self.filter_data_path(data_paths=data_paths) + # self.labeled_data_paths = [data_path for data_path in self.data_paths if 'UniDA' in data_path] + # self.unlabeled_data_paths = [data_path for data_path in self.data_paths if 'UnDial' in data_path] + # assert len(self.unlabeled_data_paths) + len(self.labeled_data_paths) == len(self.data_paths) + # assert len(self.labeled_data_paths) or len(self.unlabeled_data_paths), 'No dataset is loaded' + + @property + def vocab_size(self): + return self.tokenizer.vocab_size + + @property + def num_specials(self): + return len(self.tokenizer.special_tokens) + + @property + def pad_id(self): + return self.tokenizer.convert_tokens_to_ids([self.pad_token])[0] + + @property + def bos_id(self): + return self.tokenizer.convert_tokens_to_ids([self.bos_token])[0] + + @property + def eos_id(self): + return self.tokenizer.convert_tokens_to_ids([self.eos_token])[0] + + @property + def unk_id(self): + return self.tokenizer.convert_tokens_to_ids([self.unk_token])[0] + + @property + def mask_id(self): + return self.tokenizer.convert_tokens_to_ids([self.mask_token])[0] + + @property + def sos_u_id(self): + return self.tokenizer.convert_tokens_to_ids([self.sos_u_token])[0] + + @property + def eos_u_id(self): + return self.tokenizer.convert_tokens_to_ids([self.eos_u_token])[0] + + @property + def sos_b_id(self): + return self.tokenizer.convert_tokens_to_ids([self.sos_b_token])[0] + + @property + def eos_b_id(self): + return self.tokenizer.convert_tokens_to_ids([self.eos_b_token])[0] + + @property + def sos_db_id(self): + return self.tokenizer.convert_tokens_to_ids([self.sos_db_token])[0] + + @property + def eos_db_id(self): + return 
self.tokenizer.convert_tokens_to_ids([self.eos_db_token])[0]
+
+    @property
+    def sos_a_id(self):
+        return self.tokenizer.convert_tokens_to_ids([self.sos_a_token])[0]
+
+    @property
+    def eos_a_id(self):
+        return self.tokenizer.convert_tokens_to_ids([self.eos_a_token])[0]
+
+    @property
+    def sos_r_id(self):
+        return self.tokenizer.convert_tokens_to_ids([self.sos_r_token])[0]
+
+    @property
+    def eos_r_id(self):
+        return self.tokenizer.convert_tokens_to_ids([self.eos_r_token])[0]
+
+    @property
+    def bot_id(self):
+        """
+        Distinguishes between the two roles, user and bot.
+        0 and 1 are not vocabulary indices but dedicated role indices of size
+        2, matching the hyperparameter 'num_type_embeddings'.
+        """
+        return 0
+
+    @property
+    def user_id(self):
+        """
+        Distinguishes between the two roles, user and bot.
+        0 and 1 are not vocabulary indices but dedicated role indices of size
+        2, matching the hyperparameter 'num_type_embeddings'.
+        """
+        return 1
+
+    def add_sepcial_tokens(self):
+        prompt_tokens = self.understand_tokens + self.policy_tokens
+        return ontology.get_special_tokens(other_tokens=prompt_tokens)
+
+    def filter_data_path(self, data_paths):
+        if self.trigger_data:
+            filtered_data_paths = []
+            for data_path in data_paths:
+                for data_name in self.trigger_data:
+                    if data_path.endswith(f'/{data_name}'):
+                        filtered_data_paths.append(data_path)
+                        break
+        else:
+            filtered_data_paths = data_paths
+        return filtered_data_paths
+
+    def load_score_matrix(self, data_type, data_iter=None):
+        """
+        load score matrix for all labeled datasets
+        """
+        for data_path in self.labeled_data_paths:
+            file_index = os.path.join(
+                data_path, f'{data_type}.{self.tokenizer_type}.jsonl')
+            file = os.path.join(data_path, f'{data_type}.Score.npy')
+            if self.dynamic_score:
+                score_matrix = {}
+                print(f"Created 1 score cache dict for data in '{file_index}'")
+            else:
+                # TODO add post score matrix
+                assert os.path.exists(file), f"{file} doesn't exist"
+                print(f"Loading 1 score matrix from '{file}' ...")
+                fp = np.memmap(file, dtype='float32', mode='r')
+                assert len(fp.shape) == 1
+                num = int(np.sqrt(fp.shape[0]))
+                score_matrix = fp.reshape(num, num)
+                print(f"Loaded 1 score matrix for data in '{file_index}'")
+            self.score_matrixs[file_index] = score_matrix
+
+    def random_word(self, chars):
+        output_label = []
+        output_chars = []
+
+        for i, char in enumerate(chars):
+            # TODO delete this part to learn special tokens
+            if char in [
+                    self.sos_u_id, self.eos_u_id, self.sos_r_id, self.eos_r_id
+            ]:
+                output_chars.append(char)
+                output_label.append(self.pad_id)
+                continue
+
+            prob = random.random()
+            if prob < 0.15:
+                prob /= 0.15
+
+                # 80% randomly change token to mask token
+                if prob < 0.8:
+                    output_chars.append(self.mask_id)
+
+                # 10% randomly change token to random token
+                elif prob < 0.9:
+                    output_chars.append(
+                        random.randint(1, self.vocab_size
+                                       - 1))  # start from 1, to exclude pad_id
+
+                # 10% randomly change token to current token
+                else:
+                    output_chars.append(char)
+
+                output_label.append(char)
+
+            else:
+                output_chars.append(char)
+                output_label.append(self.pad_id)
+
+        return output_chars, output_label
+
+    def create_masked_lm_predictions(self, sample):
+        src = sample['src']
+        src_span_mask = sample['src_span_mask']
+        mlm_inputs = []
+        mlm_labels = []
+        for chars, chars_span_mask in zip(src, src_span_mask):
+            if sum(chars_span_mask):
+                mlm_input, mlm_label = [], []
+                for char, char_mask in zip(chars, chars_span_mask):
+                    if char_mask:
+                        mlm_input.append(self.mask_id)
+                        mlm_label.append(char)
+                    else:
+                        mlm_input.append(char)
+                        mlm_label.append(self.pad_id)
+            else:
+                mlm_input, mlm_label = self.random_word(chars)
+            mlm_inputs.append(mlm_input)
+            mlm_labels.append(mlm_label)
+
+        sample['mlm_inputs'] = mlm_inputs
+        sample['mlm_labels'] = mlm_labels
+        return sample
+
+    def create_span_masked_lm_predictions(self, sample):
+        src = sample['src']
+        src_span_mask = sample['src_span_mask']
+        mlm_inputs = []
+        mlm_labels = []
+        for chars, chars_span_mask in zip(src, src_span_mask):
+            mlm_input, mlm_label = [], []
+            for char, char_mask in zip(chars, chars_span_mask):
+                if char_mask:
+                    mlm_input.append(self.mask_id)
+                    mlm_label.append(char)
+                else:
+                    mlm_input.append(char)
+                    mlm_label.append(self.pad_id)
+            mlm_inputs.append(mlm_input)
+            mlm_labels.append(mlm_label)
+
+        sample['mlm_inputs'] = mlm_inputs
+        sample['mlm_labels'] = mlm_labels
+        return sample
+
+    def create_token_masked_lm_predictions(self, sample):
+        mlm_inputs = sample['mlm_inputs']
+        mlm_labels = sample['mlm_labels']
+
+        for i, span_mlm_label in enumerate(mlm_labels):
+            if not sum(span_mlm_label):
+                mlm_input, mlm_label = self.random_word(mlm_inputs[i])
+                mlm_inputs[i] = mlm_input
+                mlm_labels[i] = mlm_label
+
+        return sample
+
+    def numericalize(self, tokens):
+        """
+        only performs "convert_tokens_to_ids"; the input must already have been
+        tokenized into sub-words by "tokenizer.tokenize"
+        """
+        assert isinstance(tokens, list)
+        if len(tokens) == 0:
+            return []
+        element = tokens[0]
+        if isinstance(element, list):
+            return [self.numericalize(s) for s in tokens]
+        else:
+            return self.tokenizer.convert_tokens_to_ids(tokens)
+
+    def denumericalize(self, numbers):
+        """
+        first "convert_ids_to_tokens", then merge sub-words back into the
+        original words
+        """
+        assert isinstance(numbers, list)
+        if len(numbers) == 0:
+            return []
+        element = numbers[0]
+        if isinstance(element, list):
+            return [self.denumericalize(x) for x in numbers]
+        else:
+            return self.tokenizer.decode(
+                numbers,
+                ignore_tokens=[self.bos_token, self.eos_token, self.pad_token])
+
+    def save_examples(self, examples, filename):
+        start = time.time()
+        if filename.endswith('npy'):
+            print(f"Saving 1 object to '{filename}' ...")
+            assert len(
+                examples.shape) == 2 and examples.shape[0] == examples.shape[1]
+            num = examples.shape[0]
+            fp = np.memmap(
+                filename, dtype='float32', mode='w+', shape=(num, num))
+            fp[:] = examples[:]
+            fp.flush()
+            elapsed = time.time() - start
+            print(f'Saved 1 object (elapsed {elapsed:.2f}s)')
+        elif filename.endswith('jsonl'):
+            print(f"Saving examples to '{filename}' ...")
+            with open(filename, 'w', encoding='utf-8') as fp:
+                for ex in examples:
+                    fp.write(json.dumps(ex) + '\n')
+            elapsed = time.time() - start
+            print(f'Saved {len(examples)} examples (elapsed {elapsed:.2f}s)')
+        else:
+            print(f"Saving examples to '{filename}' ...")
+            raise ValueError(f'Unsupported file format: {filename}')
+
+    def load_examples(self, filename):
+        start = time.time()
+        if filename.endswith('npy'):
+            print(f"Loading 1 object from '{filename}' ...")
+            fp = np.memmap(filename, dtype='float32', mode='r')
+            assert len(fp.shape) == 1
+            num = int(np.sqrt(fp.shape[0]))
+            examples = fp.reshape(num, num)
+            elapsed = time.time() - start
+            print(f'Loaded 1 object (elapsed {elapsed:.2f}s)')
+        else:
+            print(f"Loading examples from '{filename}' ...")
+            with open(filename, 'r', encoding='utf-8') as fp:
+                examples = list(map(lambda s: json.loads(s.strip()), fp))
+            elapsed = time.time() - start
+            print(f'Loaded {len(examples)} examples (elapsed {elapsed:.2f}s)')
+        return examples
+
+    def utt_filter_pred(self, utt):
+        return self.min_utt_len <= len(utt) \
+            and (not self.filtered or len(utt) <= self.max_utt_len)
+
+    def utts_filter_pred(self, utts):
+        return self.min_ctx_turn <= len(utts) \
+ and (not self.filtered or len(utts) <= self.max_ctx_turn) + + def get_token_pos(self, tok_list, value_label): + find_pos = [] + found = False + label_list = [ + item + for item in map(str.strip, re.split('(\\W+)', value_label.lower())) + if len(item) > 0 + ] + len_label = len(label_list) + for i in range(len(tok_list) + 1 - len_label): + if tok_list[i:i + len_label] == label_list: + find_pos.append((i, i + len_label)) # start, exclusive_end + found = True + return found, find_pos + + def build_score_matrix(self, examples): + """ + build symmetric score matrix + """ + assert self.num_process == 1 + print(f'Building score matrix from examples ...') + num = len(examples) + score_matrix = np.eye( + num, num, dtype='float32' + ) # in case of empty label of self, resulting in score 0. + + for i in tqdm(range(num)): + for j in range(i): + # TODO change the score method + score = hierarchical_set_score( + frame1=examples[i]['label'], frame2=examples[j]['label']) + score_matrix[i][j] = score + score_matrix[j][i] = score + + print(f'Built score matrix') + return score_matrix + + def build_score_matrix_on_the_fly(self, + ids, + labels, + data_file, + is_post=False): + """ + build symmetric score matrix on the fly + @is_post: True for resp label of sample i and j, False for query label of sample i and j + """ + num = len(labels) + tag = 'r' if is_post else 'q' + assert len(ids) == len(labels) + score_matrix = np.eye( + num, num, dtype='float32' + ) # in case of empty label of self, resulting in score 0. + + for i in range(num): + for j in range(i): + score = self.score_matrixs[data_file].get( + f'{ids[i]}-{ids[j]}-{tag}', None) + if score is None: + score = self.score_matrixs[data_file].get( + f'{ids[j]}-{ids[i]}-{tag}', None) + if score is None: + # TODO change the score method + score = hierarchical_set_score( + frame1=labels[i], frame2=labels[j]) + self.score_matrixs[data_file][ + f'{ids[i]}-{ids[j]}-{tag}'] = score + score_matrix[i][j] = score + score_matrix[j][i] = score + + return score_matrix + + def build_score_matrix_func(self, examples, start, exclusive_end): + """ + build sub score matrix + """ + num = len(examples) + process_id = os.getpid() + description = f'PID: {process_id} Start: {start} End: {exclusive_end}' + print( + f'PID-{process_id}: Building {start} to {exclusive_end} lines score matrix from examples ...' 
+ ) + score_matrix = np.zeros((exclusive_end - start, num), dtype='float32') + + for abs_i, i in enumerate( + tqdm(range(start, exclusive_end), desc=description)): + for j in range(num): + # TODO change the score method + score = hierarchical_set_score( + frame1=examples[i]['label'], frame2=examples[j]['label']) + score_matrix[abs_i][j] = score + + print( + f'PID-{process_id}: Built {start} to {exclusive_end} lines score matrix' + ) + return {'start': start, 'score_matrix': score_matrix} + + def build_score_matrix_multiprocessing(self, examples): + """ + build score matrix + """ + assert self.num_process >= 2 and multiprocessing.cpu_count() >= 2 + print(f'Building score matrix from examples ...') + results = [] + num = len(examples) + sub_num, res_num = num // self.num_process, num % self.num_process + patches = [sub_num] * (self.num_process - 1) + [sub_num + res_num] + + start = 0 + pool = multiprocessing.Pool(processes=self.num_process) + for patch in patches: + exclusive_end = start + patch + results.append( + pool.apply_async(self.build_score_matrix_func, + (examples, start, exclusive_end))) + start = exclusive_end + pool.close() + pool.join() + + sub_score_matrixs = [result.get() for result in results] + sub_score_matrixs = sorted( + sub_score_matrixs, key=lambda sub: sub['start']) + sub_score_matrixs = [ + sub_score_matrix['score_matrix'] + for sub_score_matrix in sub_score_matrixs + ] + score_matrix = np.concatenate(sub_score_matrixs, axis=0) + assert score_matrix.shape == (num, num) + np.fill_diagonal( + score_matrix, + 1.) # in case of empty label of self, resulting in score 0. + + print(f'Built score matrix') + return score_matrix + + def extract_span_texts(self, text, label): + span_texts = [] + for domain, frame in label.items(): + for act, slot_values in frame.items(): + for slot, values in slot_values.items(): + for value in values: + if value['span']: + span_texts.append( + text[value['span'][0]:value['span'][1]]) + elif str(value['value']).strip().lower() in text.strip( + ).lower(): + span_texts.append(str(value['value'])) + return span_texts + + def fix_label(self, label): + for domain, frame in label.items(): + if not frame: + return {} + for act, slot_values in frame.items(): + if act == 'DEFAULT_INTENT' and not slot_values: + return {} + return label + + def build_examples_multi_turn(self, data_file, data_type='train'): + print(f"Reading examples from '{data_file}' ...") + examples = [] + ignored = 0 + + with open(data_file, 'r', encoding='utf-8') as f: + input_data = json.load(f) + for dialog_id in tqdm(input_data): + turns = input_data[dialog_id]['turns'] + history, history_role, history_span_mask, history_label = [], [], [], [] + for t, turn in enumerate(turns): + label = turn['label'] + role = turn['role'] + text = turn['text'] + utterance, span_mask = [], [] + + token_list = [ + tok for tok in map(str.strip, + re.split('(\W+)', text.lower())) + if len(tok) > 0 + ] + span_list = np.zeros(len(token_list), dtype=np.int32) + span_texts = self.extract_span_texts( + text=text, label=label) + + for span_text in span_texts: + found, find_pos = self.get_token_pos( + tok_list=token_list, value_label=span_text) + if found: + for start, exclusive_end in find_pos: + span_list[start:exclusive_end] = 1 + + token_list = [ + self.tokenizer.tokenize(token) for token in token_list + ] + span_list = [[tag] * len(token_list[i]) + for i, tag in enumerate(span_list)] + for sub_tokens in token_list: + utterance.extend(sub_tokens) + for sub_spans in span_list: + span_mask.extend(sub_spans) 
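+                    # span_mask now carries one 0/1 flag per BPE sub-token,
+                    # aligned one-to-one with utterance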
+ assert len(utterance) == len(span_mask) + + history.append(utterance) + history_role.append(role) + history_span_mask.append(span_mask) + history_label.append(self.fix_label(label)) + + if ( + (self.utts_filter_pred(history[:-1]) + and all(map(self.utt_filter_pred, history))) + or data_type == 'test' + ) and role in self.trigger_role and t: # TODO consider test + src = [ + s[-self.max_utt_len:] + for s in history[:-1][-self.max_ctx_turn:] + ] + src_span_mask = [ + s[-self.max_utt_len:] for s in + history_span_mask[:-1][-self.max_ctx_turn:] + ] + roles = [ + role + for role in history_role[:-1][-self.max_ctx_turn:] + ] + src = [[self.sos_u_id] + self.numericalize(s) + + [self.eos_u_id] + if roles[i] == 'user' else [self.sos_r_id] + + self.numericalize(s) + [self.eos_r_id] + for i, s in enumerate(src)] + src_span_mask = [[0] + list(map(int, s)) + [0] + for s in src_span_mask] + + tgt = [self.sos_r_id] + self.numericalize( + history[-1]) + [self.eos_r_id] + if data_type != 'test': + tgt = tgt[:self.max_utt_len + 2] + + ex = { + 'dialog_id': dialog_id, + 'turn_id': turn['turn_id'], + 'src': src, + 'src_span_mask': src_span_mask, + 'tgt': tgt, + 'query_label': history_label[-2], + 'resp_label': history_label[-1], + 'extra_info': turn.get('extra_info', '') + } + examples.append(ex) + else: + ignored += 1 + + # add span mlm inputs and span mlm labels in advance + if self.with_mlm: + examples = [ + self.create_span_masked_lm_predictions(example) + for example in examples + ] + + # add absolute id of the dataset for indexing scores in its score matrix + for i, example in enumerate(examples): + example['id'] = i + + print( + f'Built {len(examples)} {data_type.upper()} examples ({ignored} filtered)' + ) + return examples + + def preprocessor(self, text_list): + role = 'user' + examples = [] + + for text in text_list: + history, history_role, history_span_mask = [], [], [] + utterance, span_mask = [], [] + token_list = [ + tok for tok in map(str.strip, re.split('(\W+)', text.lower())) + if len(tok) > 0 + ] + span_list = np.zeros(len(token_list), dtype=np.int32) + token_list = [ + self.tokenizer.tokenize(token) for token in token_list + ] + span_list = [[tag] * len(token_list[i]) + for i, tag in enumerate(span_list)] + + for sub_tokens in token_list: + utterance.extend(sub_tokens) + for sub_spans in span_list: + span_mask.extend(sub_spans) + assert len(utterance) == len(span_mask) + + history.append(utterance) + history_role.append(role) + history_span_mask.append(span_mask) + + src = [s[-self.max_utt_len:] for s in history[-self.max_ctx_turn:]] + src_span_mask = [ + s[-self.max_utt_len:] + for s in history_span_mask[-self.max_ctx_turn:] + ] + roles = [role for role in history_role[-self.max_ctx_turn:]] + src = [[self.sos_u_id] + self.numericalize(s) + + [self.eos_u_id] if roles[i] == 'user' else [self.sos_r_id] + + self.numericalize(s) + [self.eos_r_id] + for i, s in enumerate(src)] + src_span_mask = [[0] + list(map(int, s)) + [0] + for s in src_span_mask] + + ex = { + 'dialog_id': 'inference', + 'turn_id': 0, + 'role': role, + 'src': src, + 'src_span_mask': src_span_mask, + 'query_label': { + 'DEFAULT_DOMAIN': { + 'card_arrival': {} + } + }, + 'extra_info': { + 'intent_label': -1 + } + } + examples.append(ex) + # add span mlm inputs and span mlm labels in advance + if self.with_mlm: + examples = [ + self.create_span_masked_lm_predictions(example) + for example in examples + ] + + # add absolute id of the dataset for indexing scores in its score matrix + for i, example in enumerate(examples): + 
example['id'] = i + + return examples + + def build_examples_single_turn(self, data_file, data_type='train'): + print(f"Reading examples from '{data_file}' ...") + examples = [] + ignored = 0 + + with open(data_file, 'r', encoding='utf-8') as f: + input_data = json.load(f) + for dialog_id in tqdm(input_data): + turns = input_data[dialog_id]['turns'] + history, history_role, history_span_mask = [], [], [] + for turn in turns: + label = turn['label'] + role = turn['role'] + text = turn['text'] + utterance, span_mask = [], [] + + token_list = [ + tok for tok in map(str.strip, + re.split('(\W+)', text.lower())) + if len(tok) > 0 + ] + span_list = np.zeros(len(token_list), dtype=np.int32) + span_texts = self.extract_span_texts( + text=text, label=label) + + for span_text in span_texts: + found, find_pos = self.get_token_pos( + tok_list=token_list, value_label=span_text) + if found: + for start, exclusive_end in find_pos: + span_list[start:exclusive_end] = 1 + + token_list = [ + self.tokenizer.tokenize(token) for token in token_list + ] + span_list = [[tag] * len(token_list[i]) + for i, tag in enumerate(span_list)] + for sub_tokens in token_list: + utterance.extend(sub_tokens) + for sub_spans in span_list: + span_mask.extend(sub_spans) + assert len(utterance) == len(span_mask) + + history.append(utterance) + history_role.append(role) + history_span_mask.append(span_mask) + + if ((self.utts_filter_pred(history) + and all(map(self.utt_filter_pred, history))) + or data_type == 'test' + ) and role in self.trigger_role: # TODO consider test + src = [ + s[-self.max_utt_len:] + for s in history[-self.max_ctx_turn:] + ] + src_span_mask = [ + s[-self.max_utt_len:] + for s in history_span_mask[-self.max_ctx_turn:] + ] + roles = [ + role for role in history_role[-self.max_ctx_turn:] + ] + src = [[self.sos_u_id] + self.numericalize(s) + + [self.eos_u_id] + if roles[i] == 'user' else [self.sos_r_id] + + self.numericalize(s) + [self.eos_r_id] + for i, s in enumerate(src)] + src_span_mask = [[0] + list(map(int, s)) + [0] + for s in src_span_mask] + + ex = { + 'dialog_id': dialog_id, + 'turn_id': turn['turn_id'], + 'role': role, + 'src': src, + 'src_span_mask': src_span_mask, + 'query_label': self.fix_label(label), + 'extra_info': turn.get('extra_info', '') + } + examples.append(ex) + else: + ignored += 1 + + # add span mlm inputs and span mlm labels in advance + if self.with_mlm: + examples = [ + self.create_span_masked_lm_predictions(example) + for example in examples + ] + + # add absolute id of the dataset for indexing scores in its score matrix + for i, example in enumerate(examples): + example['id'] = i + + print( + f'Built {len(examples)} {data_type.upper()} examples ({ignored} filtered)' + ) + return examples + + def collate_fn_multi_turn(self, samples): + batch_size = len(samples) + batch = {} + + src = [sp['src'] for sp in samples] + query_token, src_token, src_pos, src_turn, src_role = [], [], [], [], [] + for utts in src: + query_token.append(utts[-1]) + utt_lens = [len(utt) for utt in utts] + + # Token ids + src_token.append(list(chain(*utts))[-self.max_len:]) + + # Position ids + pos = [list(range(l)) for l in utt_lens] + src_pos.append(list(chain(*pos))[-self.max_len:]) + + # Turn ids + turn = [[len(utts) - i] * l for i, l in enumerate(utt_lens)] + src_turn.append(list(chain(*turn))[-self.max_len:]) + + # Role ids + role = [ + [self.bot_id if (len(utts) - i) % 2 == 0 else self.user_id] * l + for i, l in enumerate(utt_lens) + ] + src_role.append(list(chain(*role))[-self.max_len:]) + + # 
The src and tgt sequences are padded separately so the first decoded token stays aligned.
+        src_token = list2np(src_token, padding=self.pad_id)
+        src_pos = list2np(src_pos, padding=self.pad_id)
+        src_turn = list2np(src_turn, padding=self.pad_id)
+        src_role = list2np(src_role, padding=self.pad_id)
+        batch['src_token'] = src_token
+        batch['src_pos'] = src_pos
+        batch['src_type'] = src_role
+        batch['src_turn'] = src_turn
+        batch['src_mask'] = (src_token != self.pad_id).astype('int64')
+
+        if self.with_query_bow:
+            query_token = list2np(query_token, padding=self.pad_id)
+            batch['query_token'] = query_token
+            batch['query_mask'] = (query_token != self.pad_id).astype('int64')
+
+        if self.with_mlm:
+            mlm_token, mlm_label = [], []
+            raw_mlm_input = [sp['mlm_inputs'] for sp in samples]
+            raw_mlm_label = [sp['mlm_labels'] for sp in samples]
+            for inputs in raw_mlm_input:
+                mlm_token.append(list(chain(*inputs))[-self.max_len:])
+            for labels in raw_mlm_label:
+                mlm_label.append(list(chain(*labels))[-self.max_len:])
+
+            mlm_token = list2np(mlm_token, padding=self.pad_id)
+            mlm_label = list2np(mlm_label, padding=self.pad_id)
+            batch['mlm_token'] = mlm_token
+            batch['mlm_label'] = mlm_label
+            batch['mlm_mask'] = (mlm_label != self.pad_id).astype('int64')
+
+        if self.dynamic_score and self.with_contrastive and not self.abandon_label:
+            query_labels = [sp['query_label'] for sp in samples]
+            batch['query_labels'] = query_labels
+            if self.trigger_role == 'system':
+                resp_labels = [sp['resp_label'] for sp in samples]
+                batch['resp_labels'] = resp_labels
+            batch['label_ids'] = np.arange(
+                batch_size)  # to identify labels for each GPU in multi-GPU runs
+
+        if self.understand_ids:
+            understand = [self.understand_ids for _ in samples]
+            understand_token = np.array(understand).astype('int64')
+            batch['understand_token'] = understand_token
+            batch['understand_mask'] = (understand_token !=
+                                        self.pad_id).astype('int64')
+
+        if self.policy_ids and self.policy:
+            policy = [self.policy_ids for _ in samples]
+            policy_token = np.array(policy).astype('int64')
+            batch['policy_token'] = policy_token
+            batch['policy_mask'] = (policy_token !=
+                                    self.pad_id).astype('int64')
+
+        if 'tgt' in samples[0]:
+            tgt = [sp['tgt'] for sp in samples]
+
+            # Token ids & Label ids
+            tgt_token = list2np(tgt, padding=self.pad_id)
+
+            # Position ids
+            tgt_pos = np.zeros_like(tgt_token)
+            tgt_pos[:] = np.arange(tgt_token.shape[1], dtype=tgt_token.dtype)
+
+            # Turn ids
+            tgt_turn = np.zeros_like(tgt_token)
+
+            # Role ids
+            tgt_role = np.full_like(tgt_token, self.bot_id)
+
+            batch['tgt_token'] = tgt_token
+            batch['tgt_pos'] = tgt_pos
+            batch['tgt_type'] = tgt_role
+            batch['tgt_turn'] = tgt_turn
+            batch['tgt_mask'] = (tgt_token != self.pad_id).astype('int64')
+
+        if 'id' in samples[0]:
+            ids = [sp['id'] for sp in samples]
+            ids = np.array(ids).astype('int64')
+            batch['ids'] = ids
+
+        return batch, batch_size
+
+
+class IntentBPETextField(BPETextField):
+
+    def __init__(self, model_dir, config):
+        super(IntentBPETextField, self).__init__(model_dir, config)
+
+    def retrieve_examples(self,
+                          dataset,
+                          labels,
+                          inds,
+                          task,
+                          num=None,
+                          cache=None):
+        assert task == 'intent', 'Example-driven may only be used with intent prediction'
+        if num is None and labels is not None:
+            num = len(labels) * 2
+
+        # Populate cache
+        if cache is None:
+            cache = defaultdict(list)
+            for i, example in enumerate(dataset):
+                assert i == example['id']
+                cache[example['extra_info']['intent_label']].append(i)
+
+        # One example for each label
+        example_inds = []
+        for l in set(labels.tolist()):
+            if l == -1:
+                continue
+
+            ind = random.choice(cache[l])
+            retries = 0
+            while ind in inds.tolist() or type(ind) is not int:
+                ind = random.choice(cache[l])
+                retries += 1
+                if retries > len(dataset):
+                    break
+
+            example_inds.append(ind)
+
+        # Sample randomly until we hit batch size
+        while len(example_inds) < min(len(dataset), num):
+            ind = random.randint(0, len(dataset) - 1)
+            if ind not in example_inds and ind not in inds.tolist():
+                example_inds.append(ind)
+
+        # Create examples
+        example_batch = {}
+        examples = [dataset[i] for i in example_inds]
+        examples, _ = self.collate_fn_multi_turn(examples)
+        example_batch['example_src_token'] = examples['src_token']
+        example_batch['example_src_pos'] = examples['src_pos']
+        example_batch['example_src_type'] = examples['src_type']
+        example_batch['example_src_turn'] = examples['src_turn']
+        example_batch['example_src_mask'] = examples['src_mask']
+        example_batch['example_tgt_token'] = examples['tgt_token']
+        example_batch['example_tgt_mask'] = examples['tgt_mask']
+        example_batch['example_intent'] = examples['intent_label']
+
+        return example_batch
+
+    def collate_fn_multi_turn(self, samples):
+        batch_size = len(samples)
+        batch = {}
+
+        cur_roles = [sp['role'] for sp in samples]
+        src = [sp['src'] for sp in samples]
+        src_token, src_pos, src_turn, src_role = [], [], [], []
+        for utts, cur_role in zip(src, cur_roles):
+            utt_lens = [len(utt) for utt in utts]
+
+            # Token ids
+            src_token.append(list(chain(*utts))[-self.max_len:])
+
+            # Position ids
+            pos = [list(range(l)) for l in utt_lens]
+            src_pos.append(list(chain(*pos))[-self.max_len:])
+
+            # Turn ids
+            turn = [[len(utts) - i] * l for i, l in enumerate(utt_lens)]
+            src_turn.append(list(chain(*turn))[-self.max_len:])
+
+            # Role ids
+            if cur_role == 'user':
+                role = [[
+                    self.bot_id if (len(utts) - i) % 2 == 0 else self.user_id
+                ] * l for i, l in enumerate(utt_lens)]
+            else:
+                role = [[
+                    self.user_id if (len(utts) - i) % 2 == 0 else self.bot_id
+                ] * l for i, l in enumerate(utt_lens)]
+            src_role.append(list(chain(*role))[-self.max_len:])
+
+        # The src and tgt sequences are padded separately so the first decoded
+        # token stays aligned.
+        src_token = list2np(src_token, padding=self.pad_id)
+        src_pos = list2np(src_pos, padding=self.pad_id)
+        src_turn = list2np(src_turn, padding=self.pad_id)
+        src_role = list2np(src_role, padding=self.pad_id)
+        batch['src_token'] = src_token
+        batch['src_pos'] = src_pos
+        batch['src_type'] = src_role
+        batch['src_turn'] = src_turn
+        batch['src_mask'] = (src_token != self.pad_id).astype(
+            'int64')  # input mask
+
+        if self.with_mlm:
+            mlm_token, mlm_label = [], []
+            raw_mlm_input = [sp['mlm_inputs'] for sp in samples]
+            raw_mlm_label = [sp['mlm_labels'] for sp in samples]
+            for inputs in raw_mlm_input:
+                mlm_token.append(list(chain(*inputs))[-self.max_len:])
+            for labels in raw_mlm_label:
+                mlm_label.append(list(chain(*labels))[-self.max_len:])
+
+            mlm_token = list2np(mlm_token, padding=self.pad_id)
+            mlm_label = list2np(mlm_label, padding=self.pad_id)
+            batch['mlm_token'] = mlm_token
+            batch['mlm_label'] = mlm_label
+            batch['mlm_mask'] = (mlm_label != self.pad_id).astype(
+                'int64')  # label mask
+
+        if self.understand_ids:
+            tgt = [self.understand_ids for _ in samples]
+            tgt_token = np.array(tgt).astype('int64')
+            batch['tgt_token'] = tgt_token
+            batch['tgt_mask'] = (tgt_token != self.pad_id).astype(
+                'int64')  # input mask
+
+        if 'id' in samples[0]:
+            ids = [sp['id'] for sp in samples]
+            ids = np.array(ids).astype('int64')
+            batch['ids'] = ids
+
+        if self.dynamic_score and self.with_contrastive:
+            query_labels = [sp['query_label'] for sp in samples]
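+            # query_labels stay as raw label dicts here; similarity scores
+            # between them are computed later (e.g. via hierarchical_set_score
+            # above), while label_ids below simply index the samples so each
+            # GPU can recover its own labels in multi-GPU contrastive training.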
+            batch['query_labels'] = query_labels
+            batch['label_ids'] = np.arange(batch_size)
+
+        if 'intent_label' in samples[0]['extra_info']:
+            intent_label = [
+                sample['extra_info']['intent_label'] for sample in samples
+            ]
+            intent_label = np.array(intent_label).astype('int64')
+            batch['intent_label'] = intent_label
+
+        return batch, batch_size
diff --git a/modelscope/preprocessors/space/tokenizer.py b/modelscope/preprocessors/space/tokenizer.py
new file mode 100644
index 00000000..fe64493e
--- /dev/null
+++ b/modelscope/preprocessors/space/tokenizer.py
@@ -0,0 +1,665 @@
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
+import collections
+import logging
+import os
+import sys
+import unicodedata
+
+import json
+import regex as re
+
+
+def clean_string(string):
+    replace_mp = {
+        ' - ': '-',
+        " ' ": "'",
+        " n't": "n't",
+        " 'm": "'m",
+        ' do not': " don't",
+        " 's": "'s",
+        " 've": "'ve",
+        " 're": "'re"
+    }
+    for k, v in replace_mp.items():
+        string = string.replace(k, v)
+    return string
+
+
+class Tokenizer(object):
+
+    def __init__(self, vocab_path, special_tokens=[], tokenizer_type='Bert'):
+        self.tokenizer_type = tokenizer_type
+        if tokenizer_type == 'Bert':
+            self.spec_convert_dict = {
+                '[BOS]': '[unused0]',
+                '[EOS]': '[unused1]'
+            }
+            for token in special_tokens:
+                if token not in self.spec_convert_dict and token not in [
+                        '[PAD]', '[UNK]'
+                ]:
+                    self.spec_convert_dict[
+                        token] = f'[unused{len(self.spec_convert_dict)}]'
+            self.spec_revert_dict = {
+                v: k
+                for k, v in self.spec_convert_dict.items()
+            }
+            special_tokens = [
+                self.spec_convert_dict.get(tok, tok) for tok in special_tokens
+            ]
+            self.special_tokens = ('[UNK]', '[SEP]', '[PAD]', '[CLS]',
+                                   '[MASK]')
+            self.special_tokens += tuple(x for x in special_tokens
+                                         if x not in self.special_tokens)
+
+            self._tokenizer = BertTokenizer(
+                vocab_path, never_split=self.special_tokens)
+            for tok in self.special_tokens:
+                # Special tokens must already exist in the vocabulary;
+                # registering them here makes each one occupy a whole slot, so
+                # it is never split into subwords. Tokens missing from the
+                # vocabulary can be mapped onto the vocab's [unused] symbols
+                # via spec_convert_dict.
+                assert tok in self._tokenizer.vocab, f"special token '{tok}' is not in the vocabulary"
+            self.vocab_size = len(self._tokenizer.vocab)
+        elif tokenizer_type == 'GPT2':
+            self.spec_convert_dict = {'[UNK]': ''}
+            self.spec_revert_dict = {
+                v: k
+                for k, v in self.spec_convert_dict.items()
+            }
+            special_tokens = [
+                tok for tok in special_tokens
+                if tok not in self.spec_convert_dict
+            ]
+            vocab_file = os.path.join(vocab_path, 'vocab.json')
+            merges_file = os.path.join(vocab_path, 'merges.txt')
+            self._tokenizer = GPT2Tokenizer(
+                vocab_file, merges_file, special_tokens=special_tokens)
+            self.num_specials = len(special_tokens)
+            self.vocab_size = len(self._tokenizer)
+        else:
+            raise ValueError(f'unsupported tokenizer_type: {tokenizer_type}')
+
+    def tokenize(self, text):
+        return self._tokenizer.tokenize(text)
+
+    def convert_tokens_to_ids(self, tokens):
+        if self.tokenizer_type == 'Bert':
+            tokens = [self.spec_convert_dict.get(tok, tok) for tok in tokens]
+            ids = self._tokenizer.convert_tokens_to_ids(tokens)
+            return ids
+        else:
+            tokens = [self.spec_convert_dict.get(tok, tok) for tok in tokens]
+            ids = self._tokenizer.convert_tokens_to_ids(tokens)
+            ids = [(i + self.num_specials) % self.vocab_size for i in ids]
+            return ids
+
+    def convert_ids_to_tokens(self, ids):
+        if self.tokenizer_type == 'Bert':
+            tokens = self._tokenizer.convert_ids_to_tokens(ids)
+            tokens = [self.spec_revert_dict.get(tok, tok) for tok in tokens]
+            return tokens
+        else:
+            ids = [(i - self.num_specials) % self.vocab_size for i 
in ids] + tokens = self._tokenizer.convert_ids_to_tokens(ids) + tokens = [self.spec_revert_dict.get(tok, tok) for tok in tokens] + return tokens + + def decode(self, ids, ignore_tokens=[]): + tokens = self.convert_ids_to_tokens(ids) + if len(ignore_tokens) > 0: + ignore_tokens = set(ignore_tokens) + tokens = [tok for tok in tokens if tok not in ignore_tokens] + if self.tokenizer_type == 'Bert': + string = ' '.join(tokens).replace(' ##', '') + else: + string = ''.join(tokens) + string = bytearray([ + self._tokenizer.byte_decoder[c] for c in string + ]).decode('utf-8') + string = clean_string(string) + return string + + +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes.""" + +logger = logging.getLogger(__name__) + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + index = 0 + with open(vocab_file, 'r', encoding='utf-8') as reader: + while True: + token = reader.readline() + if not token: + break + token = token.strip() + vocab[token] = index + index += 1 + return vocab + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class BertTokenizer(object): + """Runs end-to-end tokenization: punctuation splitting + wordpiece""" + + def __init__(self, + vocab_file, + do_lower_case=True, + max_len=None, + do_basic_tokenize=True, + never_split=('[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]')): + """Constructs a BertTokenizer. + + Args: + vocab_file: Path to a one-wordpiece-per-line vocabulary file + do_lower_case: Whether to lower case the input + Only has an effect when do_wordpiece_only=False + do_basic_tokenize: Whether to do basic tokenization before wordpiece. + max_len: An artificial maximum length to truncate tokenized sequences to; + Effective maximum length is always the minimum of this + value (if specified) and the underlying BERT model's + sequence length. + never_split: List of tokens which will never be split during tokenization. + Only has an effect when do_wordpiece_only=False + """ + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. 
To load the vocabulary from a Google pretrained " + 'model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`' + .format(vocab_file)) + self.vocab = load_vocab(vocab_file) + self.ids_to_tokens = collections.OrderedDict([ + (ids, tok) for tok, ids in self.vocab.items() + ]) + self.do_basic_tokenize = do_basic_tokenize + if do_basic_tokenize: + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, never_split=never_split) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) + self.max_len = max_len if max_len is not None else int(1e12) + + def tokenize(self, text): + split_tokens = [] + if self.do_basic_tokenize: + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + else: + split_tokens = self.wordpiece_tokenizer.tokenize(text) + return split_tokens + + def convert_tokens_to_ids(self, tokens): + """Converts a sequence of tokens into ids using the vocab.""" + ids = [] + for token in tokens: + ids.append(self.vocab[token]) + if len(ids) > self.max_len: + logger.warning( + 'Token indices sequence length is longer than the specified maximum ' + ' sequence length for this BERT model ({} > {}). Running this' + ' sequence through BERT will result in indexing errors'.format( + len(ids), self.max_len)) + return ids + + def convert_ids_to_tokens(self, ids): + """Converts a sequence of ids in wordpiece tokens using the vocab.""" + tokens = [] + for i in ids: + tokens.append(self.ids_to_tokens[i]) + return tokens + + +class BasicTokenizer(object): + """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" + + def __init__(self, + do_lower_case=True, + never_split=('[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]')): + """Constructs a BasicTokenizer. + + Args: + do_lower_case: Whether to lower case the input. + """ + self.do_lower_case = do_lower_case + self.never_split = never_split + + def tokenize(self, text): + """Tokenizes a piece of text.""" + text = self._clean_text(text) + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). 
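+        # For example, "ab你好" becomes "ab 你  好 " here, so the whitespace
+        # tokenization below yields ['ab', '你', '好'], i.e. each CJK
+        # character ends up as its own token.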
+        text = self._tokenize_chinese_chars(text)
+        orig_tokens = whitespace_tokenize(text)
+        split_tokens = []
+        for token in orig_tokens:
+            if self.do_lower_case and token not in self.never_split:
+                token = token.lower()
+                token = self._run_strip_accents(token)
+            split_tokens.extend(self._run_split_on_punc(token))
+
+        output_tokens = whitespace_tokenize(' '.join(split_tokens))
+        return output_tokens
+
+    def _run_strip_accents(self, text):
+        """Strips accents from a piece of text."""
+        text = unicodedata.normalize('NFD', text)
+        output = []
+        for char in text:
+            cat = unicodedata.category(char)
+            if cat == 'Mn':
+                continue
+            output.append(char)
+        return ''.join(output)
+
+    def _run_split_on_punc(self, text):
+        """Splits punctuation on a piece of text."""
+        if text in self.never_split:
+            return [text]
+        chars = list(text)
+        i = 0
+        start_new_word = True
+        output = []
+        while i < len(chars):
+            char = chars[i]
+            if _is_punctuation(char):
+                output.append([char])
+                start_new_word = True
+            else:
+                if start_new_word:
+                    output.append([])
+                start_new_word = False
+                output[-1].append(char)
+            i += 1
+
+        return [''.join(x) for x in output]
+
+    def _tokenize_chinese_chars(self, text):
+        """Adds whitespace around any CJK character."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if self._is_chinese_char(cp):
+                output.append(' ')
+                output.append(char)
+                output.append(' ')
+            else:
+                output.append(char)
+        return ''.join(output)
+
+    def _is_chinese_char(self, cp):
+        """Checks whether CP is the codepoint of a CJK character."""
+        # This defines a "chinese character" as anything in the CJK Unicode block:
+        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+        #
+        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+        # despite its name. The modern Korean Hangul alphabet is a different block,
+        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+        # space-separated words, so they are not treated specially and handled
+        # like all of the other languages.
+        if ((cp >= 0x4E00 and cp <= 0x9FFF) or (cp >= 0x3400 and cp <= 0x4DBF)
+                or (cp >= 0x20000 and cp <= 0x2A6DF)
+                or (cp >= 0x2A700 and cp <= 0x2B73F)
+                or (cp >= 0x2B740 and cp <= 0x2B81F)
+                or (cp >= 0x2B820 and cp <= 0x2CEAF)
+                or (cp >= 0xF900 and cp <= 0xFAFF)
+                or (cp >= 0x2F800 and cp <= 0x2FA1F)):
+            return True
+
+        return False
+
+    def _clean_text(self, text):
+        """Performs invalid character removal and whitespace cleanup on text."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if cp == 0 or cp == 0xfffd or _is_control(char):
+                continue
+            if _is_whitespace(char):
+                output.append(' ')
+            else:
+                output.append(char)
+        return ''.join(output)
+
+
+class WordpieceTokenizer(object):
+    """Runs WordPiece tokenization."""
+
+    def __init__(self, vocab, unk_token='[UNK]', max_input_chars_per_word=100):
+        self.vocab = vocab
+        self.unk_token = unk_token
+        self.max_input_chars_per_word = max_input_chars_per_word
+
+    def tokenize(self, text):
+        """Tokenizes a piece of text into its word pieces.
+
+        This uses a greedy longest-match-first algorithm to perform tokenization
+        using the given vocabulary.
+
+        For example:
+            input = "unaffable"
+            output = ["un", "##aff", "##able"]
+
+        Args:
+            text: A single token or whitespace separated tokens. This should have
+                already been passed through `BasicTokenizer`.
+
+        Returns:
+            A list of wordpiece tokens.
+        """
+
+        output_tokens = []
+        for token in whitespace_tokenize(text):
+            chars = list(token)
+            if len(chars) > self.max_input_chars_per_word:
+                output_tokens.append(self.unk_token)
+                continue
+
+            is_bad = False
+            start = 0
+            sub_tokens = []
+            while start < len(chars):
+                end = len(chars)
+                cur_substr = None
+                while start < end:
+                    substr = ''.join(chars[start:end])
+                    if start > 0:
+                        substr = '##' + substr
+                    if substr in self.vocab:
+                        cur_substr = substr
+                        break
+                    end -= 1
+                if cur_substr is None:
+                    is_bad = True
+                    break
+                sub_tokens.append(cur_substr)
+                start = end
+
+            if is_bad:
+                output_tokens.append(self.unk_token)
+            else:
+                output_tokens.extend(sub_tokens)
+        return output_tokens
+
+
+def _is_whitespace(char):
+    """Checks whether `chars` is a whitespace character."""
+    # \t, \n, and \r are technically control characters but we treat them
+    # as whitespace since they are generally considered as such.
+    if char == ' ' or char == '\t' or char == '\n' or char == '\r':
+        return True
+    cat = unicodedata.category(char)
+    if cat == 'Zs':
+        return True
+    return False
+
+
+def _is_control(char):
+    """Checks whether `chars` is a control character."""
+    # These are technically control characters but we count them as whitespace
+    # characters.
+    if char == '\t' or char == '\n' or char == '\r':
+        return False
+    cat = unicodedata.category(char)
+    if cat.startswith('C'):
+        return True
+    return False
+
+
+def _is_punctuation(char):
+    """Checks whether `chars` is a punctuation character."""
+    cp = ord(char)
+    # We treat all non-letter/number ASCII as punctuation.
+    # Characters such as "^", "$", and "`" are not in the Unicode
+    # Punctuation class but we treat them as punctuation anyways, for
+    # consistency.
+    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64)
+            or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
+        return True
+    cat = unicodedata.category(char)
+    if cat.startswith('P'):
+        return True
+    return False
+
+
+# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for OpenAI GPT."""
+
+try:
+    from functools import lru_cache
+except ImportError:
+    # Just a dummy decorator to get the checks to run on python2
+    # because honestly I don't want to support a byte-level unicode BPE
+    # tokenizer on python 2 right now.
+    def lru_cache():
+        return lambda func: func
+
+
+@lru_cache()
+def bytes_to_unicode():
+    """
+    Returns a list of utf-8 bytes and a corresponding list of unicode strings.
+    The reversible bpe codes work on unicode strings.
+    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
+    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+    This also avoids mapping to whitespace/control characters the bpe code barfs on.
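+    (As an illustration of the mapping, not extra behavior: the space byte
+    0x20 falls outside the printable ranges kept below, so it is remapped to
+    a higher codepoint instead of remaining raw whitespace.)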
+    """
+    _chr = unichr if sys.version_info[0] == 2 else chr
+    bs = list(range(ord('!'),
+                    ord('~') + 1)) + list(range(
+                        ord('¡'),
+                        ord('¬') + 1)) + list(range(ord('®'),
+                                                    ord('ÿ') + 1))
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8 + n)
+            n += 1
+    cs = [_chr(n) for n in cs]
+    return dict(zip(bs, cs))
+
+
+def get_pairs(word):
+    """Return set of symbol pairs in a word.
+
+    Word is represented as tuple of symbols (symbols being variable-length strings).
+    """
+    pairs = set()
+    prev_char = word[0]
+    for char in word[1:]:
+        pairs.add((prev_char, char))
+        prev_char = char
+    return pairs
+
+
+class GPT2Tokenizer(object):
+    """
+    GPT-2 BPE tokenizer. Peculiarities:
+        - Byte-level BPE
+    """
+
+    def __init__(self,
+                 vocab_file,
+                 merges_file,
+                 errors='replace',
+                 special_tokens=None,
+                 max_len=None):
+        self.max_len = max_len if max_len is not None else int(1e12)
+        self.encoder = json.load(open(vocab_file))
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        self.errors = errors  # how to handle errors in decoding
+        self.byte_encoder = bytes_to_unicode()
+        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
+        bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
+        bpe_merges = [tuple(merge.split()) for merge in bpe_data]
+        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
+        self.cache = {}
+
+        # Should have added re.IGNORECASE so BPE merges can happen for
+        # capitalized versions of contractions
+        self.pat = re.compile(
+            r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
+        )
+
+        self.special_tokens = {}
+        self.special_tokens_decoder = {}
+        self.set_special_tokens(special_tokens)
+
+    def __len__(self):
+        return len(self.encoder) + len(self.special_tokens)
+
+    def set_special_tokens(self, special_tokens):
+        """ Add a list of additional tokens to the encoder.
+            The additional tokens are indexed starting from the last index of the
+            current vocabulary in the order of the `special_tokens` list.
+        """
+        if not special_tokens:
+            self.special_tokens = {}
+            self.special_tokens_decoder = {}
+            return
+        self.special_tokens = dict((tok, len(self.encoder) + i)
+                                   for i, tok in enumerate(special_tokens))
+        self.special_tokens_decoder = {
+            v: k
+            for k, v in self.special_tokens.items()
+        }
+        logger.info('Special tokens {}'.format(self.special_tokens))
+
+    def bpe(self, token):
+        if token in self.cache:
+            return self.cache[token]
+        word = tuple(token)
+        pairs = get_pairs(word)
+
+        if not pairs:
+            return token
+
+        while True:
+            bigram = min(
+                pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
+            if bigram not in self.bpe_ranks:
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                    new_word.extend(word[i:j])
+                    i = j
+                except ValueError:
+                    new_word.extend(word[i:])
+                    break
+
+                if word[i] == first and i < len(word) - 1 and word[
+                        i + 1] == second:
+                    new_word.append(first + second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                break
+            else:
+                pairs = get_pairs(word)
+        word = ' '.join(word)
+        self.cache[token] = word
+        return word
+
+    def tokenize(self, text):
+        """ Tokenize a string.
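+
+            A hypothetical illustration: tokenize('Hello world') first splits
+            the text with the regex into ['Hello', ' world'], byte-encodes
+            each piece (the leading space becomes 'Ġ'), then applies BPE, so
+            the result might be ['Hello', 'Ġworld'] depending on the loaded
+            merges.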
""" + bpe_tokens = [] + for token in re.findall(self.pat, text): + token = ''.join(self.byte_encoder[ord(b)] for b in token + if ord(b) in self.byte_encoder) + if token == '': + continue + bpe_tokens.extend( + bpe_token for bpe_token in self.bpe(token).split(' ')) + return bpe_tokens + + def convert_tokens_to_ids(self, tokens): + """ Converts a sequence of tokens into ids using the vocab. """ + ids = [] + if isinstance(tokens, str) or (sys.version_info[0] == 2 + and isinstance(tokens, unicode)): + if tokens in self.special_tokens: + return self.special_tokens[tokens] + else: + return self.encoder.get(tokens, 0) + for token in tokens: + if token in self.special_tokens: + ids.append(self.special_tokens[token]) + else: + ids.append(self.encoder.get(token, 0)) + if len(ids) > self.max_len: + logger.warning( + 'Token indices sequence length is longer than the specified maximum ' + ' sequence length for this OpenAI GPT model ({} > {}). Running this' + ' sequence through the model will result in indexing errors'. + format(len(ids), self.max_len)) + return ids + + def convert_ids_to_tokens(self, ids, skip_special_tokens=False): + """Converts a sequence of ids in BPE tokens using the vocab.""" + tokens = [] + for i in ids: + if i in self.special_tokens_decoder: + if not skip_special_tokens: + tokens.append(self.special_tokens_decoder[i]) + else: + tokens.append(self.decoder[i]) + return tokens + + def encode(self, text): + return self.convert_tokens_to_ids(self.tokenize(text)) + + def decode(self, tokens): + text = ''.join([self.decoder[token] for token in tokens]) + text = bytearray([self.byte_decoder[c] for c in text]).decode( + 'utf-8', errors=self.errors) + return text diff --git a/modelscope/pydatasets/__init__.py b/modelscope/pydatasets/__init__.py new file mode 100644 index 00000000..a1ed1d93 --- /dev/null +++ b/modelscope/pydatasets/__init__.py @@ -0,0 +1 @@ +from .py_dataset import PyDataset diff --git a/modelscope/pydatasets/py_dataset.py b/modelscope/pydatasets/py_dataset.py new file mode 100644 index 00000000..7d0edadb --- /dev/null +++ b/modelscope/pydatasets/py_dataset.py @@ -0,0 +1,126 @@ +import logging +from typing import (Any, Callable, Dict, List, Mapping, Optional, Sequence, + Union) + +from datasets import Dataset, load_dataset + +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +class PyDataset: + _hf_ds = None # holds the underlying HuggingFace Dataset + """A PyDataset backed by hugging face Dataset.""" + + def __init__(self, hf_ds: Dataset): + self._hf_ds = hf_ds + self.target = None + + def __iter__(self): + if isinstance(self._hf_ds, Dataset): + for item in self._hf_ds: + if self.target is not None: + yield item[self.target] + else: + yield item + else: + for ds in self._hf_ds.values(): + for item in ds: + if self.target is not None: + yield item[self.target] + else: + yield item + + @classmethod + def from_hf_dataset(cls, + hf_ds: Dataset, + target: str = None) -> 'PyDataset': + dataset = cls(hf_ds) + dataset.target = target + return dataset + + @staticmethod + def load( + path: Union[str, list], + target: Optional[str] = None, + version: Optional[str] = None, + name: Optional[str] = None, + split: Optional[str] = None, + data_dir: Optional[str] = None, + data_files: Optional[Union[str, Sequence[str], + Mapping[str, Union[str, + Sequence[str]]]]] = None + ) -> 'PyDataset': + """Load a PyDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset. + Args: + + path (str): Path or name of the dataset. 
+            target (str, optional): Name of the column to output.
+            version (str, optional): Version of the dataset script to load.
+            name (str, optional): Defining the subset_name of the dataset.
+            data_dir (str, optional): Defining the data_dir of the dataset
+                configuration.
+            data_files (str or Sequence or Mapping, optional): Path(s) to
+                source data file(s).
+            split (str, optional): Which split of the data to load.
+
+        Returns:
+            PyDataset (obj:`PyDataset`): PyDataset object for a certain dataset.
+        """
+        if isinstance(path, str):
+            dataset = load_dataset(
+                path,
+                name=name,
+                revision=version,
+                split=split,
+                data_dir=data_dir,
+                data_files=data_files)
+        elif isinstance(path, list):
+            if target is None:
+                target = 'target'
+            # build a single column holding every element of `path`,
+            # not just the last one
+            dataset = Dataset.from_dict({target: path})
+        else:
+            raise TypeError('path must be a str or a list, but got'
+                            f' {type(path)}')
+        return PyDataset.from_hf_dataset(dataset, target=target)
+
+    def to_torch_dataset(
+        self,
+        columns: Union[str, List[str]] = None,
+        output_all_columns: bool = False,
+        **format_kwargs,
+    ):
+        self._hf_ds.reset_format()
+        self._hf_ds.set_format(
+            type='torch',
+            columns=columns,
+            output_all_columns=output_all_columns,
+            format_kwargs=format_kwargs)
+        return self._hf_ds
+
+    def to_tf_dataset(
+        self,
+        columns: Union[str, List[str]],
+        batch_size: int,
+        shuffle: bool,
+        collate_fn: Callable,
+        drop_remainder: bool = None,
+        collate_fn_args: Dict[str, Any] = None,
+        label_cols: Union[str, List[str]] = None,
+        dummy_labels: bool = False,
+        prefetch: bool = True,
+    ):
+        self._hf_ds.reset_format()
+        return self._hf_ds.to_tf_dataset(
+            columns,
+            batch_size,
+            shuffle,
+            collate_fn,
+            drop_remainder=drop_remainder,
+            collate_fn_args=collate_fn_args,
+            label_cols=label_cols,
+            dummy_labels=dummy_labels,
+            prefetch=prefetch)
+
+    def to_hf_dataset(self) -> Dataset:
+        self._hf_ds.reset_format()
+        return self._hf_ds
diff --git a/maas_lib/tools/eval.py b/modelscope/tools/eval.py
similarity index 94%
rename from maas_lib/tools/eval.py
rename to modelscope/tools/eval.py
index 95bf7054..ca39932d 100644
--- a/maas_lib/tools/eval.py
+++ b/modelscope/tools/eval.py
@@ -2,7 +2,7 @@
 
 import argparse
 
-from maas_lib.trainers import build_trainer
+from modelscope.trainers import build_trainer
 
 
 def parse_args():
diff --git a/maas_lib/tools/train.py b/modelscope/tools/train.py
similarity index 92%
rename from maas_lib/tools/train.py
rename to modelscope/tools/train.py
index f7c2b54b..c6f1ef5f 100644
--- a/maas_lib/tools/train.py
+++ b/modelscope/tools/train.py
@@ -2,7 +2,7 @@
 
 import argparse
 
-from maas_lib.trainers import build_trainer
+from modelscope.trainers import build_trainer
 
 
 def parse_args():
diff --git a/maas_lib/trainers/__init__.py b/modelscope/trainers/__init__.py
similarity index 100%
rename from maas_lib/trainers/__init__.py
rename to modelscope/trainers/__init__.py
diff --git a/maas_lib/trainers/base.py b/modelscope/trainers/base.py
similarity index 96%
rename from maas_lib/trainers/base.py
rename to modelscope/trainers/base.py
index 2c11779e..372938b4 100644
--- a/maas_lib/trainers/base.py
+++ b/modelscope/trainers/base.py
@@ -3,8 +3,8 @@ from abc import ABC, abstractmethod
 from typing import Callable, Dict, List, Optional, Tuple, Union
 
-from maas_lib.trainers.builder import TRAINERS
-from maas_lib.utils.config import Config
+from modelscope.trainers.builder import TRAINERS
+from modelscope.utils.config import Config
 
 
 class BaseTrainer(ABC):
diff --git a/maas_lib/trainers/builder.py b/modelscope/trainers/builder.py
similarity index 77%
rename from maas_lib/trainers/builder.py rename to modelscope/trainers/builder.py index 2165fe58..2192d46c 100644 --- a/maas_lib/trainers/builder.py +++ b/modelscope/trainers/builder.py @@ -1,8 +1,8 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -from maas_lib.utils.config import ConfigDict -from maas_lib.utils.constant import Tasks -from maas_lib.utils.registry import Registry, build_from_cfg +from modelscope.utils.config import ConfigDict +from modelscope.utils.constant import Tasks +from modelscope.utils.registry import Registry, build_from_cfg TRAINERS = Registry('trainers') diff --git a/maas_lib/trainers/nlp/__init__.py b/modelscope/trainers/nlp/__init__.py similarity index 100% rename from maas_lib/trainers/nlp/__init__.py rename to modelscope/trainers/nlp/__init__.py diff --git a/maas_lib/trainers/nlp/sequence_classification_trainer.py b/modelscope/trainers/nlp/sequence_classification_trainer.py similarity index 98% rename from maas_lib/trainers/nlp/sequence_classification_trainer.py rename to modelscope/trainers/nlp/sequence_classification_trainer.py index f2264c0d..b2b759fa 100644 --- a/maas_lib/trainers/nlp/sequence_classification_trainer.py +++ b/modelscope/trainers/nlp/sequence_classification_trainer.py @@ -3,8 +3,8 @@ from typing import Callable, Dict, List, Optional, Tuple, Union import numpy as np -from maas_lib.utils.constant import Tasks -from maas_lib.utils.logger import get_logger +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger from ..base import BaseTrainer from ..builder import TRAINERS diff --git a/modelscope/trainers/nlp/space/__init__.py b/modelscope/trainers/nlp/space/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/trainers/nlp/space/metrics/__init__.py b/modelscope/trainers/nlp/space/metrics/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/maas_lib/trainers/nlp/space/metrics/metrics_tracker.py b/modelscope/trainers/nlp/space/metrics/metrics_tracker.py similarity index 100% rename from maas_lib/trainers/nlp/space/metrics/metrics_tracker.py rename to modelscope/trainers/nlp/space/metrics/metrics_tracker.py diff --git a/modelscope/trainers/nlp/space/trainers/__init__.py b/modelscope/trainers/nlp/space/trainers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/maas_lib/trainers/nlp/space/trainers/gen_trainer.py b/modelscope/trainers/nlp/space/trainers/gen_trainer.py similarity index 100% rename from maas_lib/trainers/nlp/space/trainers/gen_trainer.py rename to modelscope/trainers/nlp/space/trainers/gen_trainer.py diff --git a/maas_lib/trainers/nlp/space/trainers/intent_trainer.py b/modelscope/trainers/nlp/space/trainers/intent_trainer.py similarity index 99% rename from maas_lib/trainers/nlp/space/trainers/intent_trainer.py rename to modelscope/trainers/nlp/space/trainers/intent_trainer.py index 9db24e6d..9a4bb799 100644 --- a/maas_lib/trainers/nlp/space/trainers/intent_trainer.py +++ b/modelscope/trainers/nlp/space/trainers/intent_trainer.py @@ -14,8 +14,9 @@ import torch from tqdm import tqdm from transformers.optimization import AdamW, get_linear_schedule_with_warmup -from maas_lib.trainers.nlp.space.metrics.metrics_tracker import MetricsTracker -from maas_lib.utils.nlp.space.args import str2bool +from modelscope.trainers.nlp.space.metrics.metrics_tracker import \ + MetricsTracker +from modelscope.utils.nlp.space.args import str2bool def get_logger(log_path, name='default'): diff --git a/modelscope/utils/__init__.py 
b/modelscope/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/maas_lib/utils/config.py b/modelscope/utils/config.py similarity index 98% rename from maas_lib/utils/config.py rename to modelscope/utils/config.py index 7d67d248..d0f3f657 100644 --- a/maas_lib/utils/config.py +++ b/modelscope/utils/config.py @@ -17,9 +17,9 @@ from typing import Dict import addict from yapf.yapflib.yapf_api import FormatCode -from maas_lib.utils.logger import get_logger -from maas_lib.utils.pymod import (import_modules, import_modules_from_file, - validate_py_syntax) +from modelscope.utils.logger import get_logger +from modelscope.utils.pymod import (import_modules, import_modules_from_file, + validate_py_syntax) if platform.system() == 'Windows': import regex as re # type: ignore @@ -117,7 +117,7 @@ class Config: # delete imported module del sys.modules[module_nanme] elif filename.endswith(('.yml', '.yaml', '.json')): - from maas_lib.fileio import load + from modelscope.fileio import load cfg_dict = load(tmp_cfg_file.name) # close temp file tmp_cfg_file.close() @@ -364,7 +364,7 @@ class Config: file (str, optional): Path of the output file where the config will be dumped. Defaults to None. """ - from maas_lib.fileio import dump + from modelscope.fileio import dump cfg_dict = super(Config, self).__getattribute__('_cfg_dict').to_dict() if file is None: if self.filename is None or self.filename.endswith('.py'): diff --git a/maas_lib/utils/constant.py b/modelscope/utils/constant.py similarity index 97% rename from maas_lib/utils/constant.py rename to modelscope/utils/constant.py index 17e76309..41c9443b 100644 --- a/maas_lib/utils/constant.py +++ b/modelscope/utils/constant.py @@ -13,7 +13,7 @@ class Fields(object): class Tasks(object): - """ Names for tasks supported by maas lib. + """ Names for tasks supported by modelscope. Holds the standard task name to use for identifying different tasks. This should be used to register models, pipelines, trainers. diff --git a/modelscope/utils/hub.py b/modelscope/utils/hub.py new file mode 100644 index 00000000..2f61b148 --- /dev/null +++ b/modelscope/utils/hub.py @@ -0,0 +1,14 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import os + +from maas_hub.constants import MODEL_ID_SEPARATOR + + +# temp solution before the hub-cache is in place +def get_model_cache_dir(model_id: str, branch: str = 'master'): + model_id_expanded = model_id.replace('/', + MODEL_ID_SEPARATOR) + '.' 
+ branch
+    default_cache_dir = os.path.expanduser(os.path.join('~/.cache', 'maas'))
+    return os.getenv('MAAS_CACHE',
+                     os.path.join(default_cache_dir, 'hub', model_id_expanded))
diff --git a/maas_lib/utils/logger.py b/modelscope/utils/logger.py
similarity index 100%
rename from maas_lib/utils/logger.py
rename to modelscope/utils/logger.py
diff --git a/modelscope/utils/nlp/__init__.py b/modelscope/utils/nlp/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/utils/nlp/space/__init__.py b/modelscope/utils/nlp/space/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/maas_lib/utils/nlp/space/args.py b/modelscope/utils/nlp/space/args.py
similarity index 100%
rename from maas_lib/utils/nlp/space/args.py
rename to modelscope/utils/nlp/space/args.py
diff --git a/maas_lib/utils/nlp/space/criterions.py b/modelscope/utils/nlp/space/criterions.py
similarity index 100%
rename from maas_lib/utils/nlp/space/criterions.py
rename to modelscope/utils/nlp/space/criterions.py
diff --git a/maas_lib/utils/nlp/space/db_ops.py b/modelscope/utils/nlp/space/db_ops.py
similarity index 100%
rename from maas_lib/utils/nlp/space/db_ops.py
rename to modelscope/utils/nlp/space/db_ops.py
diff --git a/maas_lib/utils/nlp/space/ontology.py b/modelscope/utils/nlp/space/ontology.py
similarity index 100%
rename from maas_lib/utils/nlp/space/ontology.py
rename to modelscope/utils/nlp/space/ontology.py
diff --git a/maas_lib/utils/nlp/space/scores.py b/modelscope/utils/nlp/space/scores.py
similarity index 100%
rename from maas_lib/utils/nlp/space/scores.py
rename to modelscope/utils/nlp/space/scores.py
diff --git a/maas_lib/utils/nlp/space/utils.py b/modelscope/utils/nlp/space/utils.py
similarity index 100%
rename from maas_lib/utils/nlp/space/utils.py
rename to modelscope/utils/nlp/space/utils.py
diff --git a/maas_lib/utils/pymod.py b/modelscope/utils/pymod.py
similarity index 98%
rename from maas_lib/utils/pymod.py
rename to modelscope/utils/pymod.py
index 4f717480..6db6798d 100644
--- a/maas_lib/utils/pymod.py
+++ b/modelscope/utils/pymod.py
@@ -7,7 +7,7 @@ import sys
 import types
 from importlib import import_module
 
-from maas_lib.utils.logger import get_logger
+from modelscope.utils.logger import get_logger
 
 logger = get_logger()
 
diff --git a/maas_lib/utils/registry.py b/modelscope/utils/registry.py
similarity index 94%
rename from maas_lib/utils/registry.py
rename to modelscope/utils/registry.py
index 838e6f83..73a938ea 100644
--- a/maas_lib/utils/registry.py
+++ b/modelscope/utils/registry.py
@@ -3,7 +3,7 @@ import inspect
 from email.policy import default
 
-from maas_lib.utils.logger import get_logger
+from modelscope.utils.logger import get_logger
 
 default_group = 'default'
 logger = get_logger()
@@ -100,6 +100,12 @@ class Registry(object):
         >>> class SwinTransformerDefaultGroup:
         >>>     pass
 
+        >>> class SwinTransformer2:
+        >>>     pass
+        >>> MODELS.register_module('image-classification',
+                                   module_name='SwinT2',
+                                   module_cls=SwinTransformer2)
+
         Args:
             group_key: Group name of which module will be registered,
                 default group name is 'default'
@@ -168,7 +174,7 @@ def build_from_cfg(cfg,
             '`cfg` or `default_args` must contain the key "type", '
             f'but got {cfg}\n{default_args}')
     if not isinstance(registry, Registry):
-        raise TypeError('registry must be an maas_lib.Registry object, '
+        raise TypeError('registry must be a modelscope.Registry object, '
                         f'but got {type(registry)}')
     if not (isinstance(default_args, dict) or default_args is None):
         raise TypeError('default_args must be a dict or None, 
' diff --git a/maas_lib/utils/type_assert.py b/modelscope/utils/type_assert.py similarity index 100% rename from maas_lib/utils/type_assert.py rename to modelscope/utils/type_assert.py diff --git a/maas_lib/version.py b/modelscope/version.py similarity index 100% rename from maas_lib/version.py rename to modelscope/version.py diff --git a/requirements.txt b/requirements.txt index 999c567e..39eb5e23 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,5 @@ -r requirements/runtime.txt -r requirements/pipeline.txt +-r requirements/multi-modal.txt +-r requirements/nlp.txt +-r requirements/cv.txt diff --git a/requirements/cv.txt b/requirements/cv.txt new file mode 100644 index 00000000..66799b76 --- /dev/null +++ b/requirements/cv.txt @@ -0,0 +1 @@ +easydict diff --git a/requirements/maas.txt b/requirements/maas.txt deleted file mode 100644 index 66b9aeca..00000000 --- a/requirements/maas.txt +++ /dev/null @@ -1,3 +0,0 @@ -http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/release/maas/maas_lib-0.1.1-py3-none-any.whl -https://maashub.oss-cn-hangzhou.aliyuncs.com/releases/maas_hub-0.1.0.dev0-py2.py3-none-any.whl -https://mit-dataset.oss-cn-beijing.aliyuncs.com/release/ali_maas_datasets-0.0.1.dev0-py3-none-any.whl diff --git a/requirements/multi-modal.txt b/requirements/multi-modal.txt new file mode 100644 index 00000000..ad641b63 --- /dev/null +++ b/requirements/multi-modal.txt @@ -0,0 +1,9 @@ +datasets +einops +ftfy>=6.0.3 +https://jirenmr.oss-cn-zhangjiakou.aliyuncs.com/ofa/fairseq-maas-py3-none-any.whl +https://jirenmr.oss-cn-zhangjiakou.aliyuncs.com/ofa/ofa-0.0.2-py3-none-any.whl +pycocoevalcap>=1.2 +pycocotools>=2.0.4 +rouge_score +timm diff --git a/requirements/nlp.txt b/requirements/nlp.txt new file mode 100644 index 00000000..8de83798 --- /dev/null +++ b/requirements/nlp.txt @@ -0,0 +1 @@ +https://alinlp.alibaba-inc.com/pypi/sofa-1.0.1.3-py3-none-any.whl diff --git a/requirements/runtime.txt b/requirements/runtime.txt index 5d24e660..47a11cbc 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -1,6 +1,7 @@ addict +datasets +easydict https://maashub.oss-cn-hangzhou.aliyuncs.com/releases/maas_hub-0.1.0.dev0-py2.py3-none-any.whl -https://mit-dataset.oss-cn-beijing.aliyuncs.com/release/ali_maas_datasets-0.0.1.dev0-py3-none-any.whl numpy opencv-python-headless Pillow diff --git a/setup.cfg b/setup.cfg index 8feaa182..0b929b04 100644 --- a/setup.cfg +++ b/setup.cfg @@ -2,7 +2,7 @@ line_length = 79 multi_line_output = 0 known_standard_library = setuptools -known_first_party = maas_lib +known_first_party = modelscope known_third_party = json,yaml no_lines_before = STDLIB,LOCALFOLDER default_section = THIRDPARTY diff --git a/setup.py b/setup.py index b9044bff..b027c4cb 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ def readme(): return content -version_file = 'maas_lib/version.py' +version_file = 'modelscope/version.py' def get_git_hash(): @@ -155,8 +155,8 @@ def pack_resource(): shutil.rmtree(root_dir) os.makedirs(root_dir) - proj_dir = root_dir + 'maas_lib/' - shutil.copytree('./maas_lib', proj_dir) + proj_dir = root_dir + 'modelscope/' + shutil.copytree('./modelscope', proj_dir) shutil.copytree('./configs', proj_dir + 'configs') shutil.copytree('./requirements', 'package/requirements') shutil.copy('./requirements.txt', 'package/requirements.txt') @@ -170,13 +170,13 @@ if __name__ == '__main__': os.chdir('package') install_requires, deps_link = parse_requirements('requirements.txt') setup( - name='maas-lib', + name='model-scope', 
version=get_version(), description='', long_description=readme(), long_description_content_type='text/markdown', - author='Alibaba MaaS team', - author_email='maas_lib@list.alibaba-inc.com', + author='Alibaba ModelScope team', + author_email='modelscope@list.alibaba-inc.com', keywords='', url='TBD', packages=find_packages(exclude=('configs', 'tools', 'demo')), diff --git a/tests/fileio/test_file.py b/tests/fileio/test_file.py index 9f83f02c..0be41b42 100644 --- a/tests/fileio/test_file.py +++ b/tests/fileio/test_file.py @@ -5,7 +5,7 @@ import unittest from requests import HTTPError -from maas_lib.fileio.file import File, HTTPStorage, LocalStorage +from modelscope.fileio.file import File, HTTPStorage, LocalStorage class FileTest(unittest.TestCase): diff --git a/tests/fileio/test_io.py b/tests/fileio/test_io.py index 1e202e5b..0a80d3f7 100644 --- a/tests/fileio/test_io.py +++ b/tests/fileio/test_io.py @@ -2,7 +2,7 @@ import tempfile import unittest -from maas_lib.fileio.io import dump, dumps, load +from modelscope.fileio.io import dump, dumps, load class FileIOTest(unittest.TestCase): diff --git a/tests/pipelines/nlp/test_dialog_generation.py b/tests/pipelines/nlp/test_dialog_generation.py index 413e70b5..8ec8e17a 100644 --- a/tests/pipelines/nlp/test_dialog_generation.py +++ b/tests/pipelines/nlp/test_dialog_generation.py @@ -6,9 +6,9 @@ import unittest from tests.case.nlp.dialog_generation_case import test_case -from maas_lib.models.nlp import DialogGenerationModel -from maas_lib.pipelines import DialogGenerationPipeline, pipeline -from maas_lib.preprocessors import DialogGenerationPreprocessor +from modelscope.models.nlp import DialogGenerationModel +from modelscope.pipelines import DialogGenerationPipeline, pipeline +from modelscope.preprocessors import DialogGenerationPreprocessor def merge(info, result): diff --git a/tests/pipelines/nlp/test_dialog_intent.py b/tests/pipelines/nlp/test_dialog_intent.py index 86e78d06..11665762 100644 --- a/tests/pipelines/nlp/test_dialog_intent.py +++ b/tests/pipelines/nlp/test_dialog_intent.py @@ -6,10 +6,10 @@ import unittest from tests.case.nlp.dialog_intent_case import test_case -from maas_lib.models.nlp import DialogIntentModel -from maas_lib.pipelines import DialogIntentPipeline, pipeline -from maas_lib.preprocessors import DialogIntentPreprocessor -from maas_lib.utils.constant import Tasks +from modelscope.models.nlp import DialogIntentModel +from modelscope.pipelines import DialogIntentPipeline, pipeline +from modelscope.preprocessors import DialogIntentPreprocessor +from modelscope.utils.constant import Tasks class DialogGenerationTest(unittest.TestCase): @@ -28,7 +28,7 @@ class DialogGenerationTest(unittest.TestCase): # pipeline1 = pipeline(task=Tasks.dialog_intent, model=model, preprocessor=preprocessor) for item in test_case: - pipeline1(item) + print(pipeline1(item)) if __name__ == '__main__': diff --git a/tests/pipelines/test_base.py b/tests/pipelines/test_base.py index d523e7c4..14f646a9 100644 --- a/tests/pipelines/test_base.py +++ b/tests/pipelines/test_base.py @@ -6,11 +6,11 @@ from typing import Any, Dict, List, Tuple, Union import numpy as np import PIL -from maas_lib.pipelines import Pipeline, pipeline -from maas_lib.pipelines.builder import PIPELINES -from maas_lib.utils.constant import Tasks -from maas_lib.utils.logger import get_logger -from maas_lib.utils.registry import default_group +from modelscope.pipelines import Pipeline, pipeline +from modelscope.pipelines.builder import PIPELINES, add_default_pipeline_info +from 
modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger +from modelscope.utils.registry import default_group logger = get_logger() @@ -53,7 +53,7 @@ class CustomPipelineTest(unittest.TestCase): """ if not isinstance(input, PIL.Image.Image): - from maas_lib.preprocessors import load_image + from modelscope.preprocessors import load_image data_dict = {'img': load_image(input), 'url': input} else: data_dict = {'img': input} @@ -75,6 +75,7 @@ class CustomPipelineTest(unittest.TestCase): return inputs self.assertTrue('custom-image' in PIPELINES.modules[default_group]) + add_default_pipeline_info(Tasks.image_tagging, 'custom-image') pipe = pipeline(pipeline_name='custom-image') pipe2 = pipeline(Tasks.image_tagging) self.assertTrue(type(pipe) is type(pipe2)) diff --git a/tests/pipelines/test_builder.py b/tests/pipelines/test_builder.py new file mode 100644 index 00000000..a0b15a32 --- /dev/null +++ b/tests/pipelines/test_builder.py @@ -0,0 +1,68 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import unittest +from asyncio import Task +from typing import Any, Dict, List, Tuple, Union + +import numpy as np +import PIL + +from modelscope.models.base import Model +from modelscope.pipelines import Pipeline, pipeline +from modelscope.pipelines.builder import PIPELINES, add_default_pipeline_info +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger +from modelscope.utils.registry import default_group + +logger = get_logger() + + +@PIPELINES.register_module( + group_key=Tasks.image_tagging, module_name='custom_single_model') +class CustomSingleModelPipeline(Pipeline): + + def __init__(self, + config_file: str = None, + model: List[Union[str, Model]] = None, + preprocessor=None, + **kwargs): + super().__init__(config_file, model, preprocessor, **kwargs) + assert isinstance(model, str), 'model is not str' + print(model) + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return super().postprocess(inputs) + + +@PIPELINES.register_module( + group_key=Tasks.image_tagging, module_name='model1_model2') +class CustomMultiModelPipeline(Pipeline): + + def __init__(self, + config_file: str = None, + model: List[Union[str, Model]] = None, + preprocessor=None, + **kwargs): + super().__init__(config_file, model, preprocessor, **kwargs) + assert isinstance(model, list), 'model is not list' + for m in model: + assert isinstance(m, str), 'submodel is not str' + print(m) + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return super().postprocess(inputs) + + +class PipelineInterfaceTest(unittest.TestCase): + + def test_single_model(self): + pipe = pipeline(Tasks.image_tagging, model='custom_single_model') + assert isinstance(pipe, CustomSingleModelPipeline) + + def test_multi_model(self): + pipe = pipeline(Tasks.image_tagging, model=['model1', 'model2']) + assert isinstance(pipe, CustomMultiModelPipeline) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_image_captioning.py b/tests/pipelines/test_image_captioning.py new file mode 100644 index 00000000..5584d0e2 --- /dev/null +++ b/tests/pipelines/test_image_captioning.py @@ -0,0 +1,36 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
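+# A sketch of the flow exercised below, assuming the OSS URLs for the OFA
+# checkpoint and the BPE vocabulary stay reachable: download both, build an
+# image-captioning pipeline from the local checkpoint, then caption an image
+# given by URL.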
+
+import os
+import tempfile
+import unittest
+
+from modelscope.fileio import File
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+
+class ImageCaptionTest(unittest.TestCase):
+
+    @unittest.skip('skip long test')
+    def test_run(self):
+        model = 'https://ofa-beijing.oss-cn-beijing.aliyuncs.com/checkpoints/caption_large_best_clean.pt'
+
+        os.system(
+            'wget https://jirenmr.oss-cn-zhangjiakou.aliyuncs.com/ofa/BPE.zip'
+        )
+        os.system('unzip BPE.zip')
+        bpe_dir = './BPE'
+
+        with tempfile.NamedTemporaryFile('wb', suffix='.pb') as ofile:
+            ofile.write(File.read(model))
+            img_captioning = pipeline(
+                Tasks.image_captioning, model=ofile.name, bpe_dir=bpe_dir)
+
+            result = img_captioning(
+                'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png'
+            )
+            print(result['caption'])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_image_matting.py b/tests/pipelines/test_image_matting.py
index 26847389..53006317 100644
--- a/tests/pipelines/test_image_matting.py
+++ b/tests/pipelines/test_image_matting.py
@@ -1,19 +1,28 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
 import os.path as osp
+import shutil
 import tempfile
 import unittest
 
 import cv2
 
-from ali_maas_datasets import PyDataset
-
-from maas_lib.fileio import File
-from maas_lib.pipelines import pipeline
-from maas_lib.utils.constant import Tasks
+from modelscope.fileio import File
+from modelscope.pipelines import pipeline
+from modelscope.pydatasets import PyDataset
+from modelscope.utils.constant import Tasks
+from modelscope.utils.hub import get_model_cache_dir
 
 
 class ImageMattingTest(unittest.TestCase):
 
+    def setUp(self) -> None:
+        self.model_id = 'damo/image-matting-person'
+        # switch to False if downloading every time is not desired
+        purge_cache = True
+        if purge_cache:
+            shutil.rmtree(
+                get_model_cache_dir(self.model_id), ignore_errors=True)
+
     def test_run(self):
         model_path = 'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs' \
                      '.com/data/test/maas/image_matting/matting_person.pb'
@@ -36,16 +45,23 @@ class ImageMattingTest(unittest.TestCase):
         # input_location = '/dir/to/images'
 
         dataset = PyDataset.load(input_location, target='image')
-        img_matting = pipeline(
-            Tasks.image_matting, model='damo/image-matting-person')
+        img_matting = pipeline(Tasks.image_matting, model=self.model_id)
         # note that for dataset output, the inference-output is a Generator that can be iterated.
         result = img_matting(dataset)
         cv2.imwrite('result.png', next(result)['output_png'])
         print(f'Output written to {osp.abspath("result.png")}')
 
     def test_run_modelhub(self):
-        img_matting = pipeline(
-            Tasks.image_matting, model='damo/image-matting-person')
+        img_matting = pipeline(Tasks.image_matting, model=self.model_id)
+
+        result = img_matting(
+            'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png'
+        )
+        cv2.imwrite('result.png', result['output_png'])
+        print(f'Output written to {osp.abspath("result.png")}')
+
+    def test_run_modelhub_default_model(self):
+        img_matting = pipeline(Tasks.image_matting)
 
         result = img_matting(
             'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png'
diff --git a/tests/pipelines/test_person_image_cartoon.py b/tests/pipelines/test_person_image_cartoon.py
new file mode 100644
index 00000000..6f352e42
--- /dev/null
+++ b/tests/pipelines/test_person_image_cartoon.py
@@ -0,0 +1,49 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
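+# A sketch of what these tests exercise, assuming the model id below exists
+# on the hub: build an image-generation pipeline (from a local model dir or
+# from the hub), run it on a portrait URL, and write the cartoonized PNG to
+# disk.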
diff --git a/tests/pipelines/test_person_image_cartoon.py b/tests/pipelines/test_person_image_cartoon.py
new file mode 100644
index 00000000..6f352e42
--- /dev/null
+++ b/tests/pipelines/test_person_image_cartoon.py
@@ -0,0 +1,49 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import os.path as osp
+import unittest
+
+import cv2
+
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.base import Pipeline
+from modelscope.utils.constant import Tasks
+
+
+class ImageCartoonTest(unittest.TestCase):
+
+    def setUp(self) -> None:
+        self.model_id = 'damo/cv_unet_person-image-cartoon_compound-models'
+        self.test_image = \
+            'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com' \
+            '/data/test/maas/image_carton/test.png'
+
+    def pipeline_inference(self, pipeline: Pipeline, input_location: str):
+        result = pipeline(input_location)
+        if result is not None:
+            cv2.imwrite('result.png', result['output_png'])
+            print(f'Output written to {osp.abspath("result.png")}')
+
+    @unittest.skip('deprecated, download model from model hub instead')
+    def test_run_by_direct_model_download(self):
+        model_dir = './assets'
+        if not os.path.exists(model_dir):
+            os.system(
+                'wget https://invi-label.oss-cn-shanghai.aliyuncs.com/label/model/cartoon/assets.zip'
+            )
+            os.system('unzip assets.zip')
+
+        img_cartoon = pipeline(Tasks.image_generation, model=model_dir)
+        self.pipeline_inference(img_cartoon, self.test_image)
+
+    def test_run_modelhub(self):
+        img_cartoon = pipeline(Tasks.image_generation, model=self.model_id)
+        self.pipeline_inference(img_cartoon, self.test_image)
+
+    def test_run_modelhub_default_model(self):
+        img_cartoon = pipeline(Tasks.image_generation)
+        self.pipeline_inference(img_cartoon, self.test_image)
+
+
+if __name__ == '__main__':
+    unittest.main()
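test_person_image_cartoon.py above covers the three ways a pipeline can resolve its model. A condensed sketch, using the task and model id from the test (the local directory path is illustrative):

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# 1. explicit local model directory
img_cartoon = pipeline(Tasks.image_generation, model='./assets')
# 2. model id resolved against the model hub
img_cartoon = pipeline(
    Tasks.image_generation,
    model='damo/cv_unet_person-image-cartoon_compound-models')
# 3. default model registered for the task
img_cartoon = pipeline(Tasks.image_generation)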
diff --git a/tests/pipelines/test_text_classification.py b/tests/pipelines/test_text_classification.py
index 45b584af..3e3faa1d 100644
--- a/tests/pipelines/test_text_classification.py
+++ b/tests/pipelines/test_text_classification.py
@@ -1,21 +1,29 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import tempfile
+import shutil
 import unittest
 import zipfile
 from pathlib import Path
 
-from ali_maas_datasets import PyDataset
-
-from maas_lib.fileio import File
-from maas_lib.models import Model
-from maas_lib.models.nlp import SequenceClassificationModel
-from maas_lib.pipelines import SequenceClassificationPipeline, pipeline
-from maas_lib.preprocessors import SequenceClassificationPreprocessor
-from maas_lib.utils.constant import Tasks
+from modelscope.fileio import File
+from modelscope.models import Model
+from modelscope.models.nlp import BertForSequenceClassification
+from modelscope.pipelines import SequenceClassificationPipeline, pipeline
+from modelscope.preprocessors import SequenceClassificationPreprocessor
+from modelscope.pydatasets import PyDataset
+from modelscope.utils.constant import Tasks
+from modelscope.utils.hub import get_model_cache_dir
 
 
 class SequenceClassificationTest(unittest.TestCase):
 
+    def setUp(self) -> None:
+        self.model_id = 'damo/bert-base-sst2'
+        # switch to False if downloading every time is not desired
+        purge_cache = True
+        if purge_cache:
+            shutil.rmtree(
+                get_model_cache_dir(self.model_id), ignore_errors=True)
+
     def predict(self, pipeline_ins: SequenceClassificationPipeline):
         from easynlp.appzoo import load_dataset
 
@@ -29,6 +37,12 @@ class SequenceClassificationTest(unittest.TestCase):
 
         print(data)
 
+    def printDataset(self, dataset: PyDataset):
+        for i, r in enumerate(dataset):
+            if i > 10:
+                break
+            print(r)
+
     def test_run(self):
         model_url = 'https://atp-modelzoo-sh.oss-cn-shanghai.aliyuncs.com' \
             '/release/easynlp_modelzoo/alibaba-pai/bert-base-sst2.zip'
@@ -44,7 +58,7 @@ class SequenceClassificationTest(unittest.TestCase):
         with zipfile.ZipFile(cache_path_str, 'r') as zipf:
             zipf.extractall(cache_path.parent)
         path = r'.cache/easynlp/'
-        model = SequenceClassificationModel(path)
+        model = BertForSequenceClassification(path)
         preprocessor = SequenceClassificationPreprocessor(
             path, first_sequence='sentence', second_sequence=None)
         pipeline1 = SequenceClassificationPipeline(model, preprocessor)
@@ -53,8 +67,8 @@ class SequenceClassificationTest(unittest.TestCase):
             Tasks.text_classification, model=model, preprocessor=preprocessor)
         print(pipeline2('Hello world!'))
 
-    def test_run_modelhub(self):
-        model = Model.from_pretrained('damo/bert-base-sst2')
+    def test_run_with_model_from_modelhub(self):
+        model = Model.from_pretrained(self.model_id)
         preprocessor = SequenceClassificationPreprocessor(
             model.model_dir, first_sequence='sentence', second_sequence=None)
         pipeline_ins = pipeline(
@@ -63,8 +77,21 @@ class SequenceClassificationTest(unittest.TestCase):
             preprocessor=preprocessor)
         self.predict(pipeline_ins)
 
+    def test_run_with_model_name(self):
+        text_classification = pipeline(
+            task=Tasks.text_classification, model=self.model_id)
+        result = text_classification(
+            PyDataset.load('glue', name='sst2', target='sentence'))
+        self.printDataset(result)
+
+    def test_run_with_default_model(self):
+        text_classification = pipeline(task=Tasks.text_classification)
+        result = text_classification(
+            PyDataset.load('glue', name='sst2', target='sentence'))
+        self.printDataset(result)
+
     def test_run_with_dataset(self):
-        model = Model.from_pretrained('damo/bert-base-sst2')
+        model = Model.from_pretrained(self.model_id)
         preprocessor = SequenceClassificationPreprocessor(
             model.model_dir, first_sequence='sentence', second_sequence=None)
         text_classification = pipeline(
@@ -74,10 +101,7 @@ class SequenceClassificationTest(unittest.TestCase):
         # TODO: rename parameter as dataset_name and subset_name
         dataset = PyDataset.load('glue', name='sst2', target='sentence')
         result = text_classification(dataset)
-        for i, r in enumerate(result):
-            if i > 10:
-                break
-            print(r)
+        self.printDataset(result)
 
 
 if __name__ == '__main__':
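The new test_run_with_model_name and test_run_with_default_model cases feed a PyDataset straight into the pipeline; results come back as an iterable that is consumed lazily. A sketch of that dataset path, using only APIs that appear in this patch:

from modelscope.pipelines import pipeline
from modelscope.pydatasets import PyDataset
from modelscope.utils.constant import Tasks

dataset = PyDataset.load('glue', name='sst2', target='sentence')
text_classification = pipeline(
    task=Tasks.text_classification, model='damo/bert-base-sst2')
for i, result in enumerate(text_classification(dataset)):
    if i > 10:  # sample only the first few predictions
        break
    print(result)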
diff --git a/tests/pipelines/test_text_generation.py b/tests/pipelines/test_text_generation.py
new file mode 100644
index 00000000..d8f1b495
--- /dev/null
+++ b/tests/pipelines/test_text_generation.py
@@ -0,0 +1,51 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+from maas_hub.snapshot_download import snapshot_download
+
+from modelscope.models import Model
+from modelscope.models.nlp import PalmForTextGenerationModel
+from modelscope.pipelines import TextGenerationPipeline, pipeline
+from modelscope.preprocessors import TextGenerationPreprocessor
+from modelscope.utils.constant import Tasks
+
+
+class TextGenerationTest(unittest.TestCase):
+    model_id = 'damo/nlp_palm_text-generation_chinese'
+    input1 = "今日天气类型='晴'&温度变化趋势='大幅上升'&最低气温='28℃'&最高气温='31℃'&体感='湿热'"
+    input2 = "今日天气类型='多云'&体感='舒适'&最低气温='26℃'&最高气温='30℃'"
+
+    @unittest.skip('skip temporarily to save test time')
+    def test_run(self):
+        cache_path = snapshot_download(self.model_id)
+        preprocessor = TextGenerationPreprocessor(
+            cache_path, first_sequence='sentence', second_sequence=None)
+        model = PalmForTextGenerationModel(
+            cache_path, tokenizer=preprocessor.tokenizer)
+        pipeline1 = TextGenerationPipeline(model, preprocessor)
+        pipeline2 = pipeline(
+            Tasks.text_generation, model=model, preprocessor=preprocessor)
+        print(f'input: {self.input1}\npipeline1: {pipeline1(self.input1)}')
+        print()
+        print(f'input: {self.input2}\npipeline2: {pipeline2(self.input2)}')
+
+    def test_run_with_model_from_modelhub(self):
+        model = Model.from_pretrained(self.model_id)
+        preprocessor = TextGenerationPreprocessor(
+            model.model_dir, first_sequence='sentence', second_sequence=None)
+        pipeline_ins = pipeline(
+            task=Tasks.text_generation, model=model, preprocessor=preprocessor)
+        print(pipeline_ins(self.input1))
+
+    def test_run_with_model_name(self):
+        pipeline_ins = pipeline(
+            task=Tasks.text_generation, model=self.model_id)
+        print(pipeline_ins(self.input2))
+
+    def test_run_with_default_model(self):
+        pipeline_ins = pipeline(task=Tasks.text_generation)
+        print(pipeline_ins(self.input2))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/preprocessors/test_common.py b/tests/preprocessors/test_common.py
index d9b0f74f..1ee13589 100644
--- a/tests/preprocessors/test_common.py
+++ b/tests/preprocessors/test_common.py
@@ -2,7 +2,7 @@
 
 import unittest
 
-from maas_lib.preprocessors import PREPROCESSORS, Compose, Preprocessor
+from modelscope.preprocessors import PREPROCESSORS, Compose, Preprocessor
 
 
 class ComposeTest(unittest.TestCase):
diff --git a/tests/preprocessors/test_nlp.py b/tests/preprocessors/test_nlp.py
index 740bf938..fca01597 100644
--- a/tests/preprocessors/test_nlp.py
+++ b/tests/preprocessors/test_nlp.py
@@ -2,9 +2,9 @@
 
 import unittest
 
-from maas_lib.preprocessors import build_preprocessor
-from maas_lib.utils.constant import Fields, InputFields
-from maas_lib.utils.logger import get_logger
+from modelscope.preprocessors import build_preprocessor
+from modelscope.utils.constant import Fields, InputFields
+from modelscope.utils.logger import get_logger
 
 logger = get_logger()
 
diff --git a/tests/pydatasets/__init__.py b/tests/pydatasets/__init__.py
new file mode 100644
index 00000000..e69de29b
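test_text_generation.py above drives the same model through three entry points: snapshot_download plus explicitly built components, Model.from_pretrained, and a bare model id. The shortest path, sketched with the model id and an input string taken from that test:

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

pipeline_ins = pipeline(
    task=Tasks.text_generation, model='damo/nlp_palm_text-generation_chinese')
print(pipeline_ins("今日天气类型='多云'&体感='舒适'&最低气温='26℃'&最高气温='30℃'"))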
diff --git a/tests/pydatasets/test_py_dataset.py b/tests/pydatasets/test_py_dataset.py
new file mode 100644
index 00000000..7accd814
--- /dev/null
+++ b/tests/pydatasets/test_py_dataset.py
@@ -0,0 +1,44 @@
+import unittest
+
+import datasets as hfdata
+
+from modelscope.pydatasets import PyDataset
+
+
+class PyDatasetTest(unittest.TestCase):
+
+    def setUp(self):
+        # ds1 initialized from in-memory json
+        self.json_data = {
+            'dummy': [{
+                'a': i,
+                'x': i * 10,
+                'c': i * 100
+            } for i in range(1, 11)]
+        }
+        hfds1 = hfdata.Dataset.from_dict(self.json_data)
+        self.ds1 = PyDataset.from_hf_dataset(hfds1)
+
+        # ds2 initialized from the Hugging Face hub
+        hfds2 = hfdata.load_dataset(
+            'glue', 'mrpc', revision='2.0.0', split='train')
+        self.ds2 = PyDataset.from_hf_dataset(hfds2)
+
+    def tearDown(self):
+        pass
+
+    def test_to_hf_dataset(self):
+        hfds = self.ds1.to_hf_dataset()
+        hfds1 = hfdata.Dataset.from_dict(self.json_data)
+        self.assertEqual(hfds.data, hfds1.data)
+
+        # simple map function
+        hfds = hfds.map(lambda e: {'new_feature': e['dummy']['a']})
+        self.assertEqual(len(hfds['new_feature']), 10)
+
+        hfds2 = self.ds2.to_hf_dataset()
+        self.assertTrue(hfds2[0]['sentence1'].startswith('Amrozi'))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/trainers/test_sequence_classification_trainer.py b/tests/trainers/test_sequence_classification_trainer.py
index 9846db4f..c0b2d109 100644
--- a/tests/trainers/test_sequence_classification_trainer.py
+++ b/tests/trainers/test_sequence_classification_trainer.py
@@ -2,9 +2,9 @@
 import unittest
 import zipfile
 from pathlib import Path
 
-from maas_lib.fileio import File
-from maas_lib.trainers import build_trainer
-from maas_lib.utils.logger import get_logger
+from modelscope.fileio import File
+from modelscope.trainers import build_trainer
+from modelscope.utils.logger import get_logger
 
 logger = get_logger()
diff --git a/tests/trainers/test_trainer_base.py b/tests/trainers/test_trainer_base.py
index e764d6c9..c5fc1303 100644
--- a/tests/trainers/test_trainer_base.py
+++ b/tests/trainers/test_trainer_base.py
@@ -2,7 +2,7 @@
 
 import unittest
 
-from maas_lib.trainers import build_trainer
+from modelscope.trainers import build_trainer
 
 
 class DummyTrainerTest(unittest.TestCase):
diff --git a/tests/utils/test_config.py b/tests/utils/test_config.py
index 31d51311..48f1d4a8 100644
--- a/tests/utils/test_config.py
+++ b/tests/utils/test_config.py
@@ -5,8 +5,8 @@
 import tempfile
 import unittest
 from pathlib import Path
 
-from maas_lib.fileio import dump, load
-from maas_lib.utils.config import Config
+from modelscope.fileio import dump, load
+from modelscope.utils.config import Config
 
 obj = {'a': 1, 'b': {'c': [1, 2, 3], 'd': 'dd'}}
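test_py_dataset.py above round-trips between PyDataset and Hugging Face datasets. The core conversion, sketched with the same in-memory data as the test:

import datasets as hfdata

from modelscope.pydatasets import PyDataset

hfds = hfdata.Dataset.from_dict(
    {'dummy': [{'a': i, 'x': i * 10} for i in range(1, 11)]})
ds = PyDataset.from_hf_dataset(hfds)         # wrap a Hugging Face dataset
assert ds.to_hf_dataset().data == hfds.data  # and unwrap it losslessly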
diff --git a/tests/utils/test_hub_operation.py b/tests/utils/test_hub_operation.py
new file mode 100644
index 00000000..f432a60c
--- /dev/null
+++ b/tests/utils/test_hub_operation.py
@@ -0,0 +1,50 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+import unittest
+
+from maas_hub.maas_api import MaasApi
+from maas_hub.repository import Repository
+
+USER_NAME = 'maasadmin'
+PASSWORD = '12345678'
+
+
+class HubOperationTest(unittest.TestCase):
+
+    def setUp(self):
+        self.api = MaasApi()
+        # note this is temporary before official account management is ready
+        self.api.login(USER_NAME, PASSWORD)
+
+    @unittest.skip('to be used for local test only')
+    def test_model_repo_creation(self):
+        # change to proper model names before use
+        model_name = 'cv_unet_person-image-cartoon_compound-models'
+        model_chinese_name = '达摩卡通化模型'
+        model_org = 'damo'
+        try:
+            self.api.create_model(
+                owner=model_org,
+                name=model_name,
+                chinese_name=model_chinese_name,
+                visibility=5,  # 1-private, 5-public
+                license='apache-2.0')
+        # TODO: support proper name duplication checking
+        except KeyError as ke:
+            if ke.args[0] == 'name':
+                print(f'model {model_name} already exists, ignore')
+            else:
+                raise
+
+    # Note that this can be done via git operation once model repo
+    # has been created. Git-Op is the RECOMMENDED model upload approach
+    @unittest.skip('to be used for local test only')
+    def test_model_upload(self):
+        local_path = '/path/to/local/model/directory'
+        assert osp.exists(local_path), 'Local model directory does not exist.'
+        repo = Repository(local_dir=local_path)
+        repo.push_to_hub(commit_message='Upload model files')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/utils/test_registry.py b/tests/utils/test_registry.py
index 982b9f21..67e44f4e 100644
--- a/tests/utils/test_registry.py
+++ b/tests/utils/test_registry.py
@@ -1,8 +1,8 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import unittest
 
-from maas_lib.utils.constant import Tasks
-from maas_lib.utils.registry import Registry, build_from_cfg, default_group
+from modelscope.utils.constant import Tasks
+from modelscope.utils.registry import Registry, build_from_cfg, default_group
 
 
 class RegistryTest(unittest.TestCase):
diff --git a/tests/utils/test_type_assert.py b/tests/utils/test_type_assert.py
index 4ec9f2e5..5b62a269 100644
--- a/tests/utils/test_type_assert.py
+++ b/tests/utils/test_type_assert.py
@@ -3,7 +3,7 @@
 import unittest
 from typing import List, Union
 
-from maas_lib.utils.type_assert import type_assert
+from modelscope.utils.type_assert import type_assert
 
 
 class type_assertTest(unittest.TestCase):
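Finally, test_registry.py above exercises the Registry and build_from_cfg machinery that the pipeline tests rely on. A minimal sketch under the imports shown in that diff (the registry name, dummy class, and the group_key keyword are assumptions, not verified against the registry implementation):

from modelscope.utils.registry import Registry, build_from_cfg, default_group

MODELS = Registry('models')


@MODELS.register_module(module_name='dummy-model')
class DummyModel:
    pass


# Build an instance from a config dict that names the registered module.
cfg = dict(type='dummy-model')
model = build_from_cfg(cfg, MODELS, group_key=default_group)
assert isinstance(model, DummyModel)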