diff --git a/fastNLP/io/__init__.py b/fastNLP/io/__init__.py index 01683628..a3ea0148 100644 --- a/fastNLP/io/__init__.py +++ b/fastNLP/io/__init__.py @@ -38,6 +38,7 @@ __all__ = [ 'JsonLoader', 'CWSLoader', + "CWSPipe", 'MNLILoader', "QuoraLoader", diff --git a/fastNLP/io/pipe/__init__.py b/fastNLP/io/pipe/__init__.py index 1907af4a..048e4cfe 100644 --- a/fastNLP/io/pipe/__init__.py +++ b/fastNLP/io/pipe/__init__.py @@ -10,6 +10,8 @@ Pipe用于处理通过 Loader 读取的数据,所有的 Pipe 都包含 ``proce __all__ = [ "Pipe", + "CWSPipe", + "YelpFullPipe", "YelpPolarityPipe", "SSTPipe", @@ -43,3 +45,4 @@ from .matching import MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe from .pipe import Pipe from .conll import Conll2003Pipe +from .cws import CWSPipe diff --git a/reproduction/seqence_labelling/chinese_ner/readme.md b/reproduction/seqence_labelling/chinese_ner/readme.md new file mode 100644 index 00000000..3a9d37d8 --- /dev/null +++ b/reproduction/seqence_labelling/chinese_ner/readme.md @@ -0,0 +1,30 @@ +使用以下中文NERPipe自动下载的统计数据 + +| MsraNERPipe | # of sents | # of tokens | +| ----------- | ---------- | ----------- | +| train | 41747 | 1954374 | +| dev | 4617 | 215505 | +| test | 4365 | 172601 | +| total | 50729 | 2342480 | +这里报道的统计数据,与[https://arxiv.org/pdf/1805.02023.pdf](https://arxiv.org/pdf/1805.02023.pdf)报道的一致 + + + +| WeiboNERPipe | # of sents | # of tokens | +| ------------ | ---------- | ----------- | +| train | 1350 | 73778 | +| dev | 270 | 14509 | +| test | 270 | 14842 | +| total | 1890 | 103129 | +这里报道的统计数据与[https://www.cs.cmu.edu/~ark/EMNLP-2015/proceedings/EMNLP/pdf/EMNLP064.pdf](https://www.cs.cmu.edu/~ark/EMNLP-2015/proceedings/EMNLP/pdf/EMNLP064.pdf)一致 + + + + +| PeopleDailyPipe | # of sents | # of tokens | +| --------------- | ---------- | ----------- | +| train | 50658 | 2169879 | +| dev | 4631 | 172601 | +| test | 68 | 2270 | +| total | 55357 | 2344750 | +这里使用的数据与[https://arxiv.org/pdf/1906.08101.pdf](https://arxiv.org/pdf/1906.08101.pdf)的数据是一致的 diff --git a/reproduction/seqence_labelling/cws/readme.md 
b/reproduction/seqence_labelling/cws/readme.md new file mode 100644 index 00000000..a25bb0ed --- /dev/null +++ b/reproduction/seqence_labelling/cws/readme.md @@ -0,0 +1,32 @@ +四个数据集的统计信息,最原始的数据可以从[http://sighan.cs.uchicago.edu/bakeoff2005/](http://sighan.cs.uchicago.edu/bakeoff2005/)下载。 + +| pku | # of sents | # of tokens | +| ----- | ---------- | ----------- | +| train | 17173 | 1650222 | +| dev | 1881 | 176226 | +| test | 1944 | 172733 | +| total | 20998 | 1999181 | + + +| cityu | # of sents | # of tokens | +| ----- | ---------- | ----------- | +| train | 47696 | 2164907 | +| dev | 5323 | 238447 | +| test | 1492 | 67690 | +| total | 54511 | 2471044 | + + +| msra | # of sents | # of tokens | +| ----- | ---------- | ----------- | +| train | 78242 | 3644550 | +| dev | 8676 | 405919 | +| test | 3985 | 184355 | +| total | 90903 | 4234824 | + + +| as | # of sents | # of tokens | +| ----- | ---------- | ----------- | +| train | 638273 | 7536586 | +| dev | 70680 | 831464 | +| test | 14429 | 197681 | +| total | 723382 | 8565731 | diff --git a/reproduction/seqence_labelling/cws/test/__init__.py b/reproduction/seqence_labelling/cws/test/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/reproduction/seqence_labelling/cws/test/test_CWSDataLoader.py b/reproduction/seqence_labelling/cws/test/test_CWSDataLoader.py deleted file mode 100644 index f4260849..00000000 --- a/reproduction/seqence_labelling/cws/test/test_CWSDataLoader.py +++ /dev/null @@ -1,17 +0,0 @@ - - -import unittest -from ..data.CWSDataLoader import SigHanLoader -from fastNLP.core.vocabulary import VocabularyOption - - -class TestCWSDataLoader(unittest.TestCase): - def test_case1(self): - cws_loader = SigHanLoader(target_type='bmes') - data = cws_loader.process('pku_demo.txt') - print(data.datasets) - - def test_calse2(self): - cws_loader = SigHanLoader(target_type='bmes') - data = cws_loader.process('pku_demo.txt', bigram_vocab_opt=VocabularyOption()) - print(data.datasets) \ No newline at end of file