You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

conll.py 19 kB


r"""undocumented"""
# NOTE(review): the literal "undocumented" looks like a marker consumed by documentation tooling rather than
# a real description -- confirm before replacing it with a proper module docstring.

# Names exported by ``from <this module> import *``; the underscore-prefixed base pipes are deliberately absent.
__all__ = [
    "Conll2003NERPipe",
    "Conll2003Pipe",
    "OntoNotesNERPipe",
    "MsraNERPipe",
    "PeopleDailyPipe",
    "WeiboNERPipe"
]
  10. from .pipe import Pipe
  11. from .utils import _add_chars_field
  12. from .utils import _indexize, _add_words_field
  13. from .utils import iob2, iob2bioes
  14. from fastNLP.io.data_bundle import DataBundle
  15. from ..loader.conll import Conll2003NERLoader, OntoNotesNERLoader
  16. from ..loader.conll import PeopleDailyNERLoader, WeiboNERLoader, MsraNERLoader, ConllLoader
  17. # from ...core.const import Const
  18. from ...core.vocabulary import Vocabulary
  19. class _NERPipe(Pipe):
  20. r"""
  21. NER任务的处理Pipe, 该Pipe会(1)复制raw_words列,并命名为words; (2)在words, target列建立词表
  22. (创建 :class:`fastNLP.Vocabulary` 对象,所以在返回的DataBundle中将有两个Vocabulary); (3)将words,target列根据相应的
  23. Vocabulary转换为index。
  24. raw_words列为List[str], 是未转换的原始数据; words列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的
  25. target。返回的DataSet中被设置为input有words, target, seq_len; 设置为target有target, seq_len。
  26. """
  27. def __init__(self, encoding_type: str = 'bio', lower: bool = False):
  28. r"""
  29. :param: str encoding_type: target列使用什么类型的encoding方式,支持bioes, bio两种。
  30. :param bool lower: 是否将words小写化后再建立词表,绝大多数情况都不需要设置为True。
  31. """
  32. if encoding_type == 'bio':
  33. self.convert_tag = iob2
  34. elif encoding_type == 'bioes':
  35. self.convert_tag = lambda words: iob2bioes(iob2(words))
  36. else:
  37. raise ValueError("encoding_type only supports `bio` and `bioes`.")
  38. self.lower = lower
  39. def process(self, data_bundle: DataBundle) -> DataBundle:
  40. r"""
  41. 支持的DataSet的field为
  42. .. csv-table::
  43. :header: "raw_words", "target"
  44. "[Nadim, Ladki]", "[B-PER, I-PER]"
  45. "[AL-AIN, United, Arab, ...]", "[B-LOC, B-LOC, I-LOC, ...]"
  46. "[...]", "[...]"
  47. :param ~fastNLP.DataBundle data_bundle: 传入的DataBundle中的DataSet必须包含raw_words和ner两个field,且两个field的内容均为List[str]在传入DataBundle基础上原位修改。
  48. :return DataBundle:
  49. """
  50. # 转换tag
  51. for name, dataset in data_bundle.iter_datasets():
  52. dataset.apply_field(self.convert_tag, field_name='target', new_field_name='target')
  53. _add_words_field(data_bundle, lower=self.lower)
  54. # index
  55. _indexize(data_bundle)
  56. input_fields = ['target', 'words', 'seq_len']
  57. target_fields = ['target', 'seq_len']
  58. for name, dataset in data_bundle.iter_datasets():
  59. dataset.add_seq_len('words')
  60. return data_bundle
  61. class Conll2003NERPipe(_NERPipe):
  62. r"""
  63. Conll2003的NER任务的处理Pipe, 该Pipe会(1)复制raw_words列,并命名为words; (2)在words, target列建立词表
  64. (创建 :class:`fastNLP.Vocabulary` 对象,所以在返回的DataBundle中将有两个Vocabulary); (3)将words,target列根据相应的
  65. Vocabulary转换为index。
  66. 经过该Pipe过后,DataSet中的内容如下所示
  67. .. csv-table:: Following is a demo layout of DataSet returned by Conll2003Loader
  68. :header: "raw_words", "target", "words", "seq_len"
  69. "[Nadim, Ladki]", "[1, 2]", "[2, 3]", 2
  70. "[AL-AIN, United, Arab, ...]", "[3, 4,...]", "[4, 5, 6,...]", 6
  71. "[...]", "[...]", "[...]", .
  72. raw_words列为List[str], 是未转换的原始数据; words列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的
  73. target。返回的DataSet中被设置为input有words, target, seq_len; 设置为target有target。
  74. dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为::
  75. +-------------+-----------+--------+-------+---------+
  76. | field_names | raw_words | target | words | seq_len |
  77. +-------------+-----------+--------+-------+---------+
  78. | is_input | False | True | True | True |
  79. | is_target | False | True | False | True |
  80. | ignore_type | | False | False | False |
  81. | pad_value | | 0 | 0 | 0 |
  82. +-------------+-----------+--------+-------+---------+
  83. """
  84. def process_from_file(self, paths) -> DataBundle:
  85. r"""
  86. :param paths: 支持路径类型参见 :class:`fastNLP.io.loader.ConllLoader` 的load函数。
  87. :return: DataBundle
  88. """
  89. # 读取数据
  90. data_bundle = Conll2003NERLoader().load(paths)
  91. data_bundle = self.process(data_bundle)
  92. return data_bundle
  93. class Conll2003Pipe(Pipe):
  94. r"""
  95. 经过该Pipe后,DataSet中的内容如下
  96. .. csv-table::
  97. :header: "raw_words" , "pos", "chunk", "ner", "words", "seq_len"
  98. "[Nadim, Ladki]", "[0, 0]", "[1, 2]", "[1, 2]", "[2, 3]", 2
  99. "[AL-AIN, United, Arab, ...]", "[1, 2...]", "[3, 4...]", "[3, 4...]", "[4, 5, 6,...]", 6
  100. "[...]", "[...]", "[...]", "[...]", "[...]", .
  101. 其中words, seq_len是input; pos, chunk, ner, seq_len是target
  102. dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为::
  103. +-------------+-----------+-------+-------+-------+-------+---------+
  104. | field_names | raw_words | pos | chunk | ner | words | seq_len |
  105. +-------------+-----------+-------+-------+-------+-------+---------+
  106. | is_input | False | False | False | False | True | True |
  107. | is_target | False | True | True | True | False | True |
  108. | ignore_type | | False | False | False | False | False |
  109. | pad_value | | 0 | 0 | 0 | 0 | 0 |
  110. +-------------+-----------+-------+-------+-------+-------+---------+
  111. """
  112. def __init__(self, chunk_encoding_type='bioes', ner_encoding_type='bioes', lower: bool = False):
  113. r"""
  114. :param str chunk_encoding_type: 支持bioes, bio。
  115. :param str ner_encoding_type: 支持bioes, bio。
  116. :param bool lower: 是否将words列小写化后再建立词表
  117. """
  118. if chunk_encoding_type == 'bio':
  119. self.chunk_convert_tag = iob2
  120. elif chunk_encoding_type == 'bioes':
  121. self.chunk_convert_tag = lambda tags: iob2bioes(iob2(tags))
  122. else:
  123. raise ValueError("chunk_encoding_type only supports `bio` and `bioes`.")
  124. if ner_encoding_type == 'bio':
  125. self.ner_convert_tag = iob2
  126. elif ner_encoding_type == 'bioes':
  127. self.ner_convert_tag = lambda tags: iob2bioes(iob2(tags))
  128. else:
  129. raise ValueError("ner_encoding_type only supports `bio` and `bioes`.")
  130. self.lower = lower
  131. def process(self, data_bundle) -> DataBundle:
  132. r"""
  133. 输入的DataSet应该类似于如下的形式
  134. .. csv-table::
  135. :header: "raw_words", "pos", "chunk", "ner"
  136. "[Nadim, Ladki]", "[NNP, NNP]", "[B-NP, I-NP]", "[B-PER, I-PER]"
  137. "[AL-AIN, United, Arab, ...]", "[NNP, NNP...]", "[B-NP, B-NP, ...]", "[B-LOC, B-LOC,...]"
  138. "[...]", "[...]", "[...]", "[...]", .
  139. :param data_bundle:
  140. :return: 传入的DataBundle
  141. """
  142. # 转换tag
  143. for name, dataset in data_bundle.datasets.items():
  144. dataset.drop(lambda x: "-DOCSTART-" in x['raw_words'])
  145. dataset.apply_field(self.chunk_convert_tag, field_name='chunk', new_field_name='chunk')
  146. dataset.apply_field(self.ner_convert_tag, field_name='ner', new_field_name='ner')
  147. _add_words_field(data_bundle, lower=self.lower)
  148. # index
  149. _indexize(data_bundle, input_field_names='words', target_field_names=['pos', 'ner'])
  150. # chunk中存在一些tag只在dev中出现,没在train中
  151. tgt_vocab = Vocabulary(unknown=None, padding=None)
  152. tgt_vocab.from_dataset(*data_bundle.datasets.values(), field_name='chunk')
  153. tgt_vocab.index_dataset(*data_bundle.datasets.values(), field_name='chunk')
  154. data_bundle.set_vocab(tgt_vocab, 'chunk')
  155. input_fields = ['words', 'seq_len']
  156. target_fields = ['pos', 'ner', 'chunk', 'seq_len']
  157. for name, dataset in data_bundle.iter_datasets():
  158. dataset.add_seq_len('words')
  159. return data_bundle
  160. def process_from_file(self, paths):
  161. r"""
  162. :param paths:
  163. :return:
  164. """
  165. data_bundle = ConllLoader(headers=['raw_words', 'pos', 'chunk', 'ner']).load(paths)
  166. return self.process(data_bundle)
  167. class OntoNotesNERPipe(_NERPipe):
  168. r"""
  169. 处理OntoNotes的NER数据,处理之后DataSet中的field情况为
  170. .. csv-table::
  171. :header: "raw_words", "target", "words", "seq_len"
  172. "[Nadim, Ladki]", "[1, 2]", "[2, 3]", 2
  173. "[AL-AIN, United, Arab, ...]", "[3, 4]", "[4, 5, 6,...]", 6
  174. "[...]", "[...]", "[...]", .
  175. raw_words列为List[str], 是未转换的原始数据; words列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的
  176. target。返回的DataSet中被设置为input有words, target, seq_len; 设置为target有target。
  177. dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为::
  178. +-------------+-----------+--------+-------+---------+
  179. | field_names | raw_words | target | words | seq_len |
  180. +-------------+-----------+--------+-------+---------+
  181. | is_input | False | True | True | True |
  182. | is_target | False | True | False | True |
  183. | ignore_type | | False | False | False |
  184. | pad_value | | 0 | 0 | 0 |
  185. +-------------+-----------+--------+-------+---------+
  186. """
  187. def process_from_file(self, paths):
  188. data_bundle = OntoNotesNERLoader().load(paths)
  189. return self.process(data_bundle)
  190. class _CNNERPipe(Pipe):
  191. r"""
  192. 中文NER任务的处理Pipe, 该Pipe会(1)复制raw_chars列,并命名为chars; (2)在chars, target列建立词表
  193. (创建 :class:`fastNLP.Vocabulary` 对象,所以在返回的DataBundle中将有两个Vocabulary); (3)将chars,target列根据相应的
  194. Vocabulary转换为index。
  195. raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的
  196. target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target, seq_len。
  197. """
  198. def __init__(self, encoding_type: str = 'bio', bigrams=False, trigrams=False):
  199. r"""
  200. :param str encoding_type: target列使用什么类型的encoding方式,支持bioes, bio两种。
  201. :param bool bigrams: 是否增加一列bigrams. bigrams的构成是['复', '旦', '大', '学', ...]->["复旦", "旦大", ...]。如果
  202. 设置为True,返回的DataSet将有一列名为bigrams, 且已经转换为了index并设置为input,对应的vocab可以通过
  203. data_bundle.get_vocab('bigrams')获取.
  204. :param bool trigrams: 是否增加一列trigrams. trigrams的构成是 ['复', '旦', '大', '学', ...]->["复旦大", "旦大学", ...]
  205. 。如果设置为True,返回的DataSet将有一列名为trigrams, 且已经转换为了index并设置为input,对应的vocab可以通过
  206. data_bundle.get_vocab('trigrams')获取.
  207. """
  208. if encoding_type == 'bio':
  209. self.convert_tag = iob2
  210. elif encoding_type == 'bioes':
  211. self.convert_tag = lambda words: iob2bioes(iob2(words))
  212. else:
  213. raise ValueError("encoding_type only supports `bio` and `bioes`.")
  214. self.bigrams = bigrams
  215. self.trigrams = trigrams
  216. def process(self, data_bundle: DataBundle) -> DataBundle:
  217. r"""
  218. 支持的DataSet的field为
  219. .. csv-table::
  220. :header: "raw_chars", "target"
  221. "[相, 比, 之, 下,...]", "[O, O, O, O, ...]"
  222. "[青, 岛, 海, 牛, 队, 和, ...]", "[B-ORG, I-ORG, I-ORG, ...]"
  223. "[...]", "[...]"
  224. raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int],
  225. 是转换为index的target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。
  226. :param ~fastNLP.DataBundle data_bundle: 传入的DataBundle中的DataSet必须包含raw_words和ner两个field,且两个field的内容均为List[str]。在传入DataBundle基础上原位修改。
  227. :return: DataBundle
  228. """
  229. # 转换tag
  230. for name, dataset in data_bundle.datasets.items():
  231. dataset.apply_field(self.convert_tag, field_name='target', new_field_name='target')
  232. _add_chars_field(data_bundle, lower=False)
  233. input_field_names = ['chars']
  234. if self.bigrams:
  235. for name, dataset in data_bundle.iter_datasets():
  236. dataset.apply_field(lambda chars: [c1 + c2 for c1, c2 in zip(chars, chars[1:] + ['<eos>'])],
  237. field_name='chars', new_field_name='bigrams')
  238. input_field_names.append('bigrams')
  239. if self.trigrams:
  240. for name, dataset in data_bundle.datasets.items():
  241. dataset.apply_field(lambda chars: [c1 + c2 + c3 for c1, c2, c3 in
  242. zip(chars, chars[1:] + ['<eos>'], chars[2:] + ['<eos>'] * 2)],
  243. field_name='chars', new_field_name='trigrams')
  244. input_field_names.append('trigrams')
  245. # index
  246. _indexize(data_bundle, input_field_names, 'target')
  247. input_fields = ['target', 'seq_len'] + input_field_names
  248. target_fields = ['target', 'seq_len']
  249. for name, dataset in data_bundle.iter_datasets():
  250. dataset.add_seq_len('chars')
  251. return data_bundle
  252. class MsraNERPipe(_CNNERPipe):
  253. r"""
  254. 处理MSRA-NER的数据,处理之后的DataSet的field情况为
  255. .. csv-table::
  256. :header: "raw_chars", "target", "chars", "seq_len"
  257. "[相, 比, 之, 下,...]", "[0, 0, 0, 0, ...]", "[2, 3, 4, 5, ...]", 11
  258. "[青, 岛, 海, 牛, 队, 和, ...]", "[1, 2, 3, ...]", "[10, 21, ....]", 21
  259. "[...]", "[...]", "[...]", .
  260. raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的
  261. target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。
  262. dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为::
  263. +-------------+-----------+--------+-------+---------+
  264. | field_names | raw_chars | target | chars | seq_len |
  265. +-------------+-----------+--------+-------+---------+
  266. | is_input | False | True | True | True |
  267. | is_target | False | True | False | True |
  268. | ignore_type | | False | False | False |
  269. | pad_value | | 0 | 0 | 0 |
  270. +-------------+-----------+--------+-------+---------+
  271. """
  272. def process_from_file(self, paths=None) -> DataBundle:
  273. data_bundle = MsraNERLoader().load(paths)
  274. return self.process(data_bundle)
  275. class PeopleDailyPipe(_CNNERPipe):
  276. r"""
  277. 处理people daily的ner的数据,处理之后的DataSet的field情况为
  278. .. csv-table::
  279. :header: "raw_chars", "target", "chars", "seq_len"
  280. "[相, 比, 之, 下,...]", "[0, 0, 0, 0, ...]", "[2, 3, 4, 5, ...]", 11
  281. "[青, 岛, 海, 牛, 队, 和, ...]", "[1, 2, 3, ...]", "[10, 21, ....]", 21
  282. "[...]", "[...]", "[...]", .
  283. raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的
  284. target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。
  285. dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为::
  286. +-------------+-----------+--------+-------+---------+
  287. | field_names | raw_chars | target | chars | seq_len |
  288. +-------------+-----------+--------+-------+---------+
  289. | is_input | False | True | True | True |
  290. | is_target | False | True | False | True |
  291. | ignore_type | | False | False | False |
  292. | pad_value | | 0 | 0 | 0 |
  293. +-------------+-----------+--------+-------+---------+
  294. """
  295. def process_from_file(self, paths=None) -> DataBundle:
  296. data_bundle = PeopleDailyNERLoader().load(paths)
  297. return self.process(data_bundle)
  298. class WeiboNERPipe(_CNNERPipe):
  299. r"""
  300. 处理weibo的ner的数据,处理之后的DataSet的field情况为
  301. .. csv-table::
  302. :header: "raw_chars", "chars", "target", "seq_len"
  303. "['老', '百', '姓']", "[4, 3, 3]", "[38, 39, 40]", 3
  304. "['心']", "[0]", "[41]", 1
  305. "[...]", "[...]", "[...]", .
  306. raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的
  307. target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。
  308. dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为::
  309. +-------------+-----------+--------+-------+---------+
  310. | field_names | raw_chars | target | chars | seq_len |
  311. +-------------+-----------+--------+-------+---------+
  312. | is_input | False | True | True | True |
  313. | is_target | False | True | False | True |
  314. | ignore_type | | False | False | False |
  315. | pad_value | | 0 | 0 | 0 |
  316. +-------------+-----------+--------+-------+---------+
  317. """
  318. def process_from_file(self, paths=None) -> DataBundle:
  319. data_bundle = WeiboNERLoader().load(paths)
  320. return self.process(data_bundle)