You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

converter.py 6.4 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181
  1. import re
  2. class SpanConverter:
  3. def __init__(self, replace_tag, pattern):
  4. super(SpanConverter, self).__init__()
  5. self.replace_tag = replace_tag
  6. self.pattern = pattern
  7. def find_certain_span_and_replace(self, sentence):
  8. replaced_sentence = ''
  9. prev_end = 0
  10. for match in re.finditer(self.pattern, sentence):
  11. start, end = match.span()
  12. span = sentence[start:end]
  13. replaced_sentence += sentence[prev_end:start] + self.span_to_special_tag(span)
  14. prev_end = end
  15. replaced_sentence += sentence[prev_end:]
  16. return replaced_sentence
  17. def span_to_special_tag(self, span):
  18. return self.replace_tag
  19. def find_certain_span(self, sentence):
  20. spans = []
  21. for match in re.finditer(self.pattern, sentence):
  22. spans.append(match.span())
  23. return spans
  24. class AlphaSpanConverter(SpanConverter):
  25. def __init__(self):
  26. replace_tag = '<ALPHA>'
  27. # 理想状态下仅处理纯为字母的情况, 但不处理<[a-zA-Z]+>(因为这应该是特殊的tag).
  28. pattern = '[a-zA-Z]+(?=[\u4e00-\u9fff ,%.!<\\-"])'
  29. super(AlphaSpanConverter, self).__init__(replace_tag, pattern)
  30. class DigitSpanConverter(SpanConverter):
  31. def __init__(self):
  32. replace_tag = '<NUM>'
  33. pattern = '\d[\d\\.]*(?=[\u4e00-\u9fff ,%.!<-])'
  34. super(DigitSpanConverter, self).__init__(replace_tag, pattern)
  35. def span_to_special_tag(self, span):
  36. # return self.special_tag
  37. if span[0] == '0' and len(span) > 2:
  38. return '<NUM>'
  39. decimal_point_count = 0 # one might have more than one decimal pointers
  40. for idx, char in enumerate(span):
  41. if char == '.' or char == '﹒' or char == '·':
  42. decimal_point_count += 1
  43. if span[-1] == '.' or span[-1] == '﹒' or span[-1] == '·':
  44. # last digit being decimal point means this is not a number
  45. if decimal_point_count == 1:
  46. return span
  47. else:
  48. return '<UNKDGT>'
  49. if decimal_point_count == 1:
  50. return '<DEC>'
  51. elif decimal_point_count > 1:
  52. return '<UNKDGT>'
  53. else:
  54. return '<NUM>'
  55. class TimeConverter(SpanConverter):
  56. def __init__(self):
  57. replace_tag = '<TOC>'
  58. pattern = '\d+[::∶][\d::∶]+(?=[\u4e00-\u9fff ,%.!<-])'
  59. super().__init__(replace_tag, pattern)
  60. class MixNumAlphaConverter(SpanConverter):
  61. def __init__(self):
  62. replace_tag = '<MIX>'
  63. pattern = None
  64. super().__init__(replace_tag, pattern)
  65. def find_certain_span_and_replace(self, sentence):
  66. replaced_sentence = ''
  67. start = 0
  68. matching_flag = False
  69. number_flag = False
  70. alpha_flag = False
  71. link_flag = False
  72. slash_flag = False
  73. bracket_flag = False
  74. for idx in range(len(sentence)):
  75. if re.match('[0-9a-zA-Z/\\(\\)\'′&\\-]', sentence[idx]):
  76. if not matching_flag:
  77. replaced_sentence += sentence[start:idx]
  78. start = idx
  79. if re.match('[0-9]', sentence[idx]):
  80. number_flag = True
  81. elif re.match('[\'′&\\-]', sentence[idx]):
  82. link_flag = True
  83. elif re.match('/', sentence[idx]):
  84. slash_flag = True
  85. elif re.match('[\\(\\)]', sentence[idx]):
  86. bracket_flag = True
  87. else:
  88. alpha_flag = True
  89. matching_flag = True
  90. elif re.match('[\\.]', sentence[idx]):
  91. pass
  92. else:
  93. if matching_flag:
  94. if (number_flag and alpha_flag) or (link_flag and alpha_flag) \
  95. or (slash_flag and alpha_flag) or (link_flag and number_flag) \
  96. or (number_flag and bracket_flag) or (bracket_flag and alpha_flag):
  97. span = sentence[start:idx]
  98. start = idx
  99. replaced_sentence += self.span_to_special_tag(span)
  100. matching_flag = False
  101. number_flag = False
  102. alpha_flag = False
  103. link_flag = False
  104. slash_flag = False
  105. bracket_flag = False
  106. replaced_sentence += sentence[start:]
  107. return replaced_sentence
  108. def find_certain_span(self, sentence):
  109. spans = []
  110. start = 0
  111. matching_flag = False
  112. number_flag = False
  113. alpha_flag = False
  114. link_flag = False
  115. slash_flag = False
  116. bracket_flag = False
  117. for idx in range(len(sentence)):
  118. if re.match('[0-9a-zA-Z/\\(\\)\'′&\\-]', sentence[idx]):
  119. if not matching_flag:
  120. start = idx
  121. if re.match('[0-9]', sentence[idx]):
  122. number_flag = True
  123. elif re.match('[\'′&\\-]', sentence[idx]):
  124. link_flag = True
  125. elif re.match('/', sentence[idx]):
  126. slash_flag = True
  127. elif re.match('[\\(\\)]', sentence[idx]):
  128. bracket_flag = True
  129. else:
  130. alpha_flag = True
  131. matching_flag = True
  132. elif re.match('[\\.]', sentence[idx]):
  133. pass
  134. else:
  135. if matching_flag:
  136. if (number_flag and alpha_flag) or (link_flag and alpha_flag) \
  137. or (slash_flag and alpha_flag) or (link_flag and number_flag) \
  138. or (number_flag and bracket_flag) or (bracket_flag and alpha_flag):
  139. spans.append((start, idx))
  140. start = idx
  141. matching_flag = False
  142. number_flag = False
  143. alpha_flag = False
  144. link_flag = False
  145. slash_flag = False
  146. bracket_flag = False
  147. return spans
  148. class EmailConverter(SpanConverter):
  149. def __init__(self):
  150. replaced_tag = "<EML>"
  151. pattern = '[0-9a-zA-Z]+[@][.﹒0-9a-zA-Z@]+(?=[\u4e00-\u9fff ,%.!<\\-"$])'
  152. super(EmailConverter, self).__init__(replaced_tag, pattern)