import re


class SpanConverter:
    """Find spans of a sentence and substitute a tag for each of them.

    The base implementation locates spans with ``re.finditer`` using
    ``pattern`` and replaces every span with whatever
    ``span_to_special_tag`` returns (``replace_tag`` by default).
    Subclasses customise the pattern, the tag, or the span search itself.
    """

    def __init__(self, replace_tag, pattern):
        """Store the replacement tag and the regex pattern.

        Args:
            replace_tag: String substituted for each matched span.
            pattern: Regular expression handed to ``re.finditer``.
        """
        super().__init__()
        self.replace_tag = replace_tag
        self.pattern = pattern

    def _replace_spans(self, sentence, spans):
        """Return ``sentence`` with each ``(start, end)`` span replaced.

        ``spans`` must be ordered and non-overlapping. The text between
        spans is copied unchanged; each span is passed to
        ``span_to_special_tag``.
        """
        pieces = []
        prev_end = 0
        for start, end in spans:
            pieces.append(sentence[prev_end:start])
            pieces.append(self.span_to_special_tag(sentence[start:end]))
            prev_end = end
        pieces.append(sentence[prev_end:])
        # Join once instead of growing a string with ``+=`` in the loop.
        return ''.join(pieces)

    def find_certain_span_and_replace(self, sentence):
        """Return ``sentence`` with every pattern match replaced by its tag."""
        matches = re.finditer(self.pattern, sentence)
        return self._replace_spans(sentence, (m.span() for m in matches))

    def span_to_special_tag(self, span):
        """Return the replacement text for ``span`` (here: ``replace_tag``)."""
        return self.replace_tag

    def find_certain_span(self, sentence):
        """Return the ``(start, end)`` index pairs of all pattern matches."""
        return [m.span() for m in re.finditer(self.pattern, sentence)]


class AlphaSpanConverter(SpanConverter):
    """Converter for runs of ASCII letters."""

    def __init__(self):
        replace_tag = ''
        # Ideally this only handles spans made purely of letters, and it
        # does not handle <[a-zA-Z]+> (that should be a special tag).
        # A run only matches when followed by a CJK character
        # (U+4E00-U+9FFF), a space, or one of , % . ! < - "
        pattern = '[a-zA-Z]+(?=[\u4e00-\u9fff ,%.!<\\-"])'
        super().__init__(replace_tag, pattern)


class DigitSpanConverter(SpanConverter):
    """Converter for runs of digits and dots that start with a digit."""

    # Characters counted as a decimal point by ``span_to_special_tag``.
    _DECIMAL_POINTS = ('.', '﹒', '·')

    def __init__(self):
        replace_tag = ''
        # ``\\d`` is written with an escaped backslash: a bare ``\d`` in a
        # non-raw literal is an invalid string escape (SyntaxWarning on
        # Python 3.12+). The resulting pattern text is unchanged.
        pattern = '\\d[\\d\\.]*(?=[\u4e00-\u9fff ,%.!<-])'
        super().__init__(replace_tag, pattern)

    def span_to_special_tag(self, span):
        """Return the replacement text for a matched digit span.

        The span is returned unchanged when it contains exactly one decimal
        point and that point is its last character (such a span is not
        treated as a number). Every other case returns the empty string.

        Args:
            span: Non-empty matched text; an empty string raises
                ``IndexError``.
        """
        # Leading zero on a span longer than two characters.
        if span[0] == '0' and len(span) > 2:
            return ''

        # A span may contain more than one decimal point.
        decimal_point_count = sum(
            1 for char in span if char in self._DECIMAL_POINTS
        )

        if span[-1] in self._DECIMAL_POINTS:
            # A trailing decimal point means this is not a number.
            if decimal_point_count == 1:
                return span
            return ''

        # The remaining cases are kept apart although they currently all
        # produce the same (empty) replacement.
        if decimal_point_count == 1:
            return ''
        elif decimal_point_count > 1:
            return ''
        else:
            return ''


class TimeConverter(SpanConverter):
    """Converter for digit groups joined by colon-like characters."""

    def __init__(self):
        replace_tag = ''
        # See DigitSpanConverter for why ``\\d`` is escaped explicitly.
        pattern = '\\d+[::∶][\\d::∶]+(?=[\u4e00-\u9fff ,%.!<-])'
        super().__init__(replace_tag, pattern)


class MixNumAlphaConverter(SpanConverter):
    """Converter for runs that mix digits, letters and linking symbols.

    No regex pattern is used; spans are found by a character scanner. A run
    is a maximal stretch of ASCII digits, ASCII letters, ``/``, ``(``,
    ``)``, ``'``, ``′``, ``&`` and ``-``. A ``.`` neither starts nor ends a
    run, so dots inside or directly after a run are part of it. A run is
    reported only if it is ended by some other character and contains one of
    the character-kind combinations in ``_QUALIFYING_PAIRS``. A run that
    reaches the end of the sentence is not reported.
    """

    # Kind of every character that can start or extend a run.
    _CHAR_KINDS = {
        **dict.fromkeys('0123456789', 'number'),
        **dict.fromkeys("'′&-", 'link'),
        '/': 'slash',
        '(': 'bracket',
        ')': 'bracket',
        **dict.fromkeys(
            'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', 'alpha'
        ),
    }

    # A run qualifies when it contains both kinds of at least one pair.
    _QUALIFYING_PAIRS = (
        frozenset(('number', 'alpha')),
        frozenset(('link', 'alpha')),
        frozenset(('slash', 'alpha')),
        frozenset(('link', 'number')),
        frozenset(('number', 'bracket')),
        frozenset(('bracket', 'alpha')),
    )

    def __init__(self):
        replace_tag = ''
        pattern = None
        super().__init__(replace_tag, pattern)

    def _mixed_spans(self, sentence):
        """Scan ``sentence`` and return the qualifying ``(start, end)`` runs."""
        spans = []
        run_start = None  # index where the current run began, if any
        kinds_seen = set()
        for idx, char in enumerate(sentence):
            kind = self._CHAR_KINDS.get(char)
            if kind is not None:
                if run_start is None:
                    run_start = idx
                kinds_seen.add(kind)
            elif char == '.':
                # A dot never starts or terminates a run.
                continue
            elif run_start is not None:
                # Any other character terminates the current run.
                if any(pair <= kinds_seen for pair in self._QUALIFYING_PAIRS):
                    spans.append((run_start, idx))
                run_start = None
                kinds_seen = set()
        return spans

    def find_certain_span_and_replace(self, sentence):
        """Return ``sentence`` with every qualifying run replaced by its tag."""
        return self._replace_spans(sentence, self._mixed_spans(sentence))

    def find_certain_span(self, sentence):
        """Return the ``(start, end)`` index pairs of all qualifying runs."""
        return self._mixed_spans(sentence)


class EmailConverter(SpanConverter):
    """Converter for ``name@host``-shaped spans."""

    def __init__(self):
        replace_tag = ""
        # Alphanumerics, an ``@``, then alphanumerics / dots / ``@``; the
        # lookahead accepts a CJK character (U+4E00-U+9FFF), a space, or one
        # of , % . ! < - " $
        pattern = '[0-9a-zA-Z]+[@][.﹒0-9a-zA-Z@]+(?=[\u4e00-\u9fff ,%.!<\\-"$])'
        super().__init__(replace_tag, pattern)