| @@ -31,8 +31,8 @@ class Trie(object): | |||||
| if c not in node: | if c not in node: | ||||
| break | break | ||||
| node = node[c] | node = node[c] | ||||
| if self.value in node: | |||||
| ret.append(node[self.value]) | |||||
| if "value" in node: | |||||
| ret.append(node["value"]) | |||||
| return ret | return ret | ||||
| def load(self): | def load(self): | ||||
| @@ -42,20 +42,26 @@ class Trie(object): | |||||
| class Chunk: | class Chunk: | ||||
| def __init__(self, words_list, chrs, word_freq): | |||||
| def __init__(self, words_list, chrs): | |||||
| # self.sentence_sep = ['?', '!', ';', '?', '!', '。', ';', '……', '…', ",", ",", "."] | # self.sentence_sep = ['?', '!', ';', '?', '!', '。', ';', '……', '…', ",", ",", "."] | ||||
| self.words = words_list | |||||
| self.lens_list = map(lambda x: len(x), words_list) | |||||
| self.length = sum(self.lens_list) | |||||
| self.mean = float(self.length) / len(words_list) | |||||
| self.var = sum(map(lambda x: (x - self.mean) ** 2, self.lens_list)) / len(self.words) | |||||
| self.entropy = sum([log(float(chrs.get(x, 1))) for x in words_list]) | |||||
| # 计算词频信息熵 | |||||
| self.word_entropy = sum([log(float(word_freq.get(x, 1))) for x in words_list]) | |||||
| self.best_word = words_list[0] | |||||
| self.words_num = len(words_list) | |||||
| self.length = 0 | |||||
| self.entropy = 0 | |||||
| length_square = 0 | |||||
| for word in words_list: | |||||
| word_length = len(word) | |||||
| self.length += word_length | |||||
| self.entropy += log(chrs.get(word, 1)) | |||||
| length_square += word_length * word_length | |||||
| self.mean = self.length / self.words_num | |||||
| self.var = length_square / self.words_num - (self.length / self.words_num) * (self.length / self.words_num) | |||||
| def __lt__(self, other): | def __lt__(self, other): | ||||
| return (self.length, self.mean, -self.var, self.entropy, self.word_entropy) < \ | |||||
| (other.length, other.mean, -other.var, other.entropy, other.word_entropy) | |||||
| return (self.length, self.mean, -self.var, self.entropy) < \ | |||||
| (other.length, other.mean, -other.var, other.entropy) | |||||
| class MMSeg: | class MMSeg: | ||||
| @@ -66,8 +72,6 @@ class MMSeg: | |||||
| self.words_dic = trie | self.words_dic = trie | ||||
| # 加载字频字典 | # 加载字频字典 | ||||
| self.chrs_dic = self._load_freq(filename="data/chars.dic") | self.chrs_dic = self._load_freq(filename="data/chars.dic") | ||||
| # 加载词频字典 | |||||
| self.word_freq = self._load_freq(filename="data/words.dic") | |||||
| def _load_freq(self, filename): | def _load_freq(self, filename): | ||||
| chrs_dic = defaultdict() | chrs_dic = defaultdict() | ||||
| @@ -79,29 +83,30 @@ class MMSeg: | |||||
| return chrs_dic | return chrs_dic | ||||
| def __get_start_words(self, sentence): | def __get_start_words(self, sentence): | ||||
| match_words = self.words_dic.get_matches(sentence) | |||||
| if sentence: | if sentence: | ||||
| if not match_words: | |||||
| return [sentence[0]] | |||||
| else: | |||||
| return match_words | |||||
| match_words = self.words_dic.get_matches(sentence) | |||||
| return match_words if match_words else [sentence[0]] | |||||
| else: | else: | ||||
| return False | return False | ||||
| def __get_chunks(self, sentence): | def __get_chunks(self, sentence): | ||||
| # 获取chunk,每个chunk中最多三个词 | # 获取chunk,每个chunk中最多三个词 | ||||
| ret = [] | |||||
| def _iter_chunk(sentence, num, tmp_seg_words): | |||||
| match_words = self.__get_start_words(sentence) | |||||
| if (not match_words or num == 0) and tmp_seg_words: | |||||
| ret.append(Chunk(tmp_seg_words, self.chrs_dic, self.word_freq)) | |||||
| first_match_words = self.__get_start_words(sentence) | |||||
| for word_one in first_match_words: | |||||
| word_one_length = len(word_one) | |||||
| second_match_words = self.__get_start_words(sentence[word_one_length:]) | |||||
| if second_match_words: | |||||
| for word_two in second_match_words: | |||||
| word_two_length = len(word_two) + word_one_length | |||||
| third_match_words = self.__get_start_words(sentence[word_two_length:]) | |||||
| if third_match_words: | |||||
| for word_three in third_match_words: | |||||
| yield (Chunk([word_one, word_two, word_three], self.chrs_dic)) | |||||
| else: | |||||
| yield (Chunk([word_one, word_two], self.chrs_dic)) | |||||
| else: | else: | ||||
| for word in match_words: | |||||
| _iter_chunk(sentence[len(word):], num - 1, tmp_seg_words + [word]) | |||||
| _iter_chunk(sentence, num=3, tmp_seg_words=[]) | |||||
| return ret | |||||
| yield (Chunk([word_one], self.chrs_dic)) | |||||
| def cws(self, sentence): | def cws(self, sentence): | ||||
| """ | """ | ||||
| @@ -110,7 +115,7 @@ class MMSeg: | |||||
| """ | """ | ||||
| while sentence: | while sentence: | ||||
| chunks = self.__get_chunks(sentence) | chunks = self.__get_chunks(sentence) | ||||
| word = max(chunks).words[0] | |||||
| word = max(chunks).best_word | |||||
| sentence = sentence[len(word):] | sentence = sentence[len(word):] | ||||
| yield word | yield word | ||||