optimize time cost

7 years ago · f030bfcfe1
--- a/jiagu/data/words.dic
+++ b/jiagu/data/words.dic
--- a/jiagu/mmseg.py
+++ b/jiagu/mmseg.py
@@ -31,8 +31,8 @@ class Trie(object):
            if c not in node:
                break
            node = node[c]
            if self.value in node:
                ret.append(node[self.value])
            if "value" in node:
                ret.append(node["value"])
        return ret
    def load(self):
@@ -42,20 +42,26 @@ class Trie(object):
 class Chunk:
    def __init__(self, words_list, chrs, word_freq):
    def __init__(self, words_list, chrs):
        # self.sentence_sep = ['?', '!', ';', '？', '！', '。', '；', '……', '…', "，", ",", "."]
        self.words = words_list
        self.lens_list = map(lambda x: len(x), words_list)
        self.length = sum(self.lens_list)
        self.mean = float(self.length) / len(words_list)
        self.var = sum(map(lambda x: (x - self.mean) ** 2, self.lens_list)) / len(self.words)
        self.entropy = sum([log(float(chrs.get(x, 1))) for x in words_list])
        # 计算词频信息熵
        self.word_entropy = sum([log(float(word_freq.get(x, 1))) for x in words_list])
        self.best_word = words_list[0]
        self.words_num = len(words_list)
        self.length = 0
        self.entropy = 0
        length_square = 0
        for word in words_list:
            word_length = len(word)
            self.length += word_length
            self.entropy += log(chrs.get(word, 1))
            length_square += word_length * word_length
        self.mean = self.length / self.words_num
        self.var = length_square / self.words_num - (self.length / self.words_num) * (self.length / self.words_num)
    def __lt__(self, other):
        return (self.length, self.mean, -self.var, self.entropy, self.word_entropy) < \
               (other.length, other.mean, -other.var, other.entropy, other.word_entropy)
        return (self.length, self.mean, -self.var, self.entropy) < \
               (other.length, other.mean, -other.var, other.entropy)
 class MMSeg:
@@ -66,8 +72,6 @@ class MMSeg:
        self.words_dic = trie
        # 加载字频字典
        self.chrs_dic = self._load_freq(filename="data/chars.dic")
        # 加载词频字典
        self.word_freq = self._load_freq(filename="data/words.dic")
    def _load_freq(self, filename):
        chrs_dic = defaultdict()
@@ -79,29 +83,30 @@ class MMSeg:
        return chrs_dic
    def __get_start_words(self, sentence):
        match_words = self.words_dic.get_matches(sentence)
        if sentence:
            if not match_words:
                return [sentence[0]]
            else:
                return match_words
            match_words = self.words_dic.get_matches(sentence)
            return match_words if match_words else [sentence[0]]
        else:
            return False
    def __get_chunks(self, sentence):
        # 获取chunk，每个chunk中最多三个词
        ret = []
        def _iter_chunk(sentence, num, tmp_seg_words):
            match_words = self.__get_start_words(sentence)
            if (not match_words or num == 0) and tmp_seg_words:
                ret.append(Chunk(tmp_seg_words, self.chrs_dic, self.word_freq))
        first_match_words = self.__get_start_words(sentence)
        for word_one in first_match_words:
            word_one_length = len(word_one)
            second_match_words = self.__get_start_words(sentence[word_one_length:])
            if second_match_words:
                for word_two in second_match_words:
                    word_two_length = len(word_two) + word_one_length
                    third_match_words = self.__get_start_words(sentence[word_two_length:])
                    if third_match_words:
                        for word_three in third_match_words:
                            yield (Chunk([word_one, word_two, word_three], self.chrs_dic))
                    else:
                        yield (Chunk([word_one, word_two], self.chrs_dic))
            else:
                for word in match_words:
                    _iter_chunk(sentence[len(word):], num - 1, tmp_seg_words + [word])
        _iter_chunk(sentence, num=3, tmp_seg_words=[])
        return ret
                yield (Chunk([word_one], self.chrs_dic))
    def cws(self, sentence):
        """
@@ -110,7 +115,7 @@ class MMSeg:
        """
        while sentence:
            chunks = self.__get_chunks(sentence)
            word = max(chunks).words[0]
            word = max(chunks).best_word
            sentence = sentence[len(word):]
            yield word