Browse Source

optimize time cost

master
zhangmiao 7 years ago
parent
commit
f030bfcfe1
2 changed files with 37 additions and 349078 deletions
  1. +0
    -349046
      jiagu/data/words.dic
  2. +37
    -32
      jiagu/mmseg.py

+ 0
- 349046
jiagu/data/words.dic
File diff suppressed because it is too large
View File


+ 37
- 32
jiagu/mmseg.py View File

@@ -31,8 +31,8 @@ class Trie(object):
if c not in node: if c not in node:
break break
node = node[c] node = node[c]
if self.value in node:
ret.append(node[self.value])
if "value" in node:
ret.append(node["value"])
return ret return ret


def load(self): def load(self):
@@ -42,20 +42,26 @@ class Trie(object):




class Chunk: class Chunk:
def __init__(self, words_list, chrs, word_freq):
def __init__(self, words_list, chrs):
# self.sentence_sep = ['?', '!', ';', '?', '!', '。', ';', '……', '…', ",", ",", "."] # self.sentence_sep = ['?', '!', ';', '?', '!', '。', ';', '……', '…', ",", ",", "."]
self.words = words_list
self.lens_list = map(lambda x: len(x), words_list)
self.length = sum(self.lens_list)
self.mean = float(self.length) / len(words_list)
self.var = sum(map(lambda x: (x - self.mean) ** 2, self.lens_list)) / len(self.words)
self.entropy = sum([log(float(chrs.get(x, 1))) for x in words_list])
# 计算词频信息熵
self.word_entropy = sum([log(float(word_freq.get(x, 1))) for x in words_list])
self.best_word = words_list[0]
self.words_num = len(words_list)
self.length = 0
self.entropy = 0
length_square = 0

for word in words_list:
word_length = len(word)
self.length += word_length
self.entropy += log(chrs.get(word, 1))
length_square += word_length * word_length

self.mean = self.length / self.words_num
self.var = length_square / self.words_num - (self.length / self.words_num) * (self.length / self.words_num)


def __lt__(self, other): def __lt__(self, other):
return (self.length, self.mean, -self.var, self.entropy, self.word_entropy) < \
(other.length, other.mean, -other.var, other.entropy, other.word_entropy)
return (self.length, self.mean, -self.var, self.entropy) < \
(other.length, other.mean, -other.var, other.entropy)




class MMSeg: class MMSeg:
@@ -66,8 +72,6 @@ class MMSeg:
self.words_dic = trie self.words_dic = trie
# 加载字频字典 # 加载字频字典
self.chrs_dic = self._load_freq(filename="data/chars.dic") self.chrs_dic = self._load_freq(filename="data/chars.dic")
# 加载词频字典
self.word_freq = self._load_freq(filename="data/words.dic")


def _load_freq(self, filename): def _load_freq(self, filename):
chrs_dic = defaultdict() chrs_dic = defaultdict()
@@ -79,29 +83,30 @@ class MMSeg:
return chrs_dic return chrs_dic


def __get_start_words(self, sentence): def __get_start_words(self, sentence):
match_words = self.words_dic.get_matches(sentence)
if sentence: if sentence:
if not match_words:
return [sentence[0]]
else:
return match_words
match_words = self.words_dic.get_matches(sentence)
return match_words if match_words else [sentence[0]]
else: else:
return False return False


def __get_chunks(self, sentence): def __get_chunks(self, sentence):
# 获取chunk,每个chunk中最多三个词 # 获取chunk,每个chunk中最多三个词
ret = []

def _iter_chunk(sentence, num, tmp_seg_words):
match_words = self.__get_start_words(sentence)
if (not match_words or num == 0) and tmp_seg_words:
ret.append(Chunk(tmp_seg_words, self.chrs_dic, self.word_freq))
first_match_words = self.__get_start_words(sentence)

for word_one in first_match_words:
word_one_length = len(word_one)
second_match_words = self.__get_start_words(sentence[word_one_length:])
if second_match_words:
for word_two in second_match_words:
word_two_length = len(word_two) + word_one_length
third_match_words = self.__get_start_words(sentence[word_two_length:])
if third_match_words:
for word_three in third_match_words:
yield (Chunk([word_one, word_two, word_three], self.chrs_dic))
else:
yield (Chunk([word_one, word_two], self.chrs_dic))
else: else:
for word in match_words:
_iter_chunk(sentence[len(word):], num - 1, tmp_seg_words + [word])
_iter_chunk(sentence, num=3, tmp_seg_words=[])

return ret
yield (Chunk([word_one], self.chrs_dic))


def cws(self, sentence): def cws(self, sentence):
""" """
@@ -110,7 +115,7 @@ class MMSeg:
""" """
while sentence: while sentence:
chunks = self.__get_chunks(sentence) chunks = self.__get_chunks(sentence)
word = max(chunks).words[0]
word = max(chunks).best_word
sentence = sentence[len(word):] sentence = sentence[len(word):]
yield word yield word




Loading…
Cancel
Save