From d61108fa1379fd3daf69a0ee9527bf92b60c8b5e Mon Sep 17 00:00:00 2001 From: Yener Date: Sat, 7 Dec 2019 23:57:08 +0800 Subject: [PATCH] update --- jiagu/analyze.py | 10 +- jiagu/segment/dict/jiagu.dict | 5 +- jiagu/{ => segment}/model/cws.model | Bin jiagu/segment/nroute.py | 159 ++++++++++++++++++- jiagu/segment/perceptron.py | 227 ++++++++++++++++++++++++++++ setup.py | 2 +- test.py | 30 ++++ 7 files changed, 414 insertions(+), 19 deletions(-) rename jiagu/{ => segment}/model/cws.model (100%) create mode 100644 jiagu/segment/perceptron.py create mode 100644 test.py diff --git a/jiagu/analyze.py b/jiagu/analyze.py index 29ab5bd..a166394 100644 --- a/jiagu/analyze.py +++ b/jiagu/analyze.py @@ -104,20 +104,14 @@ class Analyze(object): :param sentence: str or list 文本或者文本列表,根据input的模式来定 - :param input: str - 句子输入的格式,text则为默认的文本,batch则为批量的文本列表 :param model: str - 分词所使用的模式,default为默认模式,mmseg为mmseg分词方式 + 分词所使用的模式,default为默认模式包含新词发现 :return: """ if model == 'default': self.init_cws() words = self.cws_text(sentence) return words - elif model == 'mmseg': - self.init_mmseg() - words = self.seg_mmseg.cws(sentence) - return words else: pass return [] @@ -158,7 +152,7 @@ class Analyze(object): def text_cluster(self, docs, features_method='tfidf', method="k-means", k=3, max_iter=100, eps=0.5, min_pts=2): return cluster(docs, features_method, method, k, max_iter, eps, min_pts, self.seg) - + def lab2spo(self, words, epp_labels): subject_list = [] # 存放实体的列表 object_list = [] diff --git a/jiagu/segment/dict/jiagu.dict b/jiagu/segment/dict/jiagu.dict index e748331..5be6cb1 100644 --- a/jiagu/segment/dict/jiagu.dict +++ b/jiagu/segment/dict/jiagu.dict @@ -174122,7 +174122,7 @@ win键 7 河北涿县 3 河北满城 3 河北电视台 5 -河北省 358 +河北省 359 河北省保定 3 河北省妇联 3 河北省委 17 @@ -174131,7 +174131,6 @@ win键 7 河北省科协 3 河北省科委 3 河北省纪委 3 -河北省衡水 3 河北省邯郸 3 河北籍 3 河北赵县 3 @@ -245176,7 +245175,7 @@ win键 7 衡庭汉 34 衡志诚 4 衡某 3 -衡水 29 +衡水 30 衡水市 17 衡讯 3 衡诸 8 diff --git a/jiagu/model/cws.model b/jiagu/segment/model/cws.model similarity index 100% rename from jiagu/model/cws.model rename to jiagu/segment/model/cws.model diff --git a/jiagu/segment/nroute.py b/jiagu/segment/nroute.py index d55881c..2a72098 100644 --- a/jiagu/segment/nroute.py +++ b/jiagu/segment/nroute.py @@ -2,6 +2,7 @@ import os import sys from math import log +from jiagu.perceptron import Perceptron re_eng = re.compile('[a-zA-Z0-9]', re.U) re_han = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%\-]+)", re.U) @@ -14,10 +15,13 @@ class Segment: self.max_freq = 0 self.total_freq = 0 self.initialized = False + self.model = None - def init(self, vocab_path='dict/jiagu.dict', user_vocab='dict/user.dict'): + def init(self, vocab_path='dict/jiagu.dict', user_vocab='dict/user.dict', + model_path='model/cws.model'): self.load_vocab(os.path.join(os.path.dirname(__file__), vocab_path)) self.load_vocab(os.path.join(os.path.dirname(__file__), user_vocab)) + self.model = Perceptron(os.path.join(os.path.dirname(__file__), model_path)) self.initialized = True def load_vocab(self, vocab_path): @@ -52,6 +56,18 @@ class Segment: if len(word) > self.max_word_len: self.max_word_len = len(word) + def del_vocab(self, word=None, freq=None, tag=None): + if word not in self.vocab: + return None + + vocab_freq = self.vocab[word] + if freq == None or vocab_freq <= freq: + del self.vocab[word] + self.total_freq -= vocab_freq + else: + self.vocab[word] -= freq + # self.max_freq and self.max_word_len ? + def load_userdict(self, userdict): if self.initialized == False: self.init() @@ -70,6 +86,22 @@ class Segment: self.add_vocab(word, freq) elif isinstance(item, str): self.add_vocab(word=item) + + def del_userdict(self, userdict): + if self.initialized == False: + self.init() + + for item in userdict: + if isinstance(item, list): + if len(item) == 1: + word = item[0] + self.del_vocab(word) + elif len(item) == 2: + word = item[0] + freq = item[1] + self.del_vocab(word, freq) + elif isinstance(item, str): + self.del_vocab(word=item) def calc_route(self, sentence, DAG, route): vocab = self.vocab @@ -149,7 +181,39 @@ class Segment: yield buf buf = '' - def seg_default(self, sentence, mode): + def model_cut(self, sentence): + if sentence == '': + return [''] + + sentence = list(sentence) + labels = self.model.predict(sentence) + return self.__lab2word(sentence, labels) + + def __lab2word(self, sentence, labels): + sen_len = len(sentence) + tmp_word = "" + words = [] + for i in range(sen_len): + label = labels[i] + w = sentence[i] + if label == "B": + tmp_word += w + elif label == "M": + tmp_word += w + elif label == "E": + tmp_word += w + words.append(tmp_word) + tmp_word = "" + else: + if tmp_word != '': + words.append(tmp_word) + tmp_word = "" + words.append(w) + if tmp_word: + words.append(tmp_word) + return words + + def seg_default(self, sentence): blocks = re_han.split(sentence) cut_block = self.cut_words cut_all = False @@ -170,11 +234,75 @@ class Segment: else: yield x + def seg_new_word(self, sentence): + blocks = re_han.split(sentence) + cut_block = self.cut_words + cut_all = False + for block in blocks: + if not block: + continue + if re_han.match(block): + words1 = list(cut_block(block)) + print(words1) + + words2 = self.model_cut(block) + print(words2) + + + # new_word = [] # 有冲突的不加,长度大于4的不加,加完记得删除 + # length = len(words1) + # for n in range(3): + # can_limit = length - n + 1 + # for i in range(0, can_limit): + # ngram = ''.join(words1[i:i + n]) + # word_len = len(ngram) + # if word_len > 4 or word_len==1: + # continue + # if ngram in words2 and ngram not in words1: + # print(ngram) + # new_word.append([ngram, 1]) + + new_word = [] + for word in words2: + if word not in words1 and len(word)>1 and len(word) < 4 :#and not re_eng.match(word): + new_word.append([word, 1]) + + + self.load_userdict(new_word) + + + + # print('------------------') + + for word in cut_block(block): + yield word + + # 删除字典 + self.del_userdict(new_word) + + + else: + tmp = re_skip.split(block) + for x in tmp: + if re_skip.match(x): + yield x + elif not cut_all: + for xx in x: + yield xx + else: + yield x + def seg(self, sentence, mode="default"): if self.initialized == False: self.init() - - return list(self.seg_default(sentence, mode=mode)) + + if mode == 'probe': + return list(self.seg_new_word(sentence)) + else: + return list(self.seg_default(sentence)) + + + if __name__=='__main__': s = Segment() @@ -182,13 +310,30 @@ if __name__=='__main__': # sg.load_userdict('dict/user.dict') # s.load_userdict(['知识图谱']) - text = '情感分析了解一下?一个比情感词典、机器学习更好的方法' + # text = '辽宁省铁岭市西丰县房木镇潭清村东屯' # bug + # text = '黑龙江省双鸭山市宝清县宝清镇通达街341号' + # text = '浙江省杭州市西湖区三墩镇紫宣路158号1幢801室' + # text = '北京市西城区茶马街8号院1号楼15层1502' + # text = '西藏自治区林芝市米林县羌纳乡羌渡岗村' + # text = '深圳市南山区西丽街道松坪山社区宝深路科陆大厦B座13层B05' + # text = '深圳市福田区福强路中港城裙楼6E部分602-A' # bug + # text = '深圳市福田区福保街道石厦北二街89号新港商城C座3305室' + # text = '五常市向阳镇致富村庆丰营屯' + # text = '中牟县中兴路与益民巷交叉口路南' + # text = '黄山市屯溪区华馨路38号二楼' + text = '银川市金凤区北京中路福宁城11-1-号' + + # 直接将新词动态加入新词的字典中,有冲突的不加,加完记得删除 - words = s.seg(text) + + # words = s.seg(text) + # print(words) + + words = s.seg(text, 'probe') + print('----------------') print(words) - diff --git a/jiagu/segment/perceptron.py b/jiagu/segment/perceptron.py new file mode 100644 index 0000000..3ab7584 --- /dev/null +++ b/jiagu/segment/perceptron.py @@ -0,0 +1,227 @@ +# -*- coding:utf-8 -*- +import os +import gzip +import pickle +import random +from collections import defaultdict + +class AveragedPerceptron(object): + def __init__(self): + # Each feature gets its own weight vector, so weights is a dict-of-dicts + self.weights = {} + self.classes = set() + # The accumulated values, for the averaging. These will be keyed by + # feature/clas tuples + self._totals = defaultdict(int) + # The last time the feature was changed, for the averaging. Also + # keyed by feature/clas tuples + # (tstamps is short for timestamps) + self._tstamps = defaultdict(int) + # Number of instances seen + self.i = 0 + + def predict(self, features): + '''Dot-product the features and current weights and return the best label.''' + scores = defaultdict(float) + for feat, value in features.items(): + if feat not in self.weights or value == 0: + continue + weights = self.weights[feat] + for label, weight in weights.items(): + scores[label] += value * weight + # Do a secondary alphabetic sort, for stability + return max(self.classes, key=lambda label: (scores[label], label)) + + def update(self, truth, guess, features): + '''Update the feature weights.''' + def upd_feat(c, f, w, v): + param = (f, c) + self._totals[param] += (self.i - self._tstamps[param]) * w + self._tstamps[param] = self.i + self.weights[f][c] = w + v + + self.i += 1 + if truth == guess: + return None + for f in features: + weights = self.weights.setdefault(f, {}) + upd_feat(truth, f, weights.get(truth, 0.0), 1.0) + upd_feat(guess, f, weights.get(guess, 0.0), -1.0) + return None + + def average_weights(self): + '''Average weights from all iterations.''' + for feat, weights in self.weights.items(): + new_feat_weights = {} + for clas, weight in weights.items(): + param = (feat, clas) + total = self._totals[param] + total += (self.i - self._tstamps[param]) * weight + averaged = round(total / float(self.i), 3) + if averaged: + new_feat_weights[clas] = averaged + self.weights[feat] = new_feat_weights + return None + +class Perceptron: + def __init__(self, loc=None): + self.START = ['-START-', '-START2-'] + self.END = ['-END-', '-END2-'] + self.model = AveragedPerceptron() + + if loc != None: + self.load(loc) + + def predict(self, words): + prev, prev2 = self.START + labels = [] + context = self.START + words + self.END + for i, word in enumerate(words): + features = self._get_features(i, word, context, prev, prev2) + tag = self.model.predict(features) + labels.append(tag) + prev2 = prev + prev = tag + return labels + + def train(self, sentences, save_loc=None, nr_iter=5, shuf=False): + self._make_tagdict(sentences) + for iter_ in range(nr_iter): + c = 0 + n = 0 + for words, tags in sentences: + prev, prev2 = self.START + context = self.START + words + self.END + for i, word in enumerate(words): + feats = self._get_features(i, word, context, prev, prev2) + guess = self.model.predict(feats) + self.model.update(tags[i], guess, feats) + + prev2 = prev + prev = guess + c += guess == tags[i] + n += 1 + if shuf == True: + random.shuffle(sentences) + + print("Iter {0}: {1}/{2}={3}".format(iter_, c, n, (float(c) / n) * 100)) + self.save(save_loc) + + self.model.average_weights() + self.save(save_loc) + + def save(self, loc='model/ap.model', zip=True): + if zip == False: + pickle.dump((self.model.weights, self.model.classes), open(loc, 'wb')) + else: + pickle.dump((self.model.weights, self.model.classes), gzip.open(loc, 'wb')) + + def load(self, loc='model/ap.model', zip=True): + if zip == False: + self.model.weights, self.model.classes = pickle.load(open(loc, 'rb')) + else: + self.model.weights, self.model.classes = pickle.load(gzip.open(loc,'rb')) + + def _get_features(self, i, word, context, prev, prev2): + '''Map tokens into a feature representation, implemented as a + {hashable: float} dict. If the features change, a new model must be + trained. + ''' + def add(name, *args): + features[' '.join((name,) + tuple(args))] += 1 + + i += len(self.START) + features = defaultdict(int) + # It's useful to have a constant feature, which acts sort of like a prior + add('bias') + add('i suffix', word[-3:]) + add('i pref1', word[0]) + add('i-1 tag', prev) + add('i-2 tag', prev2) + add('i tag+i-2 tag', prev, prev2) + add('i word', context[i]) + add('i-1 tag+i word', prev, context[i]) + add('i-1 word', context[i - 1]) + add('i-1 suffix', context[i - 1][-3:]) + add('i-2 word', context[i - 2]) + add('i+1 word', context[i + 1]) + add('i+1 suffix', context[i + 1][-3:]) + add('i+2 word', context[i + 2]) + return features + + def _make_tagdict(self, sentences): + '''Make a tag dictionary for single-tag words.''' + for words, tags in sentences: + for word, tag in zip(words, tags): + self.model.classes.add(tag) + +def train(filepath='data/train.txt', model='model/ap.model', nr_iter=1): + tagger = Perceptron() + print('Reading corpus...') + training_data = [] + sentence = ([], []) + fin = open(filepath, 'r', encoding='utf8') + for index, line in enumerate(fin): + line = line.strip() + if line == '': + training_data.append(sentence) + sentence = ([], []) + else: + params = line.split() + if len(params) != 2: continue + sentence[0].append(params[0]) + sentence[1].append(params[1]) + fin.close() + print('training corpus size : %d', len(training_data)) + print('Start training...') + tagger.train(training_data, save_loc=model, nr_iter=nr_iter) + +def eval(filepath='data/test.txt', model='model/ap.model'): + tagger = Perceptron(model) + + print('Start testing...') + right = 0.0 + total = 0.0 + sentence = ([], []) + fin = open(filepath, 'r', encoding='utf8') + for index, line in enumerate(fin): + line = line.strip() + if line == '': + words = sentence[0] + tags = sentence[1] + outputs = tagger.predict(words) + assert len(tags) == len(outputs) + total += len(tags) + for o, t in zip(outputs, tags): + if o == t: right += 1 + sentence = ([], []) + else: + params = line.split() + if len(params) != 2: continue + sentence[0].append(params[0]) + sentence[1].append(params[1]) + fin.close() + print("Precision : %f", right / total) + +def predict(model='model/ap.model'): + tagger = Perceptron(model) + + while True: + text = input('>') + words = list(text) + labels = tagger.predict(words) + + for word, label in zip(words, labels): + print(word, label) + + +if __name__ == '__main__': + train() + eval() + # predict() + + + + + + \ No newline at end of file diff --git a/setup.py b/setup.py index bddcc7e..1a47021 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup setup(name='jiagu', - version='0.2.0', + version='0.2.1', description='Jiagu Natural Language Processing', author='Yener(Zheng Wenyu)', author_email='help@ownthink.com', diff --git a/test.py b/test.py new file mode 100644 index 0000000..223522a --- /dev/null +++ b/test.py @@ -0,0 +1,30 @@ +import jiagu +import jieba + +text = '辽宁省铁岭市西丰县房木镇潭清村东屯' +text = '黑龙江省双鸭山市宝清县宝清镇通达街341号' +text = '''茶饮界的流行元素每隔几个月就会更新一次,现在各大咖啡品牌也玩起了跨界。今年9月3日,星巴克在中国内地首次上线了南瓜丝绒拿铁(Pumpkin Spice Latte,简称PSL),这款产品最初于2003年在美国上市,在全球累计卖出2亿杯;在被可口可乐以51亿美元从韦博得集团(Whitbread)收购一年后,一贯低调的COSTA也在今年6月表示将推出冷藏即饮咖啡,中国亦在首批上市市场之列。 + +最近,连锁咖啡品牌太平洋咖啡与东阿阿胶达成合作,推出5款名为咖啡如此多“胶”的联名产品,分别是:OATLY阿胶红枣拿铁、东阿阿胶拿铁、阿胶红枣拿铁、东阿阿胶抹茶拿铁及东阿阿胶银耳茶拿铁,平均售价约36元。据了解,这一系列产品于2019年10月16日起陆续在北京、上海、广州、深圳、西安、成都、无锡七个城市的太平洋咖啡指定门店内上市。总的来看,阿胶和咖啡相处地比较“融洽”,跨界没有违和感。 + + +太平洋咖啡这次推出的阿胶产品的包装也突出了中国风,咖啡杯套上的人物形象是穿着汉服和旗袍的中国女性形象。太平洋咖啡副董事长李海涛表示:“太平洋咖啡自成立27年来,始终坚持在咖啡这一‘舶来品’中融入中国元素,探求‘中西文化融合’的别样体验。东阿阿胶有近三千年传承历史,作为国家非物质文化遗产代表性传承技艺,可谓是中国传统滋补上品。本次发布的5款合作新饮,既保留了西方咖啡的醇香,又将东阿阿胶的胶香融入其中,充分彰显了‘中西’融合。” + + +此次与东阿阿胶的合作也可以看做是一种跨界。咖啡品牌与东方滋补产品的结合也显现了“年轻态”、“创新化”的品牌趋势。太平洋咖啡与东阿阿胶的主要消费者都为女性,也都力求在年轻市场实现突破,这样两个品牌的合作也属意料之外、情理之中。''' + +words = jiagu.cut(text) + +print(words) + +print(list(jieba.cut(text))) + + + +# 合并只合并中文四个词以内的 +# 字典出现大量的单子,表示可能会出错 + +# 在里面合并 + + +