| @@ -104,20 +104,14 @@ class Analyze(object): | |||||
| :param sentence: str or list | :param sentence: str or list | ||||
| 文本或者文本列表,根据input的模式来定 | 文本或者文本列表,根据input的模式来定 | ||||
| :param input: str | |||||
| 句子输入的格式,text则为默认的文本,batch则为批量的文本列表 | |||||
| :param model: str | :param model: str | ||||
| 分词所使用的模式,default为默认模式,mmseg为mmseg分词方式 | |||||
| 分词所使用的模式,default为默认模式包含新词发现 | |||||
| :return: | :return: | ||||
| """ | """ | ||||
| if model == 'default': | if model == 'default': | ||||
| self.init_cws() | self.init_cws() | ||||
| words = self.cws_text(sentence) | words = self.cws_text(sentence) | ||||
| return words | return words | ||||
| elif model == 'mmseg': | |||||
| self.init_mmseg() | |||||
| words = self.seg_mmseg.cws(sentence) | |||||
| return words | |||||
| else: | else: | ||||
| pass | pass | ||||
| return [] | return [] | ||||
| @@ -158,7 +152,7 @@ class Analyze(object): | |||||
| def text_cluster(self, docs, features_method='tfidf', method="k-means", k=3, max_iter=100, eps=0.5, min_pts=2): | def text_cluster(self, docs, features_method='tfidf', method="k-means", k=3, max_iter=100, eps=0.5, min_pts=2): | ||||
| return cluster(docs, features_method, method, k, max_iter, eps, min_pts, self.seg) | return cluster(docs, features_method, method, k, max_iter, eps, min_pts, self.seg) | ||||
| def lab2spo(self, words, epp_labels): | def lab2spo(self, words, epp_labels): | ||||
| subject_list = [] # 存放实体的列表 | subject_list = [] # 存放实体的列表 | ||||
| object_list = [] | object_list = [] | ||||
| @@ -174122,7 +174122,7 @@ win键 7 | |||||
| 河北涿县 3 | 河北涿县 3 | ||||
| 河北满城 3 | 河北满城 3 | ||||
| 河北电视台 5 | 河北电视台 5 | ||||
| 河北省 358 | |||||
| 河北省 359 | |||||
| 河北省保定 3 | 河北省保定 3 | ||||
| 河北省妇联 3 | 河北省妇联 3 | ||||
| 河北省委 17 | 河北省委 17 | ||||
| @@ -174131,7 +174131,6 @@ win键 7 | |||||
| 河北省科协 3 | 河北省科协 3 | ||||
| 河北省科委 3 | 河北省科委 3 | ||||
| 河北省纪委 3 | 河北省纪委 3 | ||||
| 河北省衡水 3 | |||||
| 河北省邯郸 3 | 河北省邯郸 3 | ||||
| 河北籍 3 | 河北籍 3 | ||||
| 河北赵县 3 | 河北赵县 3 | ||||
| @@ -245176,7 +245175,7 @@ win键 7 | |||||
| 衡庭汉 34 | 衡庭汉 34 | ||||
| 衡志诚 4 | 衡志诚 4 | ||||
| 衡某 3 | 衡某 3 | ||||
| 衡水 29 | |||||
| 衡水 30 | |||||
| 衡水市 17 | 衡水市 17 | ||||
| 衡讯 3 | 衡讯 3 | ||||
| 衡诸 8 | 衡诸 8 | ||||
| @@ -2,6 +2,7 @@ | |||||
| import os | import os | ||||
| import sys | import sys | ||||
| from math import log | from math import log | ||||
| from jiagu.perceptron import Perceptron | |||||
| re_eng = re.compile('[a-zA-Z0-9]', re.U) | re_eng = re.compile('[a-zA-Z0-9]', re.U) | ||||
| re_han = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%\-]+)", re.U) | re_han = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%\-]+)", re.U) | ||||
| @@ -14,10 +15,13 @@ class Segment: | |||||
| self.max_freq = 0 | self.max_freq = 0 | ||||
| self.total_freq = 0 | self.total_freq = 0 | ||||
| self.initialized = False | self.initialized = False | ||||
| self.model = None | |||||
| def init(self, vocab_path='dict/jiagu.dict', user_vocab='dict/user.dict'): | |||||
| def init(self, vocab_path='dict/jiagu.dict', user_vocab='dict/user.dict', | |||||
| model_path='model/cws.model'): | |||||
| self.load_vocab(os.path.join(os.path.dirname(__file__), vocab_path)) | self.load_vocab(os.path.join(os.path.dirname(__file__), vocab_path)) | ||||
| self.load_vocab(os.path.join(os.path.dirname(__file__), user_vocab)) | self.load_vocab(os.path.join(os.path.dirname(__file__), user_vocab)) | ||||
| self.model = Perceptron(os.path.join(os.path.dirname(__file__), model_path)) | |||||
| self.initialized = True | self.initialized = True | ||||
| def load_vocab(self, vocab_path): | def load_vocab(self, vocab_path): | ||||
| @@ -52,6 +56,18 @@ class Segment: | |||||
| if len(word) > self.max_word_len: | if len(word) > self.max_word_len: | ||||
| self.max_word_len = len(word) | self.max_word_len = len(word) | ||||
| def del_vocab(self, word=None, freq=None, tag=None): | |||||
| if word not in self.vocab: | |||||
| return None | |||||
| vocab_freq = self.vocab[word] | |||||
| if freq == None or vocab_freq <= freq: | |||||
| del self.vocab[word] | |||||
| self.total_freq -= vocab_freq | |||||
| else: | |||||
| self.vocab[word] -= freq | |||||
| # self.max_freq and self.max_word_len ? | |||||
| def load_userdict(self, userdict): | def load_userdict(self, userdict): | ||||
| if self.initialized == False: | if self.initialized == False: | ||||
| self.init() | self.init() | ||||
| @@ -70,6 +86,22 @@ class Segment: | |||||
| self.add_vocab(word, freq) | self.add_vocab(word, freq) | ||||
| elif isinstance(item, str): | elif isinstance(item, str): | ||||
| self.add_vocab(word=item) | self.add_vocab(word=item) | ||||
| def del_userdict(self, userdict): | |||||
| if self.initialized == False: | |||||
| self.init() | |||||
| for item in userdict: | |||||
| if isinstance(item, list): | |||||
| if len(item) == 1: | |||||
| word = item[0] | |||||
| self.del_vocab(word) | |||||
| elif len(item) == 2: | |||||
| word = item[0] | |||||
| freq = item[1] | |||||
| self.del_vocab(word, freq) | |||||
| elif isinstance(item, str): | |||||
| self.del_vocab(word=item) | |||||
| def calc_route(self, sentence, DAG, route): | def calc_route(self, sentence, DAG, route): | ||||
| vocab = self.vocab | vocab = self.vocab | ||||
| @@ -149,7 +181,39 @@ class Segment: | |||||
| yield buf | yield buf | ||||
| buf = '' | buf = '' | ||||
| def seg_default(self, sentence, mode): | |||||
| def model_cut(self, sentence): | |||||
| if sentence == '': | |||||
| return [''] | |||||
| sentence = list(sentence) | |||||
| labels = self.model.predict(sentence) | |||||
| return self.__lab2word(sentence, labels) | |||||
| def __lab2word(self, sentence, labels): | |||||
| sen_len = len(sentence) | |||||
| tmp_word = "" | |||||
| words = [] | |||||
| for i in range(sen_len): | |||||
| label = labels[i] | |||||
| w = sentence[i] | |||||
| if label == "B": | |||||
| tmp_word += w | |||||
| elif label == "M": | |||||
| tmp_word += w | |||||
| elif label == "E": | |||||
| tmp_word += w | |||||
| words.append(tmp_word) | |||||
| tmp_word = "" | |||||
| else: | |||||
| if tmp_word != '': | |||||
| words.append(tmp_word) | |||||
| tmp_word = "" | |||||
| words.append(w) | |||||
| if tmp_word: | |||||
| words.append(tmp_word) | |||||
| return words | |||||
| def seg_default(self, sentence): | |||||
| blocks = re_han.split(sentence) | blocks = re_han.split(sentence) | ||||
| cut_block = self.cut_words | cut_block = self.cut_words | ||||
| cut_all = False | cut_all = False | ||||
| @@ -170,11 +234,75 @@ class Segment: | |||||
| else: | else: | ||||
| yield x | yield x | ||||
| def seg_new_word(self, sentence): | |||||
| blocks = re_han.split(sentence) | |||||
| cut_block = self.cut_words | |||||
| cut_all = False | |||||
| for block in blocks: | |||||
| if not block: | |||||
| continue | |||||
| if re_han.match(block): | |||||
| words1 = list(cut_block(block)) | |||||
| print(words1) | |||||
| words2 = self.model_cut(block) | |||||
| print(words2) | |||||
| # new_word = [] # 有冲突的不加,长度大于4的不加,加完记得删除 | |||||
| # length = len(words1) | |||||
| # for n in range(3): | |||||
| # can_limit = length - n + 1 | |||||
| # for i in range(0, can_limit): | |||||
| # ngram = ''.join(words1[i:i + n]) | |||||
| # word_len = len(ngram) | |||||
| # if word_len > 4 or word_len==1: | |||||
| # continue | |||||
| # if ngram in words2 and ngram not in words1: | |||||
| # print(ngram) | |||||
| # new_word.append([ngram, 1]) | |||||
| new_word = [] | |||||
| for word in words2: | |||||
| if word not in words1 and len(word)>1 and len(word) < 4 :#and not re_eng.match(word): | |||||
| new_word.append([word, 1]) | |||||
| self.load_userdict(new_word) | |||||
| # print('------------------') | |||||
| for word in cut_block(block): | |||||
| yield word | |||||
| # 删除字典 | |||||
| self.del_userdict(new_word) | |||||
| else: | |||||
| tmp = re_skip.split(block) | |||||
| for x in tmp: | |||||
| if re_skip.match(x): | |||||
| yield x | |||||
| elif not cut_all: | |||||
| for xx in x: | |||||
| yield xx | |||||
| else: | |||||
| yield x | |||||
| def seg(self, sentence, mode="default"): | def seg(self, sentence, mode="default"): | ||||
| if self.initialized == False: | if self.initialized == False: | ||||
| self.init() | self.init() | ||||
| return list(self.seg_default(sentence, mode=mode)) | |||||
| if mode == 'probe': | |||||
| return list(self.seg_new_word(sentence)) | |||||
| else: | |||||
| return list(self.seg_default(sentence)) | |||||
| if __name__=='__main__': | if __name__=='__main__': | ||||
| s = Segment() | s = Segment() | ||||
| @@ -182,13 +310,30 @@ if __name__=='__main__': | |||||
| # sg.load_userdict('dict/user.dict') | # sg.load_userdict('dict/user.dict') | ||||
| # s.load_userdict(['知识图谱']) | # s.load_userdict(['知识图谱']) | ||||
| text = '情感分析了解一下?一个比情感词典、机器学习更好的方法' | |||||
| # text = '辽宁省铁岭市西丰县房木镇潭清村东屯' # bug | |||||
| # text = '黑龙江省双鸭山市宝清县宝清镇通达街341号' | |||||
| # text = '浙江省杭州市西湖区三墩镇紫宣路158号1幢801室' | |||||
| # text = '北京市西城区茶马街8号院1号楼15层1502' | |||||
| # text = '西藏自治区林芝市米林县羌纳乡羌渡岗村' | |||||
| # text = '深圳市南山区西丽街道松坪山社区宝深路科陆大厦B座13层B05' | |||||
| # text = '深圳市福田区福强路中港城裙楼6E部分602-A' # bug | |||||
| # text = '深圳市福田区福保街道石厦北二街89号新港商城C座3305室' | |||||
| # text = '五常市向阳镇致富村庆丰营屯' | |||||
| # text = '中牟县中兴路与益民巷交叉口路南' | |||||
| # text = '黄山市屯溪区华馨路38号二楼' | |||||
| text = '银川市金凤区北京中路福宁城11-1-号' | |||||
| # 直接将新词动态加入新词的字典中,有冲突的不加,加完记得删除 | |||||
| words = s.seg(text) | |||||
| # words = s.seg(text) | |||||
| # print(words) | |||||
| words = s.seg(text, 'probe') | |||||
| print('----------------') | |||||
| print(words) | print(words) | ||||
| @@ -0,0 +1,227 @@ | |||||
| # -*- coding:utf-8 -*- | |||||
| import os | |||||
| import gzip | |||||
| import pickle | |||||
| import random | |||||
| from collections import defaultdict | |||||
class AveragedPerceptron(object):
    """Averaged perceptron classifier.

    Each feature owns its own {class: weight} table.  Totals and
    timestamps are kept lazily per (feature, class) pair so the final
    averaging pass is O(number of touched weights).
    """

    def __init__(self):
        # feature -> {class: weight}
        self.weights = {}
        self.classes = set()
        # accumulated weight mass per (feature, class), for averaging
        self._totals = defaultdict(int)
        # instance index at which each (feature, class) weight last changed
        self._tstamps = defaultdict(int)
        # number of training instances seen so far
        self.i = 0

    def predict(self, features):
        """Return the best label for a {feature: value} dict (dot product
        against the current weights; ties broken alphabetically)."""
        scores = defaultdict(float)
        for feat, value in features.items():
            if value == 0 or feat not in self.weights:
                continue
            for label, weight in self.weights[feat].items():
                scores[label] += value * weight
        # (score, label) key gives a stable secondary alphabetic sort
        return max(self.classes, key=lambda label: (scores[label], label))

    def update(self, truth, guess, features):
        """Perform one perceptron update: +1 toward *truth*, -1 away from
        *guess* on every active feature (no-op when they agree)."""
        def bump(clas, feat, current, delta):
            key = (feat, clas)
            # credit the old weight for the interval it was in effect
            self._totals[key] += (self.i - self._tstamps[key]) * current
            self._tstamps[key] = self.i
            self.weights[feat][clas] = current + delta

        self.i += 1
        if truth == guess:
            return None
        for feat in features:
            table = self.weights.setdefault(feat, {})
            bump(truth, feat, table.get(truth, 0.0), 1.0)
            bump(guess, feat, table.get(guess, 0.0), -1.0)
        return None

    def average_weights(self):
        """Replace every weight with its average over all iterations,
        dropping weights whose rounded average is zero."""
        for feat, table in self.weights.items():
            averaged_table = {}
            for clas, weight in table.items():
                key = (feat, clas)
                total = self._totals[key]
                total += (self.i - self._tstamps[key]) * weight
                mean = round(total / float(self.i), 3)
                if mean:
                    averaged_table[clas] = mean
            self.weights[feat] = averaged_table
        return None
class Perceptron:
    """Greedy left-to-right sequence labeller on an averaged perceptron.

    Fixes vs. original: ``train`` no longer calls ``self.save(None)``
    (which crashed with ``gzip.open(None)``) when no save path is given,
    and ``save``/``load`` close their file handles via ``with``.
    """

    def __init__(self, loc=None):
        # context padding tokens consumed by the feature extractor
        self.START = ['-START-', '-START2-']
        self.END = ['-END-', '-END2-']
        self.model = AveragedPerceptron()
        if loc is not None:
            self.load(loc)

    def predict(self, words):
        """Label each token of *words*, feeding each prediction back in
        as the previous-tag feature for the next position."""
        prev, prev2 = self.START
        labels = []
        context = self.START + words + self.END
        for i, word in enumerate(words):
            features = self._get_features(i, word, context, prev, prev2)
            tag = self.model.predict(features)
            labels.append(tag)
            prev2 = prev
            prev = tag
        return labels

    def train(self, sentences, save_loc=None, nr_iter=5, shuf=False):
        """Train for *nr_iter* epochs on (words, tags) pairs.

        :param sentences: list of (word_list, tag_list) pairs
        :param save_loc: checkpoint path; when None nothing is written
        :param nr_iter: number of epochs
        :param shuf: shuffle the corpus after each epoch
        """
        self._make_tagdict(sentences)
        for iter_ in range(nr_iter):
            c = 0
            n = 0
            for words, tags in sentences:
                prev, prev2 = self.START
                context = self.START + words + self.END
                for i, word in enumerate(words):
                    feats = self._get_features(i, word, context, prev, prev2)
                    guess = self.model.predict(feats)
                    self.model.update(tags[i], guess, feats)
                    prev2 = prev
                    prev = guess
                    c += guess == tags[i]
                    n += 1
            if shuf:
                random.shuffle(sentences)
            print("Iter {0}: {1}/{2}={3}".format(iter_, c, n, (float(c) / n) * 100))
            if save_loc is not None:  # fix: original crashed on save_loc=None
                self.save(save_loc)
        self.model.average_weights()
        if save_loc is not None:
            self.save(save_loc)

    def save(self, loc='model/ap.model', zip=True):
        """Pickle (weights, classes) to *loc*, gzip-compressed unless
        ``zip=False`` (parameter name kept for interface compatibility)."""
        opener = open if zip == False else gzip.open
        with opener(loc, 'wb') as f:
            pickle.dump((self.model.weights, self.model.classes), f)

    def load(self, loc='model/ap.model', zip=True):
        """Restore (weights, classes) written by :meth:`save`."""
        opener = open if zip == False else gzip.open
        with opener(loc, 'rb') as f:
            self.model.weights, self.model.classes = pickle.load(f)

    def _get_features(self, i, word, context, prev, prev2):
        """Map a token position into a sparse {feature: count} dict.

        If the feature set changes, a new model must be trained.
        """
        def add(name, *args):
            features[' '.join((name,) + tuple(args))] += 1

        i += len(self.START)  # shift past the START padding
        features = defaultdict(int)
        add('bias')  # constant feature, acts like a prior
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        add('i-1 tag', prev)
        add('i-2 tag', prev2)
        add('i tag+i-2 tag', prev, prev2)
        add('i word', context[i])
        add('i-1 tag+i word', prev, context[i])
        add('i-1 word', context[i - 1])
        add('i-1 suffix', context[i - 1][-3:])
        add('i-2 word', context[i - 2])
        add('i+1 word', context[i + 1])
        add('i+1 suffix', context[i + 1][-3:])
        add('i+2 word', context[i + 2])
        return features

    def _make_tagdict(self, sentences):
        """Register every tag seen in *sentences* as a class."""
        for words, tags in sentences:
            for word, tag in zip(words, tags):
                self.model.classes.add(tag)
def train(filepath='data/train.txt', model='model/ap.model', nr_iter=1):
    """Train a Perceptron tagger from a two-column (token, tag) corpus.

    Sentences are separated by blank lines; malformed lines are skipped.

    Fixes vs. original: the final sentence is no longer dropped when the
    file lacks a trailing blank line, the file handle is closed via
    ``with``, and the corpus-size message uses real %-formatting (the
    original printed the literal '%d').
    """
    tagger = Perceptron()
    print('Reading corpus...')
    training_data = []
    sentence = ([], [])
    with open(filepath, 'r', encoding='utf8') as fin:
        for line in fin:
            line = line.strip()
            if line == '':
                if sentence[0]:
                    training_data.append(sentence)
                sentence = ([], [])
            else:
                params = line.split()
                if len(params) != 2:
                    continue
                sentence[0].append(params[0])
                sentence[1].append(params[1])
    if sentence[0]:  # flush a final sentence not followed by a blank line
        training_data.append(sentence)
    print('training corpus size : %d' % len(training_data))
    print('Start training...')
    tagger.train(training_data, save_loc=model, nr_iter=nr_iter)
def eval(filepath='data/test.txt', model='model/ap.model'):
    """Report token-level tagging accuracy on a two-column test corpus.

    Fixes vs. original: the final sentence is scored even without a
    trailing blank line, the file is closed via ``with``, division by
    zero on an empty corpus is guarded, and the precision message uses
    real %-formatting (the original printed the literal '%f').
    """
    tagger = Perceptron(model)
    print('Start testing...')
    right = 0.0
    total = 0.0
    sentence = ([], [])

    def score(words, tags):
        # score one sentence against the gold tags
        nonlocal right, total
        outputs = tagger.predict(words)
        assert len(tags) == len(outputs)
        total += len(tags)
        for o, t in zip(outputs, tags):
            if o == t:
                right += 1

    with open(filepath, 'r', encoding='utf8') as fin:
        for line in fin:
            line = line.strip()
            if line == '':
                if sentence[0]:
                    score(sentence[0], sentence[1])
                sentence = ([], [])
            else:
                params = line.split()
                if len(params) != 2:
                    continue
                sentence[0].append(params[0])
                sentence[1].append(params[1])
    if sentence[0]:  # flush a final sentence not followed by a blank line
        score(sentence[0], sentence[1])
    print("Precision : %f" % (right / total if total else 0.0))
def predict(model='model/ap.model'):
    """Interactive demo: read a line from stdin, tag each character with
    the loaded model, and print (char, label) pairs.  Loops forever."""
    tagger = Perceptron(model)
    while True:
        chars = list(input('>'))
        for char, label in zip(chars, tagger.predict(chars)):
            print(char, label)
if __name__ == '__main__':
    # train a model from data/train.txt, then score it on data/test.txt
    train()
    eval()
    # predict()
| @@ -3,7 +3,7 @@ | |||||
| from setuptools import setup | from setuptools import setup | ||||
| setup(name='jiagu', | setup(name='jiagu', | ||||
| version='0.2.0', | |||||
| version='0.2.1', | |||||
| description='Jiagu Natural Language Processing', | description='Jiagu Natural Language Processing', | ||||
| author='Yener(Zheng Wenyu)', | author='Yener(Zheng Wenyu)', | ||||
| author_email='help@ownthink.com', | author_email='help@ownthink.com', | ||||
| @@ -0,0 +1,30 @@ | |||||
| import jiagu | |||||
| import jieba | |||||
| text = '辽宁省铁岭市西丰县房木镇潭清村东屯' | |||||
| text = '黑龙江省双鸭山市宝清县宝清镇通达街341号' | |||||
| text = '''茶饮界的流行元素每隔几个月就会更新一次,现在各大咖啡品牌也玩起了跨界。今年9月3日,星巴克在中国内地首次上线了南瓜丝绒拿铁(Pumpkin Spice Latte,简称PSL),这款产品最初于2003年在美国上市,在全球累计卖出2亿杯;在被可口可乐以51亿美元从韦博得集团(Whitbread)收购一年后,一贯低调的COSTA也在今年6月表示将推出冷藏即饮咖啡,中国亦在首批上市市场之列。 | |||||
| 最近,连锁咖啡品牌太平洋咖啡与东阿阿胶达成合作,推出5款名为咖啡如此多“胶”的联名产品,分别是:OATLY阿胶红枣拿铁、东阿阿胶拿铁、阿胶红枣拿铁、东阿阿胶抹茶拿铁及东阿阿胶银耳茶拿铁,平均售价约36元。据了解,这一系列产品于2019年10月16日起陆续在北京、上海、广州、深圳、西安、成都、无锡七个城市的太平洋咖啡指定门店内上市。总的来看,阿胶和咖啡相处地比较“融洽”,跨界没有违和感。 | |||||
| 太平洋咖啡这次推出的阿胶产品的包装也突出了中国风,咖啡杯套上的人物形象是穿着汉服和旗袍的中国女性形象。太平洋咖啡副董事长李海涛表示:“太平洋咖啡自成立27年来,始终坚持在咖啡这一‘舶来品’中融入中国元素,探求‘中西文化融合’的别样体验。东阿阿胶有近三千年传承历史,作为国家非物质文化遗产代表性传承技艺,可谓是中国传统滋补上品。本次发布的5款合作新饮,既保留了西方咖啡的醇香,又将东阿阿胶的胶香融入其中,充分彰显了‘中西’融合。” | |||||
| 此次与东阿阿胶的合作也可以看做是一种跨界。咖啡品牌与东方滋补产品的结合也显现了“年轻态”、“创新化”的品牌趋势。太平洋咖啡与东阿阿胶的主要消费者都为女性,也都力求在年轻市场实现突破,这样两个品牌的合作也属意料之外、情理之中。''' | |||||
| words = jiagu.cut(text) | |||||
| print(words) | |||||
| print(list(jieba.cut(text))) | |||||
| # 合并只合并中文四个词以内的 | |||||
| # 字典出现大量的单子,表示可能会出错 | |||||
| # 在里面合并 | |||||