| @@ -104,20 +104,14 @@ class Analyze(object): | |||
| :param sentence: str or list | |||
| 文本或者文本列表,根据input的模式来定 | |||
| :param input: str | |||
| 句子输入的格式,text则为默认的文本,batch则为批量的文本列表 | |||
| :param model: str | |||
| 分词所使用的模式,default为默认模式,mmseg为mmseg分词方式 | |||
| 分词所使用的模式,default为默认模式包含新词发现 | |||
| :return: | |||
| """ | |||
| if model == 'default': | |||
| self.init_cws() | |||
| words = self.cws_text(sentence) | |||
| return words | |||
| elif model == 'mmseg': | |||
| self.init_mmseg() | |||
| words = self.seg_mmseg.cws(sentence) | |||
| return words | |||
| else: | |||
| pass | |||
| return [] | |||
| @@ -158,7 +152,7 @@ class Analyze(object): | |||
def text_cluster(self, docs, features_method='tfidf', method="k-means", k=3, max_iter=100, eps=0.5, min_pts=2):
    """Cluster *docs* by delegating to the module-level ``cluster`` helper,
    passing this instance's segmenter (``self.seg``) as the tokenizer.

    :param docs: documents to cluster.
    :param features_method: feature-extraction scheme name, e.g. 'tfidf'.
    :param method: clustering algorithm name, e.g. "k-means".
    :param k: cluster count (k-means).
    :param max_iter: iteration cap (k-means).
    :param eps: neighbourhood radius — presumably used by a density-based
        method such as DBSCAN; confirm in ``cluster``.
    :param min_pts: minimum neighbourhood size — same caveat as *eps*.
    :return: whatever ``cluster`` returns.
    """
    return cluster(docs, features_method, method, k, max_iter, eps, min_pts, self.seg)
| def lab2spo(self, words, epp_labels): | |||
| subject_list = [] # 存放实体的列表 | |||
| object_list = [] | |||
| @@ -174122,7 +174122,7 @@ win键 7 | |||
| 河北涿县 3 | |||
| 河北满城 3 | |||
| 河北电视台 5 | |||
| 河北省 358 | |||
| 河北省 359 | |||
| 河北省保定 3 | |||
| 河北省妇联 3 | |||
| 河北省委 17 | |||
| @@ -174131,7 +174131,6 @@ win键 7 | |||
| 河北省科协 3 | |||
| 河北省科委 3 | |||
| 河北省纪委 3 | |||
| 河北省衡水 3 | |||
| 河北省邯郸 3 | |||
| 河北籍 3 | |||
| 河北赵县 3 | |||
| @@ -245176,7 +245175,7 @@ win键 7 | |||
| 衡庭汉 34 | |||
| 衡志诚 4 | |||
| 衡某 3 | |||
| 衡水 29 | |||
| 衡水 30 | |||
| 衡水市 17 | |||
| 衡讯 3 | |||
| 衡诸 8 | |||
| @@ -2,6 +2,7 @@ | |||
| import os | |||
| import sys | |||
| from math import log | |||
| from jiagu.perceptron import Perceptron | |||
| re_eng = re.compile('[a-zA-Z0-9]', re.U) | |||
| re_han = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%\-]+)", re.U) | |||
| @@ -14,10 +15,13 @@ class Segment: | |||
| self.max_freq = 0 | |||
| self.total_freq = 0 | |||
| self.initialized = False | |||
| self.model = None | |||
def init(self, vocab_path='dict/jiagu.dict', user_vocab='dict/user.dict',
         model_path='model/cws.model'):
    """Lazily load the vocabularies and the perceptron CWS model.

    The diff rendering retained the superseded two-argument signature
    line; this is the consolidated new version with *model_path*.

    :param vocab_path: main dictionary, relative to this module's dir.
    :param user_vocab: user dictionary, relative to this module's dir.
    :param model_path: pickled Perceptron segmentation model.
    """
    base = os.path.dirname(__file__)
    self.load_vocab(os.path.join(base, vocab_path))
    self.load_vocab(os.path.join(base, user_vocab))
    self.model = Perceptron(os.path.join(base, model_path))
    self.initialized = True
| def load_vocab(self, vocab_path): | |||
| @@ -52,6 +56,18 @@ class Segment: | |||
| if len(word) > self.max_word_len: | |||
| self.max_word_len = len(word) | |||
def del_vocab(self, word=None, freq=None, tag=None):
    """Remove a word from the vocabulary, or lower its frequency.

    :param word: str; silently ignored if not present.
    :param freq: int or None. ``None`` (or any value >= the stored
        frequency) removes the entry entirely; a smaller value only
        subtracts that amount from the stored frequency.
    :param tag: unused; kept for signature symmetry with ``add_vocab``.
    :return: None
    """
    if word not in self.vocab:
        return None
    vocab_freq = self.vocab[word]
    # `is None` must be checked first: comparing None with <= would raise.
    if freq is None or vocab_freq <= freq:
        del self.vocab[word]
        self.total_freq -= vocab_freq
    else:
        self.vocab[word] -= freq
        # Keep the running total consistent with the new count — the
        # original forgot this decrement on the partial-removal path.
        self.total_freq -= freq
    # NOTE(review): self.max_freq and self.max_word_len are not recomputed
    # after deletions and may now overestimate — TODO confirm acceptable.
| def load_userdict(self, userdict): | |||
| if self.initialized == False: | |||
| self.init() | |||
| @@ -70,6 +86,22 @@ class Segment: | |||
| self.add_vocab(word, freq) | |||
| elif isinstance(item, str): | |||
| self.add_vocab(word=item) | |||
def del_userdict(self, userdict):
    """Remove user-supplied entries from the vocabulary.

    Mirrors ``load_userdict``: each item is either a plain string
    (delete the word outright) or a ``[word]`` / ``[word, freq]`` list
    (delete, optionally only subtracting a partial frequency).

    :param userdict: iterable of str or list entries.
    """
    if not self.initialized:
        self.init()
    for item in userdict:
        if isinstance(item, list):
            if len(item) == 1:
                self.del_vocab(item[0])
            elif len(item) == 2:
                self.del_vocab(item[0], item[1])
            # Longer lists are silently ignored, as in load_userdict.
        elif isinstance(item, str):
            self.del_vocab(word=item)
| def calc_route(self, sentence, DAG, route): | |||
| vocab = self.vocab | |||
| @@ -149,7 +181,39 @@ class Segment: | |||
| yield buf | |||
| buf = '' | |||
| def seg_default(self, sentence, mode): | |||
def model_cut(self, sentence):
    """Segment *sentence* with the perceptron sequence model.

    :param sentence: str, raw text.
    :return: list of str tokens; a single empty token for empty input.
    """
    if sentence == '':
        return ['']
    chars = list(sentence)
    tags = self.model.predict(chars)
    return self.__lab2word(chars, tags)
def __lab2word(self, sentence, labels):
    """Fold per-character BMES labels back into word strings.

    :param sentence: list of single characters.
    :param labels: list of tags, one per character; 'B'/'M' extend the
        current word, 'E' closes it, anything else (e.g. 'S') flushes
        any open word and emits the character on its own.
    :return: list of str words.
    """
    words = []
    buf = ''
    for ch, tag in zip(sentence, labels):
        if tag == 'B' or tag == 'M':
            buf += ch
        elif tag == 'E':
            words.append(buf + ch)
            buf = ''
        else:
            if buf != '':
                words.append(buf)
                buf = ''
            words.append(ch)
    # A word left open by a trailing 'B'/'M' is still emitted.
    if buf:
        words.append(buf)
    return words
| def seg_default(self, sentence): | |||
| blocks = re_han.split(sentence) | |||
| cut_block = self.cut_words | |||
| cut_all = False | |||
| @@ -170,11 +234,75 @@ class Segment: | |||
| else: | |||
| yield x | |||
def seg_new_word(self, sentence):
    """Segment with on-the-fly new-word discovery ('probe' mode).

    For each han block, runs both the dictionary segmenter and the
    perceptron model; tokens produced only by the model, of length 2-3,
    are temporarily added to the user dictionary so the dictionary pass
    can adopt them, then removed again so the vocabulary is unchanged.

    Leftover debug ``print`` calls and a commented-out n-gram variant
    from the original were removed; the yielded tokens are identical.

    :param sentence: str, raw text.
    :return: generator of str tokens.
    """
    blocks = re_han.split(sentence)
    cut_block = self.cut_words
    cut_all = False
    for block in blocks:
        if not block:
            continue
        if re_han.match(block):
            dict_words = list(cut_block(block))
            model_words = self.model_cut(block)
            # Candidate new words: model-only tokens of plausible length.
            new_word = [[w, 1] for w in model_words
                        if w not in dict_words and 1 < len(w) < 4]
            self.load_userdict(new_word)
            for word in cut_block(block):
                yield word
            # Restore the vocabulary: the discovered words only apply
            # to this block.
            self.del_userdict(new_word)
        else:
            tmp = re_skip.split(block)
            for x in tmp:
                if re_skip.match(x):
                    yield x
                elif not cut_all:
                    for xx in x:
                        yield xx
                else:
                    yield x
def seg(self, sentence, mode="default"):
    """Cut *sentence* into words.

    The diff rendering retained the superseded ``seg_default(sentence,
    mode=mode)`` call line; this is the consolidated new dispatch.

    :param sentence: str, text to segment.
    :param mode: 'default' for plain dictionary segmentation, or
        'probe' to enable new-word discovery via the perceptron model.
    :return: list of str tokens.
    """
    if not self.initialized:
        self.init()
    if mode == 'probe':
        return list(self.seg_new_word(sentence))
    return list(self.seg_default(sentence))
| if __name__=='__main__': | |||
| s = Segment() | |||
| @@ -182,13 +310,30 @@ if __name__=='__main__': | |||
| # sg.load_userdict('dict/user.dict') | |||
| # s.load_userdict(['知识图谱']) | |||
| text = '情感分析了解一下?一个比情感词典、机器学习更好的方法' | |||
| # text = '辽宁省铁岭市西丰县房木镇潭清村东屯' # bug | |||
| # text = '黑龙江省双鸭山市宝清县宝清镇通达街341号' | |||
| # text = '浙江省杭州市西湖区三墩镇紫宣路158号1幢801室' | |||
| # text = '北京市西城区茶马街8号院1号楼15层1502' | |||
| # text = '西藏自治区林芝市米林县羌纳乡羌渡岗村' | |||
| # text = '深圳市南山区西丽街道松坪山社区宝深路科陆大厦B座13层B05' | |||
| # text = '深圳市福田区福强路中港城裙楼6E部分602-A' # bug | |||
| # text = '深圳市福田区福保街道石厦北二街89号新港商城C座3305室' | |||
| # text = '五常市向阳镇致富村庆丰营屯' | |||
| # text = '中牟县中兴路与益民巷交叉口路南' | |||
| # text = '黄山市屯溪区华馨路38号二楼' | |||
| text = '银川市金凤区北京中路福宁城11-1-号' | |||
| # 直接将新词动态加入新词的字典中,有冲突的不加,加完记得删除 | |||
| words = s.seg(text) | |||
| # words = s.seg(text) | |||
| # print(words) | |||
| words = s.seg(text, 'probe') | |||
| print('----------------') | |||
| print(words) | |||
| @@ -0,0 +1,227 @@ | |||
| # -*- coding:utf-8 -*- | |||
| import os | |||
| import gzip | |||
| import pickle | |||
| import random | |||
| from collections import defaultdict | |||
class AveragedPerceptron(object):
    """Averaged perceptron classifier.

    Weights live in a dict-of-dicts, ``{feature: {label: weight}}``.
    The final model averages every weight over all update steps, which
    smooths the otherwise noisy online updates.
    """

    def __init__(self):
        # Per-feature weight vectors: {feature: {label: weight}}.
        self.weights = {}
        self.classes = set()
        # Accumulated weight mass per (feature, label), for averaging.
        self._totals = defaultdict(int)
        # Step at which each (feature, label) weight last changed
        # (tstamps is short for timestamps).
        self._tstamps = defaultdict(int)
        # Number of update() calls seen so far.
        self.i = 0

    def predict(self, features):
        '''Dot-product the features and current weights and return the best label.'''
        scores = defaultdict(float)
        for feat, value in features.items():
            if value == 0 or feat not in self.weights:
                continue
            for label, weight in self.weights[feat].items():
                scores[label] += value * weight
        # Secondary alphabetic sort keeps ties deterministic.
        return max(self.classes, key=lambda label: (scores[label], label))

    def update(self, truth, guess, features):
        '''Update the feature weights.'''
        def bump(label, feat, current, delta):
            key = (feat, label)
            # Credit the steps this weight sat unchanged before moving it.
            self._totals[key] += (self.i - self._tstamps[key]) * current
            self._tstamps[key] = self.i
            self.weights[feat][label] = current + delta

        # The step counter advances even on a correct guess.
        self.i += 1
        if truth == guess:
            return None
        for feat in features:
            vec = self.weights.setdefault(feat, {})
            bump(truth, feat, vec.get(truth, 0.0), 1.0)
            bump(guess, feat, vec.get(guess, 0.0), -1.0)
        return None

    def average_weights(self):
        '''Average weights from all iterations.'''
        for feat, vec in self.weights.items():
            averaged_vec = {}
            for label, weight in vec.items():
                key = (feat, label)
                total = self._totals[key] + (self.i - self._tstamps[key]) * weight
                mean = round(total / float(self.i), 3)
                # Weights that average to zero are dropped entirely.
                if mean:
                    averaged_vec[label] = mean
            self.weights[feat] = averaged_vec
        return None
class Perceptron:
    """Greedy left-to-right sequence tagger over an AveragedPerceptron.

    The previous two predicted tags are fed back in as features; there
    is no beam search.
    """

    def __init__(self, loc=None):
        # Padding pseudo-tokens before/after the sentence so context
        # features are always defined.
        self.START = ['-START-', '-START2-']
        self.END = ['-END-', '-END2-']
        self.model = AveragedPerceptron()
        if loc is not None:
            self.load(loc)

    def predict(self, words):
        """Return one predicted label per token in *words*."""
        prev, prev2 = self.START
        labels = []
        context = self.START + words + self.END
        for i, word in enumerate(words):
            features = self._get_features(i, word, context, prev, prev2)
            tag = self.model.predict(features)
            labels.append(tag)
            prev2 = prev
            prev = tag
        return labels

    def train(self, sentences, save_loc=None, nr_iter=5, shuf=False):
        """Train on ``[(words, tags), ...]`` pairs for *nr_iter* epochs.

        :param sentences: training pairs; shuffled IN PLACE between
            epochs when *shuf* is True.
        :param save_loc: path to pickle the model to after each epoch
            and after averaging; skipped when None (the original crashed
            on None because it always called save()).
        :param nr_iter: number of epochs.
        """
        self._make_tagdict(sentences)
        for iter_ in range(nr_iter):
            c = 0
            n = 0
            for words, tags in sentences:
                prev, prev2 = self.START
                context = self.START + words + self.END
                for i, word in enumerate(words):
                    feats = self._get_features(i, word, context, prev, prev2)
                    guess = self.model.predict(feats)
                    self.model.update(tags[i], guess, feats)
                    prev2 = prev
                    prev = guess
                    c += guess == tags[i]
                    n += 1
            if shuf == True:
                random.shuffle(sentences)
            print("Iter {0}: {1}/{2}={3}".format(iter_, c, n, (float(c) / n) * 100))
            if save_loc is not None:
                self.save(save_loc)
        self.model.average_weights()
        if save_loc is not None:
            self.save(save_loc)

    def save(self, loc='model/ap.model', zip=True):
        """Pickle (weights, classes) to *loc*, gzip-compressed unless
        ``zip`` is False.  (``zip`` shadows the builtin but is kept for
        backward compatibility with existing callers.)"""
        payload = (self.model.weights, self.model.classes)
        # Context managers so the handle is always closed (the original
        # leaked the file object).
        if zip == False:
            with open(loc, 'wb') as f:
                pickle.dump(payload, f)
        else:
            with gzip.open(loc, 'wb') as f:
                pickle.dump(payload, f)

    def load(self, loc='model/ap.model', zip=True):
        """Restore (weights, classes) written by :meth:`save`."""
        if zip == False:
            with open(loc, 'rb') as f:
                self.model.weights, self.model.classes = pickle.load(f)
        else:
            with gzip.open(loc, 'rb') as f:
                self.model.weights, self.model.classes = pickle.load(f)

    def _get_features(self, i, word, context, prev, prev2):
        '''Map tokens into a feature representation, implemented as a
        {hashable: float} dict. If the features change, a new model must be
        trained.
        '''
        def add(name, *args):
            features[' '.join((name,) + tuple(args))] += 1

        # Shift past the START padding so context[i] is the current token.
        i += len(self.START)
        features = defaultdict(int)
        # It's useful to have a constant feature, which acts sort of like a prior
        add('bias')
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        add('i-1 tag', prev)
        add('i-2 tag', prev2)
        add('i tag+i-2 tag', prev, prev2)
        add('i word', context[i])
        add('i-1 tag+i word', prev, context[i])
        add('i-1 word', context[i - 1])
        add('i-1 suffix', context[i - 1][-3:])
        add('i-2 word', context[i - 2])
        add('i+1 word', context[i + 1])
        add('i+1 suffix', context[i + 1][-3:])
        add('i+2 word', context[i + 2])
        return features

    def _make_tagdict(self, sentences):
        '''Collect the set of output classes seen in the training data.
        (The original docstring claimed a per-word tag dictionary, which
        this method does not build.)'''
        for words, tags in sentences:
            for word, tag in zip(words, tags):
                self.model.classes.add(tag)
def _read_corpus(filepath):
    """Parse a 'token tag' file into (words, tags) sentence pairs.

    Sentences are separated by blank lines; malformed lines (not exactly
    two whitespace-separated fields) are skipped.  Unlike the original,
    a trailing sentence without a final blank line is kept, and empty
    sentences (consecutive blank lines) are not appended.
    """
    training_data = []
    sentence = ([], [])
    with open(filepath, 'r', encoding='utf8') as fin:
        for line in fin:
            line = line.strip()
            if line == '':
                if sentence[0]:
                    training_data.append(sentence)
                sentence = ([], [])
            else:
                params = line.split()
                if len(params) != 2:
                    continue
                sentence[0].append(params[0])
                sentence[1].append(params[1])
    if sentence[0]:
        training_data.append(sentence)
    return training_data


def train(filepath='data/train.txt', model='model/ap.model', nr_iter=1):
    """Train a Perceptron tagger from *filepath* and save it to *model*.

    :param filepath: corpus with one 'token tag' pair per line and a
        blank line between sentences.
    :param model: output path for the pickled model.
    :param nr_iter: number of training epochs.
    """
    tagger = Perceptron()
    print('Reading corpus...')
    training_data = _read_corpus(filepath)
    # The original passed the format string and the value as two print
    # arguments, printing the literal '%d'; interpolate properly.
    print('training corpus size : %d' % len(training_data))
    print('Start training...')
    tagger.train(training_data, save_loc=model, nr_iter=nr_iter)
def eval(filepath='data/test.txt', model='model/ap.model'):
    """Tag *filepath* with the saved model and print token accuracy.

    (Shadows the builtin ``eval``; the name is kept for backward
    compatibility with existing scripts.)

    :param filepath: corpus in the same 'token tag' format as train().
    :param model: path of the pickled model to load.
    """
    tagger = Perceptron(model)
    print('Start testing...')
    right = 0.0
    total = 0.0

    def _score(words, tags):
        # Tally correct tags for one sentence.
        nonlocal right, total
        outputs = tagger.predict(words)
        assert len(tags) == len(outputs)
        total += len(tags)
        for o, t in zip(outputs, tags):
            if o == t:
                right += 1

    sentence = ([], [])
    with open(filepath, 'r', encoding='utf8') as fin:
        for line in fin:
            line = line.strip()
            if line == '':
                if sentence[0]:
                    _score(sentence[0], sentence[1])
                sentence = ([], [])
            else:
                params = line.split()
                if len(params) != 2:
                    continue
                sentence[0].append(params[0])
                sentence[1].append(params[1])
    # Score a trailing sentence (the original dropped it when the file
    # did not end with a blank line).
    if sentence[0]:
        _score(sentence[0], sentence[1])
    if total:
        # Interpolate properly (the original printed the raw '%f').
        print("Precision : %f" % (right / total))
    else:
        print("Precision : n/a (no test tokens)")
def predict(model='model/ap.model'):
    """Interactive demo: read a line from stdin, print one 'char label'
    pair per line, and loop forever (interrupt to quit).

    :param model: path of the pickled model to load.
    """
    tagger = Perceptron(model)
    while True:
        text = input('>')
        # The model is a character-level tagger, so split into chars.
        words = list(text)
        labels = tagger.predict(words)
        for word, label in zip(words, labels):
            print(word, label)
if __name__ == '__main__':
    # Train then evaluate with the default data/model paths; the
    # interactive demo stays disabled.
    train()
    eval()
    # predict()
| @@ -3,7 +3,7 @@ | |||
| from setuptools import setup | |||
| setup(name='jiagu', | |||
| version='0.2.0', | |||
| version='0.2.1', | |||
| description='Jiagu Natural Language Processing', | |||
| author='Yener(Zheng Wenyu)', | |||
| author_email='help@ownthink.com', | |||
| @@ -0,0 +1,30 @@ | |||
import jiagu
import jieba

# Scratch comparison script: segment the same text with jiagu and jieba
# and eyeball the difference.  The earlier sample texts are deliberately
# overwritten — only the last assignment to `text` is used.
text = '辽宁省铁岭市西丰县房木镇潭清村东屯'
text = '黑龙江省双鸭山市宝清县宝清镇通达街341号'
text = '''茶饮界的流行元素每隔几个月就会更新一次,现在各大咖啡品牌也玩起了跨界。今年9月3日,星巴克在中国内地首次上线了南瓜丝绒拿铁(Pumpkin Spice Latte,简称PSL),这款产品最初于2003年在美国上市,在全球累计卖出2亿杯;在被可口可乐以51亿美元从韦博得集团(Whitbread)收购一年后,一贯低调的COSTA也在今年6月表示将推出冷藏即饮咖啡,中国亦在首批上市市场之列。
最近,连锁咖啡品牌太平洋咖啡与东阿阿胶达成合作,推出5款名为咖啡如此多“胶”的联名产品,分别是:OATLY阿胶红枣拿铁、东阿阿胶拿铁、阿胶红枣拿铁、东阿阿胶抹茶拿铁及东阿阿胶银耳茶拿铁,平均售价约36元。据了解,这一系列产品于2019年10月16日起陆续在北京、上海、广州、深圳、西安、成都、无锡七个城市的太平洋咖啡指定门店内上市。总的来看,阿胶和咖啡相处地比较“融洽”,跨界没有违和感。
太平洋咖啡这次推出的阿胶产品的包装也突出了中国风,咖啡杯套上的人物形象是穿着汉服和旗袍的中国女性形象。太平洋咖啡副董事长李海涛表示:“太平洋咖啡自成立27年来,始终坚持在咖啡这一‘舶来品’中融入中国元素,探求‘中西文化融合’的别样体验。东阿阿胶有近三千年传承历史,作为国家非物质文化遗产代表性传承技艺,可谓是中国传统滋补上品。本次发布的5款合作新饮,既保留了西方咖啡的醇香,又将东阿阿胶的胶香融入其中,充分彰显了‘中西’融合。”
此次与东阿阿胶的合作也可以看做是一种跨界。咖啡品牌与东方滋补产品的结合也显现了“年轻态”、“创新化”的品牌趋势。太平洋咖啡与东阿阿胶的主要消费者都为女性,也都力求在年轻市场实现突破,这样两个品牌的合作也属意料之外、情理之中。'''

words = jiagu.cut(text)
print(words)

print(list(jieba.cut(text)))

# Only merge runs of at most four Chinese words.
# A dictionary with many single characters suggests likely errors.
# Do the merging inside.