Browse Source

update

master
Yener 6 years ago
parent
commit
d61108fa13
7 changed files with 414 additions and 19 deletions
  1. +2
    -8
      jiagu/analyze.py
  2. +2
    -3
      jiagu/segment/dict/jiagu.dict
  3. +0
    -0
      jiagu/segment/model/cws.model
  4. +152
    -7
      jiagu/segment/nroute.py
  5. +227
    -0
      jiagu/segment/perceptron.py
  6. +1
    -1
      setup.py
  7. +30
    -0
      test.py

+ 2
- 8
jiagu/analyze.py View File

@@ -104,20 +104,14 @@ class Analyze(object):

:param sentence: str or list
文本或者文本列表,根据input的模式来定
:param input: str
句子输入的格式,text则为默认的文本,batch则为批量的文本列表
:param model: str
分词所使用的模式,default为默认模式,mmseg为mmseg分词方式
分词所使用的模式,default为默认模式包含新词发现
:return:
"""
if model == 'default':
self.init_cws()
words = self.cws_text(sentence)
return words
elif model == 'mmseg':
self.init_mmseg()
words = self.seg_mmseg.cws(sentence)
return words
else:
pass
return []
@@ -158,7 +152,7 @@ class Analyze(object):
def text_cluster(self, docs, features_method='tfidf', method="k-means", k=3, max_iter=100, eps=0.5, min_pts=2):
    # Delegate to the shared cluster() helper, passing this instance's
    # segmenter so documents are tokenized consistently.
    # NOTE(review): presumably k/max_iter apply to k-means and eps/min_pts to
    # DBSCAN — confirm against cluster()'s definition.
    return cluster(docs, features_method, method, k, max_iter, eps, min_pts, self.seg)
def lab2spo(self, words, epp_labels):
subject_list = [] # 存放实体的列表
object_list = []


+ 2
- 3
jiagu/segment/dict/jiagu.dict View File

@@ -174122,7 +174122,7 @@ win键 7
河北涿县 3
河北满城 3
河北电视台 5
河北省 358
河北省 359
河北省保定 3
河北省妇联 3
河北省委 17
@@ -174131,7 +174131,6 @@ win键 7
河北省科协 3
河北省科委 3
河北省纪委 3
河北省衡水 3
河北省邯郸 3
河北籍 3
河北赵县 3
@@ -245176,7 +245175,7 @@ win键 7
衡庭汉 34
衡志诚 4
衡某 3
衡水 29
衡水 30
衡水市 17
衡讯 3
衡诸 8


jiagu/model/cws.model → jiagu/segment/model/cws.model View File


+ 152
- 7
jiagu/segment/nroute.py View File

@@ -2,6 +2,7 @@
import os
import sys
from math import log
from jiagu.perceptron import Perceptron

re_eng = re.compile('[a-zA-Z0-9]', re.U)
re_han = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%\-]+)", re.U)
@@ -14,10 +15,13 @@ class Segment:
self.max_freq = 0
self.total_freq = 0
self.initialized = False
self.model = None

def init(self, vocab_path='dict/jiagu.dict', user_vocab='dict/user.dict',
         model_path='model/cws.model'):
    """Load the built-in dictionary, the user dictionary and the perceptron
    segmentation model, all resolved relative to this module's directory.

    Fix: removed a stray duplicate ``def init`` header (old signature without
    ``model_path``) that was fused into this span.
    """
    self.load_vocab(os.path.join(os.path.dirname(__file__), vocab_path))
    self.load_vocab(os.path.join(os.path.dirname(__file__), user_vocab))
    self.model = Perceptron(os.path.join(os.path.dirname(__file__), model_path))
    self.initialized = True
def load_vocab(self, vocab_path):
@@ -52,6 +56,18 @@ class Segment:
if len(word) > self.max_word_len:
self.max_word_len = len(word)
def del_vocab(self, word=None, freq=None, tag=None):
    """Remove *word* from the vocabulary, or lower its frequency.

    :param word: word to remove; unknown words are silently ignored
    :param freq: if None, or >= the stored frequency, the word is deleted
        outright; otherwise the stored frequency is reduced by *freq*
    :param tag: unused, kept for interface compatibility
    :return: None
    """
    if word not in self.vocab:
        return None

    vocab_freq = self.vocab[word]
    if freq is None or vocab_freq <= freq:
        del self.vocab[word]
        self.total_freq -= vocab_freq
    else:
        self.vocab[word] -= freq
        # BUG FIX: keep the running total consistent with the partial decrement
        # (previously total_freq was only adjusted on full deletion).
        self.total_freq -= freq
    # NOTE(review): self.max_freq / self.max_word_len are not recomputed here,
    # so they may now overstate the vocabulary — confirm whether that matters.
def load_userdict(self, userdict):
if self.initialized == False:
self.init()
@@ -70,6 +86,22 @@ class Segment:
self.add_vocab(word, freq)
elif isinstance(item, str):
self.add_vocab(word=item)
def del_userdict(self, userdict):
    """Remove user-supplied entries from the vocabulary.

    Each entry is either a plain word (str), a ``[word]`` list, or a
    ``[word, freq]`` list; anything else is ignored.
    """
    if not self.initialized:
        self.init()

    for entry in userdict:
        if isinstance(entry, list):
            if len(entry) == 1:
                self.del_vocab(entry[0])
            elif len(entry) == 2:
                self.del_vocab(entry[0], entry[1])
        elif isinstance(entry, str):
            self.del_vocab(word=entry)
def calc_route(self, sentence, DAG, route):
vocab = self.vocab
@@ -149,7 +181,39 @@ class Segment:
yield buf
buf = ''
def model_cut(self, sentence):
    """Segment *sentence* with the perceptron model.

    An empty string yields [''] to mirror the dictionary segmenter.
    Fix: removed a stray deleted-line artifact (the old
    ``def seg_default(self, sentence, mode):`` header) fused above this def.

    NOTE(review): assumes self.model was set by init() — confirm callers.
    """
    if sentence == '':
        return ['']
    chars = list(sentence)
    labels = self.model.predict(chars)
    return self.__lab2word(chars, labels)
def __lab2word(self, sentence, labels):
    """Merge characters into words according to their B/M/E-style labels.

    'B' and 'M' extend the word being built, 'E' closes it; any other label
    first flushes a half-built word, then emits the character on its own.
    A trailing unfinished word is flushed at the end.
    """
    words = []
    pending = ""
    for ch, tag in zip(sentence, labels):
        if tag == "B" or tag == "M":
            pending += ch
        elif tag == "E":
            words.append(pending + ch)
            pending = ""
        else:
            if pending:
                words.append(pending)
                pending = ""
            words.append(ch)
    if pending:
        words.append(pending)
    return words

def seg_default(self, sentence):
blocks = re_han.split(sentence)
cut_block = self.cut_words
cut_all = False
@@ -170,11 +234,75 @@ class Segment:
else:
yield x
def seg_new_word(self, sentence):
    """Segment *sentence* with new-word discovery ('probe' mode).

    For each han-character block, both the dictionary segmenter and the
    perceptron model are run; words the model found that the dictionary
    missed (2-3 characters long) are added to the vocabulary temporarily so
    the dictionary pass can use them, then removed again afterwards.

    Fixes: removed debug ``print()`` calls left in this library generator and
    deleted a large commented-out n-gram prototype.
    """
    blocks = re_han.split(sentence)
    cut_block = self.cut_words
    cut_all = False
    for block in blocks:
        if not block:
            continue
        if re_han.match(block):
            dict_words = list(cut_block(block))
            model_words = self.model_cut(block)

            # Candidate new words: produced by the model, unknown to the
            # dictionary result, and between 2 and 3 characters long.
            new_word = []
            for word in model_words:
                if word not in dict_words and 1 < len(word) < 4:
                    new_word.append([word, 1])
            self.load_userdict(new_word)

            for word in cut_block(block):
                yield word
            # Remove the temporary entries so the vocabulary is unchanged.
            self.del_userdict(new_word)
        else:
            tmp = re_skip.split(block)
            for x in tmp:
                if re_skip.match(x):
                    yield x
                elif not cut_all:
                    for xx in x:
                        yield xx
                else:
                    yield x
def seg(self, sentence, mode="default"):
    """Segment *sentence* into a list of words.

    :param mode: 'default' for pure dictionary segmentation; 'probe'
        additionally uses the perceptron model to discover new words
    :return: list of word strings

    Fix: removed a leftover unconditional
    ``return list(self.seg_default(sentence, mode=mode))`` (old signature)
    that made the mode dispatch below unreachable and passed a parameter
    seg_default no longer accepts.
    """
    if not self.initialized:
        self.init()
    if mode == 'probe':
        return list(self.seg_new_word(sentence))
    return list(self.seg_default(sentence))

if __name__=='__main__':
s = Segment()
@@ -182,13 +310,30 @@ if __name__=='__main__':
# sg.load_userdict('dict/user.dict')
# s.load_userdict(['知识图谱'])

text = '情感分析了解一下?一个比情感词典、机器学习更好的方法'
# text = '辽宁省铁岭市西丰县房木镇潭清村东屯' # bug
# text = '黑龙江省双鸭山市宝清县宝清镇通达街341号'
# text = '浙江省杭州市西湖区三墩镇紫宣路158号1幢801室'
# text = '北京市西城区茶马街8号院1号楼15层1502'
# text = '西藏自治区林芝市米林县羌纳乡羌渡岗村'
# text = '深圳市南山区西丽街道松坪山社区宝深路科陆大厦B座13层B05'
# text = '深圳市福田区福强路中港城裙楼6E部分602-A' # bug
# text = '深圳市福田区福保街道石厦北二街89号新港商城C座3305室'
# text = '五常市向阳镇致富村庆丰营屯'
# text = '中牟县中兴路与益民巷交叉口路南'
# text = '黄山市屯溪区华馨路38号二楼'
text = '银川市金凤区北京中路福宁城11-1-号'
# 直接将新词动态加入新词的字典中,有冲突的不加,加完记得删除
words = s.seg(text)
# words = s.seg(text)
# print(words)
words = s.seg(text, 'probe')
print('----------------')
print(words)




+ 227
- 0
jiagu/segment/perceptron.py View File

@@ -0,0 +1,227 @@
# -*- coding:utf-8 -*-
import os
import gzip
import pickle
import random
from collections import defaultdict

class AveragedPerceptron(object):
    """Averaged perceptron classifier.

    Weights are stored per feature as a {label: weight} dict; running totals
    and update timestamps allow the final weights to be averaged over all
    training steps, which regularizes the model.
    """

    def __init__(self):
        # feature -> {label: weight}
        self.weights = {}
        self.classes = set()
        # accumulated weight mass per (feature, label) pair, for averaging
        self._totals = defaultdict(int)
        # step at which each (feature, label) weight last changed
        self._tstamps = defaultdict(int)
        # number of update() calls seen so far
        self.i = 0

    def predict(self, features):
        '''Score each label against the active features; return the best label.'''
        scores = defaultdict(float)
        for feat, value in features.items():
            if value == 0 or feat not in self.weights:
                continue
            for label, weight in self.weights[feat].items():
                scores[label] += value * weight
        # Secondary alphabetic sort keeps ties deterministic.
        return max(self.classes, key=lambda label: (scores[label], label))

    def update(self, truth, guess, features):
        '''Nudge weights toward *truth* and away from *guess* (no-op when equal).'''
        self.i += 1
        if truth == guess:
            return None
        for feat in features:
            label_weights = self.weights.setdefault(feat, {})
            for label, delta in ((truth, 1.0), (guess, -1.0)):
                key = (feat, label)
                current = label_weights.get(label, 0.0)
                # Bank the mass accumulated since this weight last moved.
                self._totals[key] += (self.i - self._tstamps[key]) * current
                self._tstamps[key] = self.i
                label_weights[label] = current + delta
        return None

    def average_weights(self):
        '''Replace every weight by its average over all update steps.'''
        for feat, label_weights in self.weights.items():
            averaged = {}
            for label, weight in label_weights.items():
                key = (feat, label)
                total = self._totals[key] + (self.i - self._tstamps[key]) * weight
                mean = round(total / float(self.i), 3)
                if mean:
                    averaged[label] = mean
            self.weights[feat] = averaged
        return None

class Perceptron:
    """Sequence labeler built on an averaged perceptron.

    Tags a list of tokens greedily left-to-right, feeding the two previous
    predicted tags back in as features.

    Fix: ``train()`` previously crashed with its default ``save_loc=None``
    because ``save()`` passed None to ``gzip.open``; saving is now skipped
    when no path is given.
    """

    def __init__(self, loc=None):
        # Padding pseudo-tokens so context features exist at sentence edges.
        self.START = ['-START-', '-START2-']
        self.END = ['-END-', '-END2-']
        self.model = AveragedPerceptron()
        if loc != None:
            self.load(loc)

    def predict(self, words):
        """Return the predicted label sequence for *words*, one label per token."""
        prev, prev2 = self.START
        labels = []
        context = self.START + words + self.END
        for i, word in enumerate(words):
            features = self._get_features(i, word, context, prev, prev2)
            tag = self.model.predict(features)
            labels.append(tag)
            prev2 = prev
            prev = tag
        return labels

    def train(self, sentences, save_loc=None, nr_iter=5, shuf=False):
        """Train on *sentences*, a list of (words, tags) pairs.

        :param save_loc: path to write the model to; skipped when None
        :param nr_iter: number of passes over the corpus
        :param shuf: shuffle the corpus between iterations
        """
        self._make_tagdict(sentences)
        for iter_ in range(nr_iter):
            c = 0
            n = 0
            for words, tags in sentences:
                prev, prev2 = self.START
                context = self.START + words + self.END
                for i, word in enumerate(words):
                    feats = self._get_features(i, word, context, prev, prev2)
                    guess = self.model.predict(feats)
                    self.model.update(tags[i], guess, feats)
                    # Condition on the model's own guesses, as at test time.
                    prev2 = prev
                    prev = guess
                    c += guess == tags[i]
                    n += 1
            if shuf == True:
                random.shuffle(sentences)
            print("Iter {0}: {1}/{2}={3}".format(iter_, c, n, (float(c) / n) * 100))
        # Checkpoint the raw weights, then average and save the final model.
        if save_loc is not None:
            self.save(save_loc)
        self.model.average_weights()
        if save_loc is not None:
            self.save(save_loc)

    def save(self, loc='model/ap.model', zip=True):
        """Pickle (weights, classes) to *loc*, gzip-compressed unless zip=False.

        NOTE: the ``zip`` parameter shadows the builtin; kept for compatibility.
        """
        if zip == False:
            pickle.dump((self.model.weights, self.model.classes), open(loc, 'wb'))
        else:
            pickle.dump((self.model.weights, self.model.classes), gzip.open(loc, 'wb'))

    def load(self, loc='model/ap.model', zip=True):
        """Load (weights, classes) previously written by save()."""
        if zip == False:
            self.model.weights, self.model.classes = pickle.load(open(loc, 'rb'))
        else:
            self.model.weights, self.model.classes = pickle.load(gzip.open(loc, 'rb'))

    def _get_features(self, i, word, context, prev, prev2):
        '''Map tokens into a feature representation, implemented as a
        {hashable: float} dict. If the features change, a new model must be
        trained.
        '''
        def add(name, *args):
            features[' '.join((name,) + tuple(args))] += 1

        i += len(self.START)
        features = defaultdict(int)
        # It's useful to have a constant feature, which acts sort of like a prior
        add('bias')
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        add('i-1 tag', prev)
        add('i-2 tag', prev2)
        add('i tag+i-2 tag', prev, prev2)
        add('i word', context[i])
        add('i-1 tag+i word', prev, context[i])
        add('i-1 word', context[i - 1])
        add('i-1 suffix', context[i - 1][-3:])
        add('i-2 word', context[i - 2])
        add('i+1 word', context[i + 1])
        add('i+1 suffix', context[i + 1][-3:])
        add('i+2 word', context[i + 2])
        return features

    def _make_tagdict(self, sentences):
        '''Collect the set of output classes seen in the training data.'''
        for words, tags in sentences:
            for word, tag in zip(words, tags):
                self.model.classes.add(tag)
def train(filepath='data/train.txt', model='model/ap.model', nr_iter=1):
    """Read a two-column (token label) corpus and train a Perceptron model.

    Sentences are separated by blank lines; malformed lines are skipped.

    Fixes: the last sentence is no longer dropped when the file lacks a
    trailing blank line; empty sentences from consecutive blank lines are no
    longer appended; the corpus-size print used '%d' with a comma and printed
    a tuple; the file is closed via ``with`` even on error.
    """
    tagger = Perceptron()
    print('Reading corpus...')
    training_data = []
    sentence = ([], [])
    with open(filepath, 'r', encoding='utf8') as fin:
        for line in fin:
            line = line.strip()
            if line == '':
                if sentence[0]:
                    training_data.append(sentence)
                sentence = ([], [])
            else:
                params = line.split()
                if len(params) != 2:
                    continue
                sentence[0].append(params[0])
                sentence[1].append(params[1])
    # Flush the trailing sentence (file may not end with a blank line).
    if sentence[0]:
        training_data.append(sentence)
    print('training corpus size : %d' % len(training_data))
    print('Start training...')
    tagger.train(training_data, save_loc=model, nr_iter=nr_iter)

def eval(filepath='data/test.txt', model='model/ap.model'):
    """Tag a two-column (token label) test corpus and print token accuracy.

    Sentences are separated by blank lines; malformed lines are skipped.
    NOTE: the name shadows the builtin ``eval``; kept for compatibility.

    Fixes: the trailing sentence is now evaluated when the file lacks a final
    blank line; the precision print used '%f' with a comma and printed a
    tuple; an empty corpus no longer raises ZeroDivisionError; the file is
    closed via ``with``.
    """
    tagger = Perceptron(model)
    print('Start testing...')
    right = 0.0
    total = 0.0
    sentence = ([], [])
    with open(filepath, 'r', encoding='utf8') as fin:
        for line in fin:
            line = line.strip()
            if line == '':
                words, tags = sentence
                if words:
                    outputs = tagger.predict(words)
                    total += len(tags)
                    for o, t in zip(outputs, tags):
                        if o == t:
                            right += 1
                sentence = ([], [])
            else:
                params = line.split()
                if len(params) != 2:
                    continue
                sentence[0].append(params[0])
                sentence[1].append(params[1])
    # Score the trailing sentence (file may not end with a blank line).
    words, tags = sentence
    if words:
        outputs = tagger.predict(words)
        total += len(tags)
        for o, t in zip(outputs, tags):
            if o == t:
                right += 1
    if total:
        print("Precision : %f" % (right / total))
    else:
        print("Precision : n/a (empty corpus)")
def predict(model='model/ap.model'):
    """Interactive demo: read a line, tag each character, print char/label pairs.

    Loops forever; interrupt (Ctrl-C / EOF) to exit.
    """
    tagger = Perceptron(model)
    while True:
        chars = list(input('>'))
        for ch, lab in zip(chars, tagger.predict(chars)):
            print(ch, lab)

if __name__ == '__main__':
    # Train on data/train.txt, save to model/ap.model, then report accuracy
    # on data/test.txt. Uncomment predict() for an interactive tagging demo.
    train()
    eval()
    # predict()


+ 1
- 1
setup.py View File

@@ -3,7 +3,7 @@
from setuptools import setup

setup(name='jiagu',
version='0.2.0',
version='0.2.1',
description='Jiagu Natural Language Processing',
author='Yener(Zheng Wenyu)',
author_email='help@ownthink.com',


+ 30
- 0
test.py View File

@@ -0,0 +1,30 @@
import jiagu
import jieba

text = '辽宁省铁岭市西丰县房木镇潭清村东屯'
text = '黑龙江省双鸭山市宝清县宝清镇通达街341号'
text = '''茶饮界的流行元素每隔几个月就会更新一次,现在各大咖啡品牌也玩起了跨界。今年9月3日,星巴克在中国内地首次上线了南瓜丝绒拿铁(Pumpkin Spice Latte,简称PSL),这款产品最初于2003年在美国上市,在全球累计卖出2亿杯;在被可口可乐以51亿美元从韦博得集团(Whitbread)收购一年后,一贯低调的COSTA也在今年6月表示将推出冷藏即饮咖啡,中国亦在首批上市市场之列。

最近,连锁咖啡品牌太平洋咖啡与东阿阿胶达成合作,推出5款名为咖啡如此多“胶”的联名产品,分别是:OATLY阿胶红枣拿铁、东阿阿胶拿铁、阿胶红枣拿铁、东阿阿胶抹茶拿铁及东阿阿胶银耳茶拿铁,平均售价约36元。据了解,这一系列产品于2019年10月16日起陆续在北京、上海、广州、深圳、西安、成都、无锡七个城市的太平洋咖啡指定门店内上市。总的来看,阿胶和咖啡相处地比较“融洽”,跨界没有违和感。


太平洋咖啡这次推出的阿胶产品的包装也突出了中国风,咖啡杯套上的人物形象是穿着汉服和旗袍的中国女性形象。太平洋咖啡副董事长李海涛表示:“太平洋咖啡自成立27年来,始终坚持在咖啡这一‘舶来品’中融入中国元素,探求‘中西文化融合’的别样体验。东阿阿胶有近三千年传承历史,作为国家非物质文化遗产代表性传承技艺,可谓是中国传统滋补上品。本次发布的5款合作新饮,既保留了西方咖啡的醇香,又将东阿阿胶的胶香融入其中,充分彰显了‘中西’融合。”


此次与东阿阿胶的合作也可以看做是一种跨界。咖啡品牌与东方滋补产品的结合也显现了“年轻态”、“创新化”的品牌趋势。太平洋咖啡与东阿阿胶的主要消费者都为女性,也都力求在年轻市场实现突破,这样两个品牌的合作也属意料之外、情理之中。'''

words = jiagu.cut(text)

print(words)

print(list(jieba.cut(text)))



# 合并只合并中文四个词以内的
# 字典出现大量的单子,表示可能会出错

# 在里面合并




Loading…
Cancel
Save