From d61108fa1379fd3daf69a0ee9527bf92b60c8b5e Mon Sep 17 00:00:00 2001
From: Yener <help@ownthink.com>
Date: Sat, 7 Dec 2019 23:57:08 +0800
Subject: [PATCH] update

---
 jiagu/analyze.py                    |  10 +-
 jiagu/segment/dict/jiagu.dict       |   5 +-
 jiagu/{ => segment}/model/cws.model | Bin
 jiagu/segment/nroute.py             | 159 ++++++++++++++++++-
 jiagu/segment/perceptron.py         | 227 ++++++++++++++++++++++++++++
 setup.py                            |   2 +-
 test.py                             |  30 ++++
 7 files changed, 414 insertions(+), 19 deletions(-)
 rename jiagu/{ => segment}/model/cws.model (100%)
 create mode 100644 jiagu/segment/perceptron.py
 create mode 100644 test.py

diff --git a/jiagu/analyze.py b/jiagu/analyze.py
index 29ab5bd..a166394 100644
--- a/jiagu/analyze.py
+++ b/jiagu/analyze.py
@@ -104,20 +104,14 @@ class Analyze(object):
 
 		:param sentence: str or list
 			文本或者文本列表，根据input的模式来定
-		:param input: str
-			句子输入的格式，text则为默认的文本，batch则为批量的文本列表
 		:param model: str
-			分词所使用的模式，default为默认模式，mmseg为mmseg分词方式
+			分词所使用的模式，default为默认模式包含新词发现
 		:return:
 		"""
 		if model == 'default':
 			self.init_cws()
 			words = self.cws_text(sentence)
 			return words
-		elif model == 'mmseg':
-			self.init_mmseg()
-			words = self.seg_mmseg.cws(sentence)
-			return words
 		else:
 			pass
 		return []
@@ -158,7 +152,7 @@ class Analyze(object):
 		
 	def text_cluster(self, docs, features_method='tfidf', method="k-means", k=3, max_iter=100, eps=0.5, min_pts=2):
 		return cluster(docs, features_method, method, k, max_iter, eps, min_pts, self.seg)
-		
+	
 	def lab2spo(self, words, epp_labels):
 		subject_list = [] # 存放实体的列表
 		object_list = []
diff --git a/jiagu/segment/dict/jiagu.dict b/jiagu/segment/dict/jiagu.dict
index e748331..5be6cb1 100644
--- a/jiagu/segment/dict/jiagu.dict
+++ b/jiagu/segment/dict/jiagu.dict
@@ -174122,7 +174122,7 @@ win键	7
 河北涿县	3
 河北满城	3
 河北电视台	5
-河北省	358
+河北省	359
 河北省保定	3
 河北省妇联	3
 河北省委	17
@@ -174131,7 +174131,6 @@ win键	7
 河北省科协	3
 河北省科委	3
 河北省纪委	3
-河北省衡水	3
 河北省邯郸	3
 河北籍	3
 河北赵县	3
@@ -245176,7 +245175,7 @@ win键	7
 衡庭汉	34
 衡志诚	4
 衡某	3
-衡水	29
+衡水	30
 衡水市	17
 衡讯	3
 衡诸	8
diff --git a/jiagu/model/cws.model b/jiagu/segment/model/cws.model
similarity index 100%
rename from jiagu/model/cws.model
rename to jiagu/segment/model/cws.model
diff --git a/jiagu/segment/nroute.py b/jiagu/segment/nroute.py
index d55881c..2a72098 100644
--- a/jiagu/segment/nroute.py
+++ b/jiagu/segment/nroute.py
@@ -2,6 +2,7 @@
 import os
 import sys
 from math import log
+from jiagu.perceptron import Perceptron
 
 re_eng = re.compile('[a-zA-Z0-9]', re.U)
 re_han = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%\-]+)", re.U)
@@ -14,10 +15,13 @@ class Segment:
 		self.max_freq = 0
 		self.total_freq = 0
 		self.initialized = False
+		self.model = None
 
-	def init(self, vocab_path='dict/jiagu.dict', user_vocab='dict/user.dict'):
+	def init(self, vocab_path='dict/jiagu.dict', user_vocab='dict/user.dict',
+												model_path='model/cws.model'):
 		self.load_vocab(os.path.join(os.path.dirname(__file__), vocab_path))
 		self.load_vocab(os.path.join(os.path.dirname(__file__), user_vocab))
+		self.model = Perceptron(os.path.join(os.path.dirname(__file__), model_path))
 		self.initialized = True
 	
 	def load_vocab(self, vocab_path):
@@ -52,6 +56,18 @@ class Segment:
 		if len(word) > self.max_word_len:
 			self.max_word_len = len(word)
 			
+	def del_vocab(self, word=None, freq=None, tag=None):
+		"""Drop `word`, or lower its count by `freq`; `tag` is accepted but unused."""
+		if word not in self.vocab:
+			return None
+		vocab_freq = self.vocab[word]
+		removed = vocab_freq if freq is None or vocab_freq <= freq else freq
+		if removed == vocab_freq:
+			del self.vocab[word]
+		else:
+			self.vocab[word] -= removed
+		self.total_freq -= removed  # NOTE: max_freq/max_word_len are not recomputed
+			
 	def load_userdict(self, userdict):
 		if self.initialized == False:
 			self.init()
@@ -70,6 +86,22 @@ class Segment:
 					self.add_vocab(word, freq)
 			elif isinstance(item, str):
 				self.add_vocab(word=item)
+				
+	def del_userdict(self, userdict):
+		"""Remove every entry of `userdict` from the vocabulary.
+
+		An entry is a bare word (str), a one-element list `[word]` or a
+		two-element list `[word, freq]`; anything else is ignored.
+		"""
+		if not self.initialized:
+			self.init()
+		for entry in userdict:
+			if isinstance(entry, str):
+				self.del_vocab(word=entry)
+			elif isinstance(entry, list) and len(entry) == 1:
+				self.del_vocab(entry[0])
+			elif isinstance(entry, list) and len(entry) == 2:
+				self.del_vocab(entry[0], entry[1])
 	
 	def calc_route(self, sentence, DAG, route):
 		vocab = self.vocab
@@ -149,7 +181,39 @@ class Segment:
 			yield buf
 			buf = ''
 		
-	def seg_default(self, sentence, mode):
+	def model_cut(self, sentence):
+		"""Segment `sentence` using the per-character labels from self.model."""
+		if sentence == '':
+			# The empty string is answered with a single empty token.
+			return ['']
+		chars = list(sentence)
+		return self.__lab2word(chars, self.model.predict(chars))
+		
+	def __lab2word(self, sentence, labels):
+		"""Join characters into words according to their B/M/E labels.
+
+		"B"/"M" extend the pending word and "E" extends then closes it; any
+		other label flushes the pending word and emits the character alone.
+		"""
+		words = []
+		pending = ""
+		for pos, char in enumerate(sentence):
+			label = labels[pos]
+			if label in ("B", "M", "E"):
+				pending += char
+				if label == "E":
+					words.append(pending)
+					pending = ""
+				continue
+			if pending != '':
+				words.append(pending)
+				pending = ""
+			words.append(char)
+		if pending:
+			words.append(pending)
+		return words
+
+	def seg_default(self, sentence):
 		blocks = re_han.split(sentence)
 		cut_block = self.cut_words
 		cut_all = False
@@ -170,11 +234,75 @@ class Segment:
 					else:
 						yield x
 						
+	def seg_new_word(self, sentence):
+		"""Segment `sentence`, letting the model propose temporary new words.
+
+		Every block matched by `re_han` is cut with the dictionary
+		(`self.cut_words`) and with the perceptron (`self.model_cut`). Model
+		words of two or three characters that the dictionary cut did not
+		produce are loaded into the vocabulary with frequency 1, the block is
+		cut again with the dictionary, and the temporary entries are removed
+		once that second cut has been consumed.
+
+		Other blocks are split on `re_skip`: pieces matching `re_skip` are
+		yielded whole, everything else one character at a time.
+
+		The vocabulary is modified while a block is being yielded, so a
+		single Segment instance must not run two of these cuts at once.
+
+		:param sentence: str
+			text to segment
+		:return: generator of str
+			the words of `sentence`, in order
+		"""
+		cut_block = self.cut_words
+
+		for block in re_han.split(sentence):
+			if not block:
+				continue
+
+			# Text that re_han does not match is never handed to the
+			# dictionary cut or to the model.
+			if not re_han.match(block):
+				for piece in re_skip.split(block):
+					if re_skip.match(piece):
+						yield piece
+					else:
+						for char in piece:
+							yield char
+				continue
+
+			# A set keeps the "did the dictionary already find it" test cheap.
+			dict_words = set(cut_block(block))
+
+			# Candidates: model words absent from the dictionary cut, 2-3 chars.
+			new_word = []
+			for word in self.model_cut(block):
+				if word in dict_words:
+					continue
+				if 1 < len(word) < 4:
+					new_word.append([word, 1])
+
+			self.load_userdict(new_word)
+			try:
+				for word in cut_block(block):
+					yield word
+			finally:
+				# Runs even if the consumer abandons the generator or the cut
+				# raises, so probe entries cannot leak into the vocabulary.
+				self.del_userdict(new_word)
+						
 	def seg(self, sentence, mode="default"):
 		if self.initialized == False:
 			self.init()
-	
-		return list(self.seg_default(sentence, mode=mode))
+				
+		if mode == 'probe':
+			return list(self.seg_new_word(sentence))
+		else:
+			return list(self.seg_default(sentence))
+		
+		
+		
 
 if __name__=='__main__':
 	s = Segment()
@@ -182,13 +310,30 @@ if __name__=='__main__':
 	# sg.load_userdict('dict/user.dict')
 	# s.load_userdict(['知识图谱'])
 
-	text = '情感分析了解一下？一个比情感词典、机器学习更好的方法'
+	# text = '辽宁省铁岭市西丰县房木镇潭清村东屯' # bug
+	# text = '黑龙江省双鸭山市宝清县宝清镇通达街341号'
+	# text = '浙江省杭州市西湖区三墩镇紫宣路158号1幢801室'
+	# text = '北京市西城区茶马街8号院1号楼15层1502'
+	# text = '西藏自治区林芝市米林县羌纳乡羌渡岗村'
+	# text = '深圳市南山区西丽街道松坪山社区宝深路科陆大厦B座13层B05'
+	# text = '深圳市福田区福强路中港城裙楼6E部分602-A' # bug
+	# text = '深圳市福田区福保街道石厦北二街89号新港商城C座3305室'
+	# text = '五常市向阳镇致富村庆丰营屯'
+	# text = '中牟县中兴路与益民巷交叉口路南'
+	# text = '黄山市屯溪区华馨路38号二楼'
+	text = '银川市金凤区北京中路福宁城11-1-号'
+	
+	# 直接将新词动态加入新词的字典中，有冲突的不加，加完记得删除
 	
-	words = s.seg(text)
+	
+	# words = s.seg(text)
+	# print(words)
+	
+	words = s.seg(text, 'probe')
+	print('----------------')
 	print(words)
 	
 
-
 	
 	
 	
diff --git a/jiagu/segment/perceptron.py b/jiagu/segment/perceptron.py
new file mode 100644
index 0000000..3ab7584
--- /dev/null
+++ b/jiagu/segment/perceptron.py
@@ -0,0 +1,227 @@
+# -*- coding:utf-8 -*-
+import os
+import gzip
+import pickle
+import random
+from collections import defaultdict
+
+class AveragedPerceptron(object):
+    """Multi-class perceptron whose weights are averaged over training steps."""
+
+    def __init__(self):
+        # feature -> {label: weight}
+        self.weights = {}
+        # Labels that predict() chooses between.
+        self.classes = set()
+        # (feature, label) -> weight summed over the steps it was in force.
+        self._totals = defaultdict(int)
+        # (feature, label) -> value of self.i when that weight last changed.
+        self._tstamps = defaultdict(int)
+        # Number of update() calls so far.
+        self.i = 0
+
+    def predict(self, features):
+        """Return the label whose weights score highest on `features`."""
+        scores = defaultdict(float)
+        for feat, value in features.items():
+            if value == 0 or feat not in self.weights:
+                continue
+            for label, weight in self.weights[feat].items():
+                scores[label] += value * weight
+
+        def rank(label):
+            # Equal scores are ordered by the label itself, for stability.
+            return (scores[label], label)
+        return max(self.classes, key=rank)
+
+    def update(self, truth, guess, features):
+        """Count one step and, on a wrong guess, shift weights toward `truth`."""
+        self.i += 1
+        if truth == guess:
+            return None
+        for feat in features:
+            row = self.weights.setdefault(feat, {})
+            for label, delta in ((truth, 1.0), (guess, -1.0)):
+                key = (feat, label)
+                current = row.get(label, 0.0)
+                # Credit the old weight for the steps during which it was used.
+                self._totals[key] += (self.i - self._tstamps[key]) * current
+                self._tstamps[key] = self.i
+                row[label] = current + delta
+        return None
+
+    def average_weights(self):
+        """Replace every weight by its average over all steps, dropping zeros."""
+        for feat, row in self.weights.items():
+            averaged_row = {}
+            for label, weight in row.items():
+                key = (feat, label)
+                total = self._totals[key] + (self.i - self._tstamps[key]) * weight
+                mean = round(total / float(self.i), 3)
+                if mean:
+                    averaged_row[label] = mean
+            self.weights[feat] = averaged_row
+        return None
+
+class Perceptron:
+	"""Left-to-right sequence labeller on top of AveragedPerceptron; `loc` loads a saved model."""
+	def __init__(self, loc=None):
+		self.START = ['-START-', '-START2-']
+		self.END = ['-END-', '-END2-']
+		self.model = AveragedPerceptron()
+		if loc is not None:
+			self.load(loc)
+
+	def predict(self, words):
+		"""Return one label per item of `words` (a list), predicted left to right."""
+		prev, prev2 = self.START
+		labels = []
+		context = self.START + words + self.END
+		for i, word in enumerate(words):
+			features = self._get_features(i, word, context, prev, prev2)
+			tag = self.model.predict(features)
+			labels.append(tag)
+			prev2, prev = prev, tag
+		return labels
+
+	def train(self, sentences, save_loc=None, nr_iter=5, shuf=False):
+		"""Train on `sentences`, a list of (words, tags) pairs, for `nr_iter`
+		passes; the model is saved to `save_loc`, when given, after every pass
+		and after the final averaging. `shuf` shuffles `sentences` in place."""
+		self._make_tagdict(sentences)
+		for iter_ in range(nr_iter):
+			c = n = 0
+			for words, tags in sentences:
+				prev, prev2 = self.START
+				context = self.START + words + self.END
+				for i, word in enumerate(words):
+					feats = self._get_features(i, word, context, prev, prev2)
+					guess = self.model.predict(feats)
+					self.model.update(tags[i], guess, feats)
+					prev2, prev = prev, guess
+					c += guess == tags[i]
+					n += 1
+			if shuf:
+				random.shuffle(sentences)
+			# max(n, 1) keeps an empty corpus from dividing by zero.
+			print("Iter {0}: {1}/{2}={3}".format(iter_, c, n, (float(c) / max(n, 1)) * 100))
+			if save_loc is not None:
+				self.save(save_loc)
+		self.model.average_weights()
+		if save_loc is not None:
+			self.save(save_loc)
+
+	def save(self, loc='model/ap.model', zip=True):
+		"""Pickle (weights, classes) to `loc`, gzip-compressed unless `zip` is False."""
+		opener = gzip.open if zip else open
+		with opener(loc, 'wb') as fout:
+			pickle.dump((self.model.weights, self.model.classes), fout)
+
+	def load(self, loc='model/ap.model', zip=True):
+		"""Load (weights, classes) written by save(); `zip` must match how it was saved."""
+		opener = gzip.open if zip else open
+		# NOTE: pickle can execute code from the file, so only load trusted models.
+		with opener(loc, 'rb') as fin:
+			self.model.weights, self.model.classes = pickle.load(fin)
+
+	def _get_features(self, i, word, context, prev, prev2):
+		'''Map tokens into a feature representation, implemented as a
+		{hashable: float} dict. If the features change, a new model must be
+		trained.
+		'''
+		def add(name, *args):
+			features[' '.join((name,) + tuple(args))] += 1
+		i += len(self.START)
+		features = defaultdict(int)
+		# It's useful to have a constant feature, which acts sort of like a prior
+		add('bias')
+		add('i suffix', word[-3:])
+		add('i pref1', word[0])
+		add('i-1 tag', prev)
+		add('i-2 tag', prev2)
+		add('i tag+i-2 tag', prev, prev2)
+		add('i word', context[i])
+		add('i-1 tag+i word', prev, context[i])
+		add('i-1 word', context[i - 1])
+		add('i-1 suffix', context[i - 1][-3:])
+		add('i-2 word', context[i - 2])
+		add('i+1 word', context[i + 1])
+		add('i+1 suffix', context[i + 1][-3:])
+		add('i+2 word', context[i + 2])
+		return features
+
+	def _make_tagdict(self, sentences):
+		"""Register every tag found in `sentences` as a class of the model."""
+		for words, tags in sentences:
+			self.model.classes.update(tag for _, tag in zip(words, tags))
+				
+def train(filepath='data/train.txt', model='model/ap.model', nr_iter=1):
+	"""Train a tagger on the "token tag"-per-line corpus at `filepath`, saving to `model`."""
+	tagger = Perceptron()
+	print('Reading corpus...')
+	training_data = []
+	sentence = ([], [])
+	with open(filepath, 'r', encoding='utf8') as fin:
+		for line in fin:
+			params = line.split()
+			if not params:
+				training_data.append(sentence)
+				sentence = ([], [])
+			elif len(params) == 2:
+				sentence[0].append(params[0])
+				sentence[1].append(params[1])
+	if sentence[0]:
+		training_data.append(sentence)  # corpus without a trailing blank line
+	print('training corpus size : %d' % len(training_data))
+	print('Start training...')
+	tagger.train(training_data, save_loc=model, nr_iter=nr_iter)
+
+def eval(filepath='data/test.txt', model='model/ap.model'):
+	"""Print the per-token accuracy of the saved `model` on the "token tag"
+	per line file `filepath`, where blank lines separate sentences.
+	"""
+	tagger = Perceptron(model)
+	print('Start testing...')
+	right = 0.0
+	total = 0.0
+	sentences = []
+	sentence = ([], [])
+	with open(filepath, 'r', encoding='utf8') as fin:
+		for line in fin:
+			params = line.split()
+			if not params:
+				sentences.append(sentence)
+				sentence = ([], [])
+			elif len(params) == 2:
+				sentence[0].append(params[0])
+				sentence[1].append(params[1])
+	sentences.append(sentence)  # last sentence, when no blank line follows it
+	for words, tags in sentences:
+		outputs = tagger.predict(words)
+		total += len(tags)
+		right += sum(1 for o, t in zip(outputs, tags) if o == t)
+	# An empty test file reports 0 instead of dividing by zero.
+	print("Precision : %f" % (right / total if total else 0.0))
+	
+def predict(model='model/ap.model'):
+	"""Interactively print a label for each character typed at the `>` prompt."""
+	tagger = Perceptron(model)
+	while True:
+		try:
+			words = list(input('>'))
+		except EOFError:
+			return  # stdin closed: leave the loop instead of crashing
+		for word, label in zip(words, tagger.predict(words)):
+			print(word, label)
+			
+
+if __name__ == '__main__':
+	train()
+	eval()
+	# predict()
+	
+	
+
+	
+	
+	
\ No newline at end of file
diff --git a/setup.py b/setup.py
index bddcc7e..1a47021 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 from setuptools import setup
 
 setup(name='jiagu',
-      version='0.2.0',
+      version='0.2.1',
       description='Jiagu Natural Language Processing',
       author='Yener(Zheng Wenyu)',
       author_email='help@ownthink.com',
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..223522a
--- /dev/null
+++ b/test.py
@@ -0,0 +1,30 @@
+import jiagu
+import jieba
+
+text = '辽宁省铁岭市西丰县房木镇潭清村东屯'
+text = '黑龙江省双鸭山市宝清县宝清镇通达街341号'
+text = '''茶饮界的流行元素每隔几个月就会更新一次,现在各大咖啡品牌也玩起了跨界。今年9月3日,星巴克在中国内地首次上线了南瓜丝绒拿铁(Pumpkin Spice Latte,简称PSL),这款产品最初于2003年在美国上市,在全球累计卖出2亿杯;在被可口可乐以51亿美元从韦博得集团(Whitbread)收购一年后,一贯低调的COSTA也在今年6月表示将推出冷藏即饮咖啡,中国亦在首批上市市场之列。
+
+最近,连锁咖啡品牌太平洋咖啡与东阿阿胶达成合作,推出5款名为咖啡如此多“胶”的联名产品,分别是:OATLY阿胶红枣拿铁、东阿阿胶拿铁、阿胶红枣拿铁、东阿阿胶抹茶拿铁及东阿阿胶银耳茶拿铁,平均售价约36元。据了解,这一系列产品于2019年10月16日起陆续在北京、上海、广州、深圳、西安、成都、无锡七个城市的太平洋咖啡指定门店内上市。总的来看,阿胶和咖啡相处地比较“融洽”,跨界没有违和感。
+
+
+太平洋咖啡这次推出的阿胶产品的包装也突出了中国风,咖啡杯套上的人物形象是穿着汉服和旗袍的中国女性形象。太平洋咖啡副董事长李海涛表示:“太平洋咖啡自成立27年来,始终坚持在咖啡这一‘舶来品’中融入中国元素,探求‘中西文化融合’的别样体验。东阿阿胶有近三千年传承历史,作为国家非物质文化遗产代表性传承技艺,可谓是中国传统滋补上品。本次发布的5款合作新饮,既保留了西方咖啡的醇香,又将东阿阿胶的胶香融入其中,充分彰显了‘中西’融合。”
+
+
+此次与东阿阿胶的合作也可以看做是一种跨界。咖啡品牌与东方滋补产品的结合也显现了“年轻态”、“创新化”的品牌趋势。太平洋咖啡与东阿阿胶的主要消费者都为女性,也都力求在年轻市场实现突破,这样两个品牌的合作也属意料之外、情理之中。'''
+
+words = jiagu.cut(text)
+
+print(words)
+
+print(list(jieba.cut(text)))
+
+
+
+# 合并只合并中文四个词以内的
+# 字典出现大量的单子，表示可能会出错
+
+# 在里面合并
+
+
+