| @@ -14,7 +14,7 @@ | |||||
| * 词性标注 | * 词性标注 | ||||
| * 命名实体识别 | * 命名实体识别 | ||||
| * 情感分析 (模型训练中) | * 情感分析 (模型训练中) | ||||
| * 知识图谱关系抽取 (模型训练中) | |||||
| * 知识图谱关系抽取 | |||||
| * 关键词提取 | * 关键词提取 | ||||
| * 文本摘要 | * 文本摘要 | ||||
| * 新词发现 | * 新词发现 | ||||
| @@ -87,7 +87,16 @@ words = jiagu.seg('结婚的和尚未结婚的') | |||||
| print(words) | print(words) | ||||
| ``` | ``` | ||||
| 3. 关键词提取 | |||||
| 3. 知识图谱关系抽取 | |||||
| ```python3 | |||||
| import jiagu | |||||
| text = '姚明(Yao Ming),1980年9月12日出生于上海市徐汇区,祖籍江苏省苏州市吴江区震泽镇,前中国职业篮球运动员,司职中锋,现任中职联公司董事长兼总经理。' | |||||
| knowledge = jiagu.knowledge(text) | |||||
| print(knowledge) | |||||
| ``` | |||||
| 4. 关键词提取 | |||||
| ```python3 | ```python3 | ||||
| import jiagu | import jiagu | ||||
| @@ -103,7 +112,7 @@ keywords = jiagu.keywords(text, 5) # 关键词 | |||||
| print(keywords) | print(keywords) | ||||
| ``` | ``` | ||||
| 4. 文本摘要 | |||||
| 5. 文本摘要 | |||||
| ```python3 | ```python3 | ||||
| fin = open('input.txt', 'r') | fin = open('input.txt', 'r') | ||||
| text = fin.read() | text = fin.read() | ||||
| @@ -113,7 +122,7 @@ summarize = jiagu.summarize(text, 3) # 摘要 | |||||
| print(summarize) | print(summarize) | ||||
| ``` | ``` | ||||
| 5. 新词发现 | |||||
| 6. 新词发现 | |||||
| ```python3 | ```python3 | ||||
| import jiagu | import jiagu | ||||
| @@ -34,4 +34,8 @@ print(summarize) | |||||
| # jiagu.findword('input.txt', 'output.txt') # 根据大规模语料,利用信息熵做新词发现。 | # jiagu.findword('input.txt', 'output.txt') # 根据大规模语料,利用信息熵做新词发现。 | ||||
| text = '姚明(Yao Ming),1980年9月12日出生于上海市徐汇区,祖籍江苏省苏州市吴江区震泽镇,前中国职业篮球运动员,司职中锋,现任中职联公司董事长兼总经理。' | |||||
| knowledge = jiagu.knowledge(text) | |||||
| print(knowledge) | |||||
| @@ -43,3 +43,5 @@ summarize = any.summarize | |||||
| # 新词发现 | # 新词发现 | ||||
| findword = any.findword | findword = any.findword | ||||
| # 知识图谱 | |||||
| knowledge = any.knowledge | |||||
| @@ -17,140 +17,211 @@ from jiagu.textrank import Summarize | |||||
def add_curr_dir(name):
    """Return *name* resolved relative to this module's own directory."""
    base = os.path.dirname(__file__)
    return os.path.join(base, name)
class Analyze(object):
    """Facade over all jiagu NLP capabilities.

    Heavy models (BiLSTM-CRF taggers, the mmseg segmenter, TextRank
    extractors) are loaded lazily on first use so that importing the
    package stays cheap.
    """

    def __init__(self):
        self.seg_model = None        # word segmentation (BiLSTM-CRF)
        self.pos_model = None        # part-of-speech tagging (BiLSTM-CRF)
        self.ner_model = None        # named entity recognition (BiLSTM-CRF)
        self.kg_model = None         # knowledge-graph relation extraction (BiLSTM-CRF)
        self.seg_mmseg = None        # dictionary-based mmseg segmenter
        self.keywords_model = None   # TextRank keyword extractor
        self.summarize_model = None  # TextRank summarizer

    def init(self):
        """Eagerly load the three core sequence-labeling models."""
        self.init_cws()
        self.init_pos()
        self.init_ner()

    def init_cws(self):
        """Load the default word-segmentation model (idempotent)."""
        if self.seg_model is None:
            self.seg_model = bilstm_crf.Predict(add_curr_dir('model/cws.model'))

    def load_model(self, model_path):
        """Replace the segmentation model with a custom one from *model_path*."""
        self.seg_model = bilstm_crf.Predict(model_path)

    def init_pos(self):
        """Load the POS-tagging model (idempotent)."""
        if self.pos_model is None:
            self.pos_model = bilstm_crf.Predict(add_curr_dir('model/pos.model'))

    def init_ner(self):
        """Load the NER model (idempotent)."""
        if self.ner_model is None:
            self.ner_model = bilstm_crf.Predict(add_curr_dir('model/ner.model'))

    def init_mmseg(self):
        """Instantiate the mmseg segmenter (idempotent)."""
        if self.seg_mmseg is None:
            self.seg_mmseg = mmseg.MMSeg()

    def init_kg(self):
        """Load the knowledge-graph labeling model (idempotent)."""
        if self.kg_model is None:
            self.kg_model = bilstm_crf.Predict(add_curr_dir('model/kg.model'))

    @staticmethod
    def __lab2word(sentence, labels):
        """Merge per-character B/M/E/S-style labels back into words.

        :param sentence: str, original text, one label per character
        :param labels: list of str, tags aligned with *sentence*
        :return: list of str, the segmented words
        """
        sen_len = len(sentence)
        tmp_word = ""
        words = []
        for i in range(sen_len):
            label = labels[i]
            w = sentence[i]
            if label == "B":
                tmp_word += w
            elif label == "M":
                tmp_word += w
            elif label == "E":
                tmp_word += w
                words.append(tmp_word)
                tmp_word = ""
            else:
                # Any other tag (e.g. "S") is a single-character word.
                tmp_word = ""
                words.append(w)
        if tmp_word:
            # Flush a word left open by a trailing B/M label.
            words.append(tmp_word)
        return words

    def cws_text(self, sentence):
        """Segment one text into a list of words."""
        if sentence == '':
            return ['']
        labels = self.seg_model.predict([sentence])[0]
        return self.__lab2word(sentence, labels)

    def cws_list(self, sentences):
        """Segment a list of texts in one batched prediction."""
        text_list = sentences
        all_labels = self.seg_model.predict(text_list)
        sent_words = []
        for ti, text in enumerate(text_list):
            seg_labels = all_labels[ti]
            sent_words.append(self.__lab2word(text, seg_labels))
        return sent_words

    def cws(self, sentence, input='text', model='default'):
        """Chinese word segmentation.

        :param sentence: str or list
            a text, or a list of texts depending on *input*
        :param input: str
            'text' for a single string, 'batch' for a list of strings
        :param model: str
            'default' for the BiLSTM-CRF model, 'mmseg' for mmseg
        :return: list of words ('text') or list of word lists ('batch');
            an empty list for an unknown *model*
        """
        if model == 'default':
            self.init_cws()
            if input == 'batch':
                words_list = self.cws_list(sentence)
                return words_list
            else:
                words = self.cws_text(sentence)
                return words
        elif model == 'mmseg':
            self.init_mmseg()
            words = self.seg_mmseg.cws(sentence)
            return words
        else:
            pass
        return []

    def pos(self, sentence, input='words'):
        """POS-tag a word list ('words') or a batch of word lists ('batch')."""
        self.init_pos()
        if input == 'batch':
            all_labels = self.pos_model.predict(sentence)
            return all_labels
        else:
            labels = self.pos_model.predict([sentence])[0]
            return labels

    def ner(self, sentence, input='text'):
        """NER-tag a text ('text') or a batch of texts ('batch')."""
        self.init_ner()
        if input == 'batch':
            all_labels = self.ner_model.predict(sentence)
            return all_labels
        else:
            labels = self.ner_model.predict([sentence])[0]
            return labels

    def knowledge(self, sentence, input='text'):
        """Extract (subject, predicate, object) triples from text.

        :param sentence: str or list, depending on *input*
        :param input: str, 'text' or 'batch'
        :return: a triple list, or a list of triple lists in batch mode
        """
        self.init_kg()
        if input == 'batch':
            all_labels = self.kg_model.predict(sentence)
            result = []
            for sent, labels in zip(sentence, all_labels):
                result.append(self.lab2spo(sent, labels))
            return result
        else:
            labels = self.kg_model.predict([sentence])[0]
            return self.lab2spo(sentence, labels)

    def keywords(self, text, topkey=5):
        """Return the top-*topkey* TextRank keywords of *text*."""
        # BUGFIX: compare against None with 'is', not '=='.
        if self.keywords_model is None:
            self.keywords_model = Keywords(tol=0.0001, window=2)
        return self.keywords_model.keywords(text, topkey)

    def summarize(self, text, topsen=5):
        """Return the top-*topsen* TextRank summary sentences of *text*."""
        # BUGFIX: compare against None with 'is', not '=='.
        if self.summarize_model is None:
            self.summarize_model = Summarize(tol=0.0001)
        return self.summarize_model.summarize(text, topsen)

    def findword(self, input, output):
        """New-word discovery over a large corpus file, entropy based."""
        findword.new_word_find(input, output)

    def lab2spo(self, text, epp_labels):
        """Convert per-character entity/predicate labels into SPO triples.

        Labels look like 'B-实体' (subject entity) or e.g. 'B-出生地+',
        where the trailing '+'/'-' encodes whether the matching subject
        lies after ('+') or before ('-') the object in the text.

        :param text: str, the original sentence
        :param epp_labels: list of str, one label per character
        :return: list of [subject, predicate, object] triples
        """
        subject_list = []  # entity spans found in the text
        object_list = []   # predicate-bearing object spans
        index = 0
        for word, ep in zip(list(text), epp_labels):
            if ep[0] == 'B' and ep[2:] == '实体':
                subject_list.append([word, ep[2:], index])
            elif (ep[0] == 'I' or ep[0] == 'E') and ep[2:] == '实体':
                if len(subject_list) == 0:
                    continue
                subject_list[len(subject_list) - 1][0] += word

            if ep[0] == 'B' and ep[2:] != '实体':
                object_list.append([word, ep[2:], index])
            elif (ep[0] == 'I' or ep[0] == 'E') and ep[2:] != '实体':
                if len(object_list) == 0:
                    # NOTE(review): the subject branch 'continue's on the
                    # same condition; returning [] here aborts the whole
                    # extraction — confirm this asymmetry is intentional.
                    return []
                object_list[len(object_list) - 1][0] += word
            index += 1

        spo_list = []
        if len(subject_list) == 0 or len(object_list) == 0:
            pass
        elif len(subject_list) == 1:
            # One candidate subject: attach every object to it.
            entity = subject_list[0]
            for obj in object_list:
                predicate = obj[1][:-1]
                spo_list.append([entity[0], predicate, obj[0]])
        else:
            # Several subjects: pick the nearest one in the direction
            # encoded by the label's trailing '+' / '-'.
            for obj in object_list:
                entity = []
                predicate = obj[1][:-1]
                direction = obj[1][-1]
                for sub in subject_list:
                    if direction == '+':
                        if sub[2] > obj[2]:
                            entity = sub
                            break
                    else:
                        if sub[2] < obj[2]:
                            entity = sub
                if entity == []:
                    continue
                spo_list.append([entity[0], predicate, obj[0]])
        return spo_list
| @@ -14,169 +14,169 @@ import numpy as np | |||||
def default_stopwords_file():
    """Absolute path of the stopword list bundled next to this module."""
    here = os.path.dirname(os.path.realpath(__file__))
    return os.path.join(here, 'data/stopwords.txt')
# Characters that terminate a sentence when splitting raw text.
sentence_delimiters = ['。', '?', '!', '…']

# POS tags whose words are allowed to participate in keyword ranking.
allow_speech_tags = ['an', 'i', 'j', 'l', 'n', 'nr', 'nrfg', 'ns',
                     'nt', 'nz', 't', 'v', 'vd', 'vn', 'eng']
def as_text(v):
    """Coerce *v* to a unicode str; None passes through unchanged.

    :param v: None, bytes (decoded as UTF-8, errors ignored) or str
    :return: str or None
    :raises ValueError: for any other type
    """
    if v is None:
        return None
    if isinstance(v, str):
        return v
    if isinstance(v, bytes):
        return v.decode('utf-8', errors='ignore')
    raise ValueError('Unknown type %r' % type(v))
def is_text(v):
    """True when *v* is already a unicode string."""
    return isinstance(v, str)
def cut_sentences(sentence):
    """Yield sentences split on the module's sentence delimiters.

    The fragment after the last delimiter is always yielded, even when
    it is empty (matching the historical behavior).
    """
    buf = []
    for ch in sentence:
        buf.append(ch)
        if ch in sentence_delimiters:
            yield ''.join(buf)
            buf = []
    yield ''.join(buf)
def cut_filter_words(cutted_sentences, stopwords, use_stopwords=False):
    """Collect each sentence together with its (optionally filtered) tokens.

    :param cutted_sentences: iterable of sentence strings
    :param stopwords: container of words to drop when *use_stopwords*
    :param use_stopwords: bool, filter tokens through *stopwords*
    :return: (sentences, token_lists) parallel lists
    """
    sentences = []
    sents = []
    for sent in cutted_sentences:
        sentences.append(sent)
        tokens = jiagu.cut(sent)  # split the sentence into words
        if use_stopwords:
            sents.append([w for w in tokens if w and w not in stopwords])
        else:
            sents.append([w for w in tokens if w])
    return sentences, sents
def psegcut_filter_words(cutted_sentences, stopwords, use_stopwords=True):
    """Like cut_filter_words, but segments with jiagu.seg and strips
    surrounding whitespace while stopword-filtering.

    :return: (sentences, token_lists) parallel lists
    """
    sentences = []
    sents = []
    for sent in cutted_sentences:
        sentences.append(sent)
        words = [w for w in jiagu.seg(sent) if len(w) > 0]
        if use_stopwords:
            words = [w.strip() for w in words if w.strip() not in stopwords]
        sents.append(words)
    return sentences, sents
def weight_map_rank(weight_graph, max_iter, tol):
    """Run PageRank-style iteration over *weight_graph* until scores
    move by less than *tol* or *max_iter* iterations have elapsed.

    :param weight_graph: square matrix (list of lists) of edge weights
    :param max_iter: int, iteration cap
    :param tol: float, convergence threshold
    :return: list of float scores, one per node
    """
    n = len(weight_graph)
    scores = [0.5] * n       # every sentence starts at 0.5
    old_scores = [0.0] * n
    denominator = get_degree(weight_graph)
    iterations = 0
    while different(scores, old_scores, tol):
        old_scores = scores[:]
        # Recompute every node's score from its incoming edges.
        scores = [get_score(weight_graph, denominator, i) for i in range(n)]
        iterations += 1
        if iterations > max_iter:
            break
    return scores
def get_degree(weight_graph):
    """Weighted out-degree (row sum) of each node.

    Zero rows are replaced by 1.0 so later divisions are safe.
    """
    row_sums = [sum(row, 0.0) for row in weight_graph]
    return [s if s != 0 else 1.0 for s in row_sums]
def get_score(weight_graph, denominator, i):
    """Damped PageRank-style score of sentence *i*.

    :param weight_graph: square weight matrix; [j][i] is the edge j -> i
    :param denominator: per-node out-degrees (see get_degree)
    :param i: int, index of the sentence being scored
    :return: float
    """
    d = 0.85  # standard PageRank damping factor
    added = 0.0
    for j, row in enumerate(weight_graph):
        # Contribution of j, normalized by j's out-degree.
        added += (row[i] * 1.0) / denominator[j]
    return (1 - d) + d * added
def different(scores, old_scores, tol=0.0001):
    """True while any score still differs from its previous value by
    at least *tol* (i.e. the iteration has not converged)."""
    for idx in range(len(scores)):
        if math.fabs(scores[idx] - old_scores[idx]) >= tol:
            return True
    return False
def cosine_similarity(vec1, vec2):
    """Cosine similarity of two equal-length numeric vectors.

    :param vec1: list or np.array
    :param vec2: list or np.array
    :return: float in [-1, 1] for non-zero vectors
    """
    a = np.array(vec1)
    b = np.array(vec2)
    dot = np.sum(a * b)
    norm_a = np.sqrt(sum(a ** 2))
    norm_b = np.sqrt(sum(b ** 2))
    return dot / float(norm_a * norm_b)
def combine(word_list, window=2):
    """Yield co-occurring word pairs within a sliding window.

    :param word_list: list of words
    :param window: int, window width; values below 2 are treated as 2
    """
    if window < 2:
        window = 2
    for offset in range(1, window):
        if offset >= len(word_list):
            break
        yield from zip(word_list, word_list[offset:])
def sentences_similarity(s1, s2):
    """Overlap-based similarity of two tokenized sentences.

    :param s1: list of tokens
    :param s2: list of tokens
    :return: float; 0 when the sentences share no token
    """
    shared = sum(1 for token in s1 if token in s2)
    if shared == 0:
        return 0
    return shared / (math.log(len(s1) + len(s2)))