diff --git a/jiagu/utils.py b/jiagu/utils.py
index bb19024..fbc24ad 100644
--- a/jiagu/utils.py
+++ b/jiagu/utils.py
@@ -14,101 +14,101 @@ import numpy as np


 def default_stopwords_file():
-	d = os.path.dirname(os.path.realpath(__file__))
-	return os.path.join(d, 'data/stopwords.txt')
+    d = os.path.dirname(os.path.realpath(__file__))
+    return os.path.join(d, 'data/stopwords.txt')


 sentence_delimiters = ['。', '?', '!', '…']
 allow_speech_tags = ['an', 'i', 'j', 'l', 'n', 'nr', 'nrfg', 'ns',
-	'nt', 'nz', 't', 'v', 'vd', 'vn', 'eng']
+                     'nt', 'nz', 't', 'v', 'vd', 'vn', 'eng']


 def as_text(v):
-	"""生成unicode字符串"""
-	if v is None:
-		return None
-	elif isinstance(v, bytes):
-		return v.decode('utf-8', errors='ignore')
-	elif isinstance(v, str):
-		return v
-	else:
-		raise ValueError('Unknown type %r' % type(v))
+    """生成unicode字符串"""
+    if v is None:
+        return None
+    elif isinstance(v, bytes):
+        return v.decode('utf-8', errors='ignore')
+    elif isinstance(v, str):
+        return v
+    else:
+        raise ValueError('Unknown type %r' % type(v))


 def is_text(v):
-	return isinstance(v, str)
+    return isinstance(v, str)


 def cut_sentences(sentence):
-	tmp = []
-	for ch in sentence:  # 遍历字符串中的每一个字
-		tmp.append(ch)
-		if ch in sentence_delimiters:
-			yield ''.join(tmp)
-			tmp = []
-	yield ''.join(tmp)
+    tmp = []
+    for ch in sentence:  # 遍历字符串中的每一个字
+        tmp.append(ch)
+        if ch in sentence_delimiters:
+            yield ''.join(tmp)
+            tmp = []
+    yield ''.join(tmp)


 def cut_filter_words(cutted_sentences, stopwords, use_stopwords=False):
-	sentences = []
-	sents = []
-	for sent in cutted_sentences:
-		sentences.append(sent)
-		if use_stopwords:
-			sents.append([word for word in jiagu.cut(sent) if word and word not in stopwords])  # 把句子分成词语
-		else:
-			sents.append([word for word in jiagu.cut(sent) if word])
-	return sentences, sents
+    sentences = []
+    sents = []
+    for sent in cutted_sentences:
+        sentences.append(sent)
+        if use_stopwords:
+            sents.append([word for word in jiagu.cut(sent) if word and word not in stopwords])  # 把句子分成词语
+        else:
+            sents.append([word for word in jiagu.cut(sent) if word])
+    return sentences, sents


 def psegcut_filter_words(cutted_sentences, stopwords, use_stopwords=True):
-	sents = []
-	sentences = []
-	for sent in cutted_sentences:
-		sentences.append(sent)
+    sents = []
+    sentences = []
+    for sent in cutted_sentences:
+        sentences.append(sent)

-		word_list = jiagu.seg(sent)
-		word_list = [word for word in word_list if len(word) > 0]
-		if use_stopwords:
-			word_list = [word.strip() for word in word_list if word.strip() not in stopwords]
-		sents.append(word_list)
-	return sentences, sents
+        word_list = jiagu.seg(sent)
+        word_list = [word for word in word_list if len(word) > 0]
+        if use_stopwords:
+            word_list = [word.strip() for word in word_list if word.strip() not in stopwords]
+        sents.append(word_list)
+    return sentences, sents


 def weight_map_rank(weight_graph, max_iter, tol):
-	# 初始分数设置为0.5
-	# 初始化每个句子的分子和老分数
-	scores = [0.5 for _ in range(len(weight_graph))]
-	old_scores = [0.0 for _ in range(len(weight_graph))]
-	denominator = get_degree(weight_graph)
-
-	# 开始迭代
-	count = 0
-	while different(scores, old_scores, tol):
-		for i in range(len(weight_graph)):
-			old_scores[i] = scores[i]
-		# 计算每个句子的分数
-		for i in range(len(weight_graph)):
-			scores[i] = get_score(weight_graph, denominator, i)
-		count += 1
-		if count > max_iter:
-			break
-	return scores
+    # 初始分数设置为0.5
+    # 初始化每个句子的分子和老分数
+    scores = [0.5 for _ in range(len(weight_graph))]
+    old_scores = [0.0 for _ in range(len(weight_graph))]
+    denominator = get_degree(weight_graph)
+
+    # 开始迭代
+    count = 0
+    while different(scores, old_scores, tol):
+        for i in range(len(weight_graph)):
+            old_scores[i] = scores[i]
+        # 计算每个句子的分数
+        for i in range(len(weight_graph)):
+            scores[i] = get_score(weight_graph, denominator, i)
+        count += 1
+        if count > max_iter:
+            break
+    return scores


 def get_degree(weight_graph):
-	length = len(weight_graph)
-	denominator = [0.0 for _ in range(len(weight_graph))]
-	for j in range(length):
-		for k in range(length):
-			denominator[j] += weight_graph[j][k]
-		if denominator[j] == 0:
-			denominator[j] = 1.0
-	return denominator
+    length = len(weight_graph)
+    denominator = [0.0 for _ in range(len(weight_graph))]
+    for j in range(length):
+        for k in range(length):
+            denominator[j] += weight_graph[j][k]
+        if denominator[j] == 0:
+            denominator[j] = 1.0
+    return denominator


 def get_score(weight_graph, denominator, i):
-	"""
+    """

 :param weight_graph:
 :param denominator:
@@ -116,67 +116,125 @@ def get_score(weight_graph, denominator, i):
 第i个句子
 :return: float
 """
-	length = len(weight_graph)
-	d = 0.85
-	added_score = 0.0
+    length = len(weight_graph)
+    d = 0.85
+    added_score = 0.0

-	for j in range(length):
-		# [j,i]是指句子j指向句子i
-		fraction = weight_graph[j][i] * 1.0
-		# 除以j的出度
-		added_score += fraction / denominator[j]
-	weighted_score = (1 - d) + d * added_score
-	return weighted_score
+    for j in range(length):
+        # [j,i]是指句子j指向句子i
+        fraction = weight_graph[j][i] * 1.0
+        # 除以j的出度
+        added_score += fraction / denominator[j]
+    weighted_score = (1 - d) + d * added_score
+    return weighted_score


 def different(scores, old_scores, tol=0.0001):
-	flag = False
-	for i in range(len(scores)):
-		if math.fabs(scores[i] - old_scores[i]) >= tol:  # 原始是0.0001
-			flag = True
-			break
-	return flag
+    flag = False
+    for i in range(len(scores)):
+        if math.fabs(scores[i] - old_scores[i]) >= tol:  # 原始是0.0001
+            flag = True
+            break
+    return flag


 def cosine_similarity(vec1, vec2):
-	"""计算两个向量的余弦相似度
+    """计算两个向量的余弦相似度

 :param vec1: list or np.array
 :param vec2: list or np.array
 :return: float
 """
-	tx = np.array(vec1)
-	ty = np.array(vec2)
-	cos1 = np.sum(tx * ty)
-	cos21 = np.sqrt(sum(tx ** 2))
-	cos22 = np.sqrt(sum(ty ** 2))
-	cosine_value = cos1 / float(cos21 * cos22)
-	return cosine_value
+    tx = np.array(vec1)
+    ty = np.array(vec2)
+    cos1 = np.sum(tx * ty)
+    cos21 = np.sqrt(sum(tx ** 2))
+    cos22 = np.sqrt(sum(ty ** 2))
+    cosine_value = cos1 / float(cos21 * cos22)
+    return cosine_value


 def combine(word_list, window=2):
-	if window < 2:
-		window = 2
-	for x in range(1, window):
-		if x >= len(word_list):
-			break
-		word_list2 = word_list[x:]
-		res = zip(word_list, word_list2)
-		for r in res:
-			yield r
+    if window < 2:
+        window = 2
+    for x in range(1, window):
+        if x >= len(word_list):
+            break
+        word_list2 = word_list[x:]
+        res = zip(word_list, word_list2)
+        for r in res:
+            yield r


 def sentences_similarity(s1, s2):
-	"""计算两个句子的相似度
+    """计算两个句子的相似度

 :param s1: list
 :param s2: list
 :return: float
 """
-	counter = 0
-	for sent in s1:
-		if sent in s2:
-			counter += 1
-	if counter == 0:
-		return 0
-	return counter / (math.log(len(s1) + len(s2)))
+    counter = 0
+    for sent in s1:
+        if sent in s2:
+            counter += 1
+    if counter == 0:
+        return 0
+    return counter / (math.log(len(s1) + len(s2)))
+
+
+# --------------------------------------------------------------------
+
+def is_chinese(uchar):
+    """判断一个字符是否是汉字"""
+    assert len(uchar) == 1, "uchar 只能是单个字符"
+    if u'\u4e00' <= uchar <= u'\u9fa5':
+        return True
+    else:
+        return False
+
+
+def is_number(uchar):
+    """判断一个字符是否是数字"""
+    assert len(uchar) == 1, "uchar 只能是单个字符"
+    if u'\u0030' <= uchar <= u'\u0039':
+        return True
+    else:
+        return False
+
+
+def is_alphabet(uchar):
+    """判断一个字符是否是英文字母"""
+    assert len(uchar) == 1, "uchar 只能是单个字符"
+    if (u'\u0041' <= uchar <= u'\u005a') or (u'\u0061' <= uchar <= u'\u007a'):
+        return True
+    else:
+        return False
+
+
+def B2Q(uchar):
+    """单字符半角转全角"""
+    assert len(uchar) == 1, "uchar 只能是单个字符"
+    inside_code = ord(uchar)
+    if inside_code < 0x0020 or inside_code > 0x7e:
+        # 不是半角字符就返回原来的字符
+        return uchar
+    if inside_code == 0x0020:
+        # 除了空格其他的全角半角的公式为:半角=全角-0xfee0
+        inside_code = 0x3000
+    else:
+        inside_code += 0xfee0
+    return chr(inside_code)


+def Q2B(uchar):
+    """单字符全角转半角"""
+    assert len(uchar) == 1, "uchar 只能是单个字符"
+    inside_code = ord(uchar)
+    if inside_code == 0x3000:
+        inside_code = 0x0020
+    else:
+        inside_code -= 0xfee0
+    if inside_code < 0x0020 or inside_code > 0x7e:
+        # 转完之后不是半角字符返回原来的字符
+        return uchar
+    return chr(inside_code)
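
For review purposes, a minimal sketch of how the character helpers added at the end of this patch behave. It assumes the patch has been applied and jiagu is importable from the working tree; the sample characters are illustrative only.

    from jiagu.utils import B2Q, Q2B, is_chinese, is_alphabet, is_number

    # Full-width / half-width conversion on single characters.
    print(Q2B('，'))   # full-width comma U+FF0C -> half-width ','
    print(B2Q('a'))    # half-width 'a' -> full-width 'ａ' (U+FF41)
    print(B2Q(' '))    # half-width space -> ideographic space U+3000
    print(Q2B('Q'))    # already half-width, returned unchanged

    # Character-class checks: CJK ideograph, ASCII letter, ASCII digit, punctuation.
    for ch in '汉a1。':
        print(ch, is_chinese(ch), is_alphabet(ch), is_number(ch))
    # 汉 True False False
    # a False True False
    # 1 False False True
    # 。 False False False

Note that B2Q and Q2B operate on single characters (both assert len(uchar) == 1), so converting a whole string requires joining per-character results, e.g. ''.join(Q2B(c) for c in s).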