def count_features(corpus, tokenizer=None):
    """Build a term-count (bag-of-words) matrix for a corpus.

    :param corpus: iterable of str, one entry per document
    :param tokenizer: callable mapping a document to an iterable of hashable
        tokens; defaults to `jiagu.cut` when omitted or None
    :return:
        features: np.array of integer counts with shape
            (n_documents, n_features); entry [i, j] is how many times
            names[j] occurs in document i
        names: list of distinct tokens, ordered by descending total
            frequency over the corpus (ties keep first-occurrence order)

    example:
        >>> from jiagu.cluster.base import count_features
        >>> X, names = count_features(["a b a", "b c"], tokenizer=str.split)
        >>> names
        ['a', 'b', 'c']
        >>> X.tolist()
        [[2, 1, 0], [0, 1, 1]]
    """
    if tokenizer is None:
        # Resolved at call time rather than in the signature: this module
        # lives inside the `jiagu` package, so reading `jiagu.cut` while the
        # module is still being imported would require the package to be
        # fully initialised already.
        import jiagu
        tokenizer = jiagu.cut

    # Every token sequence is walked twice (vocabulary pass, then per-document
    # counts), so materialise it in case the tokenizer yields a one-shot
    # iterator or generator.
    tokens = [list(tokenizer(text)) for text in corpus]

    totals = Counter(tok for sent in tokens for tok in sent)
    feature_names = [name for name, _ in totals.most_common()]
    column = {name: j for j, name in enumerate(feature_names)}

    # Pre-sizing keeps the result 2-D with an integer dtype even when the
    # corpus (or every document in it) is empty.
    features = np.zeros((len(tokens), len(feature_names)), dtype=int)
    for row, sent in enumerate(tokens):
        for tok, count in Counter(sent).items():
            features[row, column[tok]] = count

    return features, feature_names