Browse Source

add count features

master
zengbin93 6 years ago
parent
commit
9089001c95
1 changed file with 27 additions and 1 deletion
  1. +27
    -1
      jiagu/cluster/base.py

+ 27
- 1
jiagu/cluster/base.py View File

@@ -1,5 +1,6 @@
# -*-coding:utf-8-*-

import jiagu
from collections import Counter
import numpy as np


@@ -8,3 +9,28 @@ def elu_distance(a, b):
dist = np.sqrt(np.sum(np.square(np.array(a) - np.array(b))))
return dist


def count_features(corpus, tokenizer=jiagu.cut):
    """Build term-count (bag-of-words) features for a corpus.

    Every document is tokenized, a vocabulary is collected over the whole
    corpus, and each document is turned into a row of per-token counts.

    :param corpus: list of str, the documents to vectorize
    :param tokenizer: callable that takes one document and returns an
        iterable of hashable tokens, default is `jiagu.cut`. One-shot
        iterables (e.g. generators) are accepted.
    :return:
        features: np.array with one row per document and one column per
            entry of `names`; each cell is the number of times that token
            occurs in that document
        names: list of tokens, ordered from most to least frequent over
            the whole corpus

    example:
        >>> from jiagu.cluster.base import count_features
        >>> corpus = ["判断unicode是否是汉字,数字,英文,或者其他字符。", "全角符号转半角符号。"]
        >>> X, names = count_features(corpus)
    """
    # Materialize each token stream exactly once. The tokens are needed
    # twice (corpus-wide vocabulary and per-document counts); a tokenizer
    # that returns a generator would be exhausted after the first pass and
    # every row would silently come out as all zeros.
    tokens = [list(tokenizer(text)) for text in corpus]

    # One Counter per document, reused for both the vocabulary and the rows.
    counters = [Counter(sent) for sent in tokens]

    # Accumulate corpus-wide frequencies document by document so tokens are
    # inserted in order of first appearance, which keeps the tie order of
    # `most_common()` the same as counting the flattened token stream.
    total = Counter()
    for counter in counters:
        total.update(counter)
    feature_names = [word for word, _ in total.most_common()]

    features = [
        [counter.get(word, 0) for word in feature_names]
        for counter in counters
    ]

    return np.array(features), feature_names

Loading…
Cancel
Save