|
|
|
@@ -6,6 +6,7 @@ from pprint import pprint |
|
|
|
|
|
|
|
from jiagu.cluster.kmeans import KMeans |
|
|
|
from jiagu.cluster.dbscan import DBSCAN |
|
|
|
from jiagu.cluster.text import text_cluster |
|
|
|
|
|
|
|
|
|
|
|
def load_dataset(): |
|
|
|
@@ -57,6 +58,20 @@ def show_dataset(): |
|
|
|
plt.show() |
|
|
|
|
|
|
|
|
|
|
|
def load_docs():
    """Return the sample corpus for the text-clustering tests.

    The corpus is eight short strings (mostly Chinese headlines and
    sentences about NLP topics). A new list object is built on every call,
    so callers may mutate the result freely.
    """
    corpus = (
        "百度深度学习中文情感分析工具Senta试用及在线测试",
        "情感分析是自然语言处理里面一个热门话题",
        "AI Challenger 2018 文本挖掘类竞赛相关解决方案及代码汇总",
        "深度学习实践:从零开始做电影评论文本情感分析",
        "BERT相关论文、文章和代码资源汇总",
        "将不同长度的句子用BERT预训练模型编码,映射到一个固定长度的向量上",
        "自然语言处理工具包spaCy介绍",
        "现在可以快速测试一下spaCy的相关功能,我们以英文数据为例,spaCy目前主要支持英文和德文",
    )
    # Materialise as a list so the return type matches what callers expect.
    return list(corpus)
|
|
|
|
|
|
|
|
|
|
|
class TestCluster(unittest.TestCase): |
|
|
|
def test_a_kmeans(self): |
|
|
|
print("=" * 68, '\n') |
|
|
|
@@ -83,6 +98,21 @@ class TestCluster(unittest.TestCase): |
|
|
|
# self.assertEqual(len(clusters), 6) |
|
|
|
pprint({k: len(v) for k, v in clusters.items()}) |
|
|
|
|
|
|
|
def test_c_text_cluster_by_kmeans(self):
    """Cluster the sample documents with k-means and check the result size.

    Calls ``text_cluster`` with ``method='k-means'``, ``k=3`` and
    ``max_iter=100`` on the documents from ``load_docs()`` and expects the
    returned collection to have length 3.
    """
    print("=" * 68, '\n')
    print("text_cluster_by_kmeans ... ")
    docs = load_docs()
    clusters = text_cluster(docs, method='k-means', k=3, max_iter=100)
    # assertEqual reports the actual length on failure, whereas
    # assertTrue(len(...) == 3) only says "False is not true".
    self.assertEqual(len(clusters), 3)
|
|
|
|
|
|
|
def test_c_text_cluster_by_dbscan(self):
    """Cluster the sample documents with DBSCAN and check the result size.

    Calls ``text_cluster`` with ``method='dbscan'``, ``eps=5`` and
    ``min_pts=1`` on the documents from ``load_docs()`` and expects the
    returned collection to have length 3.
    """
    print("=" * 68, '\n')
    print("text_cluster_by_dbscan ... ")
    docs = load_docs()
    clusters = text_cluster(docs, method='dbscan', eps=5, min_pts=1)
    # NOTE(review): the expected count of 3 presumably depends on how
    # text_cluster vectorises the documents together with eps/min_pts;
    # confirm against the implementation if this assertion ever flakes.
    # assertEqual reports the actual length on failure, whereas
    # assertTrue(len(...) == 3) only says "False is not true".
    self.assertEqual(len(clusters), 3)
|
|
|
|
|
|
|
|
|
|
|
# Run the test cases in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
|
|
|
|